You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

109 lines
3.6 KiB

// Restricts compilation of this compute shader to the graphics APIs listed on the next line.
#pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch
// The file exposes a single kernel entry point, Main.
#pragma kernel Main
// Variant keywords:
//  - MSAA2X / MSAA4X / MSAA8X : sample count of the stencil source (none of them = single-sampled).
//  - COARSE_STENCIL           : additionally output one OR-ed stencil value per thread group.
//  - RESOLVE                  : additionally output the resolved per-pixel stencil (only has an effect together with MSAA).
#pragma multi_compile _ MSAA2X MSAA4X MSAA8X
#pragma multi_compile _ COARSE_STENCIL
#pragma multi_compile _ RESOLVE
// Translate the active MSAA keyword into the sample count used as the loop bound in Main.
#if defined(MSAA2X)
# define NUM_SAMPLES 2
#elif defined(MSAA4X)
# define NUM_SAMPLES 4
#elif defined(MSAA8X)
# define NUM_SAMPLES 8
#else
# define NUM_SAMPLES 1
#endif
// MSAA is a convenience switch that is set whenever more than one sample has to be read per pixel.
#if (NUM_SAMPLES > 1)
# define MSAA
#endif
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/ShaderLibrary/ShaderVariables.hlsl"
// Stencil input. With MSAA the source is declared as a multi-sampled texture and a read-write texture is
// declared alongside it to receive the resolved per-pixel value (Main only writes it when RESOLVE is also defined).
// Without MSAA only the single-sampled uint2 stencil texture is declared.
#ifdef MSAA
TEXTURE2D_X_MSAA(uint2, _StencilTexture);
RW_TEXTURE2D_X(uint2, _OutputStencilBuffer);
#else
TEXTURE2D_X_UINT2(_StencilTexture);
#endif
// Coarse stencil output: Main writes one uint per thread group (i.e. per 8x8 pixel tile).
// TODO: This wastes 3 bytes per entry, but it is still better than a texture because it can be scalar read.
// We could sub-index the right byte inside the uint, but that takes extra ALU and won't save bandwidth (just memory).
// For now the extra memory cost is acceptable (3 bytes * 1/64th of a render target).
// Note that using RawBuffers seems to have problems, so structured buffers are used for now; this is worth revisiting if the perf difference is a concern.
RWStructuredBuffer<uint> _CoarseStencilBuffer;
// USE_INTRINSICS is non-zero only when the platform reports wave-intrinsic support AND its lane count is 64,
// which equals the thread group size declared on Main.
// NOTE(review): presumably this is required so that a single wave-wide OR covers the whole group - confirm.
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
#define USE_INTRINSICS (PLATFORM_LANE_COUNT == 64)
#else
#define USE_INTRINSICS 0
#endif
// Fallback path: without wave intrinsics the per-group OR is accumulated in this group-shared variable.
#if USE_INTRINSICS == 0
groupshared uint coarseStencilValue;
#endif
// Resolves the stencil of one 8x8 pixel tile per thread group (64 threads, one pixel each).
//
//  - Every thread ORs together the stencil of all samples of its pixel (a single sample without MSAA).
//  - With RESOLVE and MSAA, the per-pixel result is stored to _OutputStencilBuffer.
//  - With COARSE_STENCIL, the OR of all 64 per-pixel results is stored to _CoarseStencilBuffer,
//    one entry per thread group.
//
// groupId.xy       : tile coordinate, groupId.z is used as the XR view index.
// groupThreadIndex : flat index of the thread inside the group (0..63), unpacked into an 8x8 layout.
[numthreads(64, 1, 1)]
void Main(uint3 groupId : SV_GroupID, uint groupThreadIndex : SV_GroupIndex)
{
    UNITY_XR_ASSIGN_VIEW_INDEX(groupId.z);

    // The best shot at resolving is being overly conservative, hence the OR operator. This is by nature inaccurate,
    // but there is no way to blend MSAA sub-samples properly and we need to pick the lesser evil.
    // Threads that fall outside of the screen keep the value 0, which is neutral for the coarse OR below.
    uint resolvedStencil = 0;

    // Tile origin (groupId.xy * 8) plus the thread's position inside the tile (low 3 bits = x, remaining bits = y).
    uint2 pixelCoord = (groupId.xy << 3) + uint2(groupThreadIndex & 0x7, groupThreadIndex >> 3);

    if (pixelCoord.x < (uint)_ScreenSize.x && pixelCoord.y < (uint)_ScreenSize.y)
    {
        UNITY_UNROLL
        for (uint i = 0; i < NUM_SAMPLES; i++)
        {
            uint2 sampledStencil;
#ifndef MSAA
            sampledStencil = LOAD_TEXTURE2D_X(_StencilTexture, pixelCoord.xy);
#else
            sampledStencil = LOAD_TEXTURE2D_X_MSAA(_StencilTexture, pixelCoord.xy, i);
#endif
            // In the non-MSAA case the | is the same as an assignment given that NUM_SAMPLES is 1.
            resolvedStencil |= GetStencilValue(sampledStencil);
        }

#if defined(RESOLVE) && defined(MSAA)
        // The store is kept inside the bounds check: threads of edge tiles that lie past the screen must not
        // write, since out-of-range image stores are not guaranteed to be discarded on every targeted API.
        _OutputStencilBuffer[COORD_TEXTURE2D_X(pixelCoord.xy)] = uint2(resolvedStencil, resolvedStencil);
#endif
    }

#ifdef COARSE_STENCIL
    // Everything below runs for all 64 threads (no early out above), so the barriers are reached uniformly.
#if USE_INTRINSICS
    // Need to workaround a warning incorrectly triggered when on Xbox One, so instead of using WaveIsFirstLane()
    // we check the groupThreadIndex as in the non intrinsic version.
    uint coarseStencilValue = WaveActiveBitOr(resolvedStencil);
#else
    // Clear the group-shared accumulator, then let every thread OR its value in.
    if (groupThreadIndex == 0)
        coarseStencilValue = 0;

    GroupMemoryBarrierWithGroupSync();
    InterlockedOr(coarseStencilValue, resolvedStencil);
    GroupMemoryBarrierWithGroupSync();
#endif

    // This temp is needed outside the if (groupThreadIndex == 0) condition to workaround a DXC DXIL codegen
    // issue https://github.com/microsoft/DirectXShaderCompiler/issues/2743 until it's fixed.
    uint perThreadCoarseStencilValue = coarseStencilValue;

    // A single thread publishes the tile's result.
    if (groupThreadIndex == 0)
    {
        uint addressIndex = Get1DAddressFromPixelCoord(groupId.xy, _CoarseStencilBufferSize.xy, groupId.z);
        _CoarseStencilBuffer[addressIndex] = perThreadCoarseStencilValue;
    }
#endif
}