Laurens-Packages/Packages/com.unity.render-pipelines..../Runtime/ShaderLibrary/ResolveStencilBuffer.compute


								// Restrict compilation of this file to the graphics APIs listed below.
								#pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch switch2


								// Single compute entry point of this file.
								#pragma kernel Main


								// Variant keywords consumed by the preprocessor logic in this file:
								//   MSAA2X / MSAA4X / MSAA8X : select the per-pixel sample count (no keyword => 1 sample).
								//   COARSE_STENCIL           : also write one OR-ed stencil value per thread group.
								//   RESOLVE                  : also write the per-pixel OR-ed stencil (only has an effect together with MSAA).
								#pragma multi_compile _ MSAA2X MSAA4X MSAA8X

								#pragma multi_compile _ COARSE_STENCIL

								#pragma multi_compile _ RESOLVE


								// Derive the per-pixel sample count from the active MSAA keyword and flag multisampled variants.
								//   NUM_SAMPLES : 2, 4 or 8 for MSAA2X / MSAA4X / MSAA8X, otherwise 1.
								//   MSAA        : defined exactly when NUM_SAMPLES is greater than 1.
								#if defined(MSAA2X)
								    #define NUM_SAMPLES 2
								    #define MSAA
								#elif defined(MSAA4X)
								    #define NUM_SAMPLES 4
								    #define MSAA
								#elif defined(MSAA8X)
								    #define NUM_SAMPLES 8
								    #define MSAA
								#else
								    #define NUM_SAMPLES 1
								#endif


								#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"

								#include "Packages/com.unity.render-pipelines.high-definition/Runtime/ShaderLibrary/ShaderVariables.hlsl"


								// Stencil input. With MSAA the source is declared as a multisampled texture and a writable
								// texture (_OutputStencilBuffer) is declared as well; Main stores the resolved per-pixel value
								// into it when RESOLVE is enabled. Without MSAA only the typed input texture is declared.
								#ifdef MSAA

								TEXTURE2D_X_MSAA(uint2, _StencilTexture);

								RW_TEXTURE2D_X(uint2, _OutputStencilBuffer);

								#else

								TYPED_TEXTURE2D_X(uint2, _StencilTexture);

								#endif


								// TODO: This wastes 3 bytes per entry, but it is still better than a texture because it can be scalar read.

								// We could sub-index the right byte inside the uint, but that takes extra ALU and won't save bandwidth (just memory).

								// For now the extra memory cost is acceptable (3 bytes * 1/64th of a render target).

								// Note that using RawBuffers seems to be problematic, so structured buffers are used for now; this is worth revisiting if the perf difference is a concern.

								RWStructuredBuffer<uint>    _CoarseStencilBuffer;


								// USE_INTRINSICS evaluates to non-zero only when the platform exposes wave intrinsics and its
								// lane count is 64, i.e. the same number as the threads per group declared on Main
								// ([numthreads(64, 1, 1)]). In every other case it is 0.
								#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS

								    #define USE_INTRINSICS (PLATFORM_LANE_COUNT == 64)

								#else

								    #define USE_INTRINSICS 0

								#endif


								// Group-shared accumulator used for the group-wide OR when the wave-intrinsic path is not taken.
								#if USE_INTRINSICS == 0

								groupshared uint coarseStencilValue;

								#endif


								// Resolves the stencil of one 8x8 pixel tile per thread group.
								//
								// Each of the 64 threads handles one pixel of the tile and ORs together the stencil value of the
								// NUM_SAMPLES sub-samples of that pixel. Depending on the compiled variant the result is then:
								//   - RESOLVE (MSAA variants only): stored per pixel into _OutputStencilBuffer;
								//   - COARSE_STENCIL: OR-ed across the whole group and stored once per tile into _CoarseStencilBuffer.
								//
								// groupId          : xy is the tile coordinate; z is forwarded to UNITY_XR_ASSIGN_VIEW_INDEX and to the
								//                    coarse buffer address computation.
								// groupThreadIndex : flattened thread index inside the group; its low 3 bits give the x offset inside
								//                    the tile and the remaining bits give the y offset.
								[numthreads(64, 1, 1)]
								void Main(uint3 groupId : SV_GroupID, uint  groupThreadIndex : SV_GroupIndex)
								{
								    UNITY_XR_ASSIGN_VIEW_INDEX(groupId.z);

								    // The best shot at resolving is being overly conservative, hence the OR operator. This is by nature
								    // inaccurate, but there is no way to blend MSAA sub-samples properly and we need to pick the lesser evil.
								    uint resolvedStencil = 0;

								    // Tile origin (groupId.xy * 8) plus this thread's offset inside the 8x8 tile.
								    uint2 pixelCoord = (groupId.xy << 3) + uint2(groupThreadIndex & 0x7, groupThreadIndex >> 3);

								    // Threads of partial edge tiles that fall outside the screen neither load nor store per-pixel data;
								    // they keep resolvedStencil == 0 and therefore do not affect the group-wide OR below.
								    if (pixelCoord.x < (uint)_ScreenSize.x && pixelCoord.y < (uint)_ScreenSize.y)
								    {
								        UNITY_UNROLL
								        for (uint i = 0; i < NUM_SAMPLES; i++)
								        {
								            uint2 sampledStencil;
								#ifndef MSAA
								            sampledStencil = LOAD_TEXTURE2D_X(_StencilTexture, pixelCoord.xy);
								#else
								            sampledStencil = LOAD_TEXTURE2D_X_MSAA(_StencilTexture, pixelCoord.xy, i);
								#endif
								            resolvedStencil |= GetStencilValue(sampledStencil); // In not MSAA cases the | is the same as assigning given that NUM_SAMPLES is 1
								        }

								#if defined(RESOLVE) && defined(MSAA)
								        // Kept inside the bounds check so the kernel never issues an image store outside the screen
								        // rectangle instead of relying on the graphics API to discard out-of-range writes.
								        _OutputStencilBuffer[COORD_TEXTURE2D_X(pixelCoord.xy)] = uint2(resolvedStencil, resolvedStencil);
								#endif
								    }

								#ifdef COARSE_STENCIL

								#if USE_INTRINSICS

								    // Every thread of the group reaches this point (it is outside the bounds check above).
								    uint coarseStencilValue = WaveActiveBitOr(resolvedStencil);

								#else

								    // Reset the group-shared accumulator, then let every thread OR its value in. The barriers make the
								    // reset visible before the atomics start and the final value visible before it is read back.
								    if (groupThreadIndex == 0)
								        coarseStencilValue = 0;

								    GroupMemoryBarrierWithGroupSync();

								    InterlockedOr(coarseStencilValue, resolvedStencil);

								    GroupMemoryBarrierWithGroupSync();

								#endif

								    //This temp is needed outside the if(groupThreadIndex == 0) condition to workaround a DXC DXIL codegen
								    // issue https://github.com/microsoft/DirectXShaderCompiler/issues/2743 until it's fixed
								    uint perThreadCoarseStencilValue = coarseStencilValue;

								    // Need to workaround a warning incorrectly triggered when on Xbox One, so instead of using WaveIsFirstLane()
								    // we check the groupThreadIndex in both the intrinsic and the non intrinsic version.
								    if (groupThreadIndex == 0)
								    {
								        uint addressIndex = Get1DAddressFromPixelCoord(groupId.xy, _CoarseStencilBufferSize.xy, groupId.z);
								        _CoarseStencilBuffer[addressIndex] = perThreadCoarseStencilValue;
								    }

								#endif

								}