Laurens-Packages/Packages/com.unity.render-pipelines..../Runtime/Lighting/LightLoop/lightlistbuild.compute


								// The implementation is based on the demo on "fine pruned tiled lighting" published in GPU Pro 7.

								// https://github.com/wolfgangfengel/GPU-Pro-7


								#pragma kernel TileLightListGen


								#pragma multi_compile _ USE_TWO_PASS_TILED_LIGHTING

								#pragma multi_compile _ USE_FEATURE_FLAGS

								#pragma multi_compile _ USE_OBLIQUE_MODE


								//#pragma enable_d3d11_debug_symbols


								#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"

								#include "Packages/com.unity.render-pipelines.high-definition-config/Runtime/ShaderConfig.cs.hlsl"


								#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/TextureXR.hlsl"

								#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/ShaderBase.hlsl"

								#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl"

								#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightingConvexHullUtils.hlsl"

								#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl"


								#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL) && !defined(SHADER_API_SWITCH) && !defined(SHADER_API_GAMECORE) && !defined(SHADER_API_SWITCH2)

								#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/SortingComputeUtils.hlsl"

								#endif


								#pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch switch2


								// Enables the per-pixel pruning pass (FinePruneLights) and its LDS bitmask ldsDoesLightIntersect.
								#define FINE_PRUNING_ENABLED

								// Enables the per-tile sphere overlap pass (SphericalIntersectionTests) that runs after the AABB test.
								#define PERFORM_SPHERICAL_INTERSECTION_TESTS


								// Number of 32-bit words needed to hold one bit for each of (SHADEROPTIONS_FPTLMAX_LIGHT_COUNT+1) lights.
								#define LIGHT_FPTL_VISIBILITY_DWORD_COUNTS (((SHADEROPTIONS_FPTLMAX_LIGHT_COUNT+1) + 31)/32)


								// Per-light bounds; the kernel reads a min and a max entry (.xyz) per light and compares them against the tile bounds.
								StructuredBuffer<float4> g_vBoundsBuffer : register( t1 );

								// Per-light volume description (lightVolume, lightCategory, featureFlags, position, axes, ... are read below).
								StructuredBuffer<LightVolumeData> _LightVolumeData : register(t2);

								// Per-light finite bound; its center and radius feed the sphere/tile overlap test.
								StructuredBuffer<SFiniteLightBound> g_data : register( t3 );


								#ifdef USE_TWO_PASS_TILED_LIGHTING

								// Big-tile light lists produced by an earlier pass: 16-bit entries packed two per uint, the first entry of each big tile being its light count.
								StructuredBuffer<uint> g_vBigTileLightList : register( t4 );        // don't support Buffer yet in unity

								#endif


								#ifdef PLATFORM_LANE_COUNT                                          // We can infer the size of a wave. This is currently not possible on non-consoles, so we have to fallback to a sensible default in those cases.

								#define NR_THREADS              PLATFORM_LANE_COUNT

								#else

								#define NR_THREADS              64                                  // default to 64 threads per group on other platforms..

								#endif


								// Number of tile pixels each thread of the group is responsible for.
								#define PIXEL_PER_THREAD      ((TILE_SIZE_FPTL*TILE_SIZE_FPTL) / NR_THREADS) // 8 or 4


								// output buffer

								RWStructuredBuffer<uint> g_vLightList : register( u0 );             // don't support RWBuffer yet in unity


								#define CATEGORY_LIST_SIZE          LIGHTCATEGORY_COUNT


								// Lights that passed the coarse tile test (and, when enabled, the sphere test).
								groupshared unsigned int coarseList[LIGHT_LIST_MAX_COARSE_ENTRIES];

								// Scratch copy of coarseList during the sphere test, then the final per-tile light list.
								groupshared unsigned int prunedList[LIGHT_LIST_MAX_COARSE_ENTRIES];     // temporarily support room for all 64 while in LDS


								// Min/max depth of the tile, stored as float bit patterns so they can be updated with integer atomics.
								groupshared uint ldsZMin;

								groupshared uint ldsZMax;

								// Atomic write cursor into coarseList.
								groupshared uint lightOffs;

								#ifdef FINE_PRUNING_ENABLED

								// One bit per coarseList slot: set when at least one pixel of the tile is inside the light volume.
								groupshared uint ldsDoesLightIntersect[LIGHT_FPTL_VISIBILITY_DWORD_COUNTS];

								#endif

								// Number of valid entries in prunedList once pruning is done.
								groupshared int ldsNrLightsFinal;


								// Number of surviving lights per light category.
								groupshared int ldsCategoryListCount[CATEGORY_LIST_SIZE];


								#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS

								// Atomic write cursor used while rebuilding coarseList in SphericalIntersectionTests.
								groupshared uint lightOffsSph;

								#endif


								#ifdef USE_FEATURE_FLAGS

								// OR of the featureFlags of every surviving light of the tile.
								groupshared uint ldsFeatureFlags;

								// Per-tile feature flags output (one uint per tile and per eye).
								RWStructuredBuffer<uint> g_TileFeatureFlags;

								#endif


								// Converts a depth-buffer sample into linear depth using the inverse screen projection
								// of the current eye.
								//   pixXY        - pixel position (callers in this file pass pixel centers).
								//   zDptBufSpace - depth-buffer value; 0 is near, 1 is far.
								float GetLinearDepth(float2 pixXY, float zDptBufSpace)
								{
								    const float4x4 invScrProj = g_mInvScrProjectionArr[unity_StereoEyeIndex];

								#ifdef USE_OBLIQUE_MODE
								    // Oblique projection: the full un-projection is needed, then z is divided by w.
								    const float4 unprojected = mul(invScrProj, float4(pixXY, zDptBufSpace, 1.0));
								    return unprojected.z / unprojected.w;
								#else
								    // For a perspective projection the z coefficient of row 2 is zero and its w coefficient is +1/-1
								    // (left/right handed), but orthographic projection must work too, so the general form is kept.
								    const float2 zRow = invScrProj[2].zw;
								    const float2 wRow = invScrProj[3].zw;

								    return (zRow.x*zDptBufSpace+zRow.y) / (wRow.x*zDptBufSpace+wRow.y);
								#endif
								}


								// Reconstructs a view-space position from a screen position and a linear depth,
								// using the screen projection of the current eye.
								//   v2ScrPos  - screen position in pixels.
								//   fLinDepth - linear depth; returned unchanged as the z component.
								float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth)
								{
								    const float4x4 scrProj = g_mScrProjectionArr[unity_StereoEyeIndex];
								    const bool isOrtho = (g_isOrthographic != 0);

								    // Scale sits on the diagonal; the offset lives in column w (orthographic) or column z (perspective).
								    const float2 scale = float2(scrProj[0].x, scrProj[1].y);
								    float2 offset = float2(scrProj[0].z, scrProj[1].z);
								    if (isOrtho)
								        offset = float2(scrProj[0].w, scrProj[1].w);

								    // Sign applied to the screen position: +1 for the left-handed variant, -1 otherwise.
								#if USE_LEFT_HAND_CAMERA_SPACE
								    const float handedness = 1;
								#else
								    const float handedness = isOrtho ? 1 : (-1);
								#endif

								    float2 viewXY = float2( (handedness*v2ScrPos.x-offset.x)/scale.x, (handedness*v2ScrPos.y-offset.y)/scale.y );

								    // Perspective: x and y grow linearly with depth. Orthographic: they do not depend on it.
								    if (!isOrtho)
								        viewXY = fLinDepth*viewXY;

								    return float3(viewXY, fLinDepth);
								}


								// Returns the length of the diagonal of one pixel, un-scaled by the x/y scale factors
								// of the current eye's screen projection (i.e. its extent at depth one).
								float GetOnePixDiagWorldDistAtDepthOne()
								{
								    const float4x4 scrProj = g_mScrProjectionArr[unity_StereoEyeIndex];

								    // Footprint of one pixel along x and along y.
								    const float pixelWidth = 1.0/scrProj[0].x;
								    const float pixelHeight = 1.0/scrProj[1].y;

								    return length( float2(pixelWidth, pixelHeight) );
								}


								// Forward declarations: both passes are defined after the kernel that calls them.

								#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS

								// Rebuilds coarseList with the lights whose bounding sphere overlaps the tile; returns the new count.
								int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate);

								#endif


								#ifdef FINE_PRUNING_ENABLED

								// Per-pixel pruning; the depth argument is a float4 when each thread owns exactly 4 pixels, an array otherwise.

								#if PIXEL_PER_THREAD == 4

								void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths);

								#else

								void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vLinDepths[PIXEL_PER_THREAD]);

								#endif

								#endif


								#ifdef USE_TWO_PASS_TILED_LIGHTING

								// Reads one 16-bit light index from the packed big-tile light list.
								//   lightStart  - index of the big tile.
								//   lightOffset - position of the light inside that big tile's list (0-based).
								uint FetchBigTileLightIndex(uint lightStart, uint lightOffset)
								{
								    // The first 16-bit slot of every big tile stores its light count, so light entries start at slot 1.
								    const uint slot = lightOffset + 1;

								    // Light indices are stored on 16 bits, two per uint: pick the dword, then the half-word inside it.
								    const uint dwordIndex = MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE * lightStart / 2 + (slot >> 1);
								    const uint bitShift = (slot & 1) * 16;

								    return (g_vBigTileLightList[dwordIndex] >> bitShift) & 0xffff;
								}

								#endif


								// Kernel entry point. One thread group builds the light list of one 16x16 pixel tile:
								//   1. compute the min/max depth of the tile (sky pixels excluded),
								//   2. coarse-cull lights by comparing their bounds against the tile bounds,
								//   3. optionally refine with a sphere/tile test and a per-pixel test,
								//   4. count the survivors per category, sort them, and write them to g_vLightList
								//      as 16-bit entries (count first), plus optional per-tile feature flags.
								//   dispatchThreadId - only .z is used, to select the XR view.
								//   threadID         - index of the thread inside the group.
								//   u3GroupID        - .xy is the tile coordinate.
								[numthreads(NR_THREADS, 1, 1)]

								void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)

								{

								    // NOTE(review): presumably sets unity_StereoEyeIndex from the dispatch z coordinate - confirm in TextureXR.hlsl.
								    UNITY_XR_ASSIGN_VIEW_INDEX(dispatchThreadId.z);

								    uint2 tileIDX = u3GroupID.xy;

								    uint t=threadID;

								    int i;


								    // Clear the final list; the threads of the group share the work.
								    UNITY_UNROLLX(LIGHT_LIST_MAX_COARSE_ENTRIES)

								    for(i=t; i<LIGHT_LIST_MAX_COARSE_ENTRIES; i+=NR_THREADS)

								        if(i<LIGHT_LIST_MAX_COARSE_ENTRIES)

								            prunedList[i]=0;


								    uint iWidth = g_viDimensions.x;

								    uint iHeight = g_viDimensions.y;

								    // Number of 16-pixel tiles needed to cover the screen (rounded up).
								    uint nrTilesX = (iWidth+15)/16;

								    uint nrTilesY = (iHeight+15)/16;


								    // build tile scr boundary

								    const uint uFltMax = 0x7f7fffff;  // FLT_MAX as a uint

								    // Thread 0 initializes the shared min/max depth and the coarse list cursor.
								    if(t==0)

								    {

								        ldsZMin = uFltMax;

								        ldsZMax = 0;

								        lightOffs = 0;

								    }


								    // Publish the initial values to the whole group before any atomic touches them.
								#if NR_THREADS > PLATFORM_LANE_COUNT

								    GroupMemoryBarrierWithGroupSync();

								#endif


								    // Pixel coordinate of the first pixel of the tile.
								    uint2 viTilLL = 16*tileIDX;


								    // establish min and max depth first

								    float dpt_mi=asfloat(uFltMax), dpt_ma=0.0;


								    // Linear depth of the pixels owned by this thread; handed to FinePruneLights later.
								#if PIXEL_PER_THREAD == 4

								    float4 vLinDepths;

								#else

								    float vLinDepths[PIXEL_PER_THREAD];

								#endif

								    {

								        // Fetch depths and calculate min/max

								        UNITY_UNROLL

								        for(i = 0; i < PIXEL_PER_THREAD; i++)

								        {

								            // idx enumerates the 16x16 tile row by row: low 4 bits = x, remaining bits = y.
								            int idx = i * NR_THREADS + t;

								            // Clamp to the last pixel so that partial tiles at the screen border read valid depth.
								            uint2 uCrd = min( uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1) );

								            const float fDepth = FetchDepth(uCrd);

								            vLinDepths[i] = GetLinearDepth(uCrd+float2(0.5,0.5), fDepth);

								            if(fDepth<VIEWPORT_SCALE_Z)     // if not skydome

								            {

								                dpt_mi = min(fDepth, dpt_mi);

								                dpt_ma = max(fDepth, dpt_ma);

								            }

								        }


								        // Reduce across the group; the float depths are compared through their bit patterns.
								        // NOTE(review): this assumes depths are non-negative so that uint ordering matches float ordering - confirm.
								        InterlockedMax(ldsZMax, asuint(dpt_ma));

								        InterlockedMin(ldsZMin, asuint(dpt_mi));


								        // For some platforms we always need GroupMemoryBarrierWithGroupSync() otherwise results are incorrect.

								        // Reason is under investigation, related discussions:

								        // https://unity.slack.com/archives/C02C8FWPNHE/p1704321597295329

								        // https://unity.slack.com/archives/G3JUQKYV8/p1705081617447289

								#if NR_THREADS > PLATFORM_LANE_COUNT || defined(SHADER_API_SWITCH) || defined(SHADER_API_SWITCH2)

								        GroupMemoryBarrierWithGroupSync();

								#endif

								    }


								    // Tile bounds: xy normalized by the screen size, z = min/max depth of the tile.
								    float3 vTileLL = float3(viTilLL.x/(float) iWidth, viTilLL.y/(float) iHeight, asfloat(ldsZMin));

								    float3 vTileUR = float3((viTilLL.x+16)/(float) iWidth, (viTilLL.y+16)/(float) iHeight, asfloat(ldsZMax));

								    vTileUR.xy = min(vTileUR.xy,float2(1.0,1.0)).xy;


								    // build coarse list using AABB

								#ifdef USE_TWO_PASS_TILED_LIGHTING

								    // Two-pass mode: only the lights of the enclosing 64x64 big tile are considered.
								    const uint log2BigTileToTileRatio = firstbithigh(64) - firstbithigh(16);


								    int NrBigTilesX = (nrTilesX + ((1 << log2BigTileToTileRatio) -1 )) >> log2BigTileToTileRatio;

								    int NrBigTilesY = (nrTilesY + ((1 << log2BigTileToTileRatio) - 1)) >> log2BigTileToTileRatio;

								    const int bigTileBase = unity_StereoEyeIndex * NrBigTilesX * NrBigTilesY;

								    const uint bigTileIdx = bigTileBase + (tileIDX.y>>log2BigTileToTileRatio)*NrBigTilesX + (tileIDX.x>>log2BigTileToTileRatio);       // map the idx to 64x64 tiles

								    // The low 16 bits of the first dword of a big tile hold its light count.
								    int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE * bigTileIdx / 2 + 0] & 0xFFFF;

								    for(int l0=(int) t; l0<(int) nrBigTileLights; l0 += NR_THREADS)

								    {

								        int l = FetchBigTileLightIndex(bigTileIdx, l0);

								#else

								    // Single-pass mode: every visible light is tested.
								    for(int l=(int) t; l<(int) g_iNrVisibLights; l += NR_THREADS)

								    {

								#endif

								        const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(l, g_iNrVisibLights, unity_StereoEyeIndex);

								        const float3 vMi = g_vBoundsBuffer[boundsIndices.min].xyz;

								        const float3 vMa = g_vBoundsBuffer[boundsIndices.max].xyz;


								        // Box overlap test between the light bounds and the tile bounds (depth included).
								        if( all(vMa>vTileLL) && all(vMi<vTileUR))

								        {

								            unsigned int uInc = 1;

								            unsigned int uIndex;

								            InterlockedAdd(lightOffs, uInc, uIndex);

								            // Lights beyond the list capacity are dropped.
								            if(uIndex<LIGHT_LIST_MAX_COARSE_ENTRIES) coarseList[uIndex] = l;        // add to light list

								        }

								    }


								#ifdef FINE_PRUNING_ENABLED

								    // Clear the per-light visibility bits used by FinePruneLights.
								    if(t<LIGHT_FPTL_VISIBILITY_DWORD_COUNTS) ldsDoesLightIntersect[t] = 0;

								#endif


								#if NR_THREADS > PLATFORM_LANE_COUNT

								    GroupMemoryBarrierWithGroupSync();

								#endif


								    // lightOffs may exceed the capacity because it is incremented even for dropped lights.
								    int iNrCoarseLights = min(lightOffs,LIGHT_LIST_MAX_COARSE_ENTRIES);


								#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS

								    // Sphere test against the tile, evaluated at the (screen-clamped) center pixel of the tile.
								    iNrCoarseLights = SphericalIntersectionTests( t, iNrCoarseLights, float2(min(viTilLL.xy+uint2(16/2,16/2), uint2(iWidth-1, iHeight-1))) );

								#endif


								#ifndef FINE_PRUNING_ENABLED

								    {

								        // Without fine pruning the coarse list is the final list.
								        UNITY_UNROLL

								        for(i=t; i<LIGHT_LIST_MAX_COARSE_ENTRIES; i+=NR_THREADS) if(i<iNrCoarseLights) prunedList[i] = coarseList[i];

								        if(t==0) ldsNrLightsFinal=iNrCoarseLights;

								    }

								#else

								    {

								        // initializes ldsNrLightsFinal with the number of accepted lights.

								        // all accepted entries delivered in prunedList[].

								        FinePruneLights(t, iNrCoarseLights, viTilLL, vLinDepths);

								    }

								#endif


								    // Reset the per-category counters (and the feature flags) before the counting pass.
								    if(t<CATEGORY_LIST_SIZE) ldsCategoryListCount[t]=0;

								#ifdef USE_FEATURE_FLAGS

								    if(t==0) ldsFeatureFlags=0;

								#endif


								    // Also makes the prunedList / ldsNrLightsFinal writes of the pruning pass visible to the group.
								#if NR_THREADS > PLATFORM_LANE_COUNT

								    GroupMemoryBarrierWithGroupSync();

								#endif


								    int nrLightsCombinedList = min(ldsNrLightsFinal,LIGHT_LIST_MAX_COARSE_ENTRIES);

								    // Count the surviving lights per category and accumulate their feature flags.
								    for(i=t; i<nrLightsCombinedList; i+=NR_THREADS)

								    {

								        const int lightBoundIndex = GenerateLightCullDataIndex(prunedList[i], g_iNrVisibLights, unity_StereoEyeIndex);


								        InterlockedAdd(ldsCategoryListCount[_LightVolumeData[lightBoundIndex].lightCategory], 1);

								#ifdef USE_FEATURE_FLAGS

								        InterlockedOr(ldsFeatureFlags, _LightVolumeData[lightBoundIndex].featureFlags);

								#endif

								    }


								    // sort lights (gives a more efficient execution in both deferred and tiled forward lighting).

								#if NR_THREADS > PLATFORM_LANE_COUNT

								    SORTLIST(prunedList, nrLightsCombinedList, LIGHT_LIST_MAX_COARSE_ENTRIES, t, NR_THREADS);

								    //MERGESORTLIST(prunedList, coarseList, nrLightsCombinedList, t, NR_THREADS);

								#endif


								#ifdef USE_FEATURE_FLAGS

								    // A single thread writes the feature flags of the tile.
								    if(t == 0)

								    {

								        uint featureFlags = ldsFeatureFlags | g_BaseFeatureFlags;

								        // In case of background: no pixel passed the sky test, so min/max still hold their initial values.

								        if(ldsZMax < ldsZMin)   // is background pixel

								        {

								            // There is no stencil usage with compute path, featureFlags set to 0 is use to have fast rejection of tile in this case. It will still execute but will do nothing

								            featureFlags = 0;

								        }


								        g_TileFeatureFlags[tileIDX.y * nrTilesX + tileIDX.x + unity_StereoEyeIndex * nrTilesX * nrTilesY] = featureFlags;

								    }

								#endif


								    // write lights to global buffers

								    // localOffs: start of the current category inside prunedList; offs: tile slot of the current category in g_vLightList.
								    int localOffs=0;

								    int offs = tileIDX.y*nrTilesX + tileIDX.x;


								#if defined(UNITY_STEREO_INSTANCING_ENABLED)

								    // Eye base offset must match code in GetCountAndStartTile()

								    offs += unity_StereoEyeIndex * nrTilesX * nrTilesY * LIGHTCATEGORY_COUNT;

								#endif


								    // All our cull data are in the same list, but at render time envLights are separated so we need to shift the index

								    // to make it work correctly

								    int shiftIndex[CATEGORY_LIST_SIZE];

								    ZERO_INITIALIZE_ARRAY(int, shiftIndex, CATEGORY_LIST_SIZE);


								    shiftIndex[LIGHTCATEGORY_ENV] = _EnvLightIndexShift;

								    shiftIndex[LIGHTCATEGORY_DECAL] = _DecalIndexShift;


								    for(int category=0; category<CATEGORY_LIST_SIZE; category++)

								    {

								        int nrLightsFinal = ldsCategoryListCount[category];

								        int nrLightsFinalClamped = nrLightsFinal<SHADEROPTIONS_FPTLMAX_LIGHT_COUNT ? nrLightsFinal : SHADEROPTIONS_FPTLMAX_LIGHT_COUNT;


								        // Entries are 16 bits wide: one for the count plus one per light, two entries per dword (rounded up).
								        const int nrDWords = ((nrLightsFinalClamped+1)+1)>>1;

								        for(int l=(int) t; l<(int) nrDWords; l += NR_THREADS)

								        {

								            // We remap the prunedList index to the original LightData / EnvLightData indices

								            // The low half of dword 0 is the light count; every other half-word is a shifted light index.
								            uint uLow = l==0 ? nrLightsFinalClamped : prunedList[max(0,2 * l - 1 + localOffs)] - shiftIndex[category];

								            uint uHigh = prunedList[2 * l + 0 + localOffs] - shiftIndex[category];


								            g_vLightList[LIGHT_DWORD_PER_FPTL_TILE*offs + l] = (uLow&0xffff) | (uHigh<<16);

								        }


								        // Advance by the unclamped count so that the next category starts at its own first light.
								        localOffs += nrLightsFinal;

								        // Each category occupies a full nrTilesX*nrTilesY block of tile slots.
								        offs += (nrTilesX*nrTilesY);

								    }

								}


								#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS

								// Second culling stage: keeps only the coarse lights whose bounding sphere overlaps the tile.
								// coarseList is rebuilt in place (prunedList serves as a temporary backup of the input).
								//   threadID         - index of the calling thread inside the group.
								//   iNrCoarseLights  - number of valid entries in coarseList on entry.
								//   screenCoordinate - pixel position at which the tile is evaluated.
								// Returns the number of entries left in coarseList.
								int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate)
								{
								    // Thread 0 resets the survivor cursor; the barrier below publishes it before it is incremented.
								    if (threadID == 0)
								        lightOffsSph = 0;

								    // Back up the input list so that coarseList can be overwritten with the survivors.
								    for (int src = threadID; src < iNrCoarseLights; src += NR_THREADS)
								        prunedList[src] = coarseList[src];

								#if NR_THREADS > PLATFORM_LANE_COUNT
								    GroupMemoryBarrierWithGroupSync();
								#endif

								    // View-space position of the tested pixel at unit distance (the sign depends on the handedness).
								#if USE_LEFT_HAND_CAMERA_SPACE
								    const float unitDepth = 1.0;
								#else
								    const float unitDepth = -1.0;
								#endif
								    const float3 tileDirAtUnitDepth = GetViewPosFromLinDepth( screenCoordinate, unitDepth);

								    // Half a tile is 8 pixels; expressed through the pixel diagonal at depth one.
								    const float halfTileSizeAtZDistOne = 8*GetOnePixDiagWorldDistAtDepthOne();

								    for (int entry = threadID; entry < iNrCoarseLights; entry += NR_THREADS)
								    {
								        // Always read from the backup: coarseList is being rewritten concurrently.
								        const uint candidate = prunedList[entry];

								        const int lightBoundIndex = GenerateLightCullDataIndex(candidate, g_iNrVisibLights, unity_StereoEyeIndex);
								        const SFiniteLightBound bound = g_data[lightBoundIndex];

								        if ( DoesSphereOverlapTile(tileDirAtUnitDepth, halfTileSizeAtZDistOne, bound.center.xyz, bound.radius, g_isOrthographic!=0) )
								        {
								            unsigned int slot;
								            InterlockedAdd(lightOffsSph, 1, slot);
								            coarseList[slot] = candidate;
								        }
								    }

								    // Make the rebuilt list and its count visible to every thread before returning.
								#if NR_THREADS > PLATFORM_LANE_COUNT
								    GroupMemoryBarrierWithGroupSync();
								#endif

								    return lightOffsSph;
								}

								#endif


								#ifdef FINE_PRUNING_ENABLED

								// Maps slot l of coarseList to the index of that light in the light cull data of the current eye.
								// Slots at or beyond iNrCoarseLights yield 0.
								int GetCoarseLightIndex(int l, int iNrCoarseLights)
								{
								    int cullDataIndex = 0;

								    if (l < iNrCoarseLights)
								        cullDataIndex = GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, unity_StereoEyeIndex);

								    return cullDataIndex;
								}


								// Group-shared cache with one packed uint per coarse list slot; the bit layout is defined by
								// StoreLightVolumeCache / LoadLightVolumeCache.
								groupshared uint s_lightVolumesCache[LIGHT_LIST_MAX_COARSE_ENTRIES];


								// Packs a light's volume type and cull-data index into slot lightIndex of s_lightVolumesCache.
								// Layout: bits 0..2 = volume type, bits 3..31 = coarse index.
								void StoreLightVolumeCache(int lightIndex, int coarseIndex, int volumeType)
								{
								    // Only 3 bits are kept for the type, so a corrupted value cannot spill into the index bits.
								    const uint typeBits = (uint)(volumeType & 0x7);

								    // The remaining 29 bits carry the coarse light index.
								    const uint indexBits = (uint)(coarseIndex << 3);

								    s_lightVolumesCache[lightIndex] = typeBits | indexBits;
								}


								// Unpacks slot lightIndex of s_lightVolumesCache.
								//   coarseIndex - receives bits 3..31 of the cached word.
								//   volumeType  - receives bits 0..2 of the cached word.
								void LoadLightVolumeCache(int lightIndex, out int coarseIndex, out int volumeType)
								{
								    const uint packed = s_lightVolumesCache[lightIndex];

								    volumeType = (int)(packed & 0x7);
								    coarseIndex = (int)(packed >> 3);
								}


								// Per-pixel ("fine") pruning of the coarse light list of a tile.
								//
								// Every thread tests each coarse light against the PIXEL_PER_THREAD pixels it owns and records
								// the lights that touch at least one pixel in a private bitmask. The bitmasks of all threads are
								// OR-ed into ldsDoesLightIntersect, and the lights whose bit is set are copied to prunedList[].
								//
								//   threadID        - index of the calling thread inside the group.
								//   iNrCoarseLights - number of valid entries in coarseList[].
								//   viTilLL         - pixel coordinate of the first pixel of the tile.
								//   vLinDepths      - linear depth of the pixels owned by this thread; entry i belongs to tile
								//                     pixel (threadID + i*NR_THREADS), enumerated row by row over 16 columns.
								//
								// On completion ldsNrLightsFinal holds the number of accepted lights and prunedList[] holds
								// their indices (in no particular order). No barrier is issued after these final writes, so
								// the caller must synchronize the group before reading them.
								#if PIXEL_PER_THREAD == 4
								void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths) // keep float4 vectorization when possible, as shader compiler may generate bad code for array of floats.
								#else
								void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vLinDepths[PIXEL_PER_THREAD])
								#endif
								{
								    uint t = threadID;
								    uint iWidth = g_viDimensions.x;
								    uint iHeight = g_viDimensions.y;

								    // Private visibility bitmask of this thread: one bit per coarse list slot.
								    uint uLightsFlags[LIGHT_FPTL_VISIBILITY_DWORD_COUNTS];
								    {
								        [unroll(LIGHT_FPTL_VISIBILITY_DWORD_COUNTS)]
								        for (uint ii = 0; ii < LIGHT_FPTL_VISIBILITY_DWORD_COUNTS; ++ii)
								            uLightsFlags[ii] = 0u;
								    }

								    int l=0;
								    // need this outer loop even on xb1 and ps4 since direct lights and
								    // reflection lights are kept in separate regions.

								    // Cache (cull-data index, volume type) of every coarse light in LDS, the threads sharing the work.
								    {
								        #define MAX_FINE_PRUNE_LOOP_CNT (((SHADEROPTIONS_FPTLMAX_LIGHT_COUNT+1) + NR_THREADS - 1)/NR_THREADS)
								        [unroll(MAX_FINE_PRUNE_LOOP_CNT)]
								        for (uint it = 0; it < MAX_FINE_PRUNE_LOOP_CNT; ++it)
								        {
								            uint i = t + it * NR_THREADS;
								            if (i < (uint)iNrCoarseLights)
								            {
								                int idxCoarse = GetCoarseLightIndex((int)i, iNrCoarseLights);
								                int uLightVolume = (int)_LightVolumeData[idxCoarse].lightVolume;
								                StoreLightVolumeCache(i, idxCoarse, uLightVolume);
								            }
								        }
								    }

								    // The cache must be complete before any thread starts reading it.
								#if NR_THREADS > PLATFORM_LANE_COUNT
								    GroupMemoryBarrierWithGroupSync();
								#endif

								    //When using LDS to cache the volume data, this produces the best most optimal code.
								    //Doing a manual loop like the one below adds an extra cost of .1 ms on ps4 if we use LDS.
								    for (; l < iNrCoarseLights; ++l)
								    {
								        int idxCoarse;
								        int uLightVolume;
								        LoadLightVolumeCache(l, idxCoarse, uLightVolume);

								        // WARNING: we use here a uint for lightValid because there is a bug with the unity vulkan compiler.
								        // If this is a bool, the second dword of uLightsFlags never gets written to, which causes light tile artifacts
								        // on tiles that have more than 32 lights.
								        uint lightValid = 0;
								        if (uLightVolume == LIGHTVOLUMETYPE_CONE)
								        {
								            LightVolumeData lightData = _LightVolumeData[idxCoarse];
								            const bool bIsSpotDisc = true; // (lightData.flags&IS_CIRCULAR_SPOT_SHAPE) != 0;
								            for(int i=0; i<PIXEL_PER_THREAD; i++)
								            {
								                int idx = t + i*NR_THREADS;

								                // Same pixel enumeration (and screen clamp) as the one used to fetch vLinDepths.
								                uint2 uPixLoc = min(uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1));
								                float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);

								                // check pixel
								                float3 fromLight = vVPos-lightData.lightPos.xyz;
								                float distSq = dot(fromLight,fromLight);
								                const float fSclProj = dot(fromLight, lightData.lightAxisZ.xyz);        // spotDir = lightData.lightAxisZ.xyz

								                float2 V = abs( float2( dot(fromLight, lightData.lightAxisX.xyz), dot(fromLight, lightData.lightAxisY.xyz) ) );

								                // Inside the range sphere AND inside the cone around the light's Z axis.
								                float fDist2D = bIsSpotDisc ? length(V) : max(V.x,V.y);
								                bool validInPixel = all( float2(lightData.radiusSq, fSclProj) > float2(distSq, fDist2D*lightData.cotan) );
								#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
								                //a wave is on the same tile, and the loop is uniform for the wave.
								                // thus we early out if at least 1 thread in the wave passed this light, saving some ALU.
								                lightValid = WaveActiveAnyTrue(validInPixel);
								#else
								                lightValid = validInPixel;
								#endif
								                if (lightValid)
								                    break;
								            }
								        }
								        else if (uLightVolume == LIGHTVOLUMETYPE_SPHERE)
								        {
								            LightVolumeData lightData = _LightVolumeData[idxCoarse];
								            for(int i=0; i<PIXEL_PER_THREAD; i++)
								            {
								                int idx = t + i*NR_THREADS;

								                uint2 uPixLoc = min(uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1));
								                float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);

								                // check pixel
								                float3 vLp = lightData.lightPos.xyz;
								                float3 toLight = vLp - vVPos;
								                float distSq = dot(toLight,toLight);

								                bool validInPixel = lightData.radiusSq>distSq;
								#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
								                lightValid = WaveActiveAnyTrue(validInPixel);
								#else
								                lightValid = validInPixel;
								#endif
								                if (lightValid)
								                    break;
								           }
								        }
								        else if (uLightVolume ==  LIGHTVOLUMETYPE_BOX)
								        {
								            LightVolumeData lightData = _LightVolumeData[idxCoarse];
								            for(int i=0; i<PIXEL_PER_THREAD; i++)
								            {
								                int idx = t + i*NR_THREADS;

								                uint2 uPixLoc = min(uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1));
								                float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);

								                // check pixel
								                float3 toLight  = lightData.lightPos.xyz - vVPos;

								                float3 dist = float3( dot(toLight, lightData.lightAxisX), dot(toLight, lightData.lightAxisY), dot(toLight, lightData.lightAxisZ) );
								                dist = (abs(dist) - lightData.boxInnerDist) * lightData.boxInvRange;        // not as efficient as it could be
								                bool validInPixel = max(max(dist.x, dist.y), dist.z)<1;                       // but allows us to not write out OuterDists
								#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
								                lightValid = WaveActiveAnyTrue(validInPixel);
								#else
								                lightValid = validInPixel;
								#endif
								                if (lightValid)
								                    break;
								            }
								        }
								        else
								            break;  // unknown volume type: stop testing the remaining lights

								        // Implicit division by 32, to pick the correct array index.
								        // E.g. the 37th light divided by 32 = 1.15 (rounded down to 1), so we pick uLightsFlags[1] (which represents the lights from 32 to 63).
								        uLightsFlags[l >> 5] |= lightValid << (l&31);
								    }

								    // Merge results from all threads into shared memory.
								    // `InterlockedOr` performs a bitwise OR between `ldsDoesLightIntersect` and `uLightsFlags`.
								    // This allows multiple threads to update `ldsDoesLightIntersect` without collision.
								    {
								        [unroll(LIGHT_FPTL_VISIBILITY_DWORD_COUNTS)]
								        for (uint ii = 0; ii < LIGHT_FPTL_VISIBILITY_DWORD_COUNTS; ++ii)
								            InterlockedOr(ldsDoesLightIntersect[ii], uLightsFlags[ii]);
								    }

								    // Reset the total number of lights for the tile.
								    // This MUST happen before the barrier below: the compaction pass increments ldsNrLightsFinal
								    // with InterlockedAdd from every thread, and without a barrier between the reset and those
								    // atomics another wave of the group could increment the counter (and write prunedList)
								    // before thread 0 clears it, losing or overwriting lights.
								    if (t == 0)
								        ldsNrLightsFinal = 0;

								    // For some platforms we always need GroupMemoryBarrierWithGroupSync() otherwise results are incorrect.
								    // Reason is under investigation, related discussions:
								    // https://unity.slack.com/archives/C02C8FWPNHE/p1704321597295329
								    // https://unity.slack.com/archives/G3JUQKYV8/p1705081617447289
								#if NR_THREADS > PLATFORM_LANE_COUNT || defined(SHADER_API_SWITCH) || defined(SHADER_API_SWITCH2)
								    GroupMemoryBarrierWithGroupSync();
								#endif

								    {
								        // Split the job into multiple passes to ensure all lights are processed even if NR_THREADS is smaller than SHADEROPTIONS_FPTLMAX_LIGHT_COUNT.
								        // A thread will possibly process many lights.
								        #define MAX_LIGHT_WRITE_LOOP_CNT (((SHADEROPTIONS_FPTLMAX_LIGHT_COUNT+1) + NR_THREADS - 1)/NR_THREADS)
								        [unroll(MAX_LIGHT_WRITE_LOOP_CNT)]
								        for (uint it = 0; it < MAX_LIGHT_WRITE_LOOP_CNT; ++it)
								        {
								            // Retrieve the light index for the current thread and current iteration.
								            uint lightIndex = t + it * NR_THREADS;

								            // Check if the mask of the current light is valid (intersection with the tile).
								            uint lightsMask = ldsDoesLightIntersect[lightIndex >> 5];

								            // Select only the current light bit i in the block of 32.
								            uint localMask = (1u << (lightIndex & 31));

								            // If the thread index is in the light list and the mask is valid.
								            if (lightIndex < (uint)iNrCoarseLights && (localMask & lightsMask) != 0u)
								            {
								                // ldsDoesLightIntersect[k] contains the valid lights for each block of 32 lights.
								                // We sum the number of enabled bits (countbits()) in all blocks before the block i.
								                // backOffset represents the number of valid lights before the block where i is located.
								                uint backOffset = 0;
								                [unroll(LIGHT_FPTL_VISIBILITY_DWORD_COUNTS)]
								                for (uint k = 0u; k < LIGHT_FPTL_VISIBILITY_DWORD_COUNTS; ++k)
								                    if (k < (lightIndex >> 5))
								                        backOffset += countbits(ldsDoesLightIntersect[k]);

								                // Count the number of valid lights (set bits) in the current 32-bit block before the current lightIndex.
								                uint lightsInActualBlock = countbits((localMask - 1u) & lightsMask);

								                // Rank of the current light among the accepted lights; only used as a capacity guard below.
								                uint uIndex = backOffset + lightsInActualBlock;

								                // Ensure the computed index does not exceed the max allowed light entries.
								                if (uIndex < LIGHT_LIST_MAX_COARSE_ENTRIES)
								                {
								                    // InterlockedAdd hands every accepted light a unique slot in prunedList, preventing threads from overwriting each other's values.
								                    // It also increments ldsNrLightsFinal (the total number of lights for the tile).
								                    unsigned int uInc = 1;
								                    unsigned int finalPrunedLightIndex;
								                    InterlockedAdd(ldsNrLightsFinal, uInc, finalPrunedLightIndex);

								                    // Add the light to the pruned list. If the index varies due to desynchronization (e.g. without the previous InterlockedAdd),
								                    // it can cause flickering (mostly seen on Metal and Apple GPUs).
								                    if (finalPrunedLightIndex < LIGHT_LIST_MAX_COARSE_ENTRIES)
								                        prunedList[finalPrunedLightIndex] = coarseList[lightIndex];
								                }
								            }
								        }
								    }
								}

								#endif