You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

601 lines
23 KiB

// The implementation is based on the demo on "fine pruned tiled lighting" published in GPU Pro 7.
// https://github.com/wolfgangfengel/GPU-Pro-7
#pragma kernel TileLightListGen
#pragma multi_compile _ USE_TWO_PASS_TILED_LIGHTING
#pragma multi_compile _ USE_FEATURE_FLAGS
#pragma multi_compile _ USE_OBLIQUE_MODE
//#pragma enable_d3d11_debug_symbols
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition-config/Runtime/ShaderConfig.cs.hlsl"
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/TextureXR.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/ShaderBase.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightingConvexHullUtils.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl"
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL) && !defined(SHADER_API_SWITCH) && !defined(SHADER_API_GAMECORE)
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/SortingComputeUtils.hlsl"
#endif
#pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch
#define FINE_PRUNING_ENABLED
#define PERFORM_SPHERICAL_INTERSECTION_TESTS
#define LIGHT_FPTL_VISIBILITY_DWORD_COUNTS (((SHADEROPTIONS_FPTLMAX_LIGHT_COUNT+1) + 31)/32)
StructuredBuffer<float4> g_vBoundsBuffer : register( t1 );
StructuredBuffer<LightVolumeData> _LightVolumeData : register(t2);
StructuredBuffer<SFiniteLightBound> g_data : register( t3 );
#ifdef USE_TWO_PASS_TILED_LIGHTING
StructuredBuffer<uint> g_vBigTileLightList : register( t4 ); // don't support Buffer yet in unity
#endif
#ifdef PLATFORM_LANE_COUNT // We can infer the size of a wave. This is currently not possible on non-consoles, so we have to fallback to a sensible default in those cases.
#define NR_THREADS PLATFORM_LANE_COUNT
#else
#define NR_THREADS 64 // default to 64 threads per group on other platforms..
#endif
#define PIXEL_PER_THREAD ((TILE_SIZE_FPTL*TILE_SIZE_FPTL) / NR_THREADS) // 8 or 4
// output buffer
RWStructuredBuffer<uint> g_vLightList : register( u0 ); // don't support RWBuffer yet in unity
#define CATEGORY_LIST_SIZE LIGHTCATEGORY_COUNT
groupshared unsigned int coarseList[LIGHT_LIST_MAX_COARSE_ENTRIES];
groupshared unsigned int prunedList[LIGHT_LIST_MAX_COARSE_ENTRIES]; // temporarily support room for all 64 while in LDS
groupshared uint ldsZMin;
groupshared uint ldsZMax;
groupshared uint lightOffs;
#ifdef FINE_PRUNING_ENABLED
groupshared uint ldsDoesLightIntersect[LIGHT_FPTL_VISIBILITY_DWORD_COUNTS];
#endif
groupshared int ldsNrLightsFinal;
groupshared int ldsCategoryListCount[CATEGORY_LIST_SIZE];
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
groupshared uint lightOffsSph;
#endif
#ifdef USE_FEATURE_FLAGS
groupshared uint ldsFeatureFlags;
RWStructuredBuffer<uint> g_TileFeatureFlags;
#endif
// Maps a depth-buffer sample to the depth produced by the inverse screen projection of the
// current eye (g_mInvScrProjectionArr[unity_StereoEyeIndex]).
//   pixXY        : pixel coordinate of the sample; only consumed when USE_OBLIQUE_MODE is defined.
//   zDptBufSpace : depth-buffer value (original note: 0 is near, 1 is far).
// Returns the z/w ratio of the un-projected point.
float GetLinearDepth(float2 pixXY, float zDptBufSpace)
{
    float4x4 invScrProj = g_mInvScrProjectionArr[unity_StereoEyeIndex];
#ifdef USE_OBLIQUE_MODE
    // Oblique projections need the full matrix product, so x and y take part as well.
    float4 unprojected = mul(invScrProj, float4(pixXY, zDptBufSpace, 1.0));
    return unprojected.z / unprojected.w;
#else
    // Only the z/w columns of rows 2 and 3 are evaluated here. For a perspective projection
    // zRow.x is zero and zRow.y is +1/-1 (left/right handed), but the general form is kept
    // so that orthographic projections work too.
    float2 zRow = invScrProj[2].zw;
    float2 wRow = invScrProj[3].zw;
    return (zRow.x*zDptBufSpace+zRow.y) / (wRow.x*zDptBufSpace+wRow.y);
#endif
}
// Rebuilds a position from a screen coordinate and a linear depth by inverting the scale and
// offset terms of the current eye's screen projection (g_mScrProjectionArr).
//   v2ScrPos  : screen coordinate (callers in this file pass pixel centres).
//   fLinDepth : linear depth, returned unchanged as the z component.
// For a perspective camera the xy result is scaled by fLinDepth; for an orthographic camera it is not.
float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth)
{
    float4x4 scrProj = g_mScrProjectionArr[unity_StereoEyeIndex];
    bool ortho = g_isOrthographic!=0;

    // Scale sits on the diagonal; the offset is read from column w (orthographic) or column z (perspective).
    float2 scale = float2(scrProj[0].x, scrProj[1].y);
    float2 offset = float2(scrProj[0].z, scrProj[1].z);
    if (ortho)
    {
        offset = float2(scrProj[0].w, scrProj[1].w);
    }

    // Sign applied to the screen coordinate: +1 for the left-handed variant, which is also
    // always used for orthographic cameras, and -1 otherwise.
#if USE_LEFT_HAND_CAMERA_SPACE
    bool leftHanded = true;
#else
    bool leftHanded = ortho;
#endif
    float handSign = leftHanded ? 1 : (-1);

    float2 planePos = float2( (handSign*v2ScrPos.x-offset.x)/scale.x, (handSign*v2ScrPos.y-offset.y)/scale.y);
    if (!ortho)
    {
        planePos = fLinDepth*planePos;
    }
    return float3(planePos, fLinDepth);
}
// Returns the length of the vector (1/Sx, 1/Sy), where Sx and Sy are the diagonal scale terms
// of the current eye's screen projection - per the function name, the diagonal of one pixel
// measured at depth one.
float GetOnePixDiagWorldDistAtDepthOne()
{
    float4x4 scrProj = g_mScrProjectionArr[unity_StereoEyeIndex];
    float2 pixelExtent = float2(1.0/scrProj[0].x, 1.0/scrProj[1].y);
    return length(pixelExtent);
}
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate);
#endif
#ifdef FINE_PRUNING_ENABLED
#if PIXEL_PER_THREAD == 4
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths);
#else
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vLinDepths[PIXEL_PER_THREAD]);
#endif
#endif
#ifdef USE_TWO_PASS_TILED_LIGHTING
// Reads one 16-bit light index from the big-tile light list.
//   lightStart  : index of the big tile whose record is read (each record spans
//                 MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE 16-bit slots, i.e. half as many dwords).
//   lightOffset : zero-based position of the light inside that record.
uint FetchBigTileLightIndex(uint lightStart, uint lightOffset)
{
    // Slot 0 of every record is reserved for the light count, so light entries begin at slot 1.
    const uint slot = lightOffset + 1;
    // Two 16-bit slots are packed per dword: even slots in the low half, odd slots in the high half.
    const uint recordBase = MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE * lightStart / 2;
    const uint packedPair = g_vBigTileLightList[recordBase + (slot >> 1)];
    const uint halfShift = (slot & 1) * 16;
    return (packedPair >> halfShift) & 0xffff;
}
#endif
// Kernel entry point: one thread group of NR_THREADS threads builds the light list of one
// 16x16-pixel screen tile.
//   dispatchThreadId : only its z component is used, forwarded to UNITY_XR_ASSIGN_VIEW_INDEX.
//   threadID         : index of the thread inside its group (SV_GroupIndex).
//   u3GroupID        : xy selects the tile processed by this group.
// Stages, in order:
//   1. clear prunedList and the shared depth/offset counters,
//   2. reduce the tile's min/max depth-buffer value across the group,
//   3. gather every light whose screen-space bounds overlap the tile into coarseList,
//   4. refine the list (SphericalIntersectionTests, then FinePruneLights),
//   5. count the survivors per light category (and OR their feature flags),
//   6. write the packed per-category lists to g_vLightList and, when USE_FEATURE_FLAGS is
//      defined, the tile's flags to g_TileFeatureFlags.
// The group barriers are only compiled in when NR_THREADS > PLATFORM_LANE_COUNT (or on Switch
// where noted).
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
UNITY_XR_ASSIGN_VIEW_INDEX(dispatchThreadId.z);
uint2 tileIDX = u3GroupID.xy;
uint t=threadID;
int i;
// Zero the whole prunedList cooperatively (each thread clears a strided subset).
UNITY_UNROLLX(LIGHT_LIST_MAX_COARSE_ENTRIES)
for(i=t; i<LIGHT_LIST_MAX_COARSE_ENTRIES; i+=NR_THREADS)
if(i<LIGHT_LIST_MAX_COARSE_ENTRIES)
prunedList[i]=0;
uint iWidth = g_viDimensions.x;
uint iHeight = g_viDimensions.y;
// Number of 16-pixel tiles needed to cover the screen, rounded up.
// NOTE(review): the tile size is hard-coded as 16 here and below while PIXEL_PER_THREAD is
// derived from TILE_SIZE_FPTL - presumably TILE_SIZE_FPTL == 16; confirm.
uint nrTilesX = (iWidth+15)/16;
uint nrTilesY = (iHeight+15)/16;
// build tile scr boundary
const uint uFltMax = 0x7f7fffff; // FLT_MAX as a uint
// Thread 0 seeds the shared reduction values: min starts at FLT_MAX, max at 0, and the
// coarse-list write cursor at 0.
if(t==0)
{
ldsZMin = uFltMax;
ldsZMax = 0;
lightOffs = 0;
}
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
// Pixel coordinate of the tile's minimum corner.
uint2 viTilLL = 16*tileIDX;
// establish min and max depth first
float dpt_mi=asfloat(uFltMax), dpt_ma=0.0;
// Linear depth of each pixel handled by this thread; kept for FinePruneLights.
#if PIXEL_PER_THREAD == 4
float4 vLinDepths;
#else
float vLinDepths[PIXEL_PER_THREAD];
#endif
{
// Fetch depths and calculate min/max
UNITY_UNROLL
for(i = 0; i < PIXEL_PER_THREAD; i++)
{
// idx enumerates the tile's pixels: low 4 bits are the x offset, the remaining bits the y offset.
int idx = i * NR_THREADS + t;
// Clamp to the last valid pixel so tiles overhanging the screen edge re-read border pixels.
uint2 uCrd = min( uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1) );
const float fDepth = FetchDepth(uCrd);
vLinDepths[i] = GetLinearDepth(uCrd+float2(0.5,0.5), fDepth);
if(fDepth<VIEWPORT_SCALE_Z) // if not skydome
{
dpt_mi = min(fDepth, dpt_mi);
dpt_ma = max(fDepth, dpt_ma);
}
}
// Group-wide min/max through integer atomics on the float bit patterns.
// NOTE(review): this relies on the depth values being non-negative, so that unsigned integer
// ordering of the bits matches float ordering - confirm.
InterlockedMax(ldsZMax, asuint(dpt_ma));
InterlockedMin(ldsZMin, asuint(dpt_mi));
#if NR_THREADS > PLATFORM_LANE_COUNT || defined(SHADER_API_SWITCH) // not sure why Switch needs the barrier (it will not be correct without)
GroupMemoryBarrierWithGroupSync();
#endif
}
// Tile bounding box: xy as a fraction of the screen size, z as the raw depth range reduced above.
float3 vTileLL = float3(viTilLL.x/(float) iWidth, viTilLL.y/(float) iHeight, asfloat(ldsZMin));
float3 vTileUR = float3((viTilLL.x+16)/(float) iWidth, (viTilLL.y+16)/(float) iHeight, asfloat(ldsZMax));
vTileUR.xy = min(vTileUR.xy,float2(1.0,1.0)).xy;
// build coarse list using AABB
#ifdef USE_TWO_PASS_TILED_LIGHTING
// Two-pass mode: only the lights already listed for the enclosing 64x64 big tile are tested.
// log2(64) - log2(16) = 2, i.e. a big tile spans 4x4 regular tiles.
const uint log2BigTileToTileRatio = firstbithigh(64) - firstbithigh(16);
int NrBigTilesX = (nrTilesX + ((1 << log2BigTileToTileRatio) -1 )) >> log2BigTileToTileRatio;
int NrBigTilesY = (nrTilesY + ((1 << log2BigTileToTileRatio) - 1)) >> log2BigTileToTileRatio;
const int bigTileBase = unity_StereoEyeIndex * NrBigTilesX * NrBigTilesY;
const uint bigTileIdx = bigTileBase + (tileIDX.y>>log2BigTileToTileRatio)*NrBigTilesX + (tileIDX.x>>log2BigTileToTileRatio); // map the idx to 64x64 tiles
// The low 16 bits of the record's first dword hold the big tile's light count.
int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE * bigTileIdx / 2 + 0] & 0xFFFF;
for(int l0=(int) t; l0<(int) nrBigTileLights; l0 += NR_THREADS)
{
int l = FetchBigTileLightIndex(bigTileIdx, l0);
#else
// Single-pass mode: every visible light is tested, strided across the group's threads.
for(int l=(int) t; l<(int) g_iNrVisibLights; l += NR_THREADS)
{
#endif
const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(l, g_iNrVisibLights, unity_StereoEyeIndex);
const float3 vMi = g_vBoundsBuffer[boundsIndices.min].xyz;
const float3 vMa = g_vBoundsBuffer[boundsIndices.max].xyz;
// Box-vs-box overlap test between the light bounds and the tile bounds (xy and depth).
if( all(vMa>vTileLL) && all(vMi<vTileUR))
{
unsigned int uInc = 1;
unsigned int uIndex;
// Reserve a slot atomically. lightOffs keeps counting past the capacity, but entries
// beyond LIGHT_LIST_MAX_COARSE_ENTRIES are dropped.
InterlockedAdd(lightOffs, uInc, uIndex);
if(uIndex<LIGHT_LIST_MAX_COARSE_ENTRIES) coarseList[uIndex] = l; // add to light list
}
}
#ifdef FINE_PRUNING_ENABLED
// Clear the per-light visibility bitmask that FinePruneLights accumulates into.
if(t<LIGHT_FPTL_VISIBILITY_DWORD_COUNTS) ldsDoesLightIntersect[t] = 0;
#endif
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
// Number of usable coarse entries (the cursor may have overshot the capacity).
int iNrCoarseLights = min(lightOffs,LIGHT_LIST_MAX_COARSE_ENTRIES);
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
// Rebuilds coarseList in place using a sphere test around the tile centre (clamped to the screen).
iNrCoarseLights = SphericalIntersectionTests( t, iNrCoarseLights, float2(min(viTilLL.xy+uint2(16/2,16/2), uint2(iWidth-1, iHeight-1))) );
#endif
#ifndef FINE_PRUNING_ENABLED
{
// Without fine pruning the coarse list is the final list.
for(i=t; i<LIGHT_LIST_MAX_COARSE_ENTRIES; i+=NR_THREADS) if(i<iNrCoarseLights) prunedList[i] = coarseList[i];
if(t==0) ldsNrLightsFinal=iNrCoarseLights;
}
#else
{
// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
FinePruneLights(t, iNrCoarseLights, viTilLL, vLinDepths);
}
#endif
// Reset the per-category counters (and feature flags) before accumulating them below.
if(t<CATEGORY_LIST_SIZE) ldsCategoryListCount[t]=0;
#ifdef USE_FEATURE_FLAGS
if(t==0) ldsFeatureFlags=0;
#endif
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
int nrLightsCombinedList = min(ldsNrLightsFinal,LIGHT_LIST_MAX_COARSE_ENTRIES);
// Histogram the surviving lights by category and merge their feature flags.
for(i=t; i<nrLightsCombinedList; i+=NR_THREADS)
{
const int lightBoundIndex = GenerateLightCullDataIndex(prunedList[i], g_iNrVisibLights, unity_StereoEyeIndex);
InterlockedAdd(ldsCategoryListCount[_LightVolumeData[lightBoundIndex].lightCategory], 1);
#ifdef USE_FEATURE_FLAGS
InterlockedOr(ldsFeatureFlags, _LightVolumeData[lightBoundIndex].featureFlags);
#endif
}
// sort lights (gives a more efficient execution in both deferred and tiled forward lighting).
#if NR_THREADS > PLATFORM_LANE_COUNT
SORTLIST(prunedList, nrLightsCombinedList, LIGHT_LIST_MAX_COARSE_ENTRIES, t, NR_THREADS);
//MERGESORTLIST(prunedList, coarseList, nrLightsCombinedList, t, NR_THREADS);
#endif
#ifdef USE_FEATURE_FLAGS
if(t == 0)
{
uint featureFlags = ldsFeatureFlags | g_BaseFeatureFlags;
// ldsZMax < ldsZMin can only hold when both still carry their initial values (0 and FLT_MAX),
// i.e. no pixel of the tile passed the "not skydome" depth test above.
if(ldsZMax < ldsZMin) // is background pixel
{
// There is no stencil usage with compute path, featureFlags set to 0 is use to have fast rejection of tile in this case. It will still execute but will do nothing
featureFlags = 0;
}
g_TileFeatureFlags[tileIDX.y * nrTilesX + tileIDX.x + unity_StereoEyeIndex * nrTilesX * nrTilesY] = featureFlags;
}
#endif
// write lights to global buffers
// localOffs: start of the current category inside prunedList.
// offs:      tile slot inside g_vLightList, advanced by one full screen of tiles per category.
int localOffs=0;
int offs = tileIDX.y*nrTilesX + tileIDX.x;
#if defined(UNITY_STEREO_INSTANCING_ENABLED)
// Eye base offset must match code in GetCountAndStartTile()
offs += unity_StereoEyeIndex * nrTilesX * nrTilesY * LIGHTCATEGORY_COUNT;
#endif
// All our cull data are in the same list, but at render time envLights are separated so we need to shift the index
// to make it work correctly
int shiftIndex[CATEGORY_LIST_SIZE];
ZERO_INITIALIZE_ARRAY(int, shiftIndex, CATEGORY_LIST_SIZE);
shiftIndex[LIGHTCATEGORY_ENV] = _EnvLightIndexShift;
shiftIndex[LIGHTCATEGORY_DECAL] = _DecalIndexShift;
// NOTE(review): the category ranges are addressed as consecutive runs of prunedList, which
// presumably relies on the sort above grouping entries by category - confirm.
for(int category=0; category<CATEGORY_LIST_SIZE; category++)
{
int nrLightsFinal = ldsCategoryListCount[category];
int nrLightsFinalClamped = nrLightsFinal<SHADEROPTIONS_FPTLMAX_LIGHT_COUNT ? nrLightsFinal : SHADEROPTIONS_FPTLMAX_LIGHT_COUNT;
// One 16-bit slot for the count plus one per light, two slots per dword, rounded up.
const int nrDWords = ((nrLightsFinalClamped+1)+1)>>1;
for(int l=(int) t; l<(int) nrDWords; l += NR_THREADS)
{
// We remap the prunedList index to the original LightData / EnvLightData indices
// Dword 0 carries the light count in its low half; every other low half carries light 2*l-1.
// max(0, ...) only keeps the index non-negative for l==0, where that operand is not selected.
uint uLow = l==0 ? nrLightsFinalClamped : prunedList[max(0,2 * l - 1 + localOffs)] - shiftIndex[category];
// High half carries light 2*l.
// NOTE(review): when the clamped count is even, the last dword's high half is read from the
// entry just past this category's range; presumably ignored by readers since it lies beyond
// the stored count - confirm.
uint uHigh = prunedList[2 * l + 0 + localOffs] - shiftIndex[category];
g_vLightList[LIGHT_DWORD_PER_FPTL_TILE*offs + l] = (uLow&0xffff) | (uHigh<<16);
}
// Advance by the unclamped count so the next category starts at its own first entry.
localOffs += nrLightsFinal;
offs += (nrTilesX*nrTilesY);
}
}
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
// Rebuilds coarseList in place, keeping only the lights for which DoesSphereOverlapTile accepts
// the light's bounding sphere against the tile.
//   threadID         : index of the calling thread inside its group.
//   iNrCoarseLights  : number of valid entries currently in coarseList.
//   screenCoordinate : screen position of the tile centre.
// Returns the number of entries left in coarseList. prunedList is used as scratch storage and
// holds a copy of the incoming coarse list afterwards.
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate)
{
    // Thread 0 resets the shared output cursor.
    if (threadID == 0)
    {
        lightOffsSph = 0;
    }

    // Back up the incoming coarse list so that coarseList can be overwritten while it is filtered.
    int entry;
    for (entry = threadID; entry < iNrCoarseLights; entry += NR_THREADS)
    {
        prunedList[entry] = coarseList[entry];
    }

#if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
#endif

    // Position of the tile centre at unit distance; the sign of the depth follows the camera-space handedness.
#if USE_LEFT_HAND_CAMERA_SPACE
    float3 tileCentreAtUnitDepth = GetViewPosFromLinDepth( screenCoordinate, 1.0);
#else
    float3 tileCentreAtUnitDepth = GetViewPosFromLinDepth( screenCoordinate, -1.0);
#endif

    // Half a tile is 8 pixels, measured with the one-pixel diagonal at depth one.
    float pixelDiagonal = GetOnePixDiagWorldDistAtDepthOne();
    float halfTileExtent = 8*pixelDiagonal;

    for (entry = threadID; entry < iNrCoarseLights; entry += NR_THREADS)
    {
        const int cullDataIndex = GenerateLightCullDataIndex(prunedList[entry], g_iNrVisibLights, unity_StereoEyeIndex);
        SFiniteLightBound bound = g_data[cullDataIndex];

        if( DoesSphereOverlapTile(tileCentreAtUnitDepth, halfTileExtent, bound.center.xyz, bound.radius, g_isOrthographic!=0) )
        {
            // Reserve the next output slot atomically; the source is the backup copy, because
            // coarseList is being overwritten concurrently by other threads.
            unsigned int outSlot;
            InterlockedAdd(lightOffsSph, 1, outSlot);
            coarseList[outSlot]=prunedList[entry];
        }
    }

#if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
#endif

    return lightOffsSph;
}
#endif
#ifdef FINE_PRUNING_ENABLED
// Translates coarse-list slot l into a light cull-data index for the current eye.
// Slots at or beyond iNrCoarseLights map to index 0.
int GetCoarseLightIndex(int l, int iNrCoarseLights)
{
    if (l >= iNrCoarseLights)
    {
        return 0;
    }
    return GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, unity_StereoEyeIndex);
}
groupshared uint s_lightVolumesCache[LIGHT_LIST_MAX_COARSE_ENTRIES];
// Packs a (coarseIndex, volumeType) pair into entry lightIndex of s_lightVolumesCache.
// Layout: bits 0..2 hold the volume type, bits 3..31 hold the coarse light index. Three bits
// are enough for the type; a corrupted value simply fails to match any known volume type.
void StoreLightVolumeCache(int lightIndex, int coarseIndex, int volumeType)
{
    const uint typeBits = (uint)(volumeType & 0x7);
    const uint indexBits = (uint)(coarseIndex << 3);
    s_lightVolumesCache[lightIndex] = typeBits | indexBits;
}
// Unpacks entry lightIndex of s_lightVolumesCache (inverse of StoreLightVolumeCache):
// the low 3 bits are returned in volumeType, the remaining upper bits in coarseIndex.
void LoadLightVolumeCache(int lightIndex, out int coarseIndex, out int volumeType)
{
    const uint packedEntry = s_lightVolumesCache[lightIndex];
    volumeType = (int)(packedEntry & 0x7);
    coarseIndex = (int)(packedEntry >> 3);
}
// Per-pixel refinement of the coarse light list.
// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
//   threadID        : index of the calling thread inside its group.
//   iNrCoarseLights : number of valid entries in coarseList.
//   viTilLL         : pixel coordinate of the tile's minimum corner.
//   vLinDepths      : linear depths of this thread's pixels; element i belongs to tile pixel
//                     idx = threadID + i*NR_THREADS (x offset = idx & 0xf, y offset = idx >> 4).
// A light is accepted when at least one tested pixel lies inside its volume (cone, sphere or
// box); accepted lights keep their relative coarseList order in prunedList.
#if PIXEL_PER_THREAD == 4
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths) // keep float4 vectorization when possible, as shader compiler may generate bad code for array of floats.
#else
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vLinDepths[PIXEL_PER_THREAD])
#endif
{
uint t = threadID;
uint iWidth = g_viDimensions.x;
uint iHeight = g_viDimensions.y;
// Per-thread bitmask: bit l is set when coarse light l touches one of this thread's pixels.
// NOTE(review): indexed by l >> 5 for l < iNrCoarseLights, so this assumes
// iNrCoarseLights <= 32 * LIGHT_FPTL_VISIBILITY_DWORD_COUNTS - confirm against the config.
uint uLightsFlags[LIGHT_FPTL_VISIBILITY_DWORD_COUNTS];
{
[unroll(LIGHT_FPTL_VISIBILITY_DWORD_COUNTS)]
for (uint ii = 0; ii < LIGHT_FPTL_VISIBILITY_DWORD_COUNTS; ++ii)
uLightsFlags[ii] = 0u;
}
int l=0;
// need this outer loop even on xb1 and ps4 since direct lights and
// reflection lights are kept in separate regions.
{
// Phase 1: cooperatively cache, for every coarse light, its cull-data index and volume type
// in s_lightVolumesCache so that the per-light loop below reads them from LDS.
#define MAX_FINE_PRUNE_LOOP_CNT (((SHADEROPTIONS_FPTLMAX_LIGHT_COUNT+1) + NR_THREADS - 1)/NR_THREADS)
[unroll(MAX_FINE_PRUNE_LOOP_CNT)]
for (uint it = 0; it < MAX_FINE_PRUNE_LOOP_CNT; ++it)
{
uint i = t + it * NR_THREADS;
if (i < (uint)iNrCoarseLights)
{
int idxCoarse = GetCoarseLightIndex((int)i, iNrCoarseLights);
int uLightVolume = (int)_LightVolumeData[idxCoarse].lightVolume;
StoreLightVolumeCache(i, idxCoarse, uLightVolume);
}
}
}
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
// Phase 2: every thread walks all coarse lights and tests each one against its own pixels.
//When using LDS to cache the volume data, this produces the best most optimal code.
//Doing a manual loop like the one below adds an extra cost of .1 ms on ps4 if we use LDS.
for (; l < iNrCoarseLights; ++l)
{
int idxCoarse;
int uLightVolume;
LoadLightVolumeCache(l, idxCoarse, uLightVolume);
// WARNING: we use here a uint for lightValid because there is a bug with the unity vulkan compiler.
// If this is a bool, the second dword of uLightsFlags never gets written to, which causes light tile artifacts
// on tiles that have more than 32 lights.
uint lightValid = 0;
if (uLightVolume == LIGHTVOLUMETYPE_CONE)
{
LightVolumeData lightData = _LightVolumeData[idxCoarse];
const bool bIsSpotDisc = true; // (lightData.flags&IS_CIRCULAR_SPOT_SHAPE) != 0;
for(int i=0; i<PIXEL_PER_THREAD; i++)
{
int idx = t + i*NR_THREADS;
uint2 uPixLoc = min(uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1));
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);
// check pixel
float3 fromLight = vVPos-lightData.lightPos.xyz;
float distSq = dot(fromLight,fromLight);
const float fSclProj = dot(fromLight, lightData.lightAxisZ.xyz); // spotDir = lightData.lightAxisZ.xyz
float2 V = abs( float2( dot(fromLight, lightData.lightAxisX.xyz), dot(fromLight, lightData.lightAxisY.xyz) ) );
float fDist2D = bIsSpotDisc ? length(V) : max(V.x,V.y);
// Inside the cone when the pixel is within the light's radius and its projection on the
// spot axis exceeds the lateral distance scaled by cotan.
bool validInPixel = all( float2(lightData.radiusSq, fSclProj) > float2(distSq, fDist2D*lightData.cotan) );
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
//a wave is on the same tile, and the loop is uniform for the wave.
// thus we early out if at least 1 thread in the wave passed this light, saving some ALU.
lightValid = WaveActiveAnyTrue(validInPixel);
#else
lightValid = validInPixel;
#endif
if (lightValid)
break;
}
}
else if (uLightVolume == LIGHTVOLUMETYPE_SPHERE)
{
LightVolumeData lightData = _LightVolumeData[idxCoarse];
for(int i=0; i<PIXEL_PER_THREAD; i++)
{
int idx = t + i*NR_THREADS;
uint2 uPixLoc = min(uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1));
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);
// check pixel
float3 vLp = lightData.lightPos.xyz;
float3 toLight = vLp - vVPos;
float distSq = dot(toLight,toLight);
// Inside the sphere when the squared distance to the light is below radiusSq.
bool validInPixel = lightData.radiusSq>distSq;
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
lightValid = WaveActiveAnyTrue(validInPixel);
#else
lightValid = validInPixel;
#endif
if (lightValid)
break;
}
}
else if (uLightVolume == LIGHTVOLUMETYPE_BOX)
{
LightVolumeData lightData = _LightVolumeData[idxCoarse];
for(int i=0; i<PIXEL_PER_THREAD; i++)
{
int idx = t + i*NR_THREADS;
uint2 uPixLoc = min(uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1));
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);
// check pixel
float3 toLight = lightData.lightPos.xyz - vVPos;
// Offset expressed along the light's three axes.
float3 dist = float3( dot(toLight, lightData.lightAxisX), dot(toLight, lightData.lightAxisY), dot(toLight, lightData.lightAxisZ) );
dist = (abs(dist) - lightData.boxInnerDist) * lightData.boxInvRange; // not as efficient as it could be
bool validInPixel = max(max(dist.x, dist.y), dist.z)<1; // but allows us to not write out OuterDists
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
lightValid = WaveActiveAnyTrue(validInPixel);
#else
lightValid = validInPixel;
#endif
if (lightValid)
break;
}
}
else
// Unknown volume type: stop processing the remaining coarse lights for this thread.
break;
// Record the outcome in bit l of the per-thread mask.
uLightsFlags[l >> 5] |= lightValid << (l&31);
}
{
// Phase 3: merge the per-thread masks into the group-shared mask.
[unroll(LIGHT_FPTL_VISIBILITY_DWORD_COUNTS)]
for (uint ii = 0; ii < LIGHT_FPTL_VISIBILITY_DWORD_COUNTS; ++ii)
InterlockedOr(ldsDoesLightIntersect[ii], uLightsFlags[ii]);
}
#if NR_THREADS > PLATFORM_LANE_COUNT || defined(SHADER_API_SWITCH) // not sure why Switch needs the barrier (it will not be correct without)
GroupMemoryBarrierWithGroupSync();
#endif
{
// Phase 4: count the accepted lights and compact them into prunedList.
uint localCount = 0;
[unroll(LIGHT_FPTL_VISIBILITY_DWORD_COUNTS)]
for (uint ii = 0; ii < LIGHT_FPTL_VISIBILITY_DWORD_COUNTS; ++ii)
localCount += countbits(ldsDoesLightIntersect[ii]);
// Every thread computes the same total; thread 0 publishes it.
if (t == 0) ldsNrLightsFinal = localCount;
#define MAX_LIGHT_WRITE_LOOP_CNT (((SHADEROPTIONS_FPTLMAX_LIGHT_COUNT+1) + NR_THREADS - 1)/NR_THREADS)
[unroll(MAX_LIGHT_WRITE_LOOP_CNT)]
for (uint it = 0; it < MAX_LIGHT_WRITE_LOOP_CNT; ++it)
{
// Each thread handles coarse slot i and checks whether its bit is set in the shared mask.
uint i = t + it * NR_THREADS;
uint lightsMask = ldsDoesLightIntersect[i >> 5];
uint localMask = (1u << (i & 31));
if(i<(uint) iNrCoarseLights && (localMask & lightsMask) != 0u)
{
// Output position = number of accepted lights with a lower slot index:
// all set bits of the preceding mask dwords plus the set bits below bit i in its own dword.
uint backOffset = 0;
[unroll(LIGHT_FPTL_VISIBILITY_DWORD_COUNTS)]
for (uint k = 0u; k < LIGHT_FPTL_VISIBILITY_DWORD_COUNTS; ++k)
if (k < (i >> 5))
backOffset += countbits(ldsDoesLightIntersect[k]);
uint uIndex = backOffset + countbits((localMask - 1u) & lightsMask);
if(uIndex<LIGHT_LIST_MAX_COARSE_ENTRIES) prunedList[uIndex] = coarseList[i];
}
}
}
}
#endif