// The implementation is based on the demo on "fine pruned tiled lighting" published in GPU Pro 7. // https://github.com/wolfgangfengel/GPU-Pro-7 #pragma kernel TileLightListGen #pragma multi_compile _ USE_TWO_PASS_TILED_LIGHTING #pragma multi_compile _ USE_FEATURE_FLAGS #pragma multi_compile _ USE_OBLIQUE_MODE //#pragma enable_d3d11_debug_symbols #include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl" #include "Packages/com.unity.render-pipelines.high-definition-config/Runtime/ShaderConfig.cs.hlsl" #include "Packages/com.unity.render-pipelines.core/ShaderLibrary/TextureXR.hlsl" #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/ShaderBase.hlsl" #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl" #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightingConvexHullUtils.hlsl" #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl" #if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL) && !defined(SHADER_API_SWITCH) && !defined(SHADER_API_GAMECORE) #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/SortingComputeUtils.hlsl" #endif #pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch #define FINE_PRUNING_ENABLED #define PERFORM_SPHERICAL_INTERSECTION_TESTS #define LIGHT_FPTL_VISIBILITY_DWORD_COUNTS (((SHADEROPTIONS_FPTLMAX_LIGHT_COUNT+1) + 31)/32) StructuredBuffer g_vBoundsBuffer : register( t1 ); StructuredBuffer _LightVolumeData : register(t2); StructuredBuffer g_data : register( t3 ); #ifdef USE_TWO_PASS_TILED_LIGHTING StructuredBuffer g_vBigTileLightList : register( t4 ); // don't support Buffer yet in unity #endif #ifdef PLATFORM_LANE_COUNT // We can infer the size of a wave. This is currently not possible on non-consoles, so we have to fallback to a sensible default in those cases. 
// -----------------------------------------------------------------------------------------
// NOTE(review): from here on each physical line packs many statements, and `//` comments
// are followed by further code on the same line, so line breaks appear to have been lost.
// Several spans also read as if text between '<' and '>' is missing (for example
// `for(i=t; i PLATFORM_LANE_COUNT`, or buffer declarations without an element type).
// Confirm against a pristine copy of the file before relying on this text.
//
// Declared on the next line:
//  - NR_THREADS: PLATFORM_LANE_COUNT when that macro is defined, otherwise 64.
//  - PIXEL_PER_THREAD: (TILE_SIZE_FPTL * TILE_SIZE_FPTL) / NR_THREADS.
//  - g_vLightList (register u0), the output buffer, and g_TileFeatureFlags (USE_FEATURE_FLAGS).
//  - groupshared state: coarseList, prunedList, ldsZMin, ldsZMax, lightOffs,
//    ldsDoesLightIntersect (FINE_PRUNING_ENABLED), ldsNrLightsFinal, ldsCategoryListCount,
//    lightOffsSph (PERFORM_SPHERICAL_INTERSECTION_TESTS), ldsFeatureFlags (USE_FEATURE_FLAGS).
//
// GetLinearDepth(pixXY, zDptBufSpace): converts a depth-buffer value to linear depth with
// g_mInvScrProjectionArr[unity_StereoEyeIndex]. Under USE_OBLIQUE_MODE it multiplies
// float4(pixXY, zDptBufSpace, 1) by the matrix and returns z / w; otherwise it returns
// (m22*z + m23) / (m32*z + m33) using the .z/.w entries of matrix rows 2 and 3.
//
// GetViewPosFromLinDepth(v2ScrPos, fLinDepth) starts on the next line and finishes on the
// line after it.
// -----------------------------------------------------------------------------------------
#define NR_THREADS PLATFORM_LANE_COUNT #else #define NR_THREADS 64 // default to 64 threads per group on other platforms.. #endif #define PIXEL_PER_THREAD ((TILE_SIZE_FPTL*TILE_SIZE_FPTL) / NR_THREADS) // 8 or 4 // output buffer RWStructuredBuffer g_vLightList : register( u0 ); // don't support RWBuffer yet in unity #define CATEGORY_LIST_SIZE LIGHTCATEGORY_COUNT groupshared unsigned int coarseList[LIGHT_LIST_MAX_COARSE_ENTRIES]; groupshared unsigned int prunedList[LIGHT_LIST_MAX_COARSE_ENTRIES]; // temporarily support room for all 64 while in LDS groupshared uint ldsZMin; groupshared uint ldsZMax; groupshared uint lightOffs; #ifdef FINE_PRUNING_ENABLED groupshared uint ldsDoesLightIntersect[LIGHT_FPTL_VISIBILITY_DWORD_COUNTS]; #endif groupshared int ldsNrLightsFinal; groupshared int ldsCategoryListCount[CATEGORY_LIST_SIZE]; #ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS groupshared uint lightOffsSph; #endif #ifdef USE_FEATURE_FLAGS groupshared uint ldsFeatureFlags; RWStructuredBuffer g_TileFeatureFlags; #endif float GetLinearDepth(float2 pixXY, float zDptBufSpace) // 0 is near 1 is far { float4x4 g_mInvScrProjection = g_mInvScrProjectionArr[unity_StereoEyeIndex]; #ifdef USE_OBLIQUE_MODE float2 res2 = mul(g_mInvScrProjection, float4(pixXY, zDptBufSpace, 1.0)).zw; return res2.x / res2.y; #else // for perspective projection m22 is zero and m23 is +1/-1 (depends on left/right hand proj) // however this function must also work for orthographic projection so we keep it like this. float m22 = g_mInvScrProjection[2].z, m23 = g_mInvScrProjection[2].w; float m32 = g_mInvScrProjection[3].z, m33 = g_mInvScrProjection[3].w; return (m22*zDptBufSpace+m23) / (m32*zDptBufSpace+m33); #endif } float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth) { float4x4 g_mScrProjection = g_mScrProjectionArr[unity_StereoEyeIndex]; bool isOrthographic = g_isOrthographic!=0; float fSx = g_mScrProjection[0].x; float fSy = g_mScrProjection[1].y; float fCx = isOrthographic ? 
// -----------------------------------------------------------------------------------------
// Remainder of GetViewPosFromLinDepth: fSx/fSy come from g_mScrProjection[0].x / [1].y and
// fCx/fCy from the .w component of rows 0/1 when g_isOrthographic != 0, otherwise from the
// .z component. The sign s is +1 when USE_LEFT_HAND_CAMERA_SPACE is set or the projection
// is orthographic, and -1 otherwise. p = ((s*x - fCx)/fSx, (s*y - fCy)/fSy); the result is
// float3(p, fLinDepth) for orthographic and float3(fLinDepth * p, fLinDepth) otherwise.
//
// GetOnePixDiagWorldDistAtDepthOne(): returns length(float2(1/fSx, 1/fSy)).
//
// Forward declarations follow for SphericalIntersectionTests and FinePruneLights; the latter
// takes the depths as a float4 when PIXEL_PER_THREAD == 4 and as a float array otherwise.
//
// FetchBigTileLightIndex(lightStart, lightOffset) (USE_TWO_PASS_TILED_LIGHTING only): reads
// g_vBigTileLightList, where entries are 16 bits wide and packed two per element. The +1
// skips the first slot, which stores the number of lights.
//
// TileLightListGen, the [numthreads(NR_THREADS, 1, 1)] kernel, starts on the next line:
// it assigns the XR view index from dispatchThreadId.z, takes the tile index from
// u3GroupID.xy and sets the tile's lower-left pixel viTilLL = 16 * tileIDX.
// NOTE(review): its first loop and the set-up that should follow it are not legible here.
// -----------------------------------------------------------------------------------------
g_mScrProjection[0].w : g_mScrProjection[0].z; float fCy = isOrthographic ? g_mScrProjection[1].w : g_mScrProjection[1].z; #if USE_LEFT_HAND_CAMERA_SPACE bool useLeftHandVersion = true; #else bool useLeftHandVersion = isOrthographic; #endif float s = useLeftHandVersion ? 1 : (-1); float2 p = float2( (s*v2ScrPos.x-fCx)/fSx, (s*v2ScrPos.y-fCy)/fSy); return float3(isOrthographic ? p.xy : (fLinDepth*p.xy), fLinDepth); } float GetOnePixDiagWorldDistAtDepthOne() { float4x4 g_mScrProjection = g_mScrProjectionArr[unity_StereoEyeIndex]; float fSx = g_mScrProjection[0].x; float fSy = g_mScrProjection[1].y; return length( float2(1.0/fSx,1.0/fSy) ); } #ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate); #endif #ifdef FINE_PRUNING_ENABLED #if PIXEL_PER_THREAD == 4 void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths); #else void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vLinDepths[PIXEL_PER_THREAD]); #endif #endif #ifdef USE_TWO_PASS_TILED_LIGHTING uint FetchBigTileLightIndex(uint lightStart, uint lightOffset) { const uint lightOffsetPlusOne = lightOffset + 1; // Add +1 as first slot is reserved to store number of light // Light index are store on 16bit return (g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE * lightStart / 2 + (lightOffsetPlusOne >> 1)] >> ((lightOffsetPlusOne & 1) * 16)) & 0xffff; } #endif [numthreads(NR_THREADS, 1, 1)] void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) { UNITY_XR_ASSIGN_VIEW_INDEX(dispatchThreadId.z); uint2 tileIDX = u3GroupID.xy; uint t=threadID; int i; UNITY_UNROLLX(LIGHT_LIST_MAX_COARSE_ENTRIES) for(i=t; i PLATFORM_LANE_COUNT GroupMemoryBarrierWithGroupSync(); #endif uint2 viTilLL = 16*tileIDX; // establish min and max depth first float dpt_mi=asfloat(uFltMax), dpt_ma=0.0; #if PIXEL_PER_THREAD == 4 float4 
// -----------------------------------------------------------------------------------------
// TileLightListGen, continued.
//  - Depth fetch: each thread handles PIXEL_PER_THREAD pixels, idx = i * NR_THREADS + t,
//    with x = idx & 0xf and y = idx >> 4 inside the tile, clamped to (iWidth-1, iHeight-1).
//    Each depth is converted with GetLinearDepth at the pixel centre (+0.5, +0.5).
//    NOTE(review): the min/max update that follows `if(fDepth` is not legible here.
//  - Tile bounds: vTileLL / vTileUR hold the tile corners divided by iWidth / iHeight, with
//    ldsZMin / ldsZMax (reinterpreted as float) in z; vTileUR.xy is clamped to 1.0.
//  - Coarse list: with USE_TWO_PASS_TILED_LIGHTING the loop walks the light list of the
//    enclosing big tile (count read from the low 16 bits of that list's first element);
//    otherwise it walks all g_iNrVisibLights. Both variants stride by NR_THREADS.
//    Each light's vMi / vMa are read from g_vBoundsBuffer via GenerateScreenSpaceBoundsIndices.
//    NOTE(review): the bounds test beginning `all(vMa>vTileLL) && all(vMi` is cut short here.
// -----------------------------------------------------------------------------------------
vLinDepths; #else float vLinDepths[PIXEL_PER_THREAD]; #endif { // Fetch depths and calculate min/max UNITY_UNROLL for(i = 0; i < PIXEL_PER_THREAD; i++) { int idx = i * NR_THREADS + t; uint2 uCrd = min( uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1) ); const float fDepth = FetchDepth(uCrd); vLinDepths[i] = GetLinearDepth(uCrd+float2(0.5,0.5), fDepth); if(fDepth PLATFORM_LANE_COUNT || defined(SHADER_API_SWITCH) // not sure why Switch needs the barrier (it will not be correct without) GroupMemoryBarrierWithGroupSync(); #endif } float3 vTileLL = float3(viTilLL.x/(float) iWidth, viTilLL.y/(float) iHeight, asfloat(ldsZMin)); float3 vTileUR = float3((viTilLL.x+16)/(float) iWidth, (viTilLL.y+16)/(float) iHeight, asfloat(ldsZMax)); vTileUR.xy = min(vTileUR.xy,float2(1.0,1.0)).xy; // build coarse list using AABB #ifdef USE_TWO_PASS_TILED_LIGHTING const uint log2BigTileToTileRatio = firstbithigh(64) - firstbithigh(16); int NrBigTilesX = (nrTilesX + ((1 << log2BigTileToTileRatio) -1 )) >> log2BigTileToTileRatio; int NrBigTilesY = (nrTilesY + ((1 << log2BigTileToTileRatio) - 1)) >> log2BigTileToTileRatio; const int bigTileBase = unity_StereoEyeIndex * NrBigTilesX * NrBigTilesY; const uint bigTileIdx = bigTileBase + (tileIDX.y>>log2BigTileToTileRatio)*NrBigTilesX + (tileIDX.x>>log2BigTileToTileRatio); // map the idx to 64x64 tiles int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE * bigTileIdx / 2 + 0] & 0xFFFF; for(int l0=(int) t; l0<(int) nrBigTileLights; l0 += NR_THREADS) { int l = FetchBigTileLightIndex(bigTileIdx, l0); #else for(int l=(int) t; l<(int) g_iNrVisibLights; l += NR_THREADS) { #endif const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(l, g_iNrVisibLights, unity_StereoEyeIndex); const float3 vMi = g_vBoundsBuffer[boundsIndices.min].xyz; const float3 vMa = g_vBoundsBuffer[boundsIndices.max].xyz; if( all(vMa>vTileLL) && all(vMi PLATFORM_LANE_COUNT GroupMemoryBarrierWithGroupSync(); #endif 
// -----------------------------------------------------------------------------------------
// TileLightListGen, continued.
//  - iNrCoarseLights = min(lightOffs, LIGHT_LIST_MAX_COARSE_ENTRIES). When
//    PERFORM_SPHERICAL_INTERSECTION_TESTS is defined it is replaced by the return value of
//    SphericalIntersectionTests, called with the tile's centre pixel (viTilLL + 8, clamped).
//  - nrLightsCombinedList = min(ldsNrLightsFinal, LIGHT_LIST_MAX_COARSE_ENTRIES); SORTLIST is
//    then invoked on prunedList.
//    NOTE(review): the loops starting `for(i=t; i` and the condition guarding SORTLIST are
//    not legible here.
//  - USE_FEATURE_FLAGS: thread 0 writes ldsFeatureFlags | g_BaseFeatureFlags to
//    g_TileFeatureFlags at the tile's per-eye slot, or 0 when ldsZMax < ldsZMin.
//  - Output addressing: offs = tileIDX.y * nrTilesX + tileIDX.x, plus
//    unity_StereoEyeIndex * nrTilesX * nrTilesY * LIGHTCATEGORY_COUNT under
//    UNITY_STEREO_INSTANCING_ENABLED.
//  - shiftIndex is zero except for LIGHTCATEGORY_ENV (_EnvLightIndexShift) and
//    LIGHTCATEGORY_DECAL (_DecalIndexShift); it is subtracted from the indices written out.
// -----------------------------------------------------------------------------------------
int iNrCoarseLights = min(lightOffs,LIGHT_LIST_MAX_COARSE_ENTRIES); #ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS iNrCoarseLights = SphericalIntersectionTests( t, iNrCoarseLights, float2(min(viTilLL.xy+uint2(16/2,16/2), uint2(iWidth-1, iHeight-1))) ); #endif #ifndef FINE_PRUNING_ENABLED { for(i=t; i PLATFORM_LANE_COUNT GroupMemoryBarrierWithGroupSync(); #endif int nrLightsCombinedList = min(ldsNrLightsFinal,LIGHT_LIST_MAX_COARSE_ENTRIES); for(i=t; i PLATFORM_LANE_COUNT SORTLIST(prunedList, nrLightsCombinedList, LIGHT_LIST_MAX_COARSE_ENTRIES, t, NR_THREADS); //MERGESORTLIST(prunedList, coarseList, nrLightsCombinedList, t, NR_THREADS); #endif #ifdef USE_FEATURE_FLAGS if(t == 0) { uint featureFlags = ldsFeatureFlags | g_BaseFeatureFlags; // In case of back if(ldsZMax < ldsZMin) // is background pixel { // There is no stencil usage with compute path, featureFlags set to 0 is use to have fast rejection of tile in this case. It will still execute but will do nothing featureFlags = 0; } g_TileFeatureFlags[tileIDX.y * nrTilesX + tileIDX.x + unity_StereoEyeIndex * nrTilesX * nrTilesY] = featureFlags; } #endif // write lights to global buffers int localOffs=0; int offs = tileIDX.y*nrTilesX + tileIDX.x; #if defined(UNITY_STEREO_INSTANCING_ENABLED) // Eye base offset must match code in GetCountAndStartTile() offs += unity_StereoEyeIndex * nrTilesX * nrTilesY * LIGHTCATEGORY_COUNT; #endif // All our cull data are in the same list, but at render time envLights are separated so we need to shift the index // to make it work correctly int shiftIndex[CATEGORY_LIST_SIZE]; ZERO_INITIALIZE_ARRAY(int, shiftIndex, CATEGORY_LIST_SIZE); shiftIndex[LIGHTCATEGORY_ENV] = _EnvLightIndexShift; shiftIndex[LIGHTCATEGORY_DECAL] = _DecalIndexShift; for(int category=0; category>1; for(int l=(int) t; l<(int) nrDWords; l += NR_THREADS) { // We remap the prunedList index to the original LightData / EnvLightData indices uint uLow = l==0 ? 
// -----------------------------------------------------------------------------------------
// End of TileLightListGen: every dword written to g_vLightList packs two 16-bit values,
// (uLow & 0xffff) | (uHigh << 16). For l == 0 the low half is nrLightsFinalClamped; every
// other half is a prunedList entry minus shiftIndex[category]. After each category,
// localOffs advances by nrLightsFinal and offs by nrTilesX * nrTilesY.
//
// SphericalIntersectionTests(threadID, iNrCoarseLights, screenCoordinate): thread 0 resets
// lightOffsSph; V is the view-space position through screenCoordinate at linear depth +1
// (USE_LEFT_HAND_CAMERA_SPACE) or -1; halfTileSizeAtZDistOne is
// 8 * GetOnePixDiagWorldDistAtDepthOne(). Returns lightOffsSph.
// NOTE(review): both loop bodies of this function are not legible here.
//
// GetCoarseLightIndex(l, iNrCoarseLights): returns
// GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, unity_StereoEyeIndex) when
// l < iNrCoarseLights, and 0 otherwise.
//
// s_lightVolumesCache with StoreLightVolumeCache / LoadLightVolumeCache: one uint per
// entry, volume type in the low 3 bits (volumeType & 0x7) and the coarse index in the
// bits above (coarseIndex << 3); Load reverses the packing into its two out parameters.
// -----------------------------------------------------------------------------------------
nrLightsFinalClamped : prunedList[max(0,2 * l - 1 + localOffs)] - shiftIndex[category]; uint uHigh = prunedList[2 * l + 0 + localOffs] - shiftIndex[category]; g_vLightList[LIGHT_DWORD_PER_FPTL_TILE*offs + l] = (uLow&0xffff) | (uHigh<<16); } localOffs += nrLightsFinal; offs += (nrTilesX*nrTilesY); } } #ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate) { if(threadID==0) lightOffsSph = 0; // make a copy of coarseList in prunedList. int l; for(l=threadID; l PLATFORM_LANE_COUNT GroupMemoryBarrierWithGroupSync(); #endif #if USE_LEFT_HAND_CAMERA_SPACE float3 V = GetViewPosFromLinDepth( screenCoordinate, 1.0); #else float3 V = GetViewPosFromLinDepth( screenCoordinate, -1.0); #endif float onePixDiagDist = GetOnePixDiagWorldDistAtDepthOne(); float halfTileSizeAtZDistOne = 8*onePixDiagDist; // scale by half a tile for(l=threadID; l PLATFORM_LANE_COUNT GroupMemoryBarrierWithGroupSync(); #endif return lightOffsSph; } #endif #ifdef FINE_PRUNING_ENABLED int GetCoarseLightIndex(int l, int iNrCoarseLights) { return l < iNrCoarseLights ? GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, unity_StereoEyeIndex) : 0; } groupshared uint s_lightVolumesCache[LIGHT_LIST_MAX_COARSE_ENTRIES]; void StoreLightVolumeCache(int lightIndex, int coarseIndex, int volumeType) { // 3 bits for the volume type, in case we have a corrupted one we can early out of the switch statement. // 29 bits for a coarse light index. s_lightVolumesCache[lightIndex] = (uint)(volumeType & 0x7) | (uint)(coarseIndex << 3); } void LoadLightVolumeCache(int lightIndex, out int coarseIndex, out int volumeType) { uint data = s_lightVolumesCache[lightIndex]; coarseIndex = (int)(data >> 3); volumeType = (int)(data & 0x7); } // initializes ldsNrLightsFinal with the number of accepted lights. // all accepted entries delivered in prunedList[]. 
#if PIXEL_PER_THREAD == 4 void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths) // keep float4 vectorization when possible, as shader compiler may generate bad code for array of floats. #else void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vLinDepths[PIXEL_PER_THREAD]) #endif { uint t = threadID; uint iWidth = g_viDimensions.x; uint iHeight = g_viDimensions.y; uint uLightsFlags[LIGHT_FPTL_VISIBILITY_DWORD_COUNTS]; { [unroll(LIGHT_FPTL_VISIBILITY_DWORD_COUNTS)] for (uint ii = 0; ii < LIGHT_FPTL_VISIBILITY_DWORD_COUNTS; ++ii) uLightsFlags[ii] = 0u; } int l=0; // need this outer loop even on xb1 and ps4 since direct lights and // reflection lights are kept in separate regions. { #define MAX_FINE_PRUNE_LOOP_CNT (((SHADEROPTIONS_FPTLMAX_LIGHT_COUNT+1) + NR_THREADS - 1)/NR_THREADS) [unroll(MAX_FINE_PRUNE_LOOP_CNT)] for (uint it = 0; it < MAX_FINE_PRUNE_LOOP_CNT; ++it) { uint i = t + it * NR_THREADS; if (i < (uint)iNrCoarseLights) { int idxCoarse = GetCoarseLightIndex((int)i, iNrCoarseLights); int uLightVolume = (int)_LightVolumeData[idxCoarse].lightVolume; StoreLightVolumeCache(i, idxCoarse, uLightVolume); } } } #if NR_THREADS > PLATFORM_LANE_COUNT GroupMemoryBarrierWithGroupSync(); #endif //When using LDS to cache the volume data, this produces the best most optimal code. //Doing a manual loop like the one below adds an extra cost of .1 ms on ps4 if we use LDS. for (; l < iNrCoarseLights; ++l) { int idxCoarse; int uLightVolume; LoadLightVolumeCache(l, idxCoarse, uLightVolume); // WARNING: we use here a uint for lightValid because there is a bug with the unity vulkan compiler. // If this is a bool, the second dword of uLightsFlags never gets written to, which causes light tile artifacts // on tiles that have more than 32 lights. 
uint lightValid = 0; if (uLightVolume == LIGHTVOLUMETYPE_CONE) { LightVolumeData lightData = _LightVolumeData[idxCoarse]; const bool bIsSpotDisc = true; // (lightData.flags&IS_CIRCULAR_SPOT_SHAPE) != 0; for(int i=0; i>4)), uint2(iWidth-1, iHeight-1)); float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]); // check pixel float3 fromLight = vVPos-lightData.lightPos.xyz; float distSq = dot(fromLight,fromLight); const float fSclProj = dot(fromLight, lightData.lightAxisZ.xyz); // spotDir = lightData.lightAxisZ.xyz float2 V = abs( float2( dot(fromLight, lightData.lightAxisX.xyz), dot(fromLight, lightData.lightAxisY.xyz) ) ); float fDist2D = bIsSpotDisc ? length(V) : max(V.x,V.y); bool validInPixel = all( float2(lightData.radiusSq, fSclProj) > float2(distSq, fDist2D*lightData.cotan) ); #ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS //a wave is on the same tile, and the loop is uniform for the wave. // thus we early out if at least 1 thread in the wave passed this light, saving some ALU. 
lightValid = WaveActiveAnyTrue(validInPixel); #else lightValid = validInPixel; #endif if (lightValid) break; } } else if (uLightVolume == LIGHTVOLUMETYPE_SPHERE) { LightVolumeData lightData = _LightVolumeData[idxCoarse]; for(int i=0; i>4)), uint2(iWidth-1, iHeight-1)); float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]); // check pixel float3 vLp = lightData.lightPos.xyz; float3 toLight = vLp - vVPos; float distSq = dot(toLight,toLight); bool validInPixel = lightData.radiusSq>distSq; #ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS lightValid = WaveActiveAnyTrue(validInPixel); #else lightValid = validInPixel; #endif if (lightValid) break; } } else if (uLightVolume == LIGHTVOLUMETYPE_BOX) { LightVolumeData lightData = _LightVolumeData[idxCoarse]; for(int i=0; i>4)), uint2(iWidth-1, iHeight-1)); float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]); // check pixel float3 toLight = lightData.lightPos.xyz - vVPos; float3 dist = float3( dot(toLight, lightData.lightAxisX), dot(toLight, lightData.lightAxisY), dot(toLight, lightData.lightAxisZ) ); dist = (abs(dist) - lightData.boxInnerDist) * lightData.boxInvRange; // not as efficient as it could be bool validInPixel = max(max(dist.x, dist.y), dist.z)<1; // but allows us to not write out OuterDists #ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS lightValid = WaveActiveAnyTrue(validInPixel); #else lightValid = validInPixel; #endif if (lightValid) break; } } else break; uLightsFlags[l >> 5] |= lightValid << (l&31); } { [unroll(LIGHT_FPTL_VISIBILITY_DWORD_COUNTS)] for (uint ii = 0; ii < LIGHT_FPTL_VISIBILITY_DWORD_COUNTS; ++ii) InterlockedOr(ldsDoesLightIntersect[ii], uLightsFlags[ii]); } #if NR_THREADS > PLATFORM_LANE_COUNT || defined(SHADER_API_SWITCH) // not sure why Switch needs the barrier (it will not be correct without) GroupMemoryBarrierWithGroupSync(); #endif { uint localCount = 0; [unroll(LIGHT_FPTL_VISIBILITY_DWORD_COUNTS)] for (uint ii = 0; ii < 
LIGHT_FPTL_VISIBILITY_DWORD_COUNTS; ++ii) localCount += countbits(ldsDoesLightIntersect[ii]); if (t == 0) ldsNrLightsFinal = localCount; #define MAX_LIGHT_WRITE_LOOP_CNT (((SHADEROPTIONS_FPTLMAX_LIGHT_COUNT+1) + NR_THREADS - 1)/NR_THREADS) [unroll(MAX_LIGHT_WRITE_LOOP_CNT)] for (uint it = 0; it < MAX_LIGHT_WRITE_LOOP_CNT; ++it) { uint i = t + it * NR_THREADS; uint lightsMask = ldsDoesLightIntersect[i >> 5]; uint localMask = (1u << (i & 31)); if(i<(uint) iNrCoarseLights && (localMask & lightsMask) != 0u) { uint backOffset = 0; [unroll(LIGHT_FPTL_VISIBILITY_DWORD_COUNTS)] for (uint k = 0u; k < LIGHT_FPTL_VISIBILITY_DWORD_COUNTS; ++k) if (k < (i >> 5)) backOffset += countbits(ldsDoesLightIntersect[k]); uint uIndex = backOffset + countbits((localMask - 1u) & lightsMask); if(uIndex