/*
 * Big-tile light list generation: declares the compute kernel BigTileLightListGen
 * and the helper functions it calls.
 *
 * NOTE(review): this copy of the file looks damaged and should be restored from
 * the upstream source before it is compiled or edited:
 *   - line breaks were collapsed, so preprocessor directives share a line and
 *     every line comment now swallows the code that follows it;
 *   - text between a less-than sign and the next greater-than sign appears to be
 *     missing. Visible symptoms: the StructuredBuffer and RWStructuredBuffer
 *     declarations carry no element type, the coarse AABB test stops at
 *     all(vMi and resumes at PLATFORM_LANE_COUNT, the packing loop header
 *     for(i=t; i resumes at 0), the loop in SphericalIntersectionTests has no
 *     visible body, and most of the edge-pair loop in CullByExactEdgeTests is gone.
 *   The summary below covers only what is still visible.
 *
 * Kernel BigTileLightListGen (one thread group per tile, NR_THREADS threads):
 *   1. Thread 0 resets lightOffs. The tile spans 64x64 pixels starting at
 *      64*tileIDX, clamped to g_viDimensions; the eye index is u3GroupID.z.
 *   2. Coarse pass: each thread walks the visible lights with a stride of
 *      NR_THREADS and compares the bounds read from g_vBoundsBuffer against the
 *      tile rectangle expressed as a fraction of the screen size. The rest of
 *      this test is not visible here.
 *   3. iNrCoarseLights is min(lightOffs, MAX_NR_BIGTILE_LIGHTS). The list is then
 *      passed to SphericalIntersectionTests and CullByExactEdgeTests when
 *      PERFORM_SPHERICAL_INTERSECTION_TESTS and EXACT_EDGE_TESTS are defined,
 *      and to SORTLIST.
 *   4. Output: each element of g_vLightList packs two 16-bit values,
 *      lightIndexOrCount0 in the low half and lightIndex1 in the high half.
 *      Presumably the very first slot holds the light count - the initial value
 *      of lightIndexOrCount0 is in the missing span, TODO confirm.
 *   5. With GENERATE_VOLUMETRIC_BIGTILE: each thread below bucketCount compacts,
 *      inside its own bucket of lightsListLDS, the lights accepted by
 *      LightAffectVolumetric and records the count in volumetricLightCounts.
 *      Thread 0 then writes those indices two per uint into
 *      g_vVolumetricLightList and finally stores
 *      volumetricLightCounter | (firstLightIndex << 16) at bigTileOffset.
 *
 * Helpers:
 *   GetLinearDepth                   - with USE_OBLIQUE_MODE, multiplies
 *                                      g_mInvScrProjectionArr[eyeIndex] by
 *                                      (pixXY, zDptBufSpace, 1) and returns z / w.
 *   GetViewPosFromLinDepth           - builds a view-space position from a screen
 *                                      position and a linear depth using
 *                                      g_mScrProjectionArr[eyeIndex]; takes the
 *                                      orthographic path when g_isOrthographic != 0.
 *   GetOnePixDiagWorldDistAtDepthOne - returns length(float2(1/fSx, 1/fSy)).
 *   LightAffectVolumetric            - true when affectVolumetric != 0 and
 *                                      featureFlags is exactly
 *                                      LIGHTFEATUREFLAGS_PUNCTUAL or
 *                                      LIGHTFEATUREFLAGS_DIRECTIONAL.
 *   GetTileVertex                    - corner i of the tile volume: bit 0 selects
 *                                      x, bit 1 selects y, bit 2 selects
 *                                      g_fNearPlane or fTileFarPlane; z is negated
 *                                      unless USE_LEFT_HAND_CAMERA_SPACE is set.
 *   GetFrustEdge                     - outputs a point vP0 and a direction vE0
 *                                      for frustum edge index e0.
 *   CullByExactEdgeTests             - writes UINT_MAX into lightsListLDS[l] when
 *                                      a separating plane is found (bFoundSepPlane).
 */
#pragma kernel BigTileLightListGen #include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl" #include "Packages/com.unity.render-pipelines.high-definition-config/Runtime/ShaderConfig.cs.hlsl" #include "Packages/com.unity.render-pipelines.high-definition/Runtime/ShaderLibrary/ShaderVariablesGlobal.hlsl" #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl" #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightingConvexHullUtils.hlsl" #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/SortingComputeUtils.hlsl" #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl" #pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch #pragma multi_compile _ GENERATE_VOLUMETRIC_BIGTILE #define EXACT_EDGE_TESTS #define PERFORM_SPHERICAL_INTERSECTION_TESTS // is not actually used for anything in this kernel #define USE_OBLIQUE_MODE #define MAX_NR_BIGTILE_LIGHTS (MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE-1) StructuredBuffer g_vBoundsBuffer : register( t1 ); StructuredBuffer _LightVolumeData : register(t2); StructuredBuffer g_data : register( t3 ); #ifdef PLATFORM_LANE_COUNT // We can infer the size of a wave. This is currently not possible on non-consoles, so we have to fallback to a sensible default in those cases. #define NR_THREADS PLATFORM_LANE_COUNT #else #define NR_THREADS 64 // default to 64 threads per group on other platforms.. 
#endif // output buffer RWStructuredBuffer g_vLightList : register( u0 ); // don't support RWBuffer yet in unity #if defined(GENERATE_VOLUMETRIC_BIGTILE) // Output buffer for volumetric big tiles RWStructuredBuffer g_vVolumetricLightList; groupshared unsigned int volumetricLightCounts[NR_THREADS]; #endif // 2kB (room for roughly 30 wavefronts) groupshared unsigned int lightsListLDS[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE]; groupshared uint lightOffs; // TODO: Remove this function and g_mInvScrProjectionArr from constants. // Only usage of that constant. float GetLinearDepth(float2 pixXY, float zDptBufSpace, uint eyeIndex) // 0 is near 1 is far { float4x4 g_mInvScrProjection = g_mInvScrProjectionArr[eyeIndex]; #ifdef USE_OBLIQUE_MODE float2 res2 = mul(g_mInvScrProjection, float4(pixXY, zDptBufSpace, 1.0)).zw; return res2.x / res2.y; #else // for perspective projection m22 is zero and m23 is +1/-1 (depends on left/right hand proj) // however this function must also work for orthographic projection so we keep it like this. float m22 = g_mInvScrProjection[2].z, m23 = g_mInvScrProjection[2].w; float m32 = g_mInvScrProjection[3].z, m33 = g_mInvScrProjection[3].w; return (m22*zDptBufSpace+m23) / (m32*zDptBufSpace+m33); #endif } float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth, uint eyeIndex) { float4x4 g_mScrProjection = g_mScrProjectionArr[eyeIndex]; bool isOrthographic = g_isOrthographic!=0; float fSx = g_mScrProjection[0].x; float fSy = g_mScrProjection[1].y; float fCx = isOrthographic ? g_mScrProjection[0].w : g_mScrProjection[0].z; float fCy = isOrthographic ? g_mScrProjection[1].w : g_mScrProjection[1].z; #if USE_LEFT_HAND_CAMERA_SPACE bool useLeftHandVersion = true; #else bool useLeftHandVersion = isOrthographic; #endif float s = useLeftHandVersion ? 1 : (-1); float2 p = float2( (s*v2ScrPos.x-fCx)/fSx, (s*v2ScrPos.y-fCy)/fSy); return float3(isOrthographic ? 
p.xy : (fLinDepth*p.xy), fLinDepth); } float GetOnePixDiagWorldDistAtDepthOne(uint eyeIndex) { float4x4 g_mScrProjection = g_mScrProjectionArr[eyeIndex]; float fSx = g_mScrProjection[0].x; float fSy = g_mScrProjection[1].y; return length( float2(1.0/fSx,1.0/fSy) ); } #ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS void SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate, uint eyeIndex); #endif #ifdef EXACT_EDGE_TESTS void CullByExactEdgeTests(uint threadID, int iNrCoarseLights, uint2 viTilLL, uint2 viTilUR, uint eyeIndex); #endif bool LightAffectVolumetric(uint lightIndex) { uint flags = _LightVolumeData[lightIndex].featureFlags; bool supportedLightShape = flags == LIGHTFEATUREFLAGS_PUNCTUAL || flags == LIGHTFEATUREFLAGS_DIRECTIONAL; bool affectVolumetric = _LightVolumeData[lightIndex].affectVolumetric != 0; return affectVolumetric && supportedLightShape; } [numthreads(NR_THREADS, 1, 1)] void BigTileLightListGen(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) { uint eyeIndex = u3GroupID.z; uint2 tileIDX = u3GroupID.xy; uint t=threadID; uint iWidth = g_viDimensions.x; uint iHeight = g_viDimensions.y; uint nrBigTilesX = (iWidth+63)/64; uint nrBigTilesY = (iHeight+63)/64; if(t==0) lightOffs = 0; #if NR_THREADS > PLATFORM_LANE_COUNT GroupMemoryBarrierWithGroupSync(); #endif // Raw pixel coordinates of tile uint2 viTilLL = 64*tileIDX; uint2 viTilUR = min( viTilLL+uint2(64,64), uint2(iWidth, iHeight) ); // not width and height minus 1 since viTilUR represents the end of the tile corner. 
// 'Normalized' coordinates of tile, for use with AABB bounds in g_vBoundsBuffer float2 vTileLL = float2(viTilLL.x/(float) iWidth, viTilLL.y/(float) iHeight); float2 vTileUR = float2(viTilUR.x/(float) iWidth, viTilUR.y/(float) iHeight); // build coarse list using AABB for(int l=(int) t; l<(int) g_iNrVisibLights; l += NR_THREADS) { const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(l, g_iNrVisibLights, eyeIndex); const float2 vMi = g_vBoundsBuffer[boundsIndices.min].xy; const float2 vMa = g_vBoundsBuffer[boundsIndices.max].xy; if( all(vMa>vTileLL) && all(vMi PLATFORM_LANE_COUNT || defined(SHADER_API_XBOXONE) || defined(SHADER_API_GAMECORE) || defined(SHADER_API_SWITCH) // not sure why XB1 and Switch need the barrier (it will not be correct without) GroupMemoryBarrierWithGroupSync(); #endif uint iNrCoarseLights = min(lightOffs,MAX_NR_BIGTILE_LIGHTS); #ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS SphericalIntersectionTests( t, iNrCoarseLights, float2(min(viTilLL.xy+uint2(64/2,64/2), uint2(iWidth-1, iHeight-1))), eyeIndex ); #endif #ifdef EXACT_EDGE_TESTS CullByExactEdgeTests(t, iNrCoarseLights, viTilLL.xy, viTilUR.xy, eyeIndex); #endif // sort lights SORTLIST(lightsListLDS, iNrCoarseLights, MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE, t, NR_THREADS); if(t==0) lightOffs = 0; GroupMemoryBarrierWithGroupSync(); uint i; for(i=t; i 0) // cannot use ternary operator here (it would evaluate both sides and fetch invalid indices, causing crash on some GPUs) { lightIndexOrCount0 = lightsListLDS[id - 1]; // Index0 } uint lightIndex1 = 0; // Index1 if (id < iNrCoarseLights) // cannot use ternary operator here (it would evaluate both sides and fetch invalid indices, causing crash on some GPUs) { lightIndex1 = lightsListLDS[id]; } // Pack 2 light indices into a single bigtile value g_vLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE * offs / 2 + i] = (lightIndexOrCount0 & 0xFFFF) | (lightIndex1 << 16); } #if defined(GENERATE_VOLUMETRIC_BIGTILE) uint bucketSize = 
MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE / NR_THREADS; uint bucketCount = MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE / bucketSize; if (t < bucketCount) { // Pack light indices affecting volumetric fog in bucket of MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE / NR_THREADS (8 by default) uint localVolumetricLightCount = 0; for (i = 0; i < bucketSize; i++) { uint id = t * bucketSize + i; if (id >= iNrCoarseLights) break; uint lightIndex = lightsListLDS[id]; if (LightAffectVolumetric(lightIndex)) { lightsListLDS[t * bucketSize + localVolumetricLightCount] = lightIndex; localVolumetricLightCount++; } } // Keep the volumetric light count in the bucket (volumetricLightCounts will be overwritten with the write offset) volumetricLightCounts[t] = localVolumetricLightCount; GroupMemoryBarrierWithGroupSync(); if (t == 0) { // for each bucket, write the packed light indices back to the volumetric bigtile buffer uint packedLightData = 0; uint volumetricLightCounter = 0; uint firstLightIndex = -1; uint bigTileOffset = MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE * offs / 2; for (i = 0; i < bucketCount; i++) { if (i * bucketSize >= iNrCoarseLights) break; for (uint j = 0; j < volumetricLightCounts[i]; j++) { uint lightIndex = lightsListLDS[i * bucketSize + j]; volumetricLightCounter++; if (firstLightIndex == -1) firstLightIndex = lightIndex; packedLightData |= lightIndex << ((volumetricLightCounter & 1) * 16); if (volumetricLightCounter & 1) { g_vVolumetricLightList[bigTileOffset + volumetricLightCounter / 2] = packedLightData; packedLightData = 0; } } } // In case a single light index remains in the packed data, we flush it to the bigtile buffer if (volumetricLightCounter != 0 && !(volumetricLightCounter & 1)) g_vVolumetricLightList[bigTileOffset + volumetricLightCounter / 2] = packedLightData; if (firstLightIndex == -1) firstLightIndex = 0; g_vVolumetricLightList[bigTileOffset] = volumetricLightCounter | (firstLightIndex << 16); } } #endif } #ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS void SphericalIntersectionTests(uint 
threadID, int iNrCoarseLights, float2 screenCoordinate, uint eyeIndex) { #if USE_LEFT_HAND_CAMERA_SPACE float3 V = GetViewPosFromLinDepth( screenCoordinate, 1.0, eyeIndex); #else float3 V = GetViewPosFromLinDepth( screenCoordinate, -1.0, eyeIndex); #endif float onePixDiagDist = GetOnePixDiagWorldDistAtDepthOne(eyeIndex); float halfTileSizeAtZDistOne = 32*onePixDiagDist; // scale by half a tile for(int l=threadID; l PLATFORM_LANE_COUNT GroupMemoryBarrierWithGroupSync(); #endif } #endif #ifdef EXACT_EDGE_TESTS float3 GetTileVertex(uint2 viTilLL, uint2 viTilUR, int i, float fTileFarPlane, uint eyeIndex) { float x = (i&1)==0 ? viTilLL.x : viTilUR.x; float y = (i&2)==0 ? viTilLL.y : viTilUR.y; float z = (i&4)==0 ? g_fNearPlane : fTileFarPlane; #if !USE_LEFT_HAND_CAMERA_SPACE z = -z; #endif return GetViewPosFromLinDepth( float2(x, y), z, eyeIndex); } void GetFrustEdge(out float3 vP0, out float3 vE0, const int e0, uint2 viTilLL, uint2 viTilUR, float fTileFarPlane, uint eyeIndex) { int iSection = e0>>2; // section 0 is side edges, section 1 is near edges and section 2 is far edges int iSwizzle = e0&0x3; int i=iSwizzle + (2*(iSection&0x2)); // offset by 4 at section 2 vP0 = GetTileVertex(uint2(viTilLL.x, viTilUR.y), uint2(viTilUR.x, viTilLL.y), i, fTileFarPlane, eyeIndex); #if USE_LEFT_HAND_CAMERA_SPACE float3 edgeSectionZero = g_isOrthographic==0 ? vP0 : float3(0.0,0.0,1.0); #else float3 edgeSectionZero = g_isOrthographic==0 ? vP0 : float3(0.0,0.0,-1.0); #endif vE0 = iSection == 0 ? edgeSectionZero : (((iSwizzle & 0x2) == 0 ? 1.0f : (-1.0f)) * ((int)(iSwizzle & 0x1) == (iSwizzle >> 1) ? float3(1, 0, 0) : float3(0, 1, 0))); } void CullByExactEdgeTests(uint threadID, int iNrCoarseLights, uint2 viTilLL, uint2 viTilUR, uint eyeIndex) { const bool bOnlyNeedFrustumSideEdges = true; const int nrFrustEdges = bOnlyNeedFrustumSideEdges ? 
4 : 8; // max 8 since we never need to test 4 far edges of frustum since they are identical vectors to near edges and plane is placed at vP0 on light hull. const int totNrEdgePairs = 12*nrFrustEdges; for(int l=0; l0) ++positive; else if(fSignDist<0) ++negative; } int resh = (positive>0 && negative>0) ? 0 : (positive>0 ? 1 : (negative>0 ? (-1) : 0)); positive=0; negative=0; for(int j=0; j<8; j++) { float3 vPf = GetTileVertex(viTilLL, viTilUR, j, g_fFarPlane, eyeIndex); float fSignDist = dot(vN, vPf-vP0); if(fSignDist>0) ++positive; else if(fSignDist<0) ++negative; } int resf = (positive>0 && negative>0) ? 0 : (positive>0 ? 1 : (negative>0 ? (-1) : 0)); bool bFoundSepPlane = (resh*resf)<0; if(bFoundSepPlane) lightsListLDS[l]=UINT_MAX; } } } #if NR_THREADS > PLATFORM_LANE_COUNT GroupMemoryBarrierWithGroupSync(); #endif } #endif