You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
643 lines
26 KiB
643 lines
26 KiB
// The implementation is based on the demo on "fine pruned tiled lighting" published in GPU Pro 7.
|
|
// https://github.com/wolfgangfengel/GPU-Pro-7
|
|
|
|
#pragma kernel TileLightListGen
|
|
|
|
#pragma multi_compile _ USE_TWO_PASS_TILED_LIGHTING
|
|
#pragma multi_compile _ USE_FEATURE_FLAGS
|
|
#pragma multi_compile _ USE_OBLIQUE_MODE
|
|
|
|
//#pragma enable_d3d11_debug_symbols
|
|
|
|
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"
|
|
#include "Packages/com.unity.render-pipelines.high-definition-config/Runtime/ShaderConfig.cs.hlsl"
|
|
|
|
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/TextureXR.hlsl"
|
|
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/ShaderBase.hlsl"
|
|
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl"
|
|
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightingConvexHullUtils.hlsl"
|
|
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl"
|
|
|
|
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL) && !defined(SHADER_API_SWITCH) && !defined(SHADER_API_GAMECORE) && !defined(SHADER_API_SWITCH2)
|
|
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/SortingComputeUtils.hlsl"
|
|
#endif
|
|
|
|
#pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch switch2
|
|
|
|
#define FINE_PRUNING_ENABLED
|
|
#define PERFORM_SPHERICAL_INTERSECTION_TESTS
|
|
|
|
#define LIGHT_FPTL_VISIBILITY_DWORD_COUNTS (((SHADEROPTIONS_FPTLMAX_LIGHT_COUNT+1) + 31)/32)
|
|
|
|
StructuredBuffer<float4> g_vBoundsBuffer : register( t1 );
|
|
StructuredBuffer<LightVolumeData> _LightVolumeData : register(t2);
|
|
StructuredBuffer<SFiniteLightBound> g_data : register( t3 );
|
|
|
|
#ifdef USE_TWO_PASS_TILED_LIGHTING
|
|
StructuredBuffer<uint> g_vBigTileLightList : register( t4 ); // don't support Buffer yet in unity
|
|
#endif
|
|
|
|
#ifdef PLATFORM_LANE_COUNT // We can infer the size of a wave. This is currently not possible on non-consoles, so we have to fallback to a sensible default in those cases.
|
|
#define NR_THREADS PLATFORM_LANE_COUNT
|
|
#else
|
|
#define NR_THREADS 64 // default to 64 threads per group on other platforms..
|
|
#endif
|
|
|
|
#define PIXEL_PER_THREAD ((TILE_SIZE_FPTL*TILE_SIZE_FPTL) / NR_THREADS) // 8 or 4
|
|
|
|
// output buffer
|
|
RWStructuredBuffer<uint> g_vLightList : register( u0 ); // don't support RWBuffer yet in unity
|
|
|
|
#define CATEGORY_LIST_SIZE LIGHTCATEGORY_COUNT
|
|
|
|
groupshared unsigned int coarseList[LIGHT_LIST_MAX_COARSE_ENTRIES]; // lights whose screen-space AABB overlaps the tile (filled by TileLightListGen, compacted in place by SphericalIntersectionTests)
|
|
groupshared unsigned int prunedList[LIGHT_LIST_MAX_COARSE_ENTRIES]; // final per-tile light list; also used as a scratch copy of coarseList by SphericalIntersectionTests. temporarily support room for all 64 while in LDS
|
|
|
|
groupshared uint ldsZMin; // tile minimum depth, stored as the uint bit pattern of a float and reduced with InterlockedMin
|
|
groupshared uint ldsZMax; // tile maximum depth, stored as the uint bit pattern of a float and reduced with InterlockedMax
|
|
groupshared uint lightOffs; // atomic write cursor into coarseList (may exceed LIGHT_LIST_MAX_COARSE_ENTRIES; readers clamp it)
|
|
#ifdef FINE_PRUNING_ENABLED
|
|
groupshared uint ldsDoesLightIntersect[LIGHT_FPTL_VISIBILITY_DWORD_COUNTS]; // one bit per coarse-list slot, set when FinePruneLights finds the light valid for the tile
|
|
#endif
|
|
groupshared int ldsNrLightsFinal; // number of entries written to prunedList
|
|
|
|
groupshared int ldsCategoryListCount[CATEGORY_LIST_SIZE]; // number of pruned lights per light category
|
|
|
|
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
|
|
groupshared uint lightOffsSph; // atomic write cursor used by SphericalIntersectionTests when compacting coarseList
|
|
#endif
|
|
|
|
#ifdef USE_FEATURE_FLAGS
|
|
groupshared uint ldsFeatureFlags; // OR of the featureFlags of every pruned light in the tile
|
|
RWStructuredBuffer<uint> g_TileFeatureFlags; // per-tile (and per-eye) feature flags, written by thread 0 of each group
|
|
#endif
|
|
|
|
|
|
// Converts a depth-buffer value into linear depth for the current eye using
// g_mInvScrProjectionArr[unity_StereoEyeIndex].
//   pixXY        : pixel position; only read when USE_OBLIQUE_MODE is defined.
//   zDptBufSpace : depth-buffer value of that pixel.
float GetLinearDepth(float2 pixXY, float zDptBufSpace) // 0 is near 1 is far
{
    const float4x4 invScrProj = g_mInvScrProjectionArr[unity_StereoEyeIndex];

#ifdef USE_OBLIQUE_MODE
    // Oblique path: run the full matrix multiply so the pixel position contributes to z and w.
    const float4 unprojected = mul(invScrProj, float4(pixXY, zDptBufSpace, 1.0));
    return unprojected.z / unprojected.w;
#else
    // For a perspective projection m22 is zero and m23 is +1/-1 (depending on the handedness of the projection).
    // The general form is kept because this function must also work for an orthographic projection.
    const float m22 = invScrProj[2].z;
    const float m23 = invScrProj[2].w;
    const float m32 = invScrProj[3].z;
    const float m33 = invScrProj[3].w;

    const float viewZ = m22*zDptBufSpace+m23;
    const float viewW = m32*zDptBufSpace+m33;
    return viewZ / viewW;
#endif
}
|
|
|
|
// Builds a float3 position from a screen position and a linear depth, using the scale (fSx, fSy) and
// centre (fCx, fCy) terms of g_mScrProjectionArr[unity_StereoEyeIndex].
//   v2ScrPos  : screen position (callers in this file pass pixel centres).
//   fLinDepth : linear depth; returned unchanged as the z component.
// For a perspective projection the xy terms are scaled by fLinDepth; for an orthographic one they are not.
float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth)
{
    const float4x4 scrProj = g_mScrProjectionArr[unity_StereoEyeIndex];
    const bool ortho = g_isOrthographic!=0;

    const float2 scale = float2(scrProj[0].x, scrProj[1].y);

    // The centre terms live in the w column for orthographic projections and in the z column otherwise.
    float2 centre;
    if (ortho)
        centre = float2(scrProj[0].w, scrProj[1].w);
    else
        centre = float2(scrProj[0].z, scrProj[1].z);

#if USE_LEFT_HAND_CAMERA_SPACE
    const bool leftHanded = true;
#else
    const bool leftHanded = ortho;
#endif

    const float handSign = leftHanded ? 1 : (-1);
    const float2 planeXY = float2( (handSign*v2ScrPos.x-centre.x)/scale.x, (handSign*v2ScrPos.y-centre.y)/scale.y);

    return float3(ortho ? planeXY.xy : (fLinDepth*planeXY.xy), fLinDepth);
}
|
|
|
|
// Returns the length of (1/fSx, 1/fSy), where fSx and fSy are the x/y scale terms of the current eye's
// screen projection matrix. Per the function name, this is the diagonal size of one pixel at depth one.
float GetOnePixDiagWorldDistAtDepthOne()
{
    const float4x4 scrProj = g_mScrProjectionArr[unity_StereoEyeIndex];
    const float2 pixelExtent = float2(1.0/scrProj[0].x, 1.0/scrProj[1].y);

    return length(pixelExtent);
}
|
|
|
|
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
|
|
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate);
|
|
#endif
|
|
|
|
#ifdef FINE_PRUNING_ENABLED
|
|
#if PIXEL_PER_THREAD == 4
|
|
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths);
|
|
#else
|
|
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vLinDepths[PIXEL_PER_THREAD]);
|
|
#endif
|
|
#endif
|
|
|
|
#ifdef USE_TWO_PASS_TILED_LIGHTING
|
|
// Reads one 16-bit light index from the packed big-tile light list.
//   lightStart  : index of the big-tile record (each record is MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE 16-bit slots).
//   lightOffset : zero-based position of the light inside that record.
uint FetchBigTileLightIndex(uint lightStart, uint lightOffset)
{
    // Slot 0 of a record is reserved for the light count, so light entries start at slot 1.
    const uint slot = lightOffset + 1;

    // Two 16-bit slots are packed per dword: even slots in the low half, odd slots in the high half.
    const uint packedPair = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE * lightStart / 2 + (slot >> 1)];
    const uint bitShift = (slot & 1) * 16;

    return (packedPair >> bitShift) & 0xffff;
}
|
|
#endif
|
|
|
|
// Kernel entry point: one thread group builds the light list of one 16x16 pixel tile.
// Stages, in order:
//   1. clear prunedList and seed the shared depth range / coarse-list cursor,
//   2. reduce the tile's min/max depth (sky pixels excluded) and keep each thread's linear depths,
//   3. gather the coarse list: lights whose screen-space AABB overlaps the tile's bounds,
//   4. refine it (sphere test, then per-pixel fine pruning when those options are compiled in),
//   5. count lights per category, OR their feature flags, and sort the list when the group spans several waves,
//   6. pack the indices two per dword into g_vLightList, one region per category.
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    UNITY_XR_ASSIGN_VIEW_INDEX(dispatchThreadId.z);
    uint2 tileIDX = u3GroupID.xy;
    uint t=threadID;
    int i;

    // Stage 1: zero the pruned list (the inner test repeats the loop bound).
    UNITY_UNROLLX(LIGHT_LIST_MAX_COARSE_ENTRIES)
    for(i=t; i<LIGHT_LIST_MAX_COARSE_ENTRIES; i+=NR_THREADS)
    {
        if(i<LIGHT_LIST_MAX_COARSE_ENTRIES)
            prunedList[i]=0;
    }

    uint iWidth = g_viDimensions.x;
    uint iHeight = g_viDimensions.y;
    uint nrTilesX = (iWidth+15)/16;
    uint nrTilesY = (iHeight+15)/16;

    // Thread 0 seeds the shared depth range and the coarse-list write cursor.
    const uint uFltMax = 0x7f7fffff;  // FLT_MAX as a uint
    if(t==0)
    {
        ldsZMin = uFltMax;
        ldsZMax = 0;
        lightOffs = 0;
    }

#if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
#endif

    // Pixel coordinate of the tile's first pixel.
    uint2 viTilLL = 16*tileIDX;

    // Stage 2: establish min and max depth first.
    float minDepth = asfloat(uFltMax);
    float maxDepth = 0.0;

#if PIXEL_PER_THREAD == 4
    float4 vLinDepths;
#else
    float vLinDepths[PIXEL_PER_THREAD];
#endif
    {
        // Each thread fetches PIXEL_PER_THREAD depths; pixels past the screen edge are clamped onto it.
        UNITY_UNROLL
        for(i = 0; i < PIXEL_PER_THREAD; i++)
        {
            int pixelInTile = i * NR_THREADS + t;
            uint2 tilePixel = uint2(viTilLL.x+(pixelInTile&0xf), viTilLL.y+(pixelInTile>>4));
            uint2 uCrd = min( tilePixel, uint2(iWidth-1, iHeight-1) );
            const float fDepth = FetchDepth(uCrd);
            vLinDepths[i] = GetLinearDepth(uCrd+float2(0.5,0.5), fDepth);
            if(fDepth<VIEWPORT_SCALE_Z)     // if not skydome
            {
                minDepth = min(fDepth, minDepth);
                maxDepth = max(fDepth, maxDepth);
            }
        }

        // Group-wide reduction on the float bit patterns.
        InterlockedMax(ldsZMax, asuint(maxDepth));
        InterlockedMin(ldsZMin, asuint(minDepth));

        // For some platforms we always need GroupMemoryBarrierWithGroupSync() otherwise results are incorrect.
        // Reason is under investigation, related discussions:
        // https://unity.slack.com/archives/C02C8FWPNHE/p1704321597295329
        // https://unity.slack.com/archives/G3JUQKYV8/p1705081617447289
#if NR_THREADS > PLATFORM_LANE_COUNT || defined(SHADER_API_SWITCH) || defined(SHADER_API_SWITCH2)
        GroupMemoryBarrierWithGroupSync();
#endif
    }

    // Tile bounds: xy normalised by the screen size, z taken from the reduced depth range.
    float3 tileBoundsMin = float3(viTilLL.x/(float) iWidth, viTilLL.y/(float) iHeight, asfloat(ldsZMin));
    float3 tileBoundsMax = float3((viTilLL.x+16)/(float) iWidth, (viTilLL.y+16)/(float) iHeight, asfloat(ldsZMax));
    tileBoundsMax.xy = min(tileBoundsMax.xy,float2(1.0,1.0)).xy;

    // Stage 3: build coarse list using AABB.
#ifdef USE_TWO_PASS_TILED_LIGHTING
    // Only lights already listed for the enclosing 64x64 big tile are considered.
    const uint log2BigTileToTileRatio = firstbithigh(64) - firstbithigh(16);

    int NrBigTilesX = (nrTilesX + ((1 << log2BigTileToTileRatio) -1 )) >> log2BigTileToTileRatio;
    int NrBigTilesY = (nrTilesY + ((1 << log2BigTileToTileRatio) - 1)) >> log2BigTileToTileRatio;
    const int bigTileBase = unity_StereoEyeIndex * NrBigTilesX * NrBigTilesY;
    const uint bigTileIdx = bigTileBase + (tileIDX.y>>log2BigTileToTileRatio)*NrBigTilesX + (tileIDX.x>>log2BigTileToTileRatio);    // map the idx to 64x64 tiles
    int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE * bigTileIdx / 2 + 0] & 0xFFFF;
    for(int bigTileEntry=(int) t; bigTileEntry<(int) nrBigTileLights; bigTileEntry += NR_THREADS)
    {
        int lightIdx = FetchBigTileLightIndex(bigTileIdx, bigTileEntry);
#else
    for(int lightIdx=(int) t; lightIdx<(int) g_iNrVisibLights; lightIdx += NR_THREADS)
    {
#endif
        const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(lightIdx, g_iNrVisibLights, unity_StereoEyeIndex);
        const float3 lightBoundsMin = g_vBoundsBuffer[boundsIndices.min].xyz;
        const float3 lightBoundsMax = g_vBoundsBuffer[boundsIndices.max].xyz;

        if( all(lightBoundsMax>tileBoundsMin) && all(lightBoundsMin<tileBoundsMax))
        {
            // Reserve a slot atomically; lights beyond the list capacity are dropped.
            unsigned int one = 1;
            unsigned int slot;
            InterlockedAdd(lightOffs, one, slot);
            if(slot<LIGHT_LIST_MAX_COARSE_ENTRIES)
                coarseList[slot] = lightIdx;     // add to light list
        }
    }

#ifdef FINE_PRUNING_ENABLED
    // Clear the per-light visibility bits consumed by FinePruneLights.
    if(t<LIGHT_FPTL_VISIBILITY_DWORD_COUNTS)
        ldsDoesLightIntersect[t] = 0;
#endif

#if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
#endif

    int iNrCoarseLights = min(lightOffs,LIGHT_LIST_MAX_COARSE_ENTRIES);

    // Stage 4: refinement.
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
    // The sphere test is evaluated at the tile centre, clamped onto the screen.
    iNrCoarseLights = SphericalIntersectionTests( t, iNrCoarseLights, float2(min(viTilLL.xy+uint2(16/2,16/2), uint2(iWidth-1, iHeight-1))) );
#endif

#ifndef FINE_PRUNING_ENABLED
    {
        // No fine pruning: the coarse list is the final list.
        UNITY_UNROLL
        for(i=t; i<LIGHT_LIST_MAX_COARSE_ENTRIES; i+=NR_THREADS)
        {
            if(i<iNrCoarseLights)
                prunedList[i] = coarseList[i];
        }
        if(t==0)
            ldsNrLightsFinal=iNrCoarseLights;
    }
#else
    {
        // initializes ldsNrLightsFinal with the number of accepted lights.
        // all accepted entries delivered in prunedList[].
        FinePruneLights(t, iNrCoarseLights, viTilLL, vLinDepths);
    }
#endif

    // Stage 5: reset the per-category counters (and feature flags) before accumulating into them.
    if(t<CATEGORY_LIST_SIZE)
        ldsCategoryListCount[t]=0;
#ifdef USE_FEATURE_FLAGS
    if(t==0)
        ldsFeatureFlags=0;
#endif

#if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
#endif

    int nrLightsCombinedList = min(ldsNrLightsFinal,LIGHT_LIST_MAX_COARSE_ENTRIES);
    for(i=t; i<nrLightsCombinedList; i+=NR_THREADS)
    {
        const int lightBoundIndex = GenerateLightCullDataIndex(prunedList[i], g_iNrVisibLights, unity_StereoEyeIndex);

        InterlockedAdd(ldsCategoryListCount[_LightVolumeData[lightBoundIndex].lightCategory], 1);
#ifdef USE_FEATURE_FLAGS
        InterlockedOr(ldsFeatureFlags, _LightVolumeData[lightBoundIndex].featureFlags);
#endif
    }

    // sort lights (gives a more efficient execution in both deferred and tiled forward lighting).
#if NR_THREADS > PLATFORM_LANE_COUNT
    SORTLIST(prunedList, nrLightsCombinedList, LIGHT_LIST_MAX_COARSE_ENTRIES, t, NR_THREADS);
    //MERGESORTLIST(prunedList, coarseList, nrLightsCombinedList, t, NR_THREADS);
#endif

#ifdef USE_FEATURE_FLAGS
    if(t == 0)
    {
        uint featureFlags = ldsFeatureFlags | g_BaseFeatureFlags;

        // The depth range stays inverted (max < min) when no pixel passed the "not skydome" test above,
        // i.e. the whole tile is background.
        if(ldsZMax < ldsZMin)
        {
            // There is no stencil usage with compute path, featureFlags set to 0 is use to have fast rejection of tile in this case. It will still execute but will do nothing
            featureFlags = 0;
        }

        g_TileFeatureFlags[tileIDX.y * nrTilesX + tileIDX.x + unity_StereoEyeIndex * nrTilesX * nrTilesY] = featureFlags;
    }
#endif

    // Stage 6: write lights to global buffers.
    int categoryStart=0;                                // first prunedList entry of the current category
    int listOffset = tileIDX.y*nrTilesX + tileIDX.x;    // tile slot inside the current category region

#if defined(UNITY_STEREO_INSTANCING_ENABLED)
    // Eye base offset must match code in GetCountAndStartTile()
    listOffset += unity_StereoEyeIndex * nrTilesX * nrTilesY * LIGHTCATEGORY_COUNT;
#endif

    // All our cull data are in the same list, but at render time envLights are separated so we need to shift the index
    // to make it work correctly
    int shiftIndex[CATEGORY_LIST_SIZE];
    ZERO_INITIALIZE_ARRAY(int, shiftIndex, CATEGORY_LIST_SIZE);

    shiftIndex[LIGHTCATEGORY_ENV] = _EnvLightIndexShift;
    shiftIndex[LIGHTCATEGORY_DECAL] = _DecalIndexShift;

    for(int category=0; category<CATEGORY_LIST_SIZE; category++)
    {
        int nrLightsFinal = ldsCategoryListCount[category];
        int nrLightsFinalClamped = min(nrLightsFinal, SHADEROPTIONS_FPTLMAX_LIGHT_COUNT);

        // One 16-bit slot for the count plus one per light, two slots per dword (rounded up).
        const int nrDWords = ((nrLightsFinalClamped+1)+1)>>1;
        for(int dw=(int) t; dw<(int) nrDWords; dw += NR_THREADS)
        {
            // We remap the prunedList index to the original LightData / EnvLightData indices
            uint lowHalf;
            if(dw==0)
                lowHalf = nrLightsFinalClamped;     // the first slot of a tile's list holds the light count
            else
                lowHalf = prunedList[max(0,2 * dw - 1 + categoryStart)] - shiftIndex[category];
            uint highHalf = prunedList[2 * dw + 0 + categoryStart] - shiftIndex[category];

            g_vLightList[LIGHT_DWORD_PER_FPTL_TILE*listOffset + dw] = (lowHalf&0xffff) | (highHalf<<16);
        }

        // Advance to the next category: its lights follow in prunedList and its region follows in g_vLightList.
        categoryStart += nrLightsFinal;
        listOffset += (nrTilesX*nrTilesY);
    }
}
|
|
|
|
|
|
|
|
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
|
|
// Compacts coarseList in place, keeping only lights for which DoesSphereOverlapTile() succeeds.
//   threadID         : group-local thread index.
//   iNrCoarseLights  : number of valid entries in coarseList on entry.
//   screenCoordinate : screen position at which the tile is sampled (the caller passes the tile centre).
// Returns the number of entries left in coarseList. prunedList is used as scratch storage.
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate)
{
    // Thread 0 resets the shared write cursor for the compacted list.
    if(threadID==0)
        lightOffsSph = 0;

    // Back up coarseList into prunedList so coarseList can be overwritten while it is being filtered.
    for(int src=(int) threadID; src<iNrCoarseLights; src+=NR_THREADS)
    {
        prunedList[src]=coarseList[src];
    }

#if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
#endif

    // Position for 'screenCoordinate' at unit depth; the sign of the depth follows the camera-space handedness.
#if USE_LEFT_HAND_CAMERA_SPACE
    const float unitDepth = 1.0;
#else
    const float unitDepth = -1.0;
#endif
    const float3 tileCentreAtUnitDepth = GetViewPosFromLinDepth( screenCoordinate, unitDepth);

    const float onePixDiagDist = GetOnePixDiagWorldDistAtDepthOne();
    const float halfTileSizeAtZDistOne = 8*onePixDiagDist;        // scale by half a tile

    for(int slot=(int) threadID; slot<iNrCoarseLights; slot+=NR_THREADS)
    {
        const uint candidate = prunedList[slot];
        const int lightBoundIndex = GenerateLightCullDataIndex(candidate, g_iNrVisibLights, unity_StereoEyeIndex);
        SFiniteLightBound lightData = g_data[lightBoundIndex];

        if( DoesSphereOverlapTile(tileCentreAtUnitDepth, halfTileSizeAtZDistOne, lightData.center.xyz, lightData.radius, g_isOrthographic!=0) )
        {
            unsigned int dst;
            InterlockedAdd(lightOffsSph, 1, dst);
            coarseList[dst]=prunedList[slot];       // read from the original copy of coarseList which is backed up in prunedList
        }
    }

#if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
#endif

    return lightOffsSph;
}
|
|
#endif
|
|
|
|
|
|
#ifdef FINE_PRUNING_ENABLED
|
|
// Maps coarse-list slot 'l' to its light cull-data index for the current eye.
// Slots at or beyond iNrCoarseLights yield 0.
int GetCoarseLightIndex(int l, int iNrCoarseLights)
{
    int cullDataIndex = 0;
    if (l < iNrCoarseLights)
    {
        cullDataIndex = GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, unity_StereoEyeIndex);
    }
    return cullDataIndex;
}
|
|
|
|
groupshared uint s_lightVolumesCache[LIGHT_LIST_MAX_COARSE_ENTRIES];
|
|
|
|
// Packs a light's cull-data index and volume type into s_lightVolumesCache[lightIndex].
// Layout: bits 0..2 hold the volume type, bits 3..31 hold the coarse index.
void StoreLightVolumeCache(int lightIndex, int coarseIndex, int volumeType)
{
    // 3 bits for the volume type, in case we have a corrupted one we can early out of the switch statement.
    const uint typeBits = (uint)(volumeType & 0x7);
    // 29 bits for a coarse light index.
    const uint indexBits = (uint)(coarseIndex << 3);

    s_lightVolumesCache[lightIndex] = typeBits | indexBits;
}
|
|
|
|
// Unpacks the entry written by StoreLightVolumeCache for 'lightIndex'.
//   coarseIndex : receives bits 3..31 of the cached dword.
//   volumeType  : receives bits 0..2 of the cached dword.
void LoadLightVolumeCache(int lightIndex, out int coarseIndex, out int volumeType)
{
    const uint packed = s_lightVolumesCache[lightIndex];

    volumeType = (int)(packed & 0x7);
    coarseIndex = (int)(packed >> 3);
}
|
|
|
|
// initializes ldsNrLightsFinal with the number of accepted lights.
|
|
// all accepted entries delivered in prunedList[].
|
|
// Per-pixel ("fine") pruning of the coarse light list of one tile.
//   threadID        : group-local thread index.
//   iNrCoarseLights : number of valid entries in coarseList.
//   viTilLL         : pixel coordinate of the tile's first pixel.
//   vLinDepths      : linear depths of the PIXEL_PER_THREAD pixels owned by this thread.
// A light is accepted when at least one pixel of the tile passes the test of its volume type
// (cone, sphere or box). Accepted light indices are written to prunedList[] and their count to
// ldsNrLightsFinal; the caller must synchronise the group before reading either.
#if PIXEL_PER_THREAD == 4
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths) // keep float4 vectorization when possible, as shader compiler may generate bad code for array of floats.
#else
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vLinDepths[PIXEL_PER_THREAD])
#endif
{
    uint t = threadID;
    uint iWidth = g_viDimensions.x;
    uint iHeight = g_viDimensions.y;

    // Per-thread visibility bits, one bit per coarse-list slot.
    uint uLightsFlags[LIGHT_FPTL_VISIBILITY_DWORD_COUNTS];
    {
        [unroll(LIGHT_FPTL_VISIBILITY_DWORD_COUNTS)]
        for (uint ii = 0; ii < LIGHT_FPTL_VISIBILITY_DWORD_COUNTS; ++ii)
            uLightsFlags[ii] = 0u;
    }

    int l=0;
    // need this outer loop even on xb1 and ps4 since direct lights and
    // reflection lights are kept in separate regions.

    // Cache (cull-data index, volume type) of every coarse light in LDS, one light per thread per pass.
    {
        #define MAX_FINE_PRUNE_LOOP_CNT (((SHADEROPTIONS_FPTLMAX_LIGHT_COUNT+1) + NR_THREADS - 1)/NR_THREADS)
        [unroll(MAX_FINE_PRUNE_LOOP_CNT)]
        for (uint it = 0; it < MAX_FINE_PRUNE_LOOP_CNT; ++it)
        {
            uint i = t + it * NR_THREADS;
            if (i < (uint)iNrCoarseLights)
            {
                int idxCoarse = GetCoarseLightIndex((int)i, iNrCoarseLights);
                int uLightVolume = (int)_LightVolumeData[idxCoarse].lightVolume;
                StoreLightVolumeCache(i, idxCoarse, uLightVolume);
            }
        }
    }

#if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
#endif

    // When using LDS to cache the volume data, this produces the most optimal code.
    // Doing a manual loop like the one below adds an extra cost of .1 ms on ps4 if we use LDS.
    // Every thread walks all coarse lights and tests them against its own pixels.
    for (; l < iNrCoarseLights; ++l)
    {
        int idxCoarse;
        int uLightVolume;
        LoadLightVolumeCache(l, idxCoarse, uLightVolume);

        // WARNING: we use here a uint for lightValid because there is a bug with the unity vulkan compiler.
        // If this is a bool, the second dword of uLightsFlags never gets written to, which causes light tile artifacts
        // on tiles that have more than 32 lights.
        uint lightValid = 0;
        if (uLightVolume == LIGHTVOLUMETYPE_CONE)
        {
            LightVolumeData lightData = _LightVolumeData[idxCoarse];
            const bool bIsSpotDisc = true;  // (lightData.flags&IS_CIRCULAR_SPOT_SHAPE) != 0;
            for(int i=0; i<PIXEL_PER_THREAD; i++)
            {
                int idx = t + i*NR_THREADS;

                uint2 uPixLoc = min(uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1));
                float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);

                // check pixel
                float3 fromLight = vVPos-lightData.lightPos.xyz;
                float distSq = dot(fromLight,fromLight);
                const float fSclProj = dot(fromLight, lightData.lightAxisZ.xyz);        // spotDir = lightData.lightAxisZ.xyz

                float2 V = abs( float2( dot(fromLight, lightData.lightAxisX.xyz), dot(fromLight, lightData.lightAxisY.xyz) ) );

                float fDist2D = bIsSpotDisc ? length(V) : max(V.x,V.y);
                // Inside the light's range AND inside the cone.
                bool validInPixel = all( float2(lightData.radiusSq, fSclProj) > float2(distSq, fDist2D*lightData.cotan) );
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
                //a wave is on the same tile, and the loop is uniform for the wave.
                // thus we early out if at least 1 thread in the wave passed this light, saving some ALU.
                lightValid = WaveActiveAnyTrue(validInPixel);
#else
                lightValid = validInPixel;
#endif
                if (lightValid)
                    break;
            }
        }
        else if (uLightVolume == LIGHTVOLUMETYPE_SPHERE)
        {
            LightVolumeData lightData = _LightVolumeData[idxCoarse];
            for(int i=0; i<PIXEL_PER_THREAD; i++)
            {
                int idx = t + i*NR_THREADS;

                uint2 uPixLoc = min(uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1));
                float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);

                // check pixel
                float3 vLp = lightData.lightPos.xyz;
                float3 toLight = vLp - vVPos;
                float distSq = dot(toLight,toLight);

                bool validInPixel = lightData.radiusSq>distSq;
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
                lightValid = WaveActiveAnyTrue(validInPixel);
#else
                lightValid = validInPixel;
#endif
                if (lightValid)
                    break;
            }
        }
        else if (uLightVolume == LIGHTVOLUMETYPE_BOX)
        {
            LightVolumeData lightData = _LightVolumeData[idxCoarse];
            for(int i=0; i<PIXEL_PER_THREAD; i++)
            {
                int idx = t + i*NR_THREADS;

                uint2 uPixLoc = min(uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1));
                float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);

                // check pixel
                float3 toLight = lightData.lightPos.xyz - vVPos;

                float3 dist = float3( dot(toLight, lightData.lightAxisX), dot(toLight, lightData.lightAxisY), dot(toLight, lightData.lightAxisZ) );
                dist = (abs(dist) - lightData.boxInnerDist) * lightData.boxInvRange;        // not as efficient as it could be
                bool validInPixel = max(max(dist.x, dist.y), dist.z)<1;                     // but allows us to not write out OuterDists
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
                lightValid = WaveActiveAnyTrue(validInPixel);
#else
                lightValid = validInPixel;
#endif
                if (lightValid)
                    break;
            }
        }
        else
            break;  // unknown volume type: stop processing the remaining lights

        // Implicit division by 32, to pick the correct array index.
        // E.g. the 37th light divided by 32 = 1.15 (rounded to 1), so we pick uLightsFlags[1] (which represents the lights from 32 to 64).
        uLightsFlags[l >> 5] |= lightValid << (l&31);
    }

    // Merge results from all threads into shared memory.
    // `InterlockedOr` performs a bitwise OR between `ldsDoesLightIntersect` and `uLightsFlags`.
    // This allows multiple threads to update `ldsDoesLightIntersect` without collision.
    {
        [unroll(LIGHT_FPTL_VISIBILITY_DWORD_COUNTS)]
        for (uint ii = 0; ii < LIGHT_FPTL_VISIBILITY_DWORD_COUNTS; ++ii)
            InterlockedOr(ldsDoesLightIntersect[ii], uLightsFlags[ii]);
    }

    // Reset the total number of lights for the tile.
    // This must happen BEFORE the barrier below. If the reset is done after it, a thread of another wave
    // can already have executed the InterlockedAdd on ldsNrLightsFinal further down when thread 0 writes
    // the zero, which loses counts and makes later lights overwrite earlier prunedList entries.
    if (t == 0)
        ldsNrLightsFinal = 0;

    // For some platforms we always need GroupMemoryBarrierWithGroupSync() otherwise results are incorrect.
    // Reason is under investigation, related discussions:
    // https://unity.slack.com/archives/C02C8FWPNHE/p1704321597295329
    // https://unity.slack.com/archives/G3JUQKYV8/p1705081617447289
#if NR_THREADS > PLATFORM_LANE_COUNT || defined(SHADER_API_SWITCH) || defined(SHADER_API_SWITCH2)
    GroupMemoryBarrierWithGroupSync();
#endif

    {
        // Split the job into multiple passes to ensure all lights are processed even if NR_THREADS is smaller than SHADEROPTIONS_FPTLMAX_LIGHT_COUNT.
        // A thread will possibly process many lights.
        #define MAX_LIGHT_WRITE_LOOP_CNT (((SHADEROPTIONS_FPTLMAX_LIGHT_COUNT+1) + NR_THREADS - 1)/NR_THREADS)
        [unroll(MAX_LIGHT_WRITE_LOOP_CNT)]
        for (uint it = 0; it < MAX_LIGHT_WRITE_LOOP_CNT; ++it)
        {
            // Retrieve the light index for the current thread and current iteration.
            uint lightIndex = t + it * NR_THREADS;

            // Fetch the 32-light visibility block that contains the current light.
            uint lightsMask = ldsDoesLightIntersect[lightIndex >> 5];

            // Select only the bit of the current light in the block of 32.
            uint localMask = (1u << (lightIndex & 31));

            // If the thread index is in the light list and the light's visibility bit is set.
            if (lightIndex < (uint)iNrCoarseLights && (localMask & lightsMask) != 0u)
            {
                // ldsDoesLightIntersect[k] contains the valid lights for each block of 32 lights.
                // We sum the number of enabled bits (countbits()) in all blocks before the block of the current light.
                // backOffset represents the number of valid lights before the block where the light is located.
                uint backOffset = 0;
                [unroll(LIGHT_FPTL_VISIBILITY_DWORD_COUNTS)]
                for (uint k = 0u; k < LIGHT_FPTL_VISIBILITY_DWORD_COUNTS; ++k)
                    if (k < (lightIndex >> 5))
                        backOffset += countbits(ldsDoesLightIntersect[k]);

                // Count the number of valid lights (set bits) in the current 32-bit block before the current lightIndex.
                uint lightsInActualBlock = countbits((localMask - 1u) & lightsMask);

                // Rank of the current light among the accepted lights (how many accepted lights precede it).
                // It is only used as a capacity check below; the actual slot comes from the atomic counter.
                uint uIndex = backOffset + lightsInActualBlock;

                // Ensure the computed index does not exceed the max allowed light entries.
                if (uIndex < LIGHT_LIST_MAX_COARSE_ENTRIES)
                {
                    // InterlockedAdd hands every accepted light a unique slot in prunedList, preventing threads from overwriting each other's values.
                    // Increment ldsNrLightsFinal (represents the total number of lights for the tile).
                    unsigned int uInc = 1;
                    unsigned int finalPrunedLightIndex;
                    InterlockedAdd(ldsNrLightsFinal, uInc, finalPrunedLightIndex);

                    // Add the light to the prune list. If the index varies due to desynchronization (e.g without the previous InterlockedAdd),
                    // it can cause flickering (mostly seen on Metal and Apple GPUs).
                    if (finalPrunedLightIndex < LIGHT_LIST_MAX_COARSE_ENTRIES)
                        prunedList[finalPrunedLightIndex] = coarseList[lightIndex];
                }
            }
        }
    }
}
|
|
#endif
|