#ifndef UNITY_HD_SHADOW_LOOP_HLSL
#define UNITY_HD_SHADOW_LOOP_HLSL

// Accumulation mode. Enable at most one of the two defines below; with neither enabled the
// loop keeps the per-channel minimum of all evaluated shadow values.
//#define SHADOW_LOOP_MULTIPLY
//#define SHADOW_LOOP_AVERAGE

#if defined(SHADOW_LOOP_MULTIPLY) || defined(SHADOW_LOOP_AVERAGE)
#define SHADOW_LOOP_WEIGHT
#endif

// Evaluates the shadows of the directional, punctual and rectangle area lights selected by
// featureFlags at the given position and combines them into a single value.
//
//   shadowContext : shadow data forwarded to the Get*ShadowAttenuation helpers.
//   posInput      : position of the shaded point (positionSS / positionWS are read; positionWS
//                   is first adjusted by ApplyCameraRelativeXR on the local copy).
//   normalWS      : normal forwarded to the shadow attenuation helpers.
//   featureFlags  : LIGHTFEATUREFLAGS_* bits choosing which light categories are visited.
//   renderLayer   : tested against each punctual/area light with IsMatchingLightLayer.
//   shadow        : (out) combined result. Default mode: min over lights, starting from 1.
//                   SHADOW_LOOP_MULTIPLY: product of tinted shadows. SHADOW_LOOP_AVERAGE: mean
//                   of tinted shadows. In the two weighted modes the result is 1 when no light
//                   contributed.
void ShadowLoopMin(HDShadowContext shadowContext, PositionInputs posInput, float3 normalWS, uint featureFlags, uint renderLayer,
                   out float3 shadow)
{
#ifdef SHADOW_LOOP_WEIGHT
    // Number of lights that contributed; used to normalize / detect the "no light" case at the end.
    float shadowCount = 0.0f;
#endif

#ifdef SHADOW_LOOP_MULTIPLY
    shadow = float3(1, 1, 1);
#elif defined(SHADOW_LOOP_AVERAGE)
    shadow = float3(0, 0, 0);
#else
    shadow = float3(1, 1, 1);
#endif

    // With XR single-pass and camera-relative: offset position to do lighting computations from the combined center view (original camera matrix).
    // This is required because there is only one list of lights generated on the CPU. Shadows are also generated once and shared between the instanced views.
    ApplyCameraRelativeXR(posInput.positionWS);

    // First of all we compute the shadow value of the directional light to reduce the VGPR pressure
    if (featureFlags & LIGHTFEATUREFLAGS_DIRECTIONAL)
    {
        // Evaluate sun shadows.
        if (_DirectionalShadowIndex >= 0)
        {
            DirectionalLightData light = _DirectionalLightDatas[_DirectionalShadowIndex];

            // TODO: this will cause us to load from the normal buffer first. Does this cause a performance problem?
            float3 wi = -light.forward;

            // Is it worth sampling the shadow map?
            if (light.lightDimmer > 0 && light.shadowDimmer > 0)
            {
                SHADOW_TYPE shadowD = 1.0;

#if defined(SCREEN_SPACE_SHADOWS_ON) && !defined(_SURFACE_TYPE_TRANSPARENT)
                // Prefer the screen space shadow when this light has a valid slot.
                if ((light.screenSpaceShadowIndex & SCREEN_SPACE_SHADOW_INDEX_MASK) != INVALID_SCREEN_SPACE_SHADOW)
                {
                    shadowD = GetScreenSpaceColorShadow(posInput, light.screenSpaceShadowIndex).SHADOW_TYPE_SWIZZLE;
                }
                else
#endif
                {
                    shadowD = GetDirectionalShadowAttenuation(shadowContext,
                                                              posInput.positionSS, posInput.positionWS, normalWS,
                                                              light.shadowIndex, wi);
                }

#ifdef SHADOW_LOOP_MULTIPLY
                shadow *= lerp(light.shadowTint, float3(1, 1, 1), shadowD);
#elif defined(SHADOW_LOOP_AVERAGE)
                shadow += lerp(light.shadowTint, float3(1, 1, 1), shadowD);
#else
                shadow = min(shadow, shadowD.SHADOW_TYPE_SWIZZLE);
#endif

#ifdef SHADOW_LOOP_WEIGHT
                shadowCount += 1.0f;
#endif
            }
        }
    }

    if (featureFlags & LIGHTFEATUREFLAGS_PUNCTUAL)
    {
        uint lightCount, lightStart;

#ifndef LIGHTLOOP_DISABLE_TILE_AND_CLUSTER
        GetCountAndStart(posInput, LIGHTCATEGORY_PUNCTUAL, lightStart, lightCount);
#else   // LIGHTLOOP_DISABLE_TILE_AND_CLUSTER
        lightCount = _PunctualLightCount;
        lightStart = 0;
#endif

        bool fastPath = false;
        uint lightStartLane0;
        fastPath = IsFastPath(lightStart, lightStartLane0);

        if (fastPath)
        {
            lightStart = lightStartLane0;
        }

        // Scalarized loop. All lights that are in a tile/cluster touched by any pixel in the wave are loaded (scalar load), only the one relevant to current thread/pixel are processed.
        // For clarity, the following code will follow the convention: variables starting with s_ are meant to be wave uniform (meant for scalar register),
        // v_ are variables that might have different value for each thread in the wave (meant for vector registers).
        // This will perform more loads than it is supposed to, however, the benefits should offset the downside, especially given that light data accessed should be largely coherent.
        // Note that the above is valid only if wave intrinsics are supported.
        uint v_lightListOffset = 0;
        uint v_lightIdx = lightStart;

#if NEED_TO_CHECK_HELPER_LANE
        // On some platform helper lanes don't behave as we'd expect, therefore we prevent them from entering the loop altogether.
        // IMPORTANT! This has implications if ddx/ddy is used on results derived from lighting, however given Lightloop is called in compute we should be
        // sure it will not happen.
        bool isHelperLane = WaveIsHelperLane();
        while (!isHelperLane && v_lightListOffset < lightCount)
#else
        while (v_lightListOffset < lightCount)
#endif
        {
            v_lightIdx = FetchIndex(lightStart, v_lightListOffset);
            uint s_lightIdx = ScalarizeElementIndex(v_lightIdx, fastPath);
            // -1 converts to the maximum uint value, which is treated here as "stop iterating".
            if (s_lightIdx == -1)
                break;

            LightData s_lightData = FetchLight(s_lightIdx);

            // If current scalar and vector light index match, we process the light. The v_lightListOffset for current thread is increased.
            // Note that the following should really be ==, however, since helper lanes are not considered by WaveActiveMin, such helper lanes could
            // end up with a unique v_lightIdx value that is smaller than s_lightIdx hence being stuck in a loop. All the active lanes will not have this problem.
            if (s_lightIdx >= v_lightIdx)
            {
                v_lightListOffset++;

                if (IsMatchingLightLayer(s_lightData.lightLayers, renderLayer) &&
                    s_lightData.shadowIndex >= 0 &&
                    s_lightData.shadowDimmer > 0)
                {
                    float shadowP;
                    float3 L;
                    float4 distances; // {d, d^2, 1/d, d_proj}
                    GetPunctualLightVectors(posInput.positionWS, s_lightData, L, distances);

                    // Projector lights (box, pyramid) always have cookies, so we can perform clipping inside the if().
                    float lightinBounds = 1.0;
                    if (s_lightData.lightType == GPULIGHTTYPE_PROJECTOR_PYRAMID || s_lightData.lightType == GPULIGHTTYPE_PROJECTOR_BOX)
                    {
                        float3 lightToSample = posInput.positionWS - s_lightData.positionRWS;
                        float3x3 lightToWorld = float3x3(s_lightData.right, s_lightData.up, s_lightData.forward);
                        float3 positionLS = mul(lightToSample, transpose(lightToWorld));

                        // Perform orthographic or perspective projection.
                        float perspectiveZ = (s_lightData.lightType != GPULIGHTTYPE_PROJECTOR_BOX) ? positionLS.z : 1.0;
                        float2 positionCS = positionLS.xy / perspectiveZ;

                        float z = positionLS.z;
                        float r = s_lightData.range;

                        // Box lights have no range attenuation, so we must clip manually.
                        lightinBounds = Max3(abs(positionCS.x), abs(positionCS.y), abs(z - 0.5 * r) - 0.5 * r + 1) <= s_lightData.boxLightSafeExtent ? 1 : 0;
                    }

                    // Only evaluate the shadow where the light can actually reach the shaded point.
                    if (distances.x < s_lightData.range &&
                        PunctualLightAttenuation(distances, s_lightData.rangeAttenuationScale, s_lightData.rangeAttenuationBias,
                                                 s_lightData.angleScale, s_lightData.angleOffset) > 0.0 &&
                        lightinBounds > 0.0 &&
                        L.y > 0.0)
                    {
#if defined(SCREEN_SPACE_SHADOWS_ON) && !defined(_SURFACE_TYPE_TRANSPARENT)
                        if ((s_lightData.screenSpaceShadowIndex & SCREEN_SPACE_SHADOW_INDEX_MASK) != INVALID_SCREEN_SPACE_SHADOW)
                        {
                            shadowP = GetScreenSpaceShadow(posInput, s_lightData.screenSpaceShadowIndex);
                        }
                        else
#endif
                        {
                            shadowP = GetPunctualShadowAttenuation(shadowContext, posInput.positionSS, posInput.positionWS, normalWS, s_lightData.shadowIndex, L, distances.x, s_lightData.lightType == GPULIGHTTYPE_POINT, s_lightData.lightType != GPULIGHTTYPE_PROJECTOR_BOX);
                            shadowP = s_lightData.nonLightMappedOnly ? min(1.0f, shadowP) : shadowP;
                        }

                        // Fade the shadow towards "unshadowed" according to the light's shadow dimmer.
                        shadowP = lerp(1.0f, shadowP, s_lightData.shadowDimmer);

#ifdef SHADOW_LOOP_MULTIPLY
                        shadow *= lerp(s_lightData.shadowTint, float3(1, 1, 1), shadowP);
#elif defined(SHADOW_LOOP_AVERAGE)
                        shadow += lerp(s_lightData.shadowTint, float3(1, 1, 1), shadowP);
#else
                        shadow = min(shadow, shadowP.xxx);
#endif

#ifdef SHADOW_LOOP_WEIGHT
                        shadowCount += 1.0f;
#endif
                    }
                }
            }
        }
    }

    if (featureFlags & LIGHTFEATUREFLAGS_AREA)
    {
        uint lightCount, lightStart;

#ifndef LIGHTLOOP_DISABLE_TILE_AND_CLUSTER
        GetCountAndStart(posInput, LIGHTCATEGORY_AREA, lightStart, lightCount);
#else
        lightCount = _AreaLightCount;
        lightStart = _PunctualLightCount;
#endif

        // COMPILER BEHAVIOR WARNING!
        // If rectangle lights are before line lights, the compiler will duplicate light matrices in VGPR because they are used differently between the two types of lights.
        // By keeping line lights first we avoid this behavior and save substantial register pressure.
        // TODO: This is based on the current Lit.shader and can be different for any other way of implementing area lights, how to be generic and ensure performance ?

        uint i;
        if (lightCount > 0)
        {
            i = 0;

            uint last = lightCount - 1;
            LightData lightData = FetchLight(lightStart, i);

            // Tube lights are skipped: no shadow is evaluated for them in this loop.
            while (i <= last && lightData.lightType == GPULIGHTTYPE_TUBE)
            {
                lightData = FetchLight(lightStart, min(++i, last));
            }

            while (i <= last) // GPULIGHTTYPE_RECTANGLE
            {
                lightData.lightType = GPULIGHTTYPE_RECTANGLE; // Enforce constant propagation

                float shadowArea = 1.0f;

                // If the point to shade is in the positive hemisphere of the area light, we can read the shadow.
                if (dot(lightData.forward, posInput.positionWS) > dot(lightData.forward, lightData.positionRWS))
                {
                    if (IsMatchingLightLayer(lightData.lightLayers, renderLayer))
                    {
#if defined(SCREEN_SPACE_SHADOWS_ON) && !defined(_SURFACE_TYPE_TRANSPARENT)
                        if ((lightData.screenSpaceShadowIndex & SCREEN_SPACE_SHADOW_INDEX_MASK) != INVALID_SCREEN_SPACE_SHADOW)
                        {
                            shadowArea = GetScreenSpaceShadow(posInput, lightData.screenSpaceShadowIndex);
                        }
                        else
#endif
                        if (lightData.shadowIndex >= 0)
                        {
                            float3 L;
                            float4 distances; // {d, d^2, 1/d, d_proj}
                            GetPunctualLightVectors(posInput.positionWS, lightData, L, distances);

                            float coef = 0.0f;
                            float3 unL = lightData.positionRWS - posInput.positionWS;
                            if (dot(lightData.forward, unL) < FLT_EPS)
                            {
                                // Express the shaded-point-to-light vector in the light's frame to evaluate the range falloff.
                                float3x3 lightToWorld = float3x3(lightData.right, lightData.up, -lightData.forward);
                                unL = mul(unL, transpose(lightToWorld));

                                float halfWidth  = lightData.size.x * 0.5;
                                float halfHeight = lightData.size.y * 0.5;

                                float range = lightData.range;
                                float3 invHalfDim = rcp(float3(range + halfWidth,
                                                               range + halfHeight,
                                                               range));

                                coef = EllipsoidalDistanceAttenuation(unL, invHalfDim,
                                                                      lightData.rangeAttenuationScale,
                                                                      lightData.rangeAttenuationBias);
                            }

                            if (distances.x < lightData.range && coef > 0.0)
                            {
                                // Pass the light vector and distance that GetPunctualLightVectors produced for the shaded
                                // position, exactly as the punctual path above does. lightData.positionRWS is not translated
                                // to the shaded point anywhere in this function, so normalize()/length() of it would describe
                                // the light relative to the origin of the relative space instead of relative to the sample.
                                // NOTE(review): presumably L is the normalized sample-to-light direction and distances.x its
                                // length - confirm against GetPunctualLightVectors.
                                shadowArea = GetRectAreaShadowAttenuation(shadowContext, posInput.positionSS, posInput.positionWS, normalWS, lightData.shadowIndex, L, distances.x);
                            }
                        }
                    }

                    // Lights facing the point contribute even when no shadow was read (shadowArea stays 1).
#ifdef SHADOW_LOOP_MULTIPLY
                    shadow *= lerp(lightData.shadowTint, float3(1, 1, 1), shadowArea);
#elif defined(SHADOW_LOOP_AVERAGE)
                    shadow += lerp(lightData.shadowTint, float3(1, 1, 1), shadowArea);
#else
                    shadow = min(shadow, shadowArea.xxx);
#endif

#ifdef SHADOW_LOOP_WEIGHT
                    shadowCount += 1.0f;
#endif
                }

                lightData = FetchLight(lightStart, min(++i, last));
            }
        }
    }

#ifdef SHADOW_LOOP_MULTIPLY
    // No contributing light: report fully unshadowed.
    if (shadowCount == 0.0f)
    {
        shadow = float3(1, 1, 1);
    }
#elif defined(SHADOW_LOOP_AVERAGE)
    // Turn the accumulated sum into a mean; with no contributing light report fully unshadowed.
    if (shadowCount > 0.0f)
    {
        shadow /= shadowCount;
    }
    else
    {
        shadow = float3(1, 1, 1);
    }
#endif
}

#endif