#define HAMMERSLEY_USE_CB

#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/ShaderLibrary/ShaderVariables.hlsl"
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Color.hlsl"
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/SphericalHarmonics.hlsl"
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Sampling/Hammersley.hlsl"
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Sampling/Sampling.hlsl"

#pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch switch2

// Regular ambient probe convolution
// Always use mips and output separate diffuse buffer.
// Volumetric output as an option
#pragma kernel AmbientProbeConvolutionDiffuse KERNEL_NAME=AmbientProbeConvolutionDiffuse OUTPUT_DIFFUSE
#pragma kernel AmbientProbeConvolutionVolumetric KERNEL_NAME=AmbientProbeConvolutionVolumetric OUTPUT_VOLUMETRIC
#pragma kernel AmbientProbeConvolutionDiffuseVolumetric KERNEL_NAME=AmbientProbeConvolutionDiffuseVolumetric OUTPUT_VOLUMETRIC OUTPUT_DIFFUSE

// Ambient probe convolution for clouds.
// Does not write the raw or diffuse probe buffers; only the phase-function-convolved result is packed
// into _VolumetricAmbientProbeOutputBuffer. Does not use input texture mips.
#pragma kernel AmbientProbeConvolutionClouds KERNEL_NAME=AmbientProbeConvolutionClouds NO_MIPS OUTPUT_CLOUDS

// Input Cubemap
TEXTURECUBE(_AmbientProbeInputCubemap);

// Output buffers
// Raw convolved SH: 27 scalar coefficients (3 channels x 9 coefficients), indexed as [channel * 9 + coeff].
RWStructuredBuffer<float> _AmbientProbeOutputBuffer;
// Packed SH (7 float4 each), written by PackSHFromScratchBuffer.
RWStructuredBuffer<float4> _VolumetricAmbientProbeOutputBuffer;
RWStructuredBuffer<float4> _DiffuseAmbientProbeOutputBuffer;

// If we use local VGPRs as a scratch buffer we end up using too many registers.
// To avoid that we go through memory.
// This is quite messy and bad for performance but this shader should never be critical so it should be fine.
// Uint is used as it's the only format supported everywhere as read/write from same thread.
RWStructuredBuffer<uint> _ScratchBuffer;

uniform float4 _FogParameters;

#define _FogDimmer      _FogParameters.x
#define _FogAnisotropy  _FogParameters.y

#define SAMPLE_COUNT 256
#define SH_COEFF_COUNT 27

#if defined(PLATFORM_SUPPORTS_WAVE_INTRINSICS) && defined(PLATFORM_LANE_COUNT)
// Allocate space to accumulate all waves result. We need space for each single wavefront (because we can't atomic add floats)
groupshared float outputSHCoeffsLDS[SH_COEFF_COUNT * SAMPLE_COUNT / PLATFORM_LANE_COUNT];
#else
// Allocate space for parallel reduction (so half the number of samples).
groupshared float outputSHCoeffsLDS[SH_COEFF_COUNT * SAMPLE_COUNT / 2];
#endif

// Packs the 27 coefficients stored in _ScratchBuffer (3 channels x 9 coefficients, kept as uint bit
// patterns of floats) into 7 float4 entries of 'buffer':
//   buffer[0..2] : per channel (c3, c1, c2, c0 - c6)
//   buffer[3..5] : per channel (c4, c5, 3 * c6, c7)
//   buffer[6]    : (c8 of channel 0, c8 of channel 1, c8 of channel 2, 1)
void PackSHFromScratchBuffer(RWStructuredBuffer<float4> buffer)
{
    int c = 0;

    // Linear terms in xyz, constant term (minus the c6 contribution) in w
    for (c = 0; c < 3; c++)
    {
        buffer[c] = float4(asfloat(_ScratchBuffer[c * 9 + 3]),
                           asfloat(_ScratchBuffer[c * 9 + 1]),
                           asfloat(_ScratchBuffer[c * 9 + 2]),
                           asfloat(_ScratchBuffer[c * 9 + 0]) - asfloat(_ScratchBuffer[c * 9 + 6]));
    }

    // Quadratic (4/5)
    for (c = 0; c < 3; c++)
    {
        buffer[3 + c] = float4(asfloat(_ScratchBuffer[c * 9 + 4]),
                               asfloat(_ScratchBuffer[c * 9 + 5]),
                               asfloat(_ScratchBuffer[c * 9 + 6]) * 3.0f,
                               asfloat(_ScratchBuffer[c * 9 + 7]));
    }

    // Quadratic (5)
    buffer[6] = float4(asfloat(_ScratchBuffer[0 * 9 + 8]),
                       asfloat(_ScratchBuffer[1 * 9 + 8]),
                       asfloat(_ScratchBuffer[2 * 9 + 8]),
                       1.0f);
}

// Scales, in place in _ScratchBuffer, every coefficient of SH band l (l = 0..2) of each of the
// 3 channels by sqrt(4 * PI / (2 * l + 1)) * zh[l].
void ConvolveZonalFromScratchBuffer(float3 zh)
{
    for (int l = 0; l <= 2; l++)
    {
        float n = sqrt((4.0f * PI) / (2 * l + 1));
        float k = zh[l];
        float p = n * k;

        for (int m = -l; m <= l; m++)
        {
            // Flat index of coefficient (l, m) inside one channel
            int i = l * (l + 1) + m;

            for (int c = 0; c < 3; c++)
            {
                _ScratchBuffer[c * 9 + i] = asuint(asfloat(_ScratchBuffer[c * 9 + i]) * p);
            }
        }
    }
}

// Projects the input cubemap onto 9 SH coefficients per color channel using SAMPLE_COUNT samples
// (one per thread), sums the per-thread results, then lets thread 0 write the outputs selected by
// the OUTPUT_* keywords of the kernel.
// NOTE(review): all indexing uses SV_DispatchThreadID, so this presumably expects to be dispatched
// as a single thread group - confirm against the caller.
[numthreads(SAMPLE_COUNT, 1, 1)]
void KERNEL_NAME(uint dispatchThreadId : SV_DispatchThreadID)
{
    uint sampleCount = SAMPLE_COUNT;

    // Construct the direction
    float2 u = Hammersley2d(dispatchThreadId, sampleCount);
    float3 n = SampleSphereUniform(u.x, u.y);

#if defined(NO_MIPS)
    // Sample once per thread
    float4 value = SAMPLE_TEXTURECUBE_LOD(_AmbientProbeInputCubemap, s_linear_clamp_sampler, n, 0);
#else
    // Grab the cubemap size
    float2 cubeSize;
    _AmbientProbeInputCubemap.GetDimensions(cubeSize.x, cubeSize.y);

    // Prefiltered importance sampling
    // Use lower MIP-map levels for fetching samples with low probabilities
    // in order to reduce the variance.
    // Ref: http://http.developer.nvidia.com/GPUGems3/gpugems3_ch20.html
    //
    // - OmegaS: Solid angle associated with the sample
    // - OmegaP: Solid angle associated with the texel of the cubemap
    float invOmegaP = (6.0 * cubeSize.x * cubeSize.y) / FOUR_PI;
    float pdf = 1.0 / FOUR_PI; // Solid angle of the sphere is 4*PI
    float omegaS = rcp(sampleCount) * rcp(pdf);
    float mipLevel = 0.5 * log2(omegaS * invOmegaP);

    // Sample once per thread
    float4 value = SAMPLE_TEXTURECUBE_LOD(_AmbientProbeInputCubemap, s_linear_clamp_sampler, n, mipLevel);
#endif

    float outputSHCoeffs[SH_COEFF_COUNT];

    for (int channel = 0; channel < 3; ++channel)
    {
        // Note: SH basis coefficients are applied later
        outputSHCoeffs[channel * 9 + 0] = value[channel];
        outputSHCoeffs[channel * 9 + 1] = n.y * value[channel];
        outputSHCoeffs[channel * 9 + 2] = n.z * value[channel];
        outputSHCoeffs[channel * 9 + 3] = n.x * value[channel];
        outputSHCoeffs[channel * 9 + 4] = n.x * n.y * value[channel];
        outputSHCoeffs[channel * 9 + 5] = n.y * n.z * value[channel];
        outputSHCoeffs[channel * 9 + 6] = (3.0 * n.z * n.z - 1.0) * value[channel];
        outputSHCoeffs[channel * 9 + 7] = n.x * n.z * value[channel];
        outputSHCoeffs[channel * 9 + 8] = (n.x * n.x - n.y * n.y) * value[channel];
    }

    uint i;
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
    // Sum up all threads result and broadcast
    for (i = 0; i < SH_COEFF_COUNT; ++i)
    {
        outputSHCoeffs[i] = WaveActiveSum(outputSHCoeffs[i]);
    }

    // First thread of each wave stores the result in LDS
    uint laneCount = WaveGetLaneCount();
    if (dispatchThreadId % laneCount == 0)
    {
        // One SH_COEFF_COUNT-sized slot per wave
        uint offset = (dispatchThreadId / laneCount) * SH_COEFF_COUNT;
        for (i = 0; i < SH_COEFF_COUNT; ++i)
        {
            outputSHCoeffsLDS[i + offset] = outputSHCoeffs[i];
        }
    }

    GroupMemoryBarrierWithGroupSync();

    // Read back the result to VGPRs to store it to memory at the end
    // First wave initializes the array
    for (i = 0; i < SH_COEFF_COUNT; ++i)
    {
        outputSHCoeffs[i] = outputSHCoeffsLDS[i];
    }

    // Then accumulate remaining waves
    uint waveCount = sampleCount / laneCount;
    for (uint wave = 1; wave < waveCount; ++wave)
    {
        for (i = 0; i < SH_COEFF_COUNT; ++i)
        {
            outputSHCoeffs[i] += outputSHCoeffsLDS[i + wave * SH_COEFF_COUNT];
        }
    }
#else
    // Parallel reduction of all threads result.
    for (uint k = 0; k < FastLog2(SAMPLE_COUNT); ++k)
    {
        // Each loop iteration, threads are paired (stride 1 << k). The upper thread of a pair
        // (bit k set, lower bits clear) stores its result in LDS; the lower thread (bits 0..k clear)
        // adds it to its local VGPRs. After the last iteration thread 0 holds the full sum.
        if ((dispatchThreadId & ((2 << k) - 1)) == (1 << k))
        {
            uint index = dispatchThreadId >> (k + 1);
            for (uint coeff = 0; coeff < SH_COEFF_COUNT; ++coeff)
            {
                outputSHCoeffsLDS[index * SH_COEFF_COUNT + coeff] = outputSHCoeffs[coeff];
            }
        }

        GroupMemoryBarrierWithGroupSync();

        if ((dispatchThreadId & ((2 << k) - 1)) == 0)
        {
            // Same slot as the paired writer above
            uint index = dispatchThreadId >> (k + 1);
            for (uint coeff = 0; coeff < SH_COEFF_COUNT; ++coeff)
            {
                outputSHCoeffs[coeff] += outputSHCoeffsLDS[index * SH_COEFF_COUNT + coeff];
            }
        }

        // Make sure LDS has been consumed before the next iteration overwrites it
        GroupMemoryBarrierWithGroupSync();
    }
#endif

    float weight = 4.0 * PI / (sampleCount);

    // Write to memory and convolution + weighing
    if (dispatchThreadId == 0)
    {
        for (i = 0; i < SH_COEFF_COUNT; ++i)
        {
            // SH coefficient used for encoding
            outputSHCoeffs[i] = outputSHCoeffs[i] * kSHBasisCoef[i % 9] * weight;
        }

#if !defined(OUTPUT_CLOUDS)
        for (i = 0; i < SH_COEFF_COUNT; ++i)
        {
            // ClampedCosine * SH coefficient used in Decode
            _AmbientProbeOutputBuffer[i] = outputSHCoeffs[i] * kClampedCosineCoefs[i % 9] * kSHBasisCoef[i % 9];
        }
#endif

#if OUTPUT_DIFFUSE
        // Copy the values just written to the output buffer into the scratch buffer
        for (i = 0; i < SH_COEFF_COUNT; ++i)
        {
            _ScratchBuffer[i] = asuint(_AmbientProbeOutputBuffer[i]);
        }

        // Diffuse convolution packed to be ready for shader consumption
        PackSHFromScratchBuffer(_DiffuseAmbientProbeOutputBuffer);
#endif

#if OUTPUT_VOLUMETRIC || OUTPUT_CLOUDS
        // Note: Code below could be optimized (lot of constant multiplication), but compiler may figure it out and this path is rarely executed, so prefer clean code.
        // Apply FogDimmer
        for (i = 0; i < SH_COEFF_COUNT; ++i)
        {
            _ScratchBuffer[i] = asuint(outputSHCoeffs[i] * _FogDimmer);
        }

        // Apply Cornette-Shanks phase function
        float3 zh;
        GetCornetteShanksPhaseFunction(zh, _FogAnisotropy);
        ConvolveZonalFromScratchBuffer(zh);

        // Premultiplies the SH with the polynomial coefficients of SH basis functions,
        // which avoids using any constants during SH evaluation.
        // The resulting evaluation takes the form:
        // (c_0 - c_6) + c_1 y + c_2 z + c_3 x + c_4 x y + c_5 y z + c_6 (3 z^2) + c_7 x z + c_8 (x^2 - y^2)
        for (i = 0; i < SH_COEFF_COUNT; ++i)
        {
            _ScratchBuffer[i] = asuint(asfloat(_ScratchBuffer[i]) * kSHBasisCoef[i % 9]);
        }

        PackSHFromScratchBuffer(_VolumetricAmbientProbeOutputBuffer);
#endif
    }
}