Laurens-Packages/Packages/com.unity.render-pipelines..../Runtime/Sky/AmbientProbeConvolution.com...


								#define HAMMERSLEY_USE_CB


								#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"

								#include "Packages/com.unity.render-pipelines.high-definition/Runtime/ShaderLibrary/ShaderVariables.hlsl"

								#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Color.hlsl"

								#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/SphericalHarmonics.hlsl"

								#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Sampling/Hammersley.hlsl"

								#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Sampling/Sampling.hlsl"


								#pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch


								// Regular ambient probe convolution

								// Always use mips and output separate diffuse buffer.

								// Volumetric output as an option

								#pragma kernel AmbientProbeConvolutionDiffuse KERNEL_NAME=AmbientProbeConvolutionDiffuse OUTPUT_DIFFUSE

								#pragma kernel AmbientProbeConvolutionVolumetric KERNEL_NAME=AmbientProbeConvolutionVolumetric OUTPUT_VOLUMETRIC

								#pragma kernel AmbientProbeConvolutionDiffuseVolumetric KERNEL_NAME=AmbientProbeConvolutionDiffuseVolumetric OUTPUT_VOLUMETRIC OUTPUT_DIFFUSE

								// Ambient probe convolution for clouds.

								// Does not output diffuse and volumetric buffer and does not use input texture mips.

								#pragma kernel AmbientProbeConvolutionClouds KERNEL_NAME=AmbientProbeConvolutionClouds NO_MIPS OUTPUT_CLOUDS


								// Input Cubemap

								TEXTURECUBE(_AmbientProbeInputCubemap);

								// Output buffer

								RWStructuredBuffer<float> _AmbientProbeOutputBuffer;

								RWStructuredBuffer<float4> _VolumetricAmbientProbeOutputBuffer;

								RWStructuredBuffer<float4> _DiffuseAmbientProbeOutputBuffer;


								// If we use local VGPRs as a scratch buffer we end up using too many registers.

								// To avoid that we go through memory.

								// This is quite messy and bad for performance but this shader should never be critical so it should be fine.

								// Uint is used as it's the only format supported everywhere as read/write from same thread.

								RWStructuredBuffer<uint> _ScratchBuffer;


								uniform float4 _FogParameters;


								#define _FogDimmer _FogParameters.x

								#define _FogAnisotropy _FogParameters.y


								#define SAMPLE_COUNT 256

								#define SH_COEFF_COUNT 27


								#if defined(PLATFORM_SUPPORTS_WAVE_INTRINSICS) && defined(PLATFORM_LANE_COUNT)

								    // Allocate space to accumulate all waves result. We need space for each single wavefront (because we can't atomic add floats)

								    groupshared float outputSHCoeffsLDS[SH_COEFF_COUNT * SAMPLE_COUNT / PLATFORM_LANE_COUNT];

								#else

								    // Allocate space for parallel reduction (so half the number of samples).

								    groupshared float outputSHCoeffsLDS[SH_COEFF_COUNT * SAMPLE_COUNT / 2];

								#endif


								    // Packs the 3 x 9 SH coefficients held in _ScratchBuffer (float bits stored as uint,
								    // laid out as channel * 9 + coefficient) into seven float4 entries of 'buffer':
								    //   buffer[0..2] : per channel (c3, c1, c2, c0 - c6)
								    //   buffer[3..5] : per channel (c4, c5, 3 * c6, c7)
								    //   buffer[6]    : (c8 of channel 0, c8 of channel 1, c8 of channel 2, 1)
								    void PackSHFromScratchBuffer(RWStructuredBuffer<float4> buffer)
								    {
								        for (int channel = 0; channel < 3; ++channel)
								        {
								            // Index of the first coefficient of this channel in the scratch buffer.
								            const int first = channel * 9;

								            const float sh0 = asfloat(_ScratchBuffer[first + 0]);
								            const float sh1 = asfloat(_ScratchBuffer[first + 1]);
								            const float sh2 = asfloat(_ScratchBuffer[first + 2]);
								            const float sh3 = asfloat(_ScratchBuffer[first + 3]);
								            const float sh4 = asfloat(_ScratchBuffer[first + 4]);
								            const float sh5 = asfloat(_ScratchBuffer[first + 5]);
								            const float sh6 = asfloat(_ScratchBuffer[first + 6]);
								            const float sh7 = asfloat(_ScratchBuffer[first + 7]);

								            // Constant + linear terms
								            buffer[channel] = float4(sh3, sh1, sh2, sh0 - sh6);

								            // Quadratic (4/5)
								            buffer[3 + channel] = float4(sh4, sh5, sh6 * 3.0f, sh7);
								        }

								        // Quadratic (5): the last coefficient of each channel shares a single float4.
								        const float red8 = asfloat(_ScratchBuffer[0 * 9 + 8]);
								        const float green8 = asfloat(_ScratchBuffer[1 * 9 + 8]);
								        const float blue8 = asfloat(_ScratchBuffer[2 * 9 + 8]);
								        buffer[6] = float4(red8, green8, blue8, 1.0f);
								    }


								    // Scales the SH coefficients stored in _ScratchBuffer band by band:
								    // every coefficient of band l (for all three channels) is multiplied by
								    // sqrt(4 * PI / (2l + 1)) * zh[l], where zh holds one zonal term per band (l = 0..2).
								    void ConvolveZonalFromScratchBuffer(float3 zh)
								    {
								        for (int band = 0; band <= 2; band++)
								        {
								            float bandScale = sqrt((4.0f * PI) / (2 * band + 1)) * zh[band];

								            // Band 'band' owns the 2 * band + 1 coefficients starting at index band * band.
								            int firstCoeff = band * band;
								            int lastCoeff = firstCoeff + 2 * band;

								            for (int channel = 0; channel < 3; channel++)
								            {
								                for (int coeff = firstCoeff; coeff <= lastCoeff; coeff++)
								                {
								                    int slot = channel * 9 + coeff;
								                    _ScratchBuffer[slot] = asuint(asfloat(_ScratchBuffer[slot]) * bandScale);
								                }
								            }
								        }
								    }


								// Kernel entry point; KERNEL_NAME is substituted by each '#pragma kernel' line above.

								// Every thread takes one Hammersley sample, converts it to a direction on the sphere, fetches the

								// input cubemap in that direction and builds the 9 SH polynomial terms for each RGB channel.

								// The per-thread terms are then summed over the SAMPLE_COUNT threads (wave intrinsics + LDS, or an

								// LDS-only parallel reduction) and thread 0 scales the sums and writes the outputs selected by the

								// OUTPUT_DIFFUSE / OUTPUT_VOLUMETRIC / OUTPUT_CLOUDS defines.

								// NOTE(review): sample index and LDS offsets are derived from SV_DispatchThreadID, so this presumably

								// expects to be dispatched as a single thread group - confirm against the caller.

								[numthreads(SAMPLE_COUNT, 1, 1)]

								void KERNEL_NAME(uint dispatchThreadId : SV_DispatchThreadID)

								{

								    uint sampleCount = SAMPLE_COUNT;


								    // Construct the direction

								    float2 u = Hammersley2d(dispatchThreadId, sampleCount);

								    float3 n = SampleSphereUniform(u.x, u.y);


								#if defined(NO_MIPS)

								    // Sample once per thread, always from mip 0

								    float4 value = SAMPLE_TEXTURECUBE_LOD(_AmbientProbeInputCubemap, s_linear_clamp_sampler, n, 0);

								#else

								    // Grab the cubemap size

								    float2 cubeSize;

								    _AmbientProbeInputCubemap.GetDimensions(cubeSize.x, cubeSize.y);


								    // Prefiltered importance sampling

								    // Use lower MIP-map levels for fetching samples with low probabilities

								    // in order to reduce the variance.

								    // Ref: http://http.developer.nvidia.com/GPUGems3/gpugems3_ch20.html

								    //

								    // - OmegaS: Solid angle associated with the sample

								    // - OmegaP: Solid angle associated with the texel of the cubemap

								    float invOmegaP = (6.0 * cubeSize.x * cubeSize.y) / FOUR_PI;

								    float pdf = 1.0 / FOUR_PI; // Solid angle of the sphere is 4*PI

								    float omegaS = rcp(sampleCount) * rcp(pdf);

								    float mipLevel = 0.5 * log2(omegaS * invOmegaP);


								    // Sample once per thread

								    float4 value = SAMPLE_TEXTURECUBE_LOD(_AmbientProbeInputCubemap, s_linear_clamp_sampler, n, mipLevel);

								#endif


								    // Per-thread SH terms, laid out as channel * 9 + coefficient.

								    float outputSHCoeffs[SH_COEFF_COUNT];


								    for (int channel = 0; channel < 3; ++channel)

								    {

								        // Note: the SH basis coefficients (kSHBasisCoef) are applied later, by thread 0, once the sums are known.

								        outputSHCoeffs[channel * 9 + 0] = value[channel];

								        outputSHCoeffs[channel * 9 + 1] = n.y * value[channel];

								        outputSHCoeffs[channel * 9 + 2] = n.z * value[channel];

								        outputSHCoeffs[channel * 9 + 3] = n.x * value[channel];

								        outputSHCoeffs[channel * 9 + 4] = n.x * n.y * value[channel];

								        outputSHCoeffs[channel * 9 + 5] = n.y * n.z * value[channel];

								        outputSHCoeffs[channel * 9 + 6] = (3.0 * n.z * n.z - 1.0) * value[channel];

								        outputSHCoeffs[channel * 9 + 7] = n.x * n.z * value[channel];

								        outputSHCoeffs[channel * 9 + 8] = (n.x * n.x - n.y * n.y) * value[channel];

								    }


								    uint i;

								#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS


								    // Sum each coefficient over the wave; every lane gets the wave total back

								    for (i = 0; i < SH_COEFF_COUNT; ++i)

								    {

								        outputSHCoeffs[i] = WaveActiveSum(outputSHCoeffs[i]);

								    }


								    // First thread of each wave stores the wave total in LDS (one block of SH_COEFF_COUNT floats per wave)

								    uint laneCount = WaveGetLaneCount();

								    if (dispatchThreadId % laneCount == 0)

								    {

								        for (i = 0; i < SH_COEFF_COUNT; ++i)

								        {

								            uint offset = (dispatchThreadId / laneCount) * SH_COEFF_COUNT;

								            outputSHCoeffsLDS[i + offset] = outputSHCoeffs[i];

								        }

								    }


								    // Make every wave's LDS writes visible before they are read back

								    GroupMemoryBarrierWithGroupSync();


								    // Read back the result to VGPRs to store it to memory at the end

								    // Start from the totals of the first wave

								    for (i = 0; i < SH_COEFF_COUNT; ++i)

								    {

								        outputSHCoeffs[i] = outputSHCoeffsLDS[i];

								    }


								    // Then accumulate remaining waves

								    uint waveCount = sampleCount / laneCount;

								    for (uint wave = 1; wave < waveCount; ++wave)

								    {

								        for (i = 0; i < SH_COEFF_COUNT; ++i)

								        {

								            outputSHCoeffs[i] += outputSHCoeffsLDS[i + wave * SH_COEFF_COUNT];

								        }

								    }

								#else

								    // Parallel reduction of all threads result.

								    for (uint k = 0; k < FastLog2(SAMPLE_COUNT); ++k)

								    {

								        // Each loop iteration, threads whose bit k is set (with all lower bits clear) store their partial sums in LDS,

								        // then threads whose low (k + 1) bits are all clear add them to their local VGPR copy.

								        // After the last iteration thread 0 holds the sum over all threads.

								        if ((dispatchThreadId & ((2 << k) - 1)) == (1 << k))

								        {

								            uint index = dispatchThreadId >> (k + 1);

								            for (uint coeff = 0; coeff < SH_COEFF_COUNT; ++coeff)

								            {

								                outputSHCoeffsLDS[index * SH_COEFF_COUNT + coeff] = outputSHCoeffs[coeff];

								            }

								        }


								        // Wait for the stores above before the partner threads read them

								        GroupMemoryBarrierWithGroupSync();


								        if ((dispatchThreadId & ((2 << k) - 1)) == 0)

								        {

								            // Same slot as the one written by the partner thread (dispatchThreadId + (1 << k))

								            uint index = dispatchThreadId >> (k + 1);

								            for (uint coeff = 0; coeff < SH_COEFF_COUNT; ++coeff)

								            {

								                outputSHCoeffs[coeff] += outputSHCoeffsLDS[index * SH_COEFF_COUNT + coeff];

								            }

								        }


								        // Wait for the reads above before LDS is overwritten by the next iteration

								        GroupMemoryBarrierWithGroupSync();

								    }

								#endif


								    // Per-sample weight: sphere solid angle (4*PI) divided by the number of samples

								    float weight = 4.0 * PI / (sampleCount);


								    // Write to memory and convolution + weighting (only thread 0 writes the outputs)

								    if (dispatchThreadId == 0)

								    {

								        for (i = 0; i < SH_COEFF_COUNT; ++i)

								        {

								            // Sh Coefficient used for encoding

								            outputSHCoeffs[i] = outputSHCoeffs[i] * kSHBasisCoef[i % 9] * weight;

								        }


								#if !defined(OUTPUT_CLOUDS)

								        for (i = 0; i < SH_COEFF_COUNT; ++i)

								        {

								            // ClampedCosine * SH Coefficient used in Decode

								            _AmbientProbeOutputBuffer[i] = outputSHCoeffs[i] * kClampedCosineCoefs[i % 9] * kSHBasisCoef[i % 9];

								        }

								#endif


								#if OUTPUT_DIFFUSE

								        // Copy the values just written to _AmbientProbeOutputBuffer into the scratch buffer (as raw bits)

								        for (i = 0; i < SH_COEFF_COUNT; ++i)

								        {

								            _ScratchBuffer[i] = asuint(_AmbientProbeOutputBuffer[i]);

								        }


								        // Diffuse convolution packed to be ready for shader consumption

								        PackSHFromScratchBuffer(_DiffuseAmbientProbeOutputBuffer);

								#endif


								#if OUTPUT_VOLUMETRIC || OUTPUT_CLOUDS


								        // Note: Code below could be optimized (lots of constant multiplications), but the compiler may figure it out and this path is rarely executed, so prefer clean code.

								        // Apply FogDimmer

								        for (i = 0; i < SH_COEFF_COUNT; ++i)

								        {

								            _ScratchBuffer[i] = asuint(outputSHCoeffs[i] * _FogDimmer);

								        }


								        // Apply Cornette-Shanks phase function

								        float3 zh;

								        GetCornetteShanksPhaseFunction(zh, _FogAnisotropy);

								        ConvolveZonalFromScratchBuffer(zh);


								        // Premultiplies the SH with the polynomial coefficients of SH basis functions,

								        // which avoids using any constants during SH evaluation.

								        // The resulting evaluation takes the form:

								        // (c_0 - c_6) + c_1 y + c_2 z + c_3 x + c_4 x y + c_5 y z + c_6 (3 z^2) + c_7 x z + c_8 (x^2 - y^2)

								        for (i = 0; i < SH_COEFF_COUNT; ++i)

								        {

								            _ScratchBuffer[i] = asuint(asfloat(_ScratchBuffer[i]) * kSHBasisCoef[i % 9]);

								        }


								        PackSHFromScratchBuffer(_VolumetricAmbientProbeOutputBuffer);

								#endif

								    }

								}