You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

261 lines
9.9 KiB

#define HAMMERSLEY_USE_CB
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/ShaderLibrary/ShaderVariables.hlsl"
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Color.hlsl"
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/SphericalHarmonics.hlsl"
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Sampling/Hammersley.hlsl"
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Sampling/Sampling.hlsl"
#pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch
// Regular ambient probe convolution.
// These three variants sample the input cubemap with mips (NO_MIPS is not defined) and always write
// _AmbientProbeOutputBuffer. The packed diffuse buffer is written only when OUTPUT_DIFFUSE is defined,
// and the packed volumetric buffer only when OUTPUT_VOLUMETRIC is defined.
#pragma kernel AmbientProbeConvolutionDiffuse KERNEL_NAME=AmbientProbeConvolutionDiffuse OUTPUT_DIFFUSE
#pragma kernel AmbientProbeConvolutionVolumetric KERNEL_NAME=AmbientProbeConvolutionVolumetric OUTPUT_VOLUMETRIC
#pragma kernel AmbientProbeConvolutionDiffuseVolumetric KERNEL_NAME=AmbientProbeConvolutionDiffuseVolumetric OUTPUT_VOLUMETRIC OUTPUT_DIFFUSE
// Ambient probe convolution for clouds.
// Does not write _AmbientProbeOutputBuffer or the diffuse buffer and does not use input texture mips (NO_MIPS).
// With OUTPUT_CLOUDS the fog-dimmed, phase-convolved result is still packed into _VolumetricAmbientProbeOutputBuffer.
#pragma kernel AmbientProbeConvolutionClouds KERNEL_NAME=AmbientProbeConvolutionClouds NO_MIPS OUTPUT_CLOUDS
// Input cubemap, sampled once per thread by the kernel.
TEXTURECUBE(_AmbientProbeInputCubemap);
// Output buffers.
// _AmbientProbeOutputBuffer receives SH_COEFF_COUNT floats, indexed as channel * 9 + coefficient.
// The two float4 buffers each receive 7 packed float4 (see PackSHFromScratchBuffer).
RWStructuredBuffer<float> _AmbientProbeOutputBuffer;
RWStructuredBuffer<float4> _VolumetricAmbientProbeOutputBuffer;
RWStructuredBuffer<float4> _DiffuseAmbientProbeOutputBuffer;
// If we use local VGPRs as a scratch buffer we end up using too many registers.
// To avoid that we go through memory.
// This is quite messy and bad for performance but this shader should never be critical so it should be fine.
// Uint is used as it's the only format supported everywhere as read/write from the same thread;
// float values are stored as raw bits with asuint() and read back with asfloat().
RWStructuredBuffer<uint> _ScratchBuffer;
// Fog parameters: x = fog dimmer, y = fog anisotropy (see the accessor macros below).
uniform float4 _FogParameters;
#define _FogDimmer _FogParameters.x
#define _FogAnisotropy _FogParameters.y
// Number of cubemap samples; also used as the thread group size of the kernel (one sample per thread).
#define SAMPLE_COUNT 256
// 9 SH coefficients for each of the 3 color channels.
#define SH_COEFF_COUNT 27
#if defined(PLATFORM_SUPPORTS_WAVE_INTRINSICS) && defined(PLATFORM_LANE_COUNT)
// Allocate space to accumulate the results of all waves. We need space for each single wavefront (because we can't atomic add floats).
groupshared float outputSHCoeffsLDS[SH_COEFF_COUNT * SAMPLE_COUNT / PLATFORM_LANE_COUNT];
#else
// Allocate space for the parallel reduction (so half the number of samples).
groupshared float outputSHCoeffsLDS[SH_COEFF_COUNT * SAMPLE_COUNT / 2];
#endif
// Packs the 27 SH coefficients held in _ScratchBuffer (float bits stored as uint, 9 per color channel)
// into 7 float4 entries of 'buffer':
//   buffer[0..2] : per channel (c3, c1, c2, c0 - c6)   -> x, y, z and constant terms
//   buffer[3..5] : per channel (c4, c5, 3 * c6, c7)    -> first four quadratic terms
//   buffer[6]    : (c8 of channel 0, c8 of channel 1, c8 of channel 2, 1.0) -> last quadratic term
void PackSHFromScratchBuffer(RWStructuredBuffer<float4> buffer)
{
    for (int channel = 0; channel < 3; channel++)
    {
        // First coefficient of this channel inside the scratch buffer.
        const int first = channel * 9;

        const float sh0 = asfloat(_ScratchBuffer[first + 0]);
        const float sh1 = asfloat(_ScratchBuffer[first + 1]);
        const float sh2 = asfloat(_ScratchBuffer[first + 2]);
        const float sh3 = asfloat(_ScratchBuffer[first + 3]);
        const float sh4 = asfloat(_ScratchBuffer[first + 4]);
        const float sh5 = asfloat(_ScratchBuffer[first + 5]);
        const float sh6 = asfloat(_ScratchBuffer[first + 6]);
        const float sh7 = asfloat(_ScratchBuffer[first + 7]);

        // Constant + linear; c6 is folded into the constant term.
        buffer[channel] = float4(sh3, sh1, sh2, sh0 - sh6);

        // Quadratic (4/5)
        buffer[3 + channel] = float4(sh4, sh5, sh6 * 3.0f, sh7);
    }

    // Quadratic (5): the last coefficient of each channel, gathered into a single float4.
    const float red8 = asfloat(_ScratchBuffer[0 * 9 + 8]);
    const float green8 = asfloat(_ScratchBuffer[1 * 9 + 8]);
    const float blue8 = asfloat(_ScratchBuffer[2 * 9 + 8]);
    buffer[6] = float4(red8, green8, blue8, 1.0f);
}
// Convolves the SH coefficients stored in _ScratchBuffer (float bits stored as uint, 9 per color channel)
// with the zonal harmonics 'zh', in place: every coefficient of band l (l = 0..2) is multiplied
// by sqrt(4 * PI / (2l + 1)) * zh[l].
void ConvolveZonalFromScratchBuffer(float3 zh)
{
    for (int band = 0; band <= 2; band++)
    {
        // Per-band scale: normalization factor times the zonal coefficient of this band.
        float bandScale = sqrt((4.0f * PI) / (2 * band + 1)) * zh[band];

        // Band l covers the flat indices l*(l+1) - l .. l*(l+1) + l.
        int center = band * (band + 1);
        for (int coeff = center - band; coeff <= center + band; coeff++)
        {
            for (int channel = 0; channel < 3; channel++)
            {
                int slot = channel * 9 + coeff;
                float scaled = asfloat(_ScratchBuffer[slot]) * bandScale;
                _ScratchBuffer[slot] = asuint(scaled);
            }
        }
    }
}
// Kernel entry point shared by all variants (KERNEL_NAME is defined by each kernel pragma).
// Each of the SAMPLE_COUNT threads takes one cubemap sample along a direction generated from a
// Hammersley point, multiplies it by the 9 SH basis polynomials for each RGB channel, then the
// 27 partial sums are reduced across the group and thread 0 writes the outputs selected by the
// OUTPUT_* defines.
// NOTE(review): the reduction only spans one thread group and dispatchThreadId is used as the
// index inside the group, so this presumably expects a dispatch of exactly one group - confirm against the caller.
[numthreads(SAMPLE_COUNT, 1, 1)]
void KERNEL_NAME(uint dispatchThreadId : SV_DispatchThreadID)
{
uint sampleCount = SAMPLE_COUNT;
// Construct the direction
float2 u = Hammersley2d(dispatchThreadId, sampleCount);
float3 n = SampleSphereUniform(u.x, u.y);
#if defined(NO_MIPS)
// Sample once per thread, always from mip 0
float4 value = SAMPLE_TEXTURECUBE_LOD(_AmbientProbeInputCubemap, s_linear_clamp_sampler, n, 0);
#else
// Grab the cubemap size
float2 cubeSize;
_AmbientProbeInputCubemap.GetDimensions(cubeSize.x, cubeSize.y);
// Prefiltered importance sampling
// Use lower MIP-map levels for fetching samples with low probabilities
// in order to reduce the variance.
// Ref: http://http.developer.nvidia.com/GPUGems3/gpugems3_ch20.html
//
// - OmegaS: Solid angle associated with the sample
// - OmegaP: Solid angle associated with the texel of the cubemap
float invOmegaP = (6.0 * cubeSize.x * cubeSize.y) / FOUR_PI;
float pdf = 1.0 / FOUR_PI; // Solid angle of the sphere is 4*PI
float omegaS = rcp(sampleCount) * rcp(pdf);
float mipLevel = 0.5 * log2(omegaS * invOmegaP);
// Sample once per thread
float4 value = SAMPLE_TEXTURECUBE_LOD(_AmbientProbeInputCubemap, s_linear_clamp_sampler, n, mipLevel);
#endif
// Per-thread partial result: 9 coefficients per channel, indexed as channel * 9 + coefficient.
float outputSHCoeffs[SH_COEFF_COUNT];
for (int channel = 0; channel < 3; ++channel)
{
// Note: the SH basis coefficients (kSHBasisCoef) are applied later, after the reduction.
outputSHCoeffs[channel * 9 + 0] = value[channel];
outputSHCoeffs[channel * 9 + 1] = n.y * value[channel];
outputSHCoeffs[channel * 9 + 2] = n.z * value[channel];
outputSHCoeffs[channel * 9 + 3] = n.x * value[channel];
outputSHCoeffs[channel * 9 + 4] = n.x * n.y * value[channel];
outputSHCoeffs[channel * 9 + 5] = n.y * n.z * value[channel];
outputSHCoeffs[channel * 9 + 6] = (3.0 * n.z * n.z - 1.0) * value[channel];
outputSHCoeffs[channel * 9 + 7] = n.x * n.z * value[channel];
outputSHCoeffs[channel * 9 + 8] = (n.x * n.x - n.y * n.y) * value[channel];
}
uint i;
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
// Sum up the results of all threads of the wave and broadcast
for (i = 0; i < SH_COEFF_COUNT; ++i)
{
outputSHCoeffs[i] = WaveActiveSum(outputSHCoeffs[i]);
}
// First thread of each wave stores the result in LDS
uint laneCount = WaveGetLaneCount();
if (dispatchThreadId % laneCount == 0)
{
for (i = 0; i < SH_COEFF_COUNT; ++i)
{
// Each wave owns a contiguous run of SH_COEFF_COUNT floats in LDS.
uint offset = (dispatchThreadId / laneCount) * SH_COEFF_COUNT;
outputSHCoeffsLDS[i + offset] = outputSHCoeffs[i];
}
}
GroupMemoryBarrierWithGroupSync();
// Read back the result to VGPRs to store it to memory at the end
// The first wave's result initializes the array
for (i = 0; i < SH_COEFF_COUNT; ++i)
{
outputSHCoeffs[i] = outputSHCoeffsLDS[i];
}
// Then accumulate remaining waves
uint waveCount = sampleCount / laneCount;
for (uint wave = 1; wave < waveCount; ++wave)
{
for (i = 0; i < SH_COEFF_COUNT; ++i)
{
outputSHCoeffs[i] += outputSHCoeffsLDS[i + wave * SH_COEFF_COUNT];
}
}
#else
// Parallel reduction of the results of all threads.
for (uint k = 0; k < FastLog2(SAMPLE_COUNT); ++k)
{
// Each loop iteration, threads whose index has bit k set and all lower bits clear store their result in LDS;
// threads whose index has bits 0..k all clear then add it to their local VGPRs. After the last iteration thread 0 holds the full sum.
if ((dispatchThreadId & ((2 << k) - 1)) == (1 << k))
{
uint index = dispatchThreadId >> (k + 1);
for (uint coeff = 0; coeff < SH_COEFF_COUNT; ++coeff)
{
outputSHCoeffsLDS[index * SH_COEFF_COUNT + coeff] = outputSHCoeffs[coeff];
}
}
// Make the stores above visible before the accumulating threads read them.
GroupMemoryBarrierWithGroupSync();
if ((dispatchThreadId & ((2 << k) - 1)) == 0)
{
uint index = dispatchThreadId >> (k + 1);
for (uint coeff = 0; coeff < SH_COEFF_COUNT; ++coeff)
{
outputSHCoeffs[coeff] += outputSHCoeffsLDS[index * SH_COEFF_COUNT + coeff];
}
}
// Make sure all reads are done before the next iteration overwrites the LDS slots.
GroupMemoryBarrierWithGroupSync();
}
#endif
// Weight of one sample: 4*PI divided by the sample count.
float weight = 4.0 * PI / (sampleCount);
// Write to memory and convolution + weighting (done by a single thread)
if (dispatchThreadId == 0)
{
for (i = 0; i < SH_COEFF_COUNT; ++i)
{
// SH coefficient used for encoding
outputSHCoeffs[i] = outputSHCoeffs[i] * kSHBasisCoef[i % 9] * weight;
}
#if !defined(OUTPUT_CLOUDS)
for (i = 0; i < SH_COEFF_COUNT; ++i)
{
// ClampedCosine * SH Coefficient used in Decode
_AmbientProbeOutputBuffer[i] = outputSHCoeffs[i] * kClampedCosineCoefs[i % 9] * kSHBasisCoef[i % 9];
}
#endif
#if OUTPUT_DIFFUSE
// Copy the values just written to _AmbientProbeOutputBuffer into the scratch buffer as raw float bits.
for (i = 0; i < SH_COEFF_COUNT; ++i)
{
_ScratchBuffer[i] = asuint(_AmbientProbeOutputBuffer[i]);
}
// Diffuse convolution packed to be ready for shader consumption
PackSHFromScratchBuffer(_DiffuseAmbientProbeOutputBuffer);
#endif
#if OUTPUT_VOLUMETRIC || OUTPUT_CLOUDS
// Note: Code below could be optimized (lots of constant multiplications), but the compiler may figure it out and this path is rarely executed, so prefer clean code.
// Apply FogDimmer
for (i = 0; i < SH_COEFF_COUNT; ++i)
{
_ScratchBuffer[i] = asuint(outputSHCoeffs[i] * _FogDimmer);
}
// Apply Cornette-Shanks phase function
float3 zh;
GetCornetteShanksPhaseFunction(zh, _FogAnisotropy);
ConvolveZonalFromScratchBuffer(zh);
// Premultiplies the SH with the polynomial coefficients of SH basis functions,
// which avoids using any constants during SH evaluation.
// The resulting evaluation takes the form:
// (c_0 - c_6) + c_1 y + c_2 z + c_3 x + c_4 x y + c_5 y z + c_6 (3 z^2) + c_7 x z + c_8 (x^2 - y^2)
for (i = 0; i < SH_COEFF_COUNT; ++i)
{
_ScratchBuffer[i] = asuint(asfloat(_ScratchBuffer[i]) * kSHBasisCoef[i % 9]);
}
PackSHFromScratchBuffer(_VolumetricAmbientProbeOutputBuffer);
#endif
}
}