You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

256 lines
12 KiB

#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"
#include "Packages/com.unity.render-pipelines.core/Runtime/GPUDriven/InstanceData/InstanceTransformUpdateDefs.cs.hlsl"
#pragma kernel ScatterInitTransformMain
#pragma kernel ScatterUpdateTransformMain
#pragma kernel ScatterUpdateMotionMain
#pragma kernel ScatterUpdateProbesMain
#pragma multi_compile_local _ PROCESS_BOUNDING_SPHERES
// Number of queue entries to process. Threads of the three transform kernels
// whose dispatch index is >= this value return without doing any work.
int _TransformUpdateQueueCount;
// Base offsets of the per-instance matrix regions inside _OutputTransformBuffer
// (current/previous local-to-world and world-to-local). The kernels multiply
// each value by 4 to form a byte address and then add outputIndex * 48
// (three 16-byte float4 rows per packed matrix).
// NOTE(review): the "Vec4Offset" suffix suggests float4 units, but the "* 4"
// scaling implies 4-byte units - confirm against the code that sets these.
int _TransformUpdateOutputL2WVec4Offset;
int _TransformUpdateOutputW2LVec4Offset;
int _TransformUpdateOutputPrevL2WVec4Offset;
int _TransformUpdateOutputPrevW2LVec4Offset;
// Base offset of the bounding sphere region of _OutputTransformBuffer, scaled
// by 4 like the offsets above; each instance occupies 16 bytes there.
// Only read when PROCESS_BOUNDING_SPHERES is defined.
int _BoundingSphereOutputVec4Offset;
// Packed local-to-world matrices. ScatterInitTransformMain reads two packets
// per queue entry (index * 2 = current, index * 2 + 1 = previous);
// ScatterUpdateTransformMain reads one packet per queue entry.
StructuredBuffer<TransformUpdatePacket> _TransformUpdateDataQueue;
// One 32-bit destination instance index per queue entry (read at byte index * 4).
ByteAddressBuffer _TransformUpdateIndexQueue;
// One float4 per queue entry, copied unchanged into the bounding sphere region.
StructuredBuffer<float4> _BoundingSphereDataQueue;
// Destination buffer receiving the packed matrices and bounding spheres.
RWByteAddressBuffer _OutputTransformBuffer;
// Rebuilds a 4x4 affine matrix from three float4 values that hold its upper
// three rows, stored column by column (the layout written by PackMatrix0/1/2).
// The bottom row of the result is always (0, 0, 0, 1).
float4x4 UnpackMatrix(float4 p1, float4 p2, float4 p3)
{
    const float4 row0 = float4(p1.x, p1.w, p2.z, p3.y);
    const float4 row1 = float4(p1.y, p2.x, p2.w, p3.z);
    const float4 row2 = float4(p1.z, p2.y, p3.x, p3.w);
    const float4 row3 = float4(0.0, 0.0, 0.0, 1.0);
    return float4x4(row0, row1, row2, row3);
}
// First packed float4 of a matrix: column 0 (rows 0..2) followed by m[0][1].
float4 PackMatrix0(float4x4 m)
{
    return float4(m._m00, m._m10, m._m20, m._m01);
}
// Second packed float4 of a matrix: m[1][1], m[2][1], then m[0][2], m[1][2].
float4 PackMatrix1(float4x4 m)
{
    return float4(m._m11, m._m21, m._m02, m._m12);
}
// Third packed float4 of a matrix: m[2][2] followed by column 3 (rows 0..2).
float4 PackMatrix2(float4x4 m)
{
    return float4(m._m22, m._m03, m._m13, m._m23);
}
// Inverts a 3x3 matrix using cross products of its rows.
// The cross products of row pairs, divided by the determinant, form the
// columns of the inverse. There is no guard for singular input: a zero
// determinant results in a division by zero.
float3x3 Inverse3x3(float3x3 m)
{
    const float3 r0 = m[0];
    const float3 r1 = m[1];
    const float3 r2 = m[2];

    const float3 c0 = cross(r1, r2);
    const float3 c1 = cross(r2, r0);
    const float3 c2 = cross(r0, r1);

    const float determinant = dot(r0, c0);

    // The constructor lays c0..c2 out as rows; the transpose turns them into columns.
    const float3x3 scaledRows = float3x3(c0, c1, c2) / determinant;
    return transpose(scaledRows);
}
// Inverts an affine 4x4 matrix: the upper-left 3x3 block is inverted with
// Inverse3x3 and the translation column becomes -(inverse3x3 * translation).
// The bottom row of the input is ignored; the result's bottom row is (0, 0, 0, 1).
float4x4 AffineInverse3D(float4x4 m)
{
    const float3x3 linearPart = (float3x3)m;
    const float3 translation = float3(m._m03, m._m13, m._m23);

    const float3x3 invLinear = Inverse3x3(linearPart);
    const float3 invTranslation = -mul(invLinear, translation);

    return float4x4(
        float4(invLinear[0], invTranslation.x),
        float4(invLinear[1], invTranslation.y),
        float4(invLinear[2], invTranslation.z),
        float4(0.0f, 0.0f, 0.0f, 1.0f));
}
// Initializes the current AND previous transforms of each queued instance.
// One thread handles one queue entry: it reads the destination instance index,
// unpacks two local-to-world packets (entry * 2 = current, entry * 2 + 1 = previous),
// computes their inverses and writes all four packed matrices to _OutputTransformBuffer.
// With PROCESS_BOUNDING_SPHERES defined it also copies the entry's bounding sphere.
[numthreads(64, 1, 1)]
void ScatterInitTransformMain(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    const uint slot = dispatchThreadID.x;
    if (slot >= (uint)_TransformUpdateQueueCount)
        return;

    // The index queue stores one 32-bit instance index per entry.
    const uint instance = _TransformUpdateIndexQueue.Load(slot << 2);

    // Each packed matrix is three float4 rows (48 bytes) per instance.
    const uint matrixBytes = instance * 4 * 4 * 3;
    const uint curL2WAddr = _TransformUpdateOutputL2WVec4Offset * 4 + matrixBytes;
    const uint curW2LAddr = _TransformUpdateOutputW2LVec4Offset * 4 + matrixBytes;
    const uint oldL2WAddr = _TransformUpdateOutputPrevL2WVec4Offset * 4 + matrixBytes;
    const uint oldW2LAddr = _TransformUpdateOutputPrevW2LVec4Offset * 4 + matrixBytes;

    const TransformUpdatePacket curPacket = _TransformUpdateDataQueue[slot * 2];
    const TransformUpdatePacket oldPacket = _TransformUpdateDataQueue[slot * 2 + 1];

    const float4x4 curL2W = UnpackMatrix(curPacket.localToWorld0, curPacket.localToWorld1, curPacket.localToWorld2);
    const float4x4 oldL2W = UnpackMatrix(oldPacket.localToWorld0, oldPacket.localToWorld1, oldPacket.localToWorld2);
    const float4x4 curW2L = AffineInverse3D(curL2W);
    const float4x4 oldW2L = AffineInverse3D(oldL2W);

#ifdef PROCESS_FLIP_GPUDRIVEN_WINDING
    UpdateInstanceFlipWindingFlag(curL2W, instance);
#endif

    // Current local-to-world, then current world-to-local.
    _OutputTransformBuffer.Store4(curL2WAddr + 0, asuint(PackMatrix0(curL2W)));
    _OutputTransformBuffer.Store4(curL2WAddr + 16, asuint(PackMatrix1(curL2W)));
    _OutputTransformBuffer.Store4(curL2WAddr + 32, asuint(PackMatrix2(curL2W)));
    _OutputTransformBuffer.Store4(curW2LAddr + 0, asuint(PackMatrix0(curW2L)));
    _OutputTransformBuffer.Store4(curW2LAddr + 16, asuint(PackMatrix1(curW2L)));
    _OutputTransformBuffer.Store4(curW2LAddr + 32, asuint(PackMatrix2(curW2L)));

    // Previous local-to-world, then previous world-to-local.
    _OutputTransformBuffer.Store4(oldL2WAddr + 0, asuint(PackMatrix0(oldL2W)));
    _OutputTransformBuffer.Store4(oldL2WAddr + 16, asuint(PackMatrix1(oldL2W)));
    _OutputTransformBuffer.Store4(oldL2WAddr + 32, asuint(PackMatrix2(oldL2W)));
    _OutputTransformBuffer.Store4(oldW2LAddr + 0, asuint(PackMatrix0(oldW2L)));
    _OutputTransformBuffer.Store4(oldW2LAddr + 16, asuint(PackMatrix1(oldW2L)));
    _OutputTransformBuffer.Store4(oldW2LAddr + 32, asuint(PackMatrix2(oldW2L)));

#ifdef PROCESS_BOUNDING_SPHERES
    // One float4 (16 bytes) per instance in the bounding sphere region.
    const uint sphereAddr = _BoundingSphereOutputVec4Offset * 4 + instance * 4 * 4;
    const float4 sphere = _BoundingSphereDataQueue[slot];
    _OutputTransformBuffer.Store4(sphereAddr, asuint(sphere));
#endif
}
// Updates the transform of each queued instance.
// One thread handles one queue entry: it first copies the instance's stored
// current matrices into the "previous" regions, then unpacks the new
// local-to-world packet, computes its inverse and overwrites the current matrices.
// With PROCESS_BOUNDING_SPHERES defined it also copies the entry's bounding sphere.
[numthreads(64, 1, 1)]
void ScatterUpdateTransformMain(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    const uint slot = dispatchThreadID.x;
    if (slot >= (uint)_TransformUpdateQueueCount)
        return;

    // The index queue stores one 32-bit instance index per entry.
    const uint instance = _TransformUpdateIndexQueue.Load(slot << 2);

    // Each packed matrix is three float4 rows (48 bytes) per instance.
    const uint matrixBytes = instance * 4 * 4 * 3;
    const uint curL2WAddr = _TransformUpdateOutputL2WVec4Offset * 4 + matrixBytes;
    const uint curW2LAddr = _TransformUpdateOutputW2LVec4Offset * 4 + matrixBytes;
    const uint oldL2WAddr = _TransformUpdateOutputPrevL2WVec4Offset * 4 + matrixBytes;
    const uint oldW2LAddr = _TransformUpdateOutputPrevW2LVec4Offset * 4 + matrixBytes;

    // Read everything that is about to become "previous" before writing anything.
    const uint4 savedL2W0 = _OutputTransformBuffer.Load4(curL2WAddr + 0);
    const uint4 savedL2W1 = _OutputTransformBuffer.Load4(curL2WAddr + 16);
    const uint4 savedL2W2 = _OutputTransformBuffer.Load4(curL2WAddr + 32);
    const uint4 savedW2L0 = _OutputTransformBuffer.Load4(curW2LAddr + 0);
    const uint4 savedW2L1 = _OutputTransformBuffer.Load4(curW2LAddr + 16);
    const uint4 savedW2L2 = _OutputTransformBuffer.Load4(curW2LAddr + 32);

    _OutputTransformBuffer.Store4(oldL2WAddr + 0, savedL2W0);
    _OutputTransformBuffer.Store4(oldL2WAddr + 16, savedL2W1);
    _OutputTransformBuffer.Store4(oldL2WAddr + 32, savedL2W2);
    _OutputTransformBuffer.Store4(oldW2LAddr + 0, savedW2L0);
    _OutputTransformBuffer.Store4(oldW2LAddr + 16, savedW2L1);
    _OutputTransformBuffer.Store4(oldW2LAddr + 32, savedW2L2);

    // This kernel consumes a single packet per queue entry.
    const TransformUpdatePacket packet = _TransformUpdateDataQueue[slot];
    const float4x4 newL2W = UnpackMatrix(packet.localToWorld0, packet.localToWorld1, packet.localToWorld2);
    const float4x4 newW2L = AffineInverse3D(newL2W);

#ifdef PROCESS_FLIP_GPUDRIVEN_WINDING
    UpdateInstanceFlipWindingFlag(newL2W, instance);
#endif

    // Overwrite the current local-to-world, then the current world-to-local.
    _OutputTransformBuffer.Store4(curL2WAddr + 0, asuint(PackMatrix0(newL2W)));
    _OutputTransformBuffer.Store4(curL2WAddr + 16, asuint(PackMatrix1(newL2W)));
    _OutputTransformBuffer.Store4(curL2WAddr + 32, asuint(PackMatrix2(newL2W)));
    _OutputTransformBuffer.Store4(curW2LAddr + 0, asuint(PackMatrix0(newW2L)));
    _OutputTransformBuffer.Store4(curW2LAddr + 16, asuint(PackMatrix1(newW2L)));
    _OutputTransformBuffer.Store4(curW2LAddr + 32, asuint(PackMatrix2(newW2L)));

#ifdef PROCESS_BOUNDING_SPHERES
    // One float4 (16 bytes) per instance in the bounding sphere region.
    const uint sphereAddr = _BoundingSphereOutputVec4Offset * 4 + instance * 4 * 4;
    const float4 sphere = _BoundingSphereDataQueue[slot];
    _OutputTransformBuffer.Store4(sphereAddr, asuint(sphere));
#endif
}
// Copies the stored current matrices of each queued instance into the
// "previous" regions of _OutputTransformBuffer. The current matrices are left
// untouched and no packet data is read; only the index queue is consulted.
[numthreads(64, 1, 1)]
void ScatterUpdateMotionMain(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    const uint slot = dispatchThreadID.x;
    if (slot >= (uint)_TransformUpdateQueueCount)
        return;

    // The index queue stores one 32-bit instance index per entry.
    const uint instance = _TransformUpdateIndexQueue.Load(slot << 2);

    // Each packed matrix is three float4 rows (48 bytes) per instance.
    const uint matrixBytes = instance * 4 * 4 * 3;
    const uint curL2WAddr = _TransformUpdateOutputL2WVec4Offset * 4 + matrixBytes;
    const uint curW2LAddr = _TransformUpdateOutputW2LVec4Offset * 4 + matrixBytes;
    const uint oldL2WAddr = _TransformUpdateOutputPrevL2WVec4Offset * 4 + matrixBytes;
    const uint oldW2LAddr = _TransformUpdateOutputPrevW2LVec4Offset * 4 + matrixBytes;

    // Rows 0..2 hold local-to-world, rows 3..5 hold world-to-local.
    uint4 saved[6];
    uint row;

    // All loads are issued before any store, L2W rows first.
    [unroll]
    for (row = 0; row < 3; ++row)
        saved[row] = _OutputTransformBuffer.Load4(curL2WAddr + row * 16);
    [unroll]
    for (row = 0; row < 3; ++row)
        saved[3 + row] = _OutputTransformBuffer.Load4(curW2LAddr + row * 16);

    [unroll]
    for (row = 0; row < 3; ++row)
        _OutputTransformBuffer.Store4(oldL2WAddr + row * 16, saved[row]);
    [unroll]
    for (row = 0; row < 3; ++row)
        _OutputTransformBuffer.Store4(oldW2LAddr + row * 16, saved[3 + row]);
}
// Number of probe queue entries to process. ScatterUpdateProbesMain threads
// whose dispatch index is >= this value return without doing any work.
int _ProbeUpdateQueueCount;
// Base offset of the probe records inside _OutputProbeBuffer. The kernel
// multiplies it by 4 to form a byte address and adds outputIndex * 128.
// NOTE(review): the "Vec4Offset" suffix suggests float4 units, but the "* 4"
// scaling implies 4-byte units - confirm against the code that sets this.
int _SHUpdateVec4Offset;
// NOTE(review): not referenced by ScatterUpdateProbesMain below, which writes
// the occlusion value as the eighth float4 of each SH record instead.
int _ProbeOcclusionUpdateVec4Offset;
// One SH packet per queue entry.
StructuredBuffer<SHUpdatePacket> _ProbeUpdateDataQueue;
// One float4 occlusion value per queue entry.
StructuredBuffer<float4> _ProbeOcclusionUpdateDataQueue;
// One 32-bit destination index per queue entry (read at byte index * 4).
ByteAddressBuffer _ProbeUpdateIndexQueue;
// Destination buffer receiving the SH records.
RWByteAddressBuffer _OutputProbeBuffer;
// Seven float4 terms derived from one SHUpdatePacket by UnpackShUpdate.
// ScatterUpdateProbesMain stores the members in declaration order, 16 bytes apart.
struct SHProperties
{
// (shr3, shr1, shr2, shr0 - shr6)
float4 SHAr;
// (shg3, shg1, shg2, shg0 - shg6)
float4 SHAg;
// (shb3, shb1, shb2, shb0 - shb6)
float4 SHAb;
// (shr4, shr5, shr6 * 3, shr7)
float4 SHBr;
// (shg4, shg5, shg6 * 3, shg7)
float4 SHBg;
// (shb4, shb5, shb6 * 3, shb7)
float4 SHBb;
// (shr8, shg8, shb8, 1.0)
float4 SHC;
};
// Rearranges the 27 scalar coefficients of an SHUpdatePacket (shr0..8, shg0..8,
// shb0..8) into the seven float4 terms of SHProperties.
SHProperties UnpackShUpdate(SHUpdatePacket sh)
{
    SHProperties result;

    // "A" terms: coefficients 3, 1, 2, and coefficient 0 minus coefficient 6, per channel.
    result.SHAr = float4(sh.shr3, sh.shr1, sh.shr2, sh.shr0 - sh.shr6);
    result.SHAg = float4(sh.shg3, sh.shg1, sh.shg2, sh.shg0 - sh.shg6);
    result.SHAb = float4(sh.shb3, sh.shb1, sh.shb2, sh.shb0 - sh.shb6);

    // "B" terms: coefficients 4, 5, 7 as-is and coefficient 6 scaled by 3, per channel.
    result.SHBr = float4(sh.shr4, sh.shr5, sh.shr6 * 3.0f, sh.shr7);
    result.SHBg = float4(sh.shg4, sh.shg5, sh.shg6 * 3.0f, sh.shg7);
    result.SHBb = float4(sh.shb4, sh.shb5, sh.shb6 * 3.0f, sh.shb7);

    // "C" term: coefficient 8 of each channel, with w fixed to 1.
    result.SHC = float4(sh.shr8, sh.shg8, sh.shb8, 1.0);

    return result;
}
// Writes one probe record per queue entry into _OutputProbeBuffer.
// A record is eight consecutive float4 values (128 bytes): the seven SH terms
// produced by UnpackShUpdate followed by the entry's occlusion value.
[numthreads(64, 1, 1)]
void ScatterUpdateProbesMain(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    const uint slot = dispatchThreadID.x;
    if (slot >= (uint)_ProbeUpdateQueueCount)
        return;

    // The index queue stores one 32-bit destination index per entry.
    const uint instance = _ProbeUpdateIndexQueue.Load(slot << 2);

    SHUpdatePacket packet = _ProbeUpdateDataQueue[slot];
    SHProperties coeffs = UnpackShUpdate(packet);
    const float4 occlusion = _ProbeOcclusionUpdateDataQueue[slot];

    const uint recordBytes = 8 * 4 * 4;
    const uint recordAddr = _SHUpdateVec4Offset * 4 + instance * recordBytes;

    _OutputProbeBuffer.Store4(recordAddr + 0 * 16, asuint(coeffs.SHAr));
    _OutputProbeBuffer.Store4(recordAddr + 1 * 16, asuint(coeffs.SHAg));
    _OutputProbeBuffer.Store4(recordAddr + 2 * 16, asuint(coeffs.SHAb));
    _OutputProbeBuffer.Store4(recordAddr + 3 * 16, asuint(coeffs.SHBr));
    _OutputProbeBuffer.Store4(recordAddr + 4 * 16, asuint(coeffs.SHBg));
    _OutputProbeBuffer.Store4(recordAddr + 5 * 16, asuint(coeffs.SHBb));
    _OutputProbeBuffer.Store4(recordAddr + 6 * 16, asuint(coeffs.SHC));
    _OutputProbeBuffer.Store4(recordAddr + 7 * 16, asuint(occlusion));
}