You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

146 lines
4.3 KiB

// Kernel declarations. Each line names a compute entry point, followed by
// NAME=VALUE preprocessor defines that apply when that kernel is compiled.
//  - PREFIX_SUM_THREAD_COUNT is the thread-group width used in [numthreads].
//  - VFX_BATCH_COUNT / VFX_BUILD_PREFIX_SUM are the macro names under which the
//    batch-count and prefix-sum functions are written below; defining them to
//    the kernel name makes the same function body serve as both the 1024-thread
//    and the 128-thread variant.
//  - VFX_USE_INSTANCING=1 (prefix-sum kernels only) enables the instancing
//    include and the instancing code paths guarded by that macro.
#pragma kernel CSVFXPrepareSingleInstance PREFIX_SUM_THREAD_COUNT=1
#pragma kernel CSVFXBatchSumCount VFX_BATCH_COUNT=CSVFXBatchSumCount PREFIX_SUM_THREAD_COUNT=1024
#pragma kernel CSVFXBatchSumCount_128 VFX_BATCH_COUNT=CSVFXBatchSumCount_128 PREFIX_SUM_THREAD_COUNT=128
#pragma kernel CSVFXBuildPrefixSum VFX_BUILD_PREFIX_SUM=CSVFXBuildPrefixSum VFX_USE_INSTANCING=1 PREFIX_SUM_THREAD_COUNT=1024
#pragma kernel CSVFXBuildPrefixSum_128 VFX_BUILD_PREFIX_SUM=CSVFXBuildPrefixSum_128 VFX_USE_INSTANCING=1 PREFIX_SUM_THREAD_COUNT=128
// Graphics APIs this compute shader is compiled for.
#pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch switch2 glcore gles3 webgpu
#include "HLSLSupport.cginc"
// Per-dispatch constants shared by all kernels in this file.
CBUFFER_START(Uniform)
// Destination offset, in uint elements. Used directly as an index into
// dstStructBuffer, and shifted left by 2 to form a byte address for dstBuffer.
uint dstOffset;
// Source offset, in uint elements, into dstStructBuffer (prefix-sum kernels).
uint srcOffset;
// Number of elements to process; indices >= size are skipped.
uint size;
// Not referenced directly in the code visible here; presumably consumed by
// VFXInstancing.hlsl - TODO confirm.
float4 instancingConstants;
// Not referenced directly in the code visible here; presumably consumed by
// VFXInstancing.hlsl - TODO confirm.
uint2 instancingBufferOffsets;
CBUFFER_END
// Instancing support is only pulled in for kernels compiled with
// VFX_USE_INSTANCING set (the prefix-sum kernels).
#if VFX_USE_INSTANCING
// These switches are defined before the include so that VFXInstancing.hlsl can
// see them; their exact effect is defined in that file (not visible here).
#define VFX_INSTANCING_ACTIVE_INDIRECTION 1
#define VFX_INSTANCING_BATCH_INDIRECTION 1
#define VFX_INSTANCING_FIXED_SIZE 1
// Presumably provides VFXInitInstancing and instancingBatchSize, both used by
// the prefix-sum kernel below - confirm against the include.
#include "VFXInstancing.hlsl"
#endif
// Capacity of the group-shared prefix-sum array, and the number of indices a
// full thread group covers (thread count * per-thread count).
#define PREFIX_SUM_MAX_SIZE 1024
// Number of consecutive indices each thread handles: 1 for the 1024-thread
// kernels, 8 for the 128-thread kernels.
#define PREFIX_SUM_PER_THREAD (PREFIX_SUM_MAX_SIZE / PREFIX_SUM_THREAD_COUNT)
// Read-only input counts summed by the batch-count kernel.
StructuredBuffer<uint> srcStructBufferUint;
// Raw destination buffer; kernels store a single uint at byte address
// (dstOffset << 2).
RWByteAddressBuffer dstBuffer;
// Read/write uint buffer used by the prepare and prefix-sum kernels as both
// the source of counts and the destination of results.
RWStructuredBuffer<uint> dstStructBuffer;
// Single-instance path: forwards the count stored in the first element of
// dstStructBuffer to the raw destination buffer and to the prefix-sum slot,
// then clears the count. Only the thread with dispatch id 0 does any work.
[numthreads(1, 1, 1)]
void CSVFXPrepareSingleInstance(uint3 threadId : SV_DispatchThreadID)
{
    // Every thread other than the first one has nothing to do.
    if (threadId.x != 0u)
        return;

    // The count always lives in the first element of the buffer; the
    // prefix-sum entry sits two elements later (after count and total count).
    const uint countSlot = 0u;
    const uint prefixSumSlot = countSlot + 2u;

    const uint eventCount = dstStructBuffer[countSlot];

    // Forward the count to the indirect buffer (element offset -> byte address).
    dstBuffer.Store(dstOffset * 4u, eventCount);

    // Mirror the count into the prefix-sum slot.
    dstStructBuffer[prefixSumSlot] = eventCount;

    // Clear the event count.
    dstStructBuffer[countSlot] = 0u;
}
// Running total shared by the thread group. It is deliberately declared
// without an initializer: group-shared storage is not reliably initialized
// from a declaration, so the kernel zeroes it explicitly before use.
groupshared uint batchSumCount;

// Sums the first `size` entries of srcStructBufferUint and stores the total as
// a single uint in dstBuffer at byte address (dstOffset << 2).
// Each thread covers PREFIX_SUM_PER_THREAD consecutive indices.
// NOTE(review): the dispatch thread id is compared against 0 for the
// init/store steps, so this appears to assume a single thread group per
// dispatch - confirm against the caller.
[numthreads(PREFIX_SUM_THREAD_COUNT, 1, 1)]
void VFX_BATCH_COUNT(uint3 threadId : SV_DispatchThreadID)
{
    // Explicit reset of the shared accumulator (required, see declaration).
    if (threadId.x == 0)
    {
        batchSumCount = 0;
    }

    // Make the reset visible to every thread before anyone accumulates.
    GroupMemoryBarrierWithGroupSync();

    // Sum this thread's in-range elements in a register first, so that at most
    // one atomic is issued per thread instead of one per element.
    uint localSum = 0u;
    [unroll]
    for (uint i = 0u; i < PREFIX_SUM_PER_THREAD; ++i)
    {
        uint index = threadId.x * PREFIX_SUM_PER_THREAD + i;
        if (index < size)
        {
            localSum += srcStructBufferUint[index];
        }
    }

    // Adding zero would not change the total, so skip the atomic in that case.
    if (localSum != 0u)
    {
        InterlockedAdd(batchSumCount, localSum);
    }

    // Wait for all threads to finish adding.
    GroupMemoryBarrierWithGroupSync();

    // Store the final value (only first thread).
    if (threadId.x == 0)
    {
        dstBuffer.Store(dstOffset << 2, batchSumCount);
    }
}
// Group-shared working array for the scan; one slot per index a full thread
// group covers (PREFIX_SUM_MAX_SIZE entries).
groupshared unsigned int prefixSum[PREFIX_SUM_MAX_SIZE];
// Builds an inclusive prefix sum over `size` values read from dstStructBuffer
// (starting at srcOffset) and writes the result back to dstStructBuffer
// starting at dstOffset. Each thread handles PREFIX_SUM_PER_THREAD consecutive
// indices in every phase.
// NOTE(review): the dispatch thread id is used directly to index the
// group-shared array, so this appears to assume a single thread group per
// dispatch - confirm against the caller.
[numthreads(PREFIX_SUM_THREAD_COUNT, 1, 1)]
void VFX_BUILD_PREFIX_SUM(uint3 threadId : SV_DispatchThreadID)
{
// Phase 1: read values from the buffer into shared memory.
// Slots at index >= size are left unwritten; the scan below touches them, but
// in-range slots only ever read lower indices and out-of-range slots are never
// written back.
[unroll]
for (int i = 0; i < PREFIX_SUM_PER_THREAD; ++i)
{
uint index = threadId.x * PREFIX_SUM_PER_THREAD + i;
if (index < size)
{
uint srcIndex = index;
#if VFX_USE_INSTANCING
// With instancing, the source slot is the instanceIndex returned by
// VFXInitInstancing for this index (mapping defined in VFXInstancing.hlsl,
// not visible here).
uint instanceIndex, instanceActiveIndex, instanceCurrentIndex;
VFXInitInstancing(index, instanceIndex, instanceActiveIndex, instanceCurrentIndex);
srcIndex = instanceIndex;
#endif
prefixSum[index] = dstStructBuffer[srcIndex + srcOffset];
}
}
// Phase 2: perform prefix sum (Sklansky).
// The loop bound depends only on the uniform `size`, so every thread executes
// the same number of iterations and therefore the same number of barriers.
for (uint j = 1; j < size; j <<= 1)
{
// Wait for the previous step's (or the load phase's) writes.
GroupMemoryBarrierWithGroupSync();
// Clears the bits below j, i.e. rounds an index down to a multiple of j.
uint mask = ~(j - 1);
[unroll]
for (int ii = 0; ii < PREFIX_SUM_PER_THREAD; ++ii)
{
uint index = threadId.x * PREFIX_SUM_PER_THREAD + ii;
// Indices with bit j set add the last element of the preceding j-sized
// block. That element has bit j clear, so it is not modified during this
// step and reads never race with writes.
if ((index & j) != 0)
prefixSum[index] += prefixSum[(index & mask) - 1];
}
}
// Wait for the final scan step before reading results.
GroupMemoryBarrierWithGroupSync();
// Phase 3: write values to the buffer.
[unroll]
for (int iii = 0; iii < PREFIX_SUM_PER_THREAD; ++iii)
{
uint index = threadId.x * PREFIX_SUM_PER_THREAD + iii;
#if VFX_USE_INSTANCING
// reset event count for all instances
// (instancingBatchSize is declared outside this file's visible code,
// presumably by VFXInstancing.hlsl.)
if (index < instancingBatchSize)
{
dstStructBuffer[index + srcOffset] = 0u;
}
#endif
// Only the first `size` results are meaningful and written out.
if (index < size)
{
dstStructBuffer[index + dstOffset] = prefixSum[index];
}
}
}