
// This file is part of the FidelityFX SDK.
//
// Copyright (C) 2024 Advanced Micro Devices, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files(the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions :
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef FFX_OPTICALFLOW_COMPUTE_OPTICAL_FLOW_V5_H
#define FFX_OPTICALFLOW_COMPUTE_OPTICAL_FLOW_V5_H
// Compile-time dimensions shared by the buffers and loops in this file.

// Number of rows in `pixels`; its second dimension is CompareSize / 4 words.
// NOTE(review): presumably each 32-bit word packs four luma samples, making
// the compared block CompareSize x CompareSize pixels - confirm against the
// packed-luma loaders.
#define CompareSize (4 * 2)
// Not referenced by the code visible in this file.
#define BlockSizeY 8
#define BlockSizeX 8
// Stride used by LoadSearchBuffer when distributing loads over the group.
// NOTE(review): presumably equals the dispatch's thread-group size - confirm.
#define ThreadCount (4 * 16)
// Offset subtracted from the block origin when filling the search buffer and
// from decoded candidate coordinates when turning them into a displacement.
#define SearchRadiusX (8)
#define SearchRadiusY (8)
// ComputeOpticalFlowAdvanced iterates over BlockCount x BlockCount blocks.
#define BlockCount 2
// Search buffer extent: X is counted in 32-bit words per row, Y in rows.
#define SearchBufferSizeX ((CompareSize + SearchRadiusX*2)/4)
#define SearchBufferSizeY (CompareSize + SearchRadiusY*2)
// Buffers declared with FFX_GROUPSHARED and used by the functions below.

// Packed luma of the block currently being matched; written by
// ComputeOpticalFlowAdvanced and read by CalculateQSads2.
FFX_GROUPSHARED FfxUInt32 pixels[CompareSize][CompareSize / 4];
// Packed luma of the search window, filled row-major by LoadSearchBuffer
// (SearchBufferSizeX words per row). Only index 0 of the first dimension is used.
FFX_GROUPSHARED FfxUInt32 searchBuffer[1][SearchBufferSizeY * SearchBufferSizeX];
// Extra element appended to the innermost dimension of sadMapBuffer.
// NOTE(review): the name suggests padding against bank conflicts - confirm.
#define bankBreaker 1
// One plane per component of the 4-wide SAD result; each entry holds
// (sad << 16) | encoded candidate coordinate, written by PrepareSadMap.
FFX_GROUPSHARED FfxUInt32 sadMapBuffer[4][SearchRadiusY * 2][(SearchRadiusX * 2) / 4 + bankBreaker];
// Number of per-wave slots; the reductions index them with iLocalIndex >> 5.
#define MaxWaves 2
// Per-wave partial results exchanged between waves when the wave has 32 lanes.
FFX_GROUPSHARED FfxUInt32 sWaveSad[MaxWaves];
FFX_GROUPSHARED FfxUInt32 sWaveMin[MaxWaves];
// Sums `blockSadSum` over the lanes whose `iLaneToBlockId` equals `block`.
//
// The sum is taken across the wave with ffxWaveSum. When the wave has 32 lanes,
// each wave publishes its partial sum in sWaveSad (slot iLocalIndex >> 5) and
// adds the partial sum stored in the neighbouring slot; for any other lane
// count the single-wave sum is returned as is.
//
// NOTE(review): the 32-lane path executes FFX_GROUP_MEMORY_BARRIER, so it
// presumably has to be reached by every thread of the group - confirm against
// the macro's definition.
FfxUInt32 BlockSad64(FfxUInt32 blockSadSum, FfxInt32 iLocalIndex, FfxInt32 iLaneToBlockId, FfxInt32 block)
{
    // Lanes that belong to a different block contribute zero.
    FfxUInt32 contribution = (iLaneToBlockId == block) ? blockSadSum : 0u;
    FfxUInt32 total = ffxWaveSum(contribution);

    if (ffxWaveLaneCount() == 32)
    {
        FfxInt32 thisWave = iLocalIndex >> 5u;
        FfxInt32 otherWave = thisWave ^ 1;

        if (ffxWaveIsFirstLane())
        {
            sWaveSad[thisWave] = total;
        }
        FFX_GROUP_MEMORY_BARRIER;

        total += sWaveSad[otherWave];
    }

    return total;
}
// Returns the minimum sadMapBuffer entry over all four planes and all lanes.
//
// Each lane first takes the minimum of the four planes at its own
// [iSearchId.y][iSearchId.x] cell, then ffxWaveMin reduces across the wave.
// When the wave has 32 lanes the two per-wave minima are exchanged through
// sWaveMin (slot iLocalIndex >> 5) and combined.
FfxUInt32 SadMapMinReduction256(FfxInt32x2 iSearchId, FfxInt32 iLocalIndex)
{
    FfxInt32 row = iSearchId.y;
    FfxInt32 col = iSearchId.x;

    // Per-lane minimum over the four planes.
    FfxUInt32 best = sadMapBuffer[0][row][col];
    best = ffxMin(best, sadMapBuffer[1][row][col]);
    best = ffxMin(best, sadMapBuffer[2][row][col]);
    best = ffxMin(best, sadMapBuffer[3][row][col]);

    best = ffxWaveMin(best);

    if (ffxWaveLaneCount() == 32)
    {
        FfxInt32 thisWave = iLocalIndex >> 5u;
        FfxInt32 otherWave = thisWave ^ 1;

        if (ffxWaveIsFirstLane())
        {
            sWaveMin[thisWave] = best;
        }
        FFX_GROUP_MEMORY_BARRIER;

        best = ffxMin(best, sWaveMin[otherWave]);
    }

    return best;
}
// Fills searchBuffer[0] with packed luma from the second image.
//
// The window starts at iPxPosShifted - (SearchRadiusX, SearchRadiusY). Word
// `id` of the buffer corresponds to column id % SearchBufferSizeX (advancing
// 4 in x per word) and row id / SearchBufferSizeX. Each thread handles the
// words iLocalIndex, iLocalIndex + ThreadCount, ... and the function ends with
// FFX_GROUP_MEMORY_BARRIER.
void LoadSearchBuffer(FfxInt32 iLocalIndex, FfxInt32x2 iPxPosShifted)
{
    FfxInt32x2 windowOrigin = iPxPosShifted - FfxInt32x2(SearchRadiusX, SearchRadiusY);
    FfxInt32 wordCount = SearchBufferSizeX * SearchBufferSizeY;

    FfxInt32 word = iLocalIndex;
    while (word < wordCount)
    {
        FfxInt32x2 cell = FfxInt32x2(word % SearchBufferSizeX, word / SearchBufferSizeX);
        FfxInt32x2 samplePos = windowOrigin + cell * FfxInt32x2(4, 1);
        searchBuffer[0][word] = LoadSecondImagePackedLuma(samplePos);
        word += ThreadCount;
    }

    FFX_GROUP_MEMORY_BARRIER;
}
// Accumulates a 4-wide SAD between the block in `pixels` and the search window
// in `searchBuffer`, for the candidate position addressed by `iSearchId`.
//
// For each of the CompareSize rows, three consecutive words of the search
// buffer starting at column iSearchId.x of row (iSearchId.y + row) are compared
// against the two words of that row in `pixels`. The four components of the
// result come from msad4 / QSad.
// NOTE(review): presumably the components correspond to four horizontally
// adjacent candidate offsets - confirm against QSad.
//
// Both preprocessor paths step through the search buffer with a row stride of
// SearchBufferSizeX words, so they stay in agreement if the window size changes.
FfxUInt32x4 CalculateQSads2(FfxInt32x2 iSearchId)
{
    FfxUInt32x4 sad = ffxBroadcast4(0u);
#if FFX_OPTICALFLOW_USE_MSAD4_INSTRUCTION == 1
    // Kept manually unrolled, one pair of msad4 calls per row of `pixels`.
    FfxInt32 idx = iSearchId.y * SearchBufferSizeX + iSearchId.x;
    sad = msad4(pixels[0][0], FfxUInt32x2(searchBuffer[0][idx], searchBuffer[0][idx + 1]), sad);
    sad = msad4(pixels[0][1], FfxUInt32x2(searchBuffer[0][idx + 1], searchBuffer[0][idx + 2]), sad);
    idx += SearchBufferSizeX;
    sad = msad4(pixels[1][0], FfxUInt32x2(searchBuffer[0][idx], searchBuffer[0][idx + 1]), sad);
    sad = msad4(pixels[1][1], FfxUInt32x2(searchBuffer[0][idx + 1], searchBuffer[0][idx + 2]), sad);
    idx += SearchBufferSizeX;
    sad = msad4(pixels[2][0], FfxUInt32x2(searchBuffer[0][idx], searchBuffer[0][idx + 1]), sad);
    sad = msad4(pixels[2][1], FfxUInt32x2(searchBuffer[0][idx + 1], searchBuffer[0][idx + 2]), sad);
    idx += SearchBufferSizeX;
    sad = msad4(pixels[3][0], FfxUInt32x2(searchBuffer[0][idx], searchBuffer[0][idx + 1]), sad);
    sad = msad4(pixels[3][1], FfxUInt32x2(searchBuffer[0][idx + 1], searchBuffer[0][idx + 2]), sad);
    idx += SearchBufferSizeX;
    sad = msad4(pixels[4][0], FfxUInt32x2(searchBuffer[0][idx], searchBuffer[0][idx + 1]), sad);
    sad = msad4(pixels[4][1], FfxUInt32x2(searchBuffer[0][idx + 1], searchBuffer[0][idx + 2]), sad);
    idx += SearchBufferSizeX;
    sad = msad4(pixels[5][0], FfxUInt32x2(searchBuffer[0][idx], searchBuffer[0][idx + 1]), sad);
    sad = msad4(pixels[5][1], FfxUInt32x2(searchBuffer[0][idx + 1], searchBuffer[0][idx + 2]), sad);
    idx += SearchBufferSizeX;
    sad = msad4(pixels[6][0], FfxUInt32x2(searchBuffer[0][idx], searchBuffer[0][idx + 1]), sad);
    sad = msad4(pixels[6][1], FfxUInt32x2(searchBuffer[0][idx + 1], searchBuffer[0][idx + 2]), sad);
    idx += SearchBufferSizeX;
    sad = msad4(pixels[7][0], FfxUInt32x2(searchBuffer[0][idx], searchBuffer[0][idx + 1]), sad);
    sad = msad4(pixels[7][1], FfxUInt32x2(searchBuffer[0][idx + 1], searchBuffer[0][idx + 2]), sad);
#else
    for (FfxInt32 dy = 0; dy < CompareSize; dy++)
    {
        FfxInt32 rowOffset = (iSearchId.y + dy) * SearchBufferSizeX;
        // Three neighbouring words cover the span compared against the two
        // words of this row of `pixels`.
        FfxUInt32 a0 = searchBuffer[0][rowOffset + iSearchId.x];
        FfxUInt32 a1 = searchBuffer[0][rowOffset + iSearchId.x + 1];
        FfxUInt32 a2 = searchBuffer[0][rowOffset + iSearchId.x + 2];
        sad += QSad(a0, a1, pixels[dy][0]);
        sad += QSad(a1, a2, pixels[dy][1]);
    }
#endif
    return sad;
}
// Component-wise absolute value of a signed 2-vector, computed without branches.
FfxUInt32x2 abs_2(FfxInt32x2 val)
{
    // Relies on >> of a signed value replicating the sign bit:
    // -1 (all ones) for negative components, 0 otherwise.
    FfxInt32x2 signMask = val >> 31;
    // (v + m) ^ m leaves v unchanged for m == 0 and yields -v for m == -1.
    FfxUInt32x2 magnitude = (val + signMask) ^ signMask;
    return magnitude;
}
// Packs a candidate coordinate inside the search window into the low 16 bits
// of a word, so it can be OR-ed below a SAD value (see PrepareSadMap) and
// recovered with DecodeSearchCoord after a minimum reduction.
//
// With FFX_OPTICALFLOW_FIX_TOP_LEFT_BIAS == 1 the layout is
//   bits 15..12 : |coord.y - SearchRadiusY|
//   bits 11..8  : |coord.x - SearchRadiusX|
//   bits  7..4  : coord.y
//   bits  3..0  : coord.x
// so that among equal SADs the minimum favours the candidate with the smallest
// distance from the window centre. This layout expects coord components in
// 0..15, which is what PrepareSadMap passes for the thread mapping in this file.
//
// Otherwise the layout is (coord.y << 8) | coord.x.
FfxUInt32 EncodeSearchCoord(FfxInt32x2 coord)
{
#if FFX_OPTICALFLOW_FIX_TOP_LEFT_BIAS == 1
    // Use the same radius constants as DecodeSearchCoord for the window centre.
    FfxUInt32x2 absCoord = FfxUInt32x2(abs_2(coord - FfxInt32x2(SearchRadiusX, SearchRadiusY)));
    return FfxUInt32(absCoord.y << 12) | FfxUInt32(absCoord.x << 8) | FfxUInt32(coord.y << 4) | FfxUInt32(coord.x);
#else //FFX_OPTICALFLOW_FIX_TOP_LEFT_BIAS == 1
    return FfxUInt32(coord.y << 8) | FfxUInt32(coord.x);
#endif //FFX_OPTICALFLOW_FIX_TOP_LEFT_BIAS == 1
}
// Recovers the displacement stored by EncodeSearchCoord from the low bits of
// `bits`: extracts the x and y fields and subtracts the search radius so the
// result is relative to the window centre. Bits above the coordinate fields
// (for example a SAD in the upper half) are ignored.
FfxInt32x2 DecodeSearchCoord(FfxUInt32 bits)
{
#if FFX_OPTICALFLOW_FIX_TOP_LEFT_BIAS == 1
    // 4-bit fields: x in bits 3..0, y in bits 7..4.
    const FfxUInt32 fieldMask = 0xfu;
    const FfxUInt32 yShift = 4u;
#else
    // 8-bit fields: x in bits 7..0, y in bits 15..8.
    const FfxUInt32 fieldMask = 0xffu;
    const FfxUInt32 yShift = 8u;
#endif
    FfxInt32 dx = FfxInt32(bits & fieldMask) - SearchRadiusX;
    FfxInt32 dy = FfxInt32((bits >> yShift) & fieldMask) - SearchRadiusY;
    return FfxInt32x2(dx, dy);
}
// Stores this thread's four SAD results in sadMapBuffer, one per plane.
//
// Each entry carries the SAD in the upper 16 bits and the encoded candidate
// coordinate (iSearchId.x * 4 + component index, iSearchId.y) in the lower
// bits. Ends with FFX_GROUP_MEMORY_BARRIER.
void PrepareSadMap(FfxInt32x2 iSearchId, FfxUInt32x4 qsad)
{
    FfxInt32 row = iSearchId.y;
    FfxInt32 col = iSearchId.x;
    // x coordinate of the candidate held in component 0.
    FfxInt32 firstX = col * 4;

    sadMapBuffer[0][row][col] = (qsad.x << 16) | EncodeSearchCoord(FfxInt32x2(firstX + 0, row));
    sadMapBuffer[1][row][col] = (qsad.y << 16) | EncodeSearchCoord(FfxInt32x2(firstX + 1, row));
    sadMapBuffer[2][row][col] = (qsad.z << 16) | EncodeSearchCoord(FfxInt32x2(firstX + 2, row));
    sadMapBuffer[3][row][col] = (qsad.w << 16) | EncodeSearchCoord(FfxInt32x2(firstX + 3, row));

    FFX_GROUP_MEMORY_BARRIER;
}
// Bit-field extract: returns `bits` bits of `src` starting at bit `off`,
// right-aligned in the result.
uint ABfe(uint src, uint off, uint bits)
{
    uint lowMask = (1u << bits) - 1u;
    uint shifted = src >> off;
    return shifted & lowMask;
}
// Bit-field insert by mask: takes the bits of `ins` where `mask` is set and
// the bits of `src` everywhere else.
uint ABfi(uint src, uint ins, uint mask)
{
    uint inserted = ins & mask;
    uint preserved = src & (~mask);
    return inserted | preserved;
}
// Bit-field insert by width: replaces the low `bits` bits of `src` with the
// low `bits` bits of `ins`.
uint ABfiM(uint src, uint ins, uint bits)
{
    uint lowMask = (1u << bits) - 1u;
    uint inserted = ins & lowMask;
    uint preserved = src & (~lowMask);
    return inserted | preserved;
}
// Derives a thread's working coordinates from its group id and local index.
//
//   iSearchId      : x = bits 0..1 of iLocalIndex, y = bits 2..5.
//   iLaneToBlockId : bit 1 of iLocalIndex (low bit) and bit 5 (high bit),
//                    i.e. (iSearchId.x >> 1) | ((iSearchId.y >> 3) << 1).
//   iPxPos         : iGroupId * 16 + (iSearchId.x * 4, iSearchId.y).
void MapThreads(in FfxInt32x2 iGroupId, in FfxInt32 iLocalIndex,
out FfxInt32x2 iSearchId, out FfxInt32x2 iPxPos, out FfxInt32 iLaneToBlockId)
{
    FfxInt32 cellX = iLocalIndex & 0x3;
    FfxInt32 cellY = (iLocalIndex >> 2) & 0xf;
    FfxInt32 blockBitX = (iLocalIndex >> 1) & 0x1;
    FfxInt32 blockBitY = (iLocalIndex >> 5) & 0x1;

    iSearchId = FfxInt32x2(cellX, cellY);
    iLaneToBlockId = blockBitX | (blockBitY << 1);
    iPxPos = (iGroupId << 4u) + FfxInt32x2(cellX * 4, cellY);
}
// Entry point: for each of the BlockCount x BlockCount blocks covered by this
// thread group, searches the second image for the best match of the block in
// the first image and stores the resulting vector with StoreOpticalFlow.
//
// Per block: the lanes mapped to that block publish their packed luma in
// `pixels`, the search window is loaded around the block position offset by
// the starting vector, every lane evaluates four candidates, and the minimum
// packed (SAD, coordinate) entry is decoded into a displacement that is added
// to the starting vector.
//
// iGlobalId and iLocalId are not used by this function.
void ComputeOpticalFlowAdvanced(FfxInt32x2 iGlobalId, FfxInt32x2 iLocalId, FfxInt32x2 iGroupId, FfxInt32 iLocalIndex)
{
    FfxInt32x2 iSearchId;
    FfxInt32x2 iPxPos;
    FfxInt32 iLaneToBlockId;
    MapThreads(iGroupId, iLocalIndex, iSearchId, iPxPos, iLaneToBlockId);

    if (IsSceneChanged())
    {
        // A single lane per block (y multiple of 8, even x) writes a zero vector.
        FfxBoolean bWritesForBlock = ((iSearchId.y & 0x7) == 0) && ((iSearchId.x & 0x1) == 0);
        if (bWritesForBlock)
        {
            StoreOpticalFlow(iPxPos >> 3u, FfxInt32x2(0, 0));
        }
        return;
    }

    // On the last pyramid level the stored vector is ignored and the search
    // starts from zero.
    const FfxBoolean bStartFromStoredVector = (OpticalFlowPyramidLevel() != OpticalFlowPyramidLevelCount() - 1);

    FfxUInt32 packedLuma_4blocks = LoadFirstImagePackedLuma(iPxPos);
#if FFX_LOCAL_SEARCH_FALLBACK == 1
    // SAD between both images at the same position (no displacement).
    FfxUInt32 sad_4blocks = Sad(packedLuma_4blocks, LoadSecondImagePackedLuma(iPxPos));
#endif //FFX_LOCAL_SEARCH_FALLBACK

    FfxInt32x2 ofGroupOffset = iGroupId << 1u;
    FfxInt32x2 pixelGroupOffset = iGroupId << 4u;

    for (FfxInt32 by = 0; by < BlockCount; by++)
    {
        for (FfxInt32 bx = 0; bx < BlockCount; bx++)
        {
            FfxInt32x2 blockId = FfxInt32x2(bx, by);
            // Matches the numbering produced by MapThreads for iLaneToBlockId.
            FfxInt32 blockIndex = by * 2 + bx;
            FfxInt32x2 flowPos = ofGroupOffset + blockId;

            FfxInt32x2 startVector = LoadRwOpticalFlow(flowPos);
            if (!bStartFromStoredVector)
            {
                startVector = FfxInt32x2(0, 0);
            }

            // Only the lanes mapped to this block provide the reference pixels.
            if (iLaneToBlockId == blockIndex)
            {
                pixels[iSearchId.y & 0x7][iSearchId.x & 0x1] = packedLuma_4blocks;
            }

            // Ends with a barrier, which also orders the `pixels` writes above
            // before the reads in CalculateQSads2.
            LoadSearchBuffer(iLocalIndex, pixelGroupOffset + blockId * 8 + startVector);

            PrepareSadMap(iSearchId, CalculateQSads2(iSearchId));

            FfxUInt32 minSad = SadMapMinReduction256(iSearchId, iLocalIndex);
            FfxInt32x2 newVector = startVector + DecodeSearchCoord(minSad);

#if FFX_LOCAL_SEARCH_FALLBACK == 1
            // At pyramid level 0, fall back to a zero vector when the
            // undisplaced block SAD is no worse than the best candidate's SAD.
            FfxUInt32 blockSadSum = BlockSad64(sad_4blocks, iLocalIndex, iLaneToBlockId, blockIndex);
            if (OpticalFlowPyramidLevel() == 0 && blockSadSum <= (minSad >> 16u))
            {
                newVector = FfxInt32x2(0, 0);
            }
#endif //FFX_LOCAL_SEARCH_FALLBACK

            StoreOpticalFlow(flowPos, newVector);
        }
    }
}
#endif // FFX_OPTICALFLOW_COMPUTE_OPTICAL_FLOW_V5_H