// This file is part of the FidelityFX SDK.
//
// Copyright (C) 2024 Advanced Micro Devices, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
|
|
#ifndef FFX_OPTICALFLOW_COMPUTE_OPTICAL_FLOW_V5_H
|
|
#define FFX_OPTICALFLOW_COMPUTE_OPTICAL_FLOW_V5_H
|
|
|
|
// ---------------------------------------------------------------------------
// Tile geometry and search-window constants for the block-matching pass.
// ---------------------------------------------------------------------------

// Number of rows in the groupshared reference block `pixels`, and the number
// of rows accumulated per candidate in CalculateQSads2.
#define CompareSize         (4 * 2)

#define BlockSizeX          8
#define BlockSizeY          8

// Stride of the cooperative fill loop in LoadSearchBuffer.
#define ThreadCount         (4 * 16)

// The search window starts this many pixels before the requested position
// on each axis (see LoadSearchBuffer).
#define SearchRadiusX       (8)
#define SearchRadiusY       (8)

// ComputeOpticalFlowAdvanced iterates over BlockCount x BlockCount blocks.
#define BlockCount          2

// Extent of the search window held in `searchBuffer`. X is counted in 32-bit
// words (LoadSearchBuffer advances 4 pixels per word), Y is counted in rows.
#define SearchBufferSizeX   ((CompareSize + SearchRadiusX * 2) / 4)
#define SearchBufferSizeY   (CompareSize + SearchRadiusY * 2)
|
|
|
|
// ---------------------------------------------------------------------------
// Groupshared working storage.
// ---------------------------------------------------------------------------

// Extra word appended to every sadMapBuffer row.
// NOTE(review): the name suggests padding against shared-memory bank
// conflicts - confirm.
#define bankBreaker 1

// Number of slots used to exchange per-wave partial results when the wave
// size is 32 (see BlockSad64 / SadMapMinReduction256).
#define MaxWaves    2

// Reference block: written by ComputeOpticalFlowAdvanced, read row by row in
// CalculateQSads2. Two packed words per row.
FFX_GROUPSHARED FfxUInt32 pixels[CompareSize][CompareSize / 4];

// Search window, stored row-major with SearchBufferSizeX words per row.
// Filled by LoadSearchBuffer.
FFX_GROUPSHARED FfxUInt32 searchBuffer[1][SearchBufferSizeY * SearchBufferSizeX];

// Four planes of packed (SAD << 16 | encoded coordinate) values, one plane per
// component of the SAD vector produced by CalculateQSads2 (see PrepareSadMap).
FFX_GROUPSHARED FfxUInt32 sadMapBuffer[4][SearchRadiusY * 2][(SearchRadiusX * 2) / 4 + bankBreaker];

// Per-wave partial sum / partial minimum exchanged across waves.
FFX_GROUPSHARED FfxUInt32 sWaveSad[MaxWaves];
FFX_GROUPSHARED FfxUInt32 sWaveMin[MaxWaves];
|
|
|
|
// Sums `blockSadSum` over all lanes whose `iLaneToBlockId` equals `block`.
//
// Lanes that belong to a different block contribute zero. The sum is first
// reduced inside the wave; when the wave is 32 lanes wide, each wave publishes
// its partial sum to `sWaveSad` and adds the value published by the other slot
// (`waveId ^ 1`).
//
// Must be called by every thread of the group: the 32-lane path contains a
// group barrier.
FfxUInt32 BlockSad64(FfxUInt32 blockSadSum, FfxInt32 iLocalIndex, FfxInt32 iLaneToBlockId, FfxInt32 block)
{
    // Mask out lanes that are not part of the requested block.
    const FfxUInt32 contribution = (iLaneToBlockId == block) ? blockSadSum : 0u;

    FfxUInt32 total = ffxWaveSum(contribution);

    if (ffxWaveLaneCount() == 32)
    {
        // iLocalIndex >> 5 maps threads 0..31 to slot 0 and 32..63 to slot 1.
        const FfxInt32 thisWave  = iLocalIndex >> 5u;
        const FfxInt32 otherWave = thisWave ^ 1;

        if (ffxWaveIsFirstLane())
        {
            sWaveSad[thisWave] = total;
        }
        FFX_GROUP_MEMORY_BARRIER;

        total += sWaveSad[otherWave];
    }

    return total;
}
|
|
|
|
// Returns the minimum packed entry of `sadMapBuffer`.
//
// Each thread first folds the four planes at its own cell
// (iSearchId.y, iSearchId.x), then the result is reduced across the wave.
// When the wave is 32 lanes wide, the two partial minima are exchanged through
// `sWaveMin` and combined.
//
// Must be called by every thread of the group: the 32-lane path contains a
// group barrier.
FfxUInt32 SadMapMinReduction256(FfxInt32x2 iSearchId, FfxInt32 iLocalIndex)
{
    const FfxInt32 col = iSearchId.x;
    const FfxInt32 row = iSearchId.y;

    // Per-thread fold of planes (0,1) and (2,3).
    const FfxUInt32 lowPair  = ffxMin(sadMapBuffer[0][row][col], sadMapBuffer[1][row][col]);
    const FfxUInt32 highPair = ffxMin(sadMapBuffer[2][row][col], sadMapBuffer[3][row][col]);

    FfxUInt32 best = ffxWaveMin(ffxMin(lowPair, highPair));

    if (ffxWaveLaneCount() == 32)
    {
        // iLocalIndex >> 5 maps threads 0..31 to slot 0 and 32..63 to slot 1.
        const FfxInt32 thisWave  = iLocalIndex >> 5u;
        const FfxInt32 otherWave = thisWave ^ 1;

        if (ffxWaveIsFirstLane())
        {
            sWaveMin[thisWave] = best;
        }
        FFX_GROUP_MEMORY_BARRIER;

        best = ffxMin(best, sWaveMin[otherWave]);
    }

    return best;
}
|
|
|
|
// Cooperatively fills `searchBuffer` with packed luma from the second image.
//
// The window starts at iPxPosShifted - (SearchRadiusX, SearchRadiusY) and is
// SearchBufferSizeX words wide by SearchBufferSizeY rows tall. Thread
// `iLocalIndex` loads words iLocalIndex, iLocalIndex + ThreadCount, ... and
// the function ends with a group barrier so the window is complete on return.
void LoadSearchBuffer(FfxInt32 iLocalIndex, FfxInt32x2 iPxPosShifted)
{
    const FfxInt32x2 windowOrigin = iPxPosShifted - FfxInt32x2(SearchRadiusX, SearchRadiusY);
    const FfxInt32   wordCount    = SearchBufferSizeX * SearchBufferSizeY;

    for (FfxInt32 word = iLocalIndex; word < wordCount; word += ThreadCount)
    {
        // Row-major position of this word inside the window.
        const FfxInt32 col = word % SearchBufferSizeX;
        const FfxInt32 row = word / SearchBufferSizeX;

        // Consecutive words are 4 pixels apart horizontally.
        const FfxInt32x2 samplePos = windowOrigin + FfxInt32x2(col * 4, row);

        searchBuffer[0][word] = LoadSecondImagePackedLuma(samplePos);
    }

    FFX_GROUP_MEMORY_BARRIER;
}
|
|
|
|
// Accumulates four SAD values between the reference block in `pixels` and the
// search window in `searchBuffer`, for the search cell `iSearchId`.
//
// For each of the CompareSize rows, three consecutive words of the search
// window (starting at column iSearchId.x of row iSearchId.y + dy) are compared
// against the two packed words of the matching reference row. The four
// components of the result are the ones PrepareSadMap associates with the
// horizontal candidates iSearchId.x * 4 + 0..3.
//
// Both the msad4 path and the QSad fallback now share one loop that is driven
// by CompareSize and SearchBufferSizeX. The msad4 path previously hard-coded
// the row stride as 6 and was unrolled for exactly 8 rows, which silently
// duplicated those constants. The loop has a compile-time trip count, and the
// accumulation order is unchanged.
FfxUInt32x4 CalculateQSads2(FfxInt32x2 iSearchId)
{
    FfxUInt32x4 sad = ffxBroadcast4(0u);

    for (FfxInt32 dy = 0; dy < CompareSize; dy++)
    {
        // First word of this thread's three-word span in search row (y + dy).
        const FfxInt32 idx = (iSearchId.y + dy) * SearchBufferSizeX + iSearchId.x;

        const FfxUInt32 a0 = searchBuffer[0][idx];
        const FfxUInt32 a1 = searchBuffer[0][idx + 1];
        const FfxUInt32 a2 = searchBuffer[0][idx + 2];

#if FFX_OPTICALFLOW_USE_MSAD4_INSTRUCTION == 1
        sad = msad4(pixels[dy][0], FfxUInt32x2(a0, a1), sad);
        sad = msad4(pixels[dy][1], FfxUInt32x2(a1, a2), sad);
#else
        sad += QSad(a0, a1, pixels[dy][0]);
        sad += QSad(a1, a2, pixels[dy][1]);
#endif
    }

    return sad;
}
|
|
|
|
// Component-wise absolute value of a signed 2-vector, computed without
// branches and returned as unsigned.
FfxUInt32x2 abs_2(FfxInt32x2 val)
{
    // Arithmetic shift: 0 for non-negative components, -1 (all ones) for
    // negative ones.
    const FfxInt32x2 signFill = val >> 31;

    // (v + s) ^ s leaves v unchanged when s == 0 and yields -v when s == -1.
    return FfxUInt32x2((val + signFill) ^ signFill);
}
|
|
|
|
// Packs a search-window coordinate into the low bits of a sad-map entry.
//
// With FFX_OPTICALFLOW_FIX_TOP_LEFT_BIAS == 1 the layout is
//   bits 12..15  |coord.y - SearchRadiusY|
//   bits  8..11  |coord.x - SearchRadiusX|
//   bits  4..7   coord.y
//   bits  0..3   coord.x
// PrepareSadMap ORs this below the SAD value and the map is min-reduced, so
// among equal SADs the entry with the smaller distance fields wins.
// NOTE(review): the 4-bit fields assume coordinates in 0..15 - confirm if the
// search radius is ever changed.
//
// Otherwise the layout is simply (coord.y << 8) | coord.x.
//
// The window centre is written as (SearchRadiusX, SearchRadiusY) instead of a
// literal 8, matching the offset that DecodeSearchCoord subtracts.
FfxUInt32 EncodeSearchCoord(FfxInt32x2 coord)
{
#if FFX_OPTICALFLOW_FIX_TOP_LEFT_BIAS == 1
    const FfxInt32x2  windowCentre = FfxInt32x2(SearchRadiusX, SearchRadiusY);
    const FfxUInt32x2 absCoord     = FfxUInt32x2(abs_2(coord - windowCentre));

    return FfxUInt32(absCoord.y << 12) | FfxUInt32(absCoord.x << 8) | FfxUInt32(coord.y << 4) | FfxUInt32(coord.x);
#else //FFX_OPTICALFLOW_FIX_TOP_LEFT_BIAS == 1
    return FfxUInt32(coord.y << 8) | FfxUInt32(coord.x);
#endif //FFX_OPTICALFLOW_FIX_TOP_LEFT_BIAS == 1
}
|
|
|
|
// Recovers the signed displacement stored by EncodeSearchCoord.
//
// Only the coordinate fields are read (4 bits each when
// FFX_OPTICALFLOW_FIX_TOP_LEFT_BIAS == 1, 8 bits each otherwise); any higher
// bits of `bits` are ignored. The stored window coordinate is re-centred by
// subtracting (SearchRadiusX, SearchRadiusY).
FfxInt32x2 DecodeSearchCoord(FfxUInt32 bits)
{
#if FFX_OPTICALFLOW_FIX_TOP_LEFT_BIAS == 1
    const FfxUInt32 fieldMask = 0xfu;
    const FfxUInt32 yShift    = 4u;
#else
    const FfxUInt32 fieldMask = 0xffu;
    const FfxUInt32 yShift    = 8u;
#endif

    const FfxInt32 cellX = FfxInt32(bits & fieldMask);
    const FfxInt32 cellY = FfxInt32((bits >> yShift) & fieldMask);

    return FfxInt32x2(cellX - SearchRadiusX, cellY - SearchRadiusY);
}
|
|
|
|
// Publishes this thread's four SAD values to `sadMapBuffer`.
//
// Component k of `qsad` goes to plane k at [iSearchId.y][iSearchId.x], packed
// as (sad << 16) | EncodeSearchCoord(x = iSearchId.x * 4 + k, y = iSearchId.y).
// Ends with a group barrier so the whole map is visible to the reduction that
// follows.
void PrepareSadMap(FfxInt32x2 iSearchId, FfxUInt32x4 qsad)
{
    const FfxInt32 col    = iSearchId.x;
    const FfxInt32 row    = iSearchId.y;
    const FfxInt32 firstX = col * 4;  // x of the candidate held in qsad.x

    sadMapBuffer[0][row][col] = (qsad.x << 16) | EncodeSearchCoord(FfxInt32x2(firstX + 0, row));
    sadMapBuffer[1][row][col] = (qsad.y << 16) | EncodeSearchCoord(FfxInt32x2(firstX + 1, row));
    sadMapBuffer[2][row][col] = (qsad.z << 16) | EncodeSearchCoord(FfxInt32x2(firstX + 2, row));
    sadMapBuffer[3][row][col] = (qsad.w << 16) | EncodeSearchCoord(FfxInt32x2(firstX + 3, row));

    FFX_GROUP_MEMORY_BARRIER;
}
|
|
|
|
|
|
// Bit-field extract: returns `bits` bits of `src` starting at bit `off`,
// right-aligned. The mask is built as (1u << bits) - 1u, so `bits` must be
// smaller than the width of uint.
uint ABfe(uint src, uint off, uint bits)
{
    uint lowMask = (1u << bits) - 1u;
    uint shifted = src >> off;
    return shifted & lowMask;
}
|
|
// Bit-field insert with an explicit mask: bits set in `mask` are taken from
// `ins`, all other bits are taken from `src`.
uint ABfi(uint src, uint ins, uint mask)
{
    uint inserted = ins & mask;
    uint retained = src & (~mask);
    return inserted | retained;
}
|
|
// Bit-field insert of the low `bits` bits: the low `bits` bits of the result
// come from `ins`, the remaining bits from `src`. The mask is built as
// (1u << bits) - 1u, so `bits` must be smaller than the width of uint.
uint ABfiM(uint src, uint ins, uint bits)
{
    uint lowMask  = (1u << bits) - 1u;
    uint inserted = ins & lowMask;
    uint retained = src & (~lowMask);
    return inserted | retained;
}
|
|
// Derives per-thread coordinates from the flat thread index.
//
//   iSearchId      x = bits 0..1 of iLocalIndex, y = bits 2..5
//   iLaneToBlockId bit 0 = bit 1 of iLocalIndex, bit 1 = bit 5 of iLocalIndex
//                  (i.e. the top bit of iSearchId.x and of iSearchId.y)
//   iPxPos         iGroupId * 16 + (iSearchId.x * 4, iSearchId.y)
void MapThreads(in FfxInt32x2 iGroupId, in FfxInt32 iLocalIndex,
                out FfxInt32x2 iSearchId, out FfxInt32x2 iPxPos, out FfxInt32 iLaneToBlockId)
{
    const uint cellX = ABfe(iLocalIndex, 0u, 2u);
    const uint cellY = ABfe(iLocalIndex, 2u, 4u);
    iSearchId = FfxInt32x2(cellX, cellY);

    const uint blockBitX = ABfe(iLocalIndex, 1u, 1u);
    const uint blockBitY = ABfe(iLocalIndex, 5u, 1u);
    iLaneToBlockId = FfxInt32(blockBitX | (blockBitY << 1u));

    // Each search cell spans 4 pixels horizontally and 1 vertically.
    const FfxInt32x2 groupOrigin = iGroupId << 4u;
    iPxPos = groupOrigin + iSearchId * FfxInt32x2(4, 1);
}
|
|
|
|
// Entry point of the block-matching pass for one thread group.
//
// A group covers a 16x16 pixel tile (iGroupId << 4) and a
// BlockCount x BlockCount set of flow vectors (iGroupId << 1). For every block
// the group:
//   1. reads the current flow vector as the search centre (forced to zero on
//      the last pyramid level, index OpticalFlowPyramidLevelCount() - 1),
//   2. stages the block's first-image luma in `pixels` and the second-image
//      search window in `searchBuffer`,
//   3. computes SADs for all candidates and min-reduces them,
//   4. stores centre + best displacement as the new flow vector.
// When FFX_LOCAL_SEARCH_FALLBACK == 1, on pyramid level 0 the vector is reset
// to zero if the co-located block SAD is not worse than the best candidate.
//
// If IsSceneChanged() is true, zero vectors are written and no search runs.
//
// iGlobalId and iLocalId are not used by this function.
void ComputeOpticalFlowAdvanced(FfxInt32x2 iGlobalId, FfxInt32x2 iLocalId, FfxInt32x2 iGroupId, FfxInt32 iLocalIndex)
{
    FfxInt32x2 searchCell;
    FfxInt32x2 pixelPos;
    FfxInt32   laneBlock;
    MapThreads(iGroupId, iLocalIndex, searchCell, pixelPos, laneBlock);

    if (IsSceneChanged())
    {
        // One lane per block (its first row, first word) writes the zero vector.
        const FfxBoolean isBlockOriginLane = ((searchCell.y & 0x7) == 0) && ((searchCell.x & 0x1) == 0);
        if (isBlockOriginLane)
        {
            StoreOpticalFlow(pixelPos >> 3u, FfxInt32x2(0, 0));
        }
        return;
    }

    const FfxBoolean usePrediction = (OpticalFlowPyramidLevel() != OpticalFlowPyramidLevelCount() - 1);

    const FfxUInt32 firstLuma = LoadFirstImagePackedLuma(pixelPos);

#if FFX_LOCAL_SEARCH_FALLBACK == 1
    // SAD against the second image at the same (unshifted) position.
    const FfxUInt32 secondLuma   = LoadSecondImagePackedLuma(pixelPos);
    const FfxUInt32 colocatedSad = Sad(firstLuma, secondLuma);
#endif //FFX_LOCAL_SEARCH_FALLBACK

    const FfxInt32x2 flowTileOrigin  = iGroupId << 1u;
    const FfxInt32x2 pixelTileOrigin = iGroupId << 4u;

    for (FfxInt32 by = 0; by < BlockCount; by++)
    {
        for (FfxInt32 bx = 0; bx < BlockCount; bx++)
        {
            const FfxInt32x2 blockId     = FfxInt32x2(bx, by);
            const FfxInt32   blockLinear = by * 2 + bx;
            const FfxInt32x2 flowPos     = flowTileOrigin + blockId;

            // Search centre for this block.
            FfxInt32x2 centre = LoadRwOpticalFlow(flowPos);
            if (!usePrediction)
            {
                centre = FfxInt32x2(0, 0);
            }

            // Only the lanes that own this block stage their luma word.
            if (laneBlock == blockLinear)
            {
                pixels[searchCell.y & 0x7][searchCell.x & 0x1] = firstLuma;
            }

            // Ends with a group barrier, which also covers the `pixels` write.
            LoadSearchBuffer(iLocalIndex, pixelTileOrigin + blockId * 8 + centre);

            const FfxUInt32x4 qsad = CalculateQSads2(searchCell);
            PrepareSadMap(searchCell, qsad);

            // Packed as (SAD << 16) | encoded coordinate.
            const FfxUInt32 bestPacked = SadMapMinReduction256(searchCell, iLocalIndex);

            FfxInt32x2 newVector = centre + DecodeSearchCoord(bestPacked);

#if FFX_LOCAL_SEARCH_FALLBACK == 1
            const FfxUInt32 blockSadSum = BlockSad64(colocatedSad, iLocalIndex, laneBlock, blockLinear);
            if (OpticalFlowPyramidLevel() == 0 && blockSadSum <= (bestPacked >> 16u))
            {
                newVector = FfxInt32x2(0, 0);
            }
#endif //FFX_LOCAL_SEARCH_FALLBACK

            StoreOpticalFlow(flowPos, newVector);
        }
    }
}
|
|
|
|
#endif // FFX_OPTICALFLOW_COMPUTE_OPTICAL_FLOW_V5_H
|