You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

644 lines
28 KiB

// This file is part of the FidelityFX SDK.
//
// Copyright (C) 2024 Advanced Micro Devices, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files(the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions :
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
// Deringing step: limits fColor, component-wise, to the [aabbMin, aabbMax] range
// stored in the rectification box. fColor is modified in place.
void Deringing(RectificationBox clippingBox, FFX_PARAMETER_INOUT FfxFloat32x3 fColor)
{
    const FfxFloat32x3 fLowerBound = clippingBox.aabbMin;
    const FfxFloat32x3 fUpperBound = clippingBox.aabbMax;

    fColor = clamp(fColor, fLowerBound, fUpperBound);
}
// Lanczos evaluation mode: 0 = reference, 1 = LUT, 2 = approximate.
// Defaults to the approximate variant unless the build already chose one.
#if !defined(FFX_FSR3UPSCALER_OPTION_UPSAMPLE_USE_LANCZOS_TYPE)
#define FFX_FSR3UPSCALER_OPTION_UPSAMPLE_USE_LANCZOS_TYPE 2 // Approximate
#endif

// Returns the Lanczos-2 weight of a source sample.
//   fSrcSampleOffset : offset of the sample from the filter centre.
//   fKernelWeight    : scale applied to both offset components before evaluation.
// The evaluation routine is picked at compile time by
// FFX_FSR3UPSCALER_OPTION_UPSAMPLE_USE_LANCZOS_TYPE; any other value is a build error.
FfxFloat32 GetUpsampleLanczosWeight(FfxFloat32x2 fSrcSampleOffset, FfxFloat32 fKernelWeight)
{
    const FfxFloat32x2 fScaledOffset = fSrcSampleOffset * fKernelWeight.xx;

#if FFX_FSR3UPSCALER_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 0 // LANCZOS_TYPE_REFERENCE
    return Lanczos2(length(fScaledOffset));
#elif FFX_FSR3UPSCALER_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 1 // LANCZOS_TYPE_LUT
    return Lanczos2_UseLUT(length(fScaledOffset));
#elif FFX_FSR3UPSCALER_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 2 // LANCZOS_TYPE_APPROXIMATE
    // The approximation takes the squared distance, so no square root is needed.
    return Lanczos2ApproxSq(dot(fScaledOffset, fScaledOffset));
#else
#error "Invalid Lanczos type"
#endif
}
#if FFX_HALF
// Minimum-precision overload of GetUpsampleLanczosWeight: same contract as the
// 32-bit version, with FFX_MIN16_F inputs and result.
FFX_MIN16_F GetUpsampleLanczosWeight(FFX_MIN16_F2 fSrcSampleOffset, FFX_MIN16_F fKernelWeight)
{
    const FFX_MIN16_F2 fScaledOffset = fSrcSampleOffset * fKernelWeight.xx;

#if FFX_FSR3UPSCALER_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 0 // LANCZOS_TYPE_REFERENCE
    return Lanczos2(length(fScaledOffset));
#elif FFX_FSR3UPSCALER_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 1 // LANCZOS_TYPE_LUT
    return Lanczos2_UseLUT(length(fScaledOffset));
#elif FFX_FSR3UPSCALER_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 2 // LANCZOS_TYPE_APPROXIMATE
    // TODO(to test): a LUT indexed by squared distance (Lanczos2Sq_UseLUT on the same
    // dot product) might save the reciprocal square root computation.
    return Lanczos2ApproxSq(dot(fScaledOffset, fScaledOffset));
#else
#error "Invalid Lanczos type"
#endif
}
#endif
// Returns the upper limit for the upsampling kernel bias:
//   min(1.99, 1 + (1 / DownscaleFactor().x - 1))
// Only the x component of the downscale factor is used; neither `params` nor
// `data` is read or written here.
FfxFloat32 ComputeMaxKernelWeight(const AccumulationPassCommonParams params, FFX_PARAMETER_INOUT AccumulationPassData data)
{
    const FfxFloat32x2 fInverseDownscaleMinusOne = 1.0f / FfxFloat32x2(DownscaleFactor()) - 1.0f;
    const FfxFloat32   fKernelSizeBias           = 1.0f + fInverseDownscaleMinusOne.x;
    const FfxFloat32   fUpperLimit               = FfxFloat32(1.99f);

    return ffxMin(fUpperLimit, fKernelSizeBias);
}
// Loads the input color at iSamplePos, clamps negative components to zero,
// applies Exposure() and returns the result converted with RGBToYCoCg.
FfxFloat32x3 LoadPreparedColor(FfxInt32x2 iSamplePos)
{
    const FfxFloat32x3 fNonNegativeInput = ffxMax(FfxFloat32x3(0, 0, 0), LoadInputColor(iSamplePos));
    const FfxFloat32x3 fExposedRgb       = fNonNegativeInput * Exposure();

    return RGBToYCoCg(fExposedRgb);
}
// The paired 16-bit code path is off by default. It is switched on only when all of
// the following hold: half precision is enabled, the approximate Lanczos type (2) is
// selected, the target defines __XBOX_SCARLETT, and __XBATG_EXTRA_16_BIT_OPTIMISATION
// is defined as 1.
#define FFX_FSR3UPSCALER_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS 0
#if FFX_HALF && (FFX_FSR3UPSCALER_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 2) && defined(__XBOX_SCARLETT) && defined(__XBATG_EXTRA_16_BIT_OPTIMISATION) && (__XBATG_EXTRA_16_BIT_OPTIMISATION == 1)
#undef FFX_FSR3UPSCALER_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS
#define FFX_FSR3UPSCALER_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS 1
#endif
#if FFX_FSR3UPSCALER_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS
// Loads two input texels and returns them channel-wise as 16-bit pairs:
// lane x of r/g/b comes from iSamplePos0, lane y from iSamplePos1.
// Each channel is clamped to be non-negative and then multiplied by Exposure().
void LoadPreparedColorPairedRgb(FFX_PARAMETER_OUT FFX_MIN16_F2 r,
                                FFX_PARAMETER_OUT FFX_MIN16_F2 g,
                                FFX_PARAMETER_OUT FFX_MIN16_F2 b,
                                FfxInt32x2 iSamplePos0,
                                FfxInt32x2 iSamplePos1)
{
    const FFX_MIN16_F3 texel0 = FFX_MIN16_F3(LoadInputColor(iSamplePos0));
    const FFX_MIN16_F3 texel1 = FFX_MIN16_F3(LoadInputColor(iSamplePos1));

    // Transpose the two RGB texels into per-channel pairs and drop negative values.
    const FFX_MIN16_F2 nonNegativeR = ffxMax(FFX_MIN16_F2(0, 0), FFX_MIN16_F2(texel0.r, texel1.r));
    const FFX_MIN16_F2 nonNegativeG = ffxMax(FFX_MIN16_F2(0, 0), FFX_MIN16_F2(texel0.g, texel1.g));
    const FFX_MIN16_F2 nonNegativeB = ffxMax(FFX_MIN16_F2(0, 0), FFX_MIN16_F2(texel0.b, texel1.b));

    // Apply exposure and convert the product back to the 16-bit pair type.
    r = FFX_MIN16_F2(nonNegativeR * Exposure());
    g = FFX_MIN16_F2(nonNegativeG * Exposure());
    b = FFX_MIN16_F2(nonNegativeB * Exposure());
}
// Tonemaps two colors at once (one per lane), in place: every channel is
// multiplied by 1 / (1 + max(0, r, g, b)) of its own lane.
void TonemapPaired(FFX_PARAMETER_INOUT FFX_MIN16_F2 r, FFX_PARAMETER_INOUT FFX_MIN16_F2 g, FFX_PARAMETER_INOUT FFX_MIN16_F2 b)
{
    const FFX_MIN16_F2 onePlusMaxChannel = ffxMax(ffxMax(ffxMax(0.0, r), g), b) + FFX_MIN16_F2(1.0, 1.0);

    // The reciprocal is evaluated in 32-bit and converted back to 16-bit.
    // NOTE: expect 2 x v_cvt_f32_f16, then 2 x v_rcp_f32, then 2 x v_cvt_f16_f32
    const FFX_MIN16_F2 scale = FFX_MIN16_F2(FfxFloat32x2(1.0, 1.0) / FfxFloat32x2(onePlusMaxChannel));

    r = r * scale;
    g = g * scale;
    b = b * scale;
}
// Converts two RGB colors (one per lane) to YCoCg in place.
// On return r holds Y, g holds Co and b holds Cg.
void RGBToYCoCgPaired(FFX_PARAMETER_INOUT FFX_MIN16_F2 r, FFX_PARAMETER_INOUT FFX_MIN16_F2 g, FFX_PARAMETER_INOUT FFX_MIN16_F2 b)
{
    /**
     * Reference conversion:
     *
     *   Y  =  0.25 * R + 0.5 * G + 0.25 * B
     *   Co =  0.5  * R           - 0.5  * B
     *   Cg = -0.25 * R + 0.5 * G - 0.25 * B
     *
     * With the shared term Q = 0.25 * (R + B) every output needs one more
     * multiply-add:
     *
     *   Y  = 0.5 * G + Q
     *   Co = 2 * Q - B
     *   Cg = 0.5 * G - Q
     */

    // NOTE: expect v_pk_add_f32 + v_pk_mul_f32
    const FFX_MIN16_F2 quarterRedPlusBlue = (r + b) * 0.25;

    // All three results are computed from the untouched inputs before any of
    // the in/out parameters is overwritten.
    // NOTE: expect 3x v_pk_fma_f32
    const FFX_MIN16_F2 luma         = g * 0.5 + quarterRedPlusBlue;
    const FFX_MIN16_F2 chromaOrange = quarterRedPlusBlue * 2.0 - b;
    const FFX_MIN16_F2 chromaGreen  = g * 0.5 - quarterRedPlusBlue;

    r = luma;
    g = chromaOrange;
    b = chromaGreen;
}
// Computes the minimum and maximum of one color channel over the sample pairs
// sample0..sample3 (both lanes of each) and the centre sample.
// Only sampleCenter.x is consulted; its y lane is ignored.
// Returns (minimum, maximum) in (.x, .y).
FFX_MIN16_F2 Compute3x3SamplesMinMaxPaired(FFX_PARAMETER_IN FFX_MIN16_F2 sampleCenter,
                                           FFX_PARAMETER_IN FFX_MIN16_F2 sample0,
                                           FFX_PARAMETER_IN FFX_MIN16_F2 sample1,
                                           FFX_PARAMETER_IN FFX_MIN16_F2 sample2,
                                           FFX_PARAMETER_IN FFX_MIN16_F2 sample3)
{
    // Lane-wise reduction of the four pairs first ...
    const FFX_MIN16_F2 laneMin01 = ffxMin(sample0, sample1);
    const FFX_MIN16_F2 laneMin23 = ffxMin(sample2, sample3);
    const FFX_MIN16_F2 laneMin   = ffxMin(laneMin01, laneMin23);

    const FFX_MIN16_F2 laneMax01 = ffxMax(sample0, sample1);
    const FFX_MIN16_F2 laneMax23 = ffxMax(sample2, sample3);
    const FFX_MIN16_F2 laneMax   = ffxMax(laneMax01, laneMax23);

    // ... then fold the two lanes together with the centre value.
    const FFX_MIN16_F channelMin = ffxMin3Half(laneMin.x, laneMin.y, sampleCenter.x);
    const FFX_MIN16_F channelMax = ffxMax3Half(laneMax.x, laneMax.y, sampleCenter.x);

    return FFX_MIN16_F2(channelMin, channelMax);
}
// Converts two booleans into a 16-bit float pair: 1.0 where the flag is true and
// 0.0 where it is false (0x3c00 is the IEEE 754 binary16 bit pattern of 1.0).
// The pattern for x is placed in the low 16 bits of its word and the pattern for y
// in the high 16 bits; the matching lane of __XB_AsHalf is then read back.
FFX_MIN16_F2 Bool2ToFloat16x2(bool x, bool y)
{
    const uint lowHalfBits  = x ? 0x00003c00 : 0x00000000;
    const uint highHalfBits = y ? 0x3c000000 : 0x00000000;

    return FFX_MIN16_F2(__XB_AsHalf(lowHalfBits).x, __XB_AsHalf(highHalfBits).y);
}
// Accumulator used by the paired 16-bit path. It gathers, two samples at a time (one per
// lane of each FFX_MIN16_F2), the weighted sums needed for the rectification box and for
// the upsampled color. The two lanes are partial sums that are only added together by
// finalizeBox() / finalizeUpscaledColor().
struct PairedRectificationBoxAndAccumulatedColorAndWeight
{
// Per-channel sums of (sample * box weight).
FFX_MIN16_F2 boxCenterR;
FFX_MIN16_F2 boxCenterG;
FFX_MIN16_F2 boxCenterB;
// Per-channel sums of (sample * sample * box weight).
FFX_MIN16_F2 boxVecR;
FFX_MIN16_F2 boxVecG;
FFX_MIN16_F2 boxVecB;
// Sum of the box weights.
FFX_MIN16_F2 fBoxCenterWeight;
// Per-channel sums of (sample * upsampling weight).
FFX_MIN16_F2 fColorR;
FFX_MIN16_F2 fColorG;
FFX_MIN16_F2 fColorB;
// Sum of the upsampling weights.
FFX_MIN16_F2 fWeight;
// Square of the kernel bias; multiplied with squared sample offsets before the Lanczos approximation.
FFX_MIN16_F fKernelBiasSq;
// Factor applied to the squared sample offset inside exp() when computing box weights.
FFX_MIN16_F fRectificationCurveBias;
// Stores kernelBias * kernelBias and rectificationCurveBias, both converted to FFX_MIN16_F.
// Must be called before any init*/add* method, since those read the stored values.
void setKernelBiasAndRectificationCurveBias(FfxFloat32 kernelBias, FfxFloat32 rectificationCurveBias)
{
fKernelBiasSq = FFX_MIN16_F(kernelBias * kernelBias);
fRectificationCurveBias = FFX_MIN16_F(rectificationCurveBias);
}
// Starts the upsampled color accumulation from a single sample (overwrites fColor* and fWeight).
// The Lanczos weight is placed in lane x only and lane y is forced to 0, so whatever the y lane
// of sampleR/G/B contains does not contribute to the sums.
void initUpscaledColor(FFX_MIN16_F fSrcSampleOffsetSq, FFX_MIN16_F fOnScreenWeight, FFX_MIN16_F2 sampleR, FFX_MIN16_F2 sampleG, FFX_MIN16_F2 sampleB)
{
#if FFX_FSR3UPSCALER_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 2 // LANCZOS_TYPE_APPROXIMATE
const FFX_MIN16_F2 LanczosUpsampleWeight = FFX_MIN16_F2(
PairedLanczos2ApproxSq(fSrcSampleOffsetSq * fKernelBiasSq).x,
0.0
);
#else
#error "Only LANCZOS_TYPE_APPROXIMATE is supported in paired version so far"
#endif
const FFX_MIN16_F2 fSampleWeight = fOnScreenWeight * LanczosUpsampleWeight;
fColorR = sampleR * fSampleWeight;
fColorG = sampleG * fSampleWeight;
fColorB = sampleB * fSampleWeight;
fWeight = fSampleWeight;
}
// Starts the rectification box accumulation from a single sample (overwrites boxCenter*,
// boxVec* and fBoxCenterWeight). As in initUpscaledColor, only lane x carries a weight:
// exp(fRectificationCurveBias * fSrcSampleOffsetSq) * fOnScreenWeight; lane y is 0.
void initBox(FFX_MIN16_F fSrcSampleOffsetSq, FFX_MIN16_F fOnScreenWeight, FFX_MIN16_F2 sampleR, FFX_MIN16_F2 sampleG, FFX_MIN16_F2 sampleB)
{
const FFX_MIN16_F2 fBoxSampleWeight = FFX_MIN16_F2(
exp(fRectificationCurveBias * fSrcSampleOffsetSq) * fOnScreenWeight,
0.0
);
FFX_MIN16_F2 weightedSampleR = sampleR * fBoxSampleWeight;
FFX_MIN16_F2 weightedSampleG = sampleG * fBoxSampleWeight;
FFX_MIN16_F2 weightedSampleB = sampleB * fBoxSampleWeight;
boxCenterR = weightedSampleR;
boxCenterG = weightedSampleG;
boxCenterB = weightedSampleB;
boxVecR = sampleR * weightedSampleR;
boxVecG = sampleG * weightedSampleG;
boxVecB = sampleB * weightedSampleB;
fBoxCenterWeight = fBoxSampleWeight;
}
// Adds two samples (one per lane) to the upsampled color sums. Each lane is weighted by
// its on-screen factor times the approximate Lanczos weight of its squared offset scaled
// by fKernelBiasSq.
void addUpscaledColorSample(FFX_MIN16_F2 fSrcSampleOffsetSq, FFX_MIN16_F2 fOnScreenWeight, FFX_MIN16_F2 sampleR, FFX_MIN16_F2 sampleG, FFX_MIN16_F2 sampleB)
{
#if FFX_FSR3UPSCALER_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 2 // LANCZOS_TYPE_APPROXIMATE
const FFX_MIN16_F2 LanczosUpsampleWeight = PairedLanczos2ApproxSq(fSrcSampleOffsetSq * fKernelBiasSq);
#else
#error "Only LANCZOS_TYPE_APPROXIMATE is supported in paired version so far"
#endif
const FFX_MIN16_F2 fSampleWeight = fOnScreenWeight * LanczosUpsampleWeight;
fColorR += sampleR * fSampleWeight;
fColorG += sampleG * fSampleWeight;
fColorB += sampleB * fSampleWeight;
fWeight += fSampleWeight;
}
// Adds two samples (one per lane) to the rectification box sums, each weighted by
// exp(fRectificationCurveBias * squared offset) * on-screen factor.
void addBoxSample(FFX_MIN16_F2 fSrcSampleOffsetSq, FFX_MIN16_F2 fOnScreenWeight, FFX_MIN16_F2 sampleR, FFX_MIN16_F2 sampleG, FFX_MIN16_F2 sampleB)
{
// NOTE: ideally expect here 2x v_fma_mix + 2x v_exp_f32 + 2x v_fma_mix
const FFX_MIN16_F2 fBoxSampleWeight = exp(fRectificationCurveBias * fSrcSampleOffsetSq) * fOnScreenWeight;
FFX_MIN16_F2 weightedSampleR = sampleR * fBoxSampleWeight;
FFX_MIN16_F2 weightedSampleG = sampleG * fBoxSampleWeight;
FFX_MIN16_F2 weightedSampleB = sampleB * fBoxSampleWeight;
boxCenterR += weightedSampleR;
boxCenterG += weightedSampleG;
boxCenterB += weightedSampleB;
boxVecR += sampleR * weightedSampleR;
boxVecG += sampleG * weightedSampleG;
boxVecB += sampleB * weightedSampleB;
fBoxCenterWeight += fBoxSampleWeight;
}
// Collapses the two lanes of the color accumulators: rgb receives the summed weighted
// color (not normalized), a receives the summed weight.
void finalizeUpscaledColor(FFX_PARAMETER_OUT FfxFloat32x4 upscaledColorAndWeight)
{
upscaledColorAndWeight.r = fColorR.x + fColorR.y;
upscaledColorAndWeight.g = fColorG.x + fColorG.y;
upscaledColorAndWeight.b = fColorB.x + fColorB.y;
upscaledColorAndWeight.a = fWeight.x + fWeight.y;
}
// Collapses the two lanes of the box accumulators. For each channel, .x of the output is
// the summed boxCenter term and .y the summed boxVec term; boxCenterWeight receives the
// summed box weight.
void finalizeBox(FFX_PARAMETER_OUT FfxFloat32x2 boxCenterAndVecR,
FFX_PARAMETER_OUT FfxFloat32x2 boxCenterAndVecG,
FFX_PARAMETER_OUT FfxFloat32x2 boxCenterAndVecB,
FFX_PARAMETER_OUT FfxFloat32 boxCenterWeight)
{
boxCenterAndVecR = FfxFloat32x2(boxCenterR.x + boxCenterR.y, boxVecR.x + boxVecR.y);
boxCenterAndVecG = FfxFloat32x2(boxCenterG.x + boxCenterG.y, boxVecG.x + boxVecG.y);
boxCenterAndVecB = FfxFloat32x2(boxCenterB.x + boxCenterB.y, boxVecB.x + boxVecB.y);
boxCenterWeight = fBoxCenterWeight.x + fBoxCenterWeight.y;
}
};
#endif // #if FFX_FSR3UPSCALER_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS
// Upsamples the input color for the output pixel params.iPxHrPos from a 3x3 neighbourhood of
// source samples and builds the rectification (clipping) box for that neighbourhood.
//
// Reads : params.iPxHrPos, params.fAccumulation, params.fDisocclusion, params.fShadingChange,
//         data.fHistoryWeight.
// Writes: data.fUpsampledColor, data.fUpsampledWeight, data.clippingBox and, when
//         params.fAccumulation == 0, data.fHistoryWeight.
//
// Two implementations are selected at compile time: a hand-unrolled path that processes
// samples in 16-bit pairs (FFX_FSR3UPSCALER_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS) and a
// generic 3x3 loop. Both share the kernel bias computation and the final normalisation.
void ComputeUpsampledColorAndWeight(const AccumulationPassCommonParams params, FFX_PARAMETER_INOUT AccumulationPassData data)
{
// We compute a sliced lanczos filter with 2 lobes (other slices are accumulated temporally)
const FfxFloat32x2 fDstOutputPos = FfxFloat32x2(params.iPxHrPos) + FFX_BROADCAST_FLOAT32X2(0.5f);
const FfxFloat32x2 fSrcOutputPos = fDstOutputPos * DownscaleFactor();
const FfxInt32x2 iSrcInputPos = FfxInt32x2(floor(fSrcOutputPos));
const FfxFloat32x2 fSrcUnjitteredPos = (FfxFloat32x2(iSrcInputPos) + FfxFloat32x2(0.5f, 0.5f)) - Jitter(); // This is the un-jittered position of the sample at offset 0,0
const FfxFloat32x2 fBaseSampleOffset = FfxFloat32x2(fSrcUnjitteredPos - fSrcOutputPos);
// Top-left offset of the sampled window relative to iSrcInputPos: -2 on an axis where the
// un-jittered sample lies beyond the output position, otherwise -1.
FfxInt32x2 offsetTL;
offsetTL.x = (fSrcUnjitteredPos.x > fSrcOutputPos.x) ? FfxInt32(-2) : FfxInt32(-1);
offsetTL.y = (fSrcUnjitteredPos.y > fSrcOutputPos.y) ? FfxInt32(-2) : FfxInt32(-1);
//Load samples
// If fSrcUnjitteredPos.y > fSrcOutputPos.y, indicates offsetTL.y = -2, sample offset Y will be [-2, 1], clipbox will be rows [1, 3].
// Flip row# for sampling offset in this case, so first 0~2 rows in the sampled array can always be used for computing the clipbox.
// This reduces branch or cmove on sampled colors, but moving this overhead to sample position / weight calculation time which apply to less values.
const FfxBoolean bFlipRow = fSrcUnjitteredPos.y > fSrcOutputPos.y;
const FfxBoolean bFlipCol = fSrcUnjitteredPos.x > fSrcOutputPos.x;
const FfxFloat32x2 fOffsetTL = FfxFloat32x2(offsetTL);
// No accumulated history yet: the result is taken from the box centre at the end of the function.
const FfxBoolean bIsInitialSample = (params.fAccumulation == 0.0f);
#if FFX_FSR3UPSCALER_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS
// Unroll the loop to load samples on Scarlett to help the shader compiler
// Offsets of the outer columns/rows (index 0 and 2) relative to the centre sample; the sign is
// swapped when the axis is flipped. The select is done on the raw bits of the 16-bit pair.
const FFX_MIN16_F2 fSampleOffsetX02 = __XB_AsHalf(bFlipCol ? __XB_AsUInt(FFX_MIN16_F2( 1, -1)) : __XB_AsUInt(FFX_MIN16_F2(-1, 1)));
const FFX_MIN16_F2 fSampleOffsetY02 = __XB_AsHalf(bFlipRow ? __XB_AsUInt(FFX_MIN16_F2( 1, -1)) : __XB_AsUInt(FFX_MIN16_F2(-1, 1)));
typedef FfxInt32 FfxTexCoordI;
typedef FfxInt32x2 FfxTexCoordI2;
// Source coordinates per axis, packed as (index 0, index 1) and (index 2, index 3).
// Index 1 is always the centre (offset 0); only indices 0..2 are read below.
const FfxTexCoordI2 iSrcSamplePosX01 = FfxTexCoordI2(iSrcInputPos.xx) + (bFlipCol ? FfxTexCoordI2( 1, 0) : FfxTexCoordI2(-1, 0));
const FfxTexCoordI2 iSrcSamplePosX23 = FfxTexCoordI2(iSrcInputPos.xx) + (bFlipCol ? FfxTexCoordI2(-1, -2) : FfxTexCoordI2( 1, 2));
const FfxTexCoordI2 iSrcSamplePosY01 = FfxTexCoordI2(iSrcInputPos.yy) + (bFlipRow ? FfxTexCoordI2( 1, 0) : FfxTexCoordI2(-1, 0));
const FfxTexCoordI2 iSrcSamplePosY23 = FfxTexCoordI2(iSrcInputPos.yy) + (bFlipRow ? FfxTexCoordI2(-1, -2) : FfxTexCoordI2( 1, 2));
// Clamp every coordinate into [0, RenderSize() - 1]; the unclamped values are kept so that
// off-screen samples can be detected later by comparing against the clamped ones.
const FfxTexCoordI2 renderSizeLastTexelCoord = FfxTexCoordI2(RenderSize()) - FfxTexCoordI2(1, 1);
const FfxTexCoordI2 iSrcSamplePosX01Clamped = FfxTexCoordI2(
__XB_Med3_I32(iSrcSamplePosX01.x, 0, renderSizeLastTexelCoord.x),
__XB_Med3_I32(iSrcSamplePosX01.y, 0, renderSizeLastTexelCoord.x)
);
const FfxTexCoordI2 iSrcSamplePosX23Clamped = FfxTexCoordI2(
__XB_Med3_I32(iSrcSamplePosX23.x, 0, renderSizeLastTexelCoord.x),
__XB_Med3_I32(iSrcSamplePosX23.y, 0, renderSizeLastTexelCoord.x)
);
const FfxTexCoordI2 iSrcSamplePosY01Clamped = FfxTexCoordI2(
__XB_Med3_I32(iSrcSamplePosY01.x, 0, renderSizeLastTexelCoord.y),
__XB_Med3_I32(iSrcSamplePosY01.y, 0, renderSizeLastTexelCoord.y)
);
const FfxTexCoordI2 iSrcSamplePosY23Clamped = FfxTexCoordI2(
__XB_Med3_I32(iSrcSamplePosY23.x, 0, renderSizeLastTexelCoord.y),
__XB_Med3_I32(iSrcSamplePosY23.y, 0, renderSizeLastTexelCoord.y)
);
// The nine samples are held as five pairs (lane x, lane y), using column/row indices 0..2:
//   TopCorner = (col 0, row 0), (col 2, row 0)
//   BotCorner = (col 0, row 2), (col 2, row 2)
//   Horz      = (col 0, row 1), (col 2, row 1)
//   Vert      = (col 1, row 0), (col 1, row 2)
//   Center    = (col 1, row 1) in both lanes
FFX_MIN16_F2 TopCornerR, BotCornerR, HorzR, VertR, CenterR;
FFX_MIN16_F2 TopCornerG, BotCornerG, HorzG, VertG, CenterG;
FFX_MIN16_F2 TopCornerB, BotCornerB, HorzB, VertB, CenterB;
LoadPreparedColorPairedRgb(TopCornerR, TopCornerG, TopCornerB,
FfxTexCoordI2(iSrcSamplePosX01Clamped.x, iSrcSamplePosY01Clamped.x),
FfxTexCoordI2(iSrcSamplePosX23Clamped.x, iSrcSamplePosY01Clamped.x)
);
LoadPreparedColorPairedRgb(BotCornerR, BotCornerG, BotCornerB,
FfxTexCoordI2(iSrcSamplePosX01Clamped.x, iSrcSamplePosY23Clamped.x),
FfxTexCoordI2(iSrcSamplePosX23Clamped.x, iSrcSamplePosY23Clamped.x)
);
LoadPreparedColorPairedRgb(HorzR, HorzG, HorzB,
FfxTexCoordI2(iSrcSamplePosX01Clamped.x, iSrcSamplePosY01Clamped.y),
FfxTexCoordI2(iSrcSamplePosX23Clamped.x, iSrcSamplePosY01Clamped.y)
);
LoadPreparedColorPairedRgb(VertR, VertG, VertB,
FfxTexCoordI2(iSrcSamplePosX01Clamped.y, iSrcSamplePosY01Clamped.x),
FfxTexCoordI2(iSrcSamplePosX01Clamped.y, iSrcSamplePosY23Clamped.x)
);
// NOTE: duplicated data (the centre texel is loaded into both lanes)
LoadPreparedColorPairedRgb(CenterR, CenterG, CenterB,
FfxTexCoordI2(iSrcSamplePosX01Clamped.y, iSrcSamplePosY01Clamped.y),
FfxTexCoordI2(iSrcSamplePosX01Clamped.y, iSrcSamplePosY01Clamped.y)
);
#if FFX_FSR3UPSCALER_OPTION_HDR_COLOR_INPUT
// Initial samples are tonemapped in RGB, before the YCoCg conversion below.
if (bIsInitialSample)
{
TonemapPaired(TopCornerR, TopCornerG, TopCornerB);
TonemapPaired(BotCornerR, BotCornerG, BotCornerB);
TonemapPaired(HorzR, HorzG, HorzB);
TonemapPaired(VertR, VertG, VertB);
TonemapPaired(CenterR, CenterG, CenterB);
}
#endif
// From here on the *R/*G/*B variables hold Y/Co/Cg respectively.
RGBToYCoCgPaired(TopCornerR, TopCornerG, TopCornerB);
RGBToYCoCgPaired(BotCornerR, BotCornerG, BotCornerB);
RGBToYCoCgPaired(HorzR, HorzG, HorzB);
RGBToYCoCgPaired(VertR, VertG, VertB);
RGBToYCoCgPaired(CenterR, CenterG, CenterB);
#else
// Generic path: load the 3x3 window into fSamples in row-major loop order.
FfxFloat32x3 fSamples[9];
FfxInt32 iSampleIndex = 0;
FFX_UNROLL
for (FfxInt32 row = 0; row < 3; row++) {
FFX_UNROLL
for (FfxInt32 col = 0; col < 3; col++) {
// On a flipped axis offsetTL is -2 and the index becomes 3, 2, 1, i.e. offsets +1, 0, -1;
// otherwise offsetTL is -1 and the index 0, 1, 2 gives offsets -1, 0, +1.
const FfxInt32x2 iSampleColRow = FfxInt32x2(bFlipCol ? (3 - col) : col, bFlipRow ? (3 - row) : row);
const FfxInt32x2 iSrcSamplePos = FfxInt32x2(iSrcInputPos) + offsetTL + iSampleColRow;
const FfxInt32x2 iSampleCoord = ClampLoad(iSrcSamplePos, FfxInt32x2(0, 0), FfxInt32x2(RenderSize()));
fSamples[iSampleIndex] = LoadPreparedColor(iSampleCoord);
++iSampleIndex;
}
}
#if FFX_FSR3UPSCALER_OPTION_HDR_COLOR_INPUT
if (bIsInitialSample)
{
for (iSampleIndex = 0; iSampleIndex < 9; ++iSampleIndex)
{
//YCoCg -> RGB -> Tonemap -> YCoCg (Use RGB tonemapper to avoid color desaturation)
fSamples[iSampleIndex] = RGBToYCoCg(Tonemap(YCoCgToRGB(fSamples[iSampleIndex])));
}
}
#endif
#endif // #if FFX_FSR3UPSCALER_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS
// Identify how much of each upsampled color to be used for this frame
const FfxFloat32 fKernelBiasMax = ComputeMaxKernelWeight(params, data);
// NOTE(review): ComputeMaxKernelWeight caps its result at 1.99, so (1 + max) * 0.3 stays
// below 1 and this evaluates to 1.0 — confirm that is intended.
const FfxFloat32 fKernelBiasMin = ffxMax(1.0f, ((1.0f + fKernelBiasMax) * 0.3f));
// The blend factor is the smallest of three terms, so disocclusion, shading change or a low
// history weight each pull the kernel bias towards fKernelBiasMin.
const FfxFloat32 fKernelBiasWeight =
ffxMin(1.0f - params.fDisocclusion * 0.5f,
ffxMin(1.0f - params.fShadingChange,
ffxSaturate(data.fHistoryWeight * 5.0f)
));
const FfxFloat32 fKernelBias = ffxLerp(fKernelBiasMin, fKernelBiasMax, fKernelBiasWeight);
#if FFX_FSR3UPSCALER_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS
// Unrolled weight accumulation on Scarlett, mirroring the unrolled sample loads above.
// A coordinate is on screen when clamping left it unchanged.
const bool coordX0OnScreen = iSrcSamplePosX01.x == iSrcSamplePosX01Clamped.x;
const bool coordX1OnScreen = iSrcSamplePosX01.y == iSrcSamplePosX01Clamped.y;
const bool coordX2OnScreen = iSrcSamplePosX23.x == iSrcSamplePosX23Clamped.x;
const bool coordY0OnScreen = iSrcSamplePosY01.x == iSrcSamplePosY01Clamped.x;
const bool coordY1OnScreen = iSrcSamplePosY01.y == iSrcSamplePosY01Clamped.y;
const bool coordY2OnScreen = iSrcSamplePosY23.x == iSrcSamplePosY23Clamped.x;
// Squared per-axis sample offsets: *_02 holds indices 0 and 2, *_11 holds the centre (x, y).
const FFX_MIN16_F2 fBaseSampleOffsetHalf = FFX_MIN16_F2(fBaseSampleOffset);
const FFX_MIN16_F2 fSrcSampleOffsetX_02 = fBaseSampleOffsetHalf.xx + fSampleOffsetX02;
const FFX_MIN16_F2 fSrcSampleOffsetY_02 = fBaseSampleOffsetHalf.yy + fSampleOffsetY02;
const FFX_MIN16_F2 fSrcSampleOffsetXSq_02 = fSrcSampleOffsetX_02 * fSrcSampleOffsetX_02;
const FFX_MIN16_F2 fSrcSampleOffsetYSq_02 = fSrcSampleOffsetY_02 * fSrcSampleOffsetY_02;
const FFX_MIN16_F2 fSrcSampleOffsetXYSq_11 = fBaseSampleOffsetHalf * fBaseSampleOffsetHalf;
const FfxFloat32 fRectificationCurveBias = -2.3f;
PairedRectificationBoxAndAccumulatedColorAndWeight pairedBox;
pairedBox.setKernelBiasAndRectificationCurveBias(fKernelBias, fRectificationCurveBias);
// init with the centre sample  o o o
//                              o x o
//                              o o o
pairedBox.initBox(
fSrcSampleOffsetXYSq_11.x + fSrcSampleOffsetXYSq_11.y,
Bool2ToFloat16x2(coordX1OnScreen && coordY1OnScreen, false).x,
CenterR, CenterG, CenterB
);
// add the two corner samples of row 0  x o x
//                                      o * o
//                                      o o o
pairedBox.addBoxSample(
fSrcSampleOffsetXSq_02 + fSrcSampleOffsetYSq_02.xx,
Bool2ToFloat16x2(coordX0OnScreen && coordY0OnScreen, coordX2OnScreen && coordY0OnScreen),
TopCornerR, TopCornerG, TopCornerB
);
// add the two corner samples of row 2  * o *
//                                      o * o
//                                      x o x
pairedBox.addBoxSample(
fSrcSampleOffsetXSq_02 + fSrcSampleOffsetYSq_02.yy,
Bool2ToFloat16x2(coordX0OnScreen && coordY2OnScreen, coordX2OnScreen && coordY2OnScreen),
BotCornerR, BotCornerG, BotCornerB
);
// add the two outer samples of row 1 (centre row)  * o *
//                                                  x * x
//                                                  * o *
pairedBox.addBoxSample(
fSrcSampleOffsetXSq_02 + fSrcSampleOffsetXYSq_11.yy,
Bool2ToFloat16x2(coordX0OnScreen && coordY1OnScreen, coordX2OnScreen && coordY1OnScreen),
HorzR, HorzG, HorzB
);
// add the remaining samples of the centre column  * x *
//                                                 * * *
//                                                 * x *
pairedBox.addBoxSample(
fSrcSampleOffsetXYSq_11.xx + fSrcSampleOffsetYSq_02,
Bool2ToFloat16x2(coordX1OnScreen && coordY0OnScreen, coordX1OnScreen && coordY2OnScreen),
VertR, VertG, VertB
);
FfxFloat32x2 boxCenterAndVecR, boxCenterAndVecG, boxCenterAndVecB;
FfxFloat32 boxCenterWeight;
pairedBox.finalizeBox(boxCenterAndVecR, boxCenterAndVecG, boxCenterAndVecB, boxCenterWeight);
// The Lanczos-weighted color is only needed when history exists; the same five pairs are
// accumulated in the same order as for the box above.
if (!bIsInitialSample)
{
pairedBox.initUpscaledColor(
fSrcSampleOffsetXYSq_11.x + fSrcSampleOffsetXYSq_11.y,
Bool2ToFloat16x2(coordX1OnScreen && coordY1OnScreen, false).x,
CenterR, CenterG, CenterB
);
// add the two corner samples of row 0  x o x
//                                      o * o
//                                      o o o
pairedBox.addUpscaledColorSample(
fSrcSampleOffsetXSq_02 + fSrcSampleOffsetYSq_02.xx,
Bool2ToFloat16x2(coordX0OnScreen && coordY0OnScreen, coordX2OnScreen && coordY0OnScreen),
TopCornerR, TopCornerG, TopCornerB
);
// add the two corner samples of row 2  * o *
//                                      o * o
//                                      x o x
pairedBox.addUpscaledColorSample(
fSrcSampleOffsetXSq_02 + fSrcSampleOffsetYSq_02.yy,
Bool2ToFloat16x2(coordX0OnScreen && coordY2OnScreen, coordX2OnScreen && coordY2OnScreen),
BotCornerR, BotCornerG, BotCornerB
);
// add the two outer samples of row 1 (centre row)  * o *
//                                                  x * x
//                                                  * o *
pairedBox.addUpscaledColorSample(
fSrcSampleOffsetXSq_02 + fSrcSampleOffsetXYSq_11.yy,
Bool2ToFloat16x2(coordX0OnScreen && coordY1OnScreen, coordX2OnScreen && coordY1OnScreen),
HorzR, HorzG, HorzB
);
// add the remaining samples of the centre column  * x *
//                                                 * * *
//                                                 * x *
pairedBox.addUpscaledColorSample(
fSrcSampleOffsetXYSq_11.xx + fSrcSampleOffsetYSq_02,
Bool2ToFloat16x2(coordX1OnScreen && coordY0OnScreen, coordX1OnScreen && coordY2OnScreen),
VertR, VertG, VertB
);
FfxFloat32x4 upscaledColorAndWeight = 0.0;
pairedBox.finalizeUpscaledColor(upscaledColorAndWeight);
// NOTE(review): this path assigns the results, whereas the generic path accumulates with +=
// onto whatever data.fUpsampledColor / data.fUpsampledWeight held on entry — presumably zero; confirm.
data.fUpsampledColor = FfxFloat32x3(upscaledColorAndWeight.rgb);
data.fUpsampledWeight = FfxFloat32(upscaledColorAndWeight.w);
}
// Per-channel min/max over all nine samples (returned as .x = min, .y = max).
FFX_MIN16_F2 aabbMinMaxR = Compute3x3SamplesMinMaxPaired(CenterR, TopCornerR, BotCornerR, HorzR, VertR);
FFX_MIN16_F2 aabbMinMaxG = Compute3x3SamplesMinMaxPaired(CenterG, TopCornerG, BotCornerG, HorzG, VertG);
FFX_MIN16_F2 aabbMinMaxB = Compute3x3SamplesMinMaxPaired(CenterB, TopCornerB, BotCornerB, HorzB, VertB);
data.clippingBox.boxCenter = FfxFloat32x3(boxCenterAndVecR.x, boxCenterAndVecG.x, boxCenterAndVecB.x);
data.clippingBox.boxVec = FfxFloat32x3(boxCenterAndVecR.y, boxCenterAndVecG.y, boxCenterAndVecB.y);
data.clippingBox.aabbMin = FfxFloat32x3(aabbMinMaxR.x, aabbMinMaxG.x, aabbMinMaxB.x);
data.clippingBox.aabbMax = FfxFloat32x3(aabbMinMaxR.y, aabbMinMaxG.y, aabbMinMaxB.y);
data.clippingBox.fBoxCenterWeight = FfxFloat32(boxCenterWeight);
#else
// Generic path: walk the window in the same order as the load loop so that iSampleIndex
// addresses the matching entry of fSamples.
iSampleIndex = 0;
FFX_UNROLL
for (FfxInt32 row = 0; row < 3; row++)
{
FFX_UNROLL
for (FfxInt32 col = 0; col < 3; col++)
{
const FfxInt32x2 sampleColRow = FfxInt32x2(bFlipCol ? (3 - col) : col, bFlipRow ? (3 - row) : row);
const FfxFloat32x2 fOffset = fOffsetTL + FfxFloat32x2(sampleColRow);
const FfxFloat32x2 fSrcSampleOffset = fBaseSampleOffset + fOffset;
const FfxInt32x2 iSrcSamplePos = FfxInt32x2(iSrcInputPos) + FfxInt32x2(offsetTL) + sampleColRow;
// The on-screen test uses the unclamped position; the factor multiplies both weights below.
const FfxFloat32 fOnScreenFactor = FfxFloat32(IsOnScreen(FfxInt32x2(iSrcSamplePos), FfxInt32x2(RenderSize())));
if (!bIsInitialSample)
{
const FfxFloat32 fSampleWeight = fOnScreenFactor * FfxFloat32(GetUpsampleLanczosWeight(fSrcSampleOffset, fKernelBias));
data.fUpsampledColor += fSamples[iSampleIndex] * fSampleWeight;
data.fUpsampledWeight += fSampleWeight;
}
// Update rectification box
{
const FfxFloat32 fRectificationCurveBias = -2.3f;
const FfxFloat32 fSrcSampleOffsetSq = dot(fSrcSampleOffset, fSrcSampleOffset);
const FfxFloat32 fBoxSampleWeight = exp(fRectificationCurveBias * fSrcSampleOffsetSq) * fOnScreenFactor;
// The first sample of the loop is flagged so the box can be initialised rather than extended.
const FfxBoolean bInitialSample = (row == 0) && (col == 0);
RectificationBoxAddSample(bInitialSample, data.clippingBox, fSamples[iSampleIndex], fBoxSampleWeight);
}
++iSampleIndex;
}
}
#endif // #if FFX_FSR3UPSCALER_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS
RectificationBoxComputeVarianceBoxData(data.clippingBox);
// Zero the weight when it does not exceed FSR3UPSCALER_EPSILON.
data.fUpsampledWeight *= FfxFloat32(data.fUpsampledWeight > FSR3UPSCALER_EPSILON);
if (data.fUpsampledWeight > FSR3UPSCALER_EPSILON) {
// Normalize for deringing (we need to compare colors)
data.fUpsampledColor = data.fUpsampledColor / data.fUpsampledWeight;
data.fUpsampledWeight *= fAverageLanczosWeightPerFrame;
Deringing(data.clippingBox, data.fUpsampledColor);
}
// Initial samples using tonemapped upsampling: the color is replaced by the box centre
// (inverse-tonemapped for HDR input), the weight is set to 1 and the history weight to 0.
if (bIsInitialSample) {
#if FFX_FSR3UPSCALER_OPTION_HDR_COLOR_INPUT
data.fUpsampledColor = RGBToYCoCg(InverseTonemap(YCoCgToRGB(data.clippingBox.boxCenter)));
#else
data.fUpsampledColor = data.clippingBox.boxCenter;
#endif
data.fUpsampledWeight = 1.0f;
data.fHistoryWeight = 0.0f;
}
}