diff --git a/Shaders/ffx_fsr_unity_common.cginc b/Shaders/ffx_fsr_unity_common.cginc
index b1bcb5c..af09dee 100644
--- a/Shaders/ffx_fsr_unity_common.cginc
+++ b/Shaders/ffx_fsr_unity_common.cginc
@@ -38,6 +38,11 @@
 //#pragma require Native16Bit
 //#endif
 
+// Allow use of Xbox Series-specific optimizations
+#if defined(SHADER_API_GAMECORE_XBOXSERIES)
+#define __XBOX_SCARLETT
+#endif
+
 // Hack to work around the lack of texture atomics on Metal
 #if defined(SHADER_API_METAL)
 #define InterlockedAdd(dest, val, orig)     { (orig) = (dest); (dest) += (val); }
diff --git a/Shaders/shaders/fsr2/ffx_fsr2_callbacks_hlsl.h b/Shaders/shaders/fsr2/ffx_fsr2_callbacks_hlsl.h
index c52cc1a..9007343 100644
--- a/Shaders/shaders/fsr2/ffx_fsr2_callbacks_hlsl.h
+++ b/Shaders/shaders/fsr2/ffx_fsr2_callbacks_hlsl.h
@@ -558,6 +558,14 @@ FfxFloat32x3 LoadPreparedInputColor(FfxUInt32x2 iPxPos)
 {
     return r_prepared_input_color[iPxPos].xyz;
 }
+
+#if FFX_HALF && defined(__XBOX_SCARLETT) && defined(__XBATG_EXTRA_16_BIT_OPTIMISATION) && (__XBATG_EXTRA_16_BIT_OPTIMISATION == 1)
+FFX_MIN16_F3 LoadPreparedInputColorHalf(FfxUInt32x2 iPxPos) // Half-precision counterpart of LoadPreparedInputColor.
+{
+    return FFX_MIN16_F3(r_prepared_input_color[iPxPos].xyz); // Same texel fetch; only the result is converted to FFX_MIN16_F3.
+}
+#endif
+
 #endif
 
 #if defined(FSR2_BIND_SRV_INPUT_MOTION_VECTORS)
@@ -801,6 +809,27 @@ FfxFloat32 SampleLanczos2Weight(FfxFloat32 x)
 #endif
 }
 
+#if FFX_HALF && defined(__XBOX_SCARLETT) && defined(__XBATG_EXTRA_16_BIT_OPTIMISATION) && (__XBATG_EXTRA_16_BIT_OPTIMISATION == 1)
+
+FFX_MIN16_F SampleLanczos2Weight_NoValu(FFX_MIN16_F x) // Samples r_lanczos_lut (mip 0, s_LinearClamp) with a coordinate built from x and 0.5; x is used as passed in.
+{
+#if defined(FSR2_BIND_SRV_LANCZOS_LUT)
+    return FFX_MIN16_F(r_lanczos_lut.SampleLevel(s_LinearClamp, __XB_AsHalf(__XB_V_PACK_B32_F16(x, 0.5)), 0)); // NOTE(review): presumably packs (x, 0.5) into one register as a half2 coordinate - confirm against the GDK intrinsic docs.
+#else
+    return 0.0; // LUT not bound: this path always yields a zero weight.
+#endif
+}
+
+FFX_MIN16_F SampleLanczos2Weight_NoValuNoA16(FfxFloat32 x) // Samples r_lanczos_lut (mip 0, s_LinearClamp) at the 32-bit coordinate (x, 0.5); x is used as passed in.
+{
+#if defined(FSR2_BIND_SRV_LANCZOS_LUT)
+    return FFX_MIN16_F(r_lanczos_lut.SampleLevel(s_LinearClamp, FfxFloat32x2(x, 0.5), 0)); // Only the sampled result is narrowed to FFX_MIN16_F.
+#else
+    return 0.0; // LUT not bound: this path always yields a zero weight.
+#endif
+}
+#endif
+
 #if defined(FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT)
 FfxFloat32 SampleUpsampleMaximumBias(FfxFloat32x2 uv)
 {
diff --git a/Shaders/shaders/fsr2/ffx_fsr2_common.h b/Shaders/shaders/fsr2/ffx_fsr2_common.h
index e46b66c..22d7f48 100644
--- a/Shaders/shaders/fsr2/ffx_fsr2_common.h
+++ b/Shaders/shaders/fsr2/ffx_fsr2_common.h
@@ -509,6 +509,32 @@ FfxFloat32x3 UnprepareRgb(FfxFloat32x3 fRgb, FfxFloat32 fExposure)
     return fRgb;
 }
 
+#if FFX_HALF && defined(__XBOX_SCARLETT) && defined(__XBATG_EXTRA_16_BIT_OPTIMISATION) && (__XBATG_EXTRA_16_BIT_OPTIMISATION == 1)
+
+void PrepareRgbPaired(inout FFX_MIN16_F2 r, inout FFX_MIN16_F2 g, inout FFX_MIN16_F2 b, FfxFloat32 fExposure, FfxFloat32 fPreExposure) // In place: scale r/g/b by fExposure / fPreExposure, then clamp to [0, FSR2_FP16_MAX].
+{
+    FFX_MIN16_F ExposureOverPreExposureOver = FFX_MIN16_F(fExposure / fPreExposure); // The ratio is formed from the 32-bit inputs and narrowed once.
+    // Apply the combined scale to both lanes of every channel.
+    r *= ExposureOverPreExposureOver;
+    g *= ExposureOverPreExposureOver;
+    b *= ExposureOverPreExposureOver;
+    // Clamp each channel into [0, FSR2_FP16_MAX].
+    r = ffxClampHalf(r, 0.0, FSR2_FP16_MAX);
+    g = ffxClampHalf(g, 0.0, FSR2_FP16_MAX);
+    b = ffxClampHalf(b, 0.0, FSR2_FP16_MAX);
+}
+
+void UnprepareRgbPaired(inout FFX_MIN16_F2 r, inout FFX_MIN16_F2 g, inout FFX_MIN16_F2 b, FfxFloat32 fExposure) // In place: scale r/g/b by PreExposure() / fExposure (no clamp).
+{
+    FFX_MIN16_F PreExposureOverExposure = FFX_MIN16_F(PreExposure() / fExposure); // The ratio is formed from the 32-bit inputs and narrowed once.
+    // Apply the combined scale to both lanes of every channel.
+    r *= PreExposureOverExposure;
+    g *= PreExposureOverExposure;
+    b *= PreExposureOverExposure;
+}
+
+#endif
+
 
 struct BilinearSamplingData
 {
diff --git a/Shaders/shaders/fsr2/ffx_fsr2_rcas.h b/Shaders/shaders/fsr2/ffx_fsr2_rcas.h
index fd5fd26..1a8c756 100644
--- a/Shaders/shaders/fsr2/ffx_fsr2_rcas.h
+++ b/Shaders/shaders/fsr2/ffx_fsr2_rcas.h
@@ -25,38 +25,80 @@
 
 #include "../ffx_core.h"
 
+#if FFX_HALF && defined(__XBOX_SCARLETT) && defined(__XBATG_EXTRA_16_BIT_OPTIMISATION) && (__XBATG_EXTRA_16_BIT_OPTIMISATION == 1)
+    #define FSR_RCAS_PREFER_PAIRED_VERSION 1
+#else
+    #define FSR_RCAS_PREFER_PAIRED_VERSION 0
+#endif
+
 void WriteUpscaledOutput(FFX_MIN16_U2 iPxHrPos, FfxFloat32x3 fUpscaledColor)
 {
     StoreUpscaledOutput(FFX_MIN16_I2(iPxHrPos), fUpscaledColor);
 }
 
-#define FSR_RCAS_F 1
-FfxFloat32x4 FsrRcasLoadF(FfxInt32x2 p)
-{
-    FfxFloat32x4 fColor = LoadRCAS_Input(p);
+#if FSR_RCAS_PREFER_PAIRED_VERSION
+    #define FSR_RCAS_HX2 1
 
-    fColor.rgb = PrepareRgb(fColor.rgb, Exposure(), PreExposure());
+    FfxFloat16x4 FsrRcasLoadHx2(FfxInt16x2 p) // Load callback defined before ffx_fsr1.h is included with FSR_RCAS_HX2 set.
+    {
+        return FfxFloat16x4(LoadRCAS_Input(p)); // Raw RCAS input narrowed to 16-bit; exposure is applied separately in FsrRcasInputHx2.
+    }
+    void FsrRcasInputHx2(inout FfxFloat16x2 r, inout FfxFloat16x2 g, inout FfxFloat16x2 b) // Input callback: prepares one pair of pixels, channel by channel, in place.
+    {
+        PrepareRgbPaired(r, g, b, Exposure(), PreExposure()); // Exposure scale + clamp on both lanes of each channel.
+    }
 
-    return fColor;
-}
-void FsrRcasInputF(inout FfxFloat32 r, inout FfxFloat32 g, inout FfxFloat32 b) {}
+    #include "../fsr1/ffx_fsr1.h"
 
-#include "../fsr1/ffx_fsr1.h"
+    void CurrFilterPaired(FFX_MIN16_U2 pos) // Filters and writes two pixels: pos and pos + (8, 0).
+    {
+        FfxFloat16x2 cr; // lane x = pixel at pos, lane y = pixel at pos + (8, 0) (see the two writes below)
+        FfxFloat16x2 cg;
+        FfxFloat16x2 cb;
+        FsrRcasHx2(cr, cg, cb, pos, RCASConfig()); // NOTE(review): FsrRcasHx2 comes from ffx_fsr1.h; presumably it fills both lanes for the pixel pair - confirm there.
 
-void CurrFilter(FFX_MIN16_U2 pos)
-{
-    FfxFloat32x3 c;
-    FsrRcasF(c.r, c.g, c.b, pos, RCASConfig());
+        UnprepareRgbPaired(cr, cg, cb, Exposure()); // Scale by PreExposure() / Exposure() before storing.
 
-    c = UnprepareRgb(c, Exposure());
+        WriteUpscaledOutput(pos, FfxFloat16x3(cr.x, cg.x, cb.x)); //TODO: fix type (WriteUpscaledOutput takes FfxFloat32x3, so this relies on implicit widening)
+        pos.x += 8; // Step to the second pixel of the pair.
+        WriteUpscaledOutput(pos, FfxFloat16x3(cr.y, cg.y, cb.y)); //TODO: fix type (same implicit widening as above)
+    }
 
-    WriteUpscaledOutput(pos, c);
-}
+#else
+    #define FSR_RCAS_F 1
+    FfxFloat32x4 FsrRcasLoadF(FfxInt32x2 p) // Load callback defined before ffx_fsr1.h is included with FSR_RCAS_F set.
+    {
+        FfxFloat32x4 fColor = LoadRCAS_Input(p);
+
+        fColor.rgb = PrepareRgb(fColor.rgb, Exposure(), PreExposure()); // Only rgb is prepared; the w component is returned as loaded.
+
+        return fColor;
+    }
+    void FsrRcasInputF(inout FfxFloat32 r, inout FfxFloat32 g, inout FfxFloat32 b) {} // Intentionally empty: FsrRcasLoadF already runs PrepareRgb on the loaded color.
+
+    #include "../fsr1/ffx_fsr1.h"
+
+    void CurrFilter(FFX_MIN16_U2 pos) // Filters one pixel at pos and writes it to the upscaled output.
+    {
+        FfxFloat32x3 c;
+        FsrRcasF(c.r, c.g, c.b, pos, RCASConfig()); // NOTE(review): FsrRcasF comes from ffx_fsr1.h; presumably it writes the filtered color into c - confirm there.
+
+        c = UnprepareRgb(c, Exposure()); // Counterpart of the PrepareRgb call in FsrRcasLoadF.
+
+        WriteUpscaledOutput(pos, c);
+    }
+
+#endif // #if FSR_RCAS_PREFER_PAIRED_VERSION
 
 void RCAS(FfxUInt32x3 LocalThreadId, FfxUInt32x3 WorkGroupId, FfxUInt32x3 Dtid)
 {
     // Do remapping of local xy in workgroup for a more PS-like swizzle pattern.
     FfxUInt32x2 gxy = ffxRemapForQuad(LocalThreadId.x) + FfxUInt32x2(WorkGroupId.x << 4u, WorkGroupId.y << 4u);
+#if FSR_RCAS_PREFER_PAIRED_VERSION
+    CurrFilterPaired(FFX_MIN16_U2(gxy));
+    gxy.y += 8u;
+    CurrFilterPaired(FFX_MIN16_U2(gxy));
+#else
     CurrFilter(FFX_MIN16_U2(gxy));
     gxy.x += 8u;
     CurrFilter(FFX_MIN16_U2(gxy));
@@ -64,4 +106,5 @@ void RCAS(FfxUInt32x3 LocalThreadId, FfxUInt32x3 WorkGroupId, FfxUInt32x3 Dtid)
     CurrFilter(FFX_MIN16_U2(gxy));
     gxy.x -= 8u;
     CurrFilter(FFX_MIN16_U2(gxy));
+#endif
 }
diff --git a/Shaders/shaders/fsr2/ffx_fsr2_sample.h b/Shaders/shaders/fsr2/ffx_fsr2_sample.h
index b75f090..cd7142a 100644
--- a/Shaders/shaders/fsr2/ffx_fsr2_sample.h
+++ b/Shaders/shaders/fsr2/ffx_fsr2_sample.h
@@ -139,14 +139,6 @@ FfxFloat32 Lanczos2(FfxFloat32 x)
 
 #if FFX_HALF
 
-#if 0
-FFX_MIN16_F Lanczos2NoClamp(FFX_MIN16_F x)
-{
-    const FFX_MIN16_F PI = FFX_MIN16_F(3.141592653589793f); // TODO: share SDK constants
-    return abs(x) < FFX_MIN16_F(FSR2_EPSILON) ? FFX_MIN16_F(1.f) : (sin(PI * x) / (PI * x)) * (sin(FFX_MIN16_F(0.5f) * PI * x) / (FFX_MIN16_F(0.5f) * PI * x));
-}
-#endif
-
 FFX_MIN16_F Lanczos2(FFX_MIN16_F x)
 {
     x = ffxMin(abs(x), FFX_MIN16_F(2.0f));
@@ -169,6 +161,26 @@ FFX_MIN16_F Lanczos2ApproxSqNoClamp(FFX_MIN16_F x2)
     FFX_MIN16_F b = FFX_MIN16_F(1.0f / 4.0f) * x2 - FFX_MIN16_F(1);
     return (FFX_MIN16_F(25.0f / 16.0f) * a * a - FFX_MIN16_F(25.0f / 16.0f - 1)) * (b * b);
 }
+
+#if defined(__XBOX_SCARLETT) && defined(__XBATG_EXTRA_16_BIT_OPTIMISATION) && (__XBATG_EXTRA_16_BIT_OPTIMISATION == 1)
+
+FFX_MIN16_F2 PairedLanczos2ApproxSqNoClamp(FFX_MIN16_F2 x2) // Two-lane form of Lanczos2ApproxSqNoClamp; x2 is used as given (no clamp here).
+{
+    // Xbox ATG (Pavel): algebraic simplification of Lanczos2ApproxSqNoClamp.
+    // With a = (2/5) * x2 - 1 and b = (1/4) * x2 - 1:
+    //     (25/16) * a^2 - (25/16 - 1) = ((2 * x2 - 5)^2 - 3^2) / 16
+    //                                 = ((2 * x2 - 8) * (2 * x2 - 2)) / 16
+    //                                 = ((x2 - 4) * (x2 - 1)) / 4 = b * (x2 - 1)
+    // so the whole product is (b * b) * (b * x2 - b), which should compile to four
+    // packed instructions: 2 fma + 2 mul.
+    //
+
+    FFX_MIN16_F2 b = (0.25 * x2 - 1.0);
+    return (b * b) * (b * x2 - b);
+}
+
+#endif
+
 #endif //FFX_HALF
 
 FfxFloat32 Lanczos2ApproxSq(FfxFloat32 x2)
@@ -183,6 +195,15 @@ FFX_MIN16_F Lanczos2ApproxSq(FFX_MIN16_F x2)
     x2 = ffxMin(x2, FFX_MIN16_F(4.0f));
     return Lanczos2ApproxSqNoClamp(x2);
 }
+
+#if defined(__XBOX_SCARLETT) && defined(__XBATG_EXTRA_16_BIT_OPTIMISATION) && (__XBATG_EXTRA_16_BIT_OPTIMISATION == 1)
+FFX_MIN16_F2 PairedLanczos2ApproxSq(FFX_MIN16_F2 x2) // Two-lane form of Lanczos2ApproxSq: clamp, then evaluate.
+{
+    x2 = ffxMin(x2, FFX_MIN16_F2(4.0, 4.0)); // Same upper bound of 4 as the scalar Lanczos2ApproxSq, applied per lane.
+    return PairedLanczos2ApproxSqNoClamp(x2);
+}
+#endif
+
 #endif //FFX_HALF
 
 FfxFloat32 Lanczos2ApproxNoClamp(FfxFloat32 x)
@@ -219,6 +240,21 @@ FFX_MIN16_F Lanczos2_UseLUT(FFX_MIN16_F x)
 {
     return FFX_MIN16_F(SampleLanczos2Weight(abs(x)));
 }
+
+#if defined(__XBOX_SCARLETT) && defined(__XBATG_EXTRA_16_BIT_OPTIMISATION) && (__XBATG_EXTRA_16_BIT_OPTIMISATION == 1)
+
+FFX_MIN16_F Lanczos2_UseLUTNoAbs(FFX_MIN16_F x) // LUT lookup without the abs() that Lanczos2_UseLUT applies to x.
+{
+    return SampleLanczos2Weight_NoValu(x); // NOTE(review): presumably the caller must pass an already non-negative x - confirm at call sites.
+}
+
+FFX_MIN16_F Lanczos2_UseLUTNoAbsNoA16(FfxFloat32 x) // Same as Lanczos2_UseLUTNoAbs, but x stays 32-bit for the lookup.
+{
+    return SampleLanczos2Weight_NoValuNoA16(x); // No abs() is applied to x on this path either.
+}
+
+#endif
+
 #endif //FFX_HALF
 
 FfxFloat32x4 Lanczos2_UseLUT(FfxFloat32x4 fColor0, FfxFloat32x4 fColor1, FfxFloat32x4 fColor2, FfxFloat32x4 fColor3, FfxFloat32 t)
@@ -364,6 +400,19 @@ FfxFloat32x4 Lanczos2LUT(FetchedBicubicSamples Samples, FfxFloat32x2 fPxFrac)
 }
 
 #if FFX_HALF
+
+#if defined(__XBOX_SCARLETT) && defined(__XBATG_EXTRA_16_BIT_OPTIMISATION) && (__XBATG_EXTRA_16_BIT_OPTIMISATION == 1)
+FFX_MIN16_F4 Lanczos2ApplyWeightX(FFX_MIN16_F4 fColor0, FFX_MIN16_F4 fColor1, FFX_MIN16_F4 fColor2, FFX_MIN16_F4 fColor3, FFX_MIN16_F2 fWeight0, FFX_MIN16_F2 fWeight1, FFX_MIN16_F2 fWeight2, FFX_MIN16_F2 fWeight3, FFX_MIN16_F2 fWeightSumInverted) // Weighted sum of four colors using only the x lane of every weight argument.
+{
+    return (((fWeight0.x * fColor0) + fWeight1.x * fColor1) + ((fWeight2.x * fColor2) + fWeight3.x * fColor3)) * fWeightSumInverted.x; // Summed pairwise (0+1) + (2+3), then scaled by fWeightSumInverted.x.
+}
+
+FFX_MIN16_F4 Lanczos2ApplyWeightY(FFX_MIN16_F4 fColor0, FFX_MIN16_F4 fColor1, FFX_MIN16_F4 fColor2, FFX_MIN16_F4 fColor3, FFX_MIN16_F2 fWeight0, FFX_MIN16_F2 fWeight1, FFX_MIN16_F2 fWeight2, FFX_MIN16_F2 fWeight3, FFX_MIN16_F2 fWeightSumInverted) // Weighted sum of four colors using only the y lane of every weight argument.
+{
+    return (((fWeight0.y * fColor0) + fWeight1.y * fColor1) + ((fWeight2.y * fColor2) + fWeight3.y * fColor3)) * fWeightSumInverted.y; // Summed pairwise (0+1) + (2+3), then scaled by fWeightSumInverted.y.
+}
+#endif
+
 FFX_MIN16_F4 Lanczos2LUT(FetchedBicubicSamplesMin16 Samples, FFX_MIN16_F2 fPxFrac)
 {
     FFX_MIN16_F4 fColorX0 = Lanczos2_UseLUT(Samples.fColor00, Samples.fColor10, Samples.fColor20, Samples.fColor30, fPxFrac.x);
diff --git a/Shaders/shaders/fsr2/ffx_fsr2_upsample.h b/Shaders/shaders/fsr2/ffx_fsr2_upsample.h
index 9287185..2281d98 100644
--- a/Shaders/shaders/fsr2/ffx_fsr2_upsample.h
+++ b/Shaders/shaders/fsr2/ffx_fsr2_upsample.h
@@ -83,22 +83,167 @@ FfxFloat32 ComputeMaxKernelWeight() {
     return ffxMin(FfxFloat32(1.99f), fKernelWeight);
 }
 
+
+#if FFX_HALF && (FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 2) && defined(__XBOX_SCARLETT) && defined(__XBATG_EXTRA_16_BIT_OPTIMISATION) && (__XBATG_EXTRA_16_BIT_OPTIMISATION == 1)
+#define FFX_FSR2_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS 1
+#else
+#define FFX_FSR2_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS 0
+#endif
+
+#if FFX_FSR2_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS
+
+FFX_MIN16_F2 Bool2ToFloat16x2(bool x, bool y) // Maps each bool to 1.0 (true) or 0.0 (false) as a 16-bit float pair.
+{
+    uint lo = x ? 0x00003c00 : 0x00000000; // 0x3c00 is the IEEE half bit pattern of 1.0, placed in the low 16 bits.
+    uint hi = y ? 0x3c000000 : 0x00000000; // The same pattern placed in the high 16 bits.
+    // NOTE(review): __XB_AsHalf presumably reinterprets a 32-bit uint as two halves (.x = low, .y = high) - confirm against the GDK docs.
+    return FFX_MIN16_F2(__XB_AsHalf(lo).x, __XB_AsHalf(hi).y);
+}
+
+struct PairedRectificationBoxAndAccumulatedColorAndWeight // Packed-FP16 accumulator for the rectification box and the upsampled color/weight; finalize() unpacks it.
+{
+    FFX_MIN16_F2 aabbMinRG;        // Running minimum: x = red, y = green.
+    FFX_MIN16_F2 aabbMinB;         // Running blue minimum, one lane per sample of a pair; finalize() takes the min of both lanes.
+
+    FFX_MIN16_F2 aabbMaxRG;        // Running maximum: x = red, y = green.
+    FFX_MIN16_F2 aabbMaxB;         // Running blue maximum, one lane per sample of a pair; finalize() takes the max of both lanes.
+
+    FFX_MIN16_F2 boxCenterRG;      // Sum of weight * color: x = red, y = green.
+    FFX_MIN16_F2 boxCenterB;       // Sum of weight * blue, one lane per sample of a pair; finalize() adds both lanes.
+
+    FFX_MIN16_F2 boxVecRG;         // Sum of weight * color^2: x = red, y = green.
+    FFX_MIN16_F2 boxVecB;          // Sum of weight * blue^2, one lane per sample of a pair; finalize() adds both lanes.
+
+    FFX_MIN16_F2 fBoxCenterWeight; // Sum of box weights, one lane per sample of a pair; finalize() adds both lanes.
+
+    FFX_MIN16_F2 fColorRG;         // Sum of sample weight * color: x = red, y = green.
+    FFX_MIN16_F2 fColorB;          // Sum of sample weight * blue, one lane per sample of a pair; finalize() adds both lanes.
+    FFX_MIN16_F2 fWeight;          // Sum of sample weights, one lane per sample of a pair; finalize() adds both lanes.
+
+    FFX_MIN16_F fKernelBiasSq;     // kernelBias^2, multiplied onto squared sample offsets before the Lanczos evaluation.
+    FfxFloat32 fRectificationCurveBias; // Kept 32-bit: it feeds the exp() that produces the box weights.
+    // Stores the two per-pixel constants; must be called before init()/addSample(), which read them.
+    void setKernelBiasAndRectificationCurveBias(FfxFloat32 kernelBias, FfxFloat32 rectificationCurveBias)
+    {
+        fKernelBiasSq = FFX_MIN16_F(kernelBias * kernelBias);
+        fRectificationCurveBias = rectificationCurveBias;
+    }
+    // Overwrites every accumulator with a single sample; that sample lives in lane x, lane y of the per-pair sums starts at 0.
+    void init(FFX_MIN16_F fSrcSampleOffsetSq, bool sampleOnScreenX, bool sampleOnScreenY, FFX_MIN16_F3 colorSample)
+    {
+        // NOTE: make sure exp has 32-bit precision
+        const FFX_MIN16_F2 fBoxSampleWeight = FFX_MIN16_F2(
+            exp(fRectificationCurveBias * FfxFloat32(fSrcSampleOffsetSq)), 0.0 // y lane must be 0: finalize() sums both lanes, so a broadcast weight would count this sample twice
+        );
+
+#if FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 2 // LANCZOS_TYPE_APPROXIMATE
+        const FFX_MIN16_F2 LanczosUpsampleWeight = PairedLanczos2ApproxSq(fSrcSampleOffsetSq * fKernelBiasSq);
+#else
+#error "Only LANCZOS_TYPE_APPROXIMATE is supported in paired version so far"
+#endif
+        const FFX_MIN16_F2 fSampleWeight = FFX_MIN16_F2((sampleOnScreenX && sampleOnScreenY ? 1.0 : 0.0), 0.0) * LanczosUpsampleWeight; // y lane forced to 0 for the same reason as above
+
+        aabbMinRG = colorSample.rg;
+        aabbMinB = colorSample.bb; // Both lanes hold the same blue value; harmless because finalize() takes the min.
+
+        aabbMaxRG = colorSample.rg;
+        aabbMaxB = colorSample.bb; // Both lanes hold the same blue value; harmless because finalize() takes the max.
+
+        boxCenterRG = colorSample.rg * fBoxSampleWeight.x;
+        boxCenterB = colorSample.bb * fBoxSampleWeight; // (b * w, 0)
+
+        boxVecRG = colorSample.rg * boxCenterRG;
+        boxVecB = colorSample.bb * boxCenterB; // (b * b * w, 0)
+
+        fBoxCenterWeight = fBoxSampleWeight; // (w, 0)
+
+        fColorRG = colorSample.rg * fSampleWeight.x;
+        fColorB = colorSample.bb * fSampleWeight;
+        fWeight = fSampleWeight;
+    }
+    // Accumulates two samples at once: ColorSample0 uses lane x of the offsets/weights, ColorSample1 lane y; sample01OnScreen gates both.
+    void addSample(FFX_MIN16_F2 fSrcSampleOffsetSq, bool sample0OnScreen, bool sample1OnScreen, bool sample01OnScreen, FFX_MIN16_F3 ColorSample0, FFX_MIN16_F3 ColorSample1)
+    {
+        // NOTE: make sure exp has 32-bit precision
+        const FFX_MIN16_F2 fBoxSampleWeight = FFX_MIN16_F2(
+            exp(fRectificationCurveBias * FfxFloat32(fSrcSampleOffsetSq.x)),
+            exp(fRectificationCurveBias * FfxFloat32(fSrcSampleOffsetSq.y))
+        );
+
+#if FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 2 // LANCZOS_TYPE_APPROXIMATE
+        const FFX_MIN16_F2 LanczosUpsampleWeight = PairedLanczos2ApproxSq(fSrcSampleOffsetSq * fKernelBiasSq);
+#else
+#error "Only LANCZOS_TYPE_APPROXIMATE is supported in paired version so far"
+#endif
+        const FFX_MIN16_F2 fSampleWeight = Bool2ToFloat16x2(sample0OnScreen && sample01OnScreen, sample1OnScreen && sample01OnScreen) * LanczosUpsampleWeight; // Off-screen samples get a zero sample weight.
+
+        FFX_MIN16_F2 colorSampleB = FFX_MIN16_F2(ColorSample0.b, ColorSample1.b); // Blue of both samples packed into one register.
+
+        aabbMinRG = ffxMin(aabbMinRG, ColorSample0.rg);
+        aabbMinRG = ffxMin(aabbMinRG, ColorSample1.rg);
+        aabbMinB = ffxMin(aabbMinB, colorSampleB);
+
+        aabbMaxRG = ffxMax(aabbMaxRG, ColorSample0.rg);
+        aabbMaxRG = ffxMax(aabbMaxRG, ColorSample1.rg);
+        aabbMaxB = ffxMax(aabbMaxB, colorSampleB);
+
+        FFX_MIN16_F2 weightedColorSampleRG0 = ColorSample0.rg * fBoxSampleWeight.x;
+        FFX_MIN16_F2 weightedColorSampleRG1 = ColorSample1.rg * fBoxSampleWeight.y;
+        FFX_MIN16_F2 weightedColorSampleB = colorSampleB * fBoxSampleWeight;
+
+        boxCenterRG += weightedColorSampleRG0;
+        boxCenterRG += weightedColorSampleRG1;
+        boxCenterB += weightedColorSampleB;
+
+        boxVecRG += ColorSample0.rg * weightedColorSampleRG0;
+        boxVecRG += ColorSample1.rg * weightedColorSampleRG1;
+        boxVecB += colorSampleB * weightedColorSampleB;
+
+        fBoxCenterWeight += fBoxSampleWeight;
+
+        fWeight += fSampleWeight;
+        fColorRG += (ColorSample0.rg * fSampleWeight.x) + (ColorSample1.rg * fSampleWeight.y);
+        fColorB += colorSampleB * fSampleWeight;
+    }
+    // Widens the packed accumulators to 32-bit and reduces the two-lane fields (min / max / sum) into the output structures.
+    void finalize(FFX_PARAMETER_INOUT RectificationBox rectificationBox, FFX_PARAMETER_INOUT FfxFloat32x4 outColorAndWeight)
+    {
+        rectificationBox.aabbMin.r = FfxFloat32(aabbMinRG.x);
+        rectificationBox.aabbMin.g = FfxFloat32(aabbMinRG.y);
+        rectificationBox.aabbMin.b = FfxFloat32(ffxMin(aabbMinB.x, aabbMinB.y));
+
+        rectificationBox.aabbMax.r = FfxFloat32(aabbMaxRG.x);
+        rectificationBox.aabbMax.g = FfxFloat32(aabbMaxRG.y);
+        rectificationBox.aabbMax.b = FfxFloat32(ffxMax(aabbMaxB.x, aabbMaxB.y));
+
+        rectificationBox.boxCenter.r = FfxFloat32(boxCenterRG.x);
+        rectificationBox.boxCenter.g = FfxFloat32(boxCenterRG.y);
+        rectificationBox.boxCenter.b = FfxFloat32(boxCenterB.x + boxCenterB.y);
+
+        rectificationBox.boxVec.r = FfxFloat32(boxVecRG.x);
+        rectificationBox.boxVec.g = FfxFloat32(boxVecRG.y);
+        rectificationBox.boxVec.b = FfxFloat32(boxVecB.x + boxVecB.y);
+
+        rectificationBox.fBoxCenterWeight = FfxFloat32(fBoxCenterWeight.x + fBoxCenterWeight.y);
+
+        outColorAndWeight = FfxFloat32x4(fColorRG, fColorB.x + fColorB.y, fWeight.x + fWeight.y);
+    }
+};
+#endif
+
 FfxFloat32x4 ComputeUpsampledColorAndWeight(const AccumulationPassCommonParams params,
     FFX_PARAMETER_INOUT RectificationBox clippingBox, FfxFloat32 fReactiveFactor)
 {
-    #if FFX_FSR2_OPTION_UPSAMPLE_SAMPLERS_USE_DATA_HALF && FFX_HALF
-    #include "ffx_fsr2_force16_begin.h"
-    #endif
     // We compute a sliced lanczos filter with 2 lobes (other slices are accumulated temporaly)
     FfxFloat32x2 fDstOutputPos = FfxFloat32x2(params.iPxHrPos) + FFX_BROADCAST_FLOAT32X2(0.5f);      // Destination resolution output pixel center position
     FfxFloat32x2 fSrcOutputPos = fDstOutputPos * DownscaleFactor();                   // Source resolution output pixel center position
     FfxInt32x2 iSrcInputPos = FfxInt32x2(floor(fSrcOutputPos));                     // TODO: what about weird upscale factors...
 
-    #if FFX_FSR2_OPTION_UPSAMPLE_SAMPLERS_USE_DATA_HALF && FFX_HALF
-    #include "ffx_fsr2_force16_end.h"
-    #endif
-
+#if FFX_FSR2_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS
+    FFX_MIN16_F3 fSamples[iLanczos2SampleCount];
+#else
     FfxFloat32x3 fSamples[iLanczos2SampleCount];
+#endif
 
     FfxFloat32x2 fSrcUnjitteredPos = (FfxFloat32x2(iSrcInputPos) + FfxFloat32x2(0.5f, 0.5f)) - Jitter(); // This is the un-jittered position of the sample at offset 0,0
 
@@ -113,6 +258,59 @@ FfxFloat32x4 ComputeUpsampledColorAndWeight(const AccumulationPassCommonParams p
     const FfxBoolean bFlipRow = fSrcUnjitteredPos.y > fSrcOutputPos.y;
     const FfxBoolean bFlipCol = fSrcUnjitteredPos.x > fSrcOutputPos.x;
 
+#if FFX_FSR2_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS
+    // Unroll the loop to load samples on Scarlett to help the shader compiler
+    const FFX_MIN16_F2 fSampleOffsetX02 = __XB_AsHalf(bFlipCol ? __XB_AsUInt(FFX_MIN16_F2( 1, -1)) : __XB_AsUInt(FFX_MIN16_F2(-1, 1)));
+    const FFX_MIN16_F2 fSampleOffsetY02 = __XB_AsHalf(bFlipRow ? __XB_AsUInt(FFX_MIN16_F2( 1, -1)) : __XB_AsUInt(FFX_MIN16_F2(-1, 1)));
+
+    typedef FfxInt32 FfxTexCoordI;
+    typedef FfxInt32x2 FfxTexCoordI2;
+
+    const FfxTexCoordI2 iSrcSamplePosX01 = FfxTexCoordI2(iSrcInputPos.xx) + (bFlipCol ? FfxTexCoordI2( 1,  0) : FfxTexCoordI2(-1, 0));
+    const FfxTexCoordI2 iSrcSamplePosX23 = FfxTexCoordI2(iSrcInputPos.xx) + (bFlipCol ? FfxTexCoordI2(-1, -2) : FfxTexCoordI2( 1, 2));
+
+    const FfxTexCoordI2 iSrcSamplePosY01 = FfxTexCoordI2(iSrcInputPos.yy) + (bFlipRow ? FfxTexCoordI2( 1,  0) : FfxTexCoordI2(-1, 0));
+    const FfxTexCoordI2 iSrcSamplePosY23 = FfxTexCoordI2(iSrcInputPos.yy) + (bFlipRow ? FfxTexCoordI2(-1, -2) : FfxTexCoordI2( 1, 2));
+
+    const FfxTexCoordI2 renderSizeLastTexelCoord = FfxTexCoordI2(RenderSize()) - FfxTexCoordI2(1, 1);
+
+    const FfxTexCoordI2 iSrcSamplePosX01Clamped = FfxTexCoordI2(
+        __XB_Med3_I32(iSrcSamplePosX01.x, 0, renderSizeLastTexelCoord.x),
+        __XB_Med3_I32(iSrcSamplePosX01.y, 0, renderSizeLastTexelCoord.x)
+    );
+
+    const FfxTexCoordI2 iSrcSamplePosX23Clamped = FfxTexCoordI2(
+        __XB_Med3_I32(iSrcSamplePosX23.x, 0, renderSizeLastTexelCoord.x),
+        __XB_Med3_I32(iSrcSamplePosX23.y, 0, renderSizeLastTexelCoord.x)
+    );
+
+    const FfxTexCoordI2 iSrcSamplePosY01Clamped = FfxTexCoordI2(
+        __XB_Med3_I32(iSrcSamplePosY01.x, 0, renderSizeLastTexelCoord.y),
+        __XB_Med3_I32(iSrcSamplePosY01.y, 0, renderSizeLastTexelCoord.y)
+    );
+
+    const FfxTexCoordI2 iSrcSamplePosY23Clamped = FfxTexCoordI2(
+        __XB_Med3_I32(iSrcSamplePosY23.x, 0, renderSizeLastTexelCoord.y),
+        __XB_Med3_I32(iSrcSamplePosY23.y, 0, renderSizeLastTexelCoord.y)
+    );
+
+    fSamples[ 0] = LoadPreparedInputColorHalf(FfxTexCoordI2(iSrcSamplePosX01Clamped.x, iSrcSamplePosY01Clamped.x));
+    fSamples[ 1] = LoadPreparedInputColorHalf(FfxTexCoordI2(iSrcSamplePosX01Clamped.y, iSrcSamplePosY01Clamped.x));
+    fSamples[ 2] = LoadPreparedInputColorHalf(FfxTexCoordI2(iSrcSamplePosX23Clamped.x, iSrcSamplePosY01Clamped.x));
+
+    fSamples[4 + 0] = LoadPreparedInputColorHalf(FfxTexCoordI2(iSrcSamplePosX01Clamped.x, iSrcSamplePosY01Clamped.y));
+    fSamples[4 + 1] = LoadPreparedInputColorHalf(FfxTexCoordI2(iSrcSamplePosX01Clamped.y, iSrcSamplePosY01Clamped.y));
+    fSamples[4 + 2] = LoadPreparedInputColorHalf(FfxTexCoordI2(iSrcSamplePosX23Clamped.x, iSrcSamplePosY01Clamped.y));
+
+    fSamples[8 + 0] = LoadPreparedInputColorHalf(FfxTexCoordI2(iSrcSamplePosX01Clamped.x, iSrcSamplePosY23Clamped.x));
+    fSamples[8 + 1] = LoadPreparedInputColorHalf(FfxTexCoordI2(iSrcSamplePosX01Clamped.y, iSrcSamplePosY23Clamped.x));
+    fSamples[8 + 2] = LoadPreparedInputColorHalf(FfxTexCoordI2(iSrcSamplePosX23Clamped.x, iSrcSamplePosY23Clamped.x));
+
+    fSamples[12 + 0] = LoadPreparedInputColorHalf(FfxTexCoordI2(iSrcSamplePosX01Clamped.x, iSrcSamplePosY23Clamped.y));
+    fSamples[12 + 1] = LoadPreparedInputColorHalf(FfxTexCoordI2(iSrcSamplePosX01Clamped.y, iSrcSamplePosY23Clamped.y));
+    fSamples[12 + 2] = LoadPreparedInputColorHalf(FfxTexCoordI2(iSrcSamplePosX23Clamped.x, iSrcSamplePosY23Clamped.y));
+
+#else
     FfxFloat32x2 fOffsetTL = FfxFloat32x2(offsetTL);
 
     FFX_UNROLL
@@ -130,6 +328,7 @@ FfxFloat32x4 ComputeUpsampledColorAndWeight(const AccumulationPassCommonParams p
                 fSamples[iSampleIndex] = LoadPreparedInputColor(FfxInt32x2(sampleCoord));
             }
     }
+#endif
 
     FfxFloat32x4 fColorAndWeight = FfxFloat32x4(0.0f, 0.0f, 0.0f, 0.0f);
 
@@ -145,6 +344,75 @@ FfxFloat32x4 ComputeUpsampledColorAndWeight(const AccumulationPassCommonParams p
 
     const FfxFloat32 fRectificationCurveBias = ffxLerp(-2.0f, -3.0f, ffxSaturate(params.fHrVelocity / 50.0f));
 
+#if FFX_FSR2_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS
+    // Unroll the sample accumulation loop on Scarlett to help the shader compiler
+    const bool coordX0OnScreen = iSrcSamplePosX01.x == iSrcSamplePosX01Clamped.x;
+    const bool coordX1OnScreen = iSrcSamplePosX01.y == iSrcSamplePosX01Clamped.y;
+    const bool coordX2OnScreen = iSrcSamplePosX23.x == iSrcSamplePosX23Clamped.x;
+
+    const bool coordY0OnScreen = iSrcSamplePosY01.x == iSrcSamplePosY01Clamped.x;
+    const bool coordY1OnScreen = iSrcSamplePosY01.y == iSrcSamplePosY01Clamped.y;
+    const bool coordY2OnScreen = iSrcSamplePosY23.x == iSrcSamplePosY23Clamped.x;
+
+    const FFX_MIN16_F2 fBaseSampleOffsetHalf = FFX_MIN16_F2(fBaseSampleOffset);
+
+    const FFX_MIN16_F2 fSrcSampleOffsetX_02 = fBaseSampleOffsetHalf.xx + fSampleOffsetX02;
+    const FFX_MIN16_F2 fSrcSampleOffsetY_02 = fBaseSampleOffsetHalf.yy + fSampleOffsetY02;
+
+    const FFX_MIN16_F2 fSrcSampleOffsetXSq_02 = fSrcSampleOffsetX_02 * fSrcSampleOffsetX_02;
+    const FFX_MIN16_F2 fSrcSampleOffsetYSq_02 = fSrcSampleOffsetY_02 * fSrcSampleOffsetY_02;
+    const FFX_MIN16_F2 fSrcSampleOffsetXYSq_11 = fBaseSampleOffsetHalf * fBaseSampleOffsetHalf;
+
+    PairedRectificationBoxAndAccumulatedColorAndWeight pairedBox;
+    pairedBox.setKernelBiasAndRectificationCurveBias(fKernelBias, fRectificationCurveBias);
+
+    // init by o o o
+    //         o x o
+    //         o o o
+    pairedBox.init(
+        fSrcSampleOffsetXYSq_11.x + fSrcSampleOffsetXYSq_11.y,
+        coordX1OnScreen, coordY1OnScreen,
+        fSamples[5]
+    );
+
+    // add remaining two samples from 1st row x o x
+    //                                        o * o
+    //                                        o o o
+    pairedBox.addSample(
+        fSrcSampleOffsetXSq_02 + fSrcSampleOffsetYSq_02.xx,
+        coordX0OnScreen, coordX2OnScreen, coordY0OnScreen,
+        fSamples[0 + 0], fSamples[0 + 2]
+    );
+
+    // add two samples from 3rd row * o *
+    //                              o * o
+    //                              x o x
+    pairedBox.addSample(
+        fSrcSampleOffsetXSq_02 + fSrcSampleOffsetYSq_02.yy,
+        coordX0OnScreen, coordX2OnScreen, coordY2OnScreen,
+        fSamples[8 + 0], fSamples[8 + 2]
+    );
+
+    // add two samples from 2nd row * o *
+    //                              x * x
+    //                              * o *
+    pairedBox.addSample(
+        fSrcSampleOffsetXSq_02 + fSrcSampleOffsetXYSq_11.yy,
+        coordX0OnScreen, coordX2OnScreen, coordY1OnScreen,
+        fSamples[4 + 0], fSamples[4 + 2]
+    );
+
+    // add remaining samples * x *
+    //                       * * *
+    //                       * x *
+    pairedBox.addSample(
+        fSrcSampleOffsetXYSq_11.xx + fSrcSampleOffsetYSq_02,
+        coordY0OnScreen, coordY2OnScreen, coordX1OnScreen,
+        fSamples[0 + 1], fSamples[8 + 1]
+    );
+
+    pairedBox.finalize(clippingBox, fColorAndWeight);
+#else
     FFX_UNROLL
     for (FfxInt32 row = 0; row < 3; row++) {
         FFX_UNROLL
@@ -172,6 +440,7 @@ FfxFloat32x4 ComputeUpsampledColorAndWeight(const AccumulationPassCommonParams p
             }
         }
     }
+#endif
 
     RectificationBoxComputeVarianceBoxData(clippingBox);
 
diff --git a/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_accumulate.h b/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_accumulate.h
index 766cba3..2cba17c 100644
--- a/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_accumulate.h
+++ b/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_accumulate.h
@@ -162,6 +162,8 @@ void Accumulate(FfxInt32x2 iPxHrPos)
 
     data.fHistoryColor /= Exposure();
 
+    data.fHistoryColor = ffxMax(data.fHistoryColor, FfxFloat32x3(0.0f, 0.0f, 0.0f));
+
     StoreInternalColorAndWeight(iPxHrPos, FfxFloat32x4(data.fHistoryColor, data.fLock));
 
     // Output final color when RCAS is disabled
diff --git a/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_callbacks_hlsl.h b/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_callbacks_hlsl.h
index 1c3fc99..f719674 100644
--- a/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_callbacks_hlsl.h
+++ b/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_callbacks_hlsl.h
@@ -75,6 +75,8 @@ cbuffer cbFSR3Upscaler : FFX_FSR3UPSCALER_DECLARE_CB(FSR3UPSCALER_BIND_CB_FSR3UP
     FfxFloat32    fDeltaPreExposure;
     FfxFloat32    fViewSpaceToMetersFactor;
     FfxFloat32    fFrameIndex;
+
+    FfxFloat32    fVelocityFactor;
 };
 
 #define FFX_FSR3UPSCALER_CONSTANT_BUFFER_1_SIZE (sizeof(cbFSR3Upscaler) / 4)  // Number of 32-bit values. This must be kept in sync with the cbFSR3Upscaler size.
@@ -170,6 +172,11 @@ FfxFloat32 FrameIndex()
     return fFrameIndex;
 }
 
+FfxFloat32 VelocityFactor() // Accessor for the fVelocityFactor constant-buffer member.
+{
+    return fVelocityFactor; // Returned as stored; no clamping or defaulting is done here.
+}
+
 #endif // #if defined(FSR3UPSCALER_BIND_CB_FSR3UPSCALER)
 
 #define FFX_FSR3UPSCALER_ROOTSIG_STRINGIFY(p) FFX_FSR3UPSCALER_ROOTSIG_STR(p)
@@ -788,9 +795,15 @@ FfxFloat32 Exposure()
 {
     FfxFloat32 exposure = r_input_exposure[FfxUInt32x2(0, 0)].x;
 
+#if defined(__XBOX_SCARLETT)
+    if (exposure < 0.000030517578/* 2^-15 */) {
+        exposure = 1.0f; // Anything below the threshold (not just exact zero) is replaced by 1.0 on this platform.
+    }
+#else
     if (exposure == 0.0f) {
         exposure = 1.0f;
     }
+#endif // #if defined(__XBOX_SCARLETT)
 
     return exposure;
 }
diff --git a/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_common.h b/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_common.h
index dd479b1..28993c6 100644
--- a/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_common.h
+++ b/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_common.h
@@ -99,7 +99,7 @@ FfxFloat32 SceneAverageLuma()
 #endif
 
 // Auto exposure
-FFX_STATIC const FfxFloat32 resetAutoExposureAverageSmoothing = 1e8f;
+FFX_STATIC const FfxFloat32 resetAutoExposureAverageSmoothing = 1e4f;
 
 struct AccumulationPassCommonParams
 {
@@ -123,7 +123,7 @@ struct AccumulationPassCommonParams
 
 FfxFloat32 Get4KVelocity(FfxFloat32x2 fMotionVector)
 {
-    return length(fMotionVector * FfxFloat32x2(3840.0f, 2160.0f));
+    return length(fMotionVector * FfxFloat32x2(3840.0f, 2160.0f)) * VelocityFactor(); // Length of the motion vector scaled by (3840, 2160), then multiplied by the constant-buffer velocity factor.
 }
 
 struct RectificationBox
diff --git a/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_rcas.h b/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_rcas.h
index 90a85b3..81f8ed2 100644
--- a/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_rcas.h
+++ b/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_rcas.h
@@ -25,38 +25,82 @@
 
 #include "../ffx_core.h"
 
+#if FFX_HALF && defined(__XBOX_SCARLETT) && defined(__XBATG_EXTRA_16_BIT_OPTIMISATION) && (__XBATG_EXTRA_16_BIT_OPTIMISATION == 1)
+    #define FSR_RCAS_PREFER_PAIRED_VERSION 1
+#else
+    #define FSR_RCAS_PREFER_PAIRED_VERSION 0
+#endif
+
 void WriteUpscaledOutput(FFX_MIN16_U2 iPxHrPos, FfxFloat32x3 fUpscaledColor)
 {
     StoreUpscaledOutput(FFX_MIN16_I2(iPxHrPos), fUpscaledColor);
 }
 
-#define FSR_RCAS_F 1
-FfxFloat32x4 FsrRcasLoadF(FfxInt32x2 p)
-{
-    FfxFloat32x4 fColor = LoadRCAS_Input(p);
+#if FSR_RCAS_PREFER_PAIRED_VERSION
+    #define FSR_RCAS_HX2 1
+    FfxFloat16x4 FsrRcasLoadHx2(FfxInt16x2 p) // Load callback defined before ffx_fsr1.h is included with FSR_RCAS_HX2 set.
+    {
+        return FfxFloat16x4(LoadRCAS_Input(p)); // Raw RCAS input narrowed to 16-bit; exposure is applied separately in FsrRcasInputHx2.
+    }
+    void FsrRcasInputHx2(inout FfxFloat16x2 r, inout FfxFloat16x2 g, inout FfxFloat16x2 b) // Input callback: multiplies both lanes of each channel by Exposure(), in place.
+    {
+        FfxFloat32 e = Exposure(); // Read once and kept 32-bit.
+        r = FfxFloat16x2(r * e); // The product is narrowed back to 16-bit by the explicit cast.
+        g = FfxFloat16x2(g * e);
+        b = FfxFloat16x2(b * e);
+    }
 
-    fColor.rgb *= Exposure();
+	#include "../fsr1/ffx_fsr1.h"
+	
+    void CurrFilterPaired(FFX_MIN16_U2 pos)
+    {
+        // Filters the pixel pair (pos, pos + (8, 0)); lane x is written at pos, lane y at pos + (8, 0).
+        FfxFloat16x2 cr, cg, cb;
+        FsrRcasHx2(cr, cg, cb, pos, RCASConfig());
+        // Divide out the exposure that FsrRcasInputHx2 multiplied in.
+        const FfxFloat32 fInvExposure = 1.0f / Exposure();
+        cr = FfxFloat16x2(cr * fInvExposure);
+        cg = FfxFloat16x2(cg * fInvExposure);
+        cb = FfxFloat16x2(cb * fInvExposure);
+        WriteUpscaledOutput(pos, FfxFloat16x3(cr.x, cg.x, cb.x)); //TODO: fix type
+        pos.x += 8;
+        WriteUpscaledOutput(pos, FfxFloat16x3(cr.y, cg.y, cb.y)); //TODO: fix type
+    }
+#else
+    #define FSR_RCAS_F 1
+    FfxFloat32x4 FsrRcasLoadF(FfxInt32x2 p) // Load callback defined before ffx_fsr1.h is included with FSR_RCAS_F set.
+    {
+        FfxFloat32x4 fColor = LoadRCAS_Input(p);
 
-    return fColor;
-}
-void FsrRcasInputF(inout FfxFloat32 r, inout FfxFloat32 g, inout FfxFloat32 b) {}
+        fColor.rgb *= Exposure(); // Only rgb is scaled; the w component is returned as loaded.
 
-#include "../fsr1/ffx_fsr1.h"
+        return fColor;
+    }
+    void FsrRcasInputF(inout FfxFloat32 r, inout FfxFloat32 g, inout FfxFloat32 b) {} // Intentionally empty: FsrRcasLoadF already multiplies the loaded color by Exposure().
 
-void CurrFilter(FFX_MIN16_U2 pos)
-{
-    FfxFloat32x3 c;
-    FsrRcasF(c.r, c.g, c.b, pos, RCASConfig());
+    #include "../fsr1/ffx_fsr1.h"
 
-    c /= Exposure();
+    void CurrFilter(FFX_MIN16_U2 pos) // Filters one pixel at pos and writes it to the upscaled output.
+    {
+        FfxFloat32x3 c;
+        FsrRcasF(c.r, c.g, c.b, pos, RCASConfig()); // NOTE(review): FsrRcasF comes from ffx_fsr1.h; presumably it writes the filtered color into c - confirm there.
 
-    WriteUpscaledOutput(pos, c);
-}
+        c /= Exposure(); // Divide out the exposure that FsrRcasLoadF multiplied in.
+
+        WriteUpscaledOutput(pos, c);
+    }
+
+#endif // #if FSR_RCAS_PREFER_PAIRED_VERSION
 
 void RCAS(FfxUInt32x3 LocalThreadId, FfxUInt32x3 WorkGroupId, FfxUInt32x3 Dtid)
 {
     // Do remapping of local xy in workgroup for a more PS-like swizzle pattern.
     FfxUInt32x2 gxy = ffxRemapForQuad(LocalThreadId.x) + FfxUInt32x2(WorkGroupId.x << 4u, WorkGroupId.y << 4u);
+#if FSR_RCAS_PREFER_PAIRED_VERSION
+    CurrFilterPaired(FFX_MIN16_U2(gxy));
+    gxy.y += 8u;
+    CurrFilterPaired(FFX_MIN16_U2(gxy));
+#else
     CurrFilter(FFX_MIN16_U2(gxy));
     gxy.x += 8u;
     CurrFilter(FFX_MIN16_U2(gxy));
@@ -64,4 +108,5 @@ void RCAS(FfxUInt32x3 LocalThreadId, FfxUInt32x3 WorkGroupId, FfxUInt32x3 Dtid)
     CurrFilter(FFX_MIN16_U2(gxy));
     gxy.x -= 8u;
     CurrFilter(FFX_MIN16_U2(gxy));
+#endif
 }
diff --git a/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_reproject.h b/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_reproject.h
index 153a9b7..bc0e9d4 100644
--- a/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_reproject.h
+++ b/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_reproject.h
@@ -32,6 +32,16 @@ FfxFloat32x4 WrapHistory(FfxInt32x2 iPxSample)
 DeclareCustomFetchBicubicSamples(FetchHistorySamples, WrapHistory)
 DeclareCustomTextureSample(HistorySample, FFX_FSR3UPSCALER_GET_LANCZOS_SAMPLER1D(FFX_FSR3UPSCALER_OPTION_REPROJECT_USE_LANCZOS_TYPE), FetchHistorySamples)
 
+#if FFX_HALF
+FFX_MIN16_F4 WrapHistory16(FfxInt32x2 iPxSample) // Loads the history texel at iPxSample and narrows it to FFX_MIN16_F4.
+{
+    return FFX_MIN16_F4(LoadHistory(iPxSample));
+}
+
+DeclareCustomFetchBicubicSamplesMin16(FetchHistorySamples16, WrapHistory16) // min16 fetch helper built on WrapHistory16, parallel to FetchHistorySamples above
+DeclareCustomTextureSampleMin16(HistorySample16, FFX_FSR3UPSCALER_GET_LANCZOS_SAMPLER1D(FFX_FSR3UPSCALER_OPTION_REPROJECT_USE_LANCZOS_TYPE), FetchHistorySamples16) // min16 sampler, parallel to HistorySample above
+#endif
+
 FfxFloat32x2 GetMotionVector(FfxInt32x2 iPxHrPos, FfxFloat32x2 fHrUv)
 {
 #if FFX_FSR3UPSCALER_OPTION_LOW_RESOLUTION_MOTION_VECTORS
@@ -51,8 +61,13 @@ void ComputeReprojectedUVs(const AccumulationPassCommonParams params, FFX_PARAME
 }
 
 void ReprojectHistoryColor(const AccumulationPassCommonParams params, FFX_PARAMETER_INOUT AccumulationPassData data)
+
 {
+#if FFX_HALF && FFX_FSR3UPSCALER_OPTION_REPROJECT_SAMPLERS_USE_DATA_HALF
+    const FfxFloat32x4 fReprojectedHistory = FfxFloat32x4(HistorySample16(params.fReprojectedHrUv, PreviousFrameUpscaleSize())); // same texture size as the full-precision path below, so both precisions sample the history identically
+#else
     const FfxFloat32x4 fReprojectedHistory = HistorySample(params.fReprojectedHrUv, PreviousFrameUpscaleSize());
+#endif
 
     data.fHistoryColor = fReprojectedHistory.rgb;
     data.fHistoryColor *= DeltaPreExposure();
diff --git a/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_sample.h b/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_sample.h
index 5f727b1..7a723d5 100644
--- a/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_sample.h
+++ b/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_sample.h
@@ -169,6 +169,24 @@ FFX_MIN16_F Lanczos2ApproxSqNoClamp(FFX_MIN16_F x2)
     FFX_MIN16_F b = FFX_MIN16_F(1.0f / 4.0f) * x2 - FFX_MIN16_F(1);
     return (FFX_MIN16_F(25.0f / 16.0f) * a * a - FFX_MIN16_F(25.0f / 16.0f - 1)) * (b * b);
 }
+
+#if defined(__XBOX_SCARLETT) && defined(__XBATG_EXTRA_16_BIT_OPTIMISATION) && (__XBATG_EXTRA_16_BIT_OPTIMISATION == 1)
+FFX_MIN16_F2 PairedLanczos2ApproxSqNoClamp(FFX_MIN16_F2 x2)
+{
+    // Xbox ATG (Pavel): two-lane version of Lanczos2ApproxSqNoClamp, algebraically simplified.
+    // With a = (2.0 * x2 - 5.0) / 5.0 and b = 0.25 * x2 - 1.0 = (x2 - 4.0) / 4.0, the first factor of the scalar form is
+    //     (25.0 / 16.0) * a^2 - (25.0 - 16.0) / 16.0 = ((2.0 * x2 - 5.0)^2 - (3.0)^2) / 16.0
+    //                                                = (2.0 * x2 - 8.0) * (2.0 * x2 - 2.0) / 16.0
+    //                                                = (x2 - 4.0) * (x2 - 1.0) / 4.0 = b * (x2 - 1.0) = b * x2 - b
+    // and the scalar form multiplies that factor by (b * b),
+    // so we need to compute just (b * b) * (b * x2 - b), so we should get four packed instructions: 2 fma + 2 mul
+    //
+
+    FFX_MIN16_F2 b = (0.25 * x2 - 1.0);
+    return (b * b) * (b * x2 - b);
+}
+#endif
+
 #endif //FFX_HALF
 
 FfxFloat32 Lanczos2ApproxSq(FfxFloat32 x2)
@@ -183,6 +201,14 @@ FFX_MIN16_F Lanczos2ApproxSq(FFX_MIN16_F x2)
     x2 = ffxMin(x2, FFX_MIN16_F(4.0f));
     return Lanczos2ApproxSqNoClamp(x2);
 }
+
+#if defined(__XBOX_SCARLETT) && defined(__XBATG_EXTRA_16_BIT_OPTIMISATION) && (__XBATG_EXTRA_16_BIT_OPTIMISATION == 1)
+FFX_MIN16_F2 PairedLanczos2ApproxSq(FFX_MIN16_F2 x2)
+{
+    // Both lanes are limited to at most 4.0 before the unclamped approximation is evaluated.
+    return PairedLanczos2ApproxSqNoClamp(ffxMin(x2, FFX_MIN16_F2(4.0, 4.0)));
+}
+#endif
 #endif //FFX_HALF
 
 FfxFloat32 Lanczos2ApproxNoClamp(FfxFloat32 x)
diff --git a/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_upsample.h b/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_upsample.h
index 2d587f0..801a0a9 100644
--- a/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_upsample.h
+++ b/Shaders/shaders/fsr3upscaler/ffx_fsr3upscaler_upsample.h
@@ -44,6 +44,26 @@ FfxFloat32 GetUpsampleLanczosWeight(FfxFloat32x2 fSrcSampleOffset, FfxFloat32 fK
     return fSampleWeight;
 }
 
+#if FFX_HALF
+FFX_MIN16_F GetUpsampleLanczosWeight(FFX_MIN16_F2 fSrcSampleOffset, FFX_MIN16_F fKernelWeight) // min16 overload: Lanczos weight for a sample offset that is first scaled by fKernelWeight
+{
+    FFX_MIN16_F2 fSrcSampleOffsetBiased = fSrcSampleOffset * fKernelWeight.xx;
+#if FFX_FSR3UPSCALER_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 0 // LANCZOS_TYPE_REFERENCE
+    FFX_MIN16_F fSampleWeight = Lanczos2(length(fSrcSampleOffsetBiased));
+#elif FFX_FSR3UPSCALER_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 1 // LANCZOS_TYPE_LUT
+    FFX_MIN16_F fSampleWeight = Lanczos2_UseLUT(length(fSrcSampleOffsetBiased));
+#elif FFX_FSR3UPSCALER_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 2 // LANCZOS_TYPE_APPROXIMATE
+    FFX_MIN16_F fSampleWeight = Lanczos2ApproxSq(dot(fSrcSampleOffsetBiased, fSrcSampleOffsetBiased)); // takes the squared length, so no length() call is needed here
+
+    // To Test: Save reciprocal sqrt compute
+    // FfxFloat32 fSampleWeight = Lanczos2Sq_UseLUT(dot(fSrcSampleOffsetBiased, fSrcSampleOffsetBiased));
+#else
+#error "Invalid Lanczos type"
+#endif
+    return fSampleWeight;
+}
+#endif
+
 FfxFloat32 ComputeMaxKernelWeight(const AccumulationPassCommonParams params, FFX_PARAMETER_INOUT AccumulationPassData data) {
 
     const FfxFloat32 fKernelSizeBias = 1.0f + (1.0f / FfxFloat32x2(DownscaleFactor()) - 1.0f).x;
@@ -59,6 +79,225 @@ FfxFloat32x3 LoadPreparedColor(FfxInt32x2 iSamplePos)
     return fPreparedYCoCg;
 }
 
+#if FFX_HALF && (FFX_FSR3UPSCALER_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 2) && defined(__XBOX_SCARLETT) && defined(__XBATG_EXTRA_16_BIT_OPTIMISATION) && (__XBATG_EXTRA_16_BIT_OPTIMISATION == 1)
+#define FFX_FSR3UPSCALER_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS 1
+#else
+#define FFX_FSR3UPSCALER_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS 0
+#endif
+
+#if FFX_FSR3UPSCALER_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS
+
+void LoadPreparedColorPairedRgb(FFX_PARAMETER_OUT FFX_MIN16_F2 r, // out: red of both samples (.x = sample 0, .y = sample 1)
+                                FFX_PARAMETER_OUT FFX_MIN16_F2 g, // out: green of both samples
+                                FFX_PARAMETER_OUT FFX_MIN16_F2 b, // out: blue of both samples
+                                FfxInt32x2 iSamplePos0, // texel coordinate of sample 0
+                                FfxInt32x2 iSamplePos1) // texel coordinate of sample 1
+{
+    const FFX_MIN16_F3 sample0 = FFX_MIN16_F3(LoadInputColor(iSamplePos0));
+    const FFX_MIN16_F3 sample1 = FFX_MIN16_F3(LoadInputColor(iSamplePos1));
+
+    r = ffxMax(FFX_MIN16_F2(0, 0), FFX_MIN16_F2(sample0.r, sample1.r)); // regroup channel-wise and clamp negative values to zero
+    g = ffxMax(FFX_MIN16_F2(0, 0), FFX_MIN16_F2(sample0.g, sample1.g));
+    b = ffxMax(FFX_MIN16_F2(0, 0), FFX_MIN16_F2(sample0.b, sample1.b));
+
+    r = FFX_MIN16_F2(r * Exposure()); // scale by Exposure() and narrow the product back to min16
+    g = FFX_MIN16_F2(g * Exposure());
+    b = FFX_MIN16_F2(b * Exposure());
+}
+
+void TonemapPaired(FFX_PARAMETER_INOUT FFX_MIN16_F2 r, FFX_PARAMETER_INOUT FFX_MIN16_F2 g, FFX_PARAMETER_INOUT FFX_MIN16_F2 b) // In place, per lane: rgb *= 1 / (max(0, r, g, b) + 1)
+{
+    FFX_MIN16_F2 denomF16 = ffxMax(ffxMax(ffxMax(0.0, r), g), b) + FFX_MIN16_F2(1.0, 1.0);
+
+    // NOTE: expect 2 x v_cvt_f32_f16
+    FfxFloat32x2 denomF32 = FfxFloat32x2(denomF16);
+    // NOTE: expect 2 x v_rcp_f32
+    FfxFloat32x2 normF32 = FfxFloat32x2(1.0, 1.0) / denomF32; // the reciprocal is taken in 32-bit, then narrowed below
+    // NOTE: expect 2 x v_cvt_f16_f32
+    FFX_MIN16_F2 normF16 = FFX_MIN16_F2(normF32);
+
+    r *= normF16;
+    g *= normF16;
+    b *= normF16;
+}
+
+void RGBToYCoCgPaired(FFX_PARAMETER_INOUT FFX_MIN16_F2 r, FFX_PARAMETER_INOUT FFX_MIN16_F2 g, FFX_PARAMETER_INOUT FFX_MIN16_F2 b) // In place, per lane: on return r holds Y, g holds Co, b holds Cg
+{
+    /**
+     *  NOTE: given the following conversion
+     *
+     *      fYCoCg = FfxFloat32x3(
+     *          0.25f * fRgb.r + 0.5f * fRgb.g + 0.25f * fRgb.b,
+     *           0.5f * fRgb.r - 0.5f * fRgb.b,
+     *         -0.25f * fRgb.r + 0.5f * fRgb.g - 0.25f * fRgb.b);
+     *
+     *  it's possible to notice that we can compute:
+     *      RplusBdiv4 = 0.25 * (R + B)
+     *
+     *  so everything else is computed in 3 instructions
+     *      Y  = G * 0.5 + RplusBdiv4
+     *      Co = 2 * RplusBdiv4 - B
+     *      Cg = G * 0.5 - RplusBdiv4
+     */
+
+    // NOTE: expect v_pk_add_f16 + v_pk_mul_f16
+    FFX_MIN16_F2 RplusBdiv4 = (r + b) * 0.25;
+    FFX_MIN16_F2 G = g; // keep the inputs: g and b are overwritten before their last use
+    FFX_MIN16_F2 B = b;
+
+    // NOTE: expect 3x v_pk_fma_f16
+    r = G * 0.5 + RplusBdiv4; // Y
+    g = RplusBdiv4 * 2.0 - B; // Co = 0.5 * R - 0.5 * B
+    b = G * 0.5 - RplusBdiv4; // Cg
+}
+
+FFX_MIN16_F2 Compute3x3SamplesMinMaxPaired(FFX_PARAMETER_IN FFX_MIN16_F2 sampleCenter, // only .x is read
+                                           FFX_PARAMETER_IN FFX_MIN16_F2 sample0,
+                                           FFX_PARAMETER_IN FFX_MIN16_F2 sample1,
+                                           FFX_PARAMETER_IN FFX_MIN16_F2 sample2,
+                                           FFX_PARAMETER_IN FFX_MIN16_F2 sample3) // returns (min, max) over both lanes of sample0..sample3 and sampleCenter.x
+{
+    FFX_MIN16_F2 twoMinValues = ffxMin(ffxMin(sample0, sample1), ffxMin(sample2, sample3)); // lane-wise min of the four pairs
+    FFX_MIN16_F2 twoMaxValues = ffxMax(ffxMax(sample0, sample1), ffxMax(sample2, sample3)); // lane-wise max of the four pairs
+
+    return FFX_MIN16_F2(
+        ffxMin3Half(twoMinValues.x, twoMinValues.y, sampleCenter.x), // fold the two lanes together with the center value
+        ffxMax3Half(twoMaxValues.x, twoMaxValues.y, sampleCenter.x)
+    );
+}
+
+
+FFX_MIN16_F2 Bool2ToFloat16x2(bool x, bool y) // Returns (x ? 1.0 : 0.0, y ? 1.0 : 0.0) as a min16 pair, built from bit patterns.
+{
+    uint lo = x ? 0x00003c00 : 0x00000000; // 0x3c00 is the half-precision encoding of 1.0, placed in the low 16 bits
+    uint hi = y ? 0x3c000000 : 0x00000000; // the same 1.0 encoding, placed in the high 16 bits
+    return FFX_MIN16_F2(__XB_AsHalf(lo).x, __XB_AsHalf(hi).y); // presumably __XB_AsHalf reinterprets the 32 bits as two halves (.x low, .y high) - confirm against the Xbox intrinsics
+}
+
+struct PairedRectificationBoxAndAccumulatedColorAndWeight // Two-lane min16 accumulators for the rectification box and the upsampled color; the finalize* methods add the lanes together.
+{
+    FFX_MIN16_F2 boxCenterR; // per-lane running sum of sample * box weight
+    FFX_MIN16_F2 boxCenterG;
+    FFX_MIN16_F2 boxCenterB;
+
+    FFX_MIN16_F2 boxVecR; // per-lane running sum of sample * sample * box weight
+    FFX_MIN16_F2 boxVecG;
+    FFX_MIN16_F2 boxVecB;
+
+    FFX_MIN16_F2 fBoxCenterWeight; // per-lane running sum of box weights
+
+    FFX_MIN16_F2 fColorR; // per-lane running sum of sample * upsample weight
+    FFX_MIN16_F2 fColorG;
+    FFX_MIN16_F2 fColorB;
+    FFX_MIN16_F2 fWeight; // per-lane running sum of upsample weights
+
+    FFX_MIN16_F fKernelBiasSq; // kernelBias * kernelBias, applied to squared sample offsets
+    FFX_MIN16_F fRectificationCurveBias; // factor inside exp() for the box weights
+
+    void setKernelBiasAndRectificationCurveBias(FfxFloat32 kernelBias, FfxFloat32 rectificationCurveBias) // stores both constants narrowed to min16; the kernel bias is stored squared
+    {
+        fKernelBiasSq = FFX_MIN16_F(kernelBias * kernelBias);
+        fRectificationCurveBias = FFX_MIN16_F(rectificationCurveBias);
+    }
+
+    void initUpscaledColor(FFX_MIN16_F fSrcSampleOffsetSq, FFX_MIN16_F fOnScreenWeight, FFX_MIN16_F2 sampleR, FFX_MIN16_F2 sampleG, FFX_MIN16_F2 sampleB) // overwrites the color accumulators with a single sample: only lane .x gets a weight
+    {
+        #if FFX_FSR3UPSCALER_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 2 // LANCZOS_TYPE_APPROXIMATE
+            const FFX_MIN16_F2 LanczosUpsampleWeight = FFX_MIN16_F2(
+                PairedLanczos2ApproxSq(fSrcSampleOffsetSq * fKernelBiasSq).x,
+                0.0
+            );
+        #else
+            #error "Only LANCZOS_TYPE_APPROXIMATE is supported in paired version so far"
+        #endif
+        const FFX_MIN16_F2 fSampleWeight = fOnScreenWeight * LanczosUpsampleWeight; // lane .y stays 0, so the second lane of the sample contributes nothing
+
+        fColorR = sampleR * fSampleWeight;
+        fColorG = sampleG * fSampleWeight;
+        fColorB = sampleB * fSampleWeight;
+        fWeight = fSampleWeight;
+    }
+
+    void initBox(FFX_MIN16_F fSrcSampleOffsetSq, FFX_MIN16_F fOnScreenWeight, FFX_MIN16_F2 sampleR, FFX_MIN16_F2 sampleG, FFX_MIN16_F2 sampleB) // overwrites the box accumulators with a single sample: only lane .x gets a weight
+    {
+        const FFX_MIN16_F2 fBoxSampleWeight = FFX_MIN16_F2(
+            exp(fRectificationCurveBias * fSrcSampleOffsetSq) * fOnScreenWeight,
+            0.0
+        );
+
+        FFX_MIN16_F2 weightedSampleR = sampleR * fBoxSampleWeight;
+        FFX_MIN16_F2 weightedSampleG = sampleG * fBoxSampleWeight;
+        FFX_MIN16_F2 weightedSampleB = sampleB * fBoxSampleWeight;
+
+        boxCenterR = weightedSampleR;
+        boxCenterG = weightedSampleG;
+        boxCenterB = weightedSampleB;
+
+        boxVecR = sampleR * weightedSampleR; // sample^2 * weight
+        boxVecG = sampleG * weightedSampleG;
+        boxVecB = sampleB * weightedSampleB;
+
+        fBoxCenterWeight = fBoxSampleWeight;
+    }
+
+    void addUpscaledColorSample(FFX_MIN16_F2 fSrcSampleOffsetSq, FFX_MIN16_F2 fOnScreenWeight, FFX_MIN16_F2 sampleR, FFX_MIN16_F2 sampleG, FFX_MIN16_F2 sampleB) // accumulates two samples at once, one per lane
+    {
+        #if FFX_FSR3UPSCALER_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 2 // LANCZOS_TYPE_APPROXIMATE
+            const FFX_MIN16_F2 LanczosUpsampleWeight = PairedLanczos2ApproxSq(fSrcSampleOffsetSq * fKernelBiasSq);
+        #else
+            #error "Only LANCZOS_TYPE_APPROXIMATE is supported in paired version so far"
+        #endif
+        const FFX_MIN16_F2 fSampleWeight = fOnScreenWeight * LanczosUpsampleWeight;
+
+        fColorR += sampleR * fSampleWeight;
+        fColorG += sampleG * fSampleWeight;
+        fColorB += sampleB * fSampleWeight;
+        fWeight += fSampleWeight;
+    }
+
+    void addBoxSample(FFX_MIN16_F2 fSrcSampleOffsetSq, FFX_MIN16_F2 fOnScreenWeight, FFX_MIN16_F2 sampleR, FFX_MIN16_F2 sampleG, FFX_MIN16_F2 sampleB) // accumulates two samples at once, one per lane
+    {
+        // NOTE: ideally expect here 2x v_fma_mix + 2x v_exp_f32 + 2x v_fma_mix
+        const FFX_MIN16_F2 fBoxSampleWeight = exp(fRectificationCurveBias * fSrcSampleOffsetSq) * fOnScreenWeight;
+
+        FFX_MIN16_F2 weightedSampleR = sampleR * fBoxSampleWeight;
+        FFX_MIN16_F2 weightedSampleG = sampleG * fBoxSampleWeight;
+        FFX_MIN16_F2 weightedSampleB = sampleB * fBoxSampleWeight;
+
+        boxCenterR += weightedSampleR;
+        boxCenterG += weightedSampleG;
+        boxCenterB += weightedSampleB;
+
+        boxVecR += sampleR * weightedSampleR; // sample^2 * weight
+        boxVecG += sampleG * weightedSampleG;
+        boxVecB += sampleB * weightedSampleB;
+
+        fBoxCenterWeight += fBoxSampleWeight;
+    }
+
+    void finalizeUpscaledColor(FFX_PARAMETER_OUT FfxFloat32x4 upscaledColorAndWeight) // out: rgb = summed color, a = summed weight, both lanes added
+    {
+        upscaledColorAndWeight.r = fColorR.x + fColorR.y;
+        upscaledColorAndWeight.g = fColorG.x + fColorG.y;
+        upscaledColorAndWeight.b = fColorB.x + fColorB.y;
+
+        upscaledColorAndWeight.a = fWeight.x + fWeight.y;
+    }
+
+    void finalizeBox(FFX_PARAMETER_OUT FfxFloat32x2 boxCenterAndVecR, // out: (.x = boxCenter sum, .y = boxVec sum) for red
+                     FFX_PARAMETER_OUT FfxFloat32x2 boxCenterAndVecG, // out: the same pair for green
+                     FFX_PARAMETER_OUT FfxFloat32x2 boxCenterAndVecB, // out: the same pair for blue
+                     FFX_PARAMETER_OUT FfxFloat32   boxCenterWeight) // out: summed box weight
+    {
+        boxCenterAndVecR = FfxFloat32x2(boxCenterR.x + boxCenterR.y, boxVecR.x + boxVecR.y);
+        boxCenterAndVecG = FfxFloat32x2(boxCenterG.x + boxCenterG.y, boxVecG.x + boxVecG.y);
+        boxCenterAndVecB = FfxFloat32x2(boxCenterB.x + boxCenterB.y, boxVecB.x + boxVecB.y);
+
+        boxCenterWeight = fBoxCenterWeight.x + fBoxCenterWeight.y;
+    }
+};
+#endif // #if FFX_FSR3UPSCALER_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS
+
 void ComputeUpsampledColorAndWeight(const AccumulationPassCommonParams params, FFX_PARAMETER_INOUT AccumulationPassData data)
 {
     // We compute a sliced lanczos filter with 2 lobes (other slices are accumulated temporaly)
@@ -82,6 +321,90 @@ void ComputeUpsampledColorAndWeight(const AccumulationPassCommonParams params, F
 
     const FfxBoolean bIsInitialSample = (params.fAccumulation == 0.0f);
 
+#if FFX_FSR3UPSCALER_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS
+    // Unroll the loop to load samples on Scarlett to help the shader compiler
+    const FFX_MIN16_F2 fSampleOffsetX02 = __XB_AsHalf(bFlipCol ? __XB_AsUInt(FFX_MIN16_F2( 1, -1)) : __XB_AsUInt(FFX_MIN16_F2(-1, 1)));
+    const FFX_MIN16_F2 fSampleOffsetY02 = __XB_AsHalf(bFlipRow ? __XB_AsUInt(FFX_MIN16_F2( 1, -1)) : __XB_AsUInt(FFX_MIN16_F2(-1, 1)));
+
+    typedef FfxInt32 FfxTexCoordI;
+    typedef FfxInt32x2 FfxTexCoordI2;
+
+    const FfxTexCoordI2 iSrcSamplePosX01 = FfxTexCoordI2(iSrcInputPos.xx) + (bFlipCol ? FfxTexCoordI2( 1,  0) : FfxTexCoordI2(-1, 0));
+    const FfxTexCoordI2 iSrcSamplePosX23 = FfxTexCoordI2(iSrcInputPos.xx) + (bFlipCol ? FfxTexCoordI2(-1, -2) : FfxTexCoordI2( 1, 2));
+
+    const FfxTexCoordI2 iSrcSamplePosY01 = FfxTexCoordI2(iSrcInputPos.yy) + (bFlipRow ? FfxTexCoordI2( 1,  0) : FfxTexCoordI2(-1, 0));
+    const FfxTexCoordI2 iSrcSamplePosY23 = FfxTexCoordI2(iSrcInputPos.yy) + (bFlipRow ? FfxTexCoordI2(-1, -2) : FfxTexCoordI2( 1, 2));
+
+    const FfxTexCoordI2 renderSizeLastTexelCoord = FfxTexCoordI2(RenderSize()) - FfxTexCoordI2(1, 1);
+
+    const FfxTexCoordI2 iSrcSamplePosX01Clamped = FfxTexCoordI2(
+        __XB_Med3_I32(iSrcSamplePosX01.x, 0, renderSizeLastTexelCoord.x),
+        __XB_Med3_I32(iSrcSamplePosX01.y, 0, renderSizeLastTexelCoord.x)
+    );
+
+    const FfxTexCoordI2 iSrcSamplePosX23Clamped = FfxTexCoordI2(
+        __XB_Med3_I32(iSrcSamplePosX23.x, 0, renderSizeLastTexelCoord.x),
+        __XB_Med3_I32(iSrcSamplePosX23.y, 0, renderSizeLastTexelCoord.x)
+    );
+
+    const FfxTexCoordI2 iSrcSamplePosY01Clamped = FfxTexCoordI2(
+        __XB_Med3_I32(iSrcSamplePosY01.x, 0, renderSizeLastTexelCoord.y),
+        __XB_Med3_I32(iSrcSamplePosY01.y, 0, renderSizeLastTexelCoord.y)
+    );
+
+    const FfxTexCoordI2 iSrcSamplePosY23Clamped = FfxTexCoordI2(
+        __XB_Med3_I32(iSrcSamplePosY23.x, 0, renderSizeLastTexelCoord.y),
+        __XB_Med3_I32(iSrcSamplePosY23.y, 0, renderSizeLastTexelCoord.y)
+    );
+
+    FFX_MIN16_F2 TopCornerR, BotCornerR, HorzR, VertR, CenterR;
+    FFX_MIN16_F2 TopCornerG, BotCornerG, HorzG, VertG, CenterG;
+    FFX_MIN16_F2 TopCornerB, BotCornerB, HorzB, VertB, CenterB;
+
+    LoadPreparedColorPairedRgb(TopCornerR, TopCornerG, TopCornerB,
+        FfxTexCoordI2(iSrcSamplePosX01Clamped.x, iSrcSamplePosY01Clamped.x),
+        FfxTexCoordI2(iSrcSamplePosX23Clamped.x, iSrcSamplePosY01Clamped.x)
+    );
+
+    LoadPreparedColorPairedRgb(BotCornerR, BotCornerG, BotCornerB,
+        FfxTexCoordI2(iSrcSamplePosX01Clamped.x, iSrcSamplePosY23Clamped.x),
+        FfxTexCoordI2(iSrcSamplePosX23Clamped.x, iSrcSamplePosY23Clamped.x)
+    );
+
+    LoadPreparedColorPairedRgb(HorzR, HorzG, HorzB,
+        FfxTexCoordI2(iSrcSamplePosX01Clamped.x, iSrcSamplePosY01Clamped.y),
+        FfxTexCoordI2(iSrcSamplePosX23Clamped.x, iSrcSamplePosY01Clamped.y)
+    );
+
+    LoadPreparedColorPairedRgb(VertR, VertG, VertB,
+        FfxTexCoordI2(iSrcSamplePosX01Clamped.y, iSrcSamplePosY01Clamped.x),
+        FfxTexCoordI2(iSrcSamplePosX01Clamped.y, iSrcSamplePosY23Clamped.x)
+    );
+
+    // NOTE: duplicated data
+    LoadPreparedColorPairedRgb(CenterR, CenterG, CenterB,
+        FfxTexCoordI2(iSrcSamplePosX01Clamped.y, iSrcSamplePosY01Clamped.y),
+        FfxTexCoordI2(iSrcSamplePosX01Clamped.y, iSrcSamplePosY01Clamped.y)
+    );
+
+    #if FFX_FSR3UPSCALER_OPTION_HDR_COLOR_INPUT
+    if (bIsInitialSample)
+    {
+        TonemapPaired(TopCornerR, TopCornerG, TopCornerB);
+        TonemapPaired(BotCornerR, BotCornerG, BotCornerB);
+        TonemapPaired(HorzR, HorzG, HorzB);
+        TonemapPaired(VertR, VertG, VertB);
+        TonemapPaired(CenterR, CenterG, CenterB);
+    }
+    #endif
+
+    RGBToYCoCgPaired(TopCornerR, TopCornerG, TopCornerB);
+    RGBToYCoCgPaired(BotCornerR, BotCornerG, BotCornerB);
+    RGBToYCoCgPaired(HorzR, HorzG, HorzB);
+    RGBToYCoCgPaired(VertR, VertG, VertB);
+    RGBToYCoCgPaired(CenterR, CenterG, CenterB);
+
+#else
     FfxFloat32x3 fSamples[9];
     FfxInt32 iSampleIndex = 0;
 
@@ -110,6 +433,8 @@ void ComputeUpsampledColorAndWeight(const AccumulationPassCommonParams params, F
     }
 #endif
 
+#endif // #if FFX_FSR3UPSCALER_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS
+
     // Identify how much of each upsampled color to be used for this frame
     const FfxFloat32 fKernelBiasMax          = ComputeMaxKernelWeight(params, data);
     const FfxFloat32 fKernelBiasMin          = ffxMax(1.0f, ((1.0f + fKernelBiasMax) * 0.3f));
@@ -122,6 +447,139 @@ void ComputeUpsampledColorAndWeight(const AccumulationPassCommonParams params, F
 
     const FfxFloat32 fKernelBias             = ffxLerp(fKernelBiasMin, fKernelBiasMax, fKernelBiasWeight);
     
+#if FFX_FSR3UPSCALER_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS
+    // Unrolled accumulation of the nine loaded samples on Scarlett, two samples per packed operation
+    const bool coordX0OnScreen = iSrcSamplePosX01.x == iSrcSamplePosX01Clamped.x;
+    const bool coordX1OnScreen = iSrcSamplePosX01.y == iSrcSamplePosX01Clamped.y;
+    const bool coordX2OnScreen = iSrcSamplePosX23.x == iSrcSamplePosX23Clamped.x;
+
+    const bool coordY0OnScreen = iSrcSamplePosY01.x == iSrcSamplePosY01Clamped.x;
+    const bool coordY1OnScreen = iSrcSamplePosY01.y == iSrcSamplePosY01Clamped.y;
+    const bool coordY2OnScreen = iSrcSamplePosY23.x == iSrcSamplePosY23Clamped.x;
+
+    const FFX_MIN16_F2 fBaseSampleOffsetHalf = FFX_MIN16_F2(fBaseSampleOffset);
+
+    const FFX_MIN16_F2 fSrcSampleOffsetX_02 = fBaseSampleOffsetHalf.xx + fSampleOffsetX02;
+    const FFX_MIN16_F2 fSrcSampleOffsetY_02 = fBaseSampleOffsetHalf.yy + fSampleOffsetY02;
+
+    const FFX_MIN16_F2 fSrcSampleOffsetXSq_02 = fSrcSampleOffsetX_02 * fSrcSampleOffsetX_02;
+    const FFX_MIN16_F2 fSrcSampleOffsetYSq_02 = fSrcSampleOffsetY_02 * fSrcSampleOffsetY_02;
+    const FFX_MIN16_F2 fSrcSampleOffsetXYSq_11 = fBaseSampleOffsetHalf * fBaseSampleOffsetHalf;
+
+    const FfxFloat32 fRectificationCurveBias = -2.3f;
+    PairedRectificationBoxAndAccumulatedColorAndWeight pairedBox;
+    pairedBox.setKernelBiasAndRectificationCurveBias(fKernelBias, fRectificationCurveBias);
+
+    // init by o o o
+    //         o x o
+    //         o o o
+    pairedBox.initBox(
+        fSrcSampleOffsetXYSq_11.x + fSrcSampleOffsetXYSq_11.y,
+        Bool2ToFloat16x2(coordX1OnScreen && coordY1OnScreen, false).x,
+        CenterR, CenterG, CenterB
+    );
+
+    // add remaining two samples from 1st row x o x
+    //                                        o * o
+    //                                        o o o
+    pairedBox.addBoxSample(
+        fSrcSampleOffsetXSq_02 + fSrcSampleOffsetYSq_02.xx,
+        Bool2ToFloat16x2(coordX0OnScreen && coordY0OnScreen, coordX2OnScreen && coordY0OnScreen),
+        TopCornerR, TopCornerG, TopCornerB
+    );
+
+    // add two samples from 3rd row * o *
+    //                              o * o
+    //                              x o x
+    pairedBox.addBoxSample(
+        fSrcSampleOffsetXSq_02 + fSrcSampleOffsetYSq_02.yy,
+        Bool2ToFloat16x2(coordX0OnScreen && coordY2OnScreen, coordX2OnScreen && coordY2OnScreen),
+        BotCornerR, BotCornerG, BotCornerB
+    );
+
+    // add two samples from 2nd row * o *
+    //                              x * x
+    //                              * o *
+    pairedBox.addBoxSample(
+        fSrcSampleOffsetXSq_02 + fSrcSampleOffsetXYSq_11.yy,
+        Bool2ToFloat16x2(coordX0OnScreen && coordY1OnScreen, coordX2OnScreen && coordY1OnScreen),
+        HorzR, HorzG, HorzB
+    );
+
+    // add remaining samples * x *
+    //                       * * *
+    //                       * x *
+    pairedBox.addBoxSample(
+        fSrcSampleOffsetXYSq_11.xx + fSrcSampleOffsetYSq_02,
+        Bool2ToFloat16x2(coordX1OnScreen && coordY0OnScreen, coordX1OnScreen && coordY2OnScreen),
+        VertR, VertG, VertB
+    );
+
+    FfxFloat32x2 boxCenterAndVecR, boxCenterAndVecG, boxCenterAndVecB;
+    FfxFloat32 boxCenterWeight;
+    pairedBox.finalizeBox(boxCenterAndVecR, boxCenterAndVecG, boxCenterAndVecB, boxCenterWeight);
+
+    if (!bIsInitialSample)
+    {
+        pairedBox.initUpscaledColor(
+            fSrcSampleOffsetXYSq_11.x + fSrcSampleOffsetXYSq_11.y,
+            Bool2ToFloat16x2(coordX1OnScreen && coordY1OnScreen, false).x,
+            CenterR, CenterG, CenterB
+        );
+
+        // add remaining two samples from 1st row x o x
+        //                                        o * o
+        //                                        o o o
+        pairedBox.addUpscaledColorSample(
+            fSrcSampleOffsetXSq_02 + fSrcSampleOffsetYSq_02.xx,
+            Bool2ToFloat16x2(coordX0OnScreen && coordY0OnScreen, coordX2OnScreen && coordY0OnScreen),
+            TopCornerR, TopCornerG, TopCornerB
+        );
+
+        // add two samples from 3rd row * o *
+        //                              o * o
+        //                              x o x
+        pairedBox.addUpscaledColorSample(
+            fSrcSampleOffsetXSq_02 + fSrcSampleOffsetYSq_02.yy,
+            Bool2ToFloat16x2(coordX0OnScreen && coordY2OnScreen, coordX2OnScreen && coordY2OnScreen),
+            BotCornerR, BotCornerG, BotCornerB
+        );
+
+        // add two samples from 2nd row * o *
+        //                              x * x
+        //                              * o *
+        pairedBox.addUpscaledColorSample(
+            fSrcSampleOffsetXSq_02 + fSrcSampleOffsetXYSq_11.yy,
+            Bool2ToFloat16x2(coordX0OnScreen && coordY1OnScreen, coordX2OnScreen && coordY1OnScreen),
+            HorzR, HorzG, HorzB
+        );
+
+        // add remaining samples * x *
+        //                       * * *
+        //                       * x *
+        pairedBox.addUpscaledColorSample(
+            fSrcSampleOffsetXYSq_11.xx + fSrcSampleOffsetYSq_02,
+            Bool2ToFloat16x2(coordX1OnScreen && coordY0OnScreen, coordX1OnScreen && coordY2OnScreen),
+            VertR, VertG, VertB
+        );
+
+        FfxFloat32x4 upscaledColorAndWeight = 0.0;
+        pairedBox.finalizeUpscaledColor(upscaledColorAndWeight);
+
+        data.fUpsampledColor    = FfxFloat32x3(upscaledColorAndWeight.rgb);
+        data.fUpsampledWeight   = FfxFloat32(upscaledColorAndWeight.w);
+    }
+
+    FFX_MIN16_F2 aabbMinMaxR = Compute3x3SamplesMinMaxPaired(CenterR, TopCornerR, BotCornerR, HorzR, VertR);
+    FFX_MIN16_F2 aabbMinMaxG = Compute3x3SamplesMinMaxPaired(CenterG, TopCornerG, BotCornerG, HorzG, VertG);
+    FFX_MIN16_F2 aabbMinMaxB = Compute3x3SamplesMinMaxPaired(CenterB, TopCornerB, BotCornerB, HorzB, VertB);
+
+    data.clippingBox.boxCenter          = FfxFloat32x3(boxCenterAndVecR.x, boxCenterAndVecG.x, boxCenterAndVecB.x);
+    data.clippingBox.boxVec             = FfxFloat32x3(boxCenterAndVecR.y, boxCenterAndVecG.y, boxCenterAndVecB.y);
+    data.clippingBox.aabbMin            = FfxFloat32x3(aabbMinMaxR.x, aabbMinMaxG.x, aabbMinMaxB.x);
+    data.clippingBox.aabbMax            = FfxFloat32x3(aabbMinMaxR.y, aabbMinMaxG.y, aabbMinMaxB.y);
+    data.clippingBox.fBoxCenterWeight   = FfxFloat32(boxCenterWeight);
+#else
 
     iSampleIndex = 0;
 
@@ -158,6 +616,8 @@ void ComputeUpsampledColorAndWeight(const AccumulationPassCommonParams params, F
             ++iSampleIndex;
         }
     }
+	
+#endif // #if FFX_FSR3UPSCALER_USE_XBOX_PAIRED_16BIT_MATH_OPTIMIZATIONS
 
     RectificationBoxComputeVarianceBoxData(data.clippingBox);