From d65cc3a35de270a1011c8158810e3d56c579953f Mon Sep 17 00:00:00 2001
From: Nico de Poel <ndepoel@gmail.com>
Date: Tue, 29 Oct 2024 18:30:00 +0100
Subject: [PATCH] Added standalone CAS sharpening shader

---
 Runtime/Common/ConstantsBuffer.cs             |   50 +
 Runtime/Common/ConstantsBuffer.cs.meta        |    3 +
 Shaders/ffx_cas_sharpen_pass.compute          |   11 +
 Shaders/ffx_cas_sharpen_pass.compute.meta     |    8 +
 Shaders/shaders/cas.meta                      |    8 +
 Shaders/shaders/cas/ffx_cas.h                 | 1271 +++++++++++++++++
 Shaders/shaders/cas/ffx_cas.h.meta            |   65 +
 Shaders/shaders/cas/ffx_cas_callbacks_hlsl.h  |  226 +++
 .../shaders/cas/ffx_cas_callbacks_hlsl.h.meta |   65 +
 Shaders/shaders/cas/ffx_cas_resources.h       |   41 +
 Shaders/shaders/cas/ffx_cas_resources.h.meta  |   65 +
 Shaders/shaders/cas/ffx_cas_sharpen.h         |   89 ++
 Shaders/shaders/cas/ffx_cas_sharpen.h.meta    |   65 +
 Shaders/shaders/ffx_cas_sharpen_pass.hlsl     |   54 +
 .../shaders/ffx_cas_sharpen_pass.hlsl.meta    |    7 +
 15 files changed, 2028 insertions(+)
 create mode 100644 Runtime/Common/ConstantsBuffer.cs
 create mode 100644 Runtime/Common/ConstantsBuffer.cs.meta
 create mode 100644 Shaders/ffx_cas_sharpen_pass.compute
 create mode 100644 Shaders/ffx_cas_sharpen_pass.compute.meta
 create mode 100644 Shaders/shaders/cas.meta
 create mode 100644 Shaders/shaders/cas/ffx_cas.h
 create mode 100644 Shaders/shaders/cas/ffx_cas.h.meta
 create mode 100644 Shaders/shaders/cas/ffx_cas_callbacks_hlsl.h
 create mode 100644 Shaders/shaders/cas/ffx_cas_callbacks_hlsl.h.meta
 create mode 100644 Shaders/shaders/cas/ffx_cas_resources.h
 create mode 100644 Shaders/shaders/cas/ffx_cas_resources.h.meta
 create mode 100644 Shaders/shaders/cas/ffx_cas_sharpen.h
 create mode 100644 Shaders/shaders/cas/ffx_cas_sharpen.h.meta
 create mode 100644 Shaders/shaders/ffx_cas_sharpen_pass.hlsl
 create mode 100644 Shaders/shaders/ffx_cas_sharpen_pass.hlsl.meta
diff --git a/Runtime/Common/ConstantsBuffer.cs b/Runtime/Common/ConstantsBuffer.cs
new file mode 100644
index 0000000..98fbe54
--- /dev/null
+++ b/Runtime/Common/ConstantsBuffer.cs
@@ -0,0 +1,50 @@
+﻿using System.Runtime.InteropServices;
+using UnityEngine;
+using UnityEngine.Rendering;
+
+namespace FidelityFX
+{
+	/// <summary>
+	/// Convenience class for handling a constants buffer containing a single struct item.
+	/// This wraps the compute buffer and the value array, as well as providing easy access to both.
+	/// </summary>
+	public class ConstantsBuffer<TConst>
+		where TConst: struct
+	{
+		private ComputeBuffer _computeBuffer;
+		private readonly TConst[] _constArray = { new TConst() };
+
+		/// <summary>The single constants struct held by this wrapper, exposed by reference for in-place edits.</summary>
+		/// <remarks>Edits stay CPU-side until <see cref="UpdateBufferData"/> is recorded and that command buffer runs.</remarks>
+		public ref TConst Value => ref _constArray[0];
+
+		/// <summary>Creates a wrapper and allocates its compute buffer.</summary>
+		public static ConstantsBuffer<TConst> Create()
+		{
+			ConstantsBuffer<TConst> buffer = new();
+			buffer.Init();
+			return buffer;
+		}
+
+		/// <summary>Allocates a constant buffer with room for exactly one <typeparamref name="TConst"/>.</summary>
+		public void Init()
+		{
+			// Release a buffer left over from an earlier Init(); overwriting the field would leak it.
+			Destroy();
+			_computeBuffer = new ComputeBuffer(1, Marshal.SizeOf<TConst>(), ComputeBufferType.Constant);
+		}
+
+		/// <summary>Records a copy of the current <see cref="Value"/> into the compute buffer.</summary>
+		public void UpdateBufferData(CommandBuffer commandBuffer) => commandBuffer.SetBufferData(_computeBuffer, _constArray);
+
+		/// <summary>Releases the compute buffer if one is allocated; calling it again is a no-op.</summary>
+		public void Destroy()
+		{
+			_computeBuffer?.Release();
+			_computeBuffer = null;
+		}
+
+		/// <summary>Yields the wrapped compute buffer, or null for a null or uninitialized wrapper.</summary>
+		public static implicit operator ComputeBuffer(ConstantsBuffer<TConst> constants) => constants?._computeBuffer;
+	}
+}
diff --git a/Runtime/Common/ConstantsBuffer.cs.meta b/Runtime/Common/ConstantsBuffer.cs.meta
new file mode 100644
index 0000000..8cacdb6
--- /dev/null
+++ b/Runtime/Common/ConstantsBuffer.cs.meta
@@ -0,0 +1,3 @@
+fileFormatVersion: 2
+guid: c423d73295de440dade5b92337efc50a
+timeCreated: 1729668780
\ No newline at end of file
diff --git a/Shaders/ffx_cas_sharpen_pass.compute b/Shaders/ffx_cas_sharpen_pass.compute
new file mode 100644
index 0000000..a8bd30d
--- /dev/null
+++ b/Shaders/ffx_cas_sharpen_pass.compute
@@ -0,0 +1,11 @@
+#pragma kernel CS
+
+#define FFX_CAS_OPTION_SHARPEN_ONLY     1
+#define FFX_CAS_COLOR_SPACE_CONVERSION  0   // Linear color space
+
+#define FFX_GPU         // Compiling for GPU
+#define FFX_HLSL        // Compile for plain HLSL
+
+#pragma warning(disable: 3571)
+
+#include "shaders/ffx_cas_sharpen_pass.hlsl"
diff --git a/Shaders/ffx_cas_sharpen_pass.compute.meta b/Shaders/ffx_cas_sharpen_pass.compute.meta
new file mode 100644
index 0000000..1369384
--- /dev/null
+++ b/Shaders/ffx_cas_sharpen_pass.compute.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: 00e3ffafadd35564780d8a12adcbeff7
+ComputeShaderImporter:
+  externalObjects: {}
+  preprocessorOverride: 0
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
diff --git a/Shaders/shaders/cas.meta b/Shaders/shaders/cas.meta
new file mode 100644
index 0000000..a2a4885
--- /dev/null
+++ b/Shaders/shaders/cas.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: 94edab5297308bd4fae936da8ce22a37
+folderAsset: yes
+DefaultImporter:
+  externalObjects: {}
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
diff --git a/Shaders/shaders/cas/ffx_cas.h b/Shaders/shaders/cas/ffx_cas.h
new file mode 100644
index 0000000..3ef2ad7
--- /dev/null
+++ b/Shaders/shaders/cas/ffx_cas.h
@@ -0,0 +1,1271 @@
+// This file is part of the FidelityFX SDK.
+//
+// Copyright (C) 2024 Advanced Micro Devices, Inc.
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files(the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions :
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+/// @defgroup FfxGPUCas FidelityFX CAS
+/// FidelityFX Contrast Adaptive Sharpening GPU documentation
+/// 
+/// @ingroup FfxGPUEffects
+
+/// The maximum scaling ratio that CAS can support.
+///
+/// @ingroup FfxGPUCas
+#define FFX_CAS_AREA_LIMIT      (4.0)
+
+/// A function to check if the scaling ratio is supported by CAS.
+/// 
+/// Contrast Adaptive Sharpening (CAS) supports a maximum scaling ratio expressed in <c><i>FFX_CAS_AREA_LIMIT</i></c>.
+/// 
+/// @param [in] outX                The width of the target output, expressed in pixels.
+/// @param [in] outY                The height of the target output, expressed in pixels.
+/// @param [in] inX                 The width of the input surface, expressed in pixels.
+/// @param [in] inY                 The height of the input surface, expressed in pixels.
+///
+/// @returns
+/// True if CAS supports scaling in the given configuration.
+/// 
+/// @ingroup FfxGPUCas
+FfxUInt32 ffxCasSupportScaling(
+    FFX_PARAMETER_IN FfxFloat32 outX, FFX_PARAMETER_IN FfxFloat32 outY,
+    FFX_PARAMETER_IN FfxFloat32 inX, FFX_PARAMETER_IN FfxFloat32 inY)
+{
+    // Output area times the reciprocal of the input area; the result is 1 while it stays within the limit, else 0.
+    FfxFloat32 areaRatio = (outX * outY) * ffxReciprocal(inX * inY);
+    return FfxUInt32(areaRatio <= FFX_CAS_AREA_LIMIT);
+}
+
+/// Call to setup required constant values (works on CPU or GPU).
+///
+/// @param [out] const0                 The first 4 32-bit values of the constant buffer which is populated by this function.
+/// @param [out] const1                 The second 4 32-bit values of the constant buffer which is populated by this function.
+/// @param [in] sharpness               Set to 0 for the default (lower ringing), 1 for maximum (highest ringing).
+/// @param [in] inputSizeInPixelsX      The size of the input resolution in the X dimension.
+/// @param [in] inputSizeInPixelsY      The size of the input resolution in the Y dimension.
+/// @param [in] outputSizeInPixelsX     The size of the output resolution in the X dimension.
+/// @param [in] outputSizeInPixelsY     The size of the output resolution in the Y dimension.
+/// 
+/// @ingroup FfxGPUCas
+FFX_STATIC void ffxCasSetup(
+    FFX_PARAMETER_INOUT FfxUInt32x4 const0,
+    FFX_PARAMETER_INOUT FfxUInt32x4 const1,
+    FFX_PARAMETER_IN FfxFloat32 sharpness,
+    FFX_PARAMETER_IN FfxFloat32 inputSizeInPixelsX,
+    FFX_PARAMETER_IN FfxFloat32 inputSizeInPixelsY,
+    FFX_PARAMETER_IN FfxFloat32 outputSizeInPixelsX,
+    FFX_PARAMETER_IN FfxFloat32 outputSizeInPixelsY)
+{
+    // Peak filter weight: -1/8 at sharpness 0 through -1/5 at sharpness 1 (assuming ffxLerp(a, b, t) runs from a to b).
+    FfxFloat32   peakWeight     = -ffxReciprocal(ffxLerp(8.0, 5.0, ffxSaturate(sharpness)));
+    FfxFloat32x2 peakWeightPair = {peakWeight, 0.0};
+    // const0: input/output size ratio for X and Y, followed by (0.5 * ratio - 0.5) for X and Y, all as float bits.
+    const0[0] = ffxAsUInt32(inputSizeInPixelsX * ffxReciprocal(outputSizeInPixelsX));
+    const0[1] = ffxAsUInt32(inputSizeInPixelsY * ffxReciprocal(outputSizeInPixelsY));
+    const0[2] = ffxAsUInt32(FfxFloat32(0.5) * inputSizeInPixelsX * ffxReciprocal(outputSizeInPixelsX) - FfxFloat32(0.5));
+    const0[3] = ffxAsUInt32(FfxFloat32(0.5) * inputSizeInPixelsY * ffxReciprocal(outputSizeInPixelsY) - FfxFloat32(0.5));
+    // const1: peak weight as float bits, the same weight packed as a half pair, 8x the X ratio, then zero.
+    const1[0] = ffxAsUInt32(peakWeight);
+    const1[1] = ffxPackHalf2x16(peakWeightPair);
+    const1[2] = ffxAsUInt32(FfxFloat32(8.0) * inputSizeInPixelsX * ffxReciprocal(outputSizeInPixelsX));
+    const1[3] = 0;
+}
+
+#if defined(FFX_GPU)
+#if defined(FFX_CAS_PACKED_ONLY)
+// FFX_CAS_PACKED_ONLY: do-nothing stand-ins for the FP32 callbacks so that code calling them still compiles.
+FfxFloat32x3 casLoad(FFX_PARAMETER_IN FfxInt32x2 position)
+{
+    // The position is ignored; every load yields black.
+    return FfxFloat32x3(0.0, 0.0, 0.0);
+}
+
+void casInput(
+    FFX_PARAMETER_INOUT FfxFloat32 red, FFX_PARAMETER_INOUT FfxFloat32 green, FFX_PARAMETER_INOUT FfxFloat32 blue)
+{
+    // Intentionally empty: the colour passes through unchanged.
+}
+#endif // #if defined(FFX_CAS_PACKED_ONLY)
+
+// No scaling algorithm uses minimal 3x3 pixel neighborhood.
+// Sharpens the pixel at samplePosition and writes the result, saturated to 0..1, to outPixelRed/Green/Blue.
+// const1.x holds the peak filter weight as float bits; const0 is not read by this variant.
+void casFilterNoScaling(
+    FFX_PARAMETER_OUT FfxFloat32 outPixelRed,
+    FFX_PARAMETER_OUT FfxFloat32 outPixelGreen,
+    FFX_PARAMETER_OUT FfxFloat32 outPixelBlue,
+    FFX_PARAMETER_IN FfxUInt32x2 samplePosition,
+    FFX_PARAMETER_IN FfxUInt32x4 const0,
+    FFX_PARAMETER_IN FfxUInt32x4 const1)
+{
+    // Load a collection of samples in a 3x3 neighborhood, where e is the current pixel.
+    // a b c
+    // d e f
+    // g h i
+    FfxFloat32x3 sampleA = casLoad(FfxInt32x2(samplePosition) + FfxInt32x2(-1, -1));
+    FfxFloat32x3 sampleB = casLoad(FfxInt32x2(samplePosition) + FfxInt32x2(0, -1));
+    FfxFloat32x3 sampleC = casLoad(FfxInt32x2(samplePosition) + FfxInt32x2(1, -1));
+    FfxFloat32x3 sampleD = casLoad(FfxInt32x2(samplePosition) + FfxInt32x2(-1, 0));
+    FfxFloat32x3 sampleE = casLoad(FfxInt32x2(samplePosition));
+    FfxFloat32x3 sampleF = casLoad(FfxInt32x2(samplePosition) + FfxInt32x2(1, 0));
+    FfxFloat32x3 sampleG = casLoad(FfxInt32x2(samplePosition) + FfxInt32x2(-1, 1));
+    FfxFloat32x3 sampleH = casLoad(FfxInt32x2(samplePosition) + FfxInt32x2(0, 1));
+    FfxFloat32x3 sampleI = casLoad(FfxInt32x2(samplePosition) + FfxInt32x2(1, 1));
+
+    // Run optional input transform.
+    casInput(sampleA.r, sampleA.g, sampleA.b);
+    casInput(sampleB.r, sampleB.g, sampleB.b);
+    casInput(sampleC.r, sampleC.g, sampleC.b);
+    casInput(sampleD.r, sampleD.g, sampleD.b);
+    casInput(sampleE.r, sampleE.g, sampleE.b);
+    casInput(sampleF.r, sampleF.g, sampleF.b);
+    casInput(sampleG.r, sampleG.g, sampleG.b);
+    casInput(sampleH.r, sampleH.g, sampleH.b);
+    casInput(sampleI.r, sampleI.g, sampleI.b);
+
+    // Soft min and max.
+    //  a b c             b
+    //  d e f * 0.5  +  d e f * 0.5
+    //  g h i             h
+    // These are 2.0x bigger (factored out the extra multiply).
+    FfxFloat32 minimumRed   = ffxMin3(ffxMin3(sampleD.r, sampleE.r, sampleF.r), sampleB.r, sampleH.r);
+    FfxFloat32 minimumGreen = ffxMin3(ffxMin3(sampleD.g, sampleE.g, sampleF.g), sampleB.g, sampleH.g);
+    FfxFloat32 minimumBlue  = ffxMin3(ffxMin3(sampleD.b, sampleE.b, sampleF.b), sampleB.b, sampleH.b);
+    FfxFloat32 maximumRed   = ffxMax3(ffxMax3(sampleD.r, sampleE.r, sampleF.r), sampleB.r, sampleH.r);
+    FfxFloat32 maximumGreen = ffxMax3(ffxMax3(sampleD.g, sampleE.g, sampleF.g), sampleB.g, sampleH.g);
+    FfxFloat32 maximumBlue  = ffxMax3(ffxMax3(sampleD.b, sampleE.b, sampleF.b), sampleB.b, sampleH.b);
+
+#if defined(FFX_CAS_BETTER_DIAGONALS)
+    FfxFloat32 minimumRed2   = ffxMin3(ffxMin3(minimumRed, sampleA.r, sampleC.r), sampleG.r, sampleI.r);
+    FfxFloat32 minimumGreen2 = ffxMin3(ffxMin3(minimumGreen, sampleA.g, sampleC.g), sampleG.g, sampleI.g);
+    FfxFloat32 minimumBlue2  = ffxMin3(ffxMin3(minimumBlue, sampleA.b, sampleC.b), sampleG.b, sampleI.b);
+    minimumRed               = minimumRed + minimumRed2;
+    minimumGreen             = minimumGreen + minimumGreen2;
+    minimumBlue              = minimumBlue + minimumBlue2;
+    FfxFloat32 maximumRed2   = ffxMax3(ffxMax3(maximumRed, sampleA.r, sampleC.r), sampleG.r, sampleI.r);
+    FfxFloat32 maximumGreen2 = ffxMax3(ffxMax3(maximumGreen, sampleA.g, sampleC.g), sampleG.g, sampleI.g);
+    FfxFloat32 maximumBlue2  = ffxMax3(ffxMax3(maximumBlue, sampleA.b, sampleC.b), sampleG.b, sampleI.b);
+    maximumRed               = maximumRed + maximumRed2;
+    maximumGreen             = maximumGreen + maximumGreen2;
+    maximumBlue              = maximumBlue + maximumBlue2;
+#endif  // #if defined(FFX_CAS_BETTER_DIAGONALS)
+
+    // Smooth minimum distance to signal limit divided by smooth max.
+#if defined(FFX_CAS_USE_PRECISE_MATH)
+    FfxFloat32 reciprocalMaximumRed   = ffxReciprocal(maximumRed);
+    FfxFloat32 reciprocalMaximumGreen = ffxReciprocal(maximumGreen);
+    FfxFloat32 reciprocalMaximumBlue  = ffxReciprocal(maximumBlue);
+#else
+    FfxFloat32 reciprocalMaximumRed   = ffxApproximateReciprocal(maximumRed);
+    FfxFloat32 reciprocalMaximumGreen = ffxApproximateReciprocal(maximumGreen);
+    FfxFloat32 reciprocalMaximumBlue  = ffxApproximateReciprocal(maximumBlue);
+#endif  // #if defined(FFX_CAS_USE_PRECISE_MATH)
+
+    // The signal limit must match the scale of the min/max above: they are sums of two terms (2.0x)
+    // only when FFX_CAS_BETTER_DIAGONALS is defined, so that switch - not the math precision - selects it.
+#if defined(FFX_CAS_BETTER_DIAGONALS)
+    FfxFloat32 amplifyRed   = ffxSaturate(ffxMin(minimumRed, FfxFloat32(2.0) - maximumRed) * reciprocalMaximumRed);
+    FfxFloat32 amplifyGreen = ffxSaturate(ffxMin(minimumGreen, FfxFloat32(2.0) - maximumGreen) * reciprocalMaximumGreen);
+    FfxFloat32 amplifyBlue  = ffxSaturate(ffxMin(minimumBlue, FfxFloat32(2.0) - maximumBlue) * reciprocalMaximumBlue);
+#else
+    FfxFloat32 amplifyRed   = ffxSaturate(ffxMin(minimumRed, FfxFloat32(1.0) - maximumRed) * reciprocalMaximumRed);
+    FfxFloat32 amplifyGreen = ffxSaturate(ffxMin(minimumGreen, FfxFloat32(1.0) - maximumGreen) * reciprocalMaximumGreen);
+    FfxFloat32 amplifyBlue  = ffxSaturate(ffxMin(minimumBlue, FfxFloat32(1.0) - maximumBlue) * reciprocalMaximumBlue);
+#endif  // #if defined(FFX_CAS_BETTER_DIAGONALS)
+
+    // Shaping amount of sharpening.
+#if defined(FFX_CAS_USE_PRECISE_MATH)
+    amplifyRed   = ffxSqrt(amplifyRed);
+    amplifyGreen = ffxSqrt(amplifyGreen);
+    amplifyBlue  = ffxSqrt(amplifyBlue);
+#else
+    amplifyRed   = ffxApproximateSqrt(amplifyRed);
+    amplifyGreen = ffxApproximateSqrt(amplifyGreen);
+    amplifyBlue  = ffxApproximateSqrt(amplifyBlue);
+#endif  // #if defined(FFX_CAS_USE_PRECISE_MATH)
+
+    // Filter shape.
+    //  0 w 0
+    //  w 1 w
+    //  0 w 0
+    FfxFloat32 peak = ffxAsFloat(const1.x);
+    FfxFloat32x3 weight = FfxFloat32x3(amplifyRed * peak, amplifyGreen * peak, amplifyBlue * peak);
+
+    // Filter using green coef only, depending on dead code removal to strip out the extra overhead.
+#if defined(FFX_CAS_USE_PRECISE_MATH)
+    FfxFloat32 reciprocalWeight = ffxReciprocal(FfxFloat32(1.0) + FfxFloat32(4.0) * weight.g);
+#else
+    FfxFloat32 reciprocalWeight = ffxApproximateReciprocalMedium(FfxFloat32(1.0) + FfxFloat32(4.0) * weight.g);
+#endif  // #if defined(FFX_CAS_USE_PRECISE_MATH)
+
+    outPixelRed   = ffxSaturate((sampleB.r * weight.g + sampleD.r * weight.g + sampleF.r * weight.g + sampleH.r * weight.g + sampleE.r) * reciprocalWeight);
+    outPixelGreen = ffxSaturate((sampleB.g * weight.g + sampleD.g * weight.g + sampleF.g * weight.g + sampleH.g * weight.g + sampleE.g) * reciprocalWeight);
+    outPixelBlue  = ffxSaturate((sampleB.b * weight.g + sampleD.b * weight.g + sampleF.b * weight.g + sampleH.b * weight.g + sampleE.b) * reciprocalWeight);
+}
+
+#if FFX_HALF == 1
+// Half-precision variant of the no-scaling filter: sharpens two pixels per call, 8 columns apart, packed as x/y of each FfxFloat16x2.
+void casFilterNoScalingHalf(
+    FFX_PARAMETER_OUT FfxFloat16x2 outPixelRed,
+    FFX_PARAMETER_OUT FfxFloat16x2 outPixelGreen,
+    FFX_PARAMETER_OUT FfxFloat16x2 outPixelBlue,
+    FFX_PARAMETER_IN FfxUInt32x2 samplePosition,
+    FFX_PARAMETER_IN FfxUInt32x4 const0,  // Not read by this function.
+    FFX_PARAMETER_IN FfxUInt32x4 const1)  // Only const1.y is read here.
+{
+    FfxInt16x2   samplePosition0 = FfxInt16x2(samplePosition);  // First pixel; samples a..i are its 3x3 neighborhood.
+    FfxFloat16x3 sampleA0 = casLoadHalf(samplePosition0 + FfxInt16x2(-1, -1));
+    FfxFloat16x3 sampleB0 = casLoadHalf(samplePosition0 + FfxInt16x2(0, -1));
+    FfxFloat16x3 sampleC0 = casLoadHalf(samplePosition0 + FfxInt16x2(1, -1));
+    FfxFloat16x3 sampleD0 = casLoadHalf(samplePosition0 + FfxInt16x2(-1, 0));
+    FfxFloat16x3 sampleE0 = casLoadHalf(samplePosition0);
+    FfxFloat16x3 sampleF0 = casLoadHalf(samplePosition0 + FfxInt16x2(1, 0));
+    FfxFloat16x3 sampleG0 = casLoadHalf(samplePosition0 + FfxInt16x2(-1, 1));
+    FfxFloat16x3 sampleH0 = casLoadHalf(samplePosition0 + FfxInt16x2(0, 1));
+    FfxFloat16x3 sampleI0 = casLoadHalf(samplePosition0 + FfxInt16x2(1, 1));
+    FfxInt16x2   samplePosition1 = samplePosition0 + FfxInt16x2(8, 0);  // Second pixel, 8 columns to the right.
+    FfxFloat16x3 sampleA1  = casLoadHalf(samplePosition1 + FfxInt16x2(-1, -1));
+    FfxFloat16x3 sampleB1  = casLoadHalf(samplePosition1 + FfxInt16x2(0, -1));
+    FfxFloat16x3 sampleC1  = casLoadHalf(samplePosition1 + FfxInt16x2(1, -1));
+    FfxFloat16x3 sampleD1  = casLoadHalf(samplePosition1 + FfxInt16x2(-1, 0));
+    FfxFloat16x3 sampleE1  = casLoadHalf(samplePosition1);
+    FfxFloat16x3 sampleF1  = casLoadHalf(samplePosition1 + FfxInt16x2(1, 0));
+    FfxFloat16x3 sampleG1  = casLoadHalf(samplePosition1 + FfxInt16x2(-1, 1));
+    FfxFloat16x3 sampleH1  = casLoadHalf(samplePosition1 + FfxInt16x2(0, 1));
+    FfxFloat16x3 sampleI1  = casLoadHalf(samplePosition1 + FfxInt16x2(1, 1));
+
+    // AOS to SOA conversion: per channel, x is the first pixel's sample and y is the second pixel's sample.
+    FfxFloat16x2 aR = FfxFloat16x2(sampleA0.r, sampleA1.r);
+    FfxFloat16x2 aG = FfxFloat16x2(sampleA0.g, sampleA1.g);
+    FfxFloat16x2 aB = FfxFloat16x2(sampleA0.b, sampleA1.b);
+    FfxFloat16x2 bR = FfxFloat16x2(sampleB0.r, sampleB1.r);
+    FfxFloat16x2 bG = FfxFloat16x2(sampleB0.g, sampleB1.g);
+    FfxFloat16x2 bB = FfxFloat16x2(sampleB0.b, sampleB1.b);
+    FfxFloat16x2 cR = FfxFloat16x2(sampleC0.r, sampleC1.r);
+    FfxFloat16x2 cG = FfxFloat16x2(sampleC0.g, sampleC1.g);
+    FfxFloat16x2 cB = FfxFloat16x2(sampleC0.b, sampleC1.b);
+    FfxFloat16x2 dR = FfxFloat16x2(sampleD0.r, sampleD1.r);
+    FfxFloat16x2 dG = FfxFloat16x2(sampleD0.g, sampleD1.g);
+    FfxFloat16x2 dB = FfxFloat16x2(sampleD0.b, sampleD1.b);
+    FfxFloat16x2 eR = FfxFloat16x2(sampleE0.r, sampleE1.r);
+    FfxFloat16x2 eG = FfxFloat16x2(sampleE0.g, sampleE1.g);
+    FfxFloat16x2 eB = FfxFloat16x2(sampleE0.b, sampleE1.b);
+    FfxFloat16x2 fR = FfxFloat16x2(sampleF0.r, sampleF1.r);
+    FfxFloat16x2 fG = FfxFloat16x2(sampleF0.g, sampleF1.g);
+    FfxFloat16x2 fB = FfxFloat16x2(sampleF0.b, sampleF1.b);
+    FfxFloat16x2 gR = FfxFloat16x2(sampleG0.r, sampleG1.r);
+    FfxFloat16x2 gG = FfxFloat16x2(sampleG0.g, sampleG1.g);
+    FfxFloat16x2 gB = FfxFloat16x2(sampleG0.b, sampleG1.b);
+    FfxFloat16x2 hR = FfxFloat16x2(sampleH0.r, sampleH1.r);
+    FfxFloat16x2 hG = FfxFloat16x2(sampleH0.g, sampleH1.g);
+    FfxFloat16x2 hB = FfxFloat16x2(sampleH0.b, sampleH1.b);
+    FfxFloat16x2 iR = FfxFloat16x2(sampleI0.r, sampleI1.r);
+    FfxFloat16x2 iG = FfxFloat16x2(sampleI0.g, sampleI1.g);
+    FfxFloat16x2 iB = FfxFloat16x2(sampleI0.b, sampleI1.b);
+
+    // Run optional input transform.
+    casInputHalf(aR, aG, aB);
+    casInputHalf(bR, bG, bB);
+    casInputHalf(cR, cG, cB);
+    casInputHalf(dR, dG, dB);
+    casInputHalf(eR, eG, eB);
+    casInputHalf(fR, fG, fB);
+    casInputHalf(gR, gG, gB);
+    casInputHalf(hR, hG, hB);
+    casInputHalf(iR, iG, iB);
+
+    // Soft min and max over the plus-shaped neighborhood b, d, e, f, h.
+    FfxFloat16x2 minimumRed   = ffxMin(ffxMin(fR, hR), ffxMin(ffxMin(bR, dR), eR));
+    FfxFloat16x2 minimumGreen = ffxMin(ffxMin(fG, hG), ffxMin(ffxMin(bG, dG), eG));
+    FfxFloat16x2 minimumBlue  = ffxMin(ffxMin(fB, hB), ffxMin(ffxMin(bB, dB), eB));
+
+#if defined(FFX_CAS_BETTER_DIAGONALS)
+    FfxFloat16x2 minimumRed2   = ffxMin(ffxMin(gR, iR), ffxMin(ffxMin(aR, cR), minimumRed));
+    FfxFloat16x2 minimumGreen2 = ffxMin(ffxMin(gG, iG), ffxMin(ffxMin(aG, cG), minimumGreen));
+    FfxFloat16x2 minimumBlue2  = ffxMin(ffxMin(gB, iB), ffxMin(ffxMin(aB, cB), minimumBlue));
+    minimumRed                 = minimumRed + minimumRed2;
+    minimumGreen               = minimumGreen + minimumGreen2;
+    minimumBlue                = minimumBlue + minimumBlue2;
+#endif  // #if defined(FFX_CAS_BETTER_DIAGONALS)
+
+    FfxFloat16x2 maximumRed   = max(max(fR, hR), max(max(bR, dR), eR));
+    FfxFloat16x2 maximumGreen = max(max(fG, hG), max(max(bG, dG), eG));
+    FfxFloat16x2 maximumBlue  = max(max(fB, hB), max(max(bB, dB), eB));
+
+#if defined(FFX_CAS_BETTER_DIAGONALS)
+    FfxFloat16x2 maximumRed2   = max(max(gR, iR), max(max(aR, cR), maximumRed));
+    FfxFloat16x2 maximumGreen2 = max(max(gG, iG), max(max(aG, cG), maximumGreen));
+    FfxFloat16x2 maximumBlue2  = max(max(gB, iB), max(max(aB, cB), maximumBlue));
+    maximumRed                 = maximumRed + maximumRed2;
+    maximumGreen               = maximumGreen + maximumGreen2;
+    maximumBlue                = maximumBlue + maximumBlue2;
+#endif  // #if defined(FFX_CAS_BETTER_DIAGONALS)
+
+    // Smooth minimum distance to signal limit divided by smooth max.
+#if defined(FFX_CAS_USE_PRECISE_MATH)
+    FfxFloat16x2 reciprocalMaximumRed   = ffxReciprocalHalf(maximumRed);
+    FfxFloat16x2 reciprocalMaximumGreen = ffxReciprocalHalf(maximumGreen);
+    FfxFloat16x2 reciprocalMaximumBlue  = ffxReciprocalHalf(maximumBlue);
+#else
+    FfxFloat16x2 reciprocalMaximumRed   = ffxApproximateReciprocalHalf(maximumRed);
+    FfxFloat16x2 reciprocalMaximumGreen = ffxApproximateReciprocalHalf(maximumGreen);
+    FfxFloat16x2 reciprocalMaximumBlue  = ffxApproximateReciprocalHalf(maximumBlue);
+#endif  // #if defined(FFX_CAS_USE_PRECISE_MATH)
+    // Signal limit is 2.0 when the min/max above are sums of two terms (better diagonals), 1.0 otherwise.
+#if defined(FFX_CAS_BETTER_DIAGONALS)
+    FfxFloat16x2 amplifyRed   = ffxSaturate(min(minimumRed, FFX_BROADCAST_FLOAT16X2(2.0) - maximumRed) * reciprocalMaximumRed);
+    FfxFloat16x2 amplifyGreen = ffxSaturate(min(minimumGreen, FFX_BROADCAST_FLOAT16X2(2.0) - maximumGreen) * reciprocalMaximumGreen);
+    FfxFloat16x2 amplifyBlue  = ffxSaturate(min(minimumBlue, FFX_BROADCAST_FLOAT16X2(2.0) - maximumBlue) * reciprocalMaximumBlue);
+#else
+    FfxFloat16x2 amplifyRed   = ffxSaturate(min(minimumRed, FFX_BROADCAST_FLOAT16X2(1.0) - maximumRed) * reciprocalMaximumRed);
+    FfxFloat16x2 amplifyGreen = ffxSaturate(min(minimumGreen, FFX_BROADCAST_FLOAT16X2(1.0) - maximumGreen) * reciprocalMaximumGreen);
+    FfxFloat16x2 amplifyBlue  = ffxSaturate(min(minimumBlue, FFX_BROADCAST_FLOAT16X2(1.0) - maximumBlue) * reciprocalMaximumBlue);
+#endif  // #if defined(FFX_CAS_BETTER_DIAGONALS)
+
+    // Shaping amount of sharpening.
+#if defined(FFX_CAS_USE_PRECISE_MATH)
+    amplifyRed   = ffxSqrt(amplifyRed);
+    amplifyGreen = ffxSqrt(amplifyGreen);
+    amplifyBlue  = ffxSqrt(amplifyBlue);
+#else
+    amplifyRed   = ffxApproximateSqrtHalf(amplifyRed);
+    amplifyGreen = ffxApproximateSqrtHalf(amplifyGreen);
+    amplifyBlue  = ffxApproximateSqrtHalf(amplifyBlue);
+#endif  // #if defined(FFX_CAS_USE_PRECISE_MATH)
+
+    // Filter shape: the peak weight is the first half of the pair that ffxCasSetup packs into const1.y.
+    FfxFloat16   peak        = FFX_UINT32_TO_FLOAT16X2(const1.y).x;
+    FfxFloat16x2 weightRed   = amplifyRed * FFX_BROADCAST_FLOAT16X2(peak);  // Computed but not used below.
+    FfxFloat16x2 weightGreen = amplifyGreen * FFX_BROADCAST_FLOAT16X2(peak);
+    FfxFloat16x2 weightBlue  = amplifyBlue * FFX_BROADCAST_FLOAT16X2(peak);  // Computed but not used below.
+    // Filter: every channel is weighted with the green weight.
+#if defined(FFX_CAS_USE_PRECISE_MATH)
+    FfxFloat16x2 reciprocalWeight = ffxReciprocalHalf(FFX_BROADCAST_FLOAT16X2(1.0) + FFX_BROADCAST_FLOAT16X2(4.0) * weightGreen);
+#else
+    FfxFloat16x2 reciprocalWeight = ffxApproximateReciprocalMediumHalf(FFX_BROADCAST_FLOAT16X2(1.0) + FFX_BROADCAST_FLOAT16X2(4.0) * weightGreen);
+#endif  // #if defined(FFX_CAS_USE_PRECISE_MATH)
+
+    outPixelRed = ffxSaturate((bR * weightGreen + dR * weightGreen + fR * weightGreen + hR * weightGreen + eR) * reciprocalWeight);
+    outPixelGreen = ffxSaturate((bG * weightGreen + dG * weightGreen + fG * weightGreen + hG * weightGreen + eG) * reciprocalWeight);
+    outPixelBlue = ffxSaturate((bB * weightGreen + dB * weightGreen + fB * weightGreen + hB * weightGreen + eB) * reciprocalWeight);
+}
+#endif // #if FFX_HALF == 1
+
+// Scaling algorithm adaptively interpolates between nearest 4 results of the non-scaling algorithm.
+void casFilterWithScaling(
+    FFX_PARAMETER_OUT FfxFloat32 pixR,
+    FFX_PARAMETER_OUT FfxFloat32 pixG,
+    FFX_PARAMETER_OUT FfxFloat32 pixB,
+    FFX_PARAMETER_IN FfxUInt32x2 samplePosition,
+    FFX_PARAMETER_IN FfxUInt32x4 const0,
+    FFX_PARAMETER_IN FfxUInt32x4 const1)
+{
+    //  a b c d
+    //  e f g h
+    //  i j k l
+    //  m n o p
+    // Working these 4 results.
+    //  +-----+-----+
+    //  |     |     |
+    //  |  f..|..g  |
+    //  |  .  |  .  |
+    //  +-----+-----+
+    //  |  .  |  .  |
+    //  |  j..|..k  |
+    //  |     |     |
+    //  +-----+-----+
+    FfxFloat32x2 pixelPosition = FfxFloat32x2(samplePosition) * ffxAsFloat(const0.xy) + ffxAsFloat(const0.zw);
+    FfxFloat32x2 floorPixelPosition = floor(pixelPosition);
+    pixelPosition -= floorPixelPosition;
+    FfxInt32x2   finalSamplePosition = FfxInt32x2(floorPixelPosition);
+    FfxFloat32x3 a  = casLoad(finalSamplePosition + FfxInt32x2(-1, -1));
+    FfxFloat32x3 b  = casLoad(finalSamplePosition + FfxInt32x2(0, -1));
+    FfxFloat32x3 e  = casLoad(finalSamplePosition + FfxInt32x2(-1, 0));
+    FfxFloat32x3 f  = casLoad(finalSamplePosition);
+    FfxFloat32x3 c  = casLoad(finalSamplePosition + FfxInt32x2(1, -1));
+    FfxFloat32x3 d  = casLoad(finalSamplePosition + FfxInt32x2(2, -1));
+    FfxFloat32x3 g  = casLoad(finalSamplePosition + FfxInt32x2(1, 0));
+    FfxFloat32x3 h  = casLoad(finalSamplePosition + FfxInt32x2(2, 0));
+    FfxFloat32x3 i  = casLoad(finalSamplePosition + FfxInt32x2(-1, 1));
+    FfxFloat32x3 j  = casLoad(finalSamplePosition + FfxInt32x2(0, 1));
+    FfxFloat32x3 m  = casLoad(finalSamplePosition + FfxInt32x2(-1, 2));
+    FfxFloat32x3 n  = casLoad(finalSamplePosition + FfxInt32x2(0, 2));
+    FfxFloat32x3 k  = casLoad(finalSamplePosition + FfxInt32x2(1, 1));
+    FfxFloat32x3 l  = casLoad(finalSamplePosition + FfxInt32x2(2, 1));
+    FfxFloat32x3 o  = casLoad(finalSamplePosition + FfxInt32x2(1, 2));
+    FfxFloat32x3 p  = casLoad(finalSamplePosition + FfxInt32x2(2, 2));
+
+    // Run optional input transform.
+    casInput(a.r, a.g, a.b);
+    casInput(b.r, b.g, b.b);
+    casInput(c.r, c.g, c.b);
+    casInput(d.r, d.g, d.b);
+    casInput(e.r, e.g, e.b);
+    casInput(f.r, f.g, f.b);
+    casInput(g.r, g.g, g.b);
+    casInput(h.r, h.g, h.b);
+    casInput(i.r, i.g, i.b);
+    casInput(j.r, j.g, j.b);
+    casInput(k.r, k.g, k.b);
+    casInput(l.r, l.g, l.b);
+    casInput(m.r, m.g, m.b);
+    casInput(n.r, n.g, n.b);
+    casInput(o.r, o.g, o.b);
+    casInput(p.r, p.g, p.b);
+
+    // Soft min and max.
+    // These are 2.0x bigger (factored out the extra multiply).
+    //  a b c             b
+    //  e f g * 0.5  +  e f g * 0.5  [F]
+    //  i j k             j
+    FfxFloat32 minimumRed = ffxMin3(ffxMin3(b.r, e.r, f.r), g.r, j.r);
+    FfxFloat32 minimumGreen = ffxMin3(ffxMin3(b.g, e.g, f.g), g.g, j.g);
+    FfxFloat32 minimumBlue = ffxMin3(ffxMin3(b.b, e.b, f.b), g.b, j.b);
+
+#if defined(FFX_CAS_BETTER_DIAGONALS)
+    FfxFloat32 mnfR2 = ffxMin3(ffxMin3(minimumRed, a.r, c.r), i.r, k.r);
+    FfxFloat32 mnfG2 = ffxMin3(ffxMin3(minimumGreen, a.g, c.g), i.g, k.g);
+    FfxFloat32 mnfB2 = ffxMin3(ffxMin3(minimumBlue, a.b, c.b), i.b, k.b);
+    minimumRed       = minimumRed + mnfR2;
+    minimumGreen     = minimumGreen + mnfG2;
+    minimumBlue      = minimumBlue + mnfB2;
+#endif  // #if defined(FFX_CAS_BETTER_DIAGONALS)
+
+    FfxFloat32 mxfR = ffxMax3(ffxMax3(b.r, e.r, f.r), g.r, j.r);
+    FfxFloat32 mxfG = ffxMax3(ffxMax3(b.g, e.g, f.g), g.g, j.g);
+    FfxFloat32 mxfB = ffxMax3(ffxMax3(b.b, e.b, f.b), g.b, j.b);
+
+#if defined(FFX_CAS_BETTER_DIAGONALS)
+    FfxFloat32 mxfR2 = ffxMax3(ffxMax3(mxfR, a.r, c.r), i.r, k.r);
+    FfxFloat32 mxfG2 = ffxMax3(ffxMax3(mxfG, a.g, c.g), i.g, k.g);
+    FfxFloat32 mxfB2 = ffxMax3(ffxMax3(mxfB, a.b, c.b), i.b, k.b);
+    mxfR             = mxfR + mxfR2;
+    mxfG             = mxfG + mxfG2;
+    mxfB             = mxfB + mxfB2;
+#endif  // #if defined(FFX_CAS_BETTER_DIAGONALS)
+
+    //  b c d             c
+    //  f g h * 0.5  +  f g h * 0.5  [G]
+    //  j k l             k
+    FfxFloat32 mngR = ffxMin3(ffxMin3(c.r, f.r, g.r), h.r, k.r);
+    FfxFloat32 mngG = ffxMin3(ffxMin3(c.g, f.g, g.g), h.g, k.g);
+    FfxFloat32 mngB = ffxMin3(ffxMin3(c.b, f.b, g.b), h.b, k.b);
+
+#if defined(FFX_CAS_BETTER_DIAGONALS)
+    FfxFloat32 mngR2 = ffxMin3(ffxMin3(mngR, b.r, d.r), j.r, l.r);
+    FfxFloat32 mngG2 = ffxMin3(ffxMin3(mngG, b.g, d.g), j.g, l.g);
+    FfxFloat32 mngB2 = ffxMin3(ffxMin3(mngB, b.b, d.b), j.b, l.b);
+    mngR             = mngR + mngR2;
+    mngG             = mngG + mngG2;
+    mngB             = mngB + mngB2;
+#endif  // #if defined(FFX_CAS_BETTER_DIAGONALS)
+
+    FfxFloat32 mxgR = ffxMax3(ffxMax3(c.r, f.r, g.r), h.r, k.r);
+    FfxFloat32 mxgG = ffxMax3(ffxMax3(c.g, f.g, g.g), h.g, k.g);
+    FfxFloat32 mxgB = ffxMax3(ffxMax3(c.b, f.b, g.b), h.b, k.b);
+
+#if defined(FFX_CAS_BETTER_DIAGONALS)
+    FfxFloat32 mxgR2 = ffxMax3(ffxMax3(mxgR, b.r, d.r), j.r, l.r);
+    FfxFloat32 mxgG2 = ffxMax3(ffxMax3(mxgG, b.g, d.g), j.g, l.g);
+    FfxFloat32 mxgB2 = ffxMax3(ffxMax3(mxgB, b.b, d.b), j.b, l.b);
+    mxgR             = mxgR + mxgR2;
+    mxgG             = mxgG + mxgG2;
+    mxgB             = mxgB + mxgB2;
+#endif  // #if defined(FFX_CAS_BETTER_DIAGONALS)
+
+    //  e f g             f
+    //  i j k * 0.5  +  i j k * 0.5  [J]
+    //  m n o             n
+    FfxFloat32 mnjR = ffxMin3(ffxMin3(f.r, i.r, j.r), k.r, n.r);
+    FfxFloat32 mnjG = ffxMin3(ffxMin3(f.g, i.g, j.g), k.g, n.g);
+    FfxFloat32 mnjB = ffxMin3(ffxMin3(f.b, i.b, j.b), k.b, n.b);
+
+#if defined(FFX_CAS_BETTER_DIAGONALS)
+    FfxFloat32 mnjR2 = ffxMin3(ffxMin3(mnjR, e.r, g.r), m.r, o.r);
+    FfxFloat32 mnjG2 = ffxMin3(ffxMin3(mnjG, e.g, g.g), m.g, o.g);
+    FfxFloat32 mnjB2 = ffxMin3(ffxMin3(mnjB, e.b, g.b), m.b, o.b);
+    mnjR             = mnjR + mnjR2;
+    mnjG             = mnjG + mnjG2;
+    mnjB             = mnjB + mnjB2;
+#endif  // #if defined(FFX_CAS_BETTER_DIAGONALS)
+
+    FfxFloat32 mxjR = ffxMax3(ffxMax3(f.r, i.r, j.r), k.r, n.r);
+    FfxFloat32 mxjG = ffxMax3(ffxMax3(f.g, i.g, j.g), k.g, n.g);
+    FfxFloat32 mxjB = ffxMax3(ffxMax3(f.b, i.b, j.b), k.b, n.b);
+
+#if defined(FFX_CAS_BETTER_DIAGONALS)
+    FfxFloat32 mxjR2 = ffxMax3(ffxMax3(mxjR, e.r, g.r), m.r, o.r);
+    FfxFloat32 mxjG2 = ffxMax3(ffxMax3(mxjG, e.g, g.g), m.g, o.g);
+    FfxFloat32 mxjB2 = ffxMax3(ffxMax3(mxjB, e.b, g.b), m.b, o.b);
+    mxjR             = mxjR + mxjR2;
+    mxjG             = mxjG + mxjG2;
+    mxjB             = mxjB + mxjB2;
+#endif  // #if defined(FFX_CAS_BETTER_DIAGONALS)
+
+    //  f g h             g
+    //  j k l * 0.5  +  j k l * 0.5  [K]
+    //  n o p             o
+    FfxFloat32 mnkR = ffxMin3(ffxMin3(g.r, j.r, k.r), l.r, o.r);
+    FfxFloat32 mnkG = ffxMin3(ffxMin3(g.g, j.g, k.g), l.g, o.g);
+    FfxFloat32 mnkB = ffxMin3(ffxMin3(g.b, j.b, k.b), l.b, o.b);
+
+#if defined(FFX_CAS_BETTER_DIAGONALS)
+    FfxFloat32 mnkR2 = ffxMin3(ffxMin3(mnkR, f.r, h.r), n.r, p.r);
+    FfxFloat32 mnkG2 = ffxMin3(ffxMin3(mnkG, f.g, h.g), n.g, p.g);
+    FfxFloat32 mnkB2 = ffxMin3(ffxMin3(mnkB, f.b, h.b), n.b, p.b);
+    mnkR             = mnkR + mnkR2;
+    mnkG             = mnkG + mnkG2;
+    mnkB             = mnkB + mnkB2;
+#endif  // #if defined(FFX_CAS_BETTER_DIAGONALS)
+
+    FfxFloat32 mxkR = ffxMax3(ffxMax3(g.r, j.r, k.r), l.r, o.r);
+    FfxFloat32 mxkG = ffxMax3(ffxMax3(g.g, j.g, k.g), l.g, o.g);
+    FfxFloat32 mxkB = ffxMax3(ffxMax3(g.b, j.b, k.b), l.b, o.b);
+
+#if defined(FFX_CAS_BETTER_DIAGONALS)
+    FfxFloat32 mxkR2 = ffxMax3(ffxMax3(mxkR, f.r, h.r), n.r, p.r);
+    FfxFloat32 mxkG2 = ffxMax3(ffxMax3(mxkG, f.g, h.g), n.g, p.g);
+    FfxFloat32 mxkB2 = ffxMax3(ffxMax3(mxkB, f.b, h.b), n.b, p.b);
+    mxkR             = mxkR + mxkR2;
+    mxkG             = mxkG + mxkG2;
+    mxkB             = mxkB + mxkB2;
+#endif  // #if defined(FFX_CAS_BETTER_DIAGONALS)
+
+#if defined(FFX_CAS_USE_PRECISE_MATH)
+    // Smooth minimum distance to signal limit divided by smooth max.
+    FfxFloat32 rcpMfR = ffxReciprocal(mxfR);
+    FfxFloat32 rcpMfG = ffxReciprocal(mxfG);
+    FfxFloat32 rcpMfB = ffxReciprocal(mxfB);
+    FfxFloat32 rcpMgR = ffxReciprocal(mxgR);
+    FfxFloat32 rcpMgG = ffxReciprocal(mxgG);
+    FfxFloat32 rcpMgB = ffxReciprocal(mxgB);
+    FfxFloat32 rcpMjR = ffxReciprocal(mxjR);
+    FfxFloat32 rcpMjG = ffxReciprocal(mxjG);
+    FfxFloat32 rcpMjB = ffxReciprocal(mxjB);
+    FfxFloat32 rcpMkR = ffxReciprocal(mxkR);
+    FfxFloat32 rcpMkG = ffxReciprocal(mxkG);
+    FfxFloat32 rcpMkB = ffxReciprocal(mxkB);
+#else
+    // Smooth minimum distance to signal limit divided by smooth max.
+    FfxFloat32 rcpMfR = ffxApproximateReciprocal(mxfR);
+    FfxFloat32 rcpMfG = ffxApproximateReciprocal(mxfG);
+    FfxFloat32 rcpMfB = ffxApproximateReciprocal(mxfB);
+    FfxFloat32 rcpMgR = ffxApproximateReciprocal(mxgR);
+    FfxFloat32 rcpMgG = ffxApproximateReciprocal(mxgG);
+    FfxFloat32 rcpMgB = ffxApproximateReciprocal(mxgB);
+    FfxFloat32 rcpMjR = ffxApproximateReciprocal(mxjR);
+    FfxFloat32 rcpMjG = ffxApproximateReciprocal(mxjG);
+    FfxFloat32 rcpMjB = ffxApproximateReciprocal(mxjB);
+    FfxFloat32 rcpMkR = ffxApproximateReciprocal(mxkR);
+    FfxFloat32 rcpMkG = ffxApproximateReciprocal(mxkG);
+    FfxFloat32 rcpMkB = ffxApproximateReciprocal(mxkB);
+#endif  // #if defined(FFX_CAS_USE_PRECISE_MATH)
+
+#if defined(FFX_CAS_BETTER_DIAGONALS)
+    FfxFloat32 ampfR = ffxSaturate(ffxMin(minimumRed, FfxFloat32(2.0) - mxfR) * rcpMfR);
+    FfxFloat32 ampfG = ffxSaturate(ffxMin(minimumGreen, FfxFloat32(2.0) - mxfG) * rcpMfG);
+    FfxFloat32 ampfB = ffxSaturate(ffxMin(minimumBlue, FfxFloat32(2.0) - mxfB) * rcpMfB);
+    FfxFloat32 ampgR = ffxSaturate(ffxMin(mngR, FfxFloat32(2.0) - mxgR) * rcpMgR);
+    FfxFloat32 ampgG = ffxSaturate(ffxMin(mngG, FfxFloat32(2.0) - mxgG) * rcpMgG);
+    FfxFloat32 ampgB = ffxSaturate(ffxMin(mngB, FfxFloat32(2.0) - mxgB) * rcpMgB);
+    FfxFloat32 ampjR = ffxSaturate(ffxMin(mnjR, FfxFloat32(2.0) - mxjR) * rcpMjR);
+    FfxFloat32 ampjG = ffxSaturate(ffxMin(mnjG, FfxFloat32(2.0) - mxjG) * rcpMjG);
+    FfxFloat32 ampjB = ffxSaturate(ffxMin(mnjB, FfxFloat32(2.0) - mxjB) * rcpMjB);
+    FfxFloat32 ampkR = ffxSaturate(ffxMin(mnkR, FfxFloat32(2.0) - mxkR) * rcpMkR);
+    FfxFloat32 ampkG = ffxSaturate(ffxMin(mnkG, FfxFloat32(2.0) - mxkG) * rcpMkG);
+    FfxFloat32 ampkB = ffxSaturate(ffxMin(mnkB, FfxFloat32(2.0) - mxkB) * rcpMkB);
+#else
+    FfxFloat32 ampfR = ffxSaturate(ffxMin(minimumRed, FfxFloat32(1.0) - mxfR) * rcpMfR);
+    FfxFloat32 ampfG = ffxSaturate(ffxMin(minimumGreen, FfxFloat32(1.0) - mxfG) * rcpMfG);
+    FfxFloat32 ampfB = ffxSaturate(ffxMin(minimumBlue, FfxFloat32(1.0) - mxfB) * rcpMfB);
+    FfxFloat32 ampgR = ffxSaturate(ffxMin(mngR, FfxFloat32(1.0) - mxgR) * rcpMgR);
+    FfxFloat32 ampgG = ffxSaturate(ffxMin(mngG, FfxFloat32(1.0) - mxgG) * rcpMgG);
+    FfxFloat32 ampgB = ffxSaturate(ffxMin(mngB, FfxFloat32(1.0) - mxgB) * rcpMgB);
+    FfxFloat32 ampjR = ffxSaturate(ffxMin(mnjR, FfxFloat32(1.0) - mxjR) * rcpMjR);
+    FfxFloat32 ampjG = ffxSaturate(ffxMin(mnjG, FfxFloat32(1.0) - mxjG) * rcpMjG);
+    FfxFloat32 ampjB = ffxSaturate(ffxMin(mnjB, FfxFloat32(1.0) - mxjB) * rcpMjB);
+    FfxFloat32 ampkR = ffxSaturate(ffxMin(mnkR, FfxFloat32(1.0) - mxkR) * rcpMkR);
+    FfxFloat32 ampkG = ffxSaturate(ffxMin(mnkG, FfxFloat32(1.0) - mxkG) * rcpMkG);
+    FfxFloat32 ampkB = ffxSaturate(ffxMin(mnkB, FfxFloat32(1.0) - mxkB) * rcpMkB);
+#endif  // #if defined(FFX_CAS_BETTER_DIAGONALS)
+
+#if defined(FFX_CAS_USE_PRECISE_MATH)
+    // Shaping amount of sharpening.
+    ampfR = ffxSqrt(ampfR);
+    ampfG = ffxSqrt(ampfG);
+    ampfB = ffxSqrt(ampfB);
+    ampgR = ffxSqrt(ampgR);
+    ampgG = ffxSqrt(ampgG);
+    ampgB = ffxSqrt(ampgB);
+    ampjR = ffxSqrt(ampjR);
+    ampjG = ffxSqrt(ampjG);
+    ampjB = ffxSqrt(ampjB);
+    ampkR = ffxSqrt(ampkR);
+    ampkG = ffxSqrt(ampkG);
+    ampkB = ffxSqrt(ampkB);
+#else
+    // Shaping amount of sharpening.
+    ampfR = ffxApproximateSqrt(ampfR);
+    ampfG = ffxApproximateSqrt(ampfG);
+    ampfB = ffxApproximateSqrt(ampfB);
+    ampgR = ffxApproximateSqrt(ampgR);
+    ampgG = ffxApproximateSqrt(ampgG);
+    ampgB = ffxApproximateSqrt(ampgB);
+    ampjR = ffxApproximateSqrt(ampjR);
+    ampjG = ffxApproximateSqrt(ampjG);
+    ampjB = ffxApproximateSqrt(ampjB);
+    ampkR = ffxApproximateSqrt(ampkR);
+    ampkG = ffxApproximateSqrt(ampkG);
+    ampkB = ffxApproximateSqrt(ampkB);
+#endif  // #if defined(FFX_CAS_USE_PRECISE_MATH)
+
+    // Filter shape.
+    //  0 w 0
+    //  w 1 w
+    //  0 w 0
+    FfxFloat32 peak = ffxAsFloat(const1.x);
+    FfxFloat32 wfR  = ampfR * peak;
+    FfxFloat32 wfG  = ampfG * peak;
+    FfxFloat32 wfB  = ampfB * peak;
+    FfxFloat32 wgR  = ampgR * peak;
+    FfxFloat32 wgG  = ampgG * peak;
+    FfxFloat32 wgB  = ampgB * peak;
+    FfxFloat32 wjR  = ampjR * peak;
+    FfxFloat32 wjG  = ampjG * peak;
+    FfxFloat32 wjB  = ampjB * peak;
+    FfxFloat32 wkR  = ampkR * peak;
+    FfxFloat32 wkG  = ampkG * peak;
+    FfxFloat32 wkB  = ampkB * peak;
+
+    // Blend between 4 results.
+    //  s t
+    //  u v
+    FfxFloat32 s = (FfxFloat32(1.0) - pixelPosition.x) * (FfxFloat32(1.0) - pixelPosition.y);
+    FfxFloat32 t = pixelPosition.x * (FfxFloat32(1.0) - pixelPosition.y);
+    FfxFloat32 u = (FfxFloat32(1.0) - pixelPosition.x) * pixelPosition.y;
+    FfxFloat32 v = pixelPosition.x * pixelPosition.y;
+
+    // Thin edges to hide bilinear interpolation (helps diagonals).
+    FfxFloat32 thinB = 1.0 / 32.0;
+
+#if defined(FFX_CAS_USE_PRECISE_MATH)
+    s *= ffxReciprocal(thinB + (mxfG - minimumGreen));
+    t *= ffxReciprocal(thinB + (mxgG - mngG));
+    u *= ffxReciprocal(thinB + (mxjG - mnjG));
+    v *= ffxReciprocal(thinB + (mxkG - mnkG));
+#else
+    s *= ffxApproximateReciprocal(thinB + (mxfG - minimumGreen));
+    t *= ffxApproximateReciprocal(thinB + (mxgG - mngG));
+    u *= ffxApproximateReciprocal(thinB + (mxjG - mnjG));
+    v *= ffxApproximateReciprocal(thinB + (mxkG - mnkG));
+#endif  // #if defined(FFX_CAS_USE_PRECISE_MATH)
+
+    // Final weighting.
+    //    b c
+    //  e f g h
+    //  i j k l
+    //    n o
+    //  _____  _____  _____  _____
+    //         fs        gt
+    //
+    //  _____  _____  _____  _____
+    //  fs      s gt  fs  t     gt
+    //         ju        kv
+    //  _____  _____  _____  _____
+    //         fs        gt
+    //  ju      u kv  ju  v     kv
+    //  _____  _____  _____  _____
+    //
+    //         ju        kv
+    FfxFloat32 qbeR = wfR * s;
+    FfxFloat32 qbeG = wfG * s;
+    FfxFloat32 qbeB = wfB * s;
+    FfxFloat32 qchR = wgR * t;
+    FfxFloat32 qchG = wgG * t;
+    FfxFloat32 qchB = wgB * t;
+    FfxFloat32 qfR  = wgR * t + wjR * u + s;
+    FfxFloat32 qfG  = wgG * t + wjG * u + s;
+    FfxFloat32 qfB  = wgB * t + wjB * u + s;
+    FfxFloat32 qgR  = wfR * s + wkR * v + t;
+    FfxFloat32 qgG  = wfG * s + wkG * v + t;
+    FfxFloat32 qgB  = wfB * s + wkB * v + t;
+    FfxFloat32 qjR  = wfR * s + wkR * v + u;
+    FfxFloat32 qjG  = wfG * s + wkG * v + u;
+    FfxFloat32 qjB  = wfB * s + wkB * v + u;
+    FfxFloat32 qkR  = wgR * t + wjR * u + v;
+    FfxFloat32 qkG  = wgG * t + wjG * u + v;
+    FfxFloat32 qkB  = wgB * t + wjB * u + v;
+    FfxFloat32 qinR = wjR * u;
+    FfxFloat32 qinG = wjG * u;
+    FfxFloat32 qinB = wjB * u;
+    FfxFloat32 qloR = wkR * v;
+    FfxFloat32 qloG = wkG * v;
+    FfxFloat32 qloB = wkB * v;
+
+    // Using green coef only, depending on dead code removal to strip out the extra overhead.
+#if defined(FFX_CAS_USE_PRECISE_MATH)
+    FfxFloat32 rcpWG = ffxReciprocal(FfxFloat32(2.0) * qbeG + FfxFloat32(2.0) * qchG + FfxFloat32(2.0) * qinG + FfxFloat32(2.0) * qloG + qfG + qgG + qjG + qkG);
+#else
+    FfxFloat32 rcpWG = ffxApproximateReciprocalMedium(FfxFloat32(2.0) * qbeG + FfxFloat32(2.0) * qchG + FfxFloat32(2.0) * qinG + FfxFloat32(2.0) * qloG + qfG +
+                                                      qgG + qjG + qkG);
+#endif  // #if defined(FFX_CAS_USE_PRECISE_MATH)
+
+    pixR = ffxSaturate((b.r * qbeG + e.r * qbeG + c.r * qchG + h.r * qchG + i.r * qinG + n.r * qinG + l.r * qloG + o.r * qloG + f.r * qfG + g.r * qgG +
+                        j.r * qjG +
+                     k.r * qkG) *
+                    rcpWG);
+    pixG = ffxSaturate((b.g * qbeG + e.g * qbeG + c.g * qchG + h.g * qchG + i.g * qinG + n.g * qinG + l.g * qloG + o.g * qloG + f.g * qfG + g.g * qgG +
+                        j.g * qjG +
+                     k.g * qkG) *
+                    rcpWG);
+    pixB = ffxSaturate((b.b * qbeG + e.b * qbeG + c.b * qchG + h.b * qchG + i.b * qinG + n.b * qinG + l.b * qloG + o.b * qloG + f.b * qfG + g.b * qgG +
+                        j.b * qjG +
+                     k.b * qkG) *
+                    rcpWG);
+}
+
+#if FFX_HALF == 1
+// Packed half-precision CAS filter with scaling: filters 2 pixels ("tiles") in one run, tile 0 in .x and tile 1 in .y of each FfxFloat16x2.
+void casFilterWithScalingHalf(
+    FFX_PARAMETER_OUT FfxFloat16x2 pixR,  // Filtered red output (tile 0 in .x, tile 1 in .y).
+    FFX_PARAMETER_OUT FfxFloat16x2 pixG,  // Filtered green output (tile 0 in .x, tile 1 in .y).
+    FFX_PARAMETER_OUT FfxFloat16x2 pixB,  // Filtered blue output (tile 0 in .x, tile 1 in .y).
+    FFX_PARAMETER_IN FfxUInt32x2 ip,      // Integer pixel position in output.
+    FFX_PARAMETER_IN FfxUInt32x4 const0,  // Constants generated by ffxCasSetup().
+    FFX_PARAMETER_IN FfxUInt32x4 const1)  // Read here: .y packs the half-precision sharpening peak, .z is the x offset from tile 0 to tile 1 (float bits).
+{
+    FfxFloat32x2 pp = FfxFloat32x2(ip) * ffxAsFloat(const0.xy) + ffxAsFloat(const0.zw);  // Output position -> load position (scale const0.xy, bias const0.zw).
+
+    // Tile 0.
+    // Fractional position is needed in high precision here.
+    FfxFloat32x2 fp0 = floor(pp);
+    FfxFloat16x2 ppX;
+    ppX.x            = FfxFloat16(pp.x - fp0.x);
+    FfxFloat16   ppY = FfxFloat16(pp.y - fp0.y);  // Fractional y is shared by both tiles.
+    FfxInt16x2   sp0 = FfxInt16x2(fp0);
+    FfxFloat16x3 a0  = casLoadHalf(sp0 + FfxInt16x2(-1, -1));  // 4x4 neighborhood a..p: offsets -1..2 in x and y around sp0.
+    FfxFloat16x3 b0  = casLoadHalf(sp0 + FfxInt16x2(0, -1));
+    FfxFloat16x3 e0  = casLoadHalf(sp0 + FfxInt16x2(-1, 0));
+    FfxFloat16x3 f0  = casLoadHalf(sp0);
+    FfxFloat16x3 c0  = casLoadHalf(sp0 + FfxInt16x2(1, -1));
+    FfxFloat16x3 d0  = casLoadHalf(sp0 + FfxInt16x2(2, -1));
+    FfxFloat16x3 g0  = casLoadHalf(sp0 + FfxInt16x2(1, 0));
+    FfxFloat16x3 h0  = casLoadHalf(sp0 + FfxInt16x2(2, 0));
+    FfxFloat16x3 i0  = casLoadHalf(sp0 + FfxInt16x2(-1, 1));
+    FfxFloat16x3 j0  = casLoadHalf(sp0 + FfxInt16x2(0, 1));
+    FfxFloat16x3 m0  = casLoadHalf(sp0 + FfxInt16x2(-1, 2));
+    FfxFloat16x3 n0  = casLoadHalf(sp0 + FfxInt16x2(0, 2));
+    FfxFloat16x3 k0  = casLoadHalf(sp0 + FfxInt16x2(1, 1));
+    FfxFloat16x3 l0  = casLoadHalf(sp0 + FfxInt16x2(2, 1));
+    FfxFloat16x3 o0  = casLoadHalf(sp0 + FfxInt16x2(1, 2));
+    FfxFloat16x3 p0  = casLoadHalf(sp0 + FfxInt16x2(2, 2));
+
+    // Tile 1 (offset only in x).
+    FfxFloat32 pp1   = pp.x + ffxAsFloat(const1.z);
+    FfxFloat32 fp1   = floor(pp1);
+    ppX.y            = FfxFloat16(pp1 - fp1);
+    FfxInt16x2   sp1 = FfxInt16x2(fp1, sp0.y);
+    FfxFloat16x3 a1  = casLoadHalf(sp1 + FfxInt16x2(-1, -1));
+    FfxFloat16x3 b1  = casLoadHalf(sp1 + FfxInt16x2(0, -1));
+    FfxFloat16x3 e1  = casLoadHalf(sp1 + FfxInt16x2(-1, 0));
+    FfxFloat16x3 f1  = casLoadHalf(sp1);
+    FfxFloat16x3 c1  = casLoadHalf(sp1 + FfxInt16x2(1, -1));
+    FfxFloat16x3 d1  = casLoadHalf(sp1 + FfxInt16x2(2, -1));
+    FfxFloat16x3 g1  = casLoadHalf(sp1 + FfxInt16x2(1, 0));
+    FfxFloat16x3 h1  = casLoadHalf(sp1 + FfxInt16x2(2, 0));
+    FfxFloat16x3 i1  = casLoadHalf(sp1 + FfxInt16x2(-1, 1));
+    FfxFloat16x3 j1  = casLoadHalf(sp1 + FfxInt16x2(0, 1));
+    FfxFloat16x3 m1  = casLoadHalf(sp1 + FfxInt16x2(-1, 2));
+    FfxFloat16x3 n1  = casLoadHalf(sp1 + FfxInt16x2(0, 2));
+    FfxFloat16x3 k1  = casLoadHalf(sp1 + FfxInt16x2(1, 1));
+    FfxFloat16x3 l1  = casLoadHalf(sp1 + FfxInt16x2(2, 1));
+    FfxFloat16x3 o1  = casLoadHalf(sp1 + FfxInt16x2(1, 2));
+    FfxFloat16x3 p1  = casLoadHalf(sp1 + FfxInt16x2(2, 2));
+
+    // AOS to SOA conversion: per channel, tile 0 goes to .x and tile 1 to .y.
+    FfxFloat16x2 aR = FfxFloat16x2(a0.r, a1.r);
+    FfxFloat16x2 aG = FfxFloat16x2(a0.g, a1.g);
+    FfxFloat16x2 aB = FfxFloat16x2(a0.b, a1.b);
+    FfxFloat16x2 bR = FfxFloat16x2(b0.r, b1.r);
+    FfxFloat16x2 bG = FfxFloat16x2(b0.g, b1.g);
+    FfxFloat16x2 bB = FfxFloat16x2(b0.b, b1.b);
+    FfxFloat16x2 cR = FfxFloat16x2(c0.r, c1.r);
+    FfxFloat16x2 cG = FfxFloat16x2(c0.g, c1.g);
+    FfxFloat16x2 cB = FfxFloat16x2(c0.b, c1.b);
+    FfxFloat16x2 dR = FfxFloat16x2(d0.r, d1.r);
+    FfxFloat16x2 dG = FfxFloat16x2(d0.g, d1.g);
+    FfxFloat16x2 dB = FfxFloat16x2(d0.b, d1.b);
+    FfxFloat16x2 eR = FfxFloat16x2(e0.r, e1.r);
+    FfxFloat16x2 eG = FfxFloat16x2(e0.g, e1.g);
+    FfxFloat16x2 eB = FfxFloat16x2(e0.b, e1.b);
+    FfxFloat16x2 fR = FfxFloat16x2(f0.r, f1.r);
+    FfxFloat16x2 fG = FfxFloat16x2(f0.g, f1.g);
+    FfxFloat16x2 fB = FfxFloat16x2(f0.b, f1.b);
+    FfxFloat16x2 gR = FfxFloat16x2(g0.r, g1.r);
+    FfxFloat16x2 gG = FfxFloat16x2(g0.g, g1.g);
+    FfxFloat16x2 gB = FfxFloat16x2(g0.b, g1.b);
+    FfxFloat16x2 hR = FfxFloat16x2(h0.r, h1.r);
+    FfxFloat16x2 hG = FfxFloat16x2(h0.g, h1.g);
+    FfxFloat16x2 hB = FfxFloat16x2(h0.b, h1.b);
+    FfxFloat16x2 iR = FfxFloat16x2(i0.r, i1.r);
+    FfxFloat16x2 iG = FfxFloat16x2(i0.g, i1.g);
+    FfxFloat16x2 iB = FfxFloat16x2(i0.b, i1.b);
+    FfxFloat16x2 jR = FfxFloat16x2(j0.r, j1.r);
+    FfxFloat16x2 jG = FfxFloat16x2(j0.g, j1.g);
+    FfxFloat16x2 jB = FfxFloat16x2(j0.b, j1.b);
+    FfxFloat16x2 kR = FfxFloat16x2(k0.r, k1.r);
+    FfxFloat16x2 kG = FfxFloat16x2(k0.g, k1.g);
+    FfxFloat16x2 kB = FfxFloat16x2(k0.b, k1.b);
+    FfxFloat16x2 lR = FfxFloat16x2(l0.r, l1.r);
+    FfxFloat16x2 lG = FfxFloat16x2(l0.g, l1.g);
+    FfxFloat16x2 lB = FfxFloat16x2(l0.b, l1.b);
+    FfxFloat16x2 mR = FfxFloat16x2(m0.r, m1.r);
+    FfxFloat16x2 mG = FfxFloat16x2(m0.g, m1.g);
+    FfxFloat16x2 mB = FfxFloat16x2(m0.b, m1.b);
+    FfxFloat16x2 nR = FfxFloat16x2(n0.r, n1.r);
+    FfxFloat16x2 nG = FfxFloat16x2(n0.g, n1.g);
+    FfxFloat16x2 nB = FfxFloat16x2(n0.b, n1.b);
+    FfxFloat16x2 oR = FfxFloat16x2(o0.r, o1.r);
+    FfxFloat16x2 oG = FfxFloat16x2(o0.g, o1.g);
+    FfxFloat16x2 oB = FfxFloat16x2(o0.b, o1.b);
+    FfxFloat16x2 pR = FfxFloat16x2(p0.r, p1.r);
+    FfxFloat16x2 pG = FfxFloat16x2(p0.g, p1.g);
+    FfxFloat16x2 pB = FfxFloat16x2(p0.b, p1.b);
+
+    // Run optional input transform.
+    casInputHalf(aR, aG, aB);
+    casInputHalf(bR, bG, bB);
+    casInputHalf(cR, cG, cB);
+    casInputHalf(dR, dG, dB);
+    casInputHalf(eR, eG, eB);
+    casInputHalf(fR, fG, fB);
+    casInputHalf(gR, gG, gB);
+    casInputHalf(hR, hG, hB);
+    casInputHalf(iR, iG, iB);
+    casInputHalf(jR, jG, jB);
+    casInputHalf(kR, kG, kB);
+    casInputHalf(lR, lG, lB);
+    casInputHalf(mR, mG, mB);
+    casInputHalf(nR, nG, nB);
+    casInputHalf(oR, oG, oB);
+    casInputHalf(pR, pG, pB);
+
+    // Soft min and max.
+    // With FFX_CAS_BETTER_DIAGONALS these are 2.0x bigger (factored out the extra multiply).
+    //  a b c             b
+    //  e f g * 0.5  +  e f g * 0.5  [F]
+    //  i j k             j
+    FfxFloat16x2 minimumRed = ffxMin3Half(ffxMin3Half(bR, eR, fR), gR, jR);
+    FfxFloat16x2 minimumGreen = ffxMin3Half(ffxMin3Half(bG, eG, fG), gG, jG);
+    FfxFloat16x2 minimumBlue = ffxMin3Half(ffxMin3Half(bB, eB, fB), gB, jB);
+
+#ifdef FFX_CAS_BETTER_DIAGONALS
+    FfxFloat16x2 mnfR2 = ffxMin3Half(ffxMin3Half(minimumRed, aR, cR), iR, kR);
+    FfxFloat16x2 mnfG2 = ffxMin3Half(ffxMin3Half(minimumGreen, aG, cG), iG, kG);
+    FfxFloat16x2 mnfB2 = ffxMin3Half(ffxMin3Half(minimumBlue, aB, cB), iB, kB);
+    minimumRed               = minimumRed + mnfR2;
+    minimumGreen               = minimumGreen + mnfG2;
+    minimumBlue               = minimumBlue + mnfB2;
+#endif  // #ifdef FFX_CAS_BETTER_DIAGONALS
+    FfxFloat16x2 mxfR = ffxMax3Half(ffxMax3Half(bR, eR, fR), gR, jR);
+    FfxFloat16x2 mxfG = ffxMax3Half(ffxMax3Half(bG, eG, fG), gG, jG);
+    FfxFloat16x2 mxfB = ffxMax3Half(ffxMax3Half(bB, eB, fB), gB, jB);
+#ifdef FFX_CAS_BETTER_DIAGONALS
+    FfxFloat16x2 mxfR2 = ffxMax3Half(ffxMax3Half(mxfR, aR, cR), iR, kR);
+    FfxFloat16x2 mxfG2 = ffxMax3Half(ffxMax3Half(mxfG, aG, cG), iG, kG);
+    FfxFloat16x2 mxfB2 = ffxMax3Half(ffxMax3Half(mxfB, aB, cB), iB, kB);
+    mxfR               = mxfR + mxfR2;
+    mxfG               = mxfG + mxfG2;
+    mxfB               = mxfB + mxfB2;
+#endif  // #ifdef FFX_CAS_BETTER_DIAGONALS
+    //  b c d             c
+    //  f g h * 0.5  +  f g h * 0.5  [G]
+    //  j k l             k
+    FfxFloat16x2 mngR = ffxMin3Half(ffxMin3Half(cR, fR, gR), hR, kR);
+    FfxFloat16x2 mngG = ffxMin3Half(ffxMin3Half(cG, fG, gG), hG, kG);
+    FfxFloat16x2 mngB = ffxMin3Half(ffxMin3Half(cB, fB, gB), hB, kB);
+#ifdef FFX_CAS_BETTER_DIAGONALS
+    FfxFloat16x2 mngR2 = ffxMin3Half(ffxMin3Half(mngR, bR, dR), jR, lR);
+    FfxFloat16x2 mngG2 = ffxMin3Half(ffxMin3Half(mngG, bG, dG), jG, lG);
+    FfxFloat16x2 mngB2 = ffxMin3Half(ffxMin3Half(mngB, bB, dB), jB, lB);
+    mngR               = mngR + mngR2;
+    mngG               = mngG + mngG2;
+    mngB               = mngB + mngB2;
+#endif  // #ifdef FFX_CAS_BETTER_DIAGONALS
+    FfxFloat16x2 mxgR = ffxMax3Half(ffxMax3Half(cR, fR, gR), hR, kR);
+    FfxFloat16x2 mxgG = ffxMax3Half(ffxMax3Half(cG, fG, gG), hG, kG);
+    FfxFloat16x2 mxgB = ffxMax3Half(ffxMax3Half(cB, fB, gB), hB, kB);
+#ifdef FFX_CAS_BETTER_DIAGONALS
+    FfxFloat16x2 mxgR2 = ffxMax3Half(ffxMax3Half(mxgR, bR, dR), jR, lR);
+    FfxFloat16x2 mxgG2 = ffxMax3Half(ffxMax3Half(mxgG, bG, dG), jG, lG);
+    FfxFloat16x2 mxgB2 = ffxMax3Half(ffxMax3Half(mxgB, bB, dB), jB, lB);
+    mxgR               = mxgR + mxgR2;
+    mxgG               = mxgG + mxgG2;
+    mxgB               = mxgB + mxgB2;
+#endif  // #ifdef FFX_CAS_BETTER_DIAGONALS
+    //  e f g             f
+    //  i j k * 0.5  +  i j k * 0.5  [J]
+    //  m n o             n
+    FfxFloat16x2 mnjR = ffxMin3Half(ffxMin3Half(fR, iR, jR), kR, nR);
+    FfxFloat16x2 mnjG = ffxMin3Half(ffxMin3Half(fG, iG, jG), kG, nG);
+    FfxFloat16x2 mnjB = ffxMin3Half(ffxMin3Half(fB, iB, jB), kB, nB);
+#ifdef FFX_CAS_BETTER_DIAGONALS
+    FfxFloat16x2 mnjR2 = ffxMin3Half(ffxMin3Half(mnjR, eR, gR), mR, oR);
+    FfxFloat16x2 mnjG2 = ffxMin3Half(ffxMin3Half(mnjG, eG, gG), mG, oG);
+    FfxFloat16x2 mnjB2 = ffxMin3Half(ffxMin3Half(mnjB, eB, gB), mB, oB);
+    mnjR               = mnjR + mnjR2;
+    mnjG               = mnjG + mnjG2;
+    mnjB               = mnjB + mnjB2;
+#endif  // #ifdef FFX_CAS_BETTER_DIAGONALS
+    FfxFloat16x2 mxjR = ffxMax3Half(ffxMax3Half(fR, iR, jR), kR, nR);
+    FfxFloat16x2 mxjG = ffxMax3Half(ffxMax3Half(fG, iG, jG), kG, nG);
+    FfxFloat16x2 mxjB = ffxMax3Half(ffxMax3Half(fB, iB, jB), kB, nB);
+#ifdef FFX_CAS_BETTER_DIAGONALS
+    FfxFloat16x2 mxjR2 = ffxMax3Half(ffxMax3Half(mxjR, eR, gR), mR, oR);
+    FfxFloat16x2 mxjG2 = ffxMax3Half(ffxMax3Half(mxjG, eG, gG), mG, oG);
+    FfxFloat16x2 mxjB2 = ffxMax3Half(ffxMax3Half(mxjB, eB, gB), mB, oB);
+    mxjR               = mxjR + mxjR2;
+    mxjG               = mxjG + mxjG2;
+    mxjB               = mxjB + mxjB2;
+#endif  // #ifdef FFX_CAS_BETTER_DIAGONALS
+    //  f g h             g
+    //  j k l * 0.5  +  j k l * 0.5  [K]
+    //  n o p             o
+    FfxFloat16x2 mnkR = ffxMin3Half(ffxMin3Half(gR, jR, kR), lR, oR);
+    FfxFloat16x2 mnkG = ffxMin3Half(ffxMin3Half(gG, jG, kG), lG, oG);
+    FfxFloat16x2 mnkB = ffxMin3Half(ffxMin3Half(gB, jB, kB), lB, oB);
+#ifdef FFX_CAS_BETTER_DIAGONALS
+    FfxFloat16x2 mnkR2 = ffxMin3Half(ffxMin3Half(mnkR, fR, hR), nR, pR);
+    FfxFloat16x2 mnkG2 = ffxMin3Half(ffxMin3Half(mnkG, fG, hG), nG, pG);
+    FfxFloat16x2 mnkB2 = ffxMin3Half(ffxMin3Half(mnkB, fB, hB), nB, pB);
+    mnkR               = mnkR + mnkR2;
+    mnkG               = mnkG + mnkG2;
+    mnkB               = mnkB + mnkB2;
+#endif  // #ifdef FFX_CAS_BETTER_DIAGONALS
+    FfxFloat16x2 mxkR = ffxMax3Half(ffxMax3Half(gR, jR, kR), lR, oR);
+    FfxFloat16x2 mxkG = ffxMax3Half(ffxMax3Half(gG, jG, kG), lG, oG);
+    FfxFloat16x2 mxkB = ffxMax3Half(ffxMax3Half(gB, jB, kB), lB, oB);
+#ifdef FFX_CAS_BETTER_DIAGONALS
+    FfxFloat16x2 mxkR2 = ffxMax3Half(ffxMax3Half(mxkR, fR, hR), nR, pR);
+    FfxFloat16x2 mxkG2 = ffxMax3Half(ffxMax3Half(mxkG, fG, hG), nG, pG);
+    FfxFloat16x2 mxkB2 = ffxMax3Half(ffxMax3Half(mxkB, fB, hB), nB, pB);
+    mxkR               = mxkR + mxkR2;
+    mxkG               = mxkG + mxkG2;
+    mxkB               = mxkB + mxkB2;
+#endif  // #ifdef FFX_CAS_BETTER_DIAGONALS
+    // Smooth minimum distance to signal limit divided by smooth max. NOTE(review): the HLSL fallback #define of FFX_CAS_USE_PRECISE_MATH comes later in this file, so it cannot select the precise path here - confirm intended.
+#ifdef FFX_CAS_USE_PRECISE_MATH
+    FfxFloat16x2 rcpMfR = ffxReciprocalHalf(mxfR);
+    FfxFloat16x2 rcpMfG = ffxReciprocalHalf(mxfG);
+    FfxFloat16x2 rcpMfB = ffxReciprocalHalf(mxfB);
+    FfxFloat16x2 rcpMgR = ffxReciprocalHalf(mxgR);
+    FfxFloat16x2 rcpMgG = ffxReciprocalHalf(mxgG);
+    FfxFloat16x2 rcpMgB = ffxReciprocalHalf(mxgB);
+    FfxFloat16x2 rcpMjR = ffxReciprocalHalf(mxjR);
+    FfxFloat16x2 rcpMjG = ffxReciprocalHalf(mxjG);
+    FfxFloat16x2 rcpMjB = ffxReciprocalHalf(mxjB);
+    FfxFloat16x2 rcpMkR = ffxReciprocalHalf(mxkR);
+    FfxFloat16x2 rcpMkG = ffxReciprocalHalf(mxkG);
+    FfxFloat16x2 rcpMkB = ffxReciprocalHalf(mxkB);
+#else
+    FfxFloat16x2 rcpMfR = ffxApproximateReciprocalHalf(mxfR);
+    FfxFloat16x2 rcpMfG = ffxApproximateReciprocalHalf(mxfG);
+    FfxFloat16x2 rcpMfB = ffxApproximateReciprocalHalf(mxfB);
+    FfxFloat16x2 rcpMgR = ffxApproximateReciprocalHalf(mxgR);
+    FfxFloat16x2 rcpMgG = ffxApproximateReciprocalHalf(mxgG);
+    FfxFloat16x2 rcpMgB = ffxApproximateReciprocalHalf(mxgB);
+    FfxFloat16x2 rcpMjR = ffxApproximateReciprocalHalf(mxjR);
+    FfxFloat16x2 rcpMjG = ffxApproximateReciprocalHalf(mxjG);
+    FfxFloat16x2 rcpMjB = ffxApproximateReciprocalHalf(mxjB);
+    FfxFloat16x2 rcpMkR = ffxApproximateReciprocalHalf(mxkR);
+    FfxFloat16x2 rcpMkG = ffxApproximateReciprocalHalf(mxkG);
+    FfxFloat16x2 rcpMkB = ffxApproximateReciprocalHalf(mxkB);
+#endif  // #ifdef FFX_CAS_USE_PRECISE_MATH
+#ifdef FFX_CAS_BETTER_DIAGONALS  // Soft min/max are sums of two terms here, hence the 2.0 signal limit.
+    FfxFloat16x2 ampfR = ffxSaturate(min(minimumRed, FFX_BROADCAST_FLOAT16X2(2.0) - mxfR) * rcpMfR);
+    FfxFloat16x2 ampfG = ffxSaturate(min(minimumGreen, FFX_BROADCAST_FLOAT16X2(2.0) - mxfG) * rcpMfG);
+    FfxFloat16x2 ampfB = ffxSaturate(min(minimumBlue, FFX_BROADCAST_FLOAT16X2(2.0) - mxfB) * rcpMfB);
+    FfxFloat16x2 ampgR = ffxSaturate(min(mngR, FFX_BROADCAST_FLOAT16X2(2.0) - mxgR) * rcpMgR);
+    FfxFloat16x2 ampgG = ffxSaturate(min(mngG, FFX_BROADCAST_FLOAT16X2(2.0) - mxgG) * rcpMgG);
+    FfxFloat16x2 ampgB = ffxSaturate(min(mngB, FFX_BROADCAST_FLOAT16X2(2.0) - mxgB) * rcpMgB);
+    FfxFloat16x2 ampjR = ffxSaturate(min(mnjR, FFX_BROADCAST_FLOAT16X2(2.0) - mxjR) * rcpMjR);
+    FfxFloat16x2 ampjG = ffxSaturate(min(mnjG, FFX_BROADCAST_FLOAT16X2(2.0) - mxjG) * rcpMjG);
+    FfxFloat16x2 ampjB = ffxSaturate(min(mnjB, FFX_BROADCAST_FLOAT16X2(2.0) - mxjB) * rcpMjB);
+    FfxFloat16x2 ampkR = ffxSaturate(min(mnkR, FFX_BROADCAST_FLOAT16X2(2.0) - mxkR) * rcpMkR);
+    FfxFloat16x2 ampkG = ffxSaturate(min(mnkG, FFX_BROADCAST_FLOAT16X2(2.0) - mxkG) * rcpMkG);
+    FfxFloat16x2 ampkB = ffxSaturate(min(mnkB, FFX_BROADCAST_FLOAT16X2(2.0) - mxkB) * rcpMkB);
+#else  // Single soft min/max term per channel, so the signal limit is 1.0.
+    FfxFloat16x2 ampfR  = ffxSaturate(min(minimumRed, FFX_BROADCAST_FLOAT16X2(1.0) - mxfR) * rcpMfR);
+    FfxFloat16x2 ampfG  = ffxSaturate(min(minimumGreen, FFX_BROADCAST_FLOAT16X2(1.0) - mxfG) * rcpMfG);
+    FfxFloat16x2 ampfB  = ffxSaturate(min(minimumBlue, FFX_BROADCAST_FLOAT16X2(1.0) - mxfB) * rcpMfB);
+    FfxFloat16x2 ampgR  = ffxSaturate(min(mngR, FFX_BROADCAST_FLOAT16X2(1.0) - mxgR) * rcpMgR);
+    FfxFloat16x2 ampgG  = ffxSaturate(min(mngG, FFX_BROADCAST_FLOAT16X2(1.0) - mxgG) * rcpMgG);
+    FfxFloat16x2 ampgB  = ffxSaturate(min(mngB, FFX_BROADCAST_FLOAT16X2(1.0) - mxgB) * rcpMgB);
+    FfxFloat16x2 ampjR  = ffxSaturate(min(mnjR, FFX_BROADCAST_FLOAT16X2(1.0) - mxjR) * rcpMjR);
+    FfxFloat16x2 ampjG  = ffxSaturate(min(mnjG, FFX_BROADCAST_FLOAT16X2(1.0) - mxjG) * rcpMjG);
+    FfxFloat16x2 ampjB  = ffxSaturate(min(mnjB, FFX_BROADCAST_FLOAT16X2(1.0) - mxjB) * rcpMjB);
+    FfxFloat16x2 ampkR  = ffxSaturate(min(mnkR, FFX_BROADCAST_FLOAT16X2(1.0) - mxkR) * rcpMkR);
+    FfxFloat16x2 ampkG  = ffxSaturate(min(mnkG, FFX_BROADCAST_FLOAT16X2(1.0) - mxkG) * rcpMkG);
+    FfxFloat16x2 ampkB  = ffxSaturate(min(mnkB, FFX_BROADCAST_FLOAT16X2(1.0) - mxkB) * rcpMkB);
+#endif  // #ifdef FFX_CAS_BETTER_DIAGONALS
+
+    // Shaping amount of sharpening.
+#if defined(FFX_CAS_USE_PRECISE_MATH)
+    ampfR = ffxSqrt(ampfR);
+    ampfG = ffxSqrt(ampfG);
+    ampfB = ffxSqrt(ampfB);
+    ampgR = ffxSqrt(ampgR);
+    ampgG = ffxSqrt(ampgG);
+    ampgB = ffxSqrt(ampgB);
+    ampjR = ffxSqrt(ampjR);
+    ampjG = ffxSqrt(ampjG);
+    ampjB = ffxSqrt(ampjB);
+    ampkR = ffxSqrt(ampkR);
+    ampkG = ffxSqrt(ampkG);
+    ampkB = ffxSqrt(ampkB);
+#else
+    ampfR = ffxApproximateSqrtHalf(ampfR);
+    ampfG = ffxApproximateSqrtHalf(ampfG);
+    ampfB = ffxApproximateSqrtHalf(ampfB);
+    ampgR = ffxApproximateSqrtHalf(ampgR);
+    ampgG = ffxApproximateSqrtHalf(ampgG);
+    ampgB = ffxApproximateSqrtHalf(ampgB);
+    ampjR = ffxApproximateSqrtHalf(ampjR);
+    ampjG = ffxApproximateSqrtHalf(ampjG);
+    ampjB = ffxApproximateSqrtHalf(ampjB);
+    ampkR = ffxApproximateSqrtHalf(ampkR);
+    ampkG = ffxApproximateSqrtHalf(ampkG);
+    ampkB = ffxApproximateSqrtHalf(ampkB);
+#endif  // #if defined(FFX_CAS_USE_PRECISE_MATH)
+
+    // Filter shape: each weight is the shaped amplitude scaled by peak (the .x half of the pair packed in const1.y).
+    FfxFloat16   peak = FFX_UINT32_TO_FLOAT16X2(const1.y).x;
+    FfxFloat16x2 wfR  = ampfR * FFX_BROADCAST_FLOAT16X2(peak);
+    FfxFloat16x2 wfG  = ampfG * FFX_BROADCAST_FLOAT16X2(peak);
+    FfxFloat16x2 wfB  = ampfB * FFX_BROADCAST_FLOAT16X2(peak);
+    FfxFloat16x2 wgR  = ampgR * FFX_BROADCAST_FLOAT16X2(peak);
+    FfxFloat16x2 wgG  = ampgG * FFX_BROADCAST_FLOAT16X2(peak);
+    FfxFloat16x2 wgB  = ampgB * FFX_BROADCAST_FLOAT16X2(peak);
+    FfxFloat16x2 wjR  = ampjR * FFX_BROADCAST_FLOAT16X2(peak);
+    FfxFloat16x2 wjG  = ampjG * FFX_BROADCAST_FLOAT16X2(peak);
+    FfxFloat16x2 wjB  = ampjB * FFX_BROADCAST_FLOAT16X2(peak);
+    FfxFloat16x2 wkR  = ampkR * FFX_BROADCAST_FLOAT16X2(peak);
+    FfxFloat16x2 wkG  = ampkG * FFX_BROADCAST_FLOAT16X2(peak);
+    FfxFloat16x2 wkB  = ampkB * FFX_BROADCAST_FLOAT16X2(peak);
+
+    // Blend between 4 results: bilinear weights from the fractional position (s -> F, t -> G, u -> J, v -> K).
+    FfxFloat16x2 s = (FFX_BROADCAST_FLOAT16X2(1.0) - ppX) * (FFX_BROADCAST_FLOAT16X2(1.0) - FFX_BROADCAST_FLOAT16X2(ppY));
+    FfxFloat16x2 t = ppX * (FFX_BROADCAST_FLOAT16X2(1.0) - FFX_BROADCAST_FLOAT16X2(ppY));
+    FfxFloat16x2 u = (FFX_BROADCAST_FLOAT16X2(1.0) - ppX) * FFX_BROADCAST_FLOAT16X2(ppY);
+    FfxFloat16x2 v = ppX * FFX_BROADCAST_FLOAT16X2(ppY);
+
+    // Thin edges to hide bilinear interpolation (helps diagonals); scales by the green channel's (max - min) range only.
+    FfxFloat16x2 thinB = FFX_BROADCAST_FLOAT16X2(1.0 / 32.0);
+
+#if defined(FFX_CAS_USE_PRECISE_MATH)
+    s *= ffxReciprocalHalf(thinB + (mxfG - minimumGreen));
+    t *= ffxReciprocalHalf(thinB + (mxgG - mngG));
+    u *= ffxReciprocalHalf(thinB + (mxjG - mnjG));
+    v *= ffxReciprocalHalf(thinB + (mxkG - mnkG));
+#else
+    s *= ffxApproximateReciprocalHalf(thinB + (mxfG - minimumGreen));
+    t *= ffxApproximateReciprocalHalf(thinB + (mxgG - mngG));
+    u *= ffxApproximateReciprocalHalf(thinB + (mxjG - mnjG));
+    v *= ffxApproximateReciprocalHalf(thinB + (mxkG - mnkG));
+#endif  // #if defined(FFX_CAS_USE_PRECISE_MATH)
+
+    // Final weighting (only the green-derived q*G weights are consumed by the filter below).
+    FfxFloat16x2 qbeR = wfR * s;
+    FfxFloat16x2 qbeG = wfG * s;
+    FfxFloat16x2 qbeB = wfB * s;
+    FfxFloat16x2 qchR = wgR * t;
+    FfxFloat16x2 qchG = wgG * t;
+    FfxFloat16x2 qchB = wgB * t;
+    FfxFloat16x2 qfR  = wgR * t + wjR * u + s;
+    FfxFloat16x2 qfG  = wgG * t + wjG * u + s;
+    FfxFloat16x2 qfB  = wgB * t + wjB * u + s;
+    FfxFloat16x2 qgR  = wfR * s + wkR * v + t;
+    FfxFloat16x2 qgG  = wfG * s + wkG * v + t;
+    FfxFloat16x2 qgB  = wfB * s + wkB * v + t;
+    FfxFloat16x2 qjR  = wfR * s + wkR * v + u;
+    FfxFloat16x2 qjG  = wfG * s + wkG * v + u;
+    FfxFloat16x2 qjB  = wfB * s + wkB * v + u;
+    FfxFloat16x2 qkR  = wgR * t + wjR * u + v;
+    FfxFloat16x2 qkG  = wgG * t + wjG * u + v;
+    FfxFloat16x2 qkB  = wgB * t + wjB * u + v;
+    FfxFloat16x2 qinR = wjR * u;
+    FfxFloat16x2 qinG = wjG * u;
+    FfxFloat16x2 qinB = wjB * u;
+    FfxFloat16x2 qloR = wkR * v;
+    FfxFloat16x2 qloG = wkG * v;
+    FfxFloat16x2 qloB = wkB * v;
+
+    // Filter: normalize by the reciprocal of the summed green weights, then saturate each channel.
+#if defined(FFX_CAS_USE_PRECISE_MATH)
+    FfxFloat16x2 rcpWG = ffxReciprocalHalf(FFX_BROADCAST_FLOAT16X2(2.0) * qbeG + FFX_BROADCAST_FLOAT16X2(2.0) * qchG + FFX_BROADCAST_FLOAT16X2(2.0) * qinG +
+                                           FFX_BROADCAST_FLOAT16X2(2.0) * qloG + qfG + qgG + qjG + qkG);
+#else
+    FfxFloat16x2 rcpWG = ffxApproximateReciprocalMediumHalf(
+                            FFX_BROADCAST_FLOAT16X2(2.0) * qbeG + FFX_BROADCAST_FLOAT16X2(2.0) * qchG + FFX_BROADCAST_FLOAT16X2(2.0) * qinG + 
+                            FFX_BROADCAST_FLOAT16X2(2.0) * qloG + qfG + qgG + qjG + qkG);
+#endif  // #if defined(FFX_CAS_USE_PRECISE_MATH)
+
+    pixR = ffxSaturate(
+        (bR * qbeG + eR * qbeG + cR * qchG + hR * qchG + iR * qinG + nR * qinG + lR * qloG + oR * qloG + fR * qfG + gR * qgG + jR * qjG + kR * qkG) * rcpWG);
+    pixG = ffxSaturate(
+        (bG * qbeG + eG * qbeG + cG * qchG + hG * qchG + iG * qinG + nG * qinG + lG * qloG + oG * qloG + fG * qfG + gG * qgG + jG * qjG + kG * qkG) * rcpWG);
+    pixB = ffxSaturate(
+        (bB * qbeG + eB * qbeG + cB * qchG + hB * qchG + iB * qinG + nB * qinG + lB * qloG + oB * qloG + fB * qfG + gB * qgG + jB * qjG + kB * qkG) * rcpWG);
+}
+#endif // #if FFX_HALF == 1
+
+/// Apply the contrast adaptive sharpening (CAS) filter to a single pixel.
+///
+/// @param [out] pixR                   Filtered red channel. Scalar rather than vector so callers can switch between <c><i>ffxCasFilter</i></c> and <c><i>ffxCasFilterHalf</i></c>.
+/// @param [out] pixG                   Filtered green channel. Scalar rather than vector so callers can switch between <c><i>ffxCasFilter</i></c> and <c><i>ffxCasFilterHalf</i></c>.
+/// @param [out] pixB                   Filtered blue channel. Scalar rather than vector so callers can switch between <c><i>ffxCasFilter</i></c> and <c><i>ffxCasFilterHalf</i></c>.
+/// @param [in] samplePosition          Integer position of the pixel in the output.
+/// @param [in] const0                  First constant vector generated by <c><i>ffxCasSetup</i></c>.
+/// @param [in] const1                  Second constant vector generated by <c><i>ffxCasSetup</i></c>.
+/// @param [in] noScaling               Must be a compile-time literal value. True selects the sharpen-only path (no resizing); false selects the scaling path.
+///
+/// @ingroup FfxGPUCas
+void ffxCasFilter(
+    FFX_PARAMETER_OUT FfxFloat32 pixR,
+    FFX_PARAMETER_OUT FfxFloat32 pixG,
+    FFX_PARAMETER_OUT FfxFloat32 pixB,
+    FFX_PARAMETER_IN FfxUInt32x2 samplePosition,
+    FFX_PARAMETER_IN FfxUInt32x4 const0,
+    FFX_PARAMETER_IN FfxUInt32x4 const1,
+    FFX_PARAMETER_IN FfxBoolean noScaling)
+{
+#if defined(FFX_CAS_DEBUG_CHECKER)
+    // Debug aid: checker pattern of 256x256 tiles; tiles where bit 8 of (x ^ y) is clear skip the filter.
+    if (((samplePosition.x ^ samplePosition.y) & 256u) == 0u) {
+        // Pass the loaded pixel through with only the input transform applied.
+        FfxFloat32x3 unfiltered = casLoad(FfxInt32x2(samplePosition));
+        pixR = unfiltered.r;
+        pixG = unfiltered.g;
+        pixB = unfiltered.b;
+        casInput(pixR, pixG, pixB);
+        return;
+    }
+#endif // #if defined(FFX_CAS_DEBUG_CHECKER)
+
+    if (!noScaling) {
+        casFilterWithScaling(pixR, pixG, pixB, samplePosition, const0, const1);
+    } else {
+        casFilterNoScaling(pixR, pixG, pixB, samplePosition, const0, const1);
+    }
+}
+
+#if FFX_HALF == 1
+#if defined(FFX_HLSL)
+#if !defined(FFX_CAS_USE_PRECISE_MATH)
+// Missing a way to do packed re-interpretation, so must disable approximation optimizations.
+#define FFX_CAS_USE_PRECISE_MATH        (1)
+#endif // #if !defined(FFX_CAS_USE_PRECISE_MATH)
+#endif // #if defined(FFX_HLSL)
+
+/// A utility function which can be used to convert the packed SOA form results
+/// returned by <c><i>ffxCasFilterHalf</i></c> into AOS form data ready for storing.
+///
+/// The implementation of both <c><i>ffxCasDepackHalf</i></c> and <c><i>ffxCasFilterHalf</i></c> assumes
+/// that the pixels packed together are separated by 8 pixels in the X dimension.
+///
+/// It is suggested to only use <c><i>ffxCasDepackHalf</i></c> right before stores. This is to maintain packed
+/// math for any work after <c><i>ffxCasFilterHalf</i></c>.
+///
+/// An example might look as follows:
+///     ffxCasFilterHalf(cR, cG, cB, gxy, const0, const1, false);
+///     ...
+///     ffxCasDepackHalf(c0, c1, cR, cG, cB);
+///     imageStore(imgDst, FfxInt32x2(gxy), FfxFloat4(c0));
+///     imageStore(imgDst, FfxInt32x2(gxy) + FfxInt32x2(8, 0), FfxFloat4(c1));
+///
+/// @param [out] pix0                   First pixel: rgb receives the .x components of pixR/pixG/pixB. Alpha is zeroed only when FFX_HLSL is defined.
+/// @param [out] pix1                   Second pixel: rgb receives the .y components of pixR/pixG/pixB. Alpha is zeroed only when FFX_HLSL is defined.
+/// @param [in] pixR                    The red channel components of two packed pixels.
+/// @param [in] pixG                    The green channel components of two packed pixels.
+/// @param [in] pixB                    The blue channel components of two packed pixels.
+///
+/// @ingroup FfxGPUCas
+void ffxCasDepackHalf(
+    FFX_PARAMETER_OUT FfxFloat16x4 pix0,
+    FFX_PARAMETER_OUT FfxFloat16x4 pix1,
+    FFX_PARAMETER_IN FfxFloat16x2 pixR,
+    FFX_PARAMETER_IN FfxFloat16x2 pixG,
+    FFX_PARAMETER_IN FfxFloat16x2 pixB)
+{
+#ifdef FFX_HLSL
+    // Invoke a slower path for DX only, since it won't allow uninitialized values (the alpha of an out parameter must be written).
+    pix0.a = pix1.a = 0.0;
+#endif
+    pix0.rgb = FfxFloat16x3(pixR.x, pixG.x, pixB.x);
+    pix1.rgb = FfxFloat16x3(pixR.y, pixG.y, pixB.y);
+}
+
+/// Apply the contrast adaptive sharpening (CAS) filter to a pair of pixels.
+///
+/// Output values are for 2 separate 8x8 tiles in a 16x8 region.
+///     pix<R,G,B>.x =  left 8x8 tile (the pixel at samplePosition)
+///     pix<R,G,B>.y = right 8x8 tile (the pixel at samplePosition + (8, 0))
+/// This enables later processing to easily be packed as well.
+///
+/// @param [out] pixR                   Red channel output values of the two packed pixels (.x and .y as described above).
+/// @param [out] pixG                   Green channel output values of the two packed pixels (.x and .y as described above).
+/// @param [out] pixB                   Blue channel output values of the two packed pixels (.x and .y as described above).
+/// @param [in] samplePosition          The integer pixel position in the output.
+/// @param [in] const0                  The first constant generated by <c><i>ffxCasSetup</i></c>.
+/// @param [in] const1                  The second constant generated by <c><i>ffxCasSetup</i></c>.
+/// @param [in] noScaling               Must be a compile-time literal value. A value of true applies sharpening only (no resizing).
+///
+/// @ingroup FfxGPUCas
+void ffxCasFilterHalf(
+   FFX_PARAMETER_OUT FfxFloat16x2 pixR,
+   FFX_PARAMETER_OUT FfxFloat16x2 pixG,
+   FFX_PARAMETER_OUT FfxFloat16x2 pixB,
+   FFX_PARAMETER_IN FfxUInt32x2 samplePosition,
+   FFX_PARAMETER_IN FfxUInt32x4 const0,
+   FFX_PARAMETER_IN FfxUInt32x4 const1,
+   FFX_PARAMETER_IN FfxBoolean noScaling)
+{
+#if defined(FFX_CAS_DEBUG_CHECKER)
+    // Debug aid: a checker pattern of filtered/unfiltered 256x256-pixel tiles for visual inspection.
+    if ((((samplePosition.x ^ samplePosition.y) >> 8u) & 1u) == 0u) {
+        // "Off" tile: load both packed source pixels, run only the casInputHalf conversion, and skip the filter.
+        FfxFloat16x3 pix0 = casLoadHalf(FfxInt16x2(samplePosition));
+        FfxFloat16x3 pix1 = casLoadHalf(FfxInt16x2(samplePosition) + FfxInt16x2(8, 0));
+        pixR = FfxFloat16x2(pix0.r, pix1.r);
+        pixG = FfxFloat16x2(pix0.g, pix1.g);
+        pixB = FfxFloat16x2(pix0.b, pix1.b);
+        casInputHalf(pixR, pixG, pixB);
+        return;
+    }
+#endif // #if defined(FFX_CAS_DEBUG_CHECKER)
+
+    // No scaling algorithm uses minimal 3x3 pixel neighborhood.
+    if (noScaling) {
+        casFilterNoScalingHalf(pixR, pixG, pixB, samplePosition, const0, const1);
+    } else {
+        casFilterWithScalingHalf(pixR, pixG, pixB, samplePosition, const0, const1);
+    }
+}
+#endif // #if FFX_HALF == 1
+#endif // #if defined(FFX_GPU)
diff --git a/Shaders/shaders/cas/ffx_cas.h.meta b/Shaders/shaders/cas/ffx_cas.h.meta
new file mode 100644
index 0000000..928f82d
--- /dev/null
+++ b/Shaders/shaders/cas/ffx_cas.h.meta
@@ -0,0 +1,65 @@
+fileFormatVersion: 2
+guid: f674a479a9610d244a0b9f93b091b49d
+PluginImporter:
+  externalObjects: {}
+  serializedVersion: 2
+  iconMap: {}
+  executionOrder: {}
+  defineConstraints: []
+  isPreloaded: 0
+  isOverridable: 0
+  isExplicitlyReferenced: 0
+  validateReferences: 1
+  platformData:
+  - first:
+      : Any
+    second:
+      enabled: 0
+      settings:
+        Exclude Editor: 1
+        Exclude GameCoreScarlett: 1
+        Exclude GameCoreXboxOne: 1
+        Exclude Linux64: 1
+        Exclude OSXUniversal: 1
+        Exclude PS4: 1
+        Exclude PS5: 1
+        Exclude Win: 1
+        Exclude Win64: 1
+  - first:
+      Any: 
+    second:
+      enabled: 0
+      settings: {}
+  - first:
+      Editor: Editor
+    second:
+      enabled: 0
+      settings:
+        DefaultValueInitialized: true
+  - first:
+      Standalone: Linux64
+    second:
+      enabled: 0
+      settings:
+        CPU: None
+  - first:
+      Standalone: OSXUniversal
+    second:
+      enabled: 0
+      settings:
+        CPU: None
+  - first:
+      Standalone: Win
+    second:
+      enabled: 0
+      settings:
+        CPU: None
+  - first:
+      Standalone: Win64
+    second:
+      enabled: 0
+      settings:
+        CPU: None
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
diff --git a/Shaders/shaders/cas/ffx_cas_callbacks_hlsl.h b/Shaders/shaders/cas/ffx_cas_callbacks_hlsl.h
new file mode 100644
index 0000000..9d89e61
--- /dev/null
+++ b/Shaders/shaders/cas/ffx_cas_callbacks_hlsl.h
@@ -0,0 +1,226 @@
+// This file is part of the FidelityFX SDK.
+//
+// Copyright (C) 2024 Advanced Micro Devices, Inc.
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files(the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions :
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "ffx_cas_resources.h"
+
+#if defined(FFX_GPU)
+#ifdef __hlsl_dx_compiler
+#pragma dxc diagnostic push
+#pragma dxc diagnostic ignored "-Wambig-lit-shift"
+#endif //__hlsl_dx_compiler
+#include "../ffx_core.h"
+#ifdef __hlsl_dx_compiler
+#pragma dxc diagnostic pop
+#endif //__hlsl_dx_compiler
+
+#ifndef FFX_PREFER_WAVE64
+#define FFX_PREFER_WAVE64
+#endif // #ifndef FFX_PREFER_WAVE64
+
+#if defined(FFX_GPU)
+#pragma warning(disable: 3205)  // conversion from larger type to smaller
+#endif // #if defined(FFX_GPU)
+
+#define DECLARE_SRV_REGISTER(regIndex)  t##regIndex
+#define DECLARE_UAV_REGISTER(regIndex)  u##regIndex
+#define DECLARE_CB_REGISTER(regIndex)   b##regIndex
+#define FFX_CAS_DECLARE_SRV(regIndex)  register(DECLARE_SRV_REGISTER(regIndex))
+#define FFX_CAS_DECLARE_UAV(regIndex)  register(DECLARE_UAV_REGISTER(regIndex))
+#define FFX_CAS_DECLARE_CB(regIndex)   register(DECLARE_CB_REGISTER(regIndex))
+
+#if defined(CAS_BIND_CB_CAS)
+    cbuffer cbCAS : FFX_CAS_DECLARE_CB(CAS_BIND_CB_CAS)
+    {
+        FfxUInt32x4 const0;
+        FfxUInt32x4 const1;
+       #define FFX_CAS_CONSTANT_BUFFER_1_SIZE 8  // Number of 32-bit values. This must be kept in sync with the cbCAS size.
+    };
+#else
+    #define const0 0
+    #define const1 0
+#endif
+
+#if defined(FFX_GPU)
+#define FFX_CAS_ROOTSIG_STRINGIFY(p) FFX_CAS_ROOTSIG_STR(p)
+#define FFX_CAS_ROOTSIG_STR(p) #p
+#define FFX_CAS_ROOTSIG [RootSignature( "DescriptorTable(UAV(u0, numDescriptors = " FFX_CAS_ROOTSIG_STRINGIFY(FFX_CAS_RESOURCE_IDENTIFIER_COUNT) ")), " \
+                                    "DescriptorTable(SRV(t0, numDescriptors = " FFX_CAS_ROOTSIG_STRINGIFY(FFX_CAS_RESOURCE_IDENTIFIER_COUNT) ")), " \
+                                    "CBV(b0), " \
+                                    "StaticSampler(s0, filter = FILTER_MIN_MAG_MIP_LINEAR, " \
+                                                      "addressU = TEXTURE_ADDRESS_CLAMP, " \
+                                                      "addressV = TEXTURE_ADDRESS_CLAMP, " \
+                                                      "addressW = TEXTURE_ADDRESS_CLAMP, " \
+                                                      "comparisonFunc = COMPARISON_NEVER, " \
+                                                      "borderColor = STATIC_BORDER_COLOR_TRANSPARENT_BLACK)" )]
+
+#if defined(FFX_CAS_EMBED_ROOTSIG)
+#define FFX_CAS_EMBED_ROOTSIG_CONTENT FFX_CAS_ROOTSIG
+#else
+#define FFX_CAS_EMBED_ROOTSIG_CONTENT
+#endif // #if FFX_CAS_EMBED_ROOTSIG
+#endif // #if defined(FFX_GPU)
+
+
+FfxUInt32x4 Const0()  // First CAS constant vector from cbCAS; all zeros when CAS_BIND_CB_CAS is not defined.
+{
+#if defined(CAS_BIND_CB_CAS)
+    return const0;
+#else
+    return FfxUInt32x4(0u, 0u, 0u, 0u);  // type-correct zero: the return type is an unsigned vector, not a float
+#endif
+}
+
+FfxUInt32x4 Const1()  // Second CAS constant vector from cbCAS; all zeros when CAS_BIND_CB_CAS is not defined.
+{
+#if defined(CAS_BIND_CB_CAS)
+    return const1;
+#else
+    return FfxUInt32x4(0u, 0u, 0u, 0u);  // type-correct zero: the return type is an unsigned vector, not a float
+#endif
+}
+
+SamplerState s_LinearClamp : register(s0);
+
+    // SRVs
+    #if defined(CAS_BIND_SRV_INPUT_COLOR)
+        Texture2D<FfxFloat32x4>                   r_input_color                 : FFX_CAS_DECLARE_SRV(CAS_BIND_SRV_INPUT_COLOR);
+    #endif
+
+    // UAV declarations
+    #if defined(CAS_BIND_UAV_OUTPUT_COLOR)
+        RWTexture2D<FfxFloat32x4>                 rw_output_color               : FFX_CAS_DECLARE_UAV(CAS_BIND_UAV_OUTPUT_COLOR);
+    #endif
+
+#if FFX_HALF
+
+FfxFloat16x3 casLoadHalf(FFX_PARAMETER_IN FfxInt16x2 position)  // Reads r_input_color at the integer texel `position` and returns its RGB converted to 16-bit floats.
+{
+#if defined(CAS_BIND_SRV_INPUT_COLOR) 
+    return FfxFloat16x3(r_input_color.Load(FfxInt32x3(position, 0)).rgb);
+#else  // input texture not declared: nothing to read, return zero
+    return 0.f;
+#endif
+}
+
+// Transform the loaded channel values in place into a linear color space between 0 and 1, as selected by FFX_CAS_COLOR_SPACE_CONVERSION (other values: no change).
+void casInputHalf(FFX_PARAMETER_INOUT FfxFloat16x2 red, FFX_PARAMETER_INOUT FfxFloat16x2 green, FFX_PARAMETER_INOUT FfxFloat16x2 blue)
+{
+#if FFX_CAS_COLOR_SPACE_CONVERSION == 1    // gamma 2.0
+    red   *= red;
+    green *= green;
+    blue  *= blue;
+#elif FFX_CAS_COLOR_SPACE_CONVERSION == 2  // gamma 2.2
+    red   = ffxLinearFromGammaHalf(red, FfxFloat16(2.2f));
+    green = ffxLinearFromGammaHalf(green, FfxFloat16(2.2f));
+    blue  = ffxLinearFromGammaHalf(blue, FfxFloat16(2.2f));
+#elif FFX_CAS_COLOR_SPACE_CONVERSION == 3  // sRGB output (auto-degamma'd on sampler read)
+    // Intentionally empty: per the note on this branch the values are already linear when read.
+#elif FFX_CAS_COLOR_SPACE_CONVERSION == 4  // sRGB input/output
+    red   = ffxLinearFromSrgbHalf(red);
+    green = ffxLinearFromSrgbHalf(green);
+    blue  = ffxLinearFromSrgbHalf(blue);
+#endif
+}
+
+void casOutputHalf(FFX_PARAMETER_INOUT FfxFloat16x2 red, FFX_PARAMETER_INOUT FfxFloat16x2 green, FFX_PARAMETER_INOUT FfxFloat16x2 blue)
+{   // Applies, in place, the output-side encoding selected by FFX_CAS_COLOR_SPACE_CONVERSION; values matching no branch leave the channels unchanged.
+#if FFX_CAS_COLOR_SPACE_CONVERSION == 1    // gamma 2.0
+    red   = ffxSqrt(red);
+    green = ffxSqrt(green);
+    blue  = ffxSqrt(blue);
+#elif FFX_CAS_COLOR_SPACE_CONVERSION == 2  // gamma 2.2
+    red   = ffxGammaFromLinearHalf(red, FfxFloat16(1/2.2f));
+    green = ffxGammaFromLinearHalf(green, FfxFloat16(1/2.2f));
+    blue  = ffxGammaFromLinearHalf(blue, FfxFloat16(1/2.2f));
+#elif FFX_CAS_COLOR_SPACE_CONVERSION == 3  // sRGB output (auto-degamma'd on sampler read)
+    red   = ffxSrgbFromLinearHalf(red);
+    green = ffxSrgbFromLinearHalf(green);
+    blue  = ffxSrgbFromLinearHalf(blue);
+#elif FFX_CAS_COLOR_SPACE_CONVERSION == 4  // sRGB input/output
+    red   = ffxSrgbFromLinearHalf(red);
+    green = ffxSrgbFromLinearHalf(green);
+    blue  = ffxSrgbFromLinearHalf(blue);
+#endif
+}
+
+#else
+
+FfxFloat32x3 casLoad(FFX_PARAMETER_IN FfxInt32x2 position)  // Reads r_input_color at the integer texel `position` and returns its RGB.
+{
+#if defined(CAS_BIND_SRV_INPUT_COLOR) 
+    return r_input_color.Load(FfxInt32x3(position, 0)).rgb;
+#else  // input texture not declared: nothing to read, return zero
+    return 0.f;
+#endif
+}
+
+// Transform the loaded channel values in place into a linear color space between 0 and 1, as selected by FFX_CAS_COLOR_SPACE_CONVERSION (other values: no change).
+void casInput(FFX_PARAMETER_INOUT FfxFloat32 red, FFX_PARAMETER_INOUT FfxFloat32 green, FFX_PARAMETER_INOUT FfxFloat32 blue)
+{
+#if FFX_CAS_COLOR_SPACE_CONVERSION == 1    // gamma 2.0
+    red   *= red;
+    green *= green;
+    blue  *= blue;
+#elif FFX_CAS_COLOR_SPACE_CONVERSION == 2  // gamma 2.2
+    red   = ffxLinearFromGamma(red, FfxFloat32(2.2f));
+    green = ffxLinearFromGamma(green, FfxFloat32(2.2f));
+    blue  = ffxLinearFromGamma(blue, FfxFloat32(2.2f));
+#elif FFX_CAS_COLOR_SPACE_CONVERSION == 3  // sRGB output (auto-degamma'd on sampler read)
+    // Intentionally empty: per the note on this branch the values are already linear when read.
+#elif FFX_CAS_COLOR_SPACE_CONVERSION == 4  // sRGB input/output
+    red   = ffxLinearFromSrgb(red);
+    green = ffxLinearFromSrgb(green);
+    blue  = ffxLinearFromSrgb(blue);
+#endif
+}
+
+void casOutput(FFX_PARAMETER_INOUT FfxFloat32 red, FFX_PARAMETER_INOUT FfxFloat32 green, FFX_PARAMETER_INOUT FfxFloat32 blue)
+{   // Applies, in place, the output-side encoding selected by FFX_CAS_COLOR_SPACE_CONVERSION; values matching no branch leave the channels unchanged.
+#if FFX_CAS_COLOR_SPACE_CONVERSION == 1    // gamma 2.0
+    red   = ffxSqrt(red);
+    green = ffxSqrt(green);
+    blue  = ffxSqrt(blue);
+#elif FFX_CAS_COLOR_SPACE_CONVERSION == 2  // gamma 2.2
+    red   = ffxGammaFromLinear(red, FfxFloat32(1/2.2f));
+    green = ffxGammaFromLinear(green, FfxFloat32(1/2.2f));
+    blue  = ffxGammaFromLinear(blue, FfxFloat32(1/2.2f));
+#elif FFX_CAS_COLOR_SPACE_CONVERSION == 3  // sRGB output (auto-degamma'd on sampler read)
+    red   = ffxSrgbFromLinear(red);
+    green = ffxSrgbFromLinear(green);
+    blue  = ffxSrgbFromLinear(blue);
+#elif FFX_CAS_COLOR_SPACE_CONVERSION == 4  // sRGB input/output
+    red   = ffxSrgbFromLinear(red);
+    green = ffxSrgbFromLinear(green);
+    blue  = ffxSrgbFromLinear(blue);
+#endif
+}
+
+#endif  // FFX_HALF
+
+void casStoreOutput(FfxInt32x2 iPxPos, FfxFloat32x4 fColor)  // Writes fColor to rw_output_color at texel iPxPos; does nothing when CAS_BIND_UAV_OUTPUT_COLOR is not defined.
+{
+#if defined(CAS_BIND_UAV_OUTPUT_COLOR) 
+    rw_output_color[iPxPos] = fColor;
+#endif
+}
+
+#endif // #if defined(FFX_GPU)
diff --git a/Shaders/shaders/cas/ffx_cas_callbacks_hlsl.h.meta b/Shaders/shaders/cas/ffx_cas_callbacks_hlsl.h.meta
new file mode 100644
index 0000000..59fd86f
--- /dev/null
+++ b/Shaders/shaders/cas/ffx_cas_callbacks_hlsl.h.meta
@@ -0,0 +1,65 @@
+fileFormatVersion: 2
+guid: 75c2da7c7b951b940adb44e9342dd303
+PluginImporter:
+  externalObjects: {}
+  serializedVersion: 2
+  iconMap: {}
+  executionOrder: {}
+  defineConstraints: []
+  isPreloaded: 0
+  isOverridable: 0
+  isExplicitlyReferenced: 0
+  validateReferences: 1
+  platformData:
+  - first:
+      : Any
+    second:
+      enabled: 0
+      settings:
+        Exclude Editor: 1
+        Exclude GameCoreScarlett: 1
+        Exclude GameCoreXboxOne: 1
+        Exclude Linux64: 1
+        Exclude OSXUniversal: 1
+        Exclude PS4: 1
+        Exclude PS5: 1
+        Exclude Win: 1
+        Exclude Win64: 1
+  - first:
+      Any: 
+    second:
+      enabled: 0
+      settings: {}
+  - first:
+      Editor: Editor
+    second:
+      enabled: 0
+      settings:
+        DefaultValueInitialized: true
+  - first:
+      Standalone: Linux64
+    second:
+      enabled: 0
+      settings:
+        CPU: None
+  - first:
+      Standalone: OSXUniversal
+    second:
+      enabled: 0
+      settings:
+        CPU: None
+  - first:
+      Standalone: Win
+    second:
+      enabled: 0
+      settings:
+        CPU: None
+  - first:
+      Standalone: Win64
+    second:
+      enabled: 0
+      settings:
+        CPU: None
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
diff --git a/Shaders/shaders/cas/ffx_cas_resources.h b/Shaders/shaders/cas/ffx_cas_resources.h
new file mode 100644
index 0000000..2ea1adb
--- /dev/null
+++ b/Shaders/shaders/cas/ffx_cas_resources.h
@@ -0,0 +1,41 @@
+// This file is part of the FidelityFX SDK.
+//
+// Copyright (C) 2024 Advanced Micro Devices, Inc.
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files(the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions :
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef FFX_CAS_RESOURCES_H
+#define FFX_CAS_RESOURCES_H
+
+#if defined(FFX_CPU) || defined(FFX_GPU)
+
+#define FFX_CAS_RESOURCE_IDENTIFIER_NULL                       0
+#define FFX_CAS_RESOURCE_IDENTIFIER_INPUT_COLOR                1
+#define FFX_CAS_RESOURCE_IDENTIFIER_OUTPUT_COLOR               2
+
+#define FFX_CAS_RESOURCE_IDENTIFIER_COUNT                      3
+
+// CBV resource definitions
+#define FFX_CAS_CONSTANTBUFFER_IDENTIFIER_CAS                  0
+
+#define FFX_CAS_CONSTANTBUFFER_IDENTIFIER_COUNT                1
+
+#endif  // #if defined(FFX_CPU) || defined(FFX_GPU)
+
+#endif  // FFX_CAS_RESOURCES_H
diff --git a/Shaders/shaders/cas/ffx_cas_resources.h.meta b/Shaders/shaders/cas/ffx_cas_resources.h.meta
new file mode 100644
index 0000000..65da05c
--- /dev/null
+++ b/Shaders/shaders/cas/ffx_cas_resources.h.meta
@@ -0,0 +1,65 @@
+fileFormatVersion: 2
+guid: 178b95414522b1349920c12ff1ddc925
+PluginImporter:
+  externalObjects: {}
+  serializedVersion: 2
+  iconMap: {}
+  executionOrder: {}
+  defineConstraints: []
+  isPreloaded: 0
+  isOverridable: 0
+  isExplicitlyReferenced: 0
+  validateReferences: 1
+  platformData:
+  - first:
+      : Any
+    second:
+      enabled: 0
+      settings:
+        Exclude Editor: 1
+        Exclude GameCoreScarlett: 1
+        Exclude GameCoreXboxOne: 1
+        Exclude Linux64: 1
+        Exclude OSXUniversal: 1
+        Exclude PS4: 1
+        Exclude PS5: 1
+        Exclude Win: 1
+        Exclude Win64: 1
+  - first:
+      Any: 
+    second:
+      enabled: 0
+      settings: {}
+  - first:
+      Editor: Editor
+    second:
+      enabled: 0
+      settings:
+        DefaultValueInitialized: true
+  - first:
+      Standalone: Linux64
+    second:
+      enabled: 0
+      settings:
+        CPU: None
+  - first:
+      Standalone: OSXUniversal
+    second:
+      enabled: 0
+      settings:
+        CPU: None
+  - first:
+      Standalone: Win
+    second:
+      enabled: 0
+      settings:
+        CPU: None
+  - first:
+      Standalone: Win64
+    second:
+      enabled: 0
+      settings:
+        CPU: None
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
diff --git a/Shaders/shaders/cas/ffx_cas_sharpen.h b/Shaders/shaders/cas/ffx_cas_sharpen.h
new file mode 100644
index 0000000..3e42f98
--- /dev/null
+++ b/Shaders/shaders/cas/ffx_cas_sharpen.h
@@ -0,0 +1,89 @@
+// This file is part of the FidelityFX SDK.
+//
+// Copyright (C) 2024 Advanced Micro Devices, Inc.
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files(the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions :
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "../ffx_core.h"
+
+#if FFX_HALF
+
+#define FFX_CAS_PACKED_ONLY 1
+
+#endif  // FFX_HALF
+
+#include "ffx_cas.h"
+
+void Sharpen(FfxUInt32x3 LocalThreadId, FfxUInt32x3 WorkGroupId, FfxUInt32x3 Dtid)
+{
+    // Work-group origins are spaced 16 pixels apart in X and Y (WorkGroupId << 4); the local thread index is remapped within that for a more PS-like swizzle pattern. Dtid is not read.
+    FfxUInt32x2 gxy = ffxRemapForQuad(LocalThreadId.x) + FfxUInt32x2(WorkGroupId.x << 4u, WorkGroupId.y << 4u);
+
+    FfxBoolean sharpenOnly;  // forwarded as the noScaling argument of the filter calls below
+#if FFX_CAS_OPTION_SHARPEN_ONLY
+    sharpenOnly = true;
+#else
+    sharpenOnly = false;
+#endif  // FFX_CAS_OPTION_SHARPEN_ONLY
+
+#if FFX_HALF
+    // Packed path: each ffxCasFilterHalf call yields two pixels, stored at gxy and gxy + (8, 0); the second call repeats this 8 rows further down.
+    // NOTE(review): alpha here is whatever ffxCasDepackHalf writes (0 under FFX_HLSL), whereas the scalar path stores 1 - confirm this difference is intended.
+    FfxFloat16x4 c0, c1;
+    FfxFloat16x2 cR, cG, cB;
+
+    ffxCasFilterHalf(cR, cG, cB, gxy, Const0(), Const1(), sharpenOnly);
+    casOutputHalf(cR, cG, cB);
+    ffxCasDepackHalf(c0, c1, cR, cG, cB);
+    casStoreOutput(FfxInt32x2(gxy), FfxFloat32x4(c0));
+    casStoreOutput(FfxInt32x2(gxy) + FfxInt32x2(8, 0), FfxFloat32x4(c1));
+    gxy.y += 8u;
+
+    ffxCasFilterHalf(cR, cG, cB, gxy, Const0(), Const1(), sharpenOnly);
+    casOutputHalf(cR, cG, cB);
+    ffxCasDepackHalf(c0, c1, cR, cG, cB);
+    casStoreOutput(FfxInt32x2(gxy), FfxFloat32x4(c0));
+    casStoreOutput(FfxInt32x2(gxy) + FfxInt32x2(8, 0), FfxFloat32x4(c1));
+
+#else
+
+    // Scalar path: one pixel per ffxCasFilter call, visiting gxy, then +8 in X, then +8 in Y, then -8 in X; each is stored with alpha 1.
+    FfxFloat32x3 c;
+
+    ffxCasFilter(c.r, c.g, c.b, gxy, Const0(), Const1(), sharpenOnly);
+    casOutput(c.r, c.g, c.b);
+    casStoreOutput(FfxInt32x2(gxy), FfxFloat32x4(c, 1));
+    gxy.x += 8u;
+
+    ffxCasFilter(c.r, c.g, c.b, gxy, Const0(), Const1(), sharpenOnly);
+    casOutput(c.r, c.g, c.b);
+    casStoreOutput(FfxInt32x2(gxy), FfxFloat32x4(c, 1));
+    gxy.y += 8u;
+
+    ffxCasFilter(c.r, c.g, c.b, gxy, Const0(), Const1(), sharpenOnly);
+    casOutput(c.r, c.g, c.b);
+    casStoreOutput(FfxInt32x2(gxy), FfxFloat32x4(c, 1));
+    gxy.x -= 8u;
+
+    ffxCasFilter(c.r, c.g, c.b, gxy, Const0(), Const1(), sharpenOnly);
+    casOutput(c.r, c.g, c.b);
+    casStoreOutput(FfxInt32x2(gxy), FfxFloat32x4(c, 1));
+
+#endif  // FFX_HALF
+}
diff --git a/Shaders/shaders/cas/ffx_cas_sharpen.h.meta b/Shaders/shaders/cas/ffx_cas_sharpen.h.meta
new file mode 100644
index 0000000..26e5d42
--- /dev/null
+++ b/Shaders/shaders/cas/ffx_cas_sharpen.h.meta
@@ -0,0 +1,65 @@
+fileFormatVersion: 2
+guid: 4a24b15e191a20745a2da66e8ff76069
+PluginImporter:
+  externalObjects: {}
+  serializedVersion: 2
+  iconMap: {}
+  executionOrder: {}
+  defineConstraints: []
+  isPreloaded: 0
+  isOverridable: 0
+  isExplicitlyReferenced: 0
+  validateReferences: 1
+  platformData:
+  - first:
+      : Any
+    second:
+      enabled: 0
+      settings:
+        Exclude Editor: 1
+        Exclude GameCoreScarlett: 1
+        Exclude GameCoreXboxOne: 1
+        Exclude Linux64: 1
+        Exclude OSXUniversal: 1
+        Exclude PS4: 1
+        Exclude PS5: 1
+        Exclude Win: 1
+        Exclude Win64: 1
+  - first:
+      Any: 
+    second:
+      enabled: 0
+      settings: {}
+  - first:
+      Editor: Editor
+    second:
+      enabled: 0
+      settings:
+        DefaultValueInitialized: true
+  - first:
+      Standalone: Linux64
+    second:
+      enabled: 0
+      settings:
+        CPU: None
+  - first:
+      Standalone: OSXUniversal
+    second:
+      enabled: 0
+      settings:
+        CPU: None
+  - first:
+      Standalone: Win
+    second:
+      enabled: 0
+      settings:
+        CPU: None
+  - first:
+      Standalone: Win64
+    second:
+      enabled: 0
+      settings:
+        CPU: None
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
diff --git a/Shaders/shaders/ffx_cas_sharpen_pass.hlsl b/Shaders/shaders/ffx_cas_sharpen_pass.hlsl
new file mode 100644
index 0000000..0b01c2c
--- /dev/null
+++ b/Shaders/shaders/ffx_cas_sharpen_pass.hlsl
@@ -0,0 +1,54 @@
+// This file is part of the FidelityFX SDK.
+//
+// Copyright (C) 2024 Advanced Micro Devices, Inc.
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files(the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions :
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+// CAS pass
+// SRV  0 : CAS_InputColor             : r_input_color
+// UAV  0 : CAS_OutputColor            : rw_output_color
+// CB   0 : cbCAS
+
+#define CAS_BIND_SRV_INPUT_COLOR               0
+#define CAS_BIND_UAV_OUTPUT_COLOR              0
+#define CAS_BIND_CB_CAS                        0
+
+#include "cas/ffx_cas_callbacks_hlsl.h"
+#include "cas/ffx_cas_sharpen.h"
+
+#ifndef FFX_CAS_THREAD_GROUP_WIDTH
+#define FFX_CAS_THREAD_GROUP_WIDTH 64
+#endif  // #ifndef FFX_CAS_THREAD_GROUP_WIDTH
+#ifndef FFX_CAS_THREAD_GROUP_HEIGHT
+#define FFX_CAS_THREAD_GROUP_HEIGHT 1
+#endif  // #ifndef FFX_CAS_THREAD_GROUP_HEIGHT
+#ifndef FFX_CAS_THREAD_GROUP_DEPTH
+#define FFX_CAS_THREAD_GROUP_DEPTH 1
+#endif  // #ifndef FFX_CAS_THREAD_GROUP_DEPTH
+#ifndef FFX_CAS_NUM_THREADS
+#define FFX_CAS_NUM_THREADS [numthreads(FFX_CAS_THREAD_GROUP_WIDTH, FFX_CAS_THREAD_GROUP_HEIGHT, FFX_CAS_THREAD_GROUP_DEPTH)]
+#endif  // #ifndef FFX_CAS_NUM_THREADS
+// Compute shader entry point: forwards the system-value thread and group ids to Sharpen().
+FFX_PREFER_WAVE64
+FFX_CAS_NUM_THREADS
+FFX_CAS_EMBED_ROOTSIG_CONTENT
+void CS(uint3 LocalThreadId : SV_GroupThreadID, uint3 WorkGroupId : SV_GroupID, uint3 Dtid : SV_DispatchThreadID)
+{
+    Sharpen(LocalThreadId, WorkGroupId, Dtid);
+}
diff --git a/Shaders/shaders/ffx_cas_sharpen_pass.hlsl.meta b/Shaders/shaders/ffx_cas_sharpen_pass.hlsl.meta
new file mode 100644
index 0000000..29dced9
--- /dev/null
+++ b/Shaders/shaders/ffx_cas_sharpen_pass.hlsl.meta
@@ -0,0 +1,7 @@
+fileFormatVersion: 2
+guid: b9b8c665d9f11a44e9ca915bb9ce0225
+ShaderIncludeImporter:
+  externalObjects: {}
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: