#include "Packages/com.unity.render-pipelines.high-definition/Runtime/PostProcessing/Shaders/ExposureCommon.hlsl"

#pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch

#pragma kernel KFixedExposure
#pragma kernel KManualCameraExposure
#pragma kernel KPrePass
#pragma kernel KReduction
#pragma kernel KReset

// Source texture read by KReduction; only the .xy channels of each texel are used there.
TEXTURE2D(_InputTexture);

// Edge length of the pre-pass target: KPrePass passes its reciprocal to GetPositionInput and the size itself to WeightSample.
#define PREPASS_TEX_SIZE 1024.0
// Half of PREPASS_TEX_SIZE. Not referenced by the kernels visible in this file; presumably consumed elsewhere - TODO confirm.
#define PREPASS_TEX_HALF_SIZE 512.0
//#pragma enable_d3d11_debug_symbols

//
// Fixed exposure
// Doesn't do anything fancy, simply copies the exposure & clamp values set in the volume system
//
[numthreads(1,1,1)]
void KFixedExposure(uint2 dispatchThreadId : SV_DispatchThreadID)
{
    // Fixed EV100 with the exposure compensation subtracted
    const float compensatedEV100 = ParamEV100 - ParamExposureCompensation;

    // x: exposure converted from the compensated EV100, y: the compensated EV100 itself
    const float exposure = ConvertEV100ToExposure(compensatedEV100, LensImperfectionExposureScale);
    _OutputTexture[dispatchThreadId] = float2(exposure, compensatedEV100);
}

//
// Manual camera exposure
// Converts aperture / shutter speed / iso / compensation to EV100
//
[numthreads(1,1,1)]
void KManualCameraExposure(uint2 dispatchThreadId : SV_DispatchThreadID)
{
    // EV100 derived from the physical camera settings, then offset by the exposure compensation
    const float cameraEV100 = ComputeEV100(ParamAperture, ParamShutterSpeed, ParamISO);
    const float compensatedEV100 = cameraEV100 - ParamExposureCompensation;

    // x: exposure converted from the compensated EV100, y: the compensated EV100 itself
    const float exposure = ConvertEV100ToExposure(compensatedEV100, LensImperfectionExposureScale);
    _OutputTexture[dispatchThreadId] = float2(exposure, compensatedEV100);
}

//
// Average luminance pre-pass
// Transforms the input to log luminance in a square-POT target
//
[numthreads(8,8,1)]
void KPrePass(uint2 dispatchThreadId : SV_DispatchThreadID)
{
    // For XR, alternate the single-pass views between neighbouring threads (checkerboard pattern)
    UNITY_XR_ASSIGN_VIEW_INDEX((dispatchThreadId.x + dispatchThreadId.y) % _XRViewCount)

    // Position data for this thread within the PREPASS_TEX_SIZE-wide target
    PositionInputs coords = GetPositionInput(float2(dispatchThreadId), rcp(PREPASS_TEX_SIZE), uint2(8u, 8u));

    // Luminance fetched at the clamped & scaled UV
    float sceneLuminance = SampleLuminance(ClampAndScaleUVForBilinear(coords.positionNDC));

    // Second output channel: weight of this sample
    float sampleWeight = WeightSample(dispatchThreadId, PREPASS_TEX_SIZE.xx, sceneLuminance);

    // First output channel: luminance floored at 1e-4, then converted to EV100
    float sampleEV100 = ComputeEV100FromAvgLuminance(max(sceneLuminance, 1e-4), MeterCalibrationConstant);

    _OutputTexture[coords.positionSS] = float2(sampleEV100, sampleWeight);
}

//
// Average luminance 2nd & 3rd pass + Evaluation
//   - 2nd: Reduction 1024 -> 32
//   - 3rd: Reduction 32 -> 1
//
// Thread-group edge length used by KReduction's numthreads attribute
#define REDUCTION_GROUP_SIZE 16
// Threads per group (REDUCTION_GROUP_SIZE * REDUCTION_GROUP_SIZE); also the length of the groupshared arrays below
#define REDUCTION_TOTAL_THREADS 256

// One entry per thread: the four texel values loaded by that thread, each multiplied by its weight
groupshared float4 gs_luminances[REDUCTION_TOTAL_THREADS];
// One entry per thread: the sum of the four texel weights loaded by that thread
groupshared float gs_weights[REDUCTION_TOTAL_THREADS];

// This kernel runs twice, and as the final output, produces the average normalized luminance of the texture produced by
// the pre-pass.
//
// Let's work through the math, but with a simplified example. Instead of a 2D texture, let's assume we have a 1D
// texture. And instead of a 1024 -> 32 -> 1 reduction, let's assume we have a 4 -> 2 -> 1 reduction.
//
// Say the input texture has the following four pixels: (a, A), (b, B), (c, C), (d, D). The first channel of each pixel
// is the log luminance, and the second channel is the weight.
//
// The first pass combines two pixels per thread, and outputs the following two-pixel two-channel intermediate texture:
// ((a*A + b*B) / (A + B), (A + B)), ((c*C + d*D) / (C + D), (C + D))
// The second pass calculates exposure as follows:
// ((a*A + b*B) / (A + B) * (A + B) + (c*C + d*D) / (C + D) * (C + D)) / (A + B + C + D)
// which simplifies to:
// (a*A + b*B + c*C + d*D) / (A + B + C + D)
// which is the normalized weighted average of the log luminances. We can thus work with weights that don't have to sum
// up to 1.
//
// Notice that the division by (A + B) in the first pass is cancelled out by the multiplication by (A + B) in the
// second pass. This is done for two reasons: it enables parallel reduction, and it keeps the values of the
// intermediate texture in a reasonable range to fit in the fp16 data format. We spend a bit more ALU, but we avoid
// fp16 quantization artifacts.
[numthreads(REDUCTION_GROUP_SIZE,REDUCTION_GROUP_SIZE,1)]
void KReduction(uint2 groupId : SV_GroupID, uint2 groupThreadId : SV_GroupThreadID)
{
    // Flattened index of this thread inside the group; doubles as its slot in the groupshared arrays
    const uint slot = groupThreadId.x + groupThreadId.y * REDUCTION_GROUP_SIZE;

    // Each thread covers a 2x2 texel footprint of the input
    const uint2 texel = 2u * (groupThreadId.xy + groupId.xy * REDUCTION_GROUP_SIZE);

    // Load the four (value, weight) pairs of the footprint
    const float2 s00 = _InputTexture[texel + uint2(0u, 0u)].xy;
    const float2 s10 = _InputTexture[texel + uint2(1u, 0u)].xy;
    const float2 s01 = _InputTexture[texel + uint2(0u, 1u)].xy;
    const float2 s11 = _InputTexture[texel + uint2(1u, 1u)].xy;

    const float4 texelValues = float4(s00.x, s10.x, s01.x, s11.x);
    const float4 texelWeights = float4(s00.y, s10.y, s01.y, s11.y);

    // Stash the weighted values and the summed weights in groupshared memory
    gs_luminances[slot] = texelValues * texelWeights;
    gs_weights[slot] = dot(texelWeights, 1.0);

    GroupMemoryBarrierWithGroupSync();

    // Tree reduction: halve the number of active slots each step until slot 0 holds the group totals
    UNITY_UNROLL
    for (uint stride = REDUCTION_TOTAL_THREADS / 2u; stride > 0u; stride >>= 1u)
    {
        if (slot < stride)
        {
            gs_luminances[slot] += gs_luminances[slot + stride];
            gs_weights[slot] += gs_weights[slot + stride];
        }

        // Every thread must reach the barrier, including those that no longer accumulate
        GroupMemoryBarrierWithGroupSync();
    }

    // Only the first thread of the group evaluates and writes the result
    if (slot != 0u)
        return;

    const float totalWeight = gs_weights[0];
    float average = dot(gs_luminances[0], 0.25);

    // Replace a non-finite sum with 1.0.
    // NOTE(review): this fallback is applied before the weight normalization below, so the 1.0 is still
    // divided by the weight term when the total weight is positive - confirm this is intended.
    if (IsNaN(average) || IsInf(average))
        average = 1.0;

    // Normalize by the accumulated weight; skipped when the total weight is not positive
    if (totalWeight > 0.0)
        average /= (totalWeight * 0.25);

    UNITY_BRANCH
    switch (ParamEvaluateMode)
    {
        case 1u:
        {
            // Automatic: adapt the compensated average, then clamp it to the configured limits
            float adaptedEV = AdaptExposure(average - ParamExposureCompensation);
            adaptedEV = clamp(adaptedEV, ParamExposureLimitMin, ParamExposureLimitMax);
            _OutputTexture[groupId.xy] = float2(ConvertEV100ToExposure(adaptedEV, LensImperfectionExposureScale), adaptedEV);
            break;
        }
        case 2u:
        {
            // Curve remapping.
            // The limits are kept as mutable locals and handed to CurveRemap as variables: its signature is not
            // visible here and it may write to them, so they must not be replaced by the Param* expressions.
            float limitLo = ParamExposureLimitMin;
            float limitHi = ParamExposureLimitMax;
            float remappedEV = CurveRemap(average, limitLo, limitHi);
            remappedEV = AdaptExposure(remappedEV - ParamExposureCompensation);
            remappedEV = clamp(remappedEV, limitLo, limitHi);
            _OutputTexture[groupId.xy] = float2(ConvertEV100ToExposure(remappedEV, LensImperfectionExposureScale), remappedEV);
            break;
        }
        default:
        {
            // No evaluation: forward the normalized average and the total weight to the next pass.
            // This is only used when going from 1024 to 32.
            _OutputTexture[groupId.xy] = float2(average, totalWeight);
            break;
        }
    }
}

//
// Reset the exposure texture to a default state (1,0)
//
[numthreads(1, 1, 1)]
void KReset(uint2 dispatchThreadId : SV_DispatchThreadID)
{
    // Default state written to the exposure texture: (1, 0)
    const float2 defaultState = float2(1.0, 0.0);
    _OutputTexture[dispatchThreadId] = defaultState;
}