#if !defined INCLUDE_UTILITY_SPHERICALHARMONICS
#define INCLUDE_UTILITY_SPHERICALHARMONICS
    // Evaluates the first nine real spherical-harmonic basis functions
    // (bands l = 0, 1, 2) for the direction r.
    //
    // r is first passed through fNormalize (defined elsewhere; presumably it
    // returns the unit-length vector - TODO confirm).
    //
    // Returned layout:
    //   [0]    band 0 (constant term)
    //   [1..3] band 1, proportional to y, z, x
    //   [4..8] band 2, proportional to xy, yz, 3z^2 - 1, zx, x^2 - y^2
    float[9] SHCoeffs(vec3 r) {
        vec3 dir = fNormalize(r);

        // Normalization factors, one per distinct basis-function shape.
        float k0  = (1.0 / 2.0) * sqrt(1.0 / pi);
        float k1  = sqrt(3.0 / (4.0 * pi));
        float k2a = (1.0 / 2.0) * sqrt(15.0 / pi);  // xy, yz, zx
        float k2b = (1.0 / 4.0) * sqrt(5.0 / pi);   // 3z^2 - 1
        float k2c = (1.0 / 4.0) * sqrt(15.0 / pi);  // x^2 - y^2

        float[9] sh;

        // Band 0
        sh[0] = k0;

        // Band 1
        sh[1] = k1 * dir.y;
        sh[2] = k1 * dir.z;
        sh[3] = k1 * dir.x;

        // Band 2
        sh[4] = k2a * (dir.x * dir.y);
        sh[5] = k2a * (dir.y * dir.z);
        sh[6] = k2b * (3.0 * dir.z * dir.z - 1.0);
        sh[7] = k2a * (dir.z * dir.x);
        sh[8] = k2c * ((dir.x * dir.x) - (dir.y * dir.y));

        return sh;
    }

    // Taken from https://patapom.com/blog/SHPortal/#signal-triple-product
    // From John Snyder (appendix A8)
    // https://www.microsoft.com/en-us/research/publication/code-generation-and-factoring-for-fast-evaluation-of-low-order-spherical-harmonic-products-and-squares/
    //
    // Performs the SH triple product r = a * b.
    //
    // a, b : 9 SH coefficients each, one vec3 per coefficient. All arithmetic
    //        below is component-wise on the vec3s (presumably one SH vector per
    //        colour channel - TODO confirm with callers).
    // returns the 9 coefficients of the product.
    //
    // How to read the body: every section is labelled "[i,j]: k,...".
    //   - ta / tb gather the listed a[k] / b[k], each scaled by its constant.
    //   - r[i] (and r[j] when i != j) receive the cross terms ta*b[..] + tb*a[..].
    //   - t is the product of the i and j inputs (symmetrised when i != j) and is
    //     scattered back into every listed r[k] with the same constants.
    //
    // IMPORTANT: the first write to each r[n] uses "=" and every later one uses
    // "+=". r is not initialised up front, so the sections must stay in this
    // order (or r must be zeroed first) if the code is ever rearranged.
    vec3[9] SHProduct(const vec3[9] a, const vec3[9] b) {
        vec3 ta, tb, t;
        vec3[9] r;

        // Coupling weights used by the sections below (values from the cited code).
        // C0 is numerically 1 / (2 * sqrt(pi)), i.e. the band-0 basis value.
        const float C0 = 0.282094792935999980;
        const float C1 = -0.126156626101000010;
        const float C2 = 0.218509686119999990;
        const float C3 = 0.252313259986999990;
        const float C4 = 0.180223751576000010;
        const float C5 = 0.156078347226000000;
        const float C6 = 0.090111875786499998;

        // [0,0]: 0,
        // first write to r[0]
        r[0] = C0*a[0]*b[0];

        // [1,1]: 0,6,8,
        // first writes to r[1], r[6], r[8]
        ta = C0*a[0]+C1*a[6]-C2*a[8];
        tb = C0*b[0]+C1*b[6]-C2*b[8];
        r[1] = ta*b[1]+tb*a[1];
        t = a[1]*b[1];
        r[0] += C0*t;
        r[6] = C1*t;
        r[8] = -C2*t;

        // [1,2]: 5,
        // first writes to r[2], r[5]
        ta = C2*a[5];
        tb = C2*b[5];
        r[1] += ta*b[2]+tb*a[2];
        r[2] = ta*b[1]+tb*a[1];
        t = a[1]*b[2]+a[2]*b[1];
        r[5] = C2*t;

        // [1,3]: 4,
        // first writes to r[3], r[4]
        ta = C2*a[4];
        tb = C2*b[4];
        r[1] += ta*b[3]+tb*a[3];
        r[3] = ta*b[1]+tb*a[1];
        t = a[1]*b[3]+a[3]*b[1];
        r[4] = C2*t;

        // [2,2]: 0,6,
        ta = C0*a[0]+C3*a[6];
        tb = C0*b[0]+C3*b[6];
        r[2] += ta*b[2]+tb*a[2];
        t = a[2]*b[2];
        r[0] += C0*t;
        r[6] += C3*t;

        // [2,3]: 7,
        // first write to r[7]; from here on every r[n] has been initialised
        ta = C2*a[7];
        tb = C2*b[7];
        r[2] += ta*b[3]+tb*a[3];
        r[3] += ta*b[2]+tb*a[2];
        t = a[2]*b[3]+a[3]*b[2];
        r[7] = C2*t;

        // [3,3]: 0,6,8,
        ta = C0*a[0]+C1*a[6]+C2*a[8];
        tb = C0*b[0]+C1*b[6]+C2*b[8];
        r[3] += ta*b[3]+tb*a[3];
        t = a[3]*b[3];
        r[0] += C0*t;
        r[6] += C1*t;
        r[8] += C2*t;

        // [4,4]: 0,6,
        ta = C0*a[0]-C4*a[6];
        tb = C0*b[0]-C4*b[6];
        r[4] += ta*b[4]+tb*a[4];
        t = a[4]*b[4];
        r[0] += C0*t;
        r[6] -= C4*t;

        // [4,5]: 7,
        ta = C5*a[7];
        tb = C5*b[7];
        r[4] += ta*b[5]+tb*a[5];
        r[5] += ta*b[4]+tb*a[4];
        t = a[4]*b[5]+a[5]*b[4];
        r[7] += C5*t;

        // [5,5]: 0,6,8,
        ta = C0*a[0]+C6*a[6]-C5*a[8];
        tb = C0*b[0]+C6*b[6]-C5*b[8];
        r[5] += ta*b[5]+tb*a[5];
        t = a[5]*b[5];
        r[0] += C0*t;
        r[6] += C6*t;
        r[8] -= C5*t;

        // [6,6]: 0,6,
        // ta/tb carry only the a[0]/b[0] term here; the 6-6-6 coupling is the
        // single "r[6] += C4*t" below.
        ta = C0*a[0];
        tb = C0*b[0];
        r[6] += ta*b[6]+tb*a[6];
        t = a[6]*b[6];
        r[0] += C0*t;
        r[6] += C4*t;

        // [7,7]: 0,6,8,
        ta = C0*a[0]+C6*a[6]+C5*a[8];
        tb = C0*b[0]+C6*b[6]+C5*b[8];
        r[7] += ta*b[7]+tb*a[7];
        t = a[7]*b[7];
        r[0] += C0*t;
        r[6] += C6*t;
        r[8] += C5*t;

        // [8,8]: 0,6,
        ta = C0*a[0]-C4*a[6];
        tb = C0*b[0]-C4*b[6];
        r[8] += ta*b[8]+tb*a[8];
        t = a[8]*b[8];
        r[0] += C0*t;
        r[6] -= C4*t;
        // entry count=13 (the 13 "[i,j]" sections above)
        // multiplications count=120
        // addition count=74

        return r;
    }

    /* wip stuff
    // general matrix multiplication macro
    #define MatrixMult(A, B, C) \
    for (int i = 0; i < A.length(); ++i) { \
        for (int j = 0; j < B[0].length(); ++j) { \
            C[i][j] = 0.0;
            for (int k = 0; k < A[0].length(); ++k) {
                C[i][j] += A[i][k] * B[k][j]; \
            } \
        } \
    }

    vec3[9] SHRotate(mat3 m, vec3[9] x) {
        // band 0 is untouched
        // band 1 is a standard vec3 rotation
        vec3 tmp = m * vec3(x[1].x, x[2].x, x[3].x);
        x[1].x = tmp.x; x[2].x = tmp.y; x[3].x = tmp.z;
        tmp = m * vec3(x[1].y, x[2].y, x[3].y);
        x[1].y = tmp.x; x[2].y = tmp.y; x[3].y = tmp.z;
        tmp = m * vec3(x[1].z, x[2].z, x[3].z);
        x[1].z = tmp.x; x[2].z = tmp.y; x[3].z = tmp.z;

        // band 2 is somewhat more complex.
        // this method is from http://filmicworlds.com/blog/simple-and-fast-spherical-harmonic-rotation/
        // but in summary:
        // x = x[indexes 4 through 8]
        // P(r) = SHCoeffs(r)[indexes 4 through 8]
        // M = m
        // R = the 5x5 rotation matrix we want to apply to band 2
        // N = some 3D unit vector

        // R * P(N) = P(M * N)
        // selecting 5 vectors makes it easy to solve:
        // R * [P(N0), ..., P(N4)] = [P(M * N0), ..., P(M * N4)]
        // A = [P(N0), ..., P(N4)]
        // R * A = [P(M * N0), ..., P(M * N4)]
        // R = [P(M * N0), ..., P(M * N4)] * inverse(A)
        // we don't actually need to calculate R directly, so:
        // R * x = [P(M * N0), ..., P(M * N4)] * inverse(A) * x
        const float k = inversesqrt(2.0);
        const vec3[5] N = vec3[5](
            vec3(1,0,0),
            vec3(0,0,1),
            vec3(k,k,0),
            vec3(k,0,k),
            vec3(0,k,k)
        );

        const float k0 = 0.91529124286551084;
        const float k1 = 2.0 * k0;
        const float k2 = 1.5853309190550713;
        const float[5][5] invA = float[5][5](
            float[5]( 0,-k0,  0, k0, k1),
            float[5](k0,  0, k2, k0, k0),
            float[5](k1,  0,  0,  0,  0),
            float[5]( 0,  0,  0,-k1,  0),
            float[5]( 0,-k1,  0,  0,  0)
        );

        float[5][5] m1;
        float[5][9] c = float[5][9](
            SHCoeffs(N[0]),
            SHCoeffs(N[1]),
            SHCoeffs(N[2]),
            SHCoeffs(N[3]),
            SHCoeffs(N[4]),
        );
        for (int i = 0; i < 5; ++i) {
            for (int j = 0; j < 5; ++j) {
                m1[i][j] = c[i][j + 4];
            }
        }


        float[5][5] R;
        MatrixMult(m1, invA, R);

        float[5] tmp, x2 = float[5](x[4].x, x[5].x, x[6].x, x[7].x, x[8].x);
        MatrixMult(R, x2, tmp);
        x[4].x = tmp[0]; x[5].x = tmp[1]; x[6].x = tmp[2]; x[7].x = tmp[3]; x[8].x = tmp[4];
        x2 = float[5](x[4].y, x[5].y, x[6].y, x[7].y, x[8].y);
        MatrixMult(R, x2, tmp);
        x[4].y = tmp[0]; x[5].y = tmp[1]; x[6].y = tmp[2]; x[7].y = tmp[3]; x[8].y = tmp[4];
        x2 = float[5](x[4].z, x[5].z, x[6].z, x[7].z, x[8].z);
        MatrixMult(R, x2, tmp);
        x[4].z = tmp[0]; x[5].z = tmp[1]; x[6].z = tmp[2]; x[7].z = tmp[3]; x[8].z = tmp[4];

        return x;
    }
    //*/

    // SH coefficients for a hemisphere in direction r.
    //
    // r is used as-is: unlike SHCoeffs, this function does not normalize it
    // (presumably callers pass a unit vector - TODO confirm).
    //
    // Returned layout matches SHCoeffs: [0] band 0, [1..3] band 1 (y, z, x),
    // [4..8] band 2.
    float[9] SHHemisphere(vec3 r) {
        float[9] sh;

        // Band 0: constant term.
        sh[0] = sqrt(pi);

        // Band 1: one shared scale applied to the y, z, x components.
        float band1Scale = sqrt(3.0 * pi / 4.0);
        sh[1] = band1Scale * r.y;
        sh[2] = band1Scale * r.z;
        sh[3] = band1Scale * r.x;

        // coeffs for l=2 are 180-degree rotationally symmetric, so always 0
        for (int i = 4; i < 9; ++i) {
            sh[i] = 0.0;
        }

        return sh;
    }

    // Taken from https://patapom.com/blog/SHPortal/#using-the-result-for-real-time-diffuse-irradiance-estimation
    //
    // Evaluates the irradiance encoded by 9 SH coefficients in a given direction.
    //
    // _Direction : direction to evaluate in. It is used as-is and is NOT
    //              normalized here (presumably expected to be unit length -
    //              TODO confirm with callers).
    // _SH        : 9 SH coefficients, one vec3 each, indexed as
    //              [0]=L00, [1]=L1-1, [2]=L10, [3]=L11,
    //              [4]=L2-2, [5]=L2-1, [6]=L20, [7]=L21, [8]=L22.
    // returns the weighted sum of the coefficients, clamped component-wise so
    // that no component is negative.
    vec3 EvaluateSHIrradiance( vec3 _Direction, vec3 _SH[9] ) {
        const float c1 = 0.42904276540489171563379376569857;    // Â2.Y22 = 1/16 * sqrt(15.PI)
        const float c2 = 0.51166335397324424423977581244463;    // 0.5 * Â1.Y10 = 1/2 * sqrt(PI/3)
        const float c3 = 0.24770795610037568833406429782001;    // Â2.Y20 = 1/16 * sqrt(5.PI)
        const float c4 = 0.88622692545275801364908374167057;    // Â0.Y00 = 1/2 * sqrt(PI)

        float x = _Direction.x;
        float y = _Direction.y;
        float z = _Direction.z;

        // Note: every literal is written as a float (1.0, not 1). GLSL ES and
        // GLSL 1.10 have no implicit int -> float conversion, so an integer
        // literal in this expression would fail to compile there.
        return  max( vec3(0.0),
                (c1*(x*x - y*y)) * _SH[8]                       // c1.L22.(x^2-y^2)
                + (c3*(3.0*z*z - 1.0)) * _SH[6]                 // c3.L20.(3.z^2 - 1)
                + c4 * _SH[0]                                   // c4.L00
                + 2.0*c1*(_SH[4]*x*y + _SH[7]*x*z + _SH[5]*y*z) // 2.c1.(L2-2.xy + L21.xz + L2-1.yz)
                + 2.0*c2*(_SH[3]*x + _SH[1]*y + _SH[2]*z) );    // 2.c2.(L11.x + L1-1.y + L10.z)
    }
#endif
