// godot/thirdparty/cvtt/ConvectionKernels_ParallelMath.h

/*
Convection Texture Tools
Copyright (c) 2018-2019 Eric Lasota

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject
to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

*/
#pragma once
#ifndef __CVTT_PARALLELMATH_H__
#define __CVTT_PARALLELMATH_H__

#include "ConvectionKernels.h"
#include "ConvectionKernels_Config.h"

#ifdef CVTT_USE_SSE2
#include <emmintrin.h>
#endif

#include <assert.h>
#include <float.h>
#include <math.h>
#include <stdint.h>
#include <string.h>

#include <algorithm>

// Marks a parameter as intentionally unused. The macro expands to nothing, so
// each `UNREFERENCED_PARAMETER(x);` in this file is an empty statement.
#define UNREFERENCED_PARAMETER(n)

// Parallel math implementation
//
// After preprocessor defs are handled, what this should do is expose the following types:
// SInt16 - Signed 16-bit integer
// UInt16 - Unsigned 16-bit integer
// UInt15 - Unsigned 15-bit integer
// SInt32 - Signed 32-bit integer
// UInt31 - Unsigned 31-bit integer
// AInt16 - 16-bit integer of unknown signedness (only used for storage)
// Int16CompFlag - Comparison flags from comparing 16-bit integers
// Int32CompFlag - Comparison flags from comparing 32-bit integers
// FloatCompFlag - Comparison flags from comparing 32-bit floats
//
// The reason for these distinctions is that, depending on the instruction set, signed or unsigned versions of certain ops
// (particularly max, min, compares, and right shift) may not be available.  In cases where ops are not available, it's
// necessary to do high bit manipulations to accomplish the operation with 16-bit numbers.  The 15-bit and 31-bit uint types
// can elide the bit flips if unsigned versions are not available.

namespace cvtt
{
#ifdef CVTT_USE_SSE2
    // SSE2 version
    // NOTE(review): in this view the SSE2 variant declares no members, while the
    // scalar variant in the #else branch provides the whole API. Confirm whether
    // the SSE2 implementation lives elsewhere or was elided.
    struct ParallelMath
    {};

#else
    // Scalar version
    struct ParallelMath
    {
        // Empty tag type. A pointer to it selects the RoundAndConvertToInt
        // overload that truncates toward zero.
        struct RoundTowardZeroForScope {};

        // Empty tag type. A pointer to it selects the RoundAndConvertToInt
        // overload that rounds to the nearest integer.
        struct RoundTowardNearestForScope {};

        // Empty tag type. A pointer to it selects the RoundAndConvertToInt
        // overload that rounds up (ceilf).
        struct RoundUpForScope {};

        // Empty tag type. A pointer to it selects the RoundAndConvertToInt
        // overload that rounds down (floorf).
        struct RoundDownForScope {};

        // Number of values carried by each ParallelMath variable. Every type in
        // the scalar build is a plain scalar, hence 1.
        static const int ParallelSize = 1;

        // Tags naming the flavours of 16-bit integer (signed, full-range unsigned,
        // truncated unsigned, abstract). Nothing in the scalar implementation
        // below refers to them.
        enum Int16Subtype
        {
            IntSubtype_Signed = 0,
            IntSubtype_UnsignedFull = 1,
            IntSubtype_UnsignedTruncated = 2,
            IntSubtype_Abstract = 3,
        };

        // 16-bit integer flavours. The scalar build stores all of them, whatever
        // their nominal width or signedness, in a plain int32_t.
        typedef int32_t SInt16;
        typedef int32_t UInt15;
        typedef int32_t UInt16;
        typedef int32_t AInt16;

        // 32-bit integer flavours, likewise stored as int32_t.
        typedef int32_t SInt32;
        typedef int32_t UInt31;
        typedef int32_t UInt32;
        typedef int32_t AInt32;

        // Single-value types accepted by the Put* functions.
        typedef int32_t ScalarUInt16;
        typedef int32_t ScalarSInt16;

        // Floating-point value type.
        typedef float Float;

        // Casts between the integer types of this struct without conversion.
        // All of them are int32_t in the scalar build, so Cast hands back the very
        // reference it was given; TTargetType is not used.
        template<class TTargetType>
        struct LosslessCast
        {
            static const int32_t& Cast(const int32_t &src) { return src; }
        };

        // Comparison results. With a single value per variable a flag is simply a
        // bool. Int32CompFlag is declared together with the other two so that all
        // three flag types listed in the comment at the top of this file exist in
        // the scalar build (Int32FlagToInt16 already takes such a flag).
        typedef bool Int16CompFlag;
        typedef bool Int32CompFlag;
        typedef bool FloatCompFlag;

        // Adds two integers without regard to their nominal signedness.
        static int32_t AbstractAdd(const int32_t &a, const int32_t &b)
        {
            const int32_t sum = a + b;
            return sum;
        }

        // Subtracts b from a without regard to their nominal signedness.
        static int32_t AbstractSubtract(const int32_t &a, const int32_t &b)
        {
            const int32_t difference = a - b;
            return difference;
        }

        // Returns a when flag is set, otherwise b (float version).
        static float Select(bool flag, float a, float b)
        {
            if (flag)
                return a;
            return b;
        }

        // Returns a when flag is set, otherwise b (integer version).
        static int32_t Select(bool flag, int32_t a, int32_t b)
        {
            if (flag)
                return a;
            return b;
        }

        // Returns a when flag is set, otherwise 0.
        static int32_t SelectOrZero(bool flag, int32_t a)
        {
            if (!flag)
                return 0;
            return a;
        }

        // Overwrites dest with src when flag is set; otherwise leaves dest alone.
        static void ConditionalSet(int32_t& dest, bool flag, int32_t src)
        {
            if (!flag)
                return;
            dest = src;
        }

        // Flag version of the above: overwrites dest with src when flag is set.
        static void ConditionalSet(bool& dest, bool flag, bool src)
        {
            if (!flag)
                return;
            dest = src;
        }

        // Returns -v when flag is set, otherwise v.
        static int32_t ConditionalNegate(bool flag, int32_t v)
        {
            if (flag)
                return -v;
            return v;
        }

        // Overwrites dest with src when flag is NOT set; otherwise leaves dest alone.
        static void NotConditionalSet(int32_t& dest, bool flag, int32_t src)
        {
            if (flag)
                return;
            dest = src;
        }

        // Overwrites dest with src when flag is set; otherwise leaves dest alone.
        static void ConditionalSet(float& dest, bool flag, float src)
        {
            if (!flag)
                return;
            dest = src;
        }

        // Overwrites dest with src when flag is NOT set; otherwise leaves dest alone.
        static void NotConditionalSet(float& dest, bool flag, float src)
        {
            if (flag)
                return;
            dest = src;
        }

        // Replaces a zero denominator (positive or negative zero) with 1 so that a
        // following division cannot divide by zero. Other values, NaN included,
        // are left untouched.
        static void MakeSafeDenominator(float& v)
        {
            const bool isZero = (v == 0.0f);
            if (!isZero)
                return;
            v = 1.0f;
        }

        // Shifts v right by `bits` using the built-in >> on int32_t.
        static int32_t SignedRightShift(int32_t v, int bits)
        {
            const int32_t shifted = v >> bits;
            return shifted;
        }

        // Keeps the low `precision` bits of v and sign-extends the result from bit
        // (precision - 1). `precision` must be in [1, 32]; anything else would
        // shift by a negative amount or by the full width of the type.
        //
        // The left shift is carried out on an unsigned value: shifting a signed
        // integer so that bits move into or past the sign bit is undefined
        // behaviour before C++20. The right shift is the same arithmetic >> on
        // int32_t that SignedRightShift performs.
        static int32_t TruncateToPrecisionSigned(int32_t v, int precision)
        {
            const int shift = 32 - precision;
            const uint32_t movedToTop = static_cast<uint32_t>(v) << shift;
            return static_cast<int32_t>(movedToTop) >> shift;
        }

        // Keeps the low `precision` bits of v and clears the rest.
        static int32_t TruncateToPrecisionUnsigned(int32_t v, int precision)
        {
            const int32_t lowBitsMask = (1 << precision) - 1;
            return v & lowBitsMask;
        }

        // Returns the smaller of two integers.
        static int32_t Min(int32_t a, int32_t b)
        {
            return (a < b) ? a : b;
        }

        // Returns the smaller of two floats. b is returned whenever `a < b` is
        // false, which includes the case where either operand is NaN.
        static float Min(float a, float b)
        {
            return (a < b) ? a : b;
        }

        // Returns the larger of two integers.
        static int32_t Max(int32_t a, int32_t b)
        {
            return (a > b) ? a : b;
        }

        // Returns the larger of two floats. b is returned whenever `a > b` is
        // false, which includes the case where either operand is NaN.
        static float Max(float a, float b)
        {
            return (a > b) ? a : b;
        }

        // Absolute value of a float.
        static float Abs(float a)
        {
            const float magnitude = fabsf(a);
            return magnitude;
        }

        // Absolute value of an integer.
        static int32_t Abs(int32_t a)
        {
            return (a < 0) ? -a : a;
        }

        // Limits v to the range [min, max]. The lower bound is tested first; a NaN
        // v fails both comparisons and is returned unchanged.
        static float Clamp(float v, float min, float max)
        {
            return (v < min) ? min : ((v > max) ? max : v);
        }

        // Returns 1 / v, computed with an ordinary float division.
        static float Reciprocal(float v)
        {
            const float one = 1.0f;
            return one / v;
        }

        static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, int32_t& chOut)
        {
            chOut = inputBlocks[0].m_pixels[pxOffset][channel];
        }

        static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, int32_t& chOut)
        {
            chOut = inputBlocks[0].m_pixels[pxOffset][channel];
        }

        // Builds a Float from a scalar float. Float is a plain float in the scalar
        // build, so the value is returned unchanged.
        static float MakeFloat(float v)
        {
            return v;
        }

        // Returns a Float holding 0.
        static float MakeFloatZero()
        {
            return 0.0f;
        }

        // Builds a UInt16 from a 16-bit unsigned value by widening it to int32_t.
        static int32_t MakeUInt16(uint16_t v)
        {
            return static_cast<int32_t>(v);
        }

        // Builds an SInt16 from a 16-bit signed value by sign-extending it to int32_t.
        static int32_t MakeSInt16(int16_t v)
        {
            return static_cast<int32_t>(v);
        }

        // Builds an AInt16 from a 16-bit signed value by sign-extending it to int32_t.
        static int32_t MakeAInt16(int16_t v)
        {
            return static_cast<int32_t>(v);
        }

        // Builds a UInt15 from a 16-bit unsigned value by widening it to int32_t.
        // The value is not checked against the 15-bit range.
        static int32_t MakeUInt15(uint16_t v)
        {
            return static_cast<int32_t>(v);
        }

        // Builds an SInt32 from a scalar; SInt32 is int32_t here, so v is returned
        // unchanged.
        static int32_t MakeSInt32(int32_t v)
        {
            return v;
        }

        // Builds a UInt31 from a scalar; v is returned unchanged and is not checked
        // against the 31-bit range.
        static int32_t MakeUInt31(int32_t v)
        {
            return v;
        }

        // Extract: reads the value at position `offset` of a ParallelMath variable.
        // A scalar-build variable holds a single value, so `offset` is ignored and
        // v itself is returned. Integer version.
        static int32_t Extract(int32_t v, int offset)
        {
            UNREFERENCED_PARAMETER(offset);
            return v;
        }

        // Flag version of Extract; `offset` is ignored.
        static bool Extract(bool v, int offset)
        {
            UNREFERENCED_PARAMETER(offset);
            return v;
        }

        // Float version of Extract; `offset` is ignored.
        static float Extract(float v, int offset)
        {
            UNREFERENCED_PARAMETER(offset);
            return v;
        }

        // Writes v into position `offset` of dest. With a single value per
        // variable, `offset` is ignored and dest is simply overwritten.
        static void PutUInt16(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
        {
            UNREFERENCED_PARAMETER(offset);
            dest = v;
        }

        // Writes v into position `offset` of dest. `offset` is ignored and dest is
        // overwritten; v is not checked against the 15-bit range.
        static void PutUInt15(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
        {
            UNREFERENCED_PARAMETER(offset);
            dest = v;
        }

        // Writes v into position `offset` of dest. `offset` is ignored and dest is
        // simply overwritten.
        static void PutSInt16(int32_t &dest, int offset, ParallelMath::ScalarSInt16 v)
        {
            UNREFERENCED_PARAMETER(offset);
            dest = v;
        }

        // Reads the float at position `offset` of v. `offset` is ignored and v
        // itself is returned, exactly like the float overload of Extract.
        static float ExtractFloat(float v, int offset)
        {
            UNREFERENCED_PARAMETER(offset);
            return v;
        }

        // Writes v into position `offset` of dest. `offset` is ignored and dest is
        // simply overwritten.
        static void PutFloat(float &dest, int offset, float v)
        {
            UNREFERENCED_PARAMETER(offset);
            dest = v;
        }

        // Writes flag v into position `offset` of dest. `offset` is ignored and
        // dest is simply overwritten.
        static void PutBoolInt16(bool &dest, int offset, bool v)
        {
            UNREFERENCED_PARAMETER(offset);
            dest = v;
        }

        // True when a is strictly smaller than b (integers).
        static bool Less(int32_t a, int32_t b)
        {
            return b > a;
        }

        // True when a is strictly smaller than b (floats). False if either
        // operand is NaN.
        static bool Less(float a, float b)
        {
            return b > a;
        }

        // True when a is smaller than or equal to b (integers). Equal operands
        // compare true, which is what distinguishes this from Less.
        static bool LessOrEqual(int32_t a, int32_t b)
        {
            return a <= b;
        }

        // True when a is smaller than or equal to b (floats). Equal operands
        // compare true; the result is false if either operand is NaN.
        static bool LessOrEqual(float a, float b)
        {
            return a <= b;
        }

        // True when the two integers are equal.
        static bool Equal(int32_t a, int32_t b)
        {
            return !(a != b);
        }

        // True when the two floats compare equal. False if either is NaN.
        static bool Equal(float a, float b)
        {
            return !(a != b);
        }

        // Converts an integer to float.
        static float ToFloat(int32_t v)
        {
            const float converted = static_cast<float>(v);
            return converted;
        }

        // Converts to UInt31. Source and target are both int32_t in the scalar
        // build, so v is returned unchanged.
        static int32_t ToUInt31(int32_t v)
        {
            return v;
        }

        // Converts to a 32-bit integer. Source and target are both int32_t in the
        // scalar build, so v is returned unchanged.
        static int32_t ToInt32(int32_t v)
        {
            return v;
        }

        // Converts a float comparison flag to a 16-bit-integer comparison flag.
        // Both are bool in the scalar build, so v is returned unchanged.
        static bool FloatFlagToInt16(bool v)
        {
            return v;
        }

        // Converts a 32-bit-integer comparison flag to a 16-bit-integer comparison
        // flag. Flags are bool in the scalar build, so v is returned unchanged.
        static bool Int32FlagToInt16(bool v)
        {
            return v;
        }

        // Converts a 16-bit-integer comparison flag to a float comparison flag.
        // Both are bool in the scalar build, so v is returned unchanged.
        static bool Int16FlagToFloat(bool v)
        {
            return v;
        }

        // Builds a 16-bit-integer comparison flag from a bool; returned unchanged.
        static bool MakeBoolInt16(bool b)
        {
            return b;
        }

        // Builds a float comparison flag from a bool; returned unchanged.
        static bool MakeBoolFloat(bool b)
        {
            return b;
        }

        // True when a is set and b is not.
        static bool AndNot(bool a, bool b)
        {
            return !(!a || b);
        }

        // Logical negation of a flag.
        static bool Not(bool b)
        {
            return b ? false : true;
        }

        // RoundAndConvertToInt: converts a float to an integer. The rounding mode
        // is chosen by the type of the tag pointer; the pointer itself is never
        // dereferenced. This overload truncates toward zero.
        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardZeroForScope *rtz)
        {
            UNREFERENCED_PARAMETER(rtz);
            const int truncated = static_cast<int>(v);
            return truncated;
        }

        // Rounds up to the next integer (ceilf) before converting.
        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundUpForScope *ru)
        {
            UNREFERENCED_PARAMETER(ru);
            const float ceiling = ceilf(v);
            return static_cast<int>(ceiling);
        }

        // Rounds down to the previous integer (floorf) before converting.
        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundDownForScope *rd)
        {
            UNREFERENCED_PARAMETER(rd);
            const float floored = floorf(v);
            return static_cast<int>(floored);
        }

        // Rounds to the nearest integer as floorf(v + 0.5f), so exact halves round
        // upward.
        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardNearestForScope *rtn)
        {
            UNREFERENCED_PARAMETER(rtn);
            const float biased = v + 0.5f;
            return static_cast<int>(floorf(biased));
        }

        // Rounds v with the mode selected by roundingMode and returns it as a
        // UInt16. Forwards to RoundAndConvertToInt; the result is not range-checked.
        template<class TRoundMode>
        static int32_t RoundAndConvertToU16(float v, const TRoundMode *roundingMode)
        {
            const int32_t rounded = RoundAndConvertToInt(v, roundingMode);
            return rounded;
        }

        // Rounds v with the mode selected by roundingMode and returns it as a
        // UInt15. Forwards to RoundAndConvertToInt; the result is not range-checked.
        template<class TRoundMode>
        static int32_t RoundAndConvertToU15(float v, const TRoundMode *roundingMode)
        {
            const int32_t rounded = RoundAndConvertToInt(v, roundingMode);
            return rounded;
        }

        // Rounds v with the mode selected by roundingMode and returns it as an
        // SInt16. Forwards to RoundAndConvertToInt; the result is not range-checked.
        template<class TRoundMode>
        static int32_t RoundAndConvertToS16(float v, const TRoundMode *roundingMode)
        {
            const int32_t rounded = RoundAndConvertToInt(v, roundingMode);
            return rounded;
        }

        // Square root of f, computed with sqrtf.
        static float Sqrt(float f)
        {
            const float root = sqrtf(f);
            return root;
        }

        // Squared difference (a - b)^2 of two integers.
        static int32_t SqDiffUInt8(int32_t a, int32_t b)
        {
            const int32_t diff = a - b;
            const int32_t squared = diff * diff;
            return squared;
        }

        // Squared difference (a - b)^2 of two integers.
        static int32_t SqDiffInt16(int32_t a, int32_t b)
        {
            const int32_t diff = a - b;
            const int32_t squared = diff * diff;
            return squared;
        }

        // Squared difference (a - b)^2 of two integers.
        static int32_t SqDiffSInt16(int32_t a, int32_t b)
        {
            const int32_t diff = a - b;
            const int32_t squared = diff * diff;
            return squared;
        }

        // Converts a half-precision float held in two's complement form to a
        // float. The magnitude of v carries the half's exponent bits (0x7c00) and
        // mantissa bits (0x03ff); a negative v denotes a negative number.
        //
        // The sign comes from v itself rather than from its absolute value, so
        // negative inputs yield negative floats. A magnitude with bit 15 set is
        // likewise treated as negative. All bit manipulation is done on unsigned
        // values so that no shift moves bits into the sign bit of a signed integer.
        //
        // Inputs with a zero exponent field (denormals and zero) are first built
        // as if they had an implicit leading one; the same number with a zero
        // mantissa is then subtracted, which removes that implicit one again.
        static float TwosCLHalfToFloat(int32_t v)
        {
            const bool isNegative = (v < 0);
            const uint32_t magnitude = isNegative ? (0u - static_cast<uint32_t>(v)) : static_cast<uint32_t>(v);

            const uint32_t signBits = (isNegative ? 0x8000u : 0u) | (magnitude & 0x8000u);
            const uint32_t mantissa = (magnitude & 0x03ffu);
            const uint32_t halfExponent = (magnitude & 0x7c00u);

            const bool isDenormal = (halfExponent == 0);

            // Move the exponent field into the float's exponent position (still
            // expressed in the upper 16 bits) and add the bias offset.
            const uint32_t exponent = (halfExponent >> 3) + 14336u;

            const uint32_t denormalCorrection = (isDenormal ? (signBits | 14336u) : 0u) << 16;

            const uint32_t fBits = ((exponent | signBits) << 16) | (mantissa << 13);

            // memcpy is the well-defined way to reinterpret the bit patterns.
            float f, correction;
            memcpy(&f, &fBits, 4);
            memcpy(&correction, &denormalCorrection, 4);

            return f - correction;
        }

        // Squared difference between a, decoded with TwosCLHalfToFloat, and the
        // float b.
        static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
        {
            const Float decoded = TwosCLHalfToFloat(a);

            const Float delta = decoded - b;
            return delta * delta;
        }

        // Squared difference between a and b after both have been decoded with
        // TwosCLHalfToFloat.
        static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
        {
            const Float lhs = TwosCLHalfToFloat(a);
            const Float rhs = TwosCLHalfToFloat(b);

            const Float delta = lhs - rhs;
            return delta * delta;
        }

        // Squared difference between a, decoded with TwosCLHalfToFloat and scaled
        // by aWeight, and the float b. b is used as given, without scaling.
        static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
        {
            const Float weighted = TwosCLHalfToFloat(a) * aWeight;

            const Float delta = weighted - b;
            return delta * delta;
        }

        // Shifts v right by `bits`. Forwards to SignedRightShift, so the scalar
        // build uses the same >> on int32_t whatever the nominal signedness.
        static int32_t RightShift(int32_t v, int bits)
        {
            const int32_t shifted = SignedRightShift(v, bits);
            return shifted;
        }

        // Converts to SInt16. v is returned unchanged; no truncation or range
        // check is applied.
        static int32_t ToSInt16(int32_t v)
        {
            return v;
        }

        // Converts to UInt16. v is returned unchanged; no truncation or range
        // check is applied.
        static int32_t ToUInt16(int32_t v)
        {
            return v;
        }

        // Converts to UInt15. v is returned unchanged; no truncation or range
        // check is applied.
        static int32_t ToUInt15(int32_t v)
        {
            return v;
        }

        // Product of a and b, computed as an ordinary int32_t multiplication.
        static int32_t XMultiply(int32_t a, int32_t b)
        {
            const int32_t product = a * b;
            return product;
        }

        // Product of a and b, computed as an ordinary int32_t multiplication; in
        // the scalar build this is the same operation as XMultiply.
        static int32_t CompactMultiply(int32_t a, int32_t b)
        {
            const int32_t product = a * b;
            return product;
        }

        // True when any value of the flag is set. A scalar-build flag is a single
        // bool, so this is the flag itself.
        static bool AnySet(bool v)
        {
            return v;
        }

        // True when every value of the flag is set. A scalar-build flag is a
        // single bool, so this is the flag itself.
        static bool AllSet(bool v)
        {
            return v;
        }
    };

#endif
}

#endif