chromium/third_party/skia/modules/skcms/src/Transform_inl.h

/*
 * Copyright 2018 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

// Intentionally NO #pragma once... included multiple times.

// This file is included from skcms.cc in a namespace with some pre-defines:
//    - N:    SIMD width of all vectors; 1, 4, 8 or 16 (preprocessor define)
//    - V<T>: a template to create a vector of N T's.

using F   = V<float>;
using I32 = V<int32_t>;
using U64 = V<uint64_t>;
using U32 = V<uint32_t>;
using U16 = V<uint16_t>;
using U8  = V<uint8_t>;
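
// A minimal sketch (illustration only, not compiled) of pre-defines that would
// satisfy this contract using Clang's vector extension; the includer's real
// setup may differ.  With N == 1, V<T> would simply alias T.
#if 0
    #define N 8                                            // SIMD width for this instantiation
    template <typename T>
    using V = T __attribute__((ext_vector_type(N)));       // an N-wide vector of T
#endif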

#if defined(__GNUC__) && !defined(__clang__)
    // GCC is kind of weird, not allowing vector = scalar directly.
    static constexpr F F0 = F() + 0.0f,
                       F1 = F() + 1.0f,
                       FInfBits = F() + 0x7f800000; // equals 2139095040, the bit pattern of +Inf
#else
    static constexpr F F0 = 0.0f,
                       F1 = 1.0f,
                       FInfBits = 0x7f800000; // equals 2139095040, the bit pattern of +Inf
#endif

// Instead of checking __AVX__ below, we'll check USING_AVX.
// This lets skcms.cc set USING_AVX to force us in even if the compiler's not set that way.
// Same deal for __F16C__ and __AVX2__ ~~~> USING_AVX_F16C, USING_AVX2.

#if !defined(USING_AVX)      && N == 8 && defined(__AVX__)
    #define USING_AVX
#endif
#if !defined(USING_AVX_F16C) && defined(USING_AVX) && defined(__F16C__)
    #define USING_AVX_F16C
#endif
#if !defined(USING_AVX2)     && defined(USING_AVX) && defined(__AVX2__)
    #define USING_AVX2
#endif
#if !defined(USING_AVX512F)  && N == 16 && defined(__AVX512F__) && defined(__AVX512DQ__)
    #define USING_AVX512F
#endif
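
// Illustration only (not compiled): how an includer might force the AVX paths
// on, per the USING_AVX note above, when it enables AVX with per-function
// target attributes rather than a global -mavx flag.  The exact pragmas here
// are an assumption, not necessarily what skcms.cc does.
#if 0
    #if defined(__clang__)
        #pragma clang attribute push(__attribute__((target("avx,f16c"))), apply_to=function)
    #endif
    #define USING_AVX
    #define USING_AVX_F16C
    // ... define N and V<T>, include this file, then pop the attribute ...
    #if defined(__clang__)
        #pragma clang attribute pop
    #endif
#endif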

// Similar to the AVX+ features, we define USING_NEON and USING_NEON_F16C.
// This is more for organizational clarity... skcms.cc doesn't force these.
#if N > 1 && defined(__ARM_NEON)
    #define USING_NEON

    // We have to use two different mechanisms to enable the f16 conversion intrinsics:
    #if defined(__clang__)
        // Clang's arm_neon.h guards them with the FP hardware bit:
        #if __ARM_FP & 2
            #define USING_NEON_F16C
        #endif
    #elif defined(__GNUC__)
        // GCC's arm_neon.h guards them with the FP16 format macros (IEEE and ALTERNATIVE).
        // We don't actually want the alternative format - we're reading/writing IEEE f16 values.
        #if defined(__ARM_FP16_FORMAT_IEEE)
            #define USING_NEON_F16C
        #endif
    #endif
#endif

// These -Wvector-conversion warnings seem to trigger in very bogus situations,
// like vst3q_f32() expecting a 16x char rather than a 4x float vector.  :/
#if defined(USING_NEON) && defined(__clang__)
    #pragma clang diagnostic ignored "-Wvector-conversion"
#endif

// GCC & Clang (but not clang-cl) warn that returning a U64 on x86 is larger than a register.
// You'd see warnings like "using AVX even though AVX is not enabled".
// We stifle these warnings; our helpers that return U64 are always inlined.
#if defined(__SSE__) && defined(__GNUC__)
    #if !defined(__has_warning)
        #pragma GCC diagnostic ignored "-Wpsabi"
    #elif __has_warning("-Wpsabi")
        #pragma GCC diagnostic ignored "-Wpsabi"
    #endif
#endif

// We tag most helper functions as SI, to enforce good code generation
// but also work around what we think is a bug in GCC: when targeting 32-bit
// x86, GCC tends to pass U16 (4x uint16_t vector) function arguments in the
// MMX mm0 register, which seems to mess with unrelated code that later uses
// x87 FP instructions (MMX's mm0 is an alias for x87's st0 register).
#if defined(__clang__) || defined(__GNUC__)
    #define SI static inline __attribute__((always_inline))
#else
    #define SI static inline
#endif

template <typename T, typename P>
SI T load(const P* ptr) {}
template <typename T, typename P>
SI void store(P* ptr, const T& val) {}

// (T)v is a cast when N == 1 and a bit-pun when N>1,
// so we use cast<T>(v) to actually cast or bit_pun<T>(v) to bit-pun.
template <typename D, typename S>
SI D cast(const S& v) {}

template <typename D, typename S>
SI D bit_pun(const S& v) {}
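
// A scalar sketch of that distinction (assumed implementations): cast<>()
// converts values, bit_pun<>() reinterprets the bits of same-sized types.
#if 0
    #include <cstring>
    template <typename D, typename S>
    D cast_sketch(const S& v) {
        return (D)v;                        // value conversion, e.g. 1.0f -> 1
    }
    template <typename D, typename S>
    D bit_pun_sketch(const S& v) {
        static_assert(sizeof(D) == sizeof(S), "bit_pun requires same-sized types");
        D d;
        std::memcpy(&d, &v, sizeof(D));     // bit reinterpretation, e.g. 1.0f -> 0x3f800000
        return d;
    }
#endif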

// When we convert from float to fixed point, it's very common to want to round,
// and for some reason compilers generate better code when converting to int32_t.
// To serve both those ends, we use this function to_fixed() instead of direct cast().
SI U32 to_fixed(F f) { return (U32)cast<I32>(f + 0.5f); }

// Sometimes we do something crazy on one branch of a conditional,
// like divide by zero or convert a huge float to an integer,
// but then harmlessly select the other side.  That trips up N==1
// sanitizer builds, so we make if_then_else() a macro to avoid
// evaluating the unused side.

#if N == 1
    #define if_then_else(cond, t, e) ((cond) ? (t) : (e))
#else
    template <typename C, typename T>
    SI T if_then_else(C cond, T t, T e) {}
#endif
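
// For the vector branch, one common approach (an assumption, not necessarily
// the exact body above) is to blend with the comparison mask, since vector
// comparisons yield all-zero or all-one lanes:
#if 0
    template <typename C, typename T>
    SI T if_then_else_sketch(C cond, T t, T e) {
        return bit_pun<T>(( cond & bit_pun<C>(t)) |
                          (~cond & bit_pun<C>(e)));
    }
#endif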


SI F F_from_Half(U16 half) {}

#if defined(__clang__)
    // The -((127-15)<<10) underflows that side of the math when
    // we pass a denorm half float.  It's harmless... we'll take the 0 side anyway.
    __attribute__((no_sanitize("unsigned-integer-overflow")))
#endif
SI U16 Half_from_F(F f) {}
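
// A scalar sketch of the float -> half math that note refers to: shift the
// sign/exponent/mantissa into place and rebias by subtracting ((127-15)<<10),
// flushing would-be half denorms to zero.  Overflow to infinity and NaNs are
// ignored here; this is illustrative, not necessarily the exact vector body.
#if 0
    #include <cstdint>
    #include <cstring>
    static uint16_t half_from_float_sketch(float f) {
        uint32_t sem;
        std::memcpy(&sem, &f, sizeof(sem));
        uint32_t s  = sem & 0x80000000u,    // sign bit
                 em = sem ^ s;              // exponent and mantissa
        // 0x38800000 is 2^-14, the smallest normal half; flush anything smaller to zero.
        return em < 0x38800000u
             ? (uint16_t)(s >> 16)
             : (uint16_t)((s >> 16) + (em >> 13) - ((127 - 15) << 10));
    }
#endif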

// Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
#if defined(USING_NEON)
    SI U16 swap_endian_16(U16 v) {
        return (U16)vrev16_u8((uint8x8_t) v);
    }
#endif

SI U64 swap_endian_16x4(const U64& rgba) {}
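
// A portable sketch of the same swap over four 16-bit lanes packed into 64
// bits (an assumed generic fallback; the actual body may differ):
#if 0
    #include <cstdint>
    static uint64_t swap_endian_16x4_sketch(uint64_t rgba) {
        return (rgba & 0x00ff00ff00ff00ffull) << 8     // low bytes move up
             | (rgba & 0xff00ff00ff00ff00ull) >> 8;    // high bytes move down
    }
#endif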

#if defined(USING_NEON)
    SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
    SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
#elif defined(__loongarch_sx)
    SI F min_(F x, F y) { return (F)__lsx_vfmin_s(x, y); }
    SI F max_(F x, F y) { return (F)__lsx_vfmax_s(x, y); }
#else
    SI F min_(F x, F y) {}
    SI F max_(F x, F y) {}
#endif

SI F floor_(F x) {}

SI F approx_log2(F x) {}

SI F approx_log(F x) {}

SI F approx_exp2(F x) {}

SI F approx_pow(F x, float y) {}

SI F approx_exp(F x) {}

SI F strip_sign(F x, U32* sign) {}

SI F apply_sign(F x, U32 sign) {}

// Return tf(x).
SI F apply_tf(const skcms_TransferFunction* tf, F x) {}
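
// A scalar sketch of the parametric curve from skcms.h (sign handling via
// strip_sign()/apply_sign() omitted; illustrative, not the exact vector body):
//     tf(x) = c*x + f          for 0 <= x < d
//     tf(x) = (a*x + b)^g + e  for      x >= d
#if 0
    #include <cmath>
    static float apply_tf_sketch(const skcms_TransferFunction* tf, float x) {
        return x < tf->d ? tf->c * x + tf->f
                         : std::pow(tf->a * x + tf->b, tf->g) + tf->e;
    }
#endif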

// Return the gamma function (|x|^G with the original sign re-applied to x).
SI F apply_gamma(const skcms_TransferFunction* tf, F x) {}

SI F apply_pq(const skcms_TransferFunction* tf, F x) {}

SI F apply_hlg(const skcms_TransferFunction* tf, F x) {}

SI F apply_hlginv(const skcms_TransferFunction* tf, F x) {}


// Strided loads and stores of N values, starting from p.
template <typename T, typename P>
SI T load_3(const P* p) {}

template <typename T, typename P>
SI T load_4(const P* p) {}

template <typename T, typename P>
SI void store_3(P* p, const T& v) {}

template <typename T, typename P>
SI void store_4(P* p, const T& v) {}
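
// A generic sketch of the access pattern (an assumption; the real bodies are
// specialized per N): lane i of load_3() comes from p[3*i], so e.g. the red
// channel of packed r,g,b data gathers into a single vector.
#if 0
    template <typename T, typename P>
    SI T load_3_sketch(const P* p) {
        T v = {};                       // zero-init, then fill each lane
        for (int i = 0; i < N; i++) {   // assumes N > 1 and a subscriptable vector type
            v[i] = p[3 * i];
        }
        return v;
    }
#endif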


SI U8 gather_8(const uint8_t* p, I32 ix) {}

SI U16 gather_16(const uint8_t* p, I32 ix) {}

SI U32 gather_32(const uint8_t* p, I32 ix) {}

SI U32 gather_24(const uint8_t* p, I32 ix) {}

#if !defined(__arm__)
    SI void gather_48(const uint8_t* p, I32 ix, U64* v) {}
#endif

SI F F_from_U8(U8 v) {}

SI F F_from_U16_BE(U16 v) {}

SI U16 U16_from_F(F v) {}

SI F minus_1_ulp(F v) {}

SI F table(const skcms_Curve* curve, F v) {}

SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b) {}

SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b, F* a) {}

SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b) {}

SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b, F* a) {}

static void clut(uint32_t input_channels, uint32_t output_channels,
                 const uint8_t grid_points[4], const uint8_t* grid_8, const uint8_t* grid_16,
                 F* r, F* g, F* b, F* a) {}

static void clut(const skcms_A2B* a2b, F* r, F* g, F* b, F a) {}
static void clut(const skcms_B2A* b2a, F* r, F* g, F* b, F* a) {}

struct NoCtx {};

struct Ctx {};

#define STAGE_PARAMS(MAYBE_REF)

#if SKCMS_HAS_MUSTTAIL

    // Stages take a stage list, and each stage is responsible for tail-calling the next one.
    //
    // Unfortunately, we can't declare a StageFn as a function pointer which takes a pointer to
    // another StageFn; declaring this leads to a circular dependency. To avoid this, StageFn is
    // wrapped in a single-element `struct StageList` which we are able to forward-declare.
    struct StageList;
    StageFn;
    struct StageList {};
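
    // A self-contained sketch of that wrapper trick (names here are
    // illustrative only): the function-pointer type can't mention itself, but
    // it can mention a forward-declared struct that carries a pointer to it.
    #if 0
        struct ListSketch;                                      // forward-declare the wrapper...
        using FnSketch = void (*)(ListSketch stages, int x);    // ...so the pointer type can use it,
        struct ListSketch { const FnSketch* fn; };              // ...then define the wrapper itself.

        static void last_stage (ListSketch, int) {}
        static void first_stage(ListSketch stages, int x) {
            (*stages.fn)(ListSketch{stages.fn + 1}, x);         // tail-call the next stage in the list
        }
    #endif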

    #define DECLARE_STAGE(name, arg, CALL_NEXT)

    #define STAGE(name, arg)

    #define FINAL_STAGE(name, arg)

#else

    #define DECLARE_STAGE

    #define STAGE
    #define FINAL_STAGE

#endif

STAGE(load_a8, NoCtx) {}

STAGE(load_g8, NoCtx) {}

STAGE(load_ga88, NoCtx) {}

STAGE(load_4444, NoCtx) {}

STAGE(load_565, NoCtx) {}

STAGE(load_888, NoCtx) {}

STAGE(load_8888, NoCtx) {}

STAGE(load_1010102, NoCtx) {}

STAGE(load_101010x_XR, NoCtx) {}

STAGE(load_10101010_XR, NoCtx) {}

STAGE(load_161616LE, NoCtx) {}

STAGE(load_16161616LE, NoCtx) {}

STAGE(load_161616BE, NoCtx) {}

STAGE(load_16161616BE, NoCtx) {}

STAGE(load_hhh, NoCtx) {}

STAGE(load_hhhh, NoCtx) {}

STAGE(load_fff, NoCtx) {}

STAGE(load_ffff, NoCtx) {}

STAGE(swap_rb, NoCtx) {}

STAGE(clamp, NoCtx) {}

STAGE(invert, NoCtx) {}

STAGE(force_opaque, NoCtx) {}

STAGE(premul, NoCtx) {}

STAGE(unpremul, NoCtx) {}

STAGE(matrix_3x3, const skcms_Matrix3x3* matrix) {}

STAGE(matrix_3x4, const skcms_Matrix3x4* matrix) {}

STAGE(lab_to_xyz, NoCtx) {}

// As above, in reverse.
STAGE(xyz_to_lab, NoCtx) {}

STAGE(gamma_r, const skcms_TransferFunction* tf) {}
STAGE(gamma_g, const skcms_TransferFunction* tf) {}
STAGE(gamma_b, const skcms_TransferFunction* tf) {}
STAGE(gamma_a, const skcms_TransferFunction* tf) {}

STAGE(gamma_rgb, const skcms_TransferFunction* tf) {}

STAGE(tf_r, const skcms_TransferFunction* tf) {}
STAGE(tf_g, const skcms_TransferFunction* tf) {}
STAGE(tf_b, const skcms_TransferFunction* tf) {}
STAGE(tf_a, const skcms_TransferFunction* tf) {}

STAGE(tf_rgb, const skcms_TransferFunction* tf) {}

STAGE(pq_r, const skcms_TransferFunction* tf) {}
STAGE(pq_g, const skcms_TransferFunction* tf) {}
STAGE(pq_b, const skcms_TransferFunction* tf) {}
STAGE(pq_a, const skcms_TransferFunction* tf) {}

STAGE(pq_rgb, const skcms_TransferFunction* tf) {}

STAGE(hlg_r, const skcms_TransferFunction* tf) {}
STAGE(hlg_g, const skcms_TransferFunction* tf) {}
STAGE(hlg_b, const skcms_TransferFunction* tf) {}
STAGE(hlg_a, const skcms_TransferFunction* tf) {}

STAGE(hlg_rgb, const skcms_TransferFunction* tf) {}

STAGE(hlginv_r, const skcms_TransferFunction* tf) {}
STAGE(hlginv_g, const skcms_TransferFunction* tf) {}
STAGE(hlginv_b, const skcms_TransferFunction* tf) {}
STAGE(hlginv_a, const skcms_TransferFunction* tf) {}

STAGE(hlginv_rgb, const skcms_TransferFunction* tf) {}

STAGE(table_r, const skcms_Curve* curve) {}
STAGE(table_g, const skcms_Curve* curve) {}
STAGE(table_b, const skcms_Curve* curve) {}
STAGE(table_a, const skcms_Curve* curve) {}

STAGE(clut_A2B, const skcms_A2B* a2b) {}

STAGE(clut_B2A, const skcms_B2A* b2a) {}

// From here on down, the store_ ops are all "final stages," terminating processing of this group.

FINAL_STAGE(store_a8, NoCtx) {}

FINAL_STAGE(store_g8, NoCtx) {}

FINAL_STAGE(store_ga88, NoCtx) {}

FINAL_STAGE(store_4444, NoCtx) {}

FINAL_STAGE(store_565, NoCtx) {}

FINAL_STAGE(store_888, NoCtx) {}

FINAL_STAGE(store_8888, NoCtx) {}

FINAL_STAGE(store_101010x_XR, NoCtx) {}

FINAL_STAGE(store_1010102, NoCtx) {}

FINAL_STAGE(store_161616LE, NoCtx) {}

FINAL_STAGE(store_16161616LE, NoCtx) {}

FINAL_STAGE(store_161616BE, NoCtx) {}

FINAL_STAGE(store_16161616BE, NoCtx) {}

FINAL_STAGE(store_hhh, NoCtx) {}

FINAL_STAGE(store_hhhh, NoCtx) {}

FINAL_STAGE(store_fff, NoCtx) {}

FINAL_STAGE(store_ffff, NoCtx) {}

#if SKCMS_HAS_MUSTTAIL

    SI void exec_stages(StageFn* stages, const void** contexts, const char* src, char* dst, int i) {}

#else

    static void exec_stages(const Op* ops, const void** contexts,
                            const char* src, char* dst, int i) {
        F r = F0, g = F0, b = F0, a = F1;
        while (true) {
            switch (*ops++) {
#define M
                SKCMS_WORK_OPS(M)
#undef M
#define M
                SKCMS_STORE_OPS(M)
#undef M
            }
        }
    }

#endif
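
// A self-contained sketch of the X-macro dispatch used in the non-musttail
// path above (op and stage names here are hypothetical; the real M expansion
// may differ): each work op becomes a `case` that runs its stage and breaks
// back to the loop, while each store op runs its stage and returns.
#if 0
    #define EXAMPLE_WORK_OPS(M)  M(example_swap_rb)
    #define EXAMPLE_STORE_OPS(M) M(example_store_8888)

    enum class ExampleOp { example_swap_rb, example_store_8888 };

    static void Exec_example_swap_rb   (const void*, const char*, char*, int) {}
    static void Exec_example_store_8888(const void*, const char*, char*, int) {}

    static void exec_example(const ExampleOp* ops, const void** contexts,
                             const char* src, char* dst, int i) {
        while (true) {
            switch (*ops++) {
    #define M(name) case ExampleOp::name: Exec_##name(*contexts++, src, dst, i); break;
                EXAMPLE_WORK_OPS(M)
    #undef M
    #define M(name) case ExampleOp::name: Exec_##name(*contexts++, src, dst, i); return;
                EXAMPLE_STORE_OPS(M)
    #undef M
            }
        }
    }
#endif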

// NOLINTNEXTLINE(misc-definitions-in-headers)
void run_program(const Op* program, const void** contexts, SKCMS_MAYBE_UNUSED ptrdiff_t programSize,
                 const char* src, char* dst, int n,
                 const size_t src_bpp, const size_t dst_bpp) {}