// chromium/third_party/tflite/src/tensorflow/lite/kernels/internal/common.h

/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_COMMON_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_COMMON_H_

#include <algorithm>
#include <cstddef>
#include <cstdint>

#include "tensorflow/lite/kernels/internal/runtime_shape.h"
#ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
#ifdef GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
#define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
#endif
#endif

#include <cmath>
#include <functional>

#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/core/macros.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {

constexpr int kReverseShift = -1;

// Reduces and compresses dimensions so that broadcast handling becomes more
// efficient. Returns true if the output shape is broadcastable, i.e. it
// contains no degenerate dimension (a dimension of extent 0); returns false
// otherwise.
template <int MAX_DIM = 6>
bool ReduceDimensionsForBroadcast(const RuntimeShape& input1_shape,
                                  const RuntimeShape& input2_shape,
                                  size_t* compressed_input1_stride,
                                  size_t* compressed_input2_stride,
                                  size_t* compressed_output_shape) {}
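
// Illustrative usage (a hedged sketch; the variable names are hypothetical):
// for input1 of shape [2, 3, 4] and input2 of shape [2, 3, 1], the leading
// [2, 3] extents match and can be collapsed into one dimension of extent 6
// before iterating, leaving a single broadcast dimension of extent 4 vs 1.
//
//   size_t stride1[6], stride2[6], out_shape[6];
//   const bool broadcastable = ReduceDimensionsForBroadcast<6>(
//       RuntimeShape({2, 3, 4}), RuntimeShape({2, 3, 1}),
//       stride1, stride2, out_shape);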

inline void GetActivationMinMax(FusedActivationFunctionType ac,
                                float* output_activation_min,
                                float* output_activation_max) {}

template <typename T>
inline T ActivationFunctionWithMinMax(T x, T output_activation_min,
                                      T output_activation_max) {}
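
// A minimal sketch of the clamp this helper is expected to perform, going by
// its name (illustrative only; `ActivationClampSketch` is not part of TFLite):
template <typename T>
inline T ActivationClampSketch(T x, T output_activation_min,
                               T output_activation_max) {
  return std::min(std::max(x, output_activation_min), output_activation_max);
}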

// Legacy function, left for compatibility only.
template <FusedActivationFunctionType Ac>
float ActivationFunction(float x) {}

inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size,
                         const float* bias_data, int array_size,
                         float* array_data) {}
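
// A hedged scalar sketch of the operation the signature above implies
// (`BiasAndClampSketch` is illustrative; the production version is
// vectorized). The bias vector is applied cyclically across the array, then
// the result is clamped. Assumes array_size is a multiple of bias_size.
inline void BiasAndClampSketch(float clamp_min, float clamp_max, int bias_size,
                               const float* bias_data, int array_size,
                               float* array_data) {
  for (int i = 0; i < array_size; ++i) {
    const float biased = array_data[i] + bias_data[i % bias_size];
    array_data[i] = std::min(std::max(biased, clamp_min), clamp_max);
  }
}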

TFLITE_NOINLINE int32_t MultiplyByQuantizedMultiplier(
    int32_t x, int32_t quantized_multiplier, int shift);

TFLITE_NOINLINE int32_t MultiplyByQuantizedMultiplier(
    int64_t x, int32_t quantized_multiplier, int shift);
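
// A reference scalar sketch of the classic double-rounding quantized multiply
// (illustrative; the declarations above may compile to a single-rounding
// variant depending on TFLITE_SINGLE_ROUNDING). It relies on gemmlowp helpers
// already pulled in through fixedpoint.h, and assumes the left-shifted value
// stays within int32 range, as in typical kernels.
inline int32_t MultiplyByQuantizedMultiplierSketch(
    int32_t x, int32_t quantized_multiplier, int shift) {
  // Positive shifts scale up before the high-mul; negative shifts
  // round-divide afterwards.
  const int left_shift = shift > 0 ? shift : 0;
  const int right_shift = shift > 0 ? 0 : -shift;
  return gemmlowp::RoundingDivideByPOT(
      gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
                                                  quantized_multiplier),
      right_shift);
}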

// Single-rounding MultiplyByQuantizedMultiplier
#if TFLITE_SINGLE_ROUNDING
inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(
    int32_t x, int32_t quantized_multiplier, int shift) {
  TFLITE_DCHECK_LE(shift, 0);
  return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
}

inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(
    int32_t x, int32_t quantized_multiplier, int shift) {
  TFLITE_DCHECK_GE(shift, 0);
  return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
}

#ifdef USE_NEON
inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(
    int32x4x4_t input_val, int32_t quantized_multiplier, int shift) {
  TFLITE_DCHECK(quantized_multiplier >= 0);

  const int right_shift = std::min(-1, shift);
  const int left_shift = shift - right_shift;

  const int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier);
  const int32x4_t left_shift_dup = vdupq_n_s32(left_shift);
  const int32x4_t right_shift_dup = vdupq_n_s32(right_shift);

  int32x4x4_t result;
  result.val[0] = vrshlq_s32(
      vqdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup), multiplier_dup),
      right_shift_dup);

  result.val[1] = vrshlq_s32(
      vqdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup), multiplier_dup),
      right_shift_dup);

  result.val[2] = vrshlq_s32(
      vqdmulhq_s32(vshlq_s32(input_val.val[2], left_shift_dup), multiplier_dup),
      right_shift_dup);

  result.val[3] = vrshlq_s32(
      vqdmulhq_s32(vshlq_s32(input_val.val[3], left_shift_dup), multiplier_dup),
      right_shift_dup);

  return result;
}
#endif  // USE_NEON
#else  // !TFLITE_SINGLE_ROUNDING
// Double-rounding MultiplyByQuantizedMultiplier
inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(
    int32_t x, int32_t quantized_multiplier, int left_shift) {}

inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(
    int32_t x, int32_t quantized_multiplier, int left_shift) {}

#ifdef USE_NEON
// Round uses ARM's rounding shift right.
inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(
    int32x4x4_t input_val, int32_t quantized_multiplier, int shift) {
  const int left_shift = std::max(shift, 0);
  const int right_shift = std::min(shift, 0);
  int32x4x4_t result;

  int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier);
  int32x4_t left_shift_dup = vdupq_n_s32(left_shift);
  int32x4_t right_shift_dup = vdupq_n_s32(right_shift);

  result.val[0] =
      vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup),
                               multiplier_dup),
                 right_shift_dup);

  result.val[1] =
      vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup),
                               multiplier_dup),
                 right_shift_dup);

  result.val[2] =
      vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[2], left_shift_dup),
                               multiplier_dup),
                 right_shift_dup);

  result.val[3] =
      vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[3], left_shift_dup),
                               multiplier_dup),
                 right_shift_dup);

  return result;
}
#endif  // USE_NEON
#endif  // TFLITE_SINGLE_ROUNDING

template <typename T>
int CountLeadingZeros(T integer_input) {}

template <typename T>
inline int CountLeadingSignBits(T integer_input) {}

// Use "count leading zeros" helper functions to do a fast Floor(log_2(x)).
template <typename Integer>
inline Integer FloorLog2(Integer n) {}
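
// A hedged sketch of the floor(log2(n)) computation built on the helper above
// (illustrative; assumes n > 0 and 32-bit inputs):
inline int FloorLog2Sketch(uint32_t n) {
  // The position of the highest set bit is 31 minus the leading-zero count.
  return 31 - CountLeadingZeros(n);
}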

namespace detail {

// LUTPopulate takes an optional type-erased transform_params to allow passing
// extra parameters to the transform function pointer. const void* is used
// instead of std::function to remain compatible with TFLite Micro.
template <typename FloatT, typename Func>
inline typename std::enable_if<std::is_same<Func, FloatT (*)(FloatT)>::value,
                               FloatT>::type
LUTTransform(Func transform, const void* /*transform_params*/, FloatT value) {}

template <typename FloatT, typename Func>
inline typename std::enable_if<
    std::is_same<Func, FloatT (*)(FloatT, const void*)>::value, FloatT>::type
LUTTransform(Func transform, const void* transform_params, FloatT value) {}
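
// An illustrative transform that uses the type-erased parameter slot. The
// struct and function below are hypothetical examples, not part of TFLite.
struct LeakyReluParamsSketch {
  float alpha;
};
inline float LeakyReluTransformSketch(float x, const void* params) {
  // transform_params is recovered by casting back to the known struct type.
  const float alpha = static_cast<const LeakyReluParamsSketch*>(params)->alpha;
  return x >= 0.f ? x : alpha * x;
}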

// Use the same LUT generation code for both uint8_t and int8_t. int8_t indexes
// are directly cast to uint8_t, so the int8 LUT is ordered as [0, 1, ..., 127,
// -128, ..., -2, -1] instead of [-128, -127, ..., -1, 0, 1, ..., 126, 127].
template <typename T, typename Func>
inline void LUTPopulateInt8(float input_scale, int32_t input_zero_point,
                            float output_scale, int32_t output_zero_point,
                            Func transform, const void* transform_params,
                            T* lut) {}

// Keep floating-point type configurable for backward compatibility. float
// should be used for FloatT by default.
template <typename FloatT, typename Func>
inline void LUTPopulateInt16(FloatT input_scale, int32_t input_zero_point,
                             FloatT output_scale, int32_t output_zero_point,
                             Func transform, const void* transform_params,
                             int16_t* lut) {}

}  // namespace detail

template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value ||
                                   std::is_same<T, int8_t>::value,
                               void>::type
LUTPopulate(float input_scale, int32_t input_zero_point, float output_scale,
            int32_t output_zero_point, float (*transform)(float), T* lut) {}

template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value ||
                                   std::is_same<T, int8_t>::value,
                               void>::type
LUTPopulate(float input_scale, int32_t input_zero_point, float output_scale,
            int32_t output_zero_point, float (*transform)(float, const void*),
            const void* transform_params, T* lut) {}

template <typename T>
inline typename std::enable_if<std::is_same<T, int16_t>::value, void>::type
LUTPopulate(float input_scale, int32_t input_zero_point, float output_scale,
            int32_t output_zero_point, float (*transform)(float), T* lut) {}

template <typename T>
inline typename std::enable_if<std::is_same<T, int16_t>::value, void>::type
LUTPopulate(float input_scale, int32_t input_zero_point, float output_scale,
            int32_t output_zero_point, float (*transform)(float, const void*),
            const void* transform_params, T* lut) {}

// Deprecated; avoid usage and prefer the float version. Kept for
// backward compatibility.
template <typename T>
inline typename std::enable_if<std::is_same<T, int16_t>::value, void>::type
LUTPopulate(double input_scale, int32_t input_zero_point, double output_scale,
            int32_t output_zero_point, double (*transform)(double), T* lut) {}

// The size of the LUT depends on the type of input. For uint8 and int8 inputs
// a simple 256-entry LUT is used. For int16 inputs the high 9 bits are used
// for indexing and the remaining 7 bits are used for interpolation. We thus
// use a 513-entry LUT for the int16 case: 512 entries for the 9-bit index plus
// 1 extra entry to interpolate the last value.
template <typename T>
constexpr int LUTSize() {}
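
// Usage sketch (illustrative): build an int8 sigmoid table. A capture-less
// lambda converts to the float (*)(float) pointer expected by LUTPopulate,
// and LUTSize<int8_t>() yields 256 per the comment above.
//
//   int8_t lut[LUTSize<int8_t>()];
//   LUTPopulate<int8_t>(
//       input_scale, input_zero_point, output_scale, output_zero_point,
//       [](float x) { return 1.f / (1.f + std::exp(-x)); }, lut);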

// int16_t -> int16_t table lookup with interpolation
// LUT must have 513 values
inline int16_t LUTLookup(int16_t value, const int16_t* lut) {}
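
// A hedged sketch of the interpolated lookup (illustrative; it mirrors the
// 9-bit-index / 7-bit-fraction split described in the LUTSize comment above):
inline int16_t LUTLookupInt16Sketch(int16_t value, const int16_t* lut) {
  // Rebase the signed 9-bit index into [0, 512); keep 7 fraction bits.
  const int index = 256 + (value >> 7);
  const int offset = value & 0x7f;
  const int32_t base = lut[index];
  const int32_t slope = lut[index + 1] - base;
  // Round the Q0.7 fractional step and add it to the base entry.
  return static_cast<int16_t>(base + ((slope * offset + 64) >> 7));
}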

// int8_t -> int8_t table lookup without interpolation
// LUT must have 256 values
// LUTPopulate<int8_t> has ordered the LUT so that indexing it with an
// int8_t is just done by casting it to an uint8_t.
inline int8_t LUTLookup(int8_t value, const int8_t* lut) {}

// uint8_t -> uint8_t table lookup without interpolation
// LUT must have 256 values
inline uint8_t LUTLookup(uint8_t value, const uint8_t* lut) {}
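
// A minimal sketch of the byte lookups (illustrative): int8_t indexes reuse
// the uint8_t path via a plain cast, matching the LUT ordering documented for
// LUTPopulateInt8 above.
inline int8_t LUTLookupInt8Sketch(int8_t value, const int8_t* lut) {
  return lut[static_cast<uint8_t>(value)];
}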

// Table of sigmoid(i/24) in 0.16 format - 256 elements.

// We use a combined sigmoid and tanh look-up table, since
// tanh(x) = 2*sigmoid(2*x) - 1.
// Both functions are symmetric, so the table is only needed
// for the absolute value of the input.
static const uint16_t sigmoid_table_uint16[256] = {/* 256 values elided */};
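
// A float sanity check of the identity behind the shared table (illustrative;
// `TanhViaSigmoidSketch` is not part of TFLite):
inline float TanhViaSigmoidSketch(float x) {
  const float sigmoid_2x = 1.f / (1.f + std::exp(-2.f * x));
  return 2.f * sigmoid_2x - 1.f;  // Equals std::tanh(x) up to rounding.
}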

// TODO(b/77858996): Add these to gemmlowp.
template <typename IntegerType>
IntegerType SaturatingAddNonGemmlowp(IntegerType a, IntegerType b) {}

template <>
inline std::int32_t SaturatingAddNonGemmlowp(std::int32_t a, std::int32_t b) {}

template <typename tRawType, int tIntegerBits>
gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingAddNonGemmlowp(
    gemmlowp::FixedPoint<tRawType, tIntegerBits> a,
    gemmlowp::FixedPoint<tRawType, tIntegerBits> b) {}

template <typename IntegerType>
IntegerType SaturatingSub(IntegerType a, IntegerType b) {}

template <>
inline std::int16_t SaturatingSub(std::int16_t a, std::int16_t b) {}

template <>
inline std::int32_t SaturatingSub(std::int32_t a, std::int32_t b) {}

template <typename tRawType, int tIntegerBits>
gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingSub(
    gemmlowp::FixedPoint<tRawType, tIntegerBits> a,
    gemmlowp::FixedPoint<tRawType, tIntegerBits> b) {}
// End section to be moved to gemmlowp.
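
// A hedged scalar sketch of saturating subtraction for int16_t (illustrative):
// compute in a wider type, then clamp to the representable range.
inline std::int16_t SaturatingSubSketch(std::int16_t a, std::int16_t b) {
  const std::int32_t diff =
      static_cast<std::int32_t>(a) - static_cast<std::int32_t>(b);
  return static_cast<std::int16_t>(
      std::min<std::int32_t>(32767, std::max<std::int32_t>(-32768, diff)));
}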

template <typename IntegerType>
IntegerType SaturatingRoundingMultiplyByPOTParam(IntegerType x, int exponent) {}

// If we want to leave IntegerBits fixed, then multiplication
// by a power of two has to be saturating/rounding, not exact anymore.
template <typename tRawType, int tIntegerBits>
gemmlowp::FixedPoint<tRawType, tIntegerBits>
SaturatingRoundingMultiplyByPOTParam(
    gemmlowp::FixedPoint<tRawType, tIntegerBits> a, int exponent) {}

// Convert int32_t multiplier to int16_t with rounding.
inline void DownScaleInt32ToInt16Multiplier(int32_t multiplier_int32_t,
                                            int16_t* multiplier_int16_t) {}
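
// A hedged sketch of the down-scaling conversion (illustrative): round-half-up
// shift of the non-negative multiplier by 16 bits, saturated to int16_t.
inline void DownScaleInt32ToInt16MultiplierSketch(
    int32_t multiplier_int32_t, int16_t* multiplier_int16_t) {
  // Widen first so the rounding offset cannot overflow near INT32_MAX.
  const int64_t rounded =
      (static_cast<int64_t>(multiplier_int32_t) + (1 << 15)) >> 16;
  *multiplier_int16_t = static_cast<int16_t>(std::min<int64_t>(rounded, 32767));
}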

// Minimum output bits to accommodate log of maximum input range.  It actually
// does not matter if one considers, say, [-64,64] or [-64,64).
//
// For example, run this through Octave:
// [0:127; ...
//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2)); ...
//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2))]
constexpr int min_log_x_output_bits(int input_bits) {}

// Although currently the name of this function says that it cannot handle
// values less than 1, in practice it can handle as low as 1/x_max, where
// x_max is the largest representable input.  In other words, the output range
// is symmetric.
template <int OutputIntegerBits, int InputIntegerBits>
inline gemmlowp::FixedPoint<int32_t, OutputIntegerBits>
log_x_for_x_greater_than_or_equal_to_1_impl(
    gemmlowp::FixedPoint<int32_t, InputIntegerBits> input_val) {}

template <int OutputIntegerBits, int InputIntegerBits>
inline gemmlowp::FixedPoint<int32_t, OutputIntegerBits>
log_x_for_x_greater_than_or_equal_to_1(
    gemmlowp::FixedPoint<int32_t, InputIntegerBits> input_val) {}

inline int32_t GetReciprocal(int32_t x, int x_integer_digits,
                             int* num_bits_over_unit) {}

inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift,
                                             int32_t* output_inv_sqrt,
                                             int* output_shift) {}

// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
// BROADCASTING.
//
// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
// rectangular array of numbers.
//
// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
// However, as Dims<N> is to be deprecated, this class exists as an adaptor
// to enable simple unoptimized implementations of element-wise broadcasting
// operations.
template <int N>
struct NdArrayDesc {};

// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
// BROADCASTING.
//
// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>.
inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2,
                            int i3) {}

inline int SubscriptToIndex(const NdArrayDesc<5>& desc, int indexes[5]) {}

inline int SubscriptToIndex(const NdArrayDesc<8>& desc, int indexes[8]) {}
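
// A hedged sketch of the 4-D overload (illustrative; assumes NdArrayDesc
// carries extents[N] and strides[N] arrays, as the broadcasting notes below
// imply): the flat offset is the stride-weighted sum of the subscripts.
//
//   inline int SubscriptToIndexSketch(const NdArrayDesc<4>& desc, int i0,
//                                     int i1, int i2, int i3) {
//     return i0 * desc.strides[0] + i1 * desc.strides[1] +
//            i2 * desc.strides[2] + i3 * desc.strides[3];
//   }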

// Given the dimensions of the operands for an element-wise binary broadcast,
// adjusts them so that they can be directly iterated over with simple loops.
// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and
// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr.
//
// This function assumes that the two input shapes are compatible up to
// broadcasting and the shorter one has already been prepended with 1s to be the
// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64),
// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that
// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be
// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1).
//
// When two shapes are compatible up to broadcasting, for each dimension d,
// the input extents are either equal, or one of them is 1.
//
// This function performs the following for each dimension d:
// - If the extents are equal, then do nothing since the loop that walks over
//   both of the input arrays is correct.
// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1
//   and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows
//   array0 to be referenced *at any index* in dimension d and still access the
//   same slice.
template <int N>
inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
                                                const Dims<N>& input1_dims,
                                                NdArrayDesc<N>* desc0_out,
                                                NdArrayDesc<N>* desc1_out) {}
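
// A hedged sketch of the per-dimension rule described above (illustrative;
// again assuming extents[N]/strides[N] members):
//
//   for (int d = 0; d < N; ++d) {
//     if (desc0_out->extents[d] == 1 && desc1_out->extents[d] != 1) {
//       // Pin the broadcast operand: any index in dimension d now reads the
//       // same slice.
//       desc0_out->extents[d] = desc1_out->extents[d];
//       desc0_out->strides[d] = 0;
//     } else if (desc1_out->extents[d] == 1 && desc0_out->extents[d] != 1) {
//       desc1_out->extents[d] = desc0_out->extents[d];
//       desc1_out->strides[d] = 0;
//     }
//   }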

// Copies dims to desc, calculating strides.
template <int N>
TFLITE_NOINLINE void CopyDimsToDesc(const RuntimeShape& input_shape,
                                    NdArrayDesc<N>* desc_out) {}

template <int N>
inline void NdArrayDescsForElementwiseBroadcast(
    const RuntimeShape& input0_shape, const RuntimeShape& input1_shape,
    NdArrayDesc<N>* desc0_out, NdArrayDesc<N>* desc1_out) {}

template <int N>
inline void NdArrayDescsForElementwiseBroadcast(
    const RuntimeShape& input0_shape, const RuntimeShape& input1_shape,
    const RuntimeShape& input2_shape, NdArrayDesc<N>* desc0_out,
    NdArrayDesc<N>* desc1_out, NdArrayDesc<N>* desc2_out) {}

// Detailed implementation of NDOpsHelper; the indexes array must be
// zero-initialized. This implementation is equivalent to N nested loops. E.g.,
// if N=4, it can be rewritten as:
// for (int b = 0; b < output.extents[0]; ++b) {
//   for (int y = 0; y < output.extents[1]; ++y) {
//     for (int x = 0; x < output.extents[2]; ++x) {
//       for (int c = 0; c < output.extents[3]; ++c) {
//           calc({b,y,x,c});
//       }
//     }
//   }
// }
template <int N, int DIM, typename Calc>
typename std::enable_if<DIM != N - 1, void>::type NDOpsHelperImpl(
    const NdArrayDesc<N>& output, const Calc& calc, int indexes[N]) {}

template <int N, int DIM, typename Calc>
typename std::enable_if<DIM == N - 1, void>::type NDOpsHelperImpl(
    const NdArrayDesc<N>& output, const Calc& calc, int indexes[N]) {}

// Execute the calc function in the innermost iteration based on the shape of
// the output. The calc function should take a single argument of type int[N].
template <int N, typename Calc>
inline void NDOpsHelper(const NdArrayDesc<N>& output, const Calc& calc) {}
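
// Usage sketch (illustrative): visit every coordinate of a 4-D output.
//
//   NdArrayDesc<4> output_desc;  // extents/strides filled in elsewhere
//   NDOpsHelper<4>(output_desc, [&](int indexes[4]) {
//     // indexes[0..3] enumerate {b, y, x, c} as in the comment above.
//   });
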
// Copied from gemmlowp::RoundDown when we dropped direct dependency on
// gemmlowp.
//
// Returns the runtime argument rounded down to the nearest multiple of
// the fixed Modulus.
template <unsigned Modulus, typename Integer>
Integer RoundDown(Integer i) {}

// Copied from gemmlowp::RoundUp when we dropped direct dependency on
// gemmlowp.
//
// Returns the runtime argument rounded up to the nearest multiple of
// the fixed Modulus.
template <unsigned Modulus, typename Integer>
Integer RoundUp(Integer i) {}

// Copied from gemmlowp::CeilQuotient when we dropped direct dependency on
// gemmlowp.
//
// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {}
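
// Hedged one-line sketches of the three helpers above (illustrative; these
// mirror the usual gemmlowp formulas):
template <unsigned Modulus, typename Integer>
Integer RoundDownSketch(Integer i) {
  return i - (i % Modulus);
}
template <unsigned Modulus, typename Integer>
Integer RoundUpSketch(Integer i) {
  return RoundDownSketch<Modulus>(i + Modulus - 1);
}
template <typename Integer>
Integer CeilQuotientSketch(Integer a, Integer b) {
  return (a + b - 1) / b;
}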

// This function is a copy of gemmlowp::HowManyThreads, copied when we dropped
// the direct dependency of internal/optimized/ on gemmlowp.
//
// It computes a reasonable number of threads to use for a GEMM of shape
// (rows, cols, depth).
//
// TODO(b/131910176): get rid of this function by switching each call site
// to its own more sensible logic for its own workload.
template <int KernelRows>
inline int LegacyHowManyThreads(int max_num_threads, int rows, int cols,
                                int depth) {}

template <typename T>
void optimized_ops_preload_l1_stream(const T* ptr) {}

template <typename T>
void optimized_ops_preload_l1_keep(const T* ptr) {}

template <typename T>
void optimized_ops_prefetch_write_l1_keep(const T* ptr) {}
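
// A hedged sketch of how such a prefetch helper is typically implemented
// (illustrative): GCC-compatible compilers expose __builtin_prefetch; other
// toolchains fall back to a no-op.
template <typename T>
void PreloadL1KeepSketch(const T* ptr) {
#ifdef __GNUC__
  // Read prefetch (rw = 0) with high temporal locality (locality = 3).
  __builtin_prefetch(ptr, 0, 3);
#else
  (void)ptr;
#endif
}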

}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_COMMON_H_