chromium/third_party/tflite/src/tensorflow/lite/kernels/internal/optimized/optimized_ops.h

/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_OPTIMIZED_OPS_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_OPTIMIZED_OPS_H_

#include <assert.h>
#include <stdint.h>
#include <sys/types.h>

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <memory>
#include <tuple>
#include <type_traits>
#include <utility>

#include "tensorflow/lite/core/macros.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/reference/add.h"
#include "tensorflow/lite/kernels/internal/reference/mul.h"
#include "tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h"

#if defined(TF_LITE_USE_CBLAS) && defined(__APPLE__)
#include <Accelerate/Accelerate.h>
#endif

#include "Eigen/Core"  // from @eigen_archive
#include "unsupported/Eigen/CXX11/Tensor"  // from @eigen_archive
#include "fixedpoint/fixedpoint.h"
#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/core/c/common.h"
#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/cpu_backend_gemm.h"
#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h"
#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/optimized/im2col_utils.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops_utils.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
#include "tensorflow/lite/kernels/internal/tensor_utils.h"
#include "tensorflow/lite/kernels/internal/transpose_utils.h"
#include "tensorflow/lite/kernels/internal/types.h"

#if __aarch64__ && __clang__
#define TFLITE_SOFTMAX_USE_UINT16_LUT
#endif

namespace tflite {
namespace optimized_ops {

// Unoptimized reference ops:
using reference_ops::Broadcast4DSlowGreater;
using reference_ops::Broadcast4DSlowGreaterEqual;
using reference_ops::Broadcast4DSlowGreaterEqualWithScaling;
using reference_ops::Broadcast4DSlowGreaterWithScaling;
using reference_ops::Broadcast4DSlowLess;
using reference_ops::Broadcast4DSlowLessEqual;
using reference_ops::Broadcast4DSlowLessEqualWithScaling;
using reference_ops::Broadcast4DSlowLessWithScaling;
using reference_ops::BroadcastAdd6DSlow;
using reference_ops::BroadcastMul6DSlow;
using reference_ops::BroadcastSub16POTSlow;
using reference_ops::BroadcastSubSlow;
using reference_ops::Concatenation;
using reference_ops::ConcatenationWithScaling;
using reference_ops::DepthConcatenation;
using reference_ops::Div;
using reference_ops::Elu;
using reference_ops::FakeQuant;
using reference_ops::Fill;
using reference_ops::Gather;
using reference_ops::Greater;
using reference_ops::GreaterEqual;
using reference_ops::GreaterEqualWithScaling;
using reference_ops::GreaterWithScaling;
using reference_ops::LeakyRelu;
using reference_ops::Less;
using reference_ops::LessEqual;
using reference_ops::LessEqualWithScaling;
using reference_ops::LessWithScaling;
using reference_ops::ProcessBroadcastShapes;
using reference_ops::RankOneSelect;
using reference_ops::Relu0To1;  // NOLINT
using reference_ops::Relu1;
using reference_ops::Relu6;
using reference_ops::ReluX;
using reference_ops::Round;
using reference_ops::Select;
using reference_ops::SpaceToBatchND;
using reference_ops::Split;
using reference_ops::Sub16;

// TODO(b/80247582) Remove this constant.
// This will be phased out as the shifts are revised with more thought. Use of a
// constant enables us to track progress on this work.
//
// Used to convert from old-style shifts (right) to new-style (left).
static constexpr int kReverseShift = -1;
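// For example, a legacy (right) shift of 4 is expressed in the new convention
// as kReverseShift * 4 == -4, i.e. a left shift of -4.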

// Copied from tensorflow/core/framework/tensor_types.h
template <typename T, int NDIMS = 1, typename IndexType = Eigen::DenseIndex>
struct TTypes {};

// TODO(b/62193649): this function is only needed as long
// as we have the --variable_batch hack.
template <typename Scalar>
MatrixMap<Scalar> MapAsMatrixWithGivenNumberOfRows(Scalar* data,
                                                   const RuntimeShape& shape,
                                                   int rows) {}

static inline void swap_data(ArithmeticParams& arithmetic_params) {}

template <typename ElementwiseF, typename ScalarBroadcastF, typename T>
TFLITE_NOINLINE void BinaryBroadcastFiveFold(
    const ArithmeticParams& unswitched_params,
    const RuntimeShape& unswitched_input1_shape,
    const T* unswitched_input1_data,
    const RuntimeShape& unswitched_input2_shape,
    const T* unswitched_input2_data, const RuntimeShape& output_shape,
    T* output_data, ElementwiseF elementwise_f,
    ScalarBroadcastF scalar_broadcast_f) {}

#ifdef TFLITE_SOFTMAX_USE_UINT16_LUT

// Looks up each element of <indices> in <table>, returns them in a vector.
inline uint8x16_t aarch64_lookup_vector(const uint8x16x4_t table[4],
                                        uint8x16_t indices) {
  // Look up in 1st quarter of the table: top 2 bits of indices == 00
  uint8x16_t output1 = vqtbl4q_u8(table[0], indices);
  // Look up in 2nd quarter of the table: top 2 bits of indices == 01
  uint8x16_t output2 =
      vqtbl4q_u8(table[1], veorq_u8(indices, vdupq_n_u8(0x40)));
  // Look up in 3rd quarter of the table: top 2 bits of indices == 10
  uint8x16_t output3 =
      vqtbl4q_u8(table[2], veorq_u8(indices, vdupq_n_u8(0x80)));
  // Look up in 4th quarter of the table: top 2 bits of indices == 11
  uint8x16_t output4 =
      vqtbl4q_u8(table[3], veorq_u8(indices, vdupq_n_u8(0xc0)));

  // Combine result of the 4 lookups.
  return vorrq_u8(vorrq_u8(output1, output2), vorrq_u8(output3, output4));
}
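
// Conceptually the above is a single 256-entry byte lookup; a scalar sketch of
// the same operation (illustrative only, with `table_flat` standing for the
// 256-byte table laid out contiguously) is:
//
//   for (int i = 0; i < 16; ++i) output[i] = table_flat[indices[i]];
//
// The XOR with 0x40/0x80/0xc0 rebases each quarter's indices into the 0..63
// range addressable by vqtbl4q_u8; out-of-range lanes produce 0, so OR-ing the
// four partial results reconstructs the full lookup.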

#endif

inline void AddBiasAndEvalActivationFunction(float output_activation_min,
                                             float output_activation_max,
                                             const RuntimeShape& bias_shape,
                                             const float* bias_data,
                                             const RuntimeShape& array_shape,
                                             float* array_data) {}

inline void FullyConnected(
    const FullyConnectedParams& params, const RuntimeShape& input_shape,
    const float* input_data, const RuntimeShape& weights_shape,
    const float* weights_data, const RuntimeShape& bias_shape,
    const float* optional_bias_data, const RuntimeShape& output_shape,
    float* output_data, CpuBackendContext* cpu_backend_context) {}

inline void FullyConnected(
    const FullyConnectedParams& params, const RuntimeShape& input_shape,
    const uint8_t* input_data, const RuntimeShape& filter_shape,
    const uint8_t* filter_data, const RuntimeShape& bias_shape,
    const int32_t* bias_data, const RuntimeShape& output_shape,
    uint8_t* output_data, CpuBackendContext* cpu_backend_context) {}

inline void FullyConnected(
    const FullyConnectedParams& params, const RuntimeShape& input_shape,
    const uint8_t* input_data, const RuntimeShape& filter_shape,
    const uint8_t* filter_data, const RuntimeShape& bias_shape,
    const int32_t* bias_data_int32, const RuntimeShape& output_shape,
    int16_t* output_data, CpuBackendContext* cpu_backend_context) {}

// Internal function doing the actual arithmetic work for
// ShuffledFullyConnected.
// May be called either directly by it (single-threaded case) or may be used
// as the 'task' for worker threads to run (multi-threaded case, see
// ShuffledFullyConnectedWorkerTask below).
inline void ShuffledFullyConnectedWorkerImpl(
    const uint8_t* shuffled_input_workspace_data,
    const int8_t* shuffled_weights_data, int batches, int output_depth,
    int output_stride, int accum_depth, const int32_t* bias_data,
    int32_t output_multiplier, int output_shift, int16_t* output_data) {}

// Wraps ShuffledFullyConnectedWorkerImpl into a Task class
// to allow using gemmlowp's threadpool.
struct ShuffledFullyConnectedWorkerTask : cpu_backend_threadpool::Task {};

inline void ShuffledFullyConnected(
    const FullyConnectedParams& params, const RuntimeShape& input_shape,
    const uint8_t* input_data, const RuntimeShape& weights_shape,
    const uint8_t* shuffled_weights_data, const RuntimeShape& bias_shape,
    const int32_t* bias_data, const RuntimeShape& output_shape,
    int16_t* output_data, uint8_t* shuffled_input_workspace_data,
    CpuBackendContext* cpu_backend_context) {}

#ifdef USE_NEON

inline int32x4_t RoundToNearest(const float32x4_t input) {
#if defined(__aarch64__) || defined(__SSSE3__)
  // Note: vcvtnq_s32_f32 is not available in ARMv7
  return vcvtnq_s32_f32(input);
#else
  static const float32x4_t zero_val_dup = vdupq_n_f32(0.0f);
  static const float32x4_t point5_val_dup = vdupq_n_f32(0.5f);
  static const float32x4_t minus_point5_val_dup = vdupq_n_f32(-0.5f);

  const uint32x4_t mask = vcltq_f32(input, zero_val_dup);
  const float32x4_t round =
      vbslq_f32(mask, minus_point5_val_dup, point5_val_dup);
  return vcvtq_s32_f32(vaddq_f32(input, round));
#endif  // defined(__aarch64__) || defined(__SSSE3__)
}
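
// The non-ARMv8 fallback above rounds halves away from zero (add +/-0.5, then
// truncate toward zero), which can differ from vcvtnq_s32_f32's
// round-to-nearest-even behavior on exact .5 ties. A scalar sketch of that
// fallback, for illustration only (not used by the kernels in this file):
inline int32_t RoundToNearestScalarSketch(float x) {
  return static_cast<int32_t>(x + (x < 0.0f ? -0.5f : 0.5f));
}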

inline uint32x4_t RoundToNearestUnsigned(const float32x4_t input) {
#if defined(__aarch64__)
  // Note that vcvtnq_u32_f32 is not available in ARMv7 or in arm_neon_sse.h.
  return vcvtnq_u32_f32(input);
#else
  static const float32x4_t point5_val_dup = vdupq_n_f32(0.5f);

  return vcvtq_u32_f32(vaddq_f32(input, point5_val_dup));
#endif  // defined(__aarch64__)
}

#endif  // USE_NEON

inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
                 const float* input_data, const RuntimeShape& filter_shape,
                 const float* filter_data, const RuntimeShape& bias_shape,
                 const float* bias_data, const RuntimeShape& output_shape,
                 float* output_data, const RuntimeShape& im2col_shape,
                 float* im2col_data, CpuBackendContext* cpu_backend_context) {}

inline void HybridConv(const ConvParams& params, float* scaling_factors_ptr,
                       const RuntimeShape& input_shape,
                       const int8_t* input_data,
                       const RuntimeShape& filter_shape,
                       const int8_t* filter_data,
                       const RuntimeShape& bias_shape, const float* bias_data,
                       const RuntimeShape& accum_scratch_shape,
                       int32_t* accum_scratch, const RuntimeShape& output_shape,
                       float* output_data, const RuntimeShape& im2col_shape,
                       int8_t* im2col_data, CpuBackendContext* context) {}

inline void HybridConvPerChannel(
    const ConvParams& params, float* scaling_factors_ptr,
    const RuntimeShape& input_shape, const int8_t* input_data,
    const RuntimeShape& filter_shape, const int8_t* filter_data,
    const RuntimeShape& bias_shape, const float* bias_data,
    const RuntimeShape& output_shape, float* output_data,
    const RuntimeShape& im2col_shape, int8_t* im2col_data,
    const float* per_channel_scale, int32_t* input_offset,
    const RuntimeShape& scratch_shape, int32_t* scratch, int32_t* row_sums,
    bool* compute_row_sums, CpuBackendContext* cpu_backend_context) {}

inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
                 const uint8_t* input_data, const RuntimeShape& filter_shape,
                 const uint8_t* filter_data, const RuntimeShape& bias_shape,
                 const int32_t* bias_data, const RuntimeShape& output_shape,
                 uint8_t* output_data, const RuntimeShape& im2col_shape,
                 uint8_t* im2col_data, CpuBackendContext* cpu_backend_context) {}

template <typename T>
inline void DepthToSpace(const tflite::DepthToSpaceParams& op_params,
                         const RuntimeShape& unextended_input_shape,
                         const T* input_data,
                         const RuntimeShape& unextended_output_shape,
                         T* output_data) {}

template <typename T>
inline void SpaceToDepth(const tflite::SpaceToDepthParams& op_params,
                         const RuntimeShape& unextended_input_shape,
                         const T* input_data,
                         const RuntimeShape& unextended_output_shape,
                         T* output_data) {}

inline void Relu(const RuntimeShape& input_shape, const float* input_data,
                 const RuntimeShape& output_shape, float* output_data) {}

inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
                            const RuntimeShape& input_shape,
                            const float* input_data,
                            const RuntimeShape& output_shape,
                            float* output_data, float epsilon = 1e-6) {}

inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
                            const RuntimeShape& input_shape,
                            const uint8_t* input_data,
                            const RuntimeShape& output_shape,
                            uint8_t* output_data) {}

inline void AddElementwise(int size, const ArithmeticParams& params,
                           const float* input1_data, const float* input2_data,
                           float* output_data) {}

inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const float* input1_data,
                const RuntimeShape& input2_shape, const float* input2_data,
                const RuntimeShape& output_shape, float* output_data) {}

// Element-wise add that can often be used for inner loop of broadcast add as
// well as the non-broadcast add.
inline void AddElementwise(int size, const ArithmeticParams& params,
                           const uint8_t* input1_data,
                           const uint8_t* input2_data, uint8_t* output_data) {}

// Scalar-broadcast add that can be used for inner loop of more general
// broadcast add, so that, for example, scalar-broadcast with batch will still
// be fast.
inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
                               uint8_t input1_data, const uint8_t* input2_data,
                               uint8_t* output_data) {}

// Scalar-broadcast add that can be used for inner loop of more general
// broadcast add, so that, for example, scalar-broadcast with batch will still
// be fast.
inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
                               float broadcast_value, const float* input2_data,
                               float* output_data) {}
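
// Illustrative usage (a sketch, not necessarily how the broadcast dispatchers
// below are structured): when input1 is a scalar broadcast against a
// [batches, size] input2, the inner loop stays vectorizable by reusing the
// scalar once per batch:
//
//   for (int b = 0; b < batches; ++b) {
//     AddScalarBroadcast(size, params, broadcast_value,
//                        input2_data + b * size, output_data + b * size);
//   }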

inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const uint8_t* input1_data,
                const RuntimeShape& input2_shape, const uint8_t* input2_data,
                const RuntimeShape& output_shape, uint8_t* output_data) {}

inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int16_t* input1_data,
                const RuntimeShape& input2_shape, const int16_t* input2_data,
                const RuntimeShape& output_shape, int16_t* output_data) {}

template <typename T>
inline typename std::enable_if<is_int32_or_int64<T>::value, void>::type Add(
    const ArithmeticParams& params, const RuntimeShape& input1_shape,
    const T* input1_data, const RuntimeShape& input2_shape,
    const T* input2_data, const RuntimeShape& output_shape, T* output_data) {}

template <typename T>
inline void BroadcastAddDispatch(
    const ArithmeticParams& params, const RuntimeShape& input1_shape,
    const T* input1_data, const RuntimeShape& input2_shape,
    const T* input2_data, const RuntimeShape& output_shape, T* output_data) {}

inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
                                 const RuntimeShape& unswitched_input1_shape,
                                 const uint8_t* unswitched_input1_data,
                                 const RuntimeShape& unswitched_input2_shape,
                                 const uint8_t* unswitched_input2_data,
                                 const RuntimeShape& output_shape,
                                 uint8_t* output_data) {}

inline void BroadcastAddFivefold(const ArithmeticParams& params,
                                 const RuntimeShape& unswitched_input1_shape,
                                 const float* unswitched_input1_data,
                                 const RuntimeShape& unswitched_input2_shape,
                                 const float* unswitched_input2_data,
                                 const RuntimeShape& output_shape,
                                 float* output_data) {}

inline void MulElementwise(int size, const ArithmeticParams& params,
                           const float* input1_data, const float* input2_data,
                           float* output_data) {}

inline void MulElementwise(int32_t n, const ArithmeticParams& params,
                           const int32_t* __restrict lhs,
                           const int32_t* __restrict rhs,
                           int32_t* __restrict out) {}

inline void Mul(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const float* input1_data,
                const RuntimeShape& input2_shape, const float* input2_data,
                const RuntimeShape& output_shape, float* output_data) {}

inline void Mul(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int32_t* input1_data,
                const RuntimeShape& input2_shape, const int32_t* input2_data,
                const RuntimeShape& output_shape, int32_t* output_data) {}

inline void MulNoActivation(const ArithmeticParams& params,
                            const RuntimeShape& input1_shape,
                            const int32_t* input1_data,
                            const RuntimeShape& input2_shape,
                            const int32_t* input2_data,
                            const RuntimeShape& output_shape,
                            int32_t* output_data) {}

inline void Mul(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int16_t* input1_data,
                const RuntimeShape& input2_shape, const int16_t* input2_data,
                const RuntimeShape& output_shape, int16_t* output_data) {}

inline void Mul(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int16_t* input1_data,
                const RuntimeShape& input2_shape, const int16_t* input2_data,
                const RuntimeShape& output_shape, uint8_t* output_data) {}

// Element-wise mul that can often be used for inner loop of broadcast Mul as
// well as the non-broadcast Mul.
inline void MulElementwise(int size, const ArithmeticParams& params,
                           const uint8_t* input1_data,
                           const uint8_t* input2_data, uint8_t* output_data) {}

// Broadcast mul that can often be used for inner loop of broadcast Mul.
inline void MulSimpleBroadcast(int size, const ArithmeticParams& params,
                               const uint8_t broadcast_value,
                               const uint8_t* input2_data,
                               uint8_t* output_data) {}

// Broadcast mul that can often be used for inner loop of broadcast Mul.
// This function will handle scalar_value (LHS) * vector_values (RHS).
// Since it's a float function, the input params do not matter here.
inline void MulSimpleBroadcast(int size, const ArithmeticParams& params,
                               const float broadcast_value,
                               const float* input2_data, float* output_data) {}

inline void Mul(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const uint8_t* input1_data,
                const RuntimeShape& input2_shape, const uint8_t* input2_data,
                const RuntimeShape& output_shape, uint8_t* output_data) {}

template <typename T>
inline void BroadcastMulDispatch(
    const ArithmeticParams& params, const RuntimeShape& input1_shape,
    const T* input1_data, const RuntimeShape& input2_shape,
    const T* input2_data, const RuntimeShape& output_shape, T* output_data) {}

inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params,
                                 const RuntimeShape& unswitched_input1_shape,
                                 const uint8_t* unswitched_input1_data,
                                 const RuntimeShape& unswitched_input2_shape,
                                 const uint8_t* unswitched_input2_data,
                                 const RuntimeShape& output_shape,
                                 uint8_t* output_data) {}

inline void BroadcastMulFivefold(const ArithmeticParams& params,
                                 const RuntimeShape& unswitched_input1_shape,
                                 const float* unswitched_input1_data,
                                 const RuntimeShape& unswitched_input2_shape,
                                 const float* unswitched_input2_data,
                                 const RuntimeShape& output_shape,
                                 float* output_data) {}

// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
// TODO(benoitjacob): BroadcastDiv is intentionally duplicated from
// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
// reference_ops.h.
template <typename T, int N = 5>
void BroadcastDivSlow(const ArithmeticParams& params,
                      const RuntimeShape& unextended_input1_shape,
                      const T* input1_data,
                      const RuntimeShape& unextended_input2_shape,
                      const T* input2_data,
                      const RuntimeShape& unextended_output_shape,
                      T* output_data) {}

// BroadcastDiv is intentionally duplicated from reference_ops.h.
// For more details see the comment above the generic version of
// BroadcastDivSlow.
template <int N = 5>
inline void BroadcastDivSlow(const ArithmeticParams& params,
                             const RuntimeShape& unextended_input1_shape,
                             const uint8_t* input1_data,
                             const RuntimeShape& unextended_input2_shape,
                             const uint8_t* input2_data,
                             const RuntimeShape& unextended_output_shape,
                             uint8_t* output_data) {}

template <typename T>
inline void SubWithActivation(
    const ArithmeticParams& params, const RuntimeShape& input1_shape,
    const T* input1_data, const RuntimeShape& input2_shape,
    const T* input2_data, const RuntimeShape& output_shape, T* output_data) {}

inline void SubNonBroadcast(const ArithmeticParams& params,
                            const RuntimeShape& input1_shape,
                            const float* input1_data,
                            const RuntimeShape& input2_shape,
                            const float* input2_data,
                            const RuntimeShape& output_shape,
                            float* output_data) {}

template <typename T>
void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
         const T* input1_data, const RuntimeShape& input2_shape,
         const T* input2_data, const RuntimeShape& output_shape,
         T* output_data) {}

inline void LstmCell(
    const LstmCellParams& params, const RuntimeShape& unextended_input_shape,
    const float* input_data, const RuntimeShape& unextended_prev_activ_shape,
    const float* prev_activ_data, const RuntimeShape& weights_shape,
    const float* weights_data, const RuntimeShape& unextended_bias_shape,
    const float* bias_data, const RuntimeShape& unextended_prev_state_shape,
    const float* prev_state_data,
    const RuntimeShape& unextended_output_state_shape, float* output_state_data,
    const RuntimeShape& unextended_output_activ_shape, float* output_activ_data,
    const RuntimeShape& unextended_concat_temp_shape, float* concat_temp_data,
    const RuntimeShape& unextended_activ_temp_shape, float* activ_temp_data,
    CpuBackendContext* cpu_backend_context) {}

template <int StateIntegerBits>
inline void LstmCell(
    const LstmCellParams& params, const RuntimeShape& unextended_input_shape,
    const uint8_t* input_data_uint8,
    const RuntimeShape& unextended_prev_activ_shape,
    const uint8_t* prev_activ_data_uint8, const RuntimeShape& weights_shape,
    const uint8_t* weights_data_uint8,
    const RuntimeShape& unextended_bias_shape, const int32_t* bias_data_int32,
    const RuntimeShape& unextended_prev_state_shape,
    const int16_t* prev_state_data_int16,
    const RuntimeShape& unextended_output_state_shape,
    int16_t* output_state_data_int16,
    const RuntimeShape& unextended_output_activ_shape,
    uint8_t* output_activ_data_uint8,
    const RuntimeShape& unextended_concat_temp_shape,
    uint8_t* concat_temp_data_uint8,
    const RuntimeShape& unextended_activ_temp_shape,
    int16_t* activ_temp_data_int16, CpuBackendContext* cpu_backend_context) {}

inline int NodeOffset(int b, int h, int w, int height, int width) {}

inline bool AveragePool(const PoolParams& params,
                        const RuntimeShape& input_shape,
                        const float* input_data,
                        const RuntimeShape& output_shape, float* output_data) {}

inline bool AveragePool(const PoolParams& params,
                        const RuntimeShape& input_shape,
                        const uint8_t* input_data,
                        const RuntimeShape& output_shape,
                        uint8_t* output_data) {}

inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
                    const float* input_data, const RuntimeShape& output_shape,
                    float* output_data) {}

inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
                    const uint8_t* input_data, const RuntimeShape& output_shape,
                    uint8_t* output_data) {}

inline void L2Pool(const PoolParams& params, const RuntimeShape& input_shape,
                   const float* input_data, const RuntimeShape& output_shape,
                   float* output_data) {}

inline void LocalResponseNormalization(
    const tflite::LocalResponseNormalizationParams& op_params,
    const RuntimeShape& input_shape, const float* input_data,
    const RuntimeShape& output_shape, float* output_data) {}

inline void SoftmaxImpl(const SoftmaxParams& params,
                        const RuntimeShape& input_shape,
                        const float* input_data,
                        const RuntimeShape& output_shape, float* output_data,
                        int start_batch, int end_batch) {}

struct SoftmaxWorkerTask : cpu_backend_threadpool::Task {};

inline void Softmax(const SoftmaxParams& params,
                    const RuntimeShape& input_shape, const float* input_data,
                    const RuntimeShape& output_shape, float* output_data,
                    CpuBackendContext* cpu_backend_context = nullptr) {}

template <typename T>
inline int32_t QuantizeSoftmaxOutput(float prob_rescaled, int32_t zero_point) {}

#if !__aarch64__
// With ARM64, rounding is faster than add + truncation.
template <>
inline int32_t QuantizeSoftmaxOutput<uint8_t>(float prob_rescaled,
                                              int32_t zero_point) {}
#endif

inline void PopulateSoftmaxLookupTable(SoftmaxParams* data, float input_scale,
                                       float beta) {}

template <typename In, typename Out>
inline void Softmax(const SoftmaxParams& params,
                    const RuntimeShape& input_shape, const In* input_data,
                    const RuntimeShape& output_shape, Out* output_data) {}

// Here's the softmax LUT optimization strategy:
// For softmax, we can do a mathematically equivalent transformation:
//
// softmax(x) = e^x / sum(e^x, 0...n)  ===> equals to
// softmax(x) = e^(x - CONST) / sum(e^(x - CONST), 0...n)
//
// For quantization, `x` in our case is (input_q - input_zp) * input_s
// For uint8_t case (int8_t can be handled similarly), the range is [0, 255]
//
// so if we let
// CONST = (255 - input_zp) * input_s
// then we will have:
// softmax(x) = e^((input_q - 255) * input_s) --------- (1)
//         /
// sum(e^((input_q - 255) * input_s), 0...n)   -------- (2)
//
// the good thing about (1) is it's within the range of (0, 1), so we can
// approximate its result with uint16_t.
//  (1) = uint8_out * 1 / 2^16.
//
// so (1) is lookup_uint8_table(input_zp) * 1 / 2^16.
// then (2) is essentially the following:
// sum(lookup_uint8_table(input_zp), 0...n) / 2^16.
//
// since (output_q - output_zp) * output_s = softmax(x)
// output_q = lookup_uint8_table(input_zp)
//            /
// (sum(lookup_uint8_table(input_zp), 0...n) * output_s)
//             +
//   output_zp
//
// We can actually further improve the performance by using uint8_t instead of
// uint16_t, but then we may lose some accuracy, so we need to pay attention
// to that.
inline void PopulateSoftmaxUInt8LookupTable(SoftmaxParams* data,
                                            float input_scale, float beta) {}
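
// A minimal sketch of the table construction described above, assuming the
// caller provides two 256-entry tables holding the high and low bytes of the
// uint16_t approximation of e^((q - 255) * input_scale * beta). Illustrative
// only; it is not necessarily how PopulateSoftmaxUInt8LookupTable stores its
// tables internally.
inline void PopulateSoftmaxUInt8LookupTableSketch(uint8_t* table_hi,
                                                  uint8_t* table_lo,
                                                  float input_scale,
                                                  float beta) {
  const float scale = input_scale * beta;
  for (int q = 0; q <= 255; ++q) {
    // e^((q - 255) * scale) lies in (0, 1]; represent it in 16 bits.
    int32_t v =
        static_cast<int32_t>(std::exp(scale * (q - 255)) * 65535.0f + 0.5f);
    v = std::min(v, 65535);
    table_hi[q] = static_cast<uint8_t>(v >> 8);
    table_lo[q] = static_cast<uint8_t>(v & 0xff);
  }
}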

inline int FindMaxValue(int size, const uint8_t* input_data, uint8_t offset) {}

#ifdef USE_NEON
// Value_to_store layout:
// [high_high, high_low, low_high, low_low].
inline void StoreValue(int32x4x4_t value_to_store, int8_t* output) {
  const int16x8_t result_1 = vcombine_s16(vqmovn_s32(value_to_store.val[1]),
                                          vqmovn_s32(value_to_store.val[0]));
  const int16x8_t result_2 = vcombine_s16(vqmovn_s32(value_to_store.val[3]),
                                          vqmovn_s32(value_to_store.val[2]));
  const int8x16_t result =
      vcombine_s8(vqmovn_s16(result_2), vqmovn_s16(result_1));
  vst1q_s8(output, result);
}

// Value_to_store layout:
// [high_high, high_low, low_high, low_low].
inline void StoreValue(int32x4x4_t value_to_store, uint8_t* output) {
  const uint16x8_t result_1 =
      vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[1])),
                   vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[0])));
  const uint16x8_t result_2 =
      vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[3])),
                   vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[2])));
  const uint8x16_t result =
      vcombine_u8(vqmovn_u16(result_2), vqmovn_u16(result_1));
  vst1q_u8(output, result);
}

#endif

template <typename In, typename Out>
inline void SoftmaxInt8LUT(const SoftmaxParams& params,
                           const RuntimeShape& input_shape,
                           const In* input_data,
                           const RuntimeShape& output_shape, Out* output_data) {}

inline void LogSoftmax(const SoftmaxParams& params,
                       const RuntimeShape& input_shape, const float* input_data,
                       const RuntimeShape& output_shape, float* output_data) {}

// Backwards compatibility. Less optimized than the version below.
inline void LogSoftmax(const SoftmaxParams& params,
                       const RuntimeShape& input_shape,
                       const uint8_t* input_data,
                       const RuntimeShape& output_shape, uint8_t* output_data) {}

// Compute LogSoftmax as (x - x_max) - ln(sum(e^(x_i - x_max)))
// as done in tf.nn.log_softmax to prevent underflow and overflow.
// This is in contrast to just log(softmax(x))
//
// To handle quantization, first dequantize the inputs (from doing
// e^(input scale * val) where we ignore the zero point since it cancels
// out during subtraction due to the ln) and do a rescale at the end to int8_t.
//
// Notably this makes use of float and is intended as the optimized
// form for quantized execution on CPU. For a fully integer version,
// see the reference op.
//
// TODO(tflite): notes for optimization:
// 1) See if e^ is also bottleneck in the reference fully-integer
// version and apply lookup there and compare.
template <typename T>
inline void LogSoftmax(const SoftmaxParams& params, float input_scale,
                       const RuntimeShape& input_shape, const T* input_data,
                       const RuntimeShape& output_shape, T* output_data) {}
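
// A minimal float sketch of the per-row math described above (illustrative
// only; the quantized overload additionally rescales into the output type).
// Assumes depth >= 1.
inline void LogSoftmaxRowSketch(const float* input, int depth, float* output) {
  float max_val = input[0];
  for (int i = 1; i < depth; ++i) max_val = std::max(max_val, input[i]);
  float sum = 0.0f;
  for (int i = 0; i < depth; ++i) sum += std::exp(input[i] - max_val);
  const float log_sum = std::log(sum);
  for (int i = 0; i < depth; ++i) output[i] = input[i] - max_val - log_sum;
}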

inline void Logistic(const RuntimeShape& input_shape, const float* input_data,
                     const RuntimeShape& output_shape, float* output_data) {}

// Convenience version that allows, for example, generated-code calls to be
// uniform between data types.
inline void Logistic(const LogisticParams&, const RuntimeShape& input_shape,
                     const float* input_data, const RuntimeShape& output_shape,
                     float* output_data) {}

inline void Logistic(const LogisticParams& params,
                     const RuntimeShape& input_shape, const int16_t* input_data,
                     const RuntimeShape& output_shape, int16_t* output_data) {}

inline void Tanh(const RuntimeShape& input_shape, const float* input_data,
                 const RuntimeShape& output_shape, float* output_data) {}

// Convenience version that allows, for example, generated-code calls to be
// uniform between data types.
inline void Tanh(const TanhParams&, const RuntimeShape& input_shape,
                 const float* input_data, const RuntimeShape& output_shape,
                 float* output_data) {}

inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
                 const int16_t* input_data, const RuntimeShape& output_shape,
                 int16_t* output_data) {}

template <typename SrcT, typename DstT>
inline void Cast(const RuntimeShape& input_shape, const SrcT* input_data,
                 const RuntimeShape& output_shape, DstT* output_data) {}

inline void Floor(const RuntimeShape& input_shape, const float* input_data,
                  const RuntimeShape& output_shape, float* output_data) {}

inline void Ceil(const RuntimeShape& input_shape, const float* input_data,
                 const RuntimeShape& output_shape, float* output_data) {}

// Helper methods for BatchToSpaceND.
// `spatial_index_dim` specifies the post-crop offset index in this spatial
// dimension, i.e. the spatial offset introduced by flattening the batch into
// the spatial dimension, minus the crop size at the beginning.
// `block_shape_dim` is the block size in the current dimension. `input_dim`
// and `output_dim` are the input and output sizes of the BatchToSpaceND
// operation in the current dimension.
// Output start index is inclusive and end index is exclusive.
inline void GetIndexRange(int spatial_index_dim, int block_shape_dim,
                          int input_dim, int output_dim, int* start_index,
                          int* end_index) {}
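
// A sketch of the intended index math (illustrative; the elided body may
// differ in detail): both bounds are rounded up to the next multiple of
// block_shape_dim via the integer division, and end_index is exclusive:
//
//   *start_index = std::max(
//       0, (-spatial_index_dim + block_shape_dim - 1) / block_shape_dim);
//   *end_index = std::min(
//       input_dim, (output_dim - spatial_index_dim + block_shape_dim - 1) /
//                      block_shape_dim);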

template <typename T>
inline void BatchToSpaceND(const RuntimeShape& unextended_input1_shape,
                           const T* input1_data,
                           const RuntimeShape& unextended_input2_shape,
                           const int32_t* block_shape_data,
                           const RuntimeShape& unextended_input3_shape,
                           const int32_t* crops_data,
                           const RuntimeShape& unextended_output_shape,
                           T* output_data) {}

template <typename T>
TFLITE_NOINLINE void TypedMemset(void* ptr, T value, size_t num) {}

// This makes heavy use of Offset, along with conditional branches. There may be
// opportunities for improvement.
//
// There are two versions of pad: Pad and PadV2.  In PadV2 there is a second
// scalar input that provides the padding value.  Therefore pad_value_ptr can be
// equivalent to a simple input1_data.  For Pad, it should point to a zero
// value.
//
// Note that two typenames are required, so that T=P=int32_t is treated as a
// specialization distinct from the generic case where only P is int32_t.
template <typename T, typename P>
inline void PadImpl(const tflite::PadParams& op_params,
                    const RuntimeShape& input_shape, const T* input_data,
                    const P* pad_value_ptr, const RuntimeShape& output_shape,
                    T* output_data) {}

template <typename T, typename P>
inline void Pad(const tflite::PadParams& op_params,
                const RuntimeShape& input_shape, const T* input_data,
                const P* pad_value_ptr, const RuntimeShape& output_shape,
                T* output_data) {}
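
// Illustrative call pattern (a sketch): for Pad the padding value is zero, so
// pad_value_ptr can point at a local zero of the payload type; for PadV2 it
// points at the op's second (scalar) input:
//
//   const float kPadZero = 0.0f;
//   Pad(op_params, input_shape, input_data, &kPadZero, output_shape,
//       output_data);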

// The second (pad-value) input can be int32_t when, say, the first is uint8_t.
template <typename T>
inline void Pad(const tflite::PadParams& op_params,
                const RuntimeShape& input_shape, const T* input_data,
                const int32_t* pad_value_ptr, const RuntimeShape& output_shape,
                T* output_data) {}

// This version avoids conflicting template matching.
template <>
inline void Pad(const tflite::PadParams& op_params,
                const RuntimeShape& input_shape, const int32_t* input_data,
                const int32_t* pad_value_ptr, const RuntimeShape& output_shape,
                int32_t* output_data) {}

// TODO(b/117643175): Optimize. (This is an introductory copy of standard Pad.)
//
// This pad requires that (a) left and right paddings are in the 4D patterns
// {0, h_pad, w_pad, 0}, and (b) memset can be used: *pad_value_ptr == 0 and/or
// T is uint8_t.
//
// There are two versions of pad: Pad and PadV2.  In PadV2 there is a second
// scalar input that provides the padding value.  Therefore pad_value_ptr can be
// equivalent to a simple input1_data.  For Pad, it should point to a zero
// value.
//
// Note that two typenames are required, so that T=P=int32_t is treated as a
// specialization distinct from the generic case where only P is int32_t.
template <typename T, typename P>
inline void PadImageStyleMemset(const tflite::PadParams& op_params,
                                const RuntimeShape& input_shape,
                                const T* input_data, const P* pad_value_ptr,
                                const RuntimeShape& output_shape,
                                T* output_data) {}

template <typename T, typename P>
inline void PadImageStyle(const tflite::PadParams& op_params,
                          const RuntimeShape& input_shape, const T* input_data,
                          const P* pad_value_ptr,
                          const RuntimeShape& output_shape, T* output_data) {}

template <typename P>
inline void PadImageStyle(const tflite::PadParams& op_params,
                          const RuntimeShape& input_shape,
                          const uint8_t* input_data, const P* pad_value_ptr,
                          const RuntimeShape& output_shape,
                          uint8_t* output_data) {}

template <typename P>
inline void PadImageStyle(const tflite::PadParams& op_params,
                          const RuntimeShape& input_shape,
                          const float* input_data, const P* pad_value_ptr,
                          const RuntimeShape& output_shape,
                          float* output_data) {}

template <typename T>
inline void Slice(const tflite::SliceParams& op_params,
                  const RuntimeShape& input_shape,
                  const RuntimeShape& output_shape,
                  SequentialTensorWriter<T>* writer) {}

template <typename T>
inline void Slice(const tflite::SliceParams& op_params,
                  const RuntimeShape& input_shape, const T* input_data,
                  const RuntimeShape& output_shape, T* output_data) {}

template <typename T>
inline void Slice(const tflite::SliceParams& op_params,
                  const RuntimeShape& input_shape, const TfLiteTensor* input,
                  const RuntimeShape& output_shape, TfLiteTensor* output) {}

template <typename T>
void Minimum(const RuntimeShape& input1_shape, const T* input1_data,
             const T* input2_data, const RuntimeShape& output_shape,
             T* output_data) {}

// Convenience version that allows, for example, generated-code calls to be
// the same as other binary ops.
template <typename T>
inline void Minimum(const RuntimeShape& input1_shape, const T* input1_data,
                    const RuntimeShape&, const T* input2_data,
                    const RuntimeShape& output_shape, T* output_data) {}

template <typename T>
void Maximum(const RuntimeShape& input1_shape, const T* input1_data,
             const T* input2_data, const RuntimeShape& output_shape,
             T* output_data) {}

// Convenience version that allows, for example, generated-code calls to be
// the same as other binary ops.
template <typename T>
inline void Maximum(const RuntimeShape& input1_shape, const T* input1_data,
                    const RuntimeShape&, const T* input2_data,
                    const RuntimeShape& output_shape, T* output_data) {}

template <typename T>
void TransposeIm2col(const ConvParams& params, uint8_t zero_byte,
                     const RuntimeShape& input_shape, const T* input_data,
                     const RuntimeShape& filter_shape,
                     const RuntimeShape& output_shape, T* im2col_data) {}

// Returns in 'im_data' (assumed to be zero-initialized) the image patch in
// storage order (height, width, depth), constructed from patches in
// 'col_data', which is required to be in storage order
// (out_height * out_width, filter_height, filter_width, in_depth).
// Implementation by Yangqing Jia (jiayq).
// Copied from //tensorflow/core/kernels/conv_grad_input_ops.cc
template <typename T>
void Col2im(const T* col_data, const int depth, const int height,
            const int width, const int filter_h, const int filter_w,
            const int pad_t, const int pad_l, const int pad_b, const int pad_r,
            const int stride_h, const int stride_w, T* im_data) {}
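
// Conceptually (a sketch, not the elided implementation): each column of
// 'col_data' holds one filter-sized patch, and col2im scatters it back by
// accumulating into the overlapping image locations, skipping positions that
// fall in the padding:
//
//   for each output position (oh, ow):
//     for each (fh, fw) in the filter window, and each channel d:
//       ih = oh * stride_h - pad_t + fh; iw = ow * stride_w - pad_l + fw;
//       if (ih, iw) is inside the image:
//         im_data[(ih * width + iw) * depth + d] += *col_data;
//       ++col_data;  // advances whether or not the position is padded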

// TODO(b/188008864) Optimize this function by combining outer loops.
template <typename T>
void BiasAdd(T* im_data, const T* bias_data, const int batch_size,
             const int height, const int width, const int depth) {}

// TransposeConvV2 expects the weights in HWOI order.
inline void TransposeConvV2(
    const ConvParams& params, const RuntimeShape& input_shape,
    const float* input_data, const RuntimeShape& hwoi_ordered_filter_shape,
    const float* hwoi_ordered_filter_data, const RuntimeShape& bias_shape,
    const float* bias_data, const RuntimeShape& output_shape,
    float* const output_data, const RuntimeShape& col2im_shape,
    float* col2im_data, CpuBackendContext* cpu_backend_context) {}

inline void Quantize(int32_t multiplier, int32_t shift, int32_t total_size,
                     int32_t output_zp, const int32_t output_min,
                     const int32_t output_max, int32_t* scratch,
                     uint8_t* output) {}

// Single-rounding MultiplyByQuantizedMultiplier
#if TFLITE_SINGLE_ROUNDING
inline void Quantize(const int32_t* multiplier, const int32_t* shift,
                     int32_t channel_size, int32_t total_size,
                     int32_t output_zp, int32_t output_min, int32_t output_max,
                     int32_t* scratch, int8_t* output) {
  ruy::profiler::ScopeLabel label("Quantize/int8_t");

  // Here we're trying to quantize the raw accumulators:
  //        output_channels
  //       data data data data data
  // rows  data data data data data
  //       data data data data data
  //          ....
  //
  // In order to minimize the reload of the multipliers & shifts, once we load
  // the multipliers & shifts, we load & quantize the raw accumulators for every
  // row.
#ifdef USE_NEON
  const int32x4_t output_offset_vec = vdupq_n_s32(output_zp);
  const int32x4_t output_activation_min_vec = vdupq_n_s32(output_min);
  const int32x4_t output_activation_max_vec = vdupq_n_s32(output_max);
  const int32x4_t minus_ones = vdupq_n_s32(-1);
#endif

  TFLITE_DCHECK_EQ(total_size % channel_size, 0);
  const int32_t rows = total_size / channel_size;

  int c = 0;

#ifdef USE_NEON
  for (; c <= channel_size - 8; c += 8) {
    int32x4_t out_shift_1 = vld1q_s32(shift + c);
    int32x4_t out_shift_2 = vld1q_s32(shift + c + 4);

    int32x4_t right_shift_1 = vminq_s32(out_shift_1, minus_ones);
    int32x4_t right_shift_2 = vminq_s32(out_shift_2, minus_ones);

    int32x4_t left_shift_1 = vsubq_s32(out_shift_1, right_shift_1);
    int32x4_t left_shift_2 = vsubq_s32(out_shift_2, right_shift_2);

    int32x4_t out_mul_1 = vld1q_s32(multiplier + c);
    int32x4_t out_mul_2 = vld1q_s32(multiplier + c + 4);
    for (int n = 0; n < rows; ++n) {
      int loc = n * channel_size + c;
      int32x4_t acc_1 = vld1q_s32(scratch + loc);
      int32x4_t acc_2 = vld1q_s32(scratch + loc + 4);

      // Saturating Doubling High Mul.
      acc_1 = vshlq_s32(acc_1, left_shift_1);
      acc_1 = vqdmulhq_s32(acc_1, out_mul_1);
      acc_2 = vshlq_s32(acc_2, left_shift_2);
      acc_2 = vqdmulhq_s32(acc_2, out_mul_2);

      // Rounding Dividing By POT.
      acc_1 = vrshlq_s32(acc_1, right_shift_1);
      acc_2 = vrshlq_s32(acc_2, right_shift_2);

      // Add the output offset.
      acc_1 = vaddq_s32(acc_1, output_offset_vec);
      acc_2 = vaddq_s32(acc_2, output_offset_vec);

      // Apply the activation function.
      acc_1 = vmaxq_s32(acc_1, output_activation_min_vec);
      acc_1 = vminq_s32(acc_1, output_activation_max_vec);
      acc_2 = vmaxq_s32(acc_2, output_activation_min_vec);
      acc_2 = vminq_s32(acc_2, output_activation_max_vec);

      // Saturating cast to int8_t and store to destination.
      const int16x4_t acc_s16_1 = vqmovn_s32(acc_1);
      const int16x4_t acc_s16_2 = vqmovn_s32(acc_2);
      const int16x8_t res_s16 = vcombine_s16(acc_s16_1, acc_s16_2);
      const int8x8_t res_s8 = vqmovn_s16(res_s16);
      vst1_s8(output + loc, res_s8);
    }
  }

#endif  // USE_NEON
  // Handle leftover values, one by one. This is very slow.
  for (; c < channel_size; c++) {
    for (int n = 0; n < rows; ++n) {
      int loc = n * channel_size + c;
      int32_t acc = scratch[loc];
      acc = MultiplyByQuantizedMultiplier(acc, multiplier[c], shift[c]);
      acc += output_zp;
      acc = std::max(acc, output_min);
      acc = std::min(acc, output_max);
      output[loc] = static_cast<int8_t>(acc);
    }
  }
}

inline void Quantize(const int32_t* multiplier, const int32_t* shift,
                     int32_t channel_size, int32_t total_size,
                     int32_t output_zp, int32_t output_min, int32_t output_max,
                     int32_t* scratch, int16_t* output) {
  ruy::profiler::ScopeLabel label("Quantize(Single-rounding)/int16_t");

  // Here we're trying to quantize the raw accumulators:
  //        output_channels
  //       data data data data data
  // rows  data data data data data
  //       data data data data data
  //          ....
  //
  // In order to minimize the reload of the multipliers & shifts, once we load
  // the multipliers & shifts, we load & quantize the raw accumulators for every
  // row.
#ifdef USE_NEON
  const int32x4_t output_offset_vec = vdupq_n_s32(output_zp);
  const int32x4_t output_activation_min_vec = vdupq_n_s32(output_min);
  const int32x4_t output_activation_max_vec = vdupq_n_s32(output_max);
  const int32x4_t minus_ones = vdupq_n_s32(-1);
#endif

  TFLITE_DCHECK_EQ(total_size % channel_size, 0);
  const int32_t rows = total_size / channel_size;

  int c = 0;

#ifdef USE_NEON
  for (; c <= channel_size - 8; c += 8) {
    int32x4_t out_shift_1 = vld1q_s32(shift + c);
    int32x4_t out_shift_2 = vld1q_s32(shift + c + 4);

    int32x4_t right_shift_1 = vminq_s32(out_shift_1, minus_ones);
    int32x4_t right_shift_2 = vminq_s32(out_shift_2, minus_ones);

    int32x4_t left_shift_1 = vsubq_s32(out_shift_1, right_shift_1);
    int32x4_t left_shift_2 = vsubq_s32(out_shift_2, right_shift_2);

    int32x4_t out_mul_1 = vld1q_s32(multiplier + c);
    int32x4_t out_mul_2 = vld1q_s32(multiplier + c + 4);
    for (int n = 0; n < rows; ++n) {
      int loc = n * channel_size + c;
      int32x4_t acc_1 = vld1q_s32(scratch + loc);
      int32x4_t acc_2 = vld1q_s32(scratch + loc + 4);

      // Saturating Doubling High Mul.
      acc_1 = vshlq_s32(acc_1, left_shift_1);
      acc_1 = vqdmulhq_s32(acc_1, out_mul_1);
      acc_2 = vshlq_s32(acc_2, left_shift_2);
      acc_2 = vqdmulhq_s32(acc_2, out_mul_2);

      // Rounding Dividing By POT.
      acc_1 = vrshlq_s32(acc_1, right_shift_1);
      acc_2 = vrshlq_s32(acc_2, right_shift_2);

      // Add the output offset.
      acc_1 = vaddq_s32(acc_1, output_offset_vec);
      acc_2 = vaddq_s32(acc_2, output_offset_vec);

      // Apply the activation function.
      acc_1 = vmaxq_s32(acc_1, output_activation_min_vec);
      acc_1 = vminq_s32(acc_1, output_activation_max_vec);
      acc_2 = vmaxq_s32(acc_2, output_activation_min_vec);
      acc_2 = vminq_s32(acc_2, output_activation_max_vec);

      // Saturating cast to int16_t and store to destination.
      const int16x4_t acc_s16_1 = vqmovn_s32(acc_1);
      const int16x4_t acc_s16_2 = vqmovn_s32(acc_2);
      vst1_s16(reinterpret_cast<int16_t*>(output) + loc, acc_s16_1);
      vst1_s16(reinterpret_cast<int16_t*>(output) + loc + 4, acc_s16_2);
    }
  }

#endif  // USE_NEON
  // Handle leftover values, one by one. This is very slow.
  for (; c < channel_size; c++) {
    for (int n = 0; n < rows; ++n) {
      int loc = n * channel_size + c;
      int32_t acc = scratch[loc];
      acc = MultiplyByQuantizedMultiplier(acc, multiplier[c], shift[c]);
      acc += output_zp;
      acc = std::max(acc, output_min);
      acc = std::min(acc, output_max);
      output[loc] = static_cast<int16_t>(acc);
    }
  }
}
// Double-rounding MultiplyByQuantizedMultiplier
#else
inline void Quantize(const int32_t* multiplier, const int32_t* shift,
                     int32_t channel_size, int32_t total_size,
                     int32_t output_zp, int32_t output_min, int32_t output_max,
                     int32_t* scratch, int8_t* output) {}

inline void Quantize(const int32_t* multiplier, const int32_t* shift,
                     int32_t channel_size, int32_t total_size,
                     int32_t output_zp, int32_t output_min, int32_t output_max,
                     int32_t* scratch, int16_t* output) {}
#endif  // TFLITE_SINGLE_ROUNDING

// TransposeConvV2 expects the weights in HWOI order.
inline void TransposeConvV2(
    const ConvParams& params, const RuntimeShape& input_shape,
    const uint8_t* input_data, const RuntimeShape& hwoi_ordered_filter_shape,
    const uint8_t* hwoi_ordered_filter_data, const RuntimeShape& bias_shape,
    const int32_t* bias_data, const RuntimeShape& output_shape,
    uint8_t* output_data, const RuntimeShape& col2im_shape,
    int32_t* col2im_data, int32_t* scratch_data,
    CpuBackendContext* cpu_backend_context) {}

// Integer-only version of ResizeNearestNeighbor. Since scales are represented
// in fixed-point and thus approximated, |in_x| or |in_y| may differ from the
// reference version. Debug checks are in place to test if this occurs.
// NOTE: If align_corners or half_pixel_centers is true, we use the reference
// version.
inline void ResizeNearestNeighbor(
    const tflite::ResizeNearestNeighborParams& op_params,
    const RuntimeShape& unextended_input_shape, const uint8_t* input_data,
    const RuntimeShape& output_size_shape, const int32_t* output_size_data,
    const RuntimeShape& unextended_output_shape, uint8_t* output_data) {}
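
// Worked example of the fixed-point approximation (a sketch; the elided body
// may differ in detail): the scale can be held in Q16.16, e.g. for
// input_width = 10 and output_width = 4,
//
//   width_scale ~= (10 << 16) / 4;            // 2.5 in Q16.16
//   in_x = std::min((out_x * width_scale) >> 16, input_width - 1);
//
// Rounding the scale to Q16.16 is why |in_x| / |in_y| can occasionally differ
// from the float reference computation.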

template <typename input_type, typename output_type>
inline void Requantize(const input_type* input_data, int32_t size,
                       int32_t effective_scale_multiplier,
                       int32_t effective_scale_shift, int32_t input_zeropoint,
                       int32_t output_zeropoint, output_type* output_data) {}

template <>
inline void Requantize<int8_t, uint8_t>(const int8_t* input_data, int32_t size,
                                        int32_t effective_scale_multiplier,
                                        int32_t effective_scale_shift,
                                        int32_t input_zeropoint,
                                        int32_t output_zeropoint,
                                        uint8_t* output_data) {}

template <>
inline void Requantize<uint8_t, int8_t>(const uint8_t* input_data, int32_t size,
                                        int32_t effective_scale_multiplier,
                                        int32_t effective_scale_shift,
                                        int32_t input_zeropoint,
                                        int32_t output_zeropoint,
                                        int8_t* output_data) {}

template <>
inline void Requantize<int8_t, int8_t>(const int8_t* input_data, int32_t size,
                                       int32_t effective_scale_multiplier,
                                       int32_t effective_scale_shift,
                                       int32_t input_zeropoint,
                                       int32_t output_zeropoint,
                                       int8_t* output_data) {}

template <>
inline void Requantize<uint8_t, uint8_t>(
    const uint8_t* input_data, int32_t size, int32_t effective_scale_multiplier,
    int32_t effective_scale_shift, int32_t input_zeropoint,
    int32_t output_zeropoint, uint8_t* output_data) {}

inline void HardSwish(const RuntimeShape& input_shape, const float* input_data,
                      const RuntimeShape& output_shape, float* output_data) {}

#ifdef USE_NEON
inline void SaturateAndStore(int16x8_t src, std::uint8_t* dst) {
  // Narrow values down to 8 bit unsigned, saturating.
  uint8x8_t res8 = vqmovun_s16(src);
  // Store results to destination.
  vst1_u8(dst, res8);
}

inline void SaturateAndStore(int16x8_t src, std::int8_t* dst) {
  // Narrow values down to 8 bit signed, saturating.
  int8x8_t res8 = vqmovn_s16(src);
  // Store results to destination.
  vst1_s8(dst, res8);
}
#endif

template <typename T>
inline void HardSwish(const HardSwishParams& params,
                      const RuntimeShape& input_shape, const T* input_data,
                      const RuntimeShape& output_shape, T* output_data) {}

template <typename T>
inline void IntegerExponentPow(const ArithmeticParams& params,
                               const RuntimeShape& unextended_base_shape,
                               const T* base_data, const int exponent,
                               const RuntimeShape& unextended_output_shape,
                               T* output_data) {}

template <typename T>
inline void BroadcastPow4D(const RuntimeShape& unextended_input1_shape,
                           const T* input1_data,
                           const RuntimeShape& unextended_input2_shape,
                           const T* input2_data,
                           const RuntimeShape& unextended_output_shape,
                           T* output_data) {}

#ifdef USE_NEON

// Per-lane: *output = input * scale_dup + zero_times_scale_dup, where the
// caller supplies the pre-multiplied zero-point term. Uses FMA when available.
inline void ScaleWithNewZeroPoint(const int32x4_t input,
                                  const float32x4_t scale_dup,
                                  const float32x4_t zero_times_scale_dup,
                                  float32x4_t* output) {
#ifdef __ARM_FEATURE_FMA
  *output = vfmaq_f32(zero_times_scale_dup, vcvtq_f32_s32(input), scale_dup);
#else
  *output = vaddq_f32(vmulq_f32(vcvtq_f32_s32(input), scale_dup),
                      zero_times_scale_dup);
#endif
}

#endif  // USE_NEON

inline void Dequantize(const tflite::DequantizationParams& op_params,
                       const RuntimeShape& input_shape,
                       const uint8_t* input_data,
                       const RuntimeShape& output_shape, float* output_data) {}

inline void Dequantize(const tflite::DequantizationParams& op_params,
                       const RuntimeShape& input_shape,
                       const int8_t* input_data,
                       const RuntimeShape& output_shape, float* output_data) {}

inline void Dequantize(const tflite::DequantizationParams& op_params,
                       const RuntimeShape& input_shape,
                       const int16_t* input_data,
                       const RuntimeShape& output_shape, float* output_data) {}

inline void Dequantize(const RuntimeShape& input_shape,
                       const Eigen::half* input_data,
                       const RuntimeShape& output_shape, float* output_data) {}

template <typename T>
inline void AffineQuantize(const tflite::QuantizationParams& op_params,
                           const RuntimeShape& input_shape,
                           const float* input_data,
                           const RuntimeShape& output_shape, T* output_data) {}

template <>
inline void AffineQuantize(const tflite::QuantizationParams& op_params,
                           const RuntimeShape& input_shape,
                           const float* input_data,
                           const RuntimeShape& output_shape,
                           int8_t* output_data) {}

template <>
inline void AffineQuantize(const tflite::QuantizationParams& op_params,
                           const RuntimeShape& input_shape,
                           const float* input_data,
                           const RuntimeShape& output_shape,
                           uint8_t* output_data) {}

template <>
inline void AffineQuantize(const tflite::QuantizationParams& op_params,
                           const RuntimeShape& input_shape,
                           const float* input_data,
                           const RuntimeShape& output_shape,
                           int16_t* output_data) {}
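
// A minimal scalar sketch (hypothetical helper, not one of the specializations
// above): affine quantization maps a real value x to
// zero_point + round(x / scale), clamped to the output type's range. The
// rounding mode here is std::round; the optimized kernels may differ in
// rounding details.
template <typename T>
inline void AffineQuantizeScalarSketch(
    const tflite::QuantizationParams& op_params,
    const RuntimeShape& input_shape, const float* input_data,
    const RuntimeShape& output_shape, T* output_data) {
  const int flat_size = MatchingFlatSize(input_shape, output_shape);
  const float inv_scale = 1.0f / static_cast<float>(op_params.scale);
  const int32_t zero_point = op_params.zero_point;
  const int32_t q_min = std::numeric_limits<T>::min();
  const int32_t q_max = std::numeric_limits<T>::max();
  for (int i = 0; i < flat_size; ++i) {
    const int32_t q = zero_point +
        static_cast<int32_t>(std::round(input_data[i] * inv_scale));
    output_data[i] = static_cast<T>(std::min(q_max, std::max(q_min, q)));
  }
}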

// TODO(b/139252020): Replace GEMMLOWP_NEON with USE_NEON when the bug is fixed.
// The versions of gemmlowp::tanh and gemmlowp::logistic converted by
// arm_sse_2_neon.h produce incorrect results with int16x8_t data types.
#ifdef GEMMLOWP_NEON

inline int16x8x4_t SaturatingRounding(
    int16x8_t input_val_0, int16x8_t input_val_1, int16x8_t input_val_2,
    int16x8_t input_val_3, int input_left_shift, int input_multiplier) {
  // This performs what is expressed in the scalar code as
  // const int16_t input_val_rescaled = SaturatingRoundingDoublingHighMul(
  //      static_cast<int16_t>(input_val_centered * (1 << input_left_shift)),
  //      static_cast<int16_t>(input_multiplier));
  const int16x8_t left_shift_dup = vdupq_n_s16(input_left_shift);
  const int16x8_t input_val_shifted_0 = vshlq_s16(input_val_0, left_shift_dup);
  const int16x8_t input_val_shifted_1 = vshlq_s16(input_val_1, left_shift_dup);
  const int16x8_t input_val_shifted_2 = vshlq_s16(input_val_2, left_shift_dup);
  const int16x8_t input_val_shifted_3 = vshlq_s16(input_val_3, left_shift_dup);
  int16x8x4_t result;
  result.val[0] = vqrdmulhq_n_s16(input_val_shifted_0, input_multiplier);
  result.val[1] = vqrdmulhq_n_s16(input_val_shifted_1, input_multiplier);
  result.val[2] = vqrdmulhq_n_s16(input_val_shifted_2, input_multiplier);
  result.val[3] = vqrdmulhq_n_s16(input_val_shifted_3, input_multiplier);
  return result;
}

// 4-bit fixed point is enough here since tanh(16) is essentially equal to
// one, agreeing to roughly 7 decimal places.
inline int16x8x4_t FixedPoint4Logistic(int16x8x4_t input_val) {
  // Invoke gemmlowp::logistic on FixedPoint wrapping int16x8_t
  using FixedPoint4 = gemmlowp::FixedPoint<int16x8_t, 4>;
  using FixedPoint0 = gemmlowp::FixedPoint<int16x8_t, 0>;
  const FixedPoint4 input_val_f4_0 = FixedPoint4::FromRaw(input_val.val[0]);
  const FixedPoint4 input_val_f4_1 = FixedPoint4::FromRaw(input_val.val[1]);
  const FixedPoint4 input_val_f4_2 = FixedPoint4::FromRaw(input_val.val[2]);
  const FixedPoint4 input_val_f4_3 = FixedPoint4::FromRaw(input_val.val[3]);

  // TODO(b/134622898) Implement a low accuracy version of logistic. In this
  // method, gemmlowp::logistic accounts for about 80% of the execution time.
  // The current implementation is roughly 12-bit accurate in the 16-bit fixed
  // point case. Until that error bound is reached, there is still room for
  // improvement.
  const FixedPoint0 output_val_f0_0 = gemmlowp::logistic(input_val_f4_0);
  const FixedPoint0 output_val_f0_1 = gemmlowp::logistic(input_val_f4_1);
  const FixedPoint0 output_val_f0_2 = gemmlowp::logistic(input_val_f4_2);
  const FixedPoint0 output_val_f0_3 = gemmlowp::logistic(input_val_f4_3);

  // Divide by 2^7 as in the scalar code
  int16x8x4_t result;
  result.val[0] = vrshrq_n_s16(output_val_f0_0.raw(), 7);
  result.val[1] = vrshrq_n_s16(output_val_f0_1.raw(), 7);
  result.val[2] = vrshrq_n_s16(output_val_f0_2.raw(), 7);
  result.val[3] = vrshrq_n_s16(output_val_f0_3.raw(), 7);
  return result;
}

// 4-bit fixed point is enough for tanh since tanh(16) is essentially equal
// to one, agreeing to at least 11 decimal places.
inline int16x8x4_t FixedPoint4Tanh(int16x8x4_t input_val) {
  // Invoke gemmlowp::tanh on FixedPoint wrapping int16x8_t
  using FixedPoint4 = gemmlowp::FixedPoint<int16x8_t, 4>;
  using FixedPoint0 = gemmlowp::FixedPoint<int16x8_t, 0>;
  const FixedPoint4 input_val_f4_0 = FixedPoint4::FromRaw(input_val.val[0]);
  const FixedPoint4 input_val_f4_1 = FixedPoint4::FromRaw(input_val.val[1]);
  const FixedPoint4 input_val_f4_2 = FixedPoint4::FromRaw(input_val.val[2]);
  const FixedPoint4 input_val_f4_3 = FixedPoint4::FromRaw(input_val.val[3]);

  // TODO(b/134622898) Implement a low accuracy version of tanh. In this
  // method, gemmlowp::tanh accounts for about 80% of the execution time. The
  // current implementation is roughly 12-bit accurate in the 16-bit fixed
  // point case. Until that error bound is reached, there is still room for
  // improvement.
  const FixedPoint0 output_val_f0_0 = gemmlowp::tanh(input_val_f4_0);
  const FixedPoint0 output_val_f0_1 = gemmlowp::tanh(input_val_f4_1);
  const FixedPoint0 output_val_f0_2 = gemmlowp::tanh(input_val_f4_2);
  const FixedPoint0 output_val_f0_3 = gemmlowp::tanh(input_val_f4_3);

  // Divide by 2^8 as in the scalar code
  int16x8x4_t result;
  result.val[0] = vrshrq_n_s16(output_val_f0_0.raw(), 8);
  result.val[1] = vrshrq_n_s16(output_val_f0_1.raw(), 8);
  result.val[2] = vrshrq_n_s16(output_val_f0_2.raw(), 8);
  result.val[3] = vrshrq_n_s16(output_val_f0_3.raw(), 8);
  return result;
}

inline uint8x16x2_t CalculateUnsignedClampingWithRangeBitMasks(
    int16x8x2_t input_val, int16x8_t range_radius_dup,
    int16x8_t neg_range_radius_dup) {
  const uint16x8_t mask_rightclamp_0 =
      vcgtq_s16(input_val.val[0], range_radius_dup);
  const uint16x8_t mask_rightclamp_1 =
      vcgtq_s16(input_val.val[1], range_radius_dup);

  const uint16x8_t mask_leftclamp_0 =
      vcgeq_s16(input_val.val[0], neg_range_radius_dup);
  const uint16x8_t mask_leftclamp_1 =
      vcgeq_s16(input_val.val[1], neg_range_radius_dup);

  uint8x16x2_t result;
  result.val[0] = vcombine_u8(vshrn_n_u16(mask_leftclamp_0, 8),
                              vshrn_n_u16(mask_leftclamp_1, 8));
  result.val[1] = vcombine_u8(vshrn_n_u16(mask_rightclamp_0, 8),
                              vshrn_n_u16(mask_rightclamp_1, 8));
  return result;
}

inline uint8x16x2_t CalculateSignedClampingWithRangeBitMasks(
    int16x8x2_t input_val, int16x8_t range_radius_dup,
    int16x8_t neg_range_radius_dup) {
  const uint16x8_t mask_rightclamp_0 =
      vcgtq_s16(input_val.val[0], range_radius_dup);
  const uint16x8_t mask_rightclamp_1 =
      vcgtq_s16(input_val.val[1], range_radius_dup);

  const uint16x8_t mask_leftclamp_0 =
      vcltq_s16(input_val.val[0], neg_range_radius_dup);
  const uint16x8_t mask_leftclamp_1 =
      vcltq_s16(input_val.val[1], neg_range_radius_dup);

  uint8x16x2_t result;
  result.val[0] = vcombine_u8(vshrn_n_u16(mask_leftclamp_0, 8),
                              vshrn_n_u16(mask_leftclamp_1, 8));
  result.val[1] = vcombine_u8(vshrn_n_u16(mask_rightclamp_0, 8),
                              vshrn_n_u16(mask_rightclamp_1, 8));
  return result;
}

inline void ClampWithRangeAndStore(uint8_t* output_dst, uint8x16_t input_val,
                                   uint8x16x2_t masks_clamp) {
  // Store back to memory
  vst1q_u8(output_dst, vandq_u8(vorrq_u8(input_val, masks_clamp.val[1]),
                                masks_clamp.val[0]));
}

inline void ClampWithRangeAndStore(int8_t* output_dst, int8x16_t input_val,
                                   uint8x16x2_t masks_clamp) {
  static const int8x16_t max_dup = vdupq_n_s8(127);
  static const int8x16_t min_dup = vdupq_n_s8(-128);
  // Store back to memory
  vst1q_s8(output_dst,
           vbslq_s8(masks_clamp.val[1], max_dup,
                    vbslq_s8(masks_clamp.val[0], min_dup, input_val)));
}

#endif  // GEMMLOWP_NEON

inline void Tanh16bitPrecision(const TanhParams& params,
                               const RuntimeShape& input_shape,
                               const uint8_t* input_data,
                               const RuntimeShape& output_shape,
                               uint8_t* output_data) {}

inline void Tanh16bitPrecision(const TanhParams& params,
                               const RuntimeShape& input_shape,
                               const int8_t* input_data,
                               const RuntimeShape& output_shape,
                               int8_t* output_data) {}

inline void Logistic16bitPrecision(const LogisticParams& params,
                                   const RuntimeShape& input_shape,
                                   const uint8_t* input_data,
                                   const RuntimeShape& output_shape,
                                   uint8_t* output_data) {}

inline void Logistic16bitPrecision(const LogisticParams& params,
                                   const RuntimeShape& input_shape,
                                   const int8_t* input_data,
                                   const RuntimeShape& output_shape,
                                   int8_t* output_data) {}

// Transpose2D only handles typical 2D matrix transpose ops.
// The transpose is performed on 4x4 blocks of the input, sweeping left to
// right within each row of blocks and then proceeding from top to bottom.
template <typename T>
inline void Transpose2D(const RuntimeShape& input_shape, const T* input_data,
                        const RuntimeShape& output_shape, T* output_data) {}

template <>
inline void Transpose2D(const RuntimeShape& input_shape,
                        const int32_t* input_data,
                        const RuntimeShape& output_shape,
                        int32_t* output_data) {}
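
// A minimal scalar sketch (hypothetical helper, not the block-transpose kernel
// above): illustrates the index swap a 2-D transpose performs, assuming
// input_shape is {rows, cols} and output_shape is {cols, rows}.
template <typename T>
inline void Transpose2DScalarSketch(const RuntimeShape& input_shape,
                                    const T* input_data,
                                    const RuntimeShape& output_shape,
                                    T* output_data) {
  (void)output_shape;  // Shape checking is omitted in this sketch.
  const int rows = input_shape.Dims(0);
  const int cols = input_shape.Dims(1);
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      // output(c, r) = input(r, c).
      output_data[c * rows + r] = input_data[r * cols + c];
    }
  }
}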

// TODO(b/173718660): see if we can reduce the number
// of lines of code in branching without affecting latency.
template <typename T>
inline void Transpose3D(const TransposeParams& params,
                        const RuntimeShape& input_shape, const T* input_data,
                        const RuntimeShape& output_shape, T* output_data) {}

template <typename T>
void TransposeImpl(const TransposeParams& params,
                   const RuntimeShape& input_shape, const T* input_data,
                   const RuntimeShape& output_shape, T* output_data) {}

template <typename T, int N = 6>
void Transpose(const TransposeParams& unshrinked_params,
               const RuntimeShape& unshrinked_input_shape, const T* input_data,
               const RuntimeShape& unshrinked_output_shape, T* output_data) {}

// Assume input1 & input2 have the same scale & zero point.
inline void MaximumElementwise(int size, const ArithmeticParams& params,
                               const int8_t* input1_data,
                               const int8_t* input2_data, int8_t* output_data) {}
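
// A minimal scalar sketch (hypothetical helper, not the vectorized kernel
// above): because both inputs share the same scale and zero point, quantized
// maximum reduces to an elementwise std::max on the raw int8 values; `params`
// is unused here.
inline void MaximumElementwiseScalarSketch(int size,
                                           const ArithmeticParams& params,
                                           const int8_t* input1_data,
                                           const int8_t* input2_data,
                                           int8_t* output_data) {
  (void)params;
  for (int i = 0; i < size; ++i) {
    output_data[i] = std::max(input1_data[i], input2_data[i]);
  }
}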

inline void MaximumScalarBroadcast(int size, const ArithmeticParams& params,
                                   int8_t input1_data,
                                   const int8_t* input2_data,
                                   int8_t* output_data) {}

// Assume input1 & input2 have the same scale & zero point.
inline void MinimumElementwise(int size, const ArithmeticParams& params,
                               const int8_t* input1_data,
                               const int8_t* input2_data, int8_t* output_data) {}

inline void MinimumScalarBroadcast(int size, const ArithmeticParams& params,
                                   int8_t input1_data,
                                   const int8_t* input2_data,
                                   int8_t* output_data) {}

template <typename Op>
inline void BroadcastMaximumDispatch(const ArithmeticParams& params,
                                     const RuntimeShape& input1_shape,
                                     const int8_t* input1_data,
                                     const RuntimeShape& input2_shape,
                                     const int8_t* input2_data,
                                     const RuntimeShape& output_shape,
                                     int8_t* output_data, Op op) {}

template <typename Op>
inline void BroadcastMinimumDispatch(const ArithmeticParams& params,
                                     const RuntimeShape& input1_shape,
                                     const int8_t* input1_data,
                                     const RuntimeShape& input2_shape,
                                     const int8_t* input2_data,
                                     const RuntimeShape& output_shape,
                                     int8_t* output_data, Op op) {}

template <typename T>
void CumsumImpl(const T* input_data, const RuntimeShape& shape, int axis,
                bool exclusive, bool reverse, T* output_data) {}

template <typename T>
void CumSum(const T* input_data, const RuntimeShape& shape, int axis,
            bool exclusive, bool reverse, T* output_data) {}
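
// A minimal sketch for the 1-D case only (hypothetical helper): running sums
// with the exclusive/reverse options that CumSum supports. With exclusive set,
// each output excludes its own input; with reverse set, the scan runs from the
// end of the array toward the start.
template <typename T>
inline void CumSum1DSketch(const T* input_data, int size, bool exclusive,
                           bool reverse, T* output_data) {
  T acc = T(0);
  for (int n = 0; n < size; ++n) {
    const int i = reverse ? (size - 1 - n) : n;
    if (exclusive) {
      output_data[i] = acc;
      acc += input_data[i];
    } else {
      acc += input_data[i];
      output_data[i] = acc;
    }
  }
}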

inline void PReluScalarBroadcast(int size, const ArithmeticParams& params,
                                 float alpha, const float* input_data,
                                 float* output_data) {}

inline void PReluElementWise(int flat_size, const ArithmeticParams& params,
                             const float* alpha_data, const float* input_data,
                             float* output_data) {}
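
// A minimal scalar sketch (hypothetical helper, not the vectorized kernels
// above): PRelu passes non-negative inputs through unchanged and scales
// negative inputs by alpha. The ArithmeticParams used by the kernels above are
// ignored here.
inline void PReluScalarSketch(int size, float alpha, const float* input_data,
                              float* output_data) {
  for (int i = 0; i < size; ++i) {
    const float x = input_data[i];
    output_data[i] = x >= 0.0f ? x : alpha * x;
  }
}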

inline void BroadcastPReluDispatch(
    const ArithmeticParams& params, const RuntimeShape& input_shape,
    const float* input_data, const RuntimeShape& alpha_shape,
    const float* alpha_data, const RuntimeShape& output_shape,
    float* output_data, float (*func)(float, float)) {}

// Returns the index with minimum value within `input_data`.
// If there is a tie, returns the smaller index.
template <typename T>
inline int ArgMinVector(const T* input_data, int size) {}
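
// A minimal scalar sketch (hypothetical helper, not the vectorized
// specializations below): a linear scan that keeps the first occurrence of the
// minimum, matching the "smaller index wins ties" contract stated above.
// Assumes size >= 1.
template <typename T>
inline int ArgMinVectorScalarSketch(const T* input_data, int size) {
  int min_index = 0;
  T min_value = input_data[0];
  for (int i = 1; i < size; ++i) {
    if (input_data[i] < min_value) {
      min_value = input_data[i];
      min_index = i;
    }
  }
  return min_index;
}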

// Returns the index with maximum value within `input_data`.
// If there is a tie, returns the smaller index.
template <typename T>
inline int ArgMaxVector(const T* input_data, int size) {}

template <>
inline int ArgMinVector(const float* input_data, int size) {}

template <>
inline int ArgMaxVector(const float* input_data, int size) {}

template <>
inline int ArgMaxVector(const int8_t* input_data, int size) {}

template <>
inline int ArgMaxVector(const uint8_t* input_data, int size) {}

// Specialization of ArgMinMax for axis == dims - 1, where the reduction is
// applied over contiguous memory.
template <typename T1, typename T2, bool is_arg_max>
inline void ArgMinMaxLastAxis(const RuntimeShape& input_shape,
                              const T1* input_data,
                              const RuntimeShape& output_shape,
                              T2* output_data) {}
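
// A minimal sketch (hypothetical helper, not the specialization above): with
// axis == dims - 1 the reduction runs over contiguous rows of length `inner`,
// so each output element is an arg-min/arg-max over one row.
template <typename T1, typename T2, bool is_arg_max>
inline void ArgMinMaxLastAxisSketch(const RuntimeShape& input_shape,
                                    const T1* input_data,
                                    const RuntimeShape& output_shape,
                                    T2* output_data) {
  (void)output_shape;  // Shape checking is omitted in this sketch.
  const int inner = input_shape.Dims(input_shape.DimensionsCount() - 1);
  const int outer = input_shape.FlatSize() / inner;
  for (int o = 0; o < outer; ++o) {
    const T1* row = input_data + o * inner;
    int best = 0;
    for (int i = 1; i < inner; ++i) {
      const bool better =
          is_arg_max ? (row[i] > row[best]) : (row[i] < row[best]);
      if (better) best = i;
    }
    output_data[o] = static_cast<T2>(best);
  }
}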

template <typename T1, typename T2, typename T3>
inline void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
                      const T3* input2_data, const RuntimeShape& output_shape,
                      T2* output_data, const bool is_arg_max) {}

template <typename T1, typename T2, typename T3>
void ArgMax(const RuntimeShape& input1_shape, const T1* input1_data,
            const T3* input2_data, const RuntimeShape& output_shape,
            T2* output_data) {}

// Convenience version that allows, for example, generated-code calls to use
// the same signature as other binary ops.
// For backward compatibility, reference_ops also provides an ArgMax function.
template <typename T1, typename T2, typename T3>
inline void ArgMax(const RuntimeShape& input1_shape, const T1* input1_data,
                   const RuntimeShape& input2_shape, const T3* input2_data,
                   const RuntimeShape& output_shape, T2* output_data) {}

inline TfLiteStatus Conv3D(
    const Conv3DParams& params, const RuntimeShape& input_shape,
    const float* input_data, const RuntimeShape& filter_shape,
    const float* filter_data, const RuntimeShape& bias_shape,
    const float* bias_data, const RuntimeShape& output_shape,
    float* output_data, const RuntimeShape& im2col_shape, float* im2col_data,
    CpuBackendContext* cpu_backend_context) {}

// Returns, in 'im_data' (assumed to be zero-initialized), the image in storage
// order (planes, height, width, channel), reconstructed from the patches in
// 'col_data', which must be in storage order (out_planes * out_height *
// out_width, filter_planes, filter_height, filter_width, in_channel).
//
// This function is copied from tensorflow/core/kernels/conv_grad_ops_3d.cc,
// authored by Eugene Zhulenev (ezhulenev).
template <typename T>
void Col2im(const T* col_data, const int channel, const int planes,
            const int height, const int width, const int filter_p,
            const int filter_h, const int filter_w, const int pad_pt,
            const int pad_t, const int pad_l, const int pad_pb, const int pad_b,
            const int pad_r, const int stride_p, const int stride_h,
            const int stride_w, T* im_data) {}

template <typename T>
void BiasAdd3D(T* im_data, const T* bias_data, const RuntimeShape& input_shape,
               float float_activation_min, float float_activation_max) {}

inline void Conv3DTranspose(
    const Conv3DTransposeParams& params, const RuntimeShape& input_shape,
    const float* input_data, const RuntimeShape& filter_shape,
    const float* filter_data, const RuntimeShape& bias_shape,
    const float* bias_data, const RuntimeShape& output_shape,
    float* const output_data, const RuntimeShape& col2im_shape,
    float* col2im_data, CpuBackendContext* cpu_backend_context) {}

// Worker that sums the inputs over a single interval, identified by the index
// range [start, end).
template <typename T>
struct AddNWorkerTask : cpu_backend_threadpool::Task {};

// T is expected to be either float or int.
template <typename T>
inline void AddN(const RuntimeShape& input_shape, const size_t num_inputs,
                 const T* const* input_data, T* output_data, T* scratch_buffer,
                 CpuBackendContext* cpu_backend_context) {}
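
// A minimal scalar sketch (hypothetical helper, not the threaded kernel
// above): AddN sums the corresponding elements of `num_inputs` tensors that
// all share `input_shape`.
template <typename T>
inline void AddNScalarSketch(const RuntimeShape& input_shape,
                             const size_t num_inputs,
                             const T* const* input_data, T* output_data) {
  const int flat_size = input_shape.FlatSize();
  for (int i = 0; i < flat_size; ++i) {
    T sum = T(0);
    for (size_t j = 0; j < num_inputs; ++j) {
      sum += input_data[j][i];
    }
    output_data[i] = sum;
  }
}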

}  // namespace optimized_ops
}  // namespace tflite

#if defined OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS
#undef OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS
#pragma GCC diagnostic pop
#endif

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_OPTIMIZED_OPS_H_