#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_OPTIMIZED_OPS_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_OPTIMIZED_OPS_H_
#include <assert.h>
#include <stdint.h>
#include <sys/types.h>
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <memory>
#include <tuple>
#include <type_traits>
#include <utility>
#include "tensorflow/lite/core/macros.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/reference/add.h"
#include "tensorflow/lite/kernels/internal/reference/mul.h"
#include "tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h"
#if defined(TF_LITE_USE_CBLAS) && defined(__APPLE__)
#include <Accelerate/Accelerate.h>
#endif
#include "Eigen/Core"
#include "unsupported/Eigen/CXX11/Tensor"
#include "fixedpoint/fixedpoint.h"
#include "ruy/profiler/instrumentation.h"
#include "tensorflow/lite/core/c/common.h"
#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/cpu_backend_gemm.h"
#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h"
#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/optimized/im2col_utils.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops_utils.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
#include "tensorflow/lite/kernels/internal/tensor_utils.h"
#include "tensorflow/lite/kernels/internal/transpose_utils.h"
#include "tensorflow/lite/kernels/internal/types.h"
#if __aarch64__ && __clang__
#define TFLITE_SOFTMAX_USE_UINT16_LUT
#endif
namespace tflite {
namespace optimized_ops {
// Unoptimized reference ops re-exported into the optimized_ops namespace:
using reference_ops::Broadcast4DSlowGreater;
using reference_ops::Broadcast4DSlowGreaterEqual;
using reference_ops::Broadcast4DSlowGreaterEqualWithScaling;
using reference_ops::Broadcast4DSlowGreaterWithScaling;
using reference_ops::Broadcast4DSlowLess;
using reference_ops::Broadcast4DSlowLessEqual;
using reference_ops::Broadcast4DSlowLessEqualWithScaling;
using reference_ops::Broadcast4DSlowLessWithScaling;
using reference_ops::BroadcastAdd6DSlow;
using reference_ops::BroadcastMul6DSlow;
using reference_ops::BroadcastSub16POTSlow;
using reference_ops::BroadcastSubSlow;
using reference_ops::Concatenation;
using reference_ops::ConcatenationWithScaling;
using reference_ops::DepthConcatenation;
using reference_ops::Div;
using reference_ops::Elu;
using reference_ops::FakeQuant;
using reference_ops::Fill;
using reference_ops::Gather;
using reference_ops::Greater;
using reference_ops::GreaterEqual;
using reference_ops::GreaterEqualWithScaling;
using reference_ops::GreaterWithScaling;
using reference_ops::LeakyRelu;
using reference_ops::Less;
using reference_ops::LessEqual;
using reference_ops::LessEqualWithScaling;
using reference_ops::LessWithScaling;
using reference_ops::ProcessBroadcastShapes;
using reference_ops::RankOneSelect;
using reference_ops::Relu0To1;
using reference_ops::Relu1;
using reference_ops::Relu6;
using reference_ops::ReluX;
using reference_ops::Round;
using reference_ops::Select;
using reference_ops::SpaceToBatchND;
using reference_ops::Split;
using reference_ops::Sub16;
static constexpr int kReverseShift = …;
template <typename T, int NDIMS = 1, typename IndexType = Eigen::DenseIndex>
struct TTypes { … };
template <typename Scalar>
MatrixMap<Scalar> MapAsMatrixWithGivenNumberOfRows(Scalar* data,
const RuntimeShape& shape,
int rows) { … }
static inline void swap_data(ArithmeticParams& arithmetic_params) { … }
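// Binary broadcast for the common case where ProcessBroadcastShapes has
// collapsed the two inputs into at most five alternating broadcast /
// non-broadcast dimensions (ArithmeticParams::broadcast_shape). The inner
// stretches are dispatched either to elementwise_f (both operands advance)
// or to scalar_broadcast_f (one operand held fixed as a scalar).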
template <typename ElementwiseF, typename ScalarBroadcastF, typename T>
TFLITE_NOINLINE void BinaryBroadcastFiveFold(
const ArithmeticParams& unswitched_params,
const RuntimeShape& unswitched_input1_shape,
const T* unswitched_input1_data,
const RuntimeShape& unswitched_input2_shape,
const T* unswitched_input2_data, const RuntimeShape& output_shape,
T* output_data, ElementwiseF elementwise_f,
ScalarBroadcastF scalar_broadcast_f) { … }
#ifdef TFLITE_SOFTMAX_USE_UINT16_LUT
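// Performs a 16-lane lookup into a 256-entry byte table stored as four
// uint8x16x4_t blocks of 64 entries each. vqtbl4q_u8 yields zero for
// out-of-range indices, so XOR-ing the indices with 0x40, 0x80 and 0xc0
// selects the matching 64-entry block and the partial results can simply be
// OR-ed together.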
inline uint8x16_t aarch64_lookup_vector(const uint8x16x4_t table[4],
uint8x16_t indices) {
uint8x16_t output1 = vqtbl4q_u8(table[0], indices);
uint8x16_t output2 =
vqtbl4q_u8(table[1], veorq_u8(indices, vdupq_n_u8(0x40)));
uint8x16_t output3 =
vqtbl4q_u8(table[2], veorq_u8(indices, vdupq_n_u8(0x80)));
uint8x16_t output4 =
vqtbl4q_u8(table[3], veorq_u8(indices, vdupq_n_u8(0xc0)));
return vorrq_u8(vorrq_u8(output1, output2), vorrq_u8(output3, output4));
}
#endif
inline void AddBiasAndEvalActivationFunction(float output_activation_min,
float output_activation_max,
const RuntimeShape& bias_shape,
const float* bias_data,
const RuntimeShape& array_shape,
float* array_data) { … }
inline void FullyConnected(
const FullyConnectedParams& params, const RuntimeShape& input_shape,
const float* input_data, const RuntimeShape& weights_shape,
const float* weights_data, const RuntimeShape& bias_shape,
const float* optional_bias_data, const RuntimeShape& output_shape,
float* output_data, CpuBackendContext* cpu_backend_context) { … }
inline void FullyConnected(
const FullyConnectedParams& params, const RuntimeShape& input_shape,
const uint8_t* input_data, const RuntimeShape& filter_shape,
const uint8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
uint8_t* output_data, CpuBackendContext* cpu_backend_context) { … }
inline void FullyConnected(
const FullyConnectedParams& params, const RuntimeShape& input_shape,
const uint8_t* input_data, const RuntimeShape& filter_shape,
const uint8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data_int32, const RuntimeShape& output_shape,
int16_t* output_data, CpuBackendContext* cpu_backend_context) { … }
inline void ShuffledFullyConnectedWorkerImpl(
const uint8_t* shuffled_input_workspace_data,
const int8_t* shuffled_weights_data, int batches, int output_depth,
int output_stride, int accum_depth, const int32_t* bias_data,
int32_t output_multiplier, int output_shift, int16_t* output_data) { … }
struct ShuffledFullyConnectedWorkerTask : cpu_backend_threadpool::Task { … };
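// Fully-connected layer operating on pre-shuffled uint8 weights; the input is
// first repacked into shuffled_input_workspace_data so the inner kernel can
// run on contiguous blocks (see the reference_ops implementation for the
// exact shuffling scheme). Output is produced as int16.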
inline void ShuffledFullyConnected(
const FullyConnectedParams& params, const RuntimeShape& input_shape,
const uint8_t* input_data, const RuntimeShape& weights_shape,
const uint8_t* shuffled_weights_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
int16_t* output_data, uint8_t* shuffled_input_workspace_data,
CpuBackendContext* cpu_backend_context) { … }
#ifdef USE_NEON
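// Rounds each float lane to the nearest integer. Where vcvtnq_s32_f32 is
// available (AArch64, or SSE via the NEON translation headers) that single
// instruction is used; the ARMv7 fallback adds +/-0.5 depending on sign and
// truncates, which differs from round-to-nearest-even only on exact ties.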
inline int32x4_t RoundToNearest(const float32x4_t input) {
#if defined(__aarch64__) || defined(__SSSE3__)
return vcvtnq_s32_f32(input);
#else
static const float32x4_t zero_val_dup = vdupq_n_f32(0.0f);
static const float32x4_t point5_val_dup = vdupq_n_f32(0.5f);
static const float32x4_t minus_point5_val_dup = vdupq_n_f32(-0.5f);
const uint32x4_t mask = vcltq_f32(input, zero_val_dup);
const float32x4_t round =
vbslq_f32(mask, minus_point5_val_dup, point5_val_dup);
return vcvtq_s32_f32(vaddq_f32(input, round));
#endif
}
inline uint32x4_t RoundToNearestUnsigned(const float32x4_t input) {
#if defined(__aarch64__)
return vcvtnq_u32_f32(input);
#else
static const float32x4_t point5_val_dup = vdupq_n_f32(0.5f);
return vcvtq_u32_f32(vaddq_f32(input, point5_val_dup));
#endif
}
#endif
inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
const float* input_data, const RuntimeShape& filter_shape,
const float* filter_data, const RuntimeShape& bias_shape,
const float* bias_data, const RuntimeShape& output_shape,
float* output_data, const RuntimeShape& im2col_shape,
float* im2col_data, CpuBackendContext* cpu_backend_context) { … }
inline void HybridConv(const ConvParams& params, float* scaling_factors_ptr,
const RuntimeShape& input_shape,
const int8_t* input_data,
const RuntimeShape& filter_shape,
const int8_t* filter_data,
const RuntimeShape& bias_shape, const float* bias_data,
const RuntimeShape& accum_scratch_shape,
int32_t* accum_scratch, const RuntimeShape& output_shape,
float* output_data, const RuntimeShape& im2col_shape,
int8_t* im2col_data, CpuBackendContext* context) { … }
inline void HybridConvPerChannel(
const ConvParams& params, float* scaling_factors_ptr,
const RuntimeShape& input_shape, const int8_t* input_data,
const RuntimeShape& filter_shape, const int8_t* filter_data,
const RuntimeShape& bias_shape, const float* bias_data,
const RuntimeShape& output_shape, float* output_data,
const RuntimeShape& im2col_shape, int8_t* im2col_data,
const float* per_channel_scale, int32_t* input_offset,
const RuntimeShape& scratch_shape, int32_t* scratch, int32_t* row_sums,
bool* compute_row_sums, CpuBackendContext* cpu_backend_context) { … }
inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
const uint8_t* input_data, const RuntimeShape& filter_shape,
const uint8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
uint8_t* output_data, const RuntimeShape& im2col_shape,
uint8_t* im2col_data, CpuBackendContext* cpu_backend_context) { … }
template <typename T>
inline void DepthToSpace(const tflite::DepthToSpaceParams& op_params,
const RuntimeShape& unextended_input_shape,
const T* input_data,
const RuntimeShape& unextended_output_shape,
T* output_data) { … }
template <typename T>
inline void SpaceToDepth(const tflite::SpaceToDepthParams& op_params,
const RuntimeShape& unextended_input_shape,
const T* input_data,
const RuntimeShape& unextended_output_shape,
T* output_data) { … }
inline void Relu(const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) { … }
inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
const RuntimeShape& input_shape,
const float* input_data,
const RuntimeShape& output_shape,
float* output_data, float epsilon = 1e-6) { … }
inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
const RuntimeShape& input_shape,
const uint8_t* input_data,
const RuntimeShape& output_shape,
uint8_t* output_data) { … }
inline void AddElementwise(int size, const ArithmeticParams& params,
const float* input1_data, const float* input2_data,
float* output_data) { … }
inline void Add(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const float* input1_data,
const RuntimeShape& input2_shape, const float* input2_data,
const RuntimeShape& output_shape, float* output_data) { … }
inline void AddElementwise(int size, const ArithmeticParams& params,
const uint8_t* input1_data,
const uint8_t* input2_data, uint8_t* output_data) { … }
inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
uint8_t input1_data, const uint8_t* input2_data,
uint8_t* output_data) { … }
inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
float broadcast_value, const float* input2_data,
float* output_data) { … }
inline void Add(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const uint8_t* input1_data,
const RuntimeShape& input2_shape, const uint8_t* input2_data,
const RuntimeShape& output_shape, uint8_t* output_data) { … }
inline void Add(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const int16_t* input1_data,
const RuntimeShape& input2_shape, const int16_t* input2_data,
const RuntimeShape& output_shape, int16_t* output_data) { … }
template <typename T>
inline typename std::enable_if<is_int32_or_int64<T>::value, void>::type Add(
const ArithmeticParams& params, const RuntimeShape& input1_shape,
const T* input1_data, const RuntimeShape& input2_shape,
const T* input2_data, const RuntimeShape& output_shape, T* output_data) { … }
template <typename T>
inline void BroadcastAddDispatch(
const ArithmeticParams& params, const RuntimeShape& input1_shape,
const T* input1_data, const RuntimeShape& input2_shape,
const T* input2_data, const RuntimeShape& output_shape, T* output_data) { … }
inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
const RuntimeShape& unswitched_input1_shape,
const uint8_t* unswitched_input1_data,
const RuntimeShape& unswitched_input2_shape,
const uint8_t* unswitched_input2_data,
const RuntimeShape& output_shape,
uint8_t* output_data) { … }
inline void BroadcastAddFivefold(const ArithmeticParams& params,
const RuntimeShape& unswitched_input1_shape,
const float* unswitched_input1_data,
const RuntimeShape& unswitched_input2_shape,
const float* unswitched_input2_data,
const RuntimeShape& output_shape,
float* output_data) { … }
inline void MulElementwise(int size, const ArithmeticParams& params,
const float* input1_data, const float* input2_data,
float* output_data) { … }
inline void MulElementwise(int32_t n, const ArithmeticParams& params,
const int32_t* __restrict lhs,
const int32_t* __restrict rhs,
int32_t* __restrict out) { … }
inline void Mul(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const float* input1_data,
const RuntimeShape& input2_shape, const float* input2_data,
const RuntimeShape& output_shape, float* output_data) { … }
inline void Mul(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const int32_t* input1_data,
const RuntimeShape& input2_shape, const int32_t* input2_data,
const RuntimeShape& output_shape, int32_t* output_data) { … }
inline void MulNoActivation(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int32_t* input1_data,
const RuntimeShape& input2_shape,
const int32_t* input2_data,
const RuntimeShape& output_shape,
int32_t* output_data) { … }
inline void Mul(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const int16_t* input1_data,
const RuntimeShape& input2_shape, const int16_t* input2_data,
const RuntimeShape& output_shape, int16_t* output_data) { … }
inline void Mul(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const int16_t* input1_data,
const RuntimeShape& input2_shape, const int16_t* input2_data,
const RuntimeShape& output_shape, uint8_t* output_data) { … }
inline void MulElementwise(int size, const ArithmeticParams& params,
const uint8_t* input1_data,
const uint8_t* input2_data, uint8_t* output_data) { … }
inline void MulSimpleBroadcast(int size, const ArithmeticParams& params,
const uint8_t broadcast_value,
const uint8_t* input2_data,
uint8_t* output_data) { … }
inline void MulSimpleBroadcast(int size, const ArithmeticParams& params,
const float broadcast_value,
const float* input2_data, float* output_data) { … }
inline void Mul(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const uint8_t* input1_data,
const RuntimeShape& input2_shape, const uint8_t* input2_data,
const RuntimeShape& output_shape, uint8_t* output_data) { … }
template <typename T>
inline void BroadcastMulDispatch(
const ArithmeticParams& params, const RuntimeShape& input1_shape,
const T* input1_data, const RuntimeShape& input2_shape,
const T* input2_data, const RuntimeShape& output_shape, T* output_data) { … }
inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params,
const RuntimeShape& unswitched_input1_shape,
const uint8_t* unswitched_input1_data,
const RuntimeShape& unswitched_input2_shape,
const uint8_t* unswitched_input2_data,
const RuntimeShape& output_shape,
uint8_t* output_data) { … }
inline void BroadcastMulFivefold(const ArithmeticParams& params,
const RuntimeShape& unswitched_input1_shape,
const float* unswitched_input1_data,
const RuntimeShape& unswitched_input2_shape,
const float* unswitched_input2_data,
const RuntimeShape& output_shape,
float* output_data) { … }
template <typename T, int N = 5>
void BroadcastDivSlow(const ArithmeticParams& params,
const RuntimeShape& unextended_input1_shape,
const T* input1_data,
const RuntimeShape& unextended_input2_shape,
const T* input2_data,
const RuntimeShape& unextended_output_shape,
T* output_data) { … }
template <int N = 5>
inline void BroadcastDivSlow(const ArithmeticParams& params,
const RuntimeShape& unextended_input1_shape,
const uint8_t* input1_data,
const RuntimeShape& unextended_input2_shape,
const uint8_t* input2_data,
const RuntimeShape& unextended_output_shape,
uint8_t* output_data) { … }
template <typename T>
inline void SubWithActivation(
const ArithmeticParams& params, const RuntimeShape& input1_shape,
const T* input1_data, const RuntimeShape& input2_shape,
const T* input2_data, const RuntimeShape& output_shape, T* output_data) { … }
inline void SubNonBroadcast(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const float* input1_data,
const RuntimeShape& input2_shape,
const float* input2_data,
const RuntimeShape& output_shape,
float* output_data) { … }
template <typename T>
void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
const T* input1_data, const RuntimeShape& input2_shape,
const T* input2_data, const RuntimeShape& output_shape,
T* output_data) { … }
inline void LstmCell(
const LstmCellParams& params, const RuntimeShape& unextended_input_shape,
const float* input_data, const RuntimeShape& unextended_prev_activ_shape,
const float* prev_activ_data, const RuntimeShape& weights_shape,
const float* weights_data, const RuntimeShape& unextended_bias_shape,
const float* bias_data, const RuntimeShape& unextended_prev_state_shape,
const float* prev_state_data,
const RuntimeShape& unextended_output_state_shape, float* output_state_data,
const RuntimeShape& unextended_output_activ_shape, float* output_activ_data,
const RuntimeShape& unextended_concat_temp_shape, float* concat_temp_data,
const RuntimeShape& unextended_activ_temp_shape, float* activ_temp_data,
CpuBackendContext* cpu_backend_context) { … }
template <int StateIntegerBits>
inline void LstmCell(
const LstmCellParams& params, const RuntimeShape& unextended_input_shape,
const uint8_t* input_data_uint8,
const RuntimeShape& unextended_prev_activ_shape,
const uint8_t* prev_activ_data_uint8, const RuntimeShape& weights_shape,
const uint8_t* weights_data_uint8,
const RuntimeShape& unextended_bias_shape, const int32_t* bias_data_int32,
const RuntimeShape& unextended_prev_state_shape,
const int16_t* prev_state_data_int16,
const RuntimeShape& unextended_output_state_shape,
int16_t* output_state_data_int16,
const RuntimeShape& unextended_output_activ_shape,
uint8_t* output_activ_data_uint8,
const RuntimeShape& unextended_concat_temp_shape,
uint8_t* concat_temp_data_uint8,
const RuntimeShape& unextended_activ_temp_shape,
int16_t* activ_temp_data_int16, CpuBackendContext* cpu_backend_context) { … }
inline int NodeOffset(int b, int h, int w, int height, int width) { … }
inline bool AveragePool(const PoolParams& params,
const RuntimeShape& input_shape,
const float* input_data,
const RuntimeShape& output_shape, float* output_data) { … }
inline bool AveragePool(const PoolParams& params,
const RuntimeShape& input_shape,
const uint8_t* input_data,
const RuntimeShape& output_shape,
uint8_t* output_data) { … }
inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
const float* input_data, const RuntimeShape& output_shape,
float* output_data) { … }
inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
const uint8_t* input_data, const RuntimeShape& output_shape,
uint8_t* output_data) { … }
inline void L2Pool(const PoolParams& params, const RuntimeShape& input_shape,
const float* input_data, const RuntimeShape& output_shape,
float* output_data) { … }
inline void LocalResponseNormalization(
const tflite::LocalResponseNormalizationParams& op_params,
const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) { … }
inline void SoftmaxImpl(const SoftmaxParams& params,
const RuntimeShape& input_shape,
const float* input_data,
const RuntimeShape& output_shape, float* output_data,
int start_batch, int end_batch) { … }
struct SoftmaxWorkerTask : cpu_backend_threadpool::Task { … };
inline void Softmax(const SoftmaxParams& params,
const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data,
CpuBackendContext* cpu_backend_context = nullptr) { … }
template <typename T>
inline int32_t QuantizeSoftmaxOutput(float prob_rescaled, int32_t zero_point) { … }
#if !__aarch64__
template <>
inline int32_t QuantizeSoftmaxOutput<uint8_t>(float prob_rescaled,
int32_t zero_point) { … }
#endif
inline void PopulateSoftmaxLookupTable(SoftmaxParams* data, float input_scale,
float beta) { … }
template <typename In, typename Out>
inline void Softmax(const SoftmaxParams& params,
const RuntimeShape& input_shape, const In* input_data,
const RuntimeShape& output_shape, Out* output_data) { … }
inline void PopulateSoftmaxUInt8LookupTable(SoftmaxParams* data,
float input_scale, float beta) { … }
inline int FindMaxValue(int size, const uint8_t* input_data, uint8_t offset) { … }
#ifdef USE_NEON
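// Saturating narrowing stores used by the LUT-based Softmax below: each
// int32x4x4_t carries 16 accumulators that are narrowed to 8 bits (signed or
// unsigned overload) and written out as one 16-byte vector.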
inline void StoreValue(int32x4x4_t value_to_store, int8_t* output) {
const int16x8_t result_1 = vcombine_s16(vqmovn_s32(value_to_store.val[1]),
vqmovn_s32(value_to_store.val[0]));
const int16x8_t result_2 = vcombine_s16(vqmovn_s32(value_to_store.val[3]),
vqmovn_s32(value_to_store.val[2]));
const int8x16_t result =
vcombine_s8(vqmovn_s16(result_2), vqmovn_s16(result_1));
vst1q_s8(output, result);
}
inline void StoreValue(int32x4x4_t value_to_store, uint8_t* output) {
const uint16x8_t result_1 =
vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[1])),
vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[0])));
const uint16x8_t result_2 =
vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[3])),
vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[2])));
const uint8x16_t result =
vcombine_u8(vqmovn_u16(result_2), vqmovn_u16(result_1));
vst1q_u8(output, result);
}
#endif
template <typename In, typename Out>
inline void SoftmaxInt8LUT(const SoftmaxParams& params,
const RuntimeShape& input_shape,
const In* input_data,
const RuntimeShape& output_shape, Out* output_data) { … }
inline void LogSoftmax(const SoftmaxParams& params,
const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) { … }
inline void LogSoftmax(const SoftmaxParams& params,
const RuntimeShape& input_shape,
const uint8_t* input_data,
const RuntimeShape& output_shape, uint8_t* output_data) { … }
template <typename T>
inline void LogSoftmax(const SoftmaxParams& params, float input_scale,
const RuntimeShape& input_shape, const T* input_data,
const RuntimeShape& output_shape, T* output_data) { … }
inline void Logistic(const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) { … }
inline void Logistic(const LogisticParams&, const RuntimeShape& input_shape,
const float* input_data, const RuntimeShape& output_shape,
float* output_data) { … }
inline void Logistic(const LogisticParams& params,
const RuntimeShape& input_shape, const int16_t* input_data,
const RuntimeShape& output_shape, int16_t* output_data) { … }
inline void Tanh(const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) { … }
inline void Tanh(const TanhParams&, const RuntimeShape& input_shape,
const float* input_data, const RuntimeShape& output_shape,
float* output_data) { … }
inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
const int16_t* input_data, const RuntimeShape& output_shape,
int16_t* output_data) { … }
template <typename SrcT, typename DstT>
inline void Cast(const RuntimeShape& input_shape, const SrcT* input_data,
const RuntimeShape& output_shape, DstT* output_data) { … }
inline void Floor(const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) { … }
inline void Ceil(const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) { … }
inline void GetIndexRange(int spatial_index_dim, int block_shape_dim,
int input_dim, int output_dim, int* start_index,
int* end_index) { … }
template <typename T>
inline void BatchToSpaceND(const RuntimeShape& unextended_input1_shape,
const T* input1_data,
const RuntimeShape& unextended_input2_shape,
const int32_t* block_shape_data,
const RuntimeShape& unextended_input3_shape,
const int32_t* crops_data,
const RuntimeShape& unextended_output_shape,
T* output_data) { … }
template <typename T>
TFLITE_NOINLINE void TypedMemset(void* ptr, T value, size_t num) { … }
template <typename T, typename P>
inline void PadImpl(const tflite::PadParams& op_params,
const RuntimeShape& input_shape, const T* input_data,
const P* pad_value_ptr, const RuntimeShape& output_shape,
T* output_data) { … }
template <typename T, typename P>
inline void Pad(const tflite::PadParams& op_params,
const RuntimeShape& input_shape, const T* input_data,
const P* pad_value_ptr, const RuntimeShape& output_shape,
T* output_data) { … }
template <typename T>
inline void Pad(const tflite::PadParams& op_params,
const RuntimeShape& input_shape, const T* input_data,
const int32_t* pad_value_ptr, const RuntimeShape& output_shape,
T* output_data) { … }
template <>
inline void Pad(const tflite::PadParams& op_params,
const RuntimeShape& input_shape, const int32_t* input_data,
const int32_t* pad_value_ptr, const RuntimeShape& output_shape,
int32_t* output_data) { … }
template <typename T, typename P>
inline void PadImageStyleMemset(const tflite::PadParams& op_params,
const RuntimeShape& input_shape,
const T* input_data, const P* pad_value_ptr,
const RuntimeShape& output_shape,
T* output_data) { … }
template <typename T, typename P>
inline void PadImageStyle(const tflite::PadParams& op_params,
const RuntimeShape& input_shape, const T* input_data,
const P* pad_value_ptr,
const RuntimeShape& output_shape, T* output_data) { … }
template <typename P>
inline void PadImageStyle(const tflite::PadParams& op_params,
const RuntimeShape& input_shape,
const uint8_t* input_data, const P* pad_value_ptr,
const RuntimeShape& output_shape,
uint8_t* output_data) { … }
template <typename P>
inline void PadImageStyle(const tflite::PadParams& op_params,
const RuntimeShape& input_shape,
const float* input_data, const P* pad_value_ptr,
const RuntimeShape& output_shape,
float* output_data) { … }
template <typename T>
inline void Slice(const tflite::SliceParams& op_params,
const RuntimeShape& input_shape,
const RuntimeShape& output_shape,
SequentialTensorWriter<T>* writer) { … }
template <typename T>
inline void Slice(const tflite::SliceParams& op_params,
const RuntimeShape& input_shape, const T* input_data,
const RuntimeShape& output_shape, T* output_data) { … }
template <typename T>
inline void Slice(const tflite::SliceParams& op_params,
const RuntimeShape& input_shape, const TfLiteTensor* input,
const RuntimeShape& output_shape, TfLiteTensor* output) { … }
template <typename T>
void Minimum(const RuntimeShape& input1_shape, const T* input1_data,
const T* input2_data, const RuntimeShape& output_shape,
T* output_data) { … }
template <typename T>
inline void Minimum(const RuntimeShape& input1_shape, const T* input1_data,
const RuntimeShape&, const T* input2_data,
const RuntimeShape& output_shape, T* output_data) { … }
template <typename T>
void Maximum(const RuntimeShape& input1_shape, const T* input1_data,
const T* input2_data, const RuntimeShape& output_shape,
T* output_data) { … }
template <typename T>
inline void Maximum(const RuntimeShape& input1_shape, const T* input1_data,
const RuntimeShape&, const T* input2_data,
const RuntimeShape& output_shape, T* output_data) { … }
template <typename T>
void TransposeIm2col(const ConvParams& params, uint8_t zero_byte,
const RuntimeShape& input_shape, const T* input_data,
const RuntimeShape& filter_shape,
const RuntimeShape& output_shape, T* im2col_data) { … }
template <typename T>
void Col2im(const T* col_data, const int depth, const int height,
const int width, const int filter_h, const int filter_w,
const int pad_t, const int pad_l, const int pad_b, const int pad_r,
const int stride_h, const int stride_w, T* im_data) { … }
template <typename T>
void BiasAdd(T* im_data, const T* bias_data, const int batch_size,
const int height, const int width, const int depth) { … }
inline void TransposeConvV2(
const ConvParams& params, const RuntimeShape& input_shape,
const float* input_data, const RuntimeShape& hwoi_ordered_filter_shape,
const float* hwoi_ordered_filter_data, const RuntimeShape& bias_shape,
const float* bias_data, const RuntimeShape& output_shape,
float* const output_data, const RuntimeShape& col2im_shape,
float* col2im_data, CpuBackendContext* cpu_backend_context) { … }
inline void Quantize(int32_t multiplier, int32_t shift, int32_t total_size,
int32_t output_zp, const int32_t output_min,
const int32_t output_max, int32_t* scratch,
uint8_t* output) { … }
#if TFLITE_SINGLE_ROUNDING
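// Per-channel requantization of 32-bit accumulators: every output channel has
// its own multiplier/shift pair. The NEON path splits the shift into a left
// shift and a rounding right shift so that vqdmulhq_s32 (saturating doubling
// multiply-high) followed by vrshlq_s32 implements
// MultiplyByQuantizedMultiplier for eight channels at a time.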
inline void Quantize(const int32_t* multiplier, const int32_t* shift,
int32_t channel_size, int32_t total_size,
int32_t output_zp, int32_t output_min, int32_t output_max,
int32_t* scratch, int8_t* output) {
ruy::profiler::ScopeLabel label("Quantize/int8_t");
#ifdef USE_NEON
const int32x4_t output_offset_vec = vdupq_n_s32(output_zp);
const int32x4_t output_activation_min_vec = vdupq_n_s32(output_min);
const int32x4_t output_activation_max_vec = vdupq_n_s32(output_max);
const int32x4_t minus_ones = vdupq_n_s32(-1);
#endif
TFLITE_DCHECK_EQ(total_size % channel_size, 0);
const int32_t rows = total_size / channel_size;
int c = 0;
#ifdef USE_NEON
for (; c <= channel_size - 8; c += 8) {
int32x4_t out_shift_1 = vld1q_s32(shift + c);
int32x4_t out_shift_2 = vld1q_s32(shift + c + 4);
int32x4_t right_shift_1 = vminq_s32(out_shift_1, minus_ones);
int32x4_t right_shift_2 = vminq_s32(out_shift_2, minus_ones);
int32x4_t left_shift_1 = vsubq_s32(out_shift_1, right_shift_1);
int32x4_t left_shift_2 = vsubq_s32(out_shift_2, right_shift_2);
int32x4_t out_mul_1 = vld1q_s32(multiplier + c);
int32x4_t out_mul_2 = vld1q_s32(multiplier + c + 4);
for (int n = 0; n < rows; ++n) {
int loc = n * channel_size + c;
int32x4_t acc_1 = vld1q_s32(scratch + loc);
int32x4_t acc_2 = vld1q_s32(scratch + loc + 4);
acc_1 = vshlq_s32(acc_1, left_shift_1);
acc_1 = vqdmulhq_s32(acc_1, out_mul_1);
acc_2 = vshlq_s32(acc_2, left_shift_2);
acc_2 = vqdmulhq_s32(acc_2, out_mul_2);
acc_1 = vrshlq_s32(acc_1, right_shift_1);
acc_2 = vrshlq_s32(acc_2, right_shift_2);
acc_1 = vaddq_s32(acc_1, output_offset_vec);
acc_2 = vaddq_s32(acc_2, output_offset_vec);
acc_1 = vmaxq_s32(acc_1, output_activation_min_vec);
acc_1 = vminq_s32(acc_1, output_activation_max_vec);
acc_2 = vmaxq_s32(acc_2, output_activation_min_vec);
acc_2 = vminq_s32(acc_2, output_activation_max_vec);
const int16x4_t acc_s16_1 = vqmovn_s32(acc_1);
const int16x4_t acc_s16_2 = vqmovn_s32(acc_2);
const int16x8_t res_s16 = vcombine_s16(acc_s16_1, acc_s16_2);
const int8x8_t res_s8 = vqmovn_s16(res_s16);
vst1_s8(output + loc, res_s8);
}
}
#endif
for (; c < channel_size; c++) {
for (int n = 0; n < rows; ++n) {
int loc = n * channel_size + c;
int32_t acc = scratch[loc];
acc = MultiplyByQuantizedMultiplier(acc, multiplier[c], shift[c]);
acc += output_zp;
acc = std::max(acc, output_min);
acc = std::min(acc, output_max);
output[loc] = static_cast<int8_t>(acc);
}
}
}
inline void Quantize(const int32_t* multiplier, const int32_t* shift,
int32_t channel_size, int32_t total_size,
int32_t output_zp, int32_t output_min, int32_t output_max,
int32_t* scratch, int16_t* output) {
ruy::profiler::ScopeLabel label("Quantize(Single-rounding)/int16_t");
#ifdef USE_NEON
const int32x4_t output_offset_vec = vdupq_n_s32(output_zp);
const int32x4_t output_activation_min_vec = vdupq_n_s32(output_min);
const int32x4_t output_activation_max_vec = vdupq_n_s32(output_max);
const int32x4_t minus_ones = vdupq_n_s32(-1);
#endif
TFLITE_DCHECK_EQ(total_size % channel_size, 0);
const int32_t rows = total_size / channel_size;
int c = 0;
#ifdef USE_NEON
for (; c <= channel_size - 8; c += 8) {
int32x4_t out_shift_1 = vld1q_s32(shift + c);
int32x4_t out_shift_2 = vld1q_s32(shift + c + 4);
int32x4_t right_shift_1 = vminq_s32(out_shift_1, minus_ones);
int32x4_t right_shift_2 = vminq_s32(out_shift_2, minus_ones);
int32x4_t left_shift_1 = vsubq_s32(out_shift_1, right_shift_1);
int32x4_t left_shift_2 = vsubq_s32(out_shift_2, right_shift_2);
int32x4_t out_mul_1 = vld1q_s32(multiplier + c);
int32x4_t out_mul_2 = vld1q_s32(multiplier + c + 4);
for (int n = 0; n < rows; ++n) {
int loc = n * channel_size + c;
int32x4_t acc_1 = vld1q_s32(scratch + loc);
int32x4_t acc_2 = vld1q_s32(scratch + loc + 4);
acc_1 = vshlq_s32(acc_1, left_shift_1);
acc_1 = vqdmulhq_s32(acc_1, out_mul_1);
acc_2 = vshlq_s32(acc_2, left_shift_2);
acc_2 = vqdmulhq_s32(acc_2, out_mul_2);
acc_1 = vrshlq_s32(acc_1, right_shift_1);
acc_2 = vrshlq_s32(acc_2, right_shift_2);
acc_1 = vaddq_s32(acc_1, output_offset_vec);
acc_2 = vaddq_s32(acc_2, output_offset_vec);
acc_1 = vmaxq_s32(acc_1, output_activation_min_vec);
acc_1 = vminq_s32(acc_1, output_activation_max_vec);
acc_2 = vmaxq_s32(acc_2, output_activation_min_vec);
acc_2 = vminq_s32(acc_2, output_activation_max_vec);
const int16x4_t acc_s16_1 = vqmovn_s32(acc_1);
const int16x4_t acc_s16_2 = vqmovn_s32(acc_2);
vst1_s16(reinterpret_cast<int16_t*>(output) + loc, acc_s16_1);
vst1_s16(reinterpret_cast<int16_t*>(output) + loc + 4, acc_s16_2);
}
}
#endif
for (; c < channel_size; c++) {
for (int n = 0; n < rows; ++n) {
int loc = n * channel_size + c;
int32_t acc = scratch[loc];
acc = MultiplyByQuantizedMultiplier(acc, multiplier[c], shift[c]);
acc += output_zp;
acc = std::max(acc, output_min);
acc = std::min(acc, output_max);
output[loc] = static_cast<int16_t>(acc);
}
}
}
#else
inline void Quantize(const int32_t* multiplier, const int32_t* shift,
int32_t channel_size, int32_t total_size,
int32_t output_zp, int32_t output_min, int32_t output_max,
int32_t* scratch, int8_t* output) { … }
inline void Quantize(const int32_t* multiplier, const int32_t* shift,
int32_t channel_size, int32_t total_size,
int32_t output_zp, int32_t output_min, int32_t output_max,
int32_t* scratch, int16_t* output) { … }
#endif
inline void TransposeConvV2(
const ConvParams& params, const RuntimeShape& input_shape,
const uint8_t* input_data, const RuntimeShape& hwoi_ordered_filter_shape,
const uint8_t* hwoi_ordered_filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
uint8_t* output_data, const RuntimeShape& col2im_shape,
int32_t* col2im_data, int32_t* scratch_data,
CpuBackendContext* cpu_backend_context) { … }
inline void ResizeNearestNeighbor(
const tflite::ResizeNearestNeighborParams& op_params,
const RuntimeShape& unextended_input_shape, const uint8_t* input_data,
const RuntimeShape& output_size_shape, const int32_t* output_size_data,
const RuntimeShape& unextended_output_shape, uint8_t* output_data) { … }
template <typename input_type, typename output_type>
inline void Requantize(const input_type* input_data, int32_t size,
int32_t effective_scale_multiplier,
int32_t effective_scale_shift, int32_t input_zeropoint,
int32_t output_zeropoint, output_type* output_data) { … }
template <>
inline void Requantize<int8_t, uint8_t>(const int8_t* input_data, int32_t size,
int32_t effective_scale_multiplier,
int32_t effective_scale_shift,
int32_t input_zeropoint,
int32_t output_zeropoint,
uint8_t* output_data) { … }
template <>
inline void Requantize<uint8_t, int8_t>(const uint8_t* input_data, int32_t size,
int32_t effective_scale_multiplier,
int32_t effective_scale_shift,
int32_t input_zeropoint,
int32_t output_zeropoint,
int8_t* output_data) { … }
template <>
inline void Requantize<int8_t, int8_t>(const int8_t* input_data, int32_t size,
int32_t effective_scale_multiplier,
int32_t effective_scale_shift,
int32_t input_zeropoint,
int32_t output_zeropoint,
int8_t* output_data) { … }
template <>
inline void Requantize<uint8_t, uint8_t>(
const uint8_t* input_data, int32_t size, int32_t effective_scale_multiplier,
int32_t effective_scale_shift, int32_t input_zeropoint,
int32_t output_zeropoint, uint8_t* output_data) { … }
inline void HardSwish(const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) { … }
#ifdef USE_NEON
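// Narrows eight int16 lanes to 8 bits with saturation and stores them;
// overloads cover uint8 and int8 destinations.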
inline void SaturateAndStore(int16x8_t src, std::uint8_t* dst) {
uint8x8_t res8 = vqmovun_s16(src);
vst1_u8(dst, res8);
}
inline void SaturateAndStore(int16x8_t src, std::int8_t* dst) {
int8x8_t res8 = vqmovn_s16(src);
vst1_s8(dst, res8);
}
#endif
template <typename T>
inline void HardSwish(const HardSwishParams& params,
const RuntimeShape& input_shape, const T* input_data,
const RuntimeShape& output_shape, T* output_data) { … }
template <typename T>
inline void IntegerExponentPow(const ArithmeticParams& params,
const RuntimeShape& unextended_base_shape,
const T* base_data, const int exponent,
const RuntimeShape& unextended_output_shape,
T* output_data) { … }
template <typename T>
inline void BroadcastPow4D(const RuntimeShape& unextended_input1_shape,
const T* input1_data,
const RuntimeShape& unextended_input2_shape,
const T* input2_data,
const RuntimeShape& unextended_output_shape,
T* output_data) { … }
#ifdef USE_NEON
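// Dequantization helper: converts four int32 lanes to float and computes
// value * scale + zero_times_scale, where zero_times_scale is the caller's
// precomputed zero-point term (typically -zero_point * scale), using an FMA
// where the target provides one.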
inline void ScaleWithNewZeroPoint(const int32x4_t input,
const float32x4_t scale_dup,
const float32x4_t zero_times_scale_dup,
float32x4_t* output) {
#ifdef __ARM_FEATURE_FMA
*output = vfmaq_f32(zero_times_scale_dup, vcvtq_f32_s32(input), scale_dup);
#else
*output = vaddq_f32(vmulq_f32(vcvtq_f32_s32(input), scale_dup),
zero_times_scale_dup);
#endif
}
#endif
inline void Dequantize(const tflite::DequantizationParams& op_params,
const RuntimeShape& input_shape,
const uint8_t* input_data,
const RuntimeShape& output_shape, float* output_data) { … }
inline void Dequantize(const tflite::DequantizationParams& op_params,
const RuntimeShape& input_shape,
const int8_t* input_data,
const RuntimeShape& output_shape, float* output_data) { … }
inline void Dequantize(const tflite::DequantizationParams& op_params,
const RuntimeShape& input_shape,
const int16_t* input_data,
const RuntimeShape& output_shape, float* output_data) { … }
inline void Dequantize(const RuntimeShape& input_shape,
const Eigen::half* input_data,
const RuntimeShape& output_shape, float* output_data) { … }
template <typename T>
inline void AffineQuantize(const tflite::QuantizationParams& op_params,
const RuntimeShape& input_shape,
const float* input_data,
const RuntimeShape& output_shape, T* output_data) { … }
template <>
inline void AffineQuantize(const tflite::QuantizationParams& op_params,
const RuntimeShape& input_shape,
const float* input_data,
const RuntimeShape& output_shape,
int8_t* output_data) { … }
template <>
inline void AffineQuantize(const tflite::QuantizationParams& op_params,
const RuntimeShape& input_shape,
const float* input_data,
const RuntimeShape& output_shape,
uint8_t* output_data) { … }
template <>
inline void AffineQuantize(const tflite::QuantizationParams& op_params,
const RuntimeShape& input_shape,
const float* input_data,
const RuntimeShape& output_shape,
int16_t* output_data) { … }
#ifdef GEMMLOWP_NEON
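// Helpers for the 16-bit fixed-point Tanh/Logistic paths below. The input is
// rescaled by a saturating left shift followed by a saturating rounding
// doubling multiply (vqrdmulhq_n_s16), roughly the 16-bit analogue of
// MultiplyByQuantizedMultiplier with a non-negative left shift.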
inline int16x8x4_t SaturatingRounding(
int16x8_t input_val_0, int16x8_t input_val_1, int16x8_t input_val_2,
int16x8_t input_val_3, int input_left_shift, int input_multiplier) {
const int16x8_t left_shift_dup = vdupq_n_s16(input_left_shift);
const int16x8_t input_val_shifted_0 = vshlq_s16(input_val_0, left_shift_dup);
const int16x8_t input_val_shifted_1 = vshlq_s16(input_val_1, left_shift_dup);
const int16x8_t input_val_shifted_2 = vshlq_s16(input_val_2, left_shift_dup);
const int16x8_t input_val_shifted_3 = vshlq_s16(input_val_3, left_shift_dup);
int16x8x4_t result;
result.val[0] = vqrdmulhq_n_s16(input_val_shifted_0, input_multiplier);
result.val[1] = vqrdmulhq_n_s16(input_val_shifted_1, input_multiplier);
result.val[2] = vqrdmulhq_n_s16(input_val_shifted_2, input_multiplier);
result.val[3] = vqrdmulhq_n_s16(input_val_shifted_3, input_multiplier);
return result;
}
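// Evaluates gemmlowp::logistic on four int16x8 registers interpreted as fixed
// point with 4 integer bits, then rounds the full-precision result down to
// the 8-bit output scale; FixedPoint4Tanh below does the same with
// gemmlowp::tanh.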
inline int16x8x4_t FixedPoint4Logistic(int16x8x4_t input_val) {
using FixedPoint4 = gemmlowp::FixedPoint<int16x8_t, 4>;
using FixedPoint0 = gemmlowp::FixedPoint<int16x8_t, 0>;
const FixedPoint4 input_val_f4_0 = FixedPoint4::FromRaw(input_val.val[0]);
const FixedPoint4 input_val_f4_1 = FixedPoint4::FromRaw(input_val.val[1]);
const FixedPoint4 input_val_f4_2 = FixedPoint4::FromRaw(input_val.val[2]);
const FixedPoint4 input_val_f4_3 = FixedPoint4::FromRaw(input_val.val[3]);
const FixedPoint0 output_val_f0_0 = gemmlowp::logistic(input_val_f4_0);
const FixedPoint0 output_val_f0_1 = gemmlowp::logistic(input_val_f4_1);
const FixedPoint0 output_val_f0_2 = gemmlowp::logistic(input_val_f4_2);
const FixedPoint0 output_val_f0_3 = gemmlowp::logistic(input_val_f4_3);
int16x8x4_t result;
result.val[0] = vrshrq_n_s16(output_val_f0_0.raw(), 7);
result.val[1] = vrshrq_n_s16(output_val_f0_1.raw(), 7);
result.val[2] = vrshrq_n_s16(output_val_f0_2.raw(), 7);
result.val[3] = vrshrq_n_s16(output_val_f0_3.raw(), 7);
return result;
}
inline int16x8x4_t FixedPoint4Tanh(int16x8x4_t input_val) {
using FixedPoint4 = gemmlowp::FixedPoint<int16x8_t, 4>;
using FixedPoint0 = gemmlowp::FixedPoint<int16x8_t, 0>;
const FixedPoint4 input_val_f4_0 = FixedPoint4::FromRaw(input_val.val[0]);
const FixedPoint4 input_val_f4_1 = FixedPoint4::FromRaw(input_val.val[1]);
const FixedPoint4 input_val_f4_2 = FixedPoint4::FromRaw(input_val.val[2]);
const FixedPoint4 input_val_f4_3 = FixedPoint4::FromRaw(input_val.val[3]);
const FixedPoint0 output_val_f0_0 = gemmlowp::tanh(input_val_f4_0);
const FixedPoint0 output_val_f0_1 = gemmlowp::tanh(input_val_f4_1);
const FixedPoint0 output_val_f0_2 = gemmlowp::tanh(input_val_f4_2);
const FixedPoint0 output_val_f0_3 = gemmlowp::tanh(input_val_f4_3);
int16x8x4_t result;
result.val[0] = vrshrq_n_s16(output_val_f0_0.raw(), 8);
result.val[1] = vrshrq_n_s16(output_val_f0_1.raw(), 8);
result.val[2] = vrshrq_n_s16(output_val_f0_2.raw(), 8);
result.val[3] = vrshrq_n_s16(output_val_f0_3.raw(), 8);
return result;
}
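// Builds two 16-byte masks from 16 int16 inputs: val[1] marks lanes above
// +input_range_radius (to be clamped to the maximum output) and val[0] marks
// lanes that are not below -input_range_radius. The signed variant below
// flips the left-clamp comparison so its mask directly marks lanes below
// -input_range_radius.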
inline uint8x16x2_t CalculateUnsignedClampingWithRangeBitMasks(
int16x8x2_t input_val, int16x8_t range_radius_dup,
int16x8_t neg_range_radius_dup) {
const uint16x8_t mask_rightclamp_0 =
vcgtq_s16(input_val.val[0], range_radius_dup);
const uint16x8_t mask_rightclamp_1 =
vcgtq_s16(input_val.val[1], range_radius_dup);
const uint16x8_t mask_leftclamp_0 =
vcgeq_s16(input_val.val[0], neg_range_radius_dup);
const uint16x8_t mask_leftclamp_1 =
vcgeq_s16(input_val.val[1], neg_range_radius_dup);
uint8x16x2_t result;
result.val[0] = vcombine_u8(vshrn_n_u16(mask_leftclamp_0, 8),
vshrn_n_u16(mask_leftclamp_1, 8));
result.val[1] = vcombine_u8(vshrn_n_u16(mask_rightclamp_0, 8),
vshrn_n_u16(mask_rightclamp_1, 8));
return result;
}
inline uint8x16x2_t CalculateSignedClampingWithRangeBitMasks(
int16x8x2_t input_val, int16x8_t range_radius_dup,
int16x8_t neg_range_radius_dup) {
const uint16x8_t mask_rightclamp_0 =
vcgtq_s16(input_val.val[0], range_radius_dup);
const uint16x8_t mask_rightclamp_1 =
vcgtq_s16(input_val.val[1], range_radius_dup);
const uint16x8_t mask_leftclamp_0 =
vcltq_s16(input_val.val[0], neg_range_radius_dup);
const uint16x8_t mask_leftclamp_1 =
vcltq_s16(input_val.val[1], neg_range_radius_dup);
uint8x16x2_t result;
result.val[0] = vcombine_u8(vshrn_n_u16(mask_leftclamp_0, 8),
vshrn_n_u16(mask_leftclamp_1, 8));
result.val[1] = vcombine_u8(vshrn_n_u16(mask_rightclamp_0, 8),
vshrn_n_u16(mask_rightclamp_1, 8));
return result;
}
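// Applies the clamp masks computed above: the uint8 overload ORs with the
// right-clamp mask to force saturated lanes to 255 and ANDs with the
// left-clamp mask to force out-of-range lanes to 0, while the int8 overload
// selects 127 / -128 explicitly with vbslq_s8.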
inline void ClampWithRangeAndStore(uint8_t* output_dst, uint8x16_t input_val,
uint8x16x2_t masks_clamp) {
vst1q_u8(output_dst, vandq_u8(vorrq_u8(input_val, masks_clamp.val[1]),
masks_clamp.val[0]));
}
inline void ClampWithRangeAndStore(int8_t* output_dst, int8x16_t input_val,
uint8x16x2_t masks_clamp) {
static const int8x16_t max_dup = vdupq_n_s8(127);
static const int8x16_t min_dup = vdupq_n_s8(-128);
vst1q_s8(output_dst,
vbslq_s8(masks_clamp.val[1], max_dup,
vbslq_s8(masks_clamp.val[0], min_dup, input_val)));
}
#endif
inline void Tanh16bitPrecision(const TanhParams& params,
const RuntimeShape& input_shape,
const uint8_t* input_data,
const RuntimeShape& output_shape,
uint8_t* output_data) { … }
inline void Tanh16bitPrecision(const TanhParams& params,
const RuntimeShape& input_shape,
const int8_t* input_data,
const RuntimeShape& output_shape,
int8_t* output_data) { … }
inline void Logistic16bitPrecision(const LogisticParams& params,
const RuntimeShape& input_shape,
const uint8_t* input_data,
const RuntimeShape& output_shape,
uint8_t* output_data) { … }
inline void Logistic16bitPrecision(const LogisticParams& params,
const RuntimeShape& input_shape,
const int8_t* input_data,
const RuntimeShape& output_shape,
int8_t* output_data) { … }
template <typename T>
inline void Transpose2D(const RuntimeShape& input_shape, const T* input_data,
const RuntimeShape& output_shape, T* output_data) { … }
template <>
inline void Transpose2D(const RuntimeShape& input_shape,
const int32_t* input_data,
const RuntimeShape& output_shape,
int32_t* output_data) { … }
template <typename T>
inline void Transpose3D(const TransposeParams& params,
const RuntimeShape& input_shape, const T* input_data,
const RuntimeShape& output_shape, T* output_data) { … }
template <typename T>
void TransposeImpl(const TransposeParams& params,
const RuntimeShape& input_shape, const T* input_data,
const RuntimeShape& output_shape, T* output_data) { … }
template <typename T, int N = 6>
void Transpose(const TransposeParams& unshrinked_params,
const RuntimeShape& unshrinked_input_shape, const T* input_data,
const RuntimeShape& unshrinked_output_shape, T* output_data) { … }
inline void MaximumElementwise(int size, const ArithmeticParams& params,
const int8_t* input1_data,
const int8_t* input2_data, int8_t* output_data) { … }
inline void MaximumScalarBroadcast(int size, const ArithmeticParams& params,
int8_t input1_data,
const int8_t* input2_data,
int8_t* output_data) { … }
inline void MinimumElementwise(int size, const ArithmeticParams& params,
const int8_t* input1_data,
const int8_t* input2_data, int8_t* output_data) { … }
inline void MinimumScalarBroadcast(int size, const ArithmeticParams& params,
int8_t input1_data,
const int8_t* input2_data,
int8_t* output_data) { … }
template <typename Op>
inline void BroadcastMaximumDispatch(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int8_t* input1_data,
const RuntimeShape& input2_shape,
const int8_t* input2_data,
const RuntimeShape& output_shape,
int8_t* output_data, Op op) { … }
template <typename Op>
inline void BroadcastMinimumDispatch(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int8_t* input1_data,
const RuntimeShape& input2_shape,
const int8_t* input2_data,
const RuntimeShape& output_shape,
int8_t* output_data, Op op) { … }
template <typename T>
void CumsumImpl(const T* input_data, const RuntimeShape& shape, int axis,
bool exclusive, bool reverse, T* output_data) { … }
template <typename T>
void CumSum(const T* input_data, const RuntimeShape& shape, int axis,
bool exclusive, bool reverse, T* output_data) { … }
inline void PReluScalarBroadcast(int size, const ArithmeticParams& params,
float alpha, const float* input_data,
float* output_data) { … }
inline void PReluElementWise(int flat_size, const ArithmeticParams& params,
const float* alpha_data, const float* input_data,
float* output_data) { … }
inline void BroadcastPReluDispatch(
const ArithmeticParams& params, const RuntimeShape& input_shape,
const float* input_data, const RuntimeShape& alpha_shape,
const float* alpha_data, const RuntimeShape& output_shape,
float* output_data, float (*func)(float, float)) { … }
template <typename T>
inline int ArgMinVector(const T* input_data, int size) { … }
template <typename T>
inline int ArgMaxVector(const T* input_data, int size) { … }
template <>
inline int ArgMinVector(const float* input_data, int size) { … }
template <>
inline int ArgMaxVector(const float* input_data, int size) { … }
template <>
inline int ArgMaxVector(const int8_t* input_data, int size) { … }
template <>
inline int ArgMaxVector(const uint8_t* input_data, int size) { … }
template <typename T1, typename T2, bool is_arg_max>
inline void ArgMinMaxLastAxis(const RuntimeShape& input_shape,
const T1* input_data,
const RuntimeShape& output_shape,
T2* output_data) { … }
template <typename T1, typename T2, typename T3>
inline void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
const T3* input2_data, const RuntimeShape& output_shape,
T2* output_data, const bool is_arg_max) { … }
template <typename T1, typename T2, typename T3>
void ArgMax(const RuntimeShape& input1_shape, const T1* input1_data,
const T3* input2_data, const RuntimeShape& output_shape,
T2* output_data) { … }
template <typename T1, typename T2, typename T3>
inline void ArgMax(const RuntimeShape& input1_shape, const T1* input1_data,
const RuntimeShape& input2_shape, const T3* input2_data,
const RuntimeShape& output_shape, T2* output_data) { … }
inline TfLiteStatus Conv3D(
const Conv3DParams& params, const RuntimeShape& input_shape,
const float* input_data, const RuntimeShape& filter_shape,
const float* filter_data, const RuntimeShape& bias_shape,
const float* bias_data, const RuntimeShape& output_shape,
float* output_data, const RuntimeShape& im2col_shape, float* im2col_data,
CpuBackendContext* cpu_backend_context) { … }
template <typename T>
void Col2im(const T* col_data, const int channel, const int planes,
const int height, const int width, const int filter_p,
const int filter_h, const int filter_w, const int pad_pt,
const int pad_t, const int pad_l, const int pad_pb, const int pad_b,
const int pad_r, const int stride_p, const int stride_h,
const int stride_w, T* im_data) { … }
template <typename T>
void BiasAdd3D(T* im_data, const T* bias_data, const RuntimeShape& input_shape,
float float_activation_min, float float_activation_max) { … }
inline void Conv3DTranspose(
const Conv3DTransposeParams& params, const RuntimeShape& input_shape,
const float* input_data, const RuntimeShape& filter_shape,
const float* filter_data, const RuntimeShape& bias_shape,
const float* bias_data, const RuntimeShape& output_shape,
float* const output_data, const RuntimeShape& col2im_shape,
float* col2im_data, CpuBackendContext* cpu_backend_context) { … }
template <typename T>
struct AddNWorkerTask : cpu_backend_threadpool::Task { … };
template <typename T>
inline void AddN(const RuntimeShape& input_shape, const size_t num_inputs,
const T* const* input_data, T* output_data, T* scratch_buffer,
CpuBackendContext* cpu_backend_context) { … }
}  // namespace optimized_ops
}  // namespace tflite
#if defined OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS
#undef OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS
#pragma GCC diagnostic pop
#endif
#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_OPTIMIZED_OPS_H_