// depthwise_conv_hybrid.h

/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_HYBRID_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_HYBRID_H_

#include <algorithm>
#include <memory>

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h"
#include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h"
#include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_hybrid_3x3_filter.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace optimized_integer_ops {
namespace depthwise_conv {

// Initializes the accumulator buffer with zeros.
//
//   num_output_pixels, output_depth: presumably the number of int32
//       accumulators cleared is num_output_pixels * output_depth -- the body
//       is not visible in this view, TODO confirm.
//   acc_buffer: int32 accumulator storage that this function writes to.
inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
                                       int32* acc_buffer) { … }

// Base DWConv Implementation used with both static and dynamic
// accumulator buffers.
// Initializes the accumulator buffer with bias values.
// NOTE(review): the line above may be stale -- DepthwiseConvInitAccBuffer is
// documented as zero-filling the accumulator, and bias_data here is float
// while acc_buffer is int32. Confirm against the body before relying on it.
//
// Data types taken from the signature: input and filter data are int8, while
// bias and output data are float.
//
//   params: depthwise convolution parameters (DepthwiseParams).
//   input_scales, per_channel_scales: float scale arrays; presumably one
//       scale per batch and one per output channel respectively -- TODO
//       confirm.
//   input_shape / filter_shape / bias_shape / output_shape: shapes of the
//       corresponding data buffers.
//   input_offsets: int32 offsets associated with the input; presumably the
//       per-batch zero points -- TODO confirm.
//   thread_start, thread_end, thread_dim: presumably the range
//       [thread_start, thread_end) along dimension thread_dim handled by this
//       call -- TODO confirm.
//   acc_buffer, acc_buffer_size: caller-provided int32 accumulator scratch
//       space and its size (presumably in elements -- TODO confirm).
static void DoDepthwiseConvHybridGeneral(
    const DepthwiseParams& params, const float* input_scales,
    const RuntimeShape& input_shape, const int8* input_data,
    const RuntimeShape& filter_shape, const int8* filter_data,
    const RuntimeShape& bias_shape, const float* bias_data,
    const RuntimeShape& output_shape, float* output_data,
    const float* per_channel_scales, const int32_t* input_offsets,
    int thread_start, int thread_end, int thread_dim, int32* acc_buffer,
    int32 acc_buffer_size) { … }

// Utilize the base implementation of DWConv with a stack allocated accumulator
// buffer. The static allocation limits the number of depthwise channels that
// can be processed to kStaticAccBufferMaxSize.
//
// Takes the same arguments as DoDepthwiseConvHybridGeneral except for the
// accumulator buffer and its size, which are not part of this signature.
// NOTE(review): kStaticAccBufferMaxSize is not declared in the visible part of
// this file; presumably it comes from one of the included headers -- confirm.
static void DoDepthwiseConvHybridGeneralStatic(
    const DepthwiseParams& params, const float* input_scales,
    const RuntimeShape& input_shape, const int8* input_data,
    const RuntimeShape& filter_shape, const int8* filter_data,
    const RuntimeShape& bias_shape, const float* bias_data,
    const RuntimeShape& output_shape, float* output_data,
    const float* per_channel_scales, const int32_t* input_offsets,
    int thread_start, int thread_end, int thread_dim) { … }

// This DWConv function uses static memory for accumulation by default for up
// to kStaticAccBufferMaxSize channels. Beyond that, a dynamic buffer is used
// on a per call basis. The function errors out if number of channels is larger
// than kStaticAccBufferMaxSize and TF_LITE_STATIC_MEMORY is defined.
//
// The argument list matches DoDepthwiseConvHybridGeneralStatic.
// NOTE(review): presumably this dispatches to
// DoDepthwiseConvHybridGeneralStatic or DoDepthwiseConvHybridGeneral depending
// on the channel count -- the body is not visible in this view, confirm.
inline void DepthwiseConvHybridGeneral(
    const DepthwiseParams& params, const float* input_scales,
    const RuntimeShape& input_shape, const int8* input_data,
    const RuntimeShape& filter_shape, const int8* filter_data,
    const RuntimeShape& bias_shape, const float* bias_data,
    const RuntimeShape& output_shape, float* output_data,
    const float* per_channel_scales, const int32_t* input_offsets,
    int thread_start, int thread_end, int thread_dim) { … }

}  // namespace depthwise_conv

// Hybrid depthwise convolution parameterized at compile time on the output
// rounding mode (kOutputRounding). Takes the same arguments as
// depthwise_conv::DepthwiseConvHybridGeneral: int8 input and filter data,
// float bias and output data.
// NOTE(review): the body is not visible in this view; presumably it chooses
// between a specialized 3x3-filter kernel and the general implementation --
// confirm before relying on this.
template <DepthwiseConvOutputRounding kOutputRounding>
inline void DepthwiseConvHybridWithRounding(
    const DepthwiseParams& params, const float* input_scales,
    const RuntimeShape& input_shape, const int8* input_data,
    const RuntimeShape& filter_shape, const int8* filter_data,
    const RuntimeShape& bias_shape, const float* bias_data,
    const RuntimeShape& output_shape, float* output_data,
    const float* per_channel_scales, const int32_t* input_offsets,
    int thread_start, int thread_end, int thread_dim) { … }

// Non-template counterpart of DepthwiseConvHybridWithRounding with an
// identical argument list.
// NOTE(review): presumably forwards to DepthwiseConvHybridWithRounding with a
// fixed rounding mode, and thread_start/thread_end/thread_dim presumably
// select the slice of work handled by this call -- the body is not visible in
// this view, confirm.
inline void DepthwiseConvHybridImpl(
    const DepthwiseParams& params, const float* input_scales,
    const RuntimeShape& input_shape, const int8* input_data,
    const RuntimeShape& filter_shape, const int8* filter_data,
    const RuntimeShape& bias_shape, const float* bias_data,
    const RuntimeShape& output_shape, float* output_data,
    const float* per_channel_scales, const int32_t* input_offsets,
    int thread_start, int thread_end, int thread_dim) { … }

// Task type for the CPU backend thread pool (derives from
// cpu_backend_threadpool::Task), templated on two types T and TS.
// NOTE(review): presumably T is the quantized input/filter element type, TS
// the bias/output element type, and each task runs DepthwiseConvHybridImpl on
// one slice of the output -- the struct body is not visible in this view,
// confirm.
template <typename T, typename TS>
struct DepthwiseConvHybridWorkerTask : cpu_backend_threadpool::Task { … };

// Hybrid per-channel depthwise convolution entry point that takes a
// CpuBackendContext instead of an explicit thread_start/thread_end/thread_dim
// range. Input and filter data are int8; bias and output data are float.
//
// Note that input_offsets is a non-const int32_t* in this signature, whereas
// DepthwiseConvHybridImpl and the depthwise_conv helpers take it as
// const int32_t*.
// NOTE(review): presumably this splits the work across the threads made
// available by cpu_backend_context (e.g. via DepthwiseConvHybridWorkerTask)
// and falls back to a single DepthwiseConvHybridImpl call otherwise -- the
// body is not visible in this view, confirm.
inline void DepthwiseConvHybridPerChannel(
    const DepthwiseParams& params, const float* input_scales,
    const RuntimeShape& input_shape, const int8* input_data,
    const RuntimeShape& filter_shape, const int8* filter_data,
    const RuntimeShape& bias_shape, const float* bias_data,
    const RuntimeShape& output_shape, float* output_data,
    const float* per_channel_scales, int32_t* input_offsets,
    CpuBackendContext* cpu_backend_context) { … }

}  // namespace optimized_integer_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_HYBRID_H_
// chromium/third_party/tflite/src/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_hybrid.h