chromium/third_party/gemmlowp/src/internal/output.h

// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// output.h: processing the 32-bit accumulators output by the unpack
// stage, obtaining the final result matrix entries and storing them into
// the destination matrix.

#ifndef GEMMLOWP_INTERNAL_OUTPUT_H_
#define GEMMLOWP_INTERNAL_OUTPUT_H_

#include <cmath>
#include <tuple>
#include <type_traits>
#include <typeinfo>

#include "../fixedpoint/fixedpoint.h"
#include "../public/output_stages.h"
#include "simd_wrappers.h"

namespace gemmlowp {

// Generic (primary) form of OutputStageEvalBufferImpl, parameterized on the
// output stage type and on the type of the buffer fed into that stage. The
// generic case declares no members.
template <class OutputStage, class InputBufferType>
struct OutputStageEvalBufferImpl {
};

// Generic (primary) form of OutputStageEvalImpl, parameterized on the output
// stage type and on the type of the value fed into that stage. The generic
// case declares no members.
template <class OutputStage, class InputType>
struct OutputStageEvalImpl {
};

// OutputStageEvalBufferImpl for OutputStageQuantizeDownInt32ToUint8Scale, with
// RegisterBuffer<std::int32_t, Size> as the input buffer type.
OutputStageEvalBufferImpl<OutputStageQuantizeDownInt32ToUint8Scale, RegisterBuffer<std::int32_t, Size>>;

// OutputStageEvalImpl for OutputStageQuantizeDownInt32ToUint8ScalePC<Shape>,
// with RegisterBlock<std::int32_t, Rows, Cols> as the input type.
OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8ScalePC<Shape>, RegisterBlock<std::int32_t, Rows, Cols>>;

// OutputStageEvalBufferImpl for OutputStageQuantizeDownInt32ByFixedPoint, with
// RegisterBuffer<std::int32_t, Size> as the input buffer type.
OutputStageEvalBufferImpl<OutputStageQuantizeDownInt32ByFixedPoint, RegisterBuffer<std::int32_t, Size>>;

// OutputStageEvalBufferImpl for OutputStageScaleInt32ByFixedPointAndExponent,
// with RegisterBuffer<std::int32_t, Size> as the input buffer type.
OutputStageEvalBufferImpl<OutputStageScaleInt32ByFixedPointAndExponent, RegisterBuffer<std::int32_t, Size>>;

// OutputStageEvalImpl for OutputStageScaleInt32ByFixedPointAndExponentPC<Shape>,
// with RegisterBlock<std::int32_t, Rows, Cols> as the input type.
OutputStageEvalImpl<OutputStageScaleInt32ByFixedPointAndExponentPC<Shape>, RegisterBlock<std::int32_t, Rows, Cols>>;

// Implementation of OutputStageSaturatingCastToUint8 for scalar data.
OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8, RegisterBuffer<std::int32_t, Size>>;

// Implementation of OutputStageSaturatingCastToInt8 for scalar data.
OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt8, RegisterBuffer<std::int32_t, Size>>;

// Implementation of OutputStageSaturatingCastToInt16 for scalar data.
OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16, RegisterBuffer<std::int32_t, Size>>;

// Implementation of OutputStageTruncatingCastToUint8 for scalar data.
OutputStageEvalBufferImpl<OutputStageTruncatingCastToUint8, RegisterBuffer<std::int32_t, Size>>;

// OutputStageEvalImpl for OutputStageBiasAddition<VectorType>, with
// RegisterBlock<std::int32_t, Rows, Cols> as the input type.
OutputStageEvalImpl<OutputStageBiasAddition<VectorType>, RegisterBlock<std::int32_t, Rows, Cols>>;

// OutputStageEvalBufferImpl for OutputStageClamp, with
// RegisterBuffer<std::int32_t, Size> as the input buffer type.
OutputStageEvalBufferImpl<OutputStageClamp, RegisterBuffer<std::int32_t, Size>>;

// OutputStageEvalBufferImpl for OutputStageTanh, with
// RegisterBuffer<std::int32_t, Size> as the input buffer type.
OutputStageEvalBufferImpl<OutputStageTanh, RegisterBuffer<std::int32_t, Size>>;

// OutputPipelineOutputType is a helper to determine the output data type of
// a pipeline, for a given input data type. It is a recursive template; see
// the explanation on OutputPipelineEvalImpl below. StopRecursion defaults to
// true exactly when FirstStage equals std::tuple_size of OutputPipelineType.
template <class OutputPipelineType, int FirstStage, class InputType,
          bool StopRecursion =
              (FirstStage == std::tuple_size<OutputPipelineType>::value)>
struct OutputPipelineOutputType {
};

// OutputPipelineOutputType with StopRecursion == true, which by default is
// the case when FirstStage equals std::tuple_size of OutputPipelineType.
OutputPipelineOutputType<OutputPipelineType, FirstStage, InputType, true>;

// OutputPipelineEvalImpl is a helper that evaluates the whole pipeline. It is
// a recursive template, which gives compile-time unrolling of the loop over
// all pipeline stages. Recursion is driven by the 'FirstStage' parameter:
// each specialization only implements evaluation starting at 'FirstStage'.
// The 'StopRecursion' parameter exists solely so that the end of the
// recursion can be expressed as a partial specialization below; it defaults
// to true exactly when FirstStage equals std::tuple_size of
// OutputPipelineType.
template <class OutputPipelineType, int FirstStage, class InputType,
          bool StopRecursion =
              (FirstStage == std::tuple_size<OutputPipelineType>::value)>
struct OutputPipelineEvalImpl {
};

// Specialization on 'StopRecursion' for terminating the recursion. It applies
// when StopRecursion is true, which by default is the case when FirstStage
// equals std::tuple_size of OutputPipelineType.
OutputPipelineEvalImpl<OutputPipelineType, FirstStage, InputType, true>;

// Generic (primary) form of StoreFinalOutputImpl, parameterized on the type
// of the register block being stored and on the destination type. The generic
// case declares no members.
template <class RegisterBlockType, class DstType>
struct StoreFinalOutputImpl {
};

// StoreFinalOutputImpl for a RegisterBlock<ScalarType, Rows, Cols> source and
// an arbitrary DstType destination.
StoreFinalOutputImpl<RegisterBlock<ScalarType, Rows, Cols>, DstType>;

// StoreFinalOutput is the entry point for taking the final value produced at
// the end of the output pipeline and storing it into the destination matrix.
// It can be specialized for different data types; this generic version is
// typically used only for plain old scalar (not SIMD) types.
template <class RegisterBlockType, class DstType>
void StoreFinalOutput(RegisterBlockType src, DstType* dst, int row, int col) {
}

// OutputPipelineExecutor is parameterized on the output pipeline type and on
// the type of the input fed into the pipeline. As written here it declares no
// members.
template <class OutputPipelineType, class InputType>
struct OutputPipelineExecutor {
};

}  // namespace gemmlowp

#ifdef GEMMLOWP_NEON
#include "output_neon.h"
#elif defined(GEMMLOWP_SSE4)
#include "output_sse.h"
#elif defined(GEMMLOWP_MSA)
#include "output_msa.h"
#endif

#endif  // GEMMLOWP_INTERNAL_OUTPUT_H_