#ifndef GEMMLOWP_INTERNAL_OUTPUT_H_
#define GEMMLOWP_INTERNAL_OUTPUT_H_
#include <cmath>
#include <tuple>
#include <type_traits>
#include <typeinfo>
#include "../fixedpoint/fixedpoint.h"
#include "../public/output_stages.h"
#include "simd_wrappers.h"
namespace gemmlowp {
// Primary template keyed on an output-stage type and the type of the input
// buffer it is applied to. The body is not visible in this view; the
// specializations listed later in this file all use
// RegisterBuffer<std::int32_t, Size> as InputBufferType.
// NOTE(review): presumably each specialization evaluates one stage on a flat
// register buffer -- confirm against the full definition.
template <typename OutputStage, typename InputBufferType>
struct OutputStageEvalBufferImpl { … };
// Primary template keyed on an output-stage type and the type of the input it
// is applied to. The body is not visible in this view; the specializations
// listed later in this file use RegisterBlock<std::int32_t, Rows, Cols> as
// InputType.
// NOTE(review): the relationship to OutputStageEvalBufferImpl (e.g. whether
// this template forwards to it by default) cannot be determined from here --
// confirm against the full definition.
template <typename OutputStage, typename InputType>
struct OutputStageEvalImpl { … };
// Specializations of the two evaluation templates above, one per output
// stage. Only the specialized template-ids are visible here; their
// `template <...>` headers (which would introduce Size, Shape, Rows, Cols and
// VectorType) and their bodies are not shown in this view.
//
// Two input shapes appear:
//   - OutputStageEvalBufferImpl<Stage, RegisterBuffer<std::int32_t, Size>>
//   - OutputStageEvalImpl<Stage, RegisterBlock<std::int32_t, Rows, Cols>>
// The RegisterBlock form is used by the stages parameterized on Shape (the
// "...PC" stages) and by OutputStageBiasAddition<VectorType>.
// NOTE(review): presumably those stages need the block's row/column position
// to index a per-row or per-column vector -- confirm in the full source.

// Quantize-down / rescaling stages.
OutputStageEvalBufferImpl<OutputStageQuantizeDownInt32ToUint8Scale, RegisterBuffer<std::int32_t, Size>>;
OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8ScalePC<Shape>, RegisterBlock<std::int32_t, Rows, Cols>>;
OutputStageEvalBufferImpl<OutputStageQuantizeDownInt32ByFixedPoint, RegisterBuffer<std::int32_t, Size>>;
OutputStageEvalBufferImpl<OutputStageScaleInt32ByFixedPointAndExponent, RegisterBuffer<std::int32_t, Size>>;
OutputStageEvalImpl<OutputStageScaleInt32ByFixedPointAndExponentPC<Shape>, RegisterBlock<std::int32_t, Rows, Cols>>;
// Cast stages; all take an int32 register buffer as input.
OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8, RegisterBuffer<std::int32_t, Size>>;
OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt8, RegisterBuffer<std::int32_t, Size>>;
OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16, RegisterBuffer<std::int32_t, Size>>;
OutputStageEvalBufferImpl<OutputStageTruncatingCastToUint8, RegisterBuffer<std::int32_t, Size>>;
// Bias addition, clamp and tanh stages.
OutputStageEvalImpl<OutputStageBiasAddition<VectorType>, RegisterBlock<std::int32_t, Rows, Cols>>;
OutputStageEvalBufferImpl<OutputStageClamp, RegisterBuffer<std::int32_t, Size>>;
OutputStageEvalBufferImpl<OutputStageTanh, RegisterBuffer<std::int32_t, Size>>;
// Type-level helper parameterized on a pipeline type, a starting stage index
// and an input type.
//
// OutputPipelineType must be usable with std::tuple_size (it is queried in
// the default argument below). StopRecursion defaults to true exactly when
// FirstStage equals the number of elements in the pipeline, i.e. when there
// is no stage left at index FirstStage.
// NOTE(review): the body is not visible here; presumably it exposes the type
// produced by running stages [FirstStage, end) on InputType -- confirm.
template <typename OutputPipelineType, int FirstStage, typename InputType,
bool StopRecursion =
FirstStage == std::tuple_size<OutputPipelineType>::value>
struct OutputPipelineOutputType { … };
// Specialization selected when StopRecursion is true (FirstStage is one past
// the last stage). Its template header and body are not shown in this view.
OutputPipelineOutputType<OutputPipelineType, FirstStage, InputType, true>;
// Evaluation counterpart of OutputPipelineOutputType, with the same template
// parameters: a pipeline type, a starting stage index and an input type.
//
// As above, StopRecursion defaults to true exactly when FirstStage equals
// std::tuple_size<OutputPipelineType>::value, so the `true` specialization
// below is chosen once every stage index has been consumed.
// NOTE(review): the body is not visible here; presumably the general case
// evaluates stage FirstStage and then continues with FirstStage + 1 --
// confirm against the full definition.
template <typename OutputPipelineType, int FirstStage, typename InputType,
bool StopRecursion =
FirstStage == std::tuple_size<OutputPipelineType>::value>
struct OutputPipelineEvalImpl { … };
// Specialization selected when StopRecursion is true. Its template header and
// body are not shown in this view.
OutputPipelineEvalImpl<OutputPipelineType, FirstStage, InputType, true>;
// Primary template keyed on the register-block type being stored and the
// destination type. The body is not visible in this view.
// NOTE(review): architecture-specific specializations presumably live in the
// output_neon.h / output_sse.h / output_msa.h headers included at the end of
// this file -- confirm.
template <typename RegisterBlockType, typename DstType>
struct StoreFinalOutputImpl { … };
// Specialization for any RegisterBlock<ScalarType, Rows, Cols> source. Its
// template header and body are not shown in this view.
StoreFinalOutputImpl<RegisterBlock<ScalarType, Rows, Cols>, DstType>;
// Free-function entry point for storing a register block into a destination.
//
// Parameters:
//   src - register block to store, passed by value.
//   dst - pointer to the destination object.
//   row, col - integer position arguments.
// NOTE(review): the body is not visible here; presumably it forwards to
// StoreFinalOutputImpl<RegisterBlockType, DstType> and row/col give the
// block's position within *dst -- confirm against the full definition.
template <typename RegisterBlockType, typename DstType>
void StoreFinalOutput(RegisterBlockType src, DstType* dst, int row, int col) { … }
// Struct template parameterized on a pipeline type and an input type. The
// body is not visible in this view.
// NOTE(review): presumably this ties together OutputPipelineEvalImpl (run the
// stages) and StoreFinalOutput (write the result) -- confirm against the full
// definition before relying on this description.
template <typename OutputPipelineType, typename InputType>
struct OutputPipelineExecutor { … };
}
#ifdef GEMMLOWP_NEON
#include "output_neon.h"
#elif defined(GEMMLOWP_SSE4)
#include "output_sse.h"
#elif defined(GEMMLOWP_MSA)
#include "output_msa.h"
#endif
#endif