kernel_avx.cc | Explore in Territory

/* Copyright 2020 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <algorithm>
#include <cstdint>
#include <cstring>

#include "ruy/check_macros.h"
#include "ruy/kernel_common.h"
#include "ruy/kernel_x86.h"
#include "ruy/opt_set.h"
#include "ruy/platform.h"
#include "ruy/profiler/instrumentation.h"

#if RUY_PLATFORM_AVX && RUY_OPT(ASM)
#include <immintrin.h>  // IWYU pragma: keep
#endif

namespace ruy {

#if !(RUY_PLATFORM_AVX && RUY_OPT(ASM))

// Fallback definition of Kernel8bitAvx, compiled only when
// RUY_PLATFORM_AVX && RUY_OPT(ASM) is false (see the enclosing #if).
// The parameter is unnamed and ignored; the function performs no computation.
void Kernel8bitAvx(const KernelParams8bit<8, 8>&) {
  // CPU-ID-based checks should disable the path that would reach this point,
  // so arriving here means dispatch went wrong; the only action taken is the
  // RUY_DCHECK below.
  RUY_DCHECK(false);
}

// Fallback definition of Kernel8bitAvxSingleCol, compiled only when
// RUY_PLATFORM_AVX && RUY_OPT(ASM) is false (see the enclosing #if).
// The parameter is unnamed and ignored; the function performs no computation.
void Kernel8bitAvxSingleCol(const KernelParams8bit<8, 8>&) {
  // CPU-ID-based checks should disable the path that would reach this point,
  // so arriving here means dispatch went wrong; the only action taken is the
  // RUY_DCHECK below.
  RUY_DCHECK(false);
}

// Fallback definition of KernelFloatAvx, compiled only when
// RUY_PLATFORM_AVX && RUY_OPT(ASM) is false (see the enclosing #if).
// The parameter is unnamed and ignored; the function performs no computation.
void KernelFloatAvx(const KernelParamsFloat<8, 8>&) {
  // CPU-ID-based checks should disable the path that would reach this point,
  // so arriving here means dispatch went wrong; the only action taken is the
  // RUY_DCHECK below.
  RUY_DCHECK(false);
}

// Fallback definition of KernelFloatAvxSingleCol, compiled only when
// RUY_PLATFORM_AVX && RUY_OPT(ASM) is false (see the enclosing #if).
// The parameter is unnamed and ignored; the function performs no computation.
void KernelFloatAvxSingleCol(const KernelParamsFloat<8, 8>&) {
  // CPU-ID-based checks should disable the path that would reach this point,
  // so arriving here means dispatch went wrong; the only action taken is the
  // RUY_DCHECK below.
  RUY_DCHECK(false);
}

#else  // RUY_PLATFORM_AVX && RUY_OPT(ASM)

// File-local compile-time constants, defined only in the
// RUY_PLATFORM_AVX && RUY_OPT(ASM) branch.
// NOTE(review): presumably the block size and inner (depth) size used by the
// 8-bit kernels below; the initializers are not visible in this view — confirm.
static constexpr int kAvx8bitBlockSize = …;
static constexpr int kAvx8bitInnerSize = …;

namespace {
namespace intrin_utils {

// Explicit specialization for Path::kAvx. The primary template is not declared
// in this file, so it must come from an included header.
// NOTE(review): the name mirrors the AVX2 intrinsic _mm256_shuffle_epi8;
// presumably this is an AVX-only emulation. Body not visible here — confirm.
template <>
inline __m256i mm256_shuffle_epi8<Path::kAvx>(const __m256i& a,
                                              const __m256i& b) { … }

// Explicit specialization for Path::kAvx. The primary template is not declared
// in this file, so it must come from an included header. Returns a 128-bit
// vector from a 256-bit input; `imm` is an ordinary runtime int parameter here.
// NOTE(review): the name mirrors the AVX2 intrinsic _mm256_extracti128_si256
// (imm selects the low/high half) — body not visible here, confirm.
template <>
inline __m128i mm256_extracti128_si256<Path::kAvx>(const __m256i& a,
                                                   const int imm) { … }

// Path-templated helper taking a 128-bit integer vector and returning a
// 256-bit one.
// NOTE(review): named after the AVX2 intrinsic _mm256_cvtepi8_epi16
// (sign-extending int8 -> int16); body not visible here — confirm it matches.
template <Path path>
inline __m256i mm256_cvtepi8_epi16(const __m128i& a) { … }

// Path-templated helper taking a 128-bit integer vector and returning a
// 256-bit one.
// NOTE(review): named after the AVX2 intrinsic _mm256_cvtepi32_epi64
// (sign-extending int32 -> int64); body not visible here — confirm it matches.
template <Path path>
inline __m256i mm256_cvtepi32_epi64(const __m128i& a) { … }

// Non-templated helper: takes two 256-bit vectors plus an int selector and
// returns a single 128-bit vector.
// NOTE(review): presumably picks one 128-bit half of `a` or `b` according to
// `imm`, for use by mm256_permute2x128_si256 below — body not visible, confirm.
inline __m128i mm_permute_helper(const __m256i& a, const __m256i& b,
                                 const int imm) { … }

// Path-templated helper combining two 256-bit vectors under control of `imm`,
// which is an ordinary runtime int parameter here.
// NOTE(review): named after the AVX2 intrinsic _mm256_permute2x128_si256;
// body not visible here — confirm the imm encoding matches the intrinsic's.
template <Path path>
inline __m256i mm256_permute2x128_si256(const __m256i& a, const __m256i& b,
                                        const int imm) { … }

// Path-templated binary helper on 256-bit integer vectors.
// NOTE(review): named after the AVX2 intrinsic _mm256_max_epi32 (per-lane
// signed 32-bit maximum); body not visible here — confirm.
template <Path path>
inline __m256i mm256_max_epi32(const __m256i& a, const __m256i& b) { … }

// Path-templated binary helper on 256-bit integer vectors.
// NOTE(review): named after the AVX2 intrinsic _mm256_min_epi32 (per-lane
// signed 32-bit minimum); body not visible here — confirm.
template <Path path>
inline __m256i mm256_min_epi32(const __m256i& a, const __m256i& b) { … }

// Path-templated binary helper on 256-bit integer vectors.
// NOTE(review): named after the AVX2 intrinsic _mm256_add_epi32 (per-lane
// 32-bit addition); body not visible here — confirm.
template <Path path>
inline __m256i mm256_add_epi32(const __m256i& a, const __m256i& b) { … }

// Path-templated binary helper on 256-bit integer vectors.
// NOTE(review): named after the AVX2 intrinsic _mm256_add_epi64 (per-lane
// 64-bit addition); body not visible here — confirm.
template <Path path>
inline __m256i mm256_add_epi64(const __m256i& a, const __m256i& b) { … }

// Path-templated helper taking a 256-bit integer vector and an int `imm`
// (a plain runtime parameter in this signature).
// NOTE(review): named after the AVX2 intrinsic _mm256_slli_epi64 (left shift
// of each 64-bit lane by imm); body not visible here — confirm.
template <Path path>
inline __m256i mm256_slli_epi64(const __m256i& a, int imm) { … }

// Path-templated binary helper on 256-bit integer vectors.
// NOTE(review): named after the AVX2 intrinsic _mm256_mullo_epi32 (low 32 bits
// of each per-lane 32-bit product); body not visible here — confirm.
template <Path path>
inline __m256i mm256_mullo_epi32(const __m256i& a, const __m256i& b) { … }

// Defined as a macro since `imm` must be an immediate.
// Takes two operands and the immediate, in that order.
// NOTE(review): presumably blends 32-bit lanes of two 128-bit vectors under
// `imm`; the expansion is not visible in this view — confirm.
#define BlendM128_epi32(a, b, imm) …

// Defined as a macro since `imm` must be an immediate.
// Takes two operands and the immediate, in that order.
// NOTE(review): presumably blends 64-bit lanes of two 128-bit vectors under
// `imm`; the expansion is not visible in this view — confirm.
#define BlendM128_epi64(a, b, imm) …

// Defined as a macro since `imm` must be an immediate.
// Unlike the BlendM128_* macros, this one takes the destination `ans` as its
// first argument rather than yielding a value.
// NOTE(review): named after the AVX2 intrinsic _mm256_blend_epi32; the
// expansion is not visible in this view — confirm how `ans` is written.
#define mm256_blend_epi32(ans, a, b, imm) …

// Macro taking a destination `ans`, a source `a`, two further arguments
// `a_lo` / `a_hi`, and `imm`.
// NOTE(review): presumably a macro for the same reason as the blend macros
// (imm must be an immediate), with a_lo/a_hi used as scratch for the two
// 128-bit halves; the expansion is not visible in this view — confirm.
#define mm256_shuffle_epi32(ans, a, a_lo, a_hi, imm) …

// Path-templated binary helper on 256-bit integer vectors.
// NOTE(review): named after the AVX2 intrinsic _mm256_madd_epi16 (multiply
// int16 pairs and add adjacent products into int32 lanes); body not visible
// here — confirm.
template <Path path>
inline __m256i mm256_madd_epi16(const __m256i& a, const __m256i& b) { … }

// Non-templated binary helper on 128-bit integer vectors.
// NOTE(review): named after the AVX2 intrinsic _mm_srlv_epi64 (per-lane
// logical right shift of 64-bit lanes by the counts in `b`); body not visible
// here — confirm.
inline __m128i mm_srlv_epi64(const __m128i& a, const __m128i& b) { … }

// Path-templated binary helper on 256-bit integer vectors.
// NOTE(review): named after the AVX2 intrinsic _mm256_srlv_epi64; presumably
// built on the 128-bit mm_srlv_epi64 helper — body not visible here, confirm.
template <Path path>
inline __m256i mm256_srlv_epi64(const __m256i& a, const __m256i& b) { … }

// Path-templated binary helper on 128-bit integer vectors.
// NOTE(review): named after the AVX2 intrinsic _mm_sllv_epi64 (per-lane left
// shift of 64-bit lanes by the counts in `b`); body not visible here — confirm.
// NOTE(review): this helper is templated on Path while its right-shift
// sibling mm_srlv_epi64 is not; check whether the difference is intentional.
template <Path path>
inline __m128i mm_sllv_epi64(const __m128i& a, const __m128i& b) { … }

// Path-templated binary helper on 256-bit integer vectors.
// NOTE(review): named after the AVX2 intrinsic _mm256_sllv_epi64; presumably
// built on the 128-bit mm_sllv_epi64 helper — body not visible here, confirm.
template <Path path>
inline __m256i mm256_sllv_epi64(const __m256i& a, const __m256i& b) { … }

// Macro taking one operand and `imm`.
// NOTE(review): presumably permutes the 32-bit lanes of a 128-bit vector, and
// is a macro because `imm` must be an immediate; the expansion is not visible
// in this view — confirm.
#define PermuteM128_epi32(a, imm) …

// Non-templated binary helper on 128-bit integer vectors.
// NOTE(review): named after the AVX2 intrinsic _mm_sllv_epi32 (per-lane left
// shift of 32-bit lanes by the counts in `b`); body not visible here — confirm.
inline __m128i mm_sllv_epi32(const __m128i& a, const __m128i& b) { … }

// Path-templated binary helper on 256-bit integer vectors.
// NOTE(review): named after the AVX2 intrinsic _mm256_sllv_epi32; presumably
// built on the 128-bit mm_sllv_epi32 helper — body not visible here, confirm.
template <Path path>
inline __m256i mm256_sllv_epi32(const __m256i& a, const __m256i& b) { … }

// Path-templated binary helper on 256-bit integer vectors.
// NOTE(review): named after the AVX2 intrinsic _mm256_sub_epi32 (per-lane
// 32-bit subtraction, a - b); body not visible here — confirm operand order.
template <Path path>
inline __m256i mm256_sub_epi32(const __m256i& a, const __m256i& b) { … }

// Path-templated binary helper on 256-bit integer vectors.
// NOTE(review): named after the AVX2 intrinsic _mm256_mul_epi32 (signed
// multiply of the low 32 bits of each 64-bit lane, giving 64-bit products);
// body not visible here — confirm.
template <Path path>
inline __m256i mm256_mul_epi32(const __m256i& a, const __m256i& b) { … }

// Perform the equivalent of mm256_permutevar8x32 with
// a second argument of {7, 5, 3, 1, 6, 4, 2, 0}
// Takes and returns a single 256-bit integer vector.
// NOTE(review): the index list above is presumably written highest lane
// first, i.e. even-indexed lanes end up in the low half and odd-indexed lanes
// in the high half — body not visible here, confirm.
template <Path path>
inline __m256i PermuteEpi32EvenOdds(const __m256i& a) { … }

// Path-templated helper taking a 256-bit integer vector, a pointer to int32
// bias values, and an int offset; returns a 256-bit integer vector.
// NOTE(review): presumably loads eight int32 values from `bias + offset` and
// adds them lane-wise to `a`; body not visible here — confirm, including any
// alignment requirement on `bias`.
template <Path path>
inline __m256i AddBiasEpi32(const __m256i& a, const int32_t* bias, int offset) { … }

// Non-templated helper taking two 256-bit integer vectors and a 256-bit mask.
// NOTE(review): presumably selects per 32-bit lane between `a` and `b`
// according to `mask`; body not visible here — confirm which mask bit is used.
// NOTE(review): unlike the neighbouring helpers this one is not declared
// `inline`. It still has internal linkage via the enclosing anonymous
// namespace, but consider adding `inline` for consistency.
__m256i mm256_blendv_epi32(const __m256i& a, const __m256i& b,
                           const __m256i& mask) { … }

// Path-templated binary helper on 256-bit integer vectors.
// NOTE(review): named after the AVX2 intrinsic _mm256_cmpgt_epi32 (per-lane
// signed a > b, producing an all-ones/all-zeros mask); body not visible here —
// confirm.
template <Path path>
inline __m256i mm256_cmpgt_epi32(const __m256i& a, const __m256i& b) { … }

// Path-templated binary helper on 256-bit integer vectors.
// NOTE(review): named after the AVX2 intrinsic _mm256_srav_epi32 (per-lane
// arithmetic right shift of 32-bit lanes by the counts in `b`); body not
// visible here — confirm.
template <Path path>
inline __m256i mm256_srav_epi32(const __m256i& a, const __m256i& b) { … }

// AVX doesn't have fused multiply-add so we define an inline function to be
// used in the common code following.
// Explicit specialization for Path::kAvx; the primary template is not declared
// in this file, so it must come from an included header. Takes three 256-bit
// float vectors and returns one.
// NOTE(review): presumably computes a * b + c as a separate multiply and add;
// body not visible here — confirm the operand roles.
template <>
inline __m256 MulAdd<Path::kAvx>(const __m256& a, const __m256& b,
                                 const __m256& c) { … }

}  // namespace intrin_utils
}  // namespace

// Path-templated implementation of the 8-bit kernel, parameterized by a
// KernelParams8bit<8, 8>. Defined at ruy namespace scope (outside the
// anonymous namespace above). The trailing NOLINT suppresses the
// function-size lint.
// NOTE(review): presumably the body behind Kernel8bitAvx below; body not
// visible in this view — confirm.
template <Path path>
void Kernel8bitAvxImpl(const KernelParams8bit<8, 8>& params) { … }  // NOLINT(readability/fn_size)

// Real definition of Kernel8bitAvx, compiled when
// RUY_PLATFORM_AVX && RUY_OPT(ASM) holds; the #if branch earlier in this file
// provides the RUY_DCHECK(false) stub otherwise.
// NOTE(review): presumably forwards to Kernel8bitAvxImpl<Path::kAvx>; body not
// visible in this view — confirm.
void Kernel8bitAvx(const KernelParams8bit<8, 8>& params) { … }

// Path-templated implementation of the single-column 8-bit kernel,
// parameterized by a KernelParams8bit<8, 8>. The trailing NOLINT suppresses
// the function-size lint.
// NOTE(review): presumably the body behind Kernel8bitAvxSingleCol below; body
// not visible in this view — confirm.
template <Path path>
void Kernel8bitAvxSingleColImpl(const KernelParams8bit<8, 8>& params) { … }  // NOLINT(readability/fn_size)

// Real definition of Kernel8bitAvxSingleCol, compiled when
// RUY_PLATFORM_AVX && RUY_OPT(ASM) holds; the #if branch earlier in this file
// provides the RUY_DCHECK(false) stub otherwise.
// NOTE(review): presumably forwards to Kernel8bitAvxSingleColImpl<Path::kAvx>;
// body not visible in this view — confirm.
void Kernel8bitAvxSingleCol(const KernelParams8bit<8, 8>& params) { … }

// Real definition of KernelFloatAvx, compiled when
// RUY_PLATFORM_AVX && RUY_OPT(ASM) holds; the #if branch earlier in this file
// provides the RUY_DCHECK(false) stub otherwise.
// NOTE(review): no float *Impl template is defined in this file, so the shared
// implementation presumably lives in an included header (the MulAdd comment
// above mentions "common code") — body not visible in this view, confirm.
void KernelFloatAvx(const KernelParamsFloat<8, 8>& params) { … }

// Real definition of KernelFloatAvxSingleCol, compiled when
// RUY_PLATFORM_AVX && RUY_OPT(ASM) holds; the #if branch earlier in this file
// provides the RUY_DCHECK(false) stub otherwise.
// NOTE(review): no float *Impl template is defined in this file, so the shared
// implementation presumably lives in an included header — body not visible in
// this view, confirm.
void KernelFloatAvxSingleCol(const KernelParamsFloat<8, 8>& params) { … }

#endif  //  RUY_PLATFORM_AVX && RUY_OPT(ASM)

}  // namespace ruy
chromium/third_party/ruy/src/ruy/kernel_avx.cc