pack_avx.cc | Explore in Territory

/* Copyright 2020 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <cstdint>
#include <cstring>

#include "ruy/check_macros.h"
#include "ruy/opt_set.h"
#include "ruy/pack_x86.h"
#include "ruy/path.h"
#include "ruy/platform.h"
#include "ruy/profiler/instrumentation.h"

#if RUY_PLATFORM_AVX && RUY_OPT(INTRINSICS)
#include <immintrin.h>  // IWYU pragma: keep
#endif

namespace ruy {

#if !(RUY_PLATFORM_AVX && RUY_OPT(ASM))

// Placeholder definition used when the AVX packing code is compiled out
// (i.e. when !(RUY_PLATFORM_AVX && RUY_OPT(ASM))). It keeps the symbol
// available to the linker; every argument is ignored.
void Pack8bitColMajorForAvx(const std::int8_t* /*src_ptr*/,
                            std::int8_t /*input_xor*/,
                            const std::int8_t* /*zerobuf*/,
                            int /*src_stride*/,
                            int /*remaining_src_cols*/,
                            int /*src_rows*/,
                            std::int8_t* /*packed_ptr*/,
                            std::int32_t* /*sums_ptr*/) {
  // Getting here indicates a dispatch bug: the CPU-ID-based checks are
  // expected to have disabled any path that would call this function.
  RUY_DCHECK(false);
}

// Placeholder definition used when the AVX packing code is compiled out
// (i.e. when !(RUY_PLATFORM_AVX && RUY_OPT(ASM))). It keeps the symbol
// available to the linker; every argument is ignored.
void PackFloatColMajorForAvx(const float* /*src_ptr*/,
                             const float* /*zerobuf*/,
                             int /*src_stride*/,
                             int /*remaining_src_cols*/,
                             int /*src_rows*/,
                             float* /*packed_ptr*/) {
  // Getting here indicates a dispatch bug: the CPU-ID-based checks are
  // expected to have disabled any path that would call this function.
  RUY_DCHECK(false);
}

// Placeholder definition used when the AVX packing code is compiled out
// (i.e. when !(RUY_PLATFORM_AVX && RUY_OPT(ASM))). It keeps the symbol
// available to the linker; every argument is ignored.
void Pack8bitRowMajorForAvx(const std::uint8_t* /*src_ptr*/,
                            int /*src_stride*/,
                            int /*src_zero_point*/,
                            std::int8_t* /*packed_ptr*/,
                            int /*packed_stride*/,
                            int /*start_col*/,
                            int /*end_col*/,
                            int /*src_cols*/,
                            int /*block_row*/,
                            int /*src_rows*/,
                            int /*input_xor*/,
                            std::int32_t* /*sums*/) {
  // This stub is not meant to be called; it trips RUY_DCHECK(false) if it is.
  RUY_DCHECK(false);
}

#else  // RUY_PLATFORM_AVX && RUY_OPT(ASM)

// The first int8_t template parameter is arbitrary: this routine is common to
// all 8-bit source matrix types.
PackImpl8bitAvx;

PackImplFloatAvx;

namespace {

// Performs the equivalent of the AVX2 intrinsic _mm256_permutevar8x32_epi32
// with a second (index) argument of {7, 5, 3, 1, 6, 4, 2, 0}.
inline __m256i PermuteEpi32EvenOdds(const __m256i& a) { … }

inline __m128i mm256_extracti128_si256(const __m256i& a, const int imm) { … }

inline __m256i mm256_cvtepi8_epi16(const __m128i& a) { … }

inline __m256i mm256_cvtepi16_epi32(const __m128i& a) { … }

inline __m256i mm256_xor_si256(const __m256i& a, const __m256i& b) { … }

inline __m256i mm256_unpacklo_epi32(const __m256i& a, const __m256i& b) { … }

inline __m256i mm256_unpacklo_epi64(const __m256i& a, const __m256i& b) { … }

inline __m256i mm256_unpackhi_epi32(const __m256i& a, const __m256i& b) { … }

inline __m256i mm256_unpackhi_epi64(const __m256i& a, const __m256i& b) { … }

inline __m256i mm256_add_epi32(const __m256i& a, const __m256i& b) { … }

inline __m256i mm256_add_epi16(const __m256i& a, const __m256i& b) { … }

inline __m256i mm256_madd_epi16(const __m256i& a, const __m256i& b) { … }

inline __m128i mm_permute_helper(const __m256i& a, const __m256i& b,
                                 const int imm) { … }

inline __m256i mm256_permute2x128_si256(const __m256i& a, const __m256i& b,
                                        const int imm) { … }

inline void Pack8bitColMajorForAvxPacker(const std::int8_t* src_ptr,
                                         std::int8_t input_xor,
                                         const std::int8_t* zerobuf,
                                         int src_stride, int remaining_src_cols,
                                         int src_rows, std::int8_t* packed_ptr,
                                         std::int32_t* sums_ptr,
                                         std::int8_t* trailing_buf) { … }

// Use a generic AVX intrinsic for greater-than comparison.
template <>
inline __m256i CompareGreaterThan<Path::kAvx>(const __m256i& a,
                                              const __m256i& b) { … }

}  // namespace.

void Pack8bitColMajorForAvx(const std::int8_t* src_ptr, std::int8_t input_xor,
                            const std::int8_t* zerobuf, int src_stride,
                            int remaining_src_cols, int src_rows,
                            std::int8_t* packed_ptr, std::int32_t* sums_ptr) { … }

void PackFloatColMajorForAvx(const float* src_ptr, const float* zerobuf,
                             int src_stride, int remaining_src_cols,
                             int src_rows, float* packed_ptr) { … }

void Pack8bitRowMajorForAvx(const std::uint8_t* src_ptr, int src_stride,
                            int src_zero_point, std::int8_t* packed_ptr,
                            int packed_stride, int start_col, int end_col,
                            int src_cols, int block_row, int src_rows,
                            int input_xor, std::int32_t* sums) { … }

#endif  // RUY_PLATFORM_AVX && RUY_OPT(ASM)

}  // namespace ruy
chromium/third_party/ruy/src/ruy/pack_avx.cc