#include "tensorflow/lite/kernels/internal/optimized/sse_tensor_utils_impl.h"
#ifdef __SSSE3__

#include <emmintrin.h>  // SSE2
#include <tmmintrin.h>  // SSSE3
#ifdef __SSE4_1__
#include <smmintrin.h>  // SSE4.1
#endif
#ifdef __AVX2__
#include <immintrin.h>  // AVX2
#include "absl/base/prefetch.h"
#endif

#include <cstdint>

#include "ruy/profiler/instrumentation.h"
#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/cpu_backend_gemm.h"
#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"

namespace tflite {
namespace tensor_utils {
namespace {
#if defined(__SSE2__)
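// Fallback definition of _mm_loadu_si32 for toolchains whose intrinsic
// headers do not provide it (see the compiler version checks below).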
#if (defined(__GNUC__) && !defined(__clang__) && \
!defined(__INTEL_COMPILER)) || \
(defined(__clang__) && !defined(__apple_build_version__) && \
(__clang_major__ < 8)) || \
(defined(__clang__) && defined(__ANDROID__) && (__clang_major__ == 8) && \
(__clang_minor__ == 0) && (__clang_patchlevel__ < 7)) || \
(defined(__clang__) && defined(__apple_build_version__) && \
(__apple_build_version__ < 11000000)) || \
(defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1600))
static inline __m128i _mm_loadu_si32(const void* address) {
return _mm_cvtsi32_si128(*((const int*)address));
}
#endif
#endif
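// Dot product of four int8 vectors of 4 elements each, packed into XMM
// registers: returns the four int32 dot products packed into an XMM register.
// int8x4x4 . int8x4x4 => int32x4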
static inline __m128i DotProdInt8x4x4(__m128i a_8x16, __m128i b_8x16) {
  // _mm_maddubs_epi16 treats its first operand as unsigned, so transfer the
  // sign of 'a' onto 'b' and take the absolute value of 'a'.
  b_8x16 = _mm_sign_epi8(b_8x16, a_8x16);
  a_8x16 = _mm_abs_epi8(a_8x16);
  // Multiply int8 elements and horizontally add adjacent pairs into int16.
  __m128i sumprod_16x8 = _mm_maddubs_epi16(a_8x16, b_8x16);
  // Horizontally add adjacent int16 pairs into int32.
  return _mm_madd_epi16(sumprod_16x8, _mm_set1_epi16(1));
}
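// Horizontally adds the four int32 lanes of |acc| and returns the scalar sum.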
static inline int32_t ReduceInt32x4(__m128i acc) {
__m128i shuffle = _mm_unpackhi_epi64(acc, acc);
acc = _mm_add_epi32(acc, shuffle);
shuffle = _mm_shuffle_epi32(acc, _MM_SHUFFLE(2, 3, 0, 1));
acc = _mm_add_epi32(acc, shuffle);
return _mm_cvtsi128_si32(acc);
}
#ifdef __AVX2__
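// Horizontally adds the four float lanes of |acc| and returns the scalar sum.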
static inline float ReduceFloat32x4(__m128 acc) {
__m128 shuffle = _mm_movehdup_ps(acc);
acc = _mm_add_ps(acc, shuffle);
shuffle = _mm_movehl_ps(shuffle, acc);
acc = _mm_add_ss(acc, shuffle);
return _mm_cvtss_f32(acc);
}
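// Horizontally adds the eight float lanes of |acc| and returns the scalar sum.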
static inline float ReduceFloat32x8(__m256 acc) {
__m128 low = _mm256_extractf128_ps(acc, 0);
__m128 high = _mm256_extractf128_ps(acc, 1);
return ReduceFloat32x4(_mm_add_ps(low, high));
}
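// AVX2 version of DotProdInt8x4x4: eight int8x4 dot products per call.
// int8x4x8 . int8x4x8 => int32x8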
static inline __m256i DotProdInt8x4x8(__m256i a_16x16, __m256i b_16x16) {
b_16x16 = _mm256_sign_epi8(b_16x16, a_16x16);
a_16x16 = _mm256_abs_epi8(a_16x16);
__m256i sumprod_16x16 = _mm256_maddubs_epi16(a_16x16, b_16x16);
return _mm256_madd_epi16(sumprod_16x16, _mm256_set1_epi16(1));
}
#endif  // __AVX2__
// Transposes and adds four int32x4 accumulators: lane i of the result is the
// horizontal sum of input vector i.
static inline __m128i ReduceInt32x4x4(__m128i a, __m128i b, __m128i c,
                                      __m128i d) {
  // Assuming x = [x0, x1, x2, x3].
  const __m128i a_b_lo_half = _mm_unpacklo_epi32(a, b);  // [a0, b0, a1, b1]
  const __m128i a_b_hi_half = _mm_unpackhi_epi32(a, b);  // [a2, b2, a3, b3]
  const __m128i a_plus_b =
      _mm_add_epi32(a_b_lo_half, a_b_hi_half);  // [a0+a2, b0+b2, a1+a3, b1+b3]
  const __m128i c_d_lo_half = _mm_unpacklo_epi32(c, d);  // [c0, d0, c1, d1]
  const __m128i c_d_hi_half = _mm_unpackhi_epi32(c, d);  // [c2, d2, c3, d3]
  const __m128i c_plus_d =
      _mm_add_epi32(c_d_lo_half, c_d_hi_half);  // [c0+c2, d0+d2, c1+c3, d1+d3]
  const __m128i all_evens =
      _mm_unpacklo_epi64(a_plus_b, c_plus_d);  // [a0+a2, b0+b2, c0+c2, d0+d2]
  const __m128i all_odds =
      _mm_unpackhi_epi64(a_plus_b, c_plus_d);  // [a1+a3, b1+b3, c1+c3, d1+d3]
  // Result: [sum(a), sum(b), sum(c), sum(d)].
  return _mm_add_epi32(all_evens, all_odds);
}
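// Returns lane |i| (0 <= i < 4) of the float32x4 vector |v| as a scalar.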
template <int i>
float GetFloatVectorElement(__m128 v) {
static_assert(i >= 0 && i < 4, "The index must be 0 <= i < 4.");
v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i));
return _mm_cvtss_f32(v);
}
}  // namespace
#ifdef __AVX2__
constexpr int kFloatValuesPerAvx2Vector = 8;
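// Rounds |size| down to the nearest multiple of PerVectorSize, which must be
// a power of two.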
template <int PerVectorSize>
inline int RoundDownVectors(int size) {
return size & ~(PerVectorSize - 1);
}
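// Float matrix * batched float vector multiply-accumulate: for each batch and
// row, accumulates an 8-wide AVX2 dot product over the columns, then handles
// the remaining columns with a scalar loop.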
void Avx2MatrixBatchVectorMultiplyAccumulateImpl(
const float* __restrict__ matrix, int m_rows, int m_cols,
const float* __restrict__ vector, int n_batch, float* __restrict__ result) {
const int postamble_start =
RoundDownVectors<kFloatValuesPerAvx2Vector>(m_cols);
for (int b = 0; b < n_batch; ++b) {
float* result_in_batch = result + b * m_rows;
const float* vector_in_batch = vector + b * m_cols;
const float* matrix_row = matrix;
for (int r = 0; r < m_rows; ++r) {
__m256 acc_32x8 = _mm256_setzero_ps();
int c = 0;
for (; c < postamble_start; c += kFloatValuesPerAvx2Vector) {
__m256 vector_f32x8 = _mm256_loadu_ps(vector_in_batch + c);
__m256 matrix_f32x8 = _mm256_loadu_ps(matrix_row + c);
__m256 res = _mm256_mul_ps(vector_f32x8, matrix_f32x8);
acc_32x8 = _mm256_add_ps(acc_32x8, res);
}
      // Reduce the SIMD accumulator, then finish the remaining columns
      // serially.
      float sum = ReduceFloat32x8(acc_32x8);
      for (; c < m_cols; ++c) {
sum += matrix_row[c] * vector_in_batch[c];
}
*result_in_batch += sum;
++result_in_batch;
matrix_row += m_cols;
}
}
}
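// AVX2 version of the hybrid int8 matrix * batched int8 vector
// multiply-accumulate. The main loop consumes 32 columns per iteration (with
// software prefetch), followed by 16-, 8- and 4-column tails and a scalar
// remainder. per_channel_scale, input_offset and row_sums may be null.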
void Avx2MatrixBatchVectorMultiplyAccumulateImpl(
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
const int8_t* __restrict__ vectors,
const float* __restrict__ scaling_factors, int n_batch,
float* __restrict__ result, const float* per_channel_scale,
const int32_t* input_offset, const int32_t* row_sums) {
for (std::intptr_t batch = 0; batch < n_batch; ++batch) {
const float batch_scaling_factor = scaling_factors[batch];
const int32_t batch_offset = input_offset ? input_offset[batch] : 0;
for (std::intptr_t row = 0; row < m_rows; ++row) {
const int8_t* __restrict__ row_ptr = matrix + row * m_cols;
const float row_scale =
per_channel_scale ? per_channel_scale[row] * batch_scaling_factor
: batch_scaling_factor;
const int32_t row_offset =
row_sums && batch_offset ? batch_offset * row_sums[row] : 0;
__m256i dotprod_32x8 = _mm256_setzero_si256();
std::intptr_t col = 0;
      constexpr int prefetch_distance = 704;
      // Main loop: 32 int8 columns per iteration.
      while (col < (m_cols & ~31)) {
        // Prefetch data several iterations ahead of its use.
        absl::PrefetchToLocalCache(vectors + col + prefetch_distance);
        absl::PrefetchToLocalCache(row_ptr + col + prefetch_distance);
const __m256i vec_16x16 =
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(vectors + col));
const __m256i row_16x16 =
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ptr + col));
dotprod_32x8 = _mm256_add_epi32(dotprod_32x8,
DotProdInt8x4x8(vec_16x16, row_16x16));
col += 32;
}
      // Fold the 256-bit accumulator into a single 128-bit accumulator.
      __m128i low = _mm256_extracti128_si256(dotprod_32x8, 0);
      __m128i high = _mm256_extracti128_si256(dotprod_32x8, 1);
      __m128i dotprod_32x4 = _mm_add_epi32(low, high);
if (col < (m_cols & ~15)) {
const __m128i vec_16x8 =
_mm_loadu_si128(reinterpret_cast<const __m128i*>(vectors + col));
const __m128i row_16x8 =
_mm_loadu_si128(reinterpret_cast<const __m128i*>(row_ptr + col));
dotprod_32x4 =
_mm_add_epi32(dotprod_32x4, DotProdInt8x4x4(vec_16x8, row_16x8));
col += 16;
}
if (col < (m_cols & ~7)) {
const __m128i vec_16x8 = _mm_cvtepi8_epi16(
_mm_loadl_epi64(reinterpret_cast<const __m128i*>(vectors + col)));
const __m128i row_16x8 = _mm_cvtepi8_epi16(
_mm_loadl_epi64(reinterpret_cast<const __m128i*>(row_ptr + col)));
dotprod_32x4 =
_mm_add_epi32(dotprod_32x4, _mm_madd_epi16(vec_16x8, row_16x8));
col += 8;
}
if (col < (m_cols & ~3)) {
const __m128i vec_32x4 = _mm_cvtepi8_epi32(
_mm_loadu_si32(reinterpret_cast<const __m128i*>(vectors + col)));
const __m128i row_32x4 = _mm_cvtepi8_epi32(
_mm_loadu_si32(reinterpret_cast<const __m128i*>(row_ptr + col)));
dotprod_32x4 =
_mm_add_epi32(dotprod_32x4, _mm_mullo_epi32(vec_32x4, row_32x4));
col += 4;
}
      int32_t sum = ReduceInt32x4(dotprod_32x4);
      // Scalar remainder; the pragma keeps clang from unrolling or
      // re-vectorizing this short tail loop.
#ifdef __clang__
#pragma clang loop unroll(disable) vectorize(disable)
#endif
for (; col < m_cols; ++col) {
sum += row_ptr[col] * vectors[col];
}
if (row_offset) {
sum -= row_offset;
}
*result += sum * row_scale;
++result;
}
vectors += m_cols;
}
}
#endif  // __AVX2__
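// Hybrid int8 matrix * batched int8 vector multiply-accumulate:
//   result[batch][row] +=
//       row_scale * (dot(matrix_row, vectors[batch]) - row_offset)
// Dispatches to the AVX2 implementation when built with AVX2 support,
// otherwise runs the SSSE3/SSE4.1 path below.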
void SseMatrixBatchVectorMultiplyAccumulateImpl(
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
const int8_t* __restrict__ vectors,
const float* __restrict__ scaling_factors, int n_batch,
float* __restrict__ result, const float* per_channel_scale,
const int32_t* input_offset, const int32_t* row_sums) {
#ifdef __AVX2__
Avx2MatrixBatchVectorMultiplyAccumulateImpl(
matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result,
per_channel_scale, input_offset, row_sums);
return;
#else
for (std::intptr_t batch = 0; batch < n_batch; ++batch) {
const float batch_scaling_factor = scaling_factors[batch];
const int32_t batch_offset = input_offset ? input_offset[batch] : 0;
for (std::intptr_t row = 0; row < m_rows; ++row) {
const int8_t* __restrict__ row_ptr = matrix + row * m_cols;
const float row_scale =
per_channel_scale ? per_channel_scale[row] * batch_scaling_factor
: batch_scaling_factor;
const int32_t row_offset =
row_sums && batch_offset ? batch_offset * row_sums[row] : 0;
__m128i dotprod_32x4 = _mm_setzero_si128();
std::intptr_t col = 0;
while (col < (m_cols & ~15)) {
const __m128i vec_8x16 =
_mm_loadu_si128(reinterpret_cast<const __m128i*>(vectors + col));
const __m128i row_8x16 =
_mm_loadu_si128(reinterpret_cast<const __m128i*>(row_ptr + col));
dotprod_32x4 =
_mm_add_epi32(dotprod_32x4, DotProdInt8x4x4(vec_8x16, row_8x16));
col += 16;
}
#ifdef __SSE4_1__
if (col < (m_cols & ~7)) {
const __m128i vec_16x8 = _mm_cvtepi8_epi16(
_mm_loadl_epi64(reinterpret_cast<const __m128i*>(vectors + col)));
const __m128i row_16x8 = _mm_cvtepi8_epi16(
_mm_loadl_epi64(reinterpret_cast<const __m128i*>(row_ptr + col)));
dotprod_32x4 =
_mm_add_epi32(dotprod_32x4, _mm_madd_epi16(vec_16x8, row_16x8));
col += 8;
}
if (col < (m_cols & ~3)) {
const __m128i vec_32x4 = _mm_cvtepi8_epi32(
_mm_loadu_si32(reinterpret_cast<const __m128i*>(vectors + col)));
const __m128i row_32x4 = _mm_cvtepi8_epi32(
_mm_loadu_si32(reinterpret_cast<const __m128i*>(row_ptr + col)));
dotprod_32x4 =
_mm_add_epi32(dotprod_32x4, _mm_mullo_epi32(vec_32x4, row_32x4));
col += 4;
}
#endif
int32_t sum = ReduceInt32x4(dotprod_32x4);
#if defined(__SSE4_1__) && defined(__clang__)
#pragma clang loop unroll(disable) vectorize(disable)
#endif
for (; col < m_cols; ++col) {
sum += row_ptr[col] * vectors[col];
}
if (row_offset) {
sum -= row_offset;
}
*result += sum * row_scale;
++result;
}
vectors += m_cols;
}
#endif  // __AVX2__
}
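// Runs the int8 GEMM input_to_gate_weights [n_output x n_input] times
// input [n_input x n_batch] through cpu_backend_gemm, writing the int32
// accumulators to |scratch|. |bias| may be null; |output_zp| is not used.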
void SseCpuBackendGemm(const int8_t* input, const int32_t* bias,
const int8_t* input_to_gate_weights, int32_t n_batch,
int32_t n_input, int32_t n_output, int32_t output_zp,
int32_t* scratch, CpuBackendContext* context) {
using ::tflite::cpu_backend_gemm::Gemm;
using ::tflite::cpu_backend_gemm::GemmParams;
using ::tflite::cpu_backend_gemm::MatrixParams;
MatrixParams<int8_t> lhs_params;
lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
lhs_params.rows = n_output;
lhs_params.cols = n_input;
lhs_params.cache_policy = cpu_backend_gemm::CachePolicy::kCacheIfLargeSpeedup;
MatrixParams<int8_t> rhs_params;
rhs_params.order = cpu_backend_gemm::Order::kColMajor;
rhs_params.rows = n_input;
rhs_params.cols = n_batch;
MatrixParams<int32_t> dst_params;
dst_params.order = cpu_backend_gemm::Order::kColMajor;
dst_params.rows = n_output;
dst_params.cols = n_batch;
  GemmParams<int32_t, int32_t> gemm_params;
if (bias) {
gemm_params.bias = bias;
}
cpu_backend_gemm::Gemm(lhs_params, input_to_gate_weights, rhs_params, input,
dst_params, scratch, gemm_params, context);
}
void SseMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
const int8_t* __restrict__ vectors,
const float* __restrict__ scaling_factors, int n_batch,
float* __restrict__ result) {
SseMatrixBatchVectorMultiplyAccumulateImpl(
matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result,
      /*per_channel_scale=*/nullptr, /*input_offset=*/nullptr,
      /*row_sums=*/nullptr);
}
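// Variant with an int32 |scratch| buffer. When m_rows is a multiple of 4 and
// the context does not prefer gemmlowp on x86, the int8 GEMM runs through
// cpu_backend_gemm and the int32 results in |scratch| are then scaled by the
// per-batch scaling factors; otherwise it falls back to the direct SSE path.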
void SseMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
const int8_t* __restrict__ vectors,
const float* __restrict__ scaling_factors, int n_batch, int32_t* scratch,
float* __restrict__ result, CpuBackendContext* context) {
  if (m_rows % 4 == 0 && !context->PreferGemmlowpOnX86()) {
    // Route the int8 GEMM through cpu_backend_gemm, then rescale the int32
    // accumulators by the per-batch scaling factors below.
    const int32_t* bias = static_cast<const int32_t*>(nullptr);
    SseCpuBackendGemm(vectors, bias, matrix, n_batch, m_cols, m_rows,
                      /*output_zp=*/0, scratch, context);
{
ruy::profiler::ScopeLabel label("HybridMultiplyScalingFactor");
const int total_size = n_batch * m_rows;
int i = 0;
for (; i <= total_size - 8; i += 8, result += 8) {
const float batch_scaling_factor0 = scaling_factors[i / m_rows];
const float batch_scaling_factor1 = scaling_factors[(i + 4) / m_rows];
const __m128 scaling_factor0 = _mm_set1_ps(batch_scaling_factor0);
const __m128 scaling_factor1 = _mm_set1_ps(batch_scaling_factor1);
const __m128i scratch_val0 =
_mm_loadu_si128(reinterpret_cast<const __m128i*>(scratch + i));
const __m128i scratch_val1 =
_mm_loadu_si128(reinterpret_cast<const __m128i*>(scratch + i + 4));
const __m128 float_val0 = _mm_cvtepi32_ps(scratch_val0);
const __m128 float_val1 = _mm_cvtepi32_ps(scratch_val1);
const __m128 prod0 = _mm_mul_ps(float_val0, scaling_factor0);
        // Accumulate into the existing results; |result| is not guaranteed to
        // be 16-byte aligned, so use unaligned loads and stores.
        const __m128 result0 = _mm_add_ps(_mm_loadu_ps(result), prod0);
        const __m128 prod1 = _mm_mul_ps(float_val1, scaling_factor1);
        const __m128 result1 = _mm_add_ps(_mm_loadu_ps(result + 4), prod1);
        _mm_storeu_ps(result, result0);
        _mm_storeu_ps(result + 4, result1);
}
scratch += i;
for (; i < total_size; i++) {
const float batch_scaling_factor = scaling_factors[i / m_rows];
int32_t x = *(scratch++);
*result += x * batch_scaling_factor;
++result;
}
}
return;
}
SseMatrixBatchVectorMultiplyAccumulateImpl(
matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result,
      /*per_channel_scale=*/nullptr, /*input_offset=*/nullptr,
      /*row_sums=*/nullptr);
}
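// Variant with per-channel scales and asymmetric input offsets. Matrix row
// sums are cached in |row_sums| and recomputed only when requested via
// |compute_row_sums|.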
void SseMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
const int8_t* __restrict__ vectors,
const float* __restrict__ scaling_factors, int n_batch,
float* __restrict__ result, const float* per_channel_scale,
const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
bool* compute_row_sums, CpuBackendContext* context) {
  if ((input_offset != nullptr) && (!compute_row_sums || *compute_row_sums)) {
    // Cache the matrix row sums needed to compensate for the asymmetric input
    // offset.
    SseReductionSumVector(matrix, row_sums, m_rows, m_cols);
if (compute_row_sums) {
*compute_row_sums = false;
}
}
SseMatrixBatchVectorMultiplyAccumulateImpl(
matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result,
per_channel_scale, input_offset, row_sums);
}
namespace {
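// Multiplies a block-sparse int8 matrix by a single int8 vector. The ledger
// encodes each row as the number of non-zero 16-column blocks followed by the
// index of each such block; |matrix| stores only the non-zero blocks,
// contiguously.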
inline void SseSparseMatrixVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const uint8_t* __restrict__ ledger,
const int m_rows, const int m_cols, const int8_t* __restrict__ vector,
const float batch_scaling_factor, float* __restrict__ result,
const float* per_channel_scale) {
static const std::intptr_t kBlockSize = 16;
TFLITE_DCHECK_EQ(m_cols % kBlockSize, 0);
const uint8_t* __restrict__ ledger_ptr = ledger;
for (std::intptr_t row = 0; row < m_rows; ++row) {
__m128i dotprod_32x4 = _mm_setzero_si128();
std::intptr_t num_nonzero_blocks = *ledger_ptr++;
for (std::intptr_t i = 0; i < num_nonzero_blocks; i++) {
const std::intptr_t col_index = *ledger_ptr++ * kBlockSize;
const __m128i vec_8x16 =
_mm_loadu_si128(reinterpret_cast<const __m128i*>(vector + col_index));
const __m128i row_8x16 =
_mm_loadu_si128(reinterpret_cast<const __m128i*>(matrix));
dotprod_32x4 =
_mm_add_epi32(dotprod_32x4, DotProdInt8x4x4(vec_8x16, row_8x16));
matrix += kBlockSize;
}
int32_t dotprod = ReduceInt32x4(dotprod_32x4);
const float total_scaling_factor =
per_channel_scale ? per_channel_scale[row] * batch_scaling_factor
: batch_scaling_factor;
result[row] += dotprod * total_scaling_factor;
}
}
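// Same as above, but multiplies the sparse matrix by four vectors (a batch of
// 4) at once, so each row's ledger entries and matrix blocks are read only
// once.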
inline void SseSparseMatrix4VectorsMultiplyAccumulate(
const int8_t* __restrict__ matrix, const uint8_t* __restrict__ ledger,
const int m_rows, const int m_cols,
const int8_t* __restrict__ const vectors,
const __m128 batch_scaling_factors_fx4, float* __restrict__ const results,
const float* per_channel_scale) {
static const std::intptr_t kBlockSize = 16;
TFLITE_DCHECK_EQ(m_cols % kBlockSize, 0);
const int8_t* __restrict__ vector0 = vectors + 0 * m_cols;
const int8_t* __restrict__ vector1 = vectors + 1 * m_cols;
const int8_t* __restrict__ vector2 = vectors + 2 * m_cols;
const int8_t* __restrict__ vector3 = vectors + 3 * m_cols;
float* __restrict__ result0 = results + 0 * m_rows;
float* __restrict__ result1 = results + 1 * m_rows;
float* __restrict__ result2 = results + 2 * m_rows;
float* __restrict__ result3 = results + 3 * m_rows;
for (std::intptr_t row = 0; row < m_rows; ++row) {
__m128i dp0_32x4 = _mm_setzero_si128();
__m128i dp1_32x4 = _mm_setzero_si128();
__m128i dp2_32x4 = _mm_setzero_si128();
__m128i dp3_32x4 = _mm_setzero_si128();
std::intptr_t num_nonzero_blocks = *ledger++;
for (std::intptr_t i = 0; i < num_nonzero_blocks; i++) {
const std::intptr_t col_index = *ledger++ * kBlockSize;
const __m128i vec0_8x16 = _mm_loadu_si128(
reinterpret_cast<const __m128i*>(vector0 + col_index));
const __m128i vec1_8x16 = _mm_loadu_si128(
reinterpret_cast<const __m128i*>(vector1 + col_index));
const __m128i vec2_8x16 = _mm_loadu_si128(
reinterpret_cast<const __m128i*>(vector2 + col_index));
const __m128i vec3_8x16 = _mm_loadu_si128(
reinterpret_cast<const __m128i*>(vector3 + col_index));
const __m128i row_8x16 =
_mm_loadu_si128(reinterpret_cast<const __m128i*>(matrix));
dp0_32x4 = _mm_add_epi32(dp0_32x4, DotProdInt8x4x4(vec0_8x16, row_8x16));
dp1_32x4 = _mm_add_epi32(dp1_32x4, DotProdInt8x4x4(vec1_8x16, row_8x16));
dp2_32x4 = _mm_add_epi32(dp2_32x4, DotProdInt8x4x4(vec2_8x16, row_8x16));
dp3_32x4 = _mm_add_epi32(dp3_32x4, DotProdInt8x4x4(vec3_8x16, row_8x16));
matrix += kBlockSize;
}
const __m128i dp_32x4 =
ReduceInt32x4x4(dp0_32x4, dp1_32x4, dp2_32x4, dp3_32x4);
const __m128 dp_fx4 = _mm_cvtepi32_ps(dp_32x4);
__m128 result_fx4 =
_mm_set_ps(result3[row], result2[row], result1[row], result0[row]);
const __m128 total_scaling_factors_fx4 =
per_channel_scale ? _mm_mul_ps(batch_scaling_factors_fx4,
_mm_set1_ps(per_channel_scale[row]))
: batch_scaling_factors_fx4;
result_fx4 =
_mm_add_ps(result_fx4, _mm_mul_ps(dp_fx4, total_scaling_factors_fx4));
result0[row] = GetFloatVectorElement<0>(result_fx4);
result1[row] = GetFloatVectorElement<1>(result_fx4);
result2[row] = GetFloatVectorElement<2>(result_fx4);
result3[row] = GetFloatVectorElement<3>(result_fx4);
}
}
}  // namespace
void SseSparseMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const uint8_t* __restrict__ ledger,
const int m_rows, const int m_cols, const int8_t* __restrict__ vectors,
const float* __restrict__ scaling_factors, int n_batch,
float* __restrict__ results, const float* per_channel_scale) {
int batch = 0;
const int kBatchSize4 = 4;
const int n_batch_rounddown_to_batchsize_4 = n_batch & ~(kBatchSize4 - 1);
while (batch < n_batch_rounddown_to_batchsize_4) {
const __m128 scaling_factors_fx4 = _mm_loadu_ps(scaling_factors + batch);
SseSparseMatrix4VectorsMultiplyAccumulate(matrix, ledger, m_rows, m_cols,
vectors, scaling_factors_fx4,
results, per_channel_scale);
batch += kBatchSize4;
vectors += kBatchSize4 * m_cols;
results += kBatchSize4 * m_rows;
}
while (batch < n_batch) {
SseSparseMatrixVectorMultiplyAccumulate(matrix, ledger, m_rows, m_cols,
vectors, scaling_factors[batch],
results, per_channel_scale);
++batch;
vectors += m_cols;
results += m_rows;
}
}
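// Sums each row of the int8 [output_size x reduction_size] matrix
// |input_vector| into |output_vector|, 16 columns at a time, with smaller
// tails handled separately.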
void SseReductionSumVector(const int8_t* input_vector, int32_t* output_vector,
const int output_size, const int reduction_size) {
static constexpr std::intptr_t kBlockSize = 16;
for (std::intptr_t row = 0; row < output_size; ++row) {
const int8_t* __restrict__ row_ptr = input_vector + row * reduction_size;
__m128i row_sum_16x8 = _mm_setzero_si128();
std::intptr_t col = 0;
for (; col < (reduction_size & ~(kBlockSize - 1)); col += kBlockSize) {
const __m128i row_8x16 =
_mm_loadu_si128(reinterpret_cast<const __m128i*>(row_ptr + col));
      // With an all-ones unsigned operand, maddubs pairwise-adds the signed
      // int8 elements of the row into int16 lanes.
      const __m128i row_16x8 = _mm_maddubs_epi16(_mm_set1_epi8(1), row_8x16);
      row_sum_16x8 = _mm_add_epi16(row_sum_16x8, row_16x8);
}
#ifdef __SSE4_1__
if (col < (reduction_size & ~7)) {
const __m128i row_16x8 = _mm_cvtepi8_epi16(
_mm_loadl_epi64(reinterpret_cast<const __m128i*>(row_ptr + col)));
row_sum_16x8 = _mm_add_epi16(row_sum_16x8, row_16x8);
col += 8;
}
#endif
const __m128i row_sum_32x4 =
_mm_madd_epi16(row_sum_16x8, _mm_set1_epi16(1));
int32_t row_sum = ReduceInt32x4(row_sum_32x4);
#if defined(__SSE4_1__) && defined(__clang__)
#pragma clang loop unroll(disable) vectorize(disable)
#endif
for (; col < reduction_size; col++) {
row_sum += row_ptr[col];
}
output_vector[row] = row_sum;
}
}
}  // namespace tensor_utils
}  // namespace tflite

#endif  // __SSSE3__