#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>
#include "xnnpack/common.h"
#include "xnnpack/dwconv.h"
#include "xnnpack/gemm.h"
#include "xnnpack/igemm.h"
#include "xnnpack/intrinsics-polyfill.h"
#include "xnnpack/lut.h"
#include "xnnpack/math.h"
#include "xnnpack/microparams.h"
#include "xnnpack/packw.h"
#include "xnnpack/prelu.h"
#include "xnnpack/reduce.h"
#include "xnnpack/simd/f32-avx.h"
#include "xnnpack/transpose.h"
#include "xnnpack/unaligned.h"
#include "xnnpack/vbinary.h"
#include "xnnpack/vcvt.h"
#include "xnnpack/vlrelu.h"
#include "xnnpack/vunary.h"
// F16 -> F32 vector-convert microkernel. Suffix encodes: AVX ISA, int16-based
// load strategy, 16 elements per main-loop iteration. XNN_OOB_READS marks it
// as allowed to read (but not use) bytes past the end of `input`.
// NOTE(review): body elided in this view; `batch` is presumably a byte count
// per XNNPack convention — confirm against the upstream source.
void xnn_f16_f32_vcvt_ukernel__avx_int16_u16(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// FP32 depthwise convolution with fused min/max clamp. Name encodes a 25-tap
// unipass kernel ("25p") and an 8-channel tile ("8c"), AVX ISA. `input` is an
// array of row pointers; `zero` presumably substitutes for padding rows — bodies
// of all kernels in this family are elided in this view.
void xnn_f32_dwconv_minmax_ukernel_25p8c__avx(
size_t channels,
size_t output_width,
const float** input,
const float* weights,
float* output,
intptr_t input_stride,
size_t output_increment,
size_t input_offset,
const float* zero,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// 3-tap unipass, 16-channel tile variant.
void xnn_f32_dwconv_minmax_ukernel_3p16c__avx(
size_t channels,
size_t output_width,
const float** input,
const float* weights,
float* output,
intptr_t input_stride,
size_t output_increment,
size_t input_offset,
const float* zero,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// 4-tap unipass, 16-channel tile variant.
void xnn_f32_dwconv_minmax_ukernel_4p16c__avx(
size_t channels,
size_t output_width,
const float** input,
const float* weights,
float* output,
intptr_t input_stride,
size_t output_increment,
size_t input_offset,
const float* zero,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// Multipass depthwise conv: suffix "6f6m7l8c8s4r" encodes the first/middle/last
// pass tap counts and channel tiling. Unlike the unipass variants it takes a
// runtime `kernel_size` and a caller-provided scratch `buffer` for partial sums.
void xnn_f32_dwconv_minmax_ukernel_6f6m7l8c8s4r__avx(
size_t channels,
size_t output_width,
const float** input,
const float* weights,
float* output,
intptr_t input_stride,
size_t output_increment,
size_t input_offset,
const float* zero,
size_t kernel_size,
float* buffer,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// 9-tap unipass, 16-channel tile variant.
void xnn_f32_dwconv_minmax_ukernel_9p16c__avx(
size_t channels,
size_t output_width,
const float** input,
const float* weights,
float* output,
intptr_t input_stride,
size_t output_increment,
size_t input_offset,
const float* zero,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// F32 -> F16 vector-convert microkernel: AVX, 24 elements per main-loop
// iteration (per name). `output` receives packed IEEE half-precision values
// through a void pointer — body elided in this view.
void xnn_f32_f16_vcvt_ukernel__avx_u24(
size_t batch,
const float* input,
void* output,
const union xnn_f32_f16_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// FP32 GEMM microkernel with fused min/max clamp: 1x16 output tile (MRxNR),
// broadcast loads of packed weights `w`, AVX. Strides are presumably in bytes
// per XNNPack convention — bodies of this family are elided in this view.
void xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast(
size_t mr,
size_t nc,
size_t kc,
const float* restrict a,
size_t a_stride,
const float* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// 5x16 output tile variant.
void xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast(
size_t mr,
size_t nc,
size_t kc,
const float* restrict a,
size_t a_stride,
const float* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Indirect GEMM (IGEMM): `a` is an indirection array of row pointers (`ks`
// entries per output row); `zero` presumably stands in for padding rows,
// offset by `a_offset`. 1x16 tile.
void xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast(
size_t mr,
size_t nc,
size_t kc,
size_t ks,
const float** restrict a,
const float* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
size_t a_offset,
const float* zero,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// 5x16 indirect-GEMM variant.
void xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast(
size_t mr,
size_t nc,
size_t kc,
size_t ks,
const float** restrict a,
const float* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
size_t a_offset,
const float* zero,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
static const int32_t mask_table[14] = …;
// PReLU microkernel: applies per-channel negative-slope `weights` to `input`,
// processing a 2-row by 16-channel tile per iteration (per name), AVX.
// NOTE(review): strides are presumably byte counts (XNNPack convention) and
// the body is elided in this view — confirm upstream.
void xnn_f32_prelu_ukernel__avx_2x16(
size_t rows,
size_t channels,
const float* restrict input,
size_t input_stride,
const float* restrict weights,
float* restrict output,
size_t output_stride)
{ … }
// FP32-activation GEMM with channelwise 4-bit quantized weights ("qc4w"):
// 1x16 tile, broadcast weight loads, fused min/max clamp — bodies of this
// family are elided in this view.
void xnn_f32_qc4w_gemm_minmax_ukernel_1x16__avx_broadcast(
size_t mr,
size_t nc,
size_t kc,
const float* restrict a,
size_t a_stride,
const void* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_f32_qc4w_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// 3x16 tile qc4w variant.
void xnn_f32_qc4w_gemm_minmax_ukernel_3x16__avx_broadcast(
size_t mr,
size_t nc,
size_t kc,
const float* restrict a,
size_t a_stride,
const void* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_f32_qc4w_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// FP32-activation GEMM with channelwise 8-bit quantized weights ("qc8w"),
// 1x16 tile.
void xnn_f32_qc8w_gemm_minmax_ukernel_1x16__avx_broadcast(
size_t mr,
size_t nc,
size_t kc,
const float* restrict a,
size_t a_stride,
const void* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// 5x16 tile qc8w variant.
void xnn_f32_qc8w_gemm_minmax_ukernel_5x16__avx_broadcast(
size_t mr,
size_t nc,
size_t kc,
const float* restrict a,
size_t a_stride,
const void* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// F32 -> QS8 (signed int8) quantizing convert: AVX, 32 elements per iteration
// (scale/zero-point come from `params`) — body elided in this view.
void xnn_f32_qs8_vcvt_ukernel__avx_u32(
size_t batch,
const float* input,
int8_t* output,
const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// F32 -> QU8 (unsigned int8) quantizing convert variant.
void xnn_f32_qu8_vcvt_ukernel__avx_u32(
size_t batch,
const float* input,
uint8_t* output,
const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Multi-row sum-reduce ("rdsum"): accumulates `rows` strided rows into
// `output`; name encodes 7-row passes ("7p7x") and a 32-channel tile. `zero`
// presumably supplies padding rows — bodies of this family are elided here.
void xnn_f32_rdsum_ukernel_7p7x__avx_c32(
size_t rows,
size_t channels,
const float* input,
size_t input_stride,
const float* zero,
float* output,
const union xnn_f32_scale_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Full reduction: maximum over the input; 32 elements/iter, 4 accumulators.
void xnn_f32_rmax_ukernel__avx_u32_acc4(
size_t batch,
const float* input,
float* output,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Full reduction: presumably both minimum and maximum written via `output`.
void xnn_f32_rminmax_ukernel__avx_u32_acc4(
size_t batch,
const float* input,
float* output,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Full reduction: scaled sum (scale factor from `params`).
void xnn_f32_rsum_ukernel__avx_u32_acc4(
size_t batch,
const float* input,
float* output,
const union xnn_f32_scale_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// FP32 elementwise binary kernels, AVX, 16 floats per iteration ("u16").
// The "c" suffix marks the broadcast-operand variants: `input_b` presumably
// points to a single scalar (XNNPack convention — confirm upstream); the "r"
// prefix (vrdivc/vrsubc) reverses operand order. Bodies are elided here.
// Elementwise add with fused min/max clamp.
void xnn_f32_vadd_minmax_ukernel__avx_u16(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Broadcast-operand add.
void xnn_f32_vaddc_minmax_ukernel__avx_u16(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Elementwise divide with clamp.
void xnn_f32_vdiv_minmax_ukernel__avx_u16(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Broadcast-operand divide.
void xnn_f32_vdivc_minmax_ukernel__avx_u16(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Elementwise maximum (no clamp — default params).
void xnn_f32_vmax_ukernel__avx_u16(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Broadcast-operand maximum.
void xnn_f32_vmaxc_ukernel__avx_u16(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Elementwise minimum.
void xnn_f32_vmin_ukernel__avx_u16(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Broadcast-operand minimum.
void xnn_f32_vminc_ukernel__avx_u16(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Elementwise multiply with clamp.
void xnn_f32_vmul_minmax_ukernel__avx_u16(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Broadcast-operand multiply.
void xnn_f32_vmulc_minmax_ukernel__avx_u16(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Reversed broadcast divide (scalar / vector).
void xnn_f32_vrdivc_minmax_ukernel__avx_u16(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Reversed broadcast subtract (scalar - vector).
void xnn_f32_vrsubc_minmax_ukernel__avx_u16(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Elementwise squared difference: (a - b)^2.
void xnn_f32_vsqrdiff_ukernel__avx_u16(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Broadcast-operand squared difference.
void xnn_f32_vsqrdiffc_ukernel__avx_u16(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Elementwise subtract with clamp.
void xnn_f32_vsub_minmax_ukernel__avx_u16(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Broadcast-operand subtract.
void xnn_f32_vsubc_minmax_ukernel__avx_u16(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// FP32 elementwise unary kernels, AVX. Bodies are elided in this view; the
// name suffixes encode the algorithm variant and per-iteration unroll.
// Clamp each element to the [min, max] range from `params`; 16 floats/iter.
void xnn_f32_vclamp_ukernel__avx_u16(
size_t batch,
const float* input,
float* output,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// ELU activation; suffix names the approximation: rr2 range reduction, 4-entry
// permute LUT, degree-4 polynomial, 32 elements/iter.
void xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_u32(
size_t batch,
const float* input,
float* output,
const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Hard-swish activation.
void xnn_f32_vhswish_ukernel__avx_u16(
size_t batch,
const float* input,
float* output,
const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Leaky ReLU (negative slope from `params`).
void xnn_f32_vlrelu_ukernel__avx_u16(
size_t batch,
const float* input,
float* output,
const union xnn_f32_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Round down (toward -infinity).
void xnn_f32_vrndd_ukernel__avx_u16(
size_t batch,
const float* input,
float* output,
const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Round to nearest, ties to even.
void xnn_f32_vrndne_ukernel__avx_u16(
size_t batch,
const float* input,
float* output,
const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Round up (toward +infinity).
void xnn_f32_vrndu_ukernel__avx_u16(
size_t batch,
const float* input,
float* output,
const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Round toward zero (truncate).
void xnn_f32_vrndz_ukernel__avx_u16(
size_t batch,
const float* input,
float* output,
const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Reciprocal square root; "rsqrt" suffix indicates a hardware-approximation
// path (presumably refined — body elided).
void xnn_f32_vrsqrt_ukernel__avx_rsqrt_u16(
size_t batch,
const float* input,
float* output,
const union xnn_f32_rsqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// Sigmoid; suffix: rr2 range reduction, degree-5 polynomial, 2 Newton-Raphson
// reciprocal refinement steps, 40 elements/iter.
void xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_u40(
size_t batch,
const float* input,
float* output,
const union xnn_f32_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
// Square root via the rsqrt-approximation path.
void xnn_f32_vsqrt_ukernel__avx_rsqrt_u16(
size_t batch, const float* input, float* output,
const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)])
XNN_OOB_READS { … }
// GEMM: dynamically-quantized int8 activations ("qd8") with blockwise 4-bit
// weights ("qb4w"), FP32 output; 1x4 tile, K unrolled by 8 ("c8"), 128-bit
// weight loads. Per-row quantization scales/zero-points arrive via
// `quantization_params`. Bodies of this family are elided in this view.
void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x4c8__avx_ld128(
size_t mr,
size_t nc,
size_t kc,
const int8_t* restrict a,
size_t a_stride,
const void* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_f32_qb4w_minmax_params params[restrict XNN_MIN_ELEMENTS(1)],
const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// 4x4 tile qb4w variant.
void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x4c8__avx_ld128(
size_t mr,
size_t nc,
size_t kc,
const int8_t* restrict a,
size_t a_stride,
const void* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_f32_qb4w_minmax_params params[restrict XNN_MIN_ELEMENTS(1)],
const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// Channelwise 4-bit weights ("qc4w"), 1x4 tile.
void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__avx_ld128(
size_t mr,
size_t nc,
size_t kc,
const int8_t* restrict a,
size_t a_stride,
const void* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_f32_qc4w_minmax_params params[restrict XNN_MIN_ELEMENTS(1)],
const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// Channelwise 4-bit weights, 4x4 tile.
void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__avx_ld128(
size_t mr,
size_t nc,
size_t kc,
const int8_t* restrict a,
size_t a_stride,
const void* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_f32_qc4w_minmax_params params[restrict XNN_MIN_ELEMENTS(1)],
const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// Channelwise 8-bit weights ("qc8w"), 1x4 tile.
void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__avx_ld128(
size_t mr,
size_t nc,
size_t kc,
const int8_t* restrict a,
size_t a_stride,
const void* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)],
const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// Channelwise 8-bit weights, 2x4 tile.
void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__avx_ld128(
size_t mr,
size_t nc,
size_t kc,
const int8_t* restrict a,
size_t a_stride,
const void* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)],
const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// Indirect GEMM (IGEMM) with qc8w weights: `a` is an indirection array of `ks`
// row pointers; `zero`/`zero_data` presumably substitute for padding rows.
void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4c8__avx_ld128(
size_t mr,
size_t nc,
size_t kc,
size_t ks,
const int8_t** restrict a,
const void* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
size_t a_offset,
const int8_t* zero,
const int8_t* zero_data,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)],
const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// 2x4 tile IGEMM variant.
void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x4c8__avx_ld128(
size_t mr,
size_t nc,
size_t kc,
size_t ks,
const int8_t** restrict a,
const void* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
size_t a_offset,
const int8_t* zero,
const int8_t* zero_data,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)],
const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// QS16 (int16) -> QS8 (int8) requantizing convert: AVX, 16 elements per
// iteration — body elided in this view.
void xnn_qs16_qs8_vcvt_ukernel__avx_u16(
size_t batch,
const int16_t* input,
int8_t* output,
const union xnn_qs16_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// QS8 depthwise conv with FP32 requantization and fused clamp: 25-tap unipass,
// 16-channel tile, 16-bit multiply+16-bit add strategy ("mul16_add16").
// Bodies of this family are elided in this view.
void xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16_add16(
size_t channels,
size_t output_width,
const int8_t** input,
const void* weights,
int8_t* output,
intptr_t input_stride,
size_t output_increment,
size_t input_offset,
const int8_t* zero,
const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// 9-tap unipass variant.
void xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16_add16(
size_t channels,
size_t output_width,
const int8_t** input,
const void* weights,
int8_t* output,
intptr_t input_stride,
size_t output_increment,
size_t input_offset,
const int8_t* zero,
const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// QS8 (int8) -> F32 dequantizing convert, 32 elements/iter.
void xnn_qs8_f32_vcvt_ukernel__avx_u32(
size_t batch,
const int8_t* input,
float* output,
const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// QS8 depthwise conv with per-channel-quantized weights ("qc8w"): 25-tap.
void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16_add16(
size_t channels,
size_t output_width,
const int8_t** input,
const void* weights,
int8_t* output,
intptr_t input_stride,
size_t output_increment,
size_t input_offset,
const int8_t* zero,
const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// qc8w depthwise conv, 3-tap.
void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__avx_mul16_add16(
size_t channels,
size_t output_width,
const int8_t** input,
const void* weights,
int8_t* output,
intptr_t input_stride,
size_t output_increment,
size_t input_offset,
const int8_t* zero,
const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// qc8w depthwise conv, 9-tap.
void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16_add16(
size_t channels,
size_t output_width,
const int8_t** input,
const void* weights,
int8_t* output,
intptr_t input_stride,
size_t output_increment,
size_t input_offset,
const int8_t* zero,
const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// QS8 GEMM with channelwise-quantized weights ("qc8w") and FP32 requantization:
// 1x4 tile, K unrolled by 8 ("c8"), 128-bit loads, fused clamp — bodies of
// this family are elided in this view.
void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128(
size_t mr,
size_t nc,
size_t kc,
const int8_t* restrict a,
size_t a_stride,
const void* restrict w,
int8_t* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// 2x4 tile GEMM variant.
void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128(
size_t mr,
size_t nc,
size_t kc,
const int8_t* restrict a,
size_t a_stride,
const void* restrict w,
int8_t* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// Indirect GEMM: `a` holds `ks` row pointers per output row; `zero` presumably
// substitutes for padding rows via `a_offset`.
void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128(
size_t mr,
size_t nc,
size_t kc,
size_t ks,
const int8_t** restrict a,
const void* restrict w,
int8_t* restrict c,
size_t cm_stride,
size_t cn_stride,
size_t a_offset,
const int8_t* zero,
const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// 2x4 tile IGEMM variant.
void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128(
size_t mr,
size_t nc,
size_t kc,
size_t ks,
const int8_t** restrict a,
const void* restrict w,
int8_t* restrict c,
size_t cm_stride,
size_t cn_stride,
size_t a_offset,
const int8_t* zero,
const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// QS8 elementwise kernels. The "c" suffix marks broadcast-operand variants
// (`input_b` presumably points to a single value — confirm upstream); suffixes
// encode the multiply width, load width, and unroll. Bodies elided here.
// Elementwise int8 add with clamping: 32-bit multiply, 32-bit loads, 8/iter.
void xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_u8(
size_t batch,
const int8_t* input_a,
const int8_t* input_b,
int8_t* output,
const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// Broadcast-operand add.
void xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_u8(
size_t batch,
const int8_t* input_a,
const int8_t* input_b,
int8_t* output,
const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// QS8 -> QS8 requantizing convert (scale/zero-point change), 32/iter.
void xnn_qs8_vcvt_ukernel__avx_u32(
size_t batch,
const int8_t* input,
int8_t* output,
const union xnn_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// Quantized leaky ReLU.
void xnn_qs8_vlrelu_ukernel__avx_u32(
size_t batch,
const int8_t* input,
int8_t* output,
const union xnn_qs8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// Elementwise multiply with FP32 requantization: 16-bit multiply, 64-bit loads.
void xnn_qs8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_u16(
size_t batch,
const int8_t* input_a,
const int8_t* input_b,
int8_t* output,
const union xnn_qs8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// Broadcast-operand multiply.
void xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_u16(
size_t batch,
const int8_t* input_a,
const int8_t* input_b,
int8_t* output,
const union xnn_qs8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
// QU8 (unsigned int8) kernel family plus weight-packing, transpose, LUT, and
// remaining F32 unary kernels. All bodies are elided in this view; several
// definitions below start on the same line where the previous body closes —
// original (generated) line layout is preserved byte-for-byte.
// QU8 depthwise conv, FP32 requantization: 25-tap, 16-channel tile, 16-bit muls.
void xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16(
size_t channels,
size_t output_width,
const uint8_t** input,
const void* weights,
uint8_t* output,
intptr_t input_stride,
size_t output_increment,
size_t input_offset,
const uint8_t* zero,
const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
// Next: 9-tap QU8 depthwise conv variant.
{ … }void xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16(
size_t channels,
size_t output_width,
const uint8_t** input,
const void* weights,
uint8_t* output,
intptr_t input_stride,
size_t output_increment,
size_t input_offset,
const uint8_t* zero,
const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
// Next: QU8 -> F32 dequantizing convert, 32 elements/iter.
{ … }void xnn_qu8_f32_vcvt_ukernel__avx_u32(
size_t batch,
const uint8_t* input,
float* output,
const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
// Next: QU8 GEMM, FP32 requantization, 1x4 tile, c8 K-unroll, 128-bit loads.
{ … }void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128(
size_t mr,
size_t nc,
size_t kc,
const uint8_t* restrict a,
size_t a_stride,
const void* restrict w,
uint8_t* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
// Next: 2x4 tile QU8 GEMM variant.
{ … }void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128(
size_t mr,
size_t nc,
size_t kc,
const uint8_t* restrict a,
size_t a_stride,
const void* restrict w,
uint8_t* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
// Next: QU8 indirect GEMM (`a` is a row-pointer array; `zero` for padding).
{ … }void xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128(
size_t mr,
size_t nc,
size_t kc,
size_t ks,
const uint8_t** restrict a,
const void* restrict w,
uint8_t* restrict c,
size_t cm_stride,
size_t cn_stride,
size_t a_offset,
const uint8_t* zero,
const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
// Next: 2x4 tile QU8 IGEMM variant.
{ … }void xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128(
size_t mr,
size_t nc,
size_t kc,
size_t ks,
const uint8_t** restrict a,
const void* restrict w,
uint8_t* restrict c,
size_t cm_stride,
size_t cn_stride,
size_t a_offset,
const uint8_t* zero,
const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
// Next: QU8 elementwise add with clamp; 32-bit muls/loads, 8/iter.
{ … }void xnn_qu8_vadd_minmax_ukernel__avx_mul32_ld32_u8(
size_t batch,
const uint8_t* input_a,
const uint8_t* input_b,
uint8_t* output,
const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
// Next: broadcast-operand QU8 add ("c" suffix — `input_b` presumably scalar).
{ … }void xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_u8(
size_t batch,
const uint8_t* input_a,
const uint8_t* input_b,
uint8_t* output,
const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
// Next: QU8 -> QU8 requantizing convert, 32/iter.
{ … }void xnn_qu8_vcvt_ukernel__avx_u32(
size_t batch,
const uint8_t* input,
uint8_t* output,
const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
// Next: quantized leaky ReLU (QU8).
{ … }void xnn_qu8_vlrelu_ukernel__avx_u32(
size_t batch,
const uint8_t* input,
uint8_t* output,
const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
// Next: QU8 elementwise multiply with FP32 requantization; 16-bit muls.
{ … }void xnn_qu8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_u16(
size_t batch,
const uint8_t* input_a,
const uint8_t* input_b,
uint8_t* output,
const union xnn_qu8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
// Next: broadcast-operand QU8 multiply.
{ … }void xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_u16(
size_t batch,
const uint8_t* input_a,
const uint8_t* input_b,
uint8_t* output,
const union xnn_qu8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
// Next: packs 32-bit GOI-layout weights (plus optional bias) into the layout
// the x16 GEMM microkernels expect; nr/kr/sr describe the target tiling.
{ … }void xnn_x32_packw_gemm_goi_ukernel_x16__avx_u4(
size_t g,
size_t nc,
size_t kc,
size_t nr,
size_t kr,
size_t sr,
const uint32_t* weights,
const uint32_t* bias,
const void* scale,
uint32_t* packed_weights,
size_t extra_bytes,
const void* params)
// Next: x16s4 (shuffled) packing variant.
{ … }void xnn_x32_packw_gemm_goi_ukernel_x16s4__avx_u4(
size_t g,
size_t nc,
size_t kc,
size_t nr,
size_t kr,
size_t sr,
const uint32_t* weights,
const uint32_t* bias,
const void* scale,
uint32_t* packed_weights,
size_t extra_bytes,
const void* params)
// Next: tiled 8x8 transpose of 32-bit elements; "reuse_multi" names the
// register-blocking strategy. Strides presumably in bytes — confirm upstream.
{ … }void xnn_x32_transposec_ukernel__8x8_reuse_multi_avx(
const uint32_t* input,
uint32_t* output,
size_t input_stride,
size_t output_stride,
size_t block_width,
size_t block_height,
const union xnn_x32_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
// Next: 4x4 tiled transpose of 64-bit elements.
{ … }void xnn_x64_transposec_ukernel__4x4_reuse_multi_avx(
const uint64_t* input,
uint64_t* output,
size_t input_stride,
size_t output_stride,
size_t block_width,
size_t block_height,
const union xnn_x64_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
// Next: byte-wise 256-entry lookup-table transform, 64 bytes/iter.
{ … }void xnn_x8_lut_ukernel__avx_u64(
size_t batch,
const uint8_t* input,
uint8_t* output,
const uint8_t table[restrict XNN_MIN_ELEMENTS(256)])
// Next: elementwise absolute value (FP32), 16/iter.
{ … }void xnn_f32_vabs_ukernel__avx_u16(
size_t batch,
const float* input,
float* output,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
// Next: elementwise copysign(mag, sign).
{ … }void xnn_f32_vcopysign_ukernel__avx_u16(
size_t batch,
const float* mag,
const float* sign,
float* output,
const union xnn_f32_default_params unused_params[restrict XNN_MIN_ELEMENTS(1)])
// Next: broadcast-sign copysign variant ("c" suffix).
{ … }void xnn_f32_vcopysignc_ukernel__avx_u16(
size_t batch,
const float* mag,
const float* sign,
float* output,
const union xnn_f32_default_params unused_params[restrict XNN_MIN_ELEMENTS(1)])
// Next: GELU via a degree-12/10 rational approximation with division.
{ … }void xnn_f32_vgelu_ukernel__avx_rational_12_10_div_u16(
size_t batch,
const float* input,
float* output,
const union xnn_f32_default_params unused_params[restrict XNN_MIN_ELEMENTS(1)])
// Next: elementwise negation.
{ … }void xnn_f32_vneg_ukernel__avx_u16(
size_t batch,
const float* input,
float* output,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
// Next: reversed broadcast copysign (note `sign` precedes `mag` here).
{ … }void xnn_f32_vrcopysignc_ukernel__avx_u16(
size_t batch,
const float* sign,
const float* mag,
float* output,
const union xnn_f32_default_params unused_params[restrict XNN_MIN_ELEMENTS(1)])
// Next: elementwise square (x * x).
{ … }void xnn_f32_vsqr_ukernel__avx_u16(
size_t batch,
const float* input,
float* output,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
// Next: tanh via a degree-9/6 rational approximation with division.
{ … }void xnn_f32_vtanh_ukernel__avx_rational_9_6_div_u16(
size_t batch,
const float* input,
float* output,
const union xnn_f32_tanh_params unused_params[restrict XNN_MIN_ELEMENTS(1)])
{ … }