#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>
#include "xnnpack/avgpool.h"
#include "xnnpack/common.h"
#include "xnnpack/conv.h"
#include "xnnpack/dwconv.h"
#include "xnnpack/gavgpool.h"
#include "xnnpack/gemm.h"
#include "xnnpack/ibilinear.h"
#include "xnnpack/igemm.h"
#include "xnnpack/intrinsics-polyfill.h"
#include "xnnpack/math.h"
#include "xnnpack/maxpool.h"
#include "xnnpack/pavgpool.h"
#include "xnnpack/reduce.h"
#include "xnnpack/spmm.h"
#include "xnnpack/transpose.h"
#include "xnnpack/vbinary.h"
#include "xnnpack/vmulcaddc.h"
#include "xnnpack/vunary.h"
void xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4(
size_t output_pixels,
size_t kernel_elements,
size_t channels,
const float** input,
size_t input_offset,
const float* zero,
float* buffer,
float* output,
size_t input_increment,
size_t output_increment,
const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_avgpool_minmax_ukernel_9x__sse_c4(
size_t output_pixels,
size_t kernel_elements,
size_t channels,
const float** input,
size_t input_offset,
const float* zero,
float* output,
size_t input_increment,
size_t output_increment,
const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2(
size_t input_height,
size_t input_width,
size_t output_y_start,
size_t output_y_end,
const float* input,
const float* zero,
const float* weights,
float* output,
size_t input_padding_top,
size_t output_channels,
size_t output_height_stride,
size_t output_channel_stride,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_dwconv_minmax_ukernel_25p8c__sse(
size_t channels,
size_t output_width,
const float** input,
const float* weights,
float* output,
intptr_t input_stride,
size_t output_increment,
size_t input_offset,
const float* zero,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_dwconv_minmax_ukernel_3p8c__sse(
size_t channels,
size_t output_width,
const float** input,
const float* weights,
float* output,
intptr_t input_stride,
size_t output_increment,
size_t input_offset,
const float* zero,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_dwconv_minmax_ukernel_4p8c__sse(
size_t channels,
size_t output_width,
const float** input,
const float* weights,
float* output,
intptr_t input_stride,
size_t output_increment,
size_t input_offset,
const float* zero,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_dwconv_minmax_ukernel_8f8m9l16c4s4r__sse(
size_t channels,
size_t output_width,
const float** input,
const float* weights,
float* output,
intptr_t input_stride,
size_t output_increment,
size_t input_offset,
const float* zero,
size_t kernel_size,
float* buffer,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_dwconv_minmax_ukernel_9p8c__sse(
size_t channels,
size_t output_width,
const float** input,
const float* weights,
float* output,
intptr_t input_stride,
size_t output_increment,
size_t input_offset,
const float* zero,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2(
size_t input_height,
size_t input_width,
const float* input,
const float* weights,
const float* zero,
float* output,
uint32_t padding_top,
const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3(
size_t input_height,
size_t input_width,
const float* input,
const float* weights,
const float* zero,
float* output,
uint32_t padding_top,
const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4(
size_t input_height,
size_t input_width,
const float* input,
const float* weights,
const float* zero,
float* output,
uint32_t padding_top,
const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4(
size_t input_height,
size_t input_width,
const float* input,
const float* weights,
const float* zero,
float* output,
uint32_t padding_top,
const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_gavgpool_cw_ukernel__sse_u4(
size_t elements,
size_t channels,
const float* input,
float* output,
const union xnn_f32_gavgpool_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4(
size_t rows,
size_t channels,
const float* input,
size_t input_stride,
const float* zero,
float* buffer,
float* output,
const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4(
size_t rows,
size_t channels,
const float* input,
size_t input_stride,
const float* zero,
float* output,
const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_gemm_minmax_ukernel_1x8__sse_load1(
size_t mr,
size_t nc,
size_t kc,
const float* restrict a,
size_t a_stride,
const float* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
void xnn_f32_gemm_minmax_ukernel_4x2c4__sse(
size_t mr,
size_t nc,
size_t kc,
const float* restrict a,
size_t a_stride,
const float* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_gemm_minmax_ukernel_4x8__sse_load1(
size_t mr,
size_t nc,
size_t kc,
const float* restrict a,
size_t a_stride,
const float* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
void xnn_f32_ibilinear_chw_ukernel__sse_p8(
size_t output_pixels,
size_t channels,
const float** restrict input,
size_t input_offset,
const float* restrict weights,
float* restrict output,
size_t input_increment) XNN_OOB_READS
{ … }
void xnn_f32_ibilinear_ukernel__sse_c8(
size_t output_pixels,
size_t channels,
const float** restrict input,
size_t input_offset,
const float* restrict weights,
float* restrict output,
size_t output_increment) XNN_OOB_READS
{ … }
void xnn_f32_igemm_minmax_ukernel_1x8__sse_load1(
size_t mr,
size_t nc,
size_t kc,
size_t ks,
const float** restrict a,
const float* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
size_t a_offset,
const float* zero,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
void xnn_f32_igemm_minmax_ukernel_4x2c4__sse(
size_t mr,
size_t nc,
size_t kc,
size_t ks,
const float** restrict a,
const float* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
size_t a_offset,
const float* zero,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_igemm_minmax_ukernel_4x8__sse_load1(
size_t mr,
size_t nc,
size_t kc,
size_t ks,
const float** restrict a,
const float* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
size_t a_offset,
const float* zero,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
void xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4(
size_t output_pixels,
size_t kernel_elements,
size_t channels,
const float** input,
size_t input_offset,
float* output,
size_t input_increment,
size_t output_increment,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_pavgpool_minmax_ukernel_9p8x__sse_c4(
size_t output_pixels,
size_t kernel_elements,
size_t channels,
const float** input,
size_t input_offset,
const float* zero,
const float* multiplier,
float* buffer,
float* output,
size_t input_increment,
size_t output_increment,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_pavgpool_minmax_ukernel_9x__sse_c4(
size_t output_pixels,
size_t kernel_elements,
size_t channels,
const float** input,
size_t input_offset,
const float* zero,
const float* multiplier,
float* output,
size_t input_increment,
size_t output_increment,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_rdsum_ukernel_7p7x__sse_c16(
size_t rows,
size_t channels,
const float* input,
size_t input_stride,
const float* zero,
float* output,
const union xnn_f32_scale_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
void xnn_f32_rmax_ukernel__sse_u16_acc4(
size_t batch,
const float* input,
float* output,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
void xnn_f32_rminmax_ukernel__sse_u16_acc4(
size_t batch,
const float* input,
float* output,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
void xnn_f32_rsum_ukernel__sse_u16_acc4(
size_t batch,
const float* input,
float* output,
const union xnn_f32_scale_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
void xnn_f32_spmm_minmax_ukernel_32x1__sse(
size_t mc,
size_t nc,
const float* input,
const float* weights,
const int32_t* widx_dmap,
const uint32_t* nidx_nnzmap,
float* output,
size_t output_stride,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{ … }
void xnn_f32_vadd_minmax_ukernel__sse_u8(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_vaddc_minmax_ukernel__sse_u8(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_vdiv_minmax_ukernel__sse_u8(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_vdivc_minmax_ukernel__sse_u8(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_vmax_ukernel__sse_u8(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_vmaxc_ukernel__sse_u8(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_vmin_ukernel__sse_u8(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_vminc_ukernel__sse_u8(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_vmul_minmax_ukernel__sse_u8(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_vmulc_minmax_ukernel__sse_u8(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_vrdivc_minmax_ukernel__sse_u8(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_vrsubc_minmax_ukernel__sse_u8(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_vsqrdiff_ukernel__sse_u8(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_vsqrdiffc_ukernel__sse_u8(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_vsub_minmax_ukernel__sse_u8(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_vsubc_minmax_ukernel__sse_u8(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_vclamp_ukernel__sse_u8(
size_t batch,
const float* input,
float* output,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_vcmul_ukernel__sse_u8(
size_t batch,
const float* input_a,
const float* input_b,
float* output,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_vhswish_ukernel__sse_u8(
size_t batch,
const float* input,
float* output,
const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_vlrelu_ukernel__sse_u8(
size_t batch,
const float* input,
float* output,
const union xnn_f32_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x(
size_t rows,
size_t channels,
const float* restrict input,
size_t input_stride,
const float* restrict weights,
float* restrict output,
size_t output_stride,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_vrsqrt_ukernel__sse_rsqrt_u8(
size_t batch,
const float* input,
float* output,
const union xnn_f32_rsqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_f32_vsqrt_ukernel__sse_rsqrt_u12(
size_t batch,
const float* input,
float* output,
const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }
void xnn_x32_transposec_ukernel__4x4_sse(
const uint32_t* input,
uint32_t* output,
size_t input_stride,
size_t output_stride,
size_t block_width,
size_t block_height,
const union xnn_x32_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{ … }