21#ifndef INCLUDE_SHL_RVV_H_
22#define INCLUDE_SHL_RVV_H_
25#include <riscv_vector.h>
27#if (__riscv_v == 1000000)
29#elif (__riscv_v == 7000)
33#ifdef __riscv_xtheadvdot
35#define SHL_USE_DOT_INT8
121void shl_rvv_conv_im2col_gemm_reorder_kernel_fp32(
struct csinn_tensor *kernel,
123void shl_rvv_conv_im2col_gemm_reorder_kernel_fp16(
struct csinn_tensor *kernel,
125void shl_rvv_conv_im2col_gemm_reorder_kernel_int8(
struct csinn_tensor *kernel,
138void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(
struct csinn_tensor *kernel,
140void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(
struct csinn_tensor *kernel,
142void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(
struct csinn_tensor *kernel,
155void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(
struct csinn_tensor *kernel,
157void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(
struct csinn_tensor *kernel,
159void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_int8(
struct csinn_tensor *kernel,
172void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp32(
struct csinn_tensor *kernel,
174void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16(
struct csinn_tensor *kernel,
176void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_int8(
struct csinn_tensor *kernel,
190void shl_rvv_conv1x1s1_gemm_reorder_kernel_fp32(
struct csinn_tensor *kernel,
192void shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16(
struct csinn_tensor *kernel,
194void shl_rvv_conv1x1s1_gemm_reorder_kernel_int8(
struct csinn_tensor *kernel,
207void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp32(
struct csinn_tensor *kernel,
209void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16(
struct csinn_tensor *kernel,
211void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int8(
struct csinn_tensor *kernel,
224void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32(
struct csinn_tensor *kernel,
226void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(
struct csinn_tensor *kernel,
228void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(
struct csinn_tensor *kernel,
241void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(
struct csinn_tensor *kernel,
243void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(
struct csinn_tensor *kernel,
245void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_int8(
struct csinn_tensor *kernel,
259void shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32(
struct csinn_tensor *src_kernel,
261void shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16(
struct csinn_tensor *src_kernel,
271void shl_rvv_wg_b4f3s1_trans_kernel_packn_fp32(
struct csinn_tensor *src_kernel,
273void shl_rvv_wg_b4f3s1_trans_kernel_packn_fp16(
struct csinn_tensor *src_kernel,
275void shl_rvv_wg_b4f3s1_trans_kernel_packn_int8(
struct csinn_tensor *src_kernel,
314void shl_rvv_dwconv_reorder_kernel_packn_fp32(
struct csinn_tensor *kernel,
316void shl_rvv_dwconv_reorder_kernel_packn_fp16(
struct csinn_tensor *kernel,
318void shl_rvv_dwconv_reorder_kernel_packn_int8(
struct csinn_tensor *kernel,
/*
 * fp32 GEMM operand-packing prototypes. Only the declarations are visible
 * here; the definitions live elsewhere.
 * NOTE(review): the leading "361"/"362" look like listing line numbers fused
 * into the text by extraction, not C tokens -- confirm against the real header.
 */
/* Takes two writable float buffers (a, sa) and three ints (m, k, ldx).
 * Presumably packs `a` into `sa` in 8-row groups ("n8") with `ldx` as the
 * leading dimension of `a` -- TODO confirm against the implementation. */
361void shl_rvv_reorder_kernel_n8_fp32(
float *a,
float *sa,
int m,
int k,
int ldx);
/* Same shape of interface for the other operand: buffers (b, sb), ints
 * (k, n, ldx). Presumably packs `b` into `sb` in 8-column groups ("z8")
 * -- TODO confirm. */
362void shl_rvv_reorder_input_z8_fp32(
float *b,
float *sb,
int k,
int n,
int ldx);
363void shl_rvv_gemm_8x8_fp32(
float *dst,
const float *sa,
const float *sb,
float *bias,
int m,
int k,
/*
 * "rvv256" fp32 prototypes (declarations only).
 * NOTE(review): "rvv256" presumably denotes a 256-bit vector-length variant;
 * the stray "366"/"367"/"368" look like fused listing line numbers -- confirm.
 */
/* Buffers (b, sb) plus ints (k, n, ldx). Presumably packs `b` into `sb` in
 * 16-column groups ("z16") -- TODO confirm. */
366void shl_rvv256_reorder_input_z16_fp32(
float *b,
float *sb,
int k,
int n,
int ldx);
/* GEMM entry point: `sa` and `sb` are const-qualified (read-only inputs);
 * `dst` and `bias` are non-const. m, k, n, ldc are presumably the usual GEMM
 * dimensions and the leading dimension of `dst` -- TODO confirm, including
 * whether `bias` may be NULL. */
367void shl_rvv256_gemm_8x16_fp32(
float *dst,
const float *sa,
const float *sb,
float *bias,
int m,
368 int k,
int n,
int ldc);
/*
 * fp16 (__fp16) GEMM prototypes: a default set followed by an "rvv256" set.
 * Declarations only; semantics below are inferred from names and must be
 * confirmed against the definitions.
 * NOTE(review): stray integers such as "370" or "373" look like fused listing
 * line numbers, not C tokens.
 */
/* Buffers (a, sa), ints (m, k, ldx); presumably packs `a` in 8-row groups. */
370void shl_rvv_reorder_kernel_n8_fp16(__fp16 *a, __fp16 *sa,
int m,
int k,
int ldx);
/* Buffers (b, sb), ints (k, n, ldx); presumably packs `b` in 16-column groups. */
371void shl_rvv_reorder_input_z16_fp16(__fp16 *b, __fp16 *sb,
int k,
int n,
int ldx);
/* GEMM: `sa`/`sb` are const inputs, `dst` and `bias` are non-const.
 * "8x16" presumably names the register tile -- TODO confirm. */
372void shl_rvv_gemm_8x16_fp16(__fp16 *dst,
const __fp16 *sa,
const __fp16 *sb, __fp16 *bias,
int m,
373 int k,
int n,
int ldc);
/* rvv256 variant; presumably packs `a` in 16-row groups ("n16"). */
375void shl_rvv256_reorder_kernel_n16_fp16(__fp16 *a, __fp16 *sa,
int m,
int k,
int ldx);
/* rvv256 variant; presumably packs `b` in 16-column groups ("z16"). */
376void shl_rvv256_reorder_input_z16_fp16(__fp16 *b, __fp16 *sb,
int k,
int n,
int ldx);
/* rvv256 GEMM with the same parameter list as shl_rvv_gemm_8x16_fp16. */
377void shl_rvv256_gemm_16x16_fp16(__fp16 *dst,
const __fp16 *sa,
const __fp16 *sb, __fp16 *bias,
378 int m,
int k,
int n,
int ldc);
/*
 * int8 "dot" packing prototypes and an int8 -> int32 GEMM prototype
 * (declarations only).
 * NOTE(review): "_dot" presumably refers to the vendor dot-product extension;
 * stray "380".."383" look like fused listing line numbers -- confirm both.
 */
/* Buffers (a, sa), ints (m, k, ldx); presumably packs `a` in 8-row groups. */
380void shl_rvv_reorder_kernel_n8_int8_dot(int8_t *a, int8_t *sa,
int m,
int k,
int ldx);
/* Buffers (b, sb), ints (k, n, ldx); presumably packs `b` in 8-column groups. */
381void shl_rvv_reorder_input_z8_int8_dot(int8_t *b, int8_t *sb,
int k,
int n,
int ldx);
/* Inputs `sa`/`sb` are const int8; `dst` and `bias` are int32 and non-const.
 * No output zero-point/multiplier/shift parameters appear in this signature,
 * so the result is presumably left as raw int32 accumulators -- TODO confirm. */
382void shl_rvv_gemm_8x8_int32(int32_t *dst,
const int8_t *sa,
const int8_t *sb, int32_t *bias,
int m,
383 int k,
int n,
int ldc);
384void shl_rvv_gemm_8x8_int8_dot(int8_t *dst,
const int8_t *sa,
const int8_t *sb, int32_t *bias,
385 int m,
int k,
int n,
int ldc, int32_t out_zp, int32_t *mult,
/*
 * int8 "v128" packing prototypes (declarations only).
 * NOTE(review): "_v128" presumably marks a 128-bit vector-length variant, and
 * the leading "388"/"389" look like fused listing line numbers -- confirm.
 */
/* Buffers (a, sa), ints (m, k, ldx); presumably packs `a` in 4-row groups. */
388void shl_rvv_reorder_kernel_n4_int8_v128(int8_t *a, int8_t *sa,
int m,
int k,
int ldx);
/* Buffers (b, sb), ints (k, n, ldx); presumably packs `b` in 16-column groups. */
389void shl_rvv_reorder_input_z16_int8_v128(int8_t *b, int8_t *sb,
int k,
int n,
int ldx);
390void shl_rvv_gemm_4x16_int8_v128(int8_t *dst,
const int8_t *sa,
const int8_t *sb, int32_t *bias,
391 int m,
int k,
int n,
int ldc, int32_t out_zp, int32_t *mult,
/*
 * "rvv256" int8 prototypes (declarations only).
 * NOTE(review): stray "394".."396" look like fused listing line numbers.
 */
/* Buffers (b, sb), ints (k, n, ldx); presumably packs `b` in 16-column groups. */
394void shl_rvv256_reorder_input_z16_int8(int8_t *b, int8_t *sb,
int k,
int n,
int ldx);
/* Const int8 inputs `sa`/`sb`, non-const int32 `dst` and `bias`. The
 * signature carries no requantization parameters, so the output is presumably
 * raw int32 accumulators -- TODO confirm. */
395void shl_rvv256_gemm_8x16_int32(int32_t *dst,
const int8_t *sa,
const int8_t *sb, int32_t *bias,
396 int m,
int k,
int n,
int ldc);
/*
 * int4 packing prototypes (declarations only). The buffers are typed int8_t;
 * presumably two 4-bit values share each byte -- TODO confirm.
 * NOTE(review): unlike the other GEMM groups in this header, here the "input"
 * routine uses (a, sa, m, k) and the "kernel" routine uses (b, sb, n, k);
 * verify that this naming is intentional. Leading "398"/"399" look like fused
 * listing line numbers.
 */
398void shl_rvv_reorder_input_n8_int4_dot(int8_t *a, int8_t *sa,
int m,
int k,
int ldx);
399void shl_rvv_reorder_kernel_n8_int4(int8_t *b, int8_t *sb,
int n,
int k,
int ldx);
400void shl_rvv_gemm_8x8_int4_dot(int8_t *dst,
const int8_t *sa,
const int8_t *sb,
int m,
int k,
int n,
401 int ldc, int32_t *bias, int32_t out_zp, int32_t *mult,
/*
 * int8 packing prototypes without a leading-dimension argument (declarations
 * only). Both take a source/destination buffer pair and two ints.
 * NOTE(review): presumably the sources are densely stored since no `ldx` is
 * passed -- confirm. Leading "404"/"405" look like fused listing line numbers.
 */
/* Presumably packs `a` (m x k) into `sa` in 4-row groups ("m4"). */
404void shl_rvv_reorder_input_m4_int8(int8_t *a, int8_t *sa,
int m,
int k);
/* Presumably packs `b` (n x k) into `sb`. */
405void shl_rvv_reorder_kernel_int8(int8_t *b, int8_t *sb,
int n,
int k);
406void shl_rvv_gemm_4xn_int8(int8_t *dst,
const int8_t *sa,
const int8_t *sb,
const int32_t *bias,
407 int m,
int k,
int n,
int ldc, int32_t out_zp, int32_t *mult,
/*
 * fp32 "packn" GEMM prototypes (declarations only): one kernel-side packer,
 * two input-side packers (z8, z12) and two GEMM entry points.
 * NOTE(review): "packn"/"ncxhwx" presumably refer to a channel-blocked layout
 * whose block size follows the vector length -- confirm. Stray "411".."417"
 * look like fused listing line numbers, not C tokens.
 */
/* Buffers (a, sa), ints (m, k, ldx). */
411void shl_rvv_reorder_kernel_packn_fp32(
float *a,
float *sa,
int m,
int k,
int ldx);
/* Buffers (b, sb), ints (k, n, ldx); presumably 8-column groups. */
412void shl_rvv_reorder_input_z8_packn_fp32(
float *b,
float *sb,
int k,
int n,
int ldx);
/* GEMM: const inputs `sa`/`sb`; non-const `dst` and `bias`. */
413void shl_rvv_ncxhwx_gemm_8xpack2n_fp32(
float *dst,
const float *sa,
const float *sb,
float *bias,
414 int m,
int k,
int n,
int ldc);
/* Buffers (b, sb), ints (k, n, ldx); presumably 12-column groups. */
415void shl_rvv_reorder_input_z12_packn_fp32(
float *b,
float *sb,
int k,
int n,
int ldx);
/* GEMM with the same parameter list as the 8xpack2n variant. */
416void shl_rvv_ncxhwx_gemm_12xpack2n_fp32(
float *dst,
const float *sa,
const float *sb,
float *bias,
417 int m,
int k,
int n,
int ldc);
/*
 * fp16 "packn" GEMM prototypes (declarations only); they mirror the fp32
 * packn set with __fp16 element types.
 * NOTE(review): layout semantics are inferred from names only -- confirm.
 * Stray "419".."425" look like fused listing line numbers.
 */
/* Buffers (a, sa), ints (m, k, ldx). */
419void shl_rvv_reorder_kernel_packn_fp16(__fp16 *a, __fp16 *sa,
int m,
int k,
int ldx);
/* Buffers (b, sb), ints (k, n, ldx); presumably 8-column groups. */
420void shl_rvv_reorder_input_z8_packn_fp16(__fp16 *b, __fp16 *sb,
int k,
int n,
int ldx);
/* GEMM: const inputs `sa`/`sb`; non-const `dst` and `bias`. */
421void shl_rvv_ncxhwx_gemm_8xpack2n_fp16(__fp16 *dst,
const __fp16 *sa,
const __fp16 *sb,
422 __fp16 *bias,
int m,
int k,
int n,
int ldc);
/* Buffers (b, sb), ints (k, n, ldx); presumably 12-column groups. */
423void shl_rvv_reorder_input_z12_packn_fp16(__fp16 *b, __fp16 *sb,
int k,
int n,
int ldx);
/* GEMM with the same parameter list as the 8xpack2n variant. */
424void shl_rvv_ncxhwx_gemm_12xpack2n_fp16(__fp16 *dst,
const __fp16 *sa,
const __fp16 *sb,
425 __fp16 *bias,
int m,
int k,
int n,
int ldc);
/*
 * int8 "packn" dot-product GEMM prototypes (declarations only).
 * The GEMM entry points write int8 `dst` and additionally take out_zp, mult
 * and shift. NOTE(review): these are presumably the output zero point and
 * requantization multiplier/shift arrays -- confirm their length and whether
 * they are per-channel. Stray "427".."434" look like fused listing numbers.
 */
/* Buffers (b, sb), ints (k, n, ldx); presumably 8-column groups. */
427void shl_rvv_reorder_input_z8_packn_int8_dot(int8_t *b, int8_t *sb,
int k,
int n,
int ldx);
/* GEMM: const int8 inputs `sa`/`sb`; non-const int32 `bias`. */
428void shl_rvv_ncxhwx_gemm_8xpackn_int8_dot(int8_t *dst,
const int8_t *sa,
const int8_t *sb,
429 int32_t *bias,
int m,
int k,
int n,
int ldc,
430 int32_t out_zp, int32_t *mult, int32_t *shift);
/* Buffers (b, sb), ints (k, n, ldx); presumably 12-column groups. */
431void shl_rvv_reorder_input_z12_packn_int8_dot(int8_t *b, int8_t *sb,
int k,
int n,
int ldx);
/* GEMM with the same parameter list as the 8xpackn variant. */
432void shl_rvv_ncxhwx_gemm_12xpackn_int8_dot(int8_t *dst,
const int8_t *sa,
const int8_t *sb,
433 int32_t *bias,
int m,
int k,
int n,
int ldc,
434 int32_t out_zp, int32_t *mult, int32_t *shift);
/*
 * int4 "packn" GEMM prototypes (declarations only). All data buffers are
 * typed int8_t; presumably 4-bit values are stored packed in bytes -- TODO
 * confirm. The parameter lists match the int8 packn dot GEMM prototypes.
 * NOTE(review): stray "436".."444" look like fused listing line numbers.
 */
/* Buffers (b, sb), ints (k, n, ldx); presumably 8-column groups. */
436void shl_rvv_reorder_input_z8_packn_int4(int8_t *b, int8_t *sb,
int k,
int n,
int ldx);
/* GEMM: const inputs `sa`/`sb`; out_zp/mult/shift presumably requantize. */
437void shl_rvv_ncxhwx_gemm_8xpackn_int4(int8_t *dst,
const int8_t *sa,
const int8_t *sb,
438 int32_t *bias,
int m,
int k,
int n,
int ldc, int32_t out_zp,
439 int32_t *mult, int32_t *shift);
/* Buffers (b, sb), ints (k, n, ldx); presumably 12-column groups. */
441void shl_rvv_reorder_input_z12_packn_int4(int8_t *b, int8_t *sb,
int k,
int n,
int ldx);
/* GEMM with the same parameter list as the 8xpackn variant. */
442void shl_rvv_ncxhwx_gemm_12xpackn_int4(int8_t *dst,
const int8_t *sa,
const int8_t *sb,
443 int32_t *bias,
int m,
int k,
int n,
int ldc, int32_t out_zp,
444 int32_t *mult, int32_t *shift);
446void shl_rvv_reorder_input_z12_pack1ton_fp32(
float *b,
float *sb,
int inc,
int maxk,
int n,
448void shl_rvv_reorder_input_z12_pack1ton_fp16(__fp16 *b, __fp16 *sb,
int inc,
int maxk,
int n,
450void shl_rvv_reorder_input_z4_pack1ton_int8(int8_t *b, int8_t *sb,
int inc,
int maxk,
int n,
452void shl_rvv_reorder_input_z12_pack1ton_int8_dot(int8_t *b, int8_t *sb,
int inc,
int maxk,
int n,
/*
 * int8 "packn" prototypes without the "_dot" suffix (declarations only).
 * NOTE(review): presumably the fallback path for targets lacking the int8
 * dot-product extension -- confirm. Stray "455".."458" look like fused
 * listing line numbers.
 */
/* Buffers (b, sb), ints (k, n, ldx); presumably 4-column groups ("z4"). */
455void shl_rvv_reorder_input_z4_packn_int8(int8_t *b, int8_t *sb,
int k,
int n,
int ldx);
/* GEMM: const int8 inputs `sa`/`sb`, int8 `dst`, int32 `bias`; out_zp, mult
 * and shift are presumably output requantization parameters -- TODO confirm. */
456void shl_rvv_ncxhwx_gemm_4xpack2n_int8(int8_t *dst,
const int8_t *sa,
const int8_t *sb,
457 int32_t *bias,
int m,
int k,
int n,
int ldc, int32_t out_zp,
458 int32_t *mult, int32_t *shift);
/*
 * Fully-connected weight transformation prototypes (declarations only).
 * Each takes a single non-const `struct csinn_tensor *weights` and returns
 * void, so any result is presumably written back into that tensor -- TODO
 * confirm. Three target a "gemv" path (fp32/fp16/int8), one a "gemm" path.
 * NOTE(review): leading "616".."619" look like fused listing line numbers.
 */
616void shl_rvv_fc_gemv_transform_weight_fp32(
struct csinn_tensor *weights);
617void shl_rvv_fc_gemv_transform_weight_fp16(
struct csinn_tensor *weights);
618void shl_rvv_fc_gemv_transform_weight_int8(
struct csinn_tensor *weights);
619void shl_rvv_fc_gemm_transform_weight_int8(
struct csinn_tensor *weights);
725 struct csinn_layer_norm_params *params);
728 struct csinn_layer_norm_params *params);
731 struct csinn_layer_norm_params *params);
/*
 * Input padding prototypes for fp32 and fp16 (declarations only).
 * `input` is const (read-only); `input_padded` is the writable destination.
 * inc/inh/inw are presumably input channels/height/width, and padded_h and
 * padded_w the destination plane size, with pad_top/pad_left the offsets of
 * the original data inside it -- TODO confirm, including the fill value
 * (no pad-value parameter appears in these two signatures).
 * NOTE(review): stray "738".."741" look like fused listing line numbers.
 */
738void shl_rvv_pad_input_fp32(
const float *input,
float *input_padded,
int inc,
int inh,
int inw,
739 int padded_h,
int padded_w,
int pad_top,
int pad_left);
/* __fp16 counterpart with the same parameter list. */
740void shl_rvv_pad_input_fp16(
const __fp16 *input, __fp16 *input_padded,
int inc,
int inh,
int inw,
741 int padded_h,
int padded_w,
int pad_top,
int pad_left);
742void shl_rvv_pad_input_int8(
const int8_t *input, int8_t *input_padded,
int inc,
int inh,
int inw,
743 int padded_h,
int padded_w,
int pad_top,
int pad_left,
/*
 * "packn" input padding prototypes for fp32 and fp16 (declarations only).
 * Same parameter list as the plain pad_input prototypes: const source,
 * writable destination, then inc/inh/inw and the padded geometry.
 * NOTE(review): presumably operates on a channel-blocked layout -- confirm.
 * Stray "746".."749" look like fused listing line numbers.
 */
746void shl_rvv_pad_input_packn_fp32(
const float *input,
float *input_padded,
int inc,
int inh,
747 int inw,
int padded_h,
int padded_w,
int pad_top,
int pad_left);
/* __fp16 counterpart with the same parameter list. */
748void shl_rvv_pad_input_packn_fp16(
const __fp16 *input, __fp16 *input_padded,
int inc,
int inh,
749 int inw,
int padded_h,
int padded_w,
int pad_top,
int pad_left);
750void shl_rvv_pad_input_packn_int8(
const int8_t *input, int8_t *input_padded,
int inc,
int inh,
751 int inw,
int padded_h,
int padded_w,
int pad_top,
int pad_left,
754void shl_rvv_pad_input_pack1ton_fp32(
const float *input,
float *input_padded,
int inc,
int inh,
755 int inw,
int padded_h,
int padded_w,
int pad_top,
757void shl_rvv_pad_input_pack1ton_fp16(
const __fp16 *input, __fp16 *input_padded,
int inc,
int inh,
758 int inw,
int padded_h,
int padded_w,
int pad_top,
760void shl_rvv_pad_input_pack1ton_int8(
const int8_t *input, int8_t *input_padded,
int inc,
int inh,
761 int inw,
int padded_h,
int padded_w,
int pad_top,
int pad_left,
/*
 * NHWC input padding prototypes for fp32 and fp16 (declarations only).
 * Note the dimension arguments are ordered inh, inw, inc here, unlike the
 * inc, inh, inw order of the other pad_input prototypes in this header.
 * NOTE(review): presumably channels are innermost in memory, as "nhwc"
 * suggests -- confirm. Stray "764".."767" look like fused listing numbers.
 */
764void shl_rvv_pad_input_nhwc_fp32(
const float *input,
float *input_padded,
int inh,
int inw,
int inc,
765 int padded_h,
int padded_w,
int pad_top,
int pad_left);
/* __fp16 counterpart with the same parameter list. */
766void shl_rvv_pad_input_nhwc_fp16(
const __fp16 *input, __fp16 *input_padded,
int inh,
int inw,
767 int inc,
int padded_h,
int padded_w,
int pad_top,
int pad_left);
768void shl_rvv_pad_input_nhwc_int8(
const int8_t *input, int8_t *input_padded,
int inh,
int inw,
769 int inc,
int padded_h,
int padded_w,
int pad_top,
int pad_left,
/*
 * Layout reorder prototypes (declarations only): three "pack1ton" and three
 * "packnto1" variants for fp32, fp16 and int8. Every one reads a const `src`,
 * writes `dst`, and takes inc/inh/inw.
 * NOTE(review): presumably "pack1ton" converts plain channel-major data to a
 * channel-blocked layout and "packnto1" is the inverse -- confirm, along with
 * how a channel count that is not a multiple of the block size is handled.
 * Leading "772".."777" look like fused listing line numbers.
 */
772void shl_rvv_reorder_input_pack1ton_fp32(
const float *src,
float *dst,
int inc,
int inh,
int inw);
773void shl_rvv_reorder_input_pack1ton_fp16(
const __fp16 *src, __fp16 *dst,
int inc,
int inh,
int inw);
774void shl_rvv_reorder_input_pack1ton_int8(
const int8_t *src, int8_t *dst,
int inc,
int inh,
int inw);
/* Presumably the inverse direction of the three prototypes above. */
775void shl_rvv_reorder_input_packnto1_fp32(
const float *src,
float *dst,
int inc,
int inh,
int inw);
776void shl_rvv_reorder_input_packnto1_fp16(
const __fp16 *src, __fp16 *dst,
int inc,
int inh,
int inw);
777void shl_rvv_reorder_input_packnto1_int8(
const int8_t *src, int8_t *dst,
int inc,
int inh,
int inw);
/*
 * Quantization helper prototypes (declarations only). Semantics below are
 * inferred from names and must be confirmed against the definitions.
 * NOTE(review): leading "779".."792" look like fused listing line numbers.
 */
/* int32 `src` -> int8 `dst` over `size` elements with an output zero point;
 * presumably adds out_zp and clamps to the int8 range -- TODO confirm. */
779void shl_rvv_saturated_int8(int32_t *src, int8_t *dst, int32_t out_zp,
int size);
/* Takes only `src`, a scalar `scale` and `size`; with no destination
 * parameter it presumably rescales `src` in place -- TODO confirm. */
781void shl_rvv_requantize_fp16(__fp16 *src, __fp16 scale,
int size);
/* Takes only `src` plus multiplier/shift; presumably an in-place fixed-point
 * rescale of `channel_size` int32 values -- TODO confirm. */
788void shl_rvv_requantize(int32_t *src, int32_t multiplier, int32_t shift,
int channel_size);
/* int8 `src` -> __fp16 `dst` using zero point `zp` and float `scale`. */
790void shl_rvv_dequantize_i8_to_f16(int8_t *src, __fp16 *dst,
int size, int32_t zp,
float scale);
/* int8 buffers (a, sa) with ints (m, k, ldx); presumably packs int8 weights
 * for an fp16 GEMM ("fp16_w_int8") -- TODO confirm. */
792void shl_rvv_reorder_kernel_n8_fp16_w_int8(int8_t *a, int8_t *sa,
int m,
int k,
int ldx);
/*
 * int4 helper prototypes (declarations only). All buffers are typed int8_t;
 * presumably int4 data is stored two values per byte -- TODO confirm, along
 * with whether `size` counts int4 elements or bytes.
 * NOTE(review): stray "794".."800" look like fused listing line numbers.
 */
/* Padding routine with an explicit `pad_value`; const source, writable
 * destination. Presumably widens int4 input to int8 while padding. */
794void shl_rvv_pad_input_int4_trans_int8(
const int8_t *input, int8_t *input_padded,
int inc,
int inh,
795 int inw,
int padded_h,
int padded_w,
int pad_top,
796 int pad_left, int8_t pad_value);
/* src -> dst over `size`; presumably unpacks int4 to int8. */
797void shl_rvv_int4_to_int8(int8_t *src, int8_t *dst,
int size);
/* src -> dst over `size`; presumably packs int8 down to int4. */
798void shl_rvv_int8_to_int4(int8_t *src, int8_t *dst,
int size);
/* src -> dst over `size`; how this differs from shl_rvv_int4_to_int8 is not
 * visible from the prototype -- TODO confirm. */
799void shl_rvv_int4_trans_int8(int8_t *src, int8_t *dst,
int size);
/* int32 `src` -> `dst` with output zero point; presumably clamps to the
 * int4 range -- TODO confirm. */
800void shl_rvv_saturated_int4(int32_t *src, int8_t *dst, int32_t out_zp,
int size);
802void shl_rvv_i16_to_f32(
const int16_t *input,
float *output, int32_t offset,
float *scale,
804void shl_rvv_f32_to_i16(
const float *input, int16_t *output, int32_t offset,
float *scale,
/*
 * Half/single precision conversion prototypes (declarations only).
 * `input` is const, `output` is writable, `length` is an unsigned 32-bit
 * count. NOTE(review): `scale` is passed as a non-const float pointer;
 * whether it may be NULL or is applied per element is not visible here --
 * confirm. Leading "806"/"807" look like fused listing line numbers.
 */
806void shl_rvv_f16_to_f32(
const __fp16 *input,
float *output,
float *scale, uint32_t length);
807void shl_rvv_f32_to_f16(
const float *input, __fp16 *output,
float *scale, uint32_t length);
/*
 * Tensor layout conversion prototypes (declarations only). Each takes one
 * non-const `struct csinn_tensor *t` and returns void, so the tensor itself
 * is presumably updated. Names pair two directions (ndarray -> nc1xc0 and
 * nc1xc0 -> ndarray) with two strategies ("replace" and "inplace") for fp32,
 * fp16 and int8.
 * NOTE(review): presumably "replace" swaps in a newly allocated data buffer
 * while "inplace" reuses the existing one -- this is a guess from the names;
 * confirm against the definitions. Leading "814".."826" look like fused
 * listing line numbers.
 */
814void shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp32(
struct csinn_tensor *t);
815void shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp16(
struct csinn_tensor *t);
816void shl_rvv_tensor_ndarray_to_nc1xc0_replace_int8(
struct csinn_tensor *t);
817void shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(
struct csinn_tensor *t);
818void shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(
struct csinn_tensor *t);
819void shl_rvv_tensor_nc1xc0_to_ndarray_replace_int8(
struct csinn_tensor *t);
/* "inplace" variants of the six conversions above. */
821void shl_rvv_tensor_ndarray_to_nc1xc0_inplace_fp32(
struct csinn_tensor *t);
822void shl_rvv_tensor_ndarray_to_nc1xc0_inplace_fp16(
struct csinn_tensor *t);
823void shl_rvv_tensor_ndarray_to_nc1xc0_inplace_int8(
struct csinn_tensor *t);
824void shl_rvv_tensor_nc1xc0_to_ndarray_inplace_fp32(
struct csinn_tensor *t);
825void shl_rvv_tensor_nc1xc0_to_ndarray_inplace_fp16(
struct csinn_tensor *t);
826void shl_rvv_tensor_nc1xc0_to_ndarray_inplace_int8(
struct csinn_tensor *t);
832 void *
est,
void *cap);
837enum avgpool_loc_enum {
838 AVGPOOL_LEFT_TOP = 0,
841 AVGPOOL_RIGHT_BOTTOM,
/*
 * Average-pooling helper prototype (declaration only). Returns an int and
 * takes the pooling parameters, a start/end index pair for each of h and w,
 * and an `enum avgpool_loc_enum` location selector.
 * NOTE(review): presumably returns the element count used as the divisor for
 * the window at the given location -- confirm, including whether the end
 * indices are inclusive. Stray "849".."851" look like fused listing numbers.
 */
849int shl_rvv_avgpool_get_window_size(
struct csinn_pool_params *params,
int idx_h_start,
850 int idx_h_end,
int idx_w_start,
int idx_w_end,
851 enum avgpool_loc_enum loc);
853void shl_rvv_conv1d_gemm_reorder_kernel_int8(
struct csinn_tensor *kernel,
863#ifdef SHL_USE_DOT_INT4
867void shl_rvv_conv_im2col_gemm_reorder_kernel_int4(
struct csinn_tensor *kernel,
872void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int4(
struct csinn_tensor *kernel,
877void shl_rvv_conv1x1s1_gemm_reorder_kernel_int4(
struct csinn_tensor *kernel,
882void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int4(
struct csinn_tensor *kernel,
/*
 * int4 dot-product fully-connected weight transformation (declaration only).
 * Takes one non-const weights tensor and returns void; presumably rewrites
 * the tensor's data for the gemv kernel -- TODO confirm.
 * NOTE(review): leading "887" looks like a fused listing line number.
 */
887void shl_rvv_fc_gemv_transform_weight_int4_dot(
struct csinn_tensor *weights);
893struct shl_rvv_option {
894 bool use_packn_layout;
895 bool binary_model_op_init;
/*
 * Per-session option accessors (declarations only).
 * NOTE(review): leading "898"/"899" look like fused listing line numbers.
 */
/* Returns a pointer to a `struct shl_rvv_option` for `sess`. Ownership of
 * the returned object and whether NULL can be returned are not visible here
 * -- TODO confirm. */
898struct shl_rvv_option *shl_rvv_get_graph_option(
struct csinn_session *sess);
/* Returns a bool for `sess`; presumably the `binary_model_op_init` field of
 * the session's shl_rvv_option -- TODO confirm. */
899bool shl_rvv_get_binary_model_op_init(
struct csinn_session *sess);
csinn_op_enum
Definition: csinn_data_structure.h:127
csinn_dtype_enum
Definition: csinn_data_structure.h:39
Definition: csinn_data_structure.h:524
int(* est)()
Definition: csinn_data_structure.h:526
int(* init)()
Definition: csinn_data_structure.h:525
int(* exec)()
Definition: csinn_data_structure.h:527
Definition: csinn_data_structure.h:1081
Definition: csinn_data_structure.h:780
Definition: csinn_data_structure.h:1162
Definition: csinn_data_structure.h:553
Definition: csinn_data_structure.h:753
Definition: csinn_data_structure.h:596
Definition: csinn_data_structure.h:919
Definition: csinn_data_structure.h:746
Definition: csinn_data_structure.h:605
Definition: csinn_data_structure.h:686
Definition: csinn_data_structure.h:1033
Definition: csinn_data_structure.h:676
Definition: csinn_data_structure.h:825
Definition: csinn_data_structure.h:502
Definition: csinn_data_structure.h:671
Definition: csinn_data_structure.h:661
Definition: csinn_data_structure.h:692
Definition: csinn_data_structure.h:1088
Definition: csinn_data_structure.h:475
Definition: csinn_data_structure.h:818