SHL 2.2.x
Loading...
Searching...
No Matches
shl_thead_rvv.h
1/*
2 * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved.
3 *
4 * SPDX-License-Identifier: Apache-2.0
5 *
6 * Licensed under the Apache License, Version 2.0 (the License); you may
7 * not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19/* SHL version 2.2.x */
20
21#ifndef INCLUDE_SHL_RVV_H_
22#define INCLUDE_SHL_RVV_H_
23
/*
 * Toolchain feature detection. Everything in this group is compiled only when
 * __riscv_vector evaluates to non-zero; in that case <riscv_vector.h> is
 * pulled in as well.
 */
24#if __riscv_vector
25#include <riscv_vector.h>
26
/* Translate the numeric __riscv_v value into a version tag: 1000000 defines
 * RVV_1_0_0 and 7000 defines RVV_0_7_1. For any other value neither tag is
 * defined. */
27#if (__riscv_v == 1000000)
28#define RVV_1_0_0
29#elif (__riscv_v == 7000)
30#define RVV_0_7_1
31#endif
32
/* When __riscv_xtheadvdot is defined, XTHEADVDOT and SHL_USE_DOT_INT8 are
 * defined. SHL_USE_DOT_INT4 stays undefined here because its #define line is
 * commented out. */
33#ifdef __riscv_xtheadvdot
34#define XTHEADVDOT
35#define SHL_USE_DOT_INT8 // default: support int8 dot
36// #define SHL_USE_DOT_INT4 // easter eggs
37#endif // __riscv_xtheadvdot
38
39#endif // __riscv_vector
40
41#include "csi_nn.h"
42#include "shl_gref.h"
43#include "shl_ref.h"
44
45#ifdef __cplusplus
46extern "C" {
47#endif
48
49/********************************** initialization ******************************/
/*
 * Convolution init entry points, one prototype per data-type suffix. Only
 * declarations are given; the definitions are not part of this header.
 *
 * conv2d and depthwise_conv2d take (input, output, kernel, bias) tensors plus
 * a csinn_conv2d_params pointer. conv1d takes a csinn_conv1d_params pointer
 * instead and is declared for int8 only. depthwise_conv2d additionally has an
 * int4 variant that conv2d does not.
 *
 * NOTE(review): the meaning of the int return value is not visible here;
 * presumably a status code. Confirm against the implementation.
 */
50int shl_rvv_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
51 struct csinn_tensor *kernel, struct csinn_tensor *bias,
52 struct csinn_conv2d_params *params);
53int shl_rvv_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
54 struct csinn_tensor *kernel, struct csinn_tensor *bias,
55 struct csinn_conv2d_params *params);
56int shl_rvv_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output,
57 struct csinn_tensor *kernel, struct csinn_tensor *bias,
58 struct csinn_conv2d_params *params);
59
60int shl_rvv_conv1d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output,
61 struct csinn_tensor *kernel, struct csinn_tensor *bias,
62 struct csinn_conv1d_params *params);
63
64int shl_rvv_depthwise_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
65 struct csinn_tensor *kernel, struct csinn_tensor *bias,
66 struct csinn_conv2d_params *params);
67int shl_rvv_depthwise_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
68 struct csinn_tensor *kernel, struct csinn_tensor *bias,
69 struct csinn_conv2d_params *params);
70int shl_rvv_depthwise_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output,
71 struct csinn_tensor *kernel, struct csinn_tensor *bias,
72 struct csinn_conv2d_params *params);
73int shl_rvv_depthwise_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output,
74 struct csinn_tensor *kernel, struct csinn_tensor *bias,
75 struct csinn_conv2d_params *params);
76
/*
 * Pooling init entry points: avgpool2d, global_avgpool2d, maxpool2d and
 * global_maxpool2d, each declared for fp32, fp16 and int8. Every prototype
 * has the same shape: input tensor, output tensor, csinn_pool_params pointer,
 * int return.
 *
 * NOTE(review): the return-value convention is not visible in this header;
 * confirm against the implementation.
 */
77int shl_rvv_avgpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
78 struct csinn_pool_params *params);
79int shl_rvv_avgpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
80 struct csinn_pool_params *params);
81int shl_rvv_avgpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output,
82 struct csinn_pool_params *params);
83int shl_rvv_global_avgpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
84 struct csinn_pool_params *params);
85int shl_rvv_global_avgpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
86 struct csinn_pool_params *params);
87int shl_rvv_global_avgpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output,
88 struct csinn_pool_params *params);
89
90int shl_rvv_maxpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
91 struct csinn_pool_params *params);
92int shl_rvv_maxpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
93 struct csinn_pool_params *params);
94int shl_rvv_maxpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output,
95 struct csinn_pool_params *params);
96
97int shl_rvv_global_maxpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
98 struct csinn_pool_params *params);
99int shl_rvv_global_maxpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
100 struct csinn_pool_params *params);
101int shl_rvv_global_maxpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output,
102 struct csinn_pool_params *params);
103
/*
 * Fully-connected init entry points for fp32, fp16 and int8. Each takes
 * input, output, weights and bias tensors plus a csinn_fc_params pointer and
 * returns int.
 */
104int shl_rvv_fullyconnected_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
105 struct csinn_tensor *weights, struct csinn_tensor *bias,
106 struct csinn_fc_params *params);
107
108int shl_rvv_fullyconnected_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
109 struct csinn_tensor *weights, struct csinn_tensor *bias,
110 struct csinn_fc_params *params);
111
112int shl_rvv_fullyconnected_init_int8(struct csinn_tensor *input, struct csinn_tensor *output,
113 struct csinn_tensor *weights, struct csinn_tensor *bias,
114 struct csinn_fc_params *params);
115
/*
 * data_convert init: takes only input/output tensors and a csinn_siso_params
 * pointer. Unlike the other init prototypes it carries no data-type suffix.
 */
116int shl_rvv_data_convert_init(struct csinn_tensor *input, struct csinn_tensor *output,
117 struct csinn_siso_params *params);
118
119/************************************ convolution *********************************/
120/*********************************** im2col + gemm ********************************/
/*
 * im2col + gemm convolution prototypes. Four name variants are declared
 * (no suffix, packn, pack1ton, packnto1), each for fp32, fp16 and int8.
 * Every variant comes as a pair:
 *   - *_reorder_kernel_*: returns void, takes the kernel tensor and the
 *     conv2d params;
 *   - the compute routine: returns int, takes input, output, kernel and bias
 *     tensors plus the conv2d params.
 *
 * NOTE(review): presumably the reorder routine has to be applied to the
 * kernel before the matching compute routine runs; confirm against the init
 * code, which is not visible in this header.
 */
121void shl_rvv_conv_im2col_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel,
122 struct csinn_conv2d_params *params);
123void shl_rvv_conv_im2col_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel,
124 struct csinn_conv2d_params *params);
125void shl_rvv_conv_im2col_gemm_reorder_kernel_int8(struct csinn_tensor *kernel,
126 struct csinn_conv2d_params *params);
127
128int shl_rvv_conv_im2col_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
129 struct csinn_tensor *kernel, struct csinn_tensor *bias,
130 struct csinn_conv2d_params *params);
131int shl_rvv_conv_im2col_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
132 struct csinn_tensor *kernel, struct csinn_tensor *bias,
133 struct csinn_conv2d_params *params);
134int shl_rvv_conv_im2col_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output,
135 struct csinn_tensor *kernel, struct csinn_tensor *bias,
136 struct csinn_conv2d_params *params);
137
/* packn variant. */
138void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *kernel,
139 struct csinn_conv2d_params *params);
140void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kernel,
141 struct csinn_conv2d_params *params);
142void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel,
143 struct csinn_conv2d_params *params);
144
145int shl_rvv_conv_im2col_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
146 struct csinn_tensor *kernel, struct csinn_tensor *bias,
147 struct csinn_conv2d_params *params);
148int shl_rvv_conv_im2col_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
149 struct csinn_tensor *kernel, struct csinn_tensor *bias,
150 struct csinn_conv2d_params *params);
151int shl_rvv_conv_im2col_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output,
152 struct csinn_tensor *kernel, struct csinn_tensor *bias,
153 struct csinn_conv2d_params *params);
154
/* pack1ton variant. */
155void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(struct csinn_tensor *kernel,
156 struct csinn_conv2d_params *params);
157void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(struct csinn_tensor *kernel,
158 struct csinn_conv2d_params *params);
159void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor *kernel,
160 struct csinn_conv2d_params *params);
161
162int shl_rvv_conv_im2col_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
163 struct csinn_tensor *kernel, struct csinn_tensor *bias,
164 struct csinn_conv2d_params *params);
165int shl_rvv_conv_im2col_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
166 struct csinn_tensor *kernel, struct csinn_tensor *bias,
167 struct csinn_conv2d_params *params);
168int shl_rvv_conv_im2col_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output,
169 struct csinn_tensor *kernel, struct csinn_tensor *bias,
170 struct csinn_conv2d_params *params);
171
/* packnto1 variant. */
172void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor *kernel,
173 struct csinn_conv2d_params *params);
174void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *kernel,
175 struct csinn_conv2d_params *params);
176void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel,
177 struct csinn_conv2d_params *params);
178
179int shl_rvv_conv_im2col_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
180 struct csinn_tensor *kernel, struct csinn_tensor *bias,
181 struct csinn_conv2d_params *params);
182int shl_rvv_conv_im2col_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
183 struct csinn_tensor *kernel, struct csinn_tensor *bias,
184 struct csinn_conv2d_params *params);
185int shl_rvv_conv_im2col_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output,
186 struct csinn_tensor *kernel, struct csinn_tensor *bias,
187 struct csinn_conv2d_params *params);
188
189/******************************** conv2d1x1s1 + gemm ******************************/
/*
 * conv1x1s1 + gemm convolution prototypes. The same four name variants are
 * declared as for the im2col family (no suffix, packn, pack1ton, packnto1),
 * each for fp32, fp16 and int8, and each as a pair of a void
 * *_reorder_kernel_* routine (kernel tensor + conv2d params) and an
 * int-returning compute routine (input, output, kernel, bias + conv2d params).
 *
 * NOTE(review): "1x1s1" presumably stands for a 1x1 kernel with stride 1;
 * the prototypes themselves do not show any such restriction. Confirm where
 * the dispatch happens.
 */
190void shl_rvv_conv1x1s1_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel,
191 struct csinn_conv2d_params *params);
192void shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel,
193 struct csinn_conv2d_params *params);
194void shl_rvv_conv1x1s1_gemm_reorder_kernel_int8(struct csinn_tensor *kernel,
195 struct csinn_conv2d_params *params);
196
197int shl_rvv_conv1x1s1_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
198 struct csinn_tensor *kernel, struct csinn_tensor *bias,
199 struct csinn_conv2d_params *params);
200int shl_rvv_conv1x1s1_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
201 struct csinn_tensor *kernel, struct csinn_tensor *bias,
202 struct csinn_conv2d_params *params);
203int shl_rvv_conv1x1s1_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output,
204 struct csinn_tensor *kernel, struct csinn_tensor *bias,
205 struct csinn_conv2d_params *params);
206
/* packn variant. */
207void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *kernel,
208 struct csinn_conv2d_params *params);
209void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kernel,
210 struct csinn_conv2d_params *params);
211void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel,
212 struct csinn_conv2d_params *params);
213
214int shl_rvv_conv1x1s1_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
215 struct csinn_tensor *kernel, struct csinn_tensor *bias,
216 struct csinn_conv2d_params *params);
217int shl_rvv_conv1x1s1_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
218 struct csinn_tensor *kernel, struct csinn_tensor *bias,
219 struct csinn_conv2d_params *params);
220int shl_rvv_conv1x1s1_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output,
221 struct csinn_tensor *kernel, struct csinn_tensor *bias,
222 struct csinn_conv2d_params *params);
223
/* pack1ton variant. */
224void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32(struct csinn_tensor *kernel,
225 struct csinn_conv2d_params *params);
226void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(struct csinn_tensor *kernel,
227 struct csinn_conv2d_params *params);
228void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor *kernel,
229 struct csinn_conv2d_params *params);
230
231int shl_rvv_conv1x1s1_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
232 struct csinn_tensor *kernel, struct csinn_tensor *bias,
233 struct csinn_conv2d_params *params);
234int shl_rvv_conv1x1s1_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
235 struct csinn_tensor *kernel, struct csinn_tensor *bias,
236 struct csinn_conv2d_params *params);
237int shl_rvv_conv1x1s1_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output,
238 struct csinn_tensor *kernel, struct csinn_tensor *bias,
239 struct csinn_conv2d_params *params);
240
/* packnto1 variant. */
241void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor *kernel,
242 struct csinn_conv2d_params *params);
243void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *kernel,
244 struct csinn_conv2d_params *params);
245void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel,
246 struct csinn_conv2d_params *params);
247
248int shl_rvv_conv1x1s1_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
249 struct csinn_tensor *kernel, struct csinn_tensor *bias,
250 struct csinn_conv2d_params *params);
251int shl_rvv_conv1x1s1_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
252 struct csinn_tensor *kernel, struct csinn_tensor *bias,
253 struct csinn_conv2d_params *params);
254int shl_rvv_conv1x1s1_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output,
255 struct csinn_tensor *kernel, struct csinn_tensor *bias,
256 struct csinn_conv2d_params *params);
257
258/************************************* winograd ***********************************/
/*
 * Winograd convolution prototypes; only packn-named variants are declared.
 * The b6f3s1 group exists for fp32 and fp16, the b4f3s1 group for fp32, fp16
 * and int8. The *_trans_kernel_* routines return void and take separate
 * source and destination kernel tensors; the compute routines return int and
 * take input, output, kernel and bias tensors plus conv2d params.
 *
 * NOTE(review): "b6f3s1" / "b4f3s1" presumably encode output block size,
 * 3x3 filter and stride 1. This is not verifiable from the prototypes; confirm.
 */
259void shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel,
260 struct csinn_tensor *dst_kernel);
261void shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel,
262 struct csinn_tensor *dst_kernel);
263
264int shl_rvv_wg_b6f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
265 struct csinn_tensor *kernel, struct csinn_tensor *bias,
266 struct csinn_conv2d_params *params);
267int shl_rvv_wg_b6f3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
268 struct csinn_tensor *kernel, struct csinn_tensor *bias,
269 struct csinn_conv2d_params *params);
270
/* b4f3s1 group: same shape as above, with an additional int8 variant. */
271void shl_rvv_wg_b4f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel,
272 struct csinn_tensor *dst_kernel);
273void shl_rvv_wg_b4f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel,
274 struct csinn_tensor *dst_kernel);
275void shl_rvv_wg_b4f3s1_trans_kernel_packn_int8(struct csinn_tensor *src_kernel,
276 struct csinn_tensor *dst_kernel);
277
278int shl_rvv_wg_b4f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
279 struct csinn_tensor *kernel, struct csinn_tensor *bias,
280 struct csinn_conv2d_params *params);
281int shl_rvv_wg_b4f3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
282 struct csinn_tensor *kernel, struct csinn_tensor *bias,
283 struct csinn_conv2d_params *params);
284int shl_rvv_wg_b4f3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output,
285 struct csinn_tensor *kernel, struct csinn_tensor *bias,
286 struct csinn_conv2d_params *params);
287
288/******************************* depthwise convolution ****************************/
/*
 * Depthwise convolution prototypes named dwconv3x3s1 and dwconv3x3s2, each
 * declared for fp32, fp16, int8 and int4. All eight share one signature:
 * input, output, kernel and bias tensors plus conv2d params, returning int.
 *
 * NOTE(review): "3x3s1" / "3x3s2" presumably mean a 3x3 kernel with stride 1
 * or 2; the prototypes do not show where that precondition is checked.
 */
289int shl_rvv_dwconv3x3s1_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
290 struct csinn_tensor *kernel, struct csinn_tensor *bias,
291 struct csinn_conv2d_params *params);
292int shl_rvv_dwconv3x3s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
293 struct csinn_tensor *kernel, struct csinn_tensor *bias,
294 struct csinn_conv2d_params *params);
295int shl_rvv_dwconv3x3s1_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
296 struct csinn_tensor *kernel, struct csinn_tensor *bias,
297 struct csinn_conv2d_params *params);
298int shl_rvv_dwconv3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
299 struct csinn_tensor *kernel, struct csinn_tensor *bias,
300 struct csinn_conv2d_params *params);
301int shl_rvv_dwconv3x3s1_int8(struct csinn_tensor *input, struct csinn_tensor *output,
302 struct csinn_tensor *kernel, struct csinn_tensor *bias,
303 struct csinn_conv2d_params *params);
304int shl_rvv_dwconv3x3s2_int8(struct csinn_tensor *input, struct csinn_tensor *output,
305 struct csinn_tensor *kernel, struct csinn_tensor *bias,
306 struct csinn_conv2d_params *params);
307int shl_rvv_dwconv3x3s1_int4(struct csinn_tensor *input, struct csinn_tensor *output,
308 struct csinn_tensor *kernel, struct csinn_tensor *bias,
309 struct csinn_conv2d_params *params);
310int shl_rvv_dwconv3x3s2_int4(struct csinn_tensor *input, struct csinn_tensor *output,
311 struct csinn_tensor *kernel, struct csinn_tensor *bias,
312 struct csinn_conv2d_params *params);
313
/*
 * Depthwise convolution, packn and nhwc name variants, declared for fp32,
 * fp16 and int8 (no int4 prototypes in this group).
 *
 * The *_reorder_kernel_packn_* routines return void and take the kernel
 * tensor plus conv2d params. All other routines return int and take input,
 * output, kernel and bias tensors plus conv2d params.
 */
314void shl_rvv_dwconv_reorder_kernel_packn_fp32(struct csinn_tensor *kernel,
315 struct csinn_conv2d_params *params);
316void shl_rvv_dwconv_reorder_kernel_packn_fp16(struct csinn_tensor *kernel,
317 struct csinn_conv2d_params *params);
318void shl_rvv_dwconv_reorder_kernel_packn_int8(struct csinn_tensor *kernel,
319 struct csinn_conv2d_params *params);
320
/* packn counterparts of the dwconv3x3s1 / dwconv3x3s2 routines. */
321int shl_rvv_dwconv3x3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
322 struct csinn_tensor *kernel, struct csinn_tensor *bias,
323 struct csinn_conv2d_params *params);
324int shl_rvv_dwconv3x3s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
325 struct csinn_tensor *kernel, struct csinn_tensor *bias,
326 struct csinn_conv2d_params *params);
327int shl_rvv_dwconv3x3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
328 struct csinn_tensor *kernel, struct csinn_tensor *bias,
329 struct csinn_conv2d_params *params);
330int shl_rvv_dwconv3x3s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
331 struct csinn_tensor *kernel, struct csinn_tensor *bias,
332 struct csinn_conv2d_params *params);
333int shl_rvv_dwconv3x3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output,
334 struct csinn_tensor *kernel, struct csinn_tensor *bias,
335 struct csinn_conv2d_params *params);
336int shl_rvv_dwconv3x3s2_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output,
337 struct csinn_tensor *kernel, struct csinn_tensor *bias,
338 struct csinn_conv2d_params *params);
339
/* packn routines whose names carry no kernel-size/stride token. */
340int shl_rvv_dwconv_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
341 struct csinn_tensor *kernel, struct csinn_tensor *bias,
342 struct csinn_conv2d_params *params);
343int shl_rvv_dwconv_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
344 struct csinn_tensor *kernel, struct csinn_tensor *bias,
345 struct csinn_conv2d_params *params);
346int shl_rvv_dwconv_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output,
347 struct csinn_tensor *kernel, struct csinn_tensor *bias,
348 struct csinn_conv2d_params *params);
349
/* nhwc routines. NOTE(review): presumably for NHWC-layout tensors; confirm. */
350int shl_rvv_dwconv_nhwc_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
351 struct csinn_tensor *kernel, struct csinn_tensor *bias,
352 struct csinn_conv2d_params *params);
353int shl_rvv_dwconv_nhwc_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
354 struct csinn_tensor *kernel, struct csinn_tensor *bias,
355 struct csinn_conv2d_params *params);
356int shl_rvv_dwconv_nhwc_int8(struct csinn_tensor *input, struct csinn_tensor *output,
357 struct csinn_tensor *kernel, struct csinn_tensor *bias,
358 struct csinn_conv2d_params *params);
359
360/*************************************** gemm *************************************/
/*
 * Raw-pointer gemm helpers. Each group declares reorder routines taking a
 * source pointer (a or b) and a second pointer (sa or sb), plus a gemm
 * routine that receives sa and sb as const pointers together with dst, bias
 * and the m/k/n/ldc integers. All of them return void.
 *
 * NOTE(review): sa/sb are presumably the repacked copies of a/b and dst the
 * result buffer; buffer sizes and layouts are not visible from this header.
 *
 * fp32 group. The shl_rvv256_* prototypes declare only an input reorder and
 * a gemm routine; no shl_rvv256 fp32 kernel reorder is declared here.
 */
361void shl_rvv_reorder_kernel_n8_fp32(float *a, float *sa, int m, int k, int ldx);
362void shl_rvv_reorder_input_z8_fp32(float *b, float *sb, int k, int n, int ldx);
363void shl_rvv_gemm_8x8_fp32(float *dst, const float *sa, const float *sb, float *bias, int m, int k,
364 int n, int ldc);
365
366void shl_rvv256_reorder_input_z16_fp32(float *b, float *sb, int k, int n, int ldx);
367void shl_rvv256_gemm_8x16_fp32(float *dst, const float *sa, const float *sb, float *bias, int m,
368 int k, int n, int ldc);
369
/*
 * fp16 group. These prototypes use the __fp16 type and are declared outside
 * any preprocessor conditional in this part of the header.
 * NOTE(review): __fp16 is a compiler extension; confirm every toolchain that
 * includes this header provides it.
 */
370void shl_rvv_reorder_kernel_n8_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx);
371void shl_rvv_reorder_input_z16_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx);
372void shl_rvv_gemm_8x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, int m,
373 int k, int n, int ldc);
374
375void shl_rvv256_reorder_kernel_n16_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx);
376void shl_rvv256_reorder_input_z16_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx);
377void shl_rvv256_gemm_16x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias,
378 int m, int k, int n, int ldc);
379
/*
 * int8 groups. Routines whose dst is int8_t take three extra trailing
 * arguments (out_zp, mult, shift); the *_int32 routines write to an int32_t
 * dst and do not take them.
 */
380void shl_rvv_reorder_kernel_n8_int8_dot(int8_t *a, int8_t *sa, int m, int k, int ldx);
381void shl_rvv_reorder_input_z8_int8_dot(int8_t *b, int8_t *sb, int k, int n, int ldx);
382void shl_rvv_gemm_8x8_int32(int32_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias, int m,
383 int k, int n, int ldc);
384void shl_rvv_gemm_8x8_int8_dot(int8_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias,
385 int m, int k, int n, int ldc, int32_t out_zp, int32_t *mult,
386 int32_t *shift);
387
388void shl_rvv_reorder_kernel_n4_int8_v128(int8_t *a, int8_t *sa, int m, int k, int ldx);
389void shl_rvv_reorder_input_z16_int8_v128(int8_t *b, int8_t *sb, int k, int n, int ldx);
390void shl_rvv_gemm_4x16_int8_v128(int8_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias,
391 int m, int k, int n, int ldc, int32_t out_zp, int32_t *mult,
392 int32_t *shift);
393
394void shl_rvv256_reorder_input_z16_int8(int8_t *b, int8_t *sb, int k, int n, int ldx);
395void shl_rvv256_gemm_8x16_int32(int32_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias,
396 int m, int k, int n, int ldc);
397
/*
 * int4 group. Two differences from the groups above, both visible in the
 * signatures:
 *   - the "input" reorder takes (a, sa, m, k) and the "kernel" reorder takes
 *     (b, sb, n, k), i.e. the a/b roles are swapped relative to the
 *     fp32/fp16/int8_dot groups;
 *   - shl_rvv_gemm_8x8_int4_dot places bias AFTER ldc, whereas every other
 *     gemm prototype in this section places bias before m. Mind the argument
 *     order at call sites.
 */
398void shl_rvv_reorder_input_n8_int4_dot(int8_t *a, int8_t *sa, int m, int k, int ldx);
399void shl_rvv_reorder_kernel_n8_int4(int8_t *b, int8_t *sb, int n, int k, int ldx);
400void shl_rvv_gemm_8x8_int4_dot(int8_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, int n,
401 int ldc, int32_t *bias, int32_t out_zp, int32_t *mult,
402 int32_t *shift);
403
/*
 * 4xn int8 group. The reorder routines here take no ldx argument, the
 * input/kernel a/b naming again follows the int4 group, and
 * shl_rvv_gemm_4xn_int8 is the only gemm prototype in this section that
 * takes bias as a const pointer.
 */
404void shl_rvv_reorder_input_m4_int8(int8_t *a, int8_t *sa, int m, int k);
405void shl_rvv_reorder_kernel_int8(int8_t *b, int8_t *sb, int n, int k);
406void shl_rvv_gemm_4xn_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, const int32_t *bias,
407 int m, int k, int n, int ldc, int32_t out_zp, int32_t *mult,
408 int32_t *shift);
409
410/************************************ gemm ncxhwx *********************************/
/*
 * Raw-pointer gemm helpers with "packn" / "ncxhwx" in their names. All
 * return void. As in the plain gemm section, reorder routines take a source
 * pointer and a second pointer (a/sa or b/sb), and gemm routines take dst,
 * const sa/sb, bias and the m/k/n/ldc integers.
 *
 * fp32 group: one kernel reorder, then z8 and z12 input reorders, each
 * followed by a gemm routine (8xpack2n and 12xpack2n respectively).
 * NOTE(review): the z8/z12 reorders presumably feed the 8x/12x gemm of the
 * same number; confirm.
 */
411void shl_rvv_reorder_kernel_packn_fp32(float *a, float *sa, int m, int k, int ldx);
412void shl_rvv_reorder_input_z8_packn_fp32(float *b, float *sb, int k, int n, int ldx);
413void shl_rvv_ncxhwx_gemm_8xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias,
414 int m, int k, int n, int ldc);
415void shl_rvv_reorder_input_z12_packn_fp32(float *b, float *sb, int k, int n, int ldx);
416void shl_rvv_ncxhwx_gemm_12xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias,
417 int m, int k, int n, int ldc);
418
/* fp16 group: same structure as fp32, using the __fp16 type. */
419void shl_rvv_reorder_kernel_packn_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx);
420void shl_rvv_reorder_input_z8_packn_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx);
421void shl_rvv_ncxhwx_gemm_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb,
422 __fp16 *bias, int m, int k, int n, int ldc);
423void shl_rvv_reorder_input_z12_packn_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx);
424void shl_rvv_ncxhwx_gemm_12xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb,
425 __fp16 *bias, int m, int k, int n, int ldc);
426
/*
 * int8_dot group: input reorders and gemm routines only (no kernel reorder
 * is declared in this group). The gemm routines are named 8xpackn / 12xpackn
 * rather than pack2n and take the extra out_zp, mult and shift arguments.
 */
427void shl_rvv_reorder_input_z8_packn_int8_dot(int8_t *b, int8_t *sb, int k, int n, int ldx);
428void shl_rvv_ncxhwx_gemm_8xpackn_int8_dot(int8_t *dst, const int8_t *sa, const int8_t *sb,
429 int32_t *bias, int m, int k, int n, int ldc,
430 int32_t out_zp, int32_t *mult, int32_t *shift);
431void shl_rvv_reorder_input_z12_packn_int8_dot(int8_t *b, int8_t *sb, int k, int n, int ldx);
432void shl_rvv_ncxhwx_gemm_12xpackn_int8_dot(int8_t *dst, const int8_t *sa, const int8_t *sb,
433 int32_t *bias, int m, int k, int n, int ldc,
434 int32_t out_zp, int32_t *mult, int32_t *shift);
435
/* int4 group: same prototypes as int8_dot, all pointers typed int8_t. */
436void shl_rvv_reorder_input_z8_packn_int4(int8_t *b, int8_t *sb, int k, int n, int ldx);
437void shl_rvv_ncxhwx_gemm_8xpackn_int4(int8_t *dst, const int8_t *sa, const int8_t *sb,
438 int32_t *bias, int m, int k, int n, int ldc, int32_t out_zp,
439 int32_t *mult, int32_t *shift);
440
441void shl_rvv_reorder_input_z12_packn_int4(int8_t *b, int8_t *sb, int k, int n, int ldx);
442void shl_rvv_ncxhwx_gemm_12xpackn_int4(int8_t *dst, const int8_t *sa, const int8_t *sb,
443 int32_t *bias, int m, int k, int n, int ldc, int32_t out_zp,
444 int32_t *mult, int32_t *shift);
445
/*
 * pack1ton input reorders. These take (inc, maxk, n, ldx) instead of the
 * (k, n, ldx) used by the packn input reorders above.
 * NOTE(review): inc and maxk are presumably the input-channel count and the
 * kernel-window size; confirm against the implementation.
 */
446void shl_rvv_reorder_input_z12_pack1ton_fp32(float *b, float *sb, int inc, int maxk, int n,
447 int ldx);
448void shl_rvv_reorder_input_z12_pack1ton_fp16(__fp16 *b, __fp16 *sb, int inc, int maxk, int n,
449 int ldx);
450void shl_rvv_reorder_input_z4_pack1ton_int8(int8_t *b, int8_t *sb, int inc, int maxk, int n,
451 int ldx);
452void shl_rvv_reorder_input_z12_pack1ton_int8_dot(int8_t *b, int8_t *sb, int inc, int maxk, int n,
453 int ldx);
454
/* int8 (non-dot) packn group: z4 input reorder and a 4xpack2n gemm. */
455void shl_rvv_reorder_input_z4_packn_int8(int8_t *b, int8_t *sb, int k, int n, int ldx);
456void shl_rvv_ncxhwx_gemm_4xpack2n_int8(int8_t *dst, const int8_t *sa, const int8_t *sb,
457 int32_t *bias, int m, int k, int n, int ldc, int32_t out_zp,
458 int32_t *mult, int32_t *shift);
459
460/************************************ pooling *********************************/
/*
 * Pooling compute prototypes without a layout suffix. Every routine takes an
 * input tensor, an output tensor and csinn_pool_params, and returns int.
 *
 * Average pooling: the 2x2s2, 2x2s2_p1, 3x3s2, 3x3s2_p1 and 3x3s1_p1 name
 * variants are declared for fp32 and fp16 only.
 * NOTE(review): the name tokens presumably encode kernel size, stride and
 * padding (e.g. 3x3 window, stride 2, pad 1); confirm against the init code.
 */
461int shl_rvv_avgpool2x2s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
462 struct csinn_pool_params *params);
463int shl_rvv_avgpool2x2s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
464 struct csinn_pool_params *params);
465int shl_rvv_avgpool2x2s2_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
466 struct csinn_pool_params *params);
467int shl_rvv_avgpool2x2s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
468 struct csinn_pool_params *params);
469int shl_rvv_avgpool3x3s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
470 struct csinn_pool_params *params);
471int shl_rvv_avgpool3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
472 struct csinn_pool_params *params);
473int shl_rvv_avgpool3x3s2_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
474 struct csinn_pool_params *params);
475int shl_rvv_avgpool3x3s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
476 struct csinn_pool_params *params);
477int shl_rvv_avgpool3x3s1_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
478 struct csinn_pool_params *params);
479int shl_rvv_avgpool3x3s1_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
480 struct csinn_pool_params *params);
481
/* Max pooling: same name variants as above, declared for fp32, fp16 and
 * int8. */
482int shl_rvv_maxpool2x2s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
483 struct csinn_pool_params *params);
484int shl_rvv_maxpool2x2s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
485 struct csinn_pool_params *params);
486int shl_rvv_maxpool2x2s2_int8(struct csinn_tensor *input, struct csinn_tensor *output,
487 struct csinn_pool_params *params);
488int shl_rvv_maxpool2x2s2_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
489 struct csinn_pool_params *params);
490int shl_rvv_maxpool2x2s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
491 struct csinn_pool_params *params);
492int shl_rvv_maxpool2x2s2_p1_int8(struct csinn_tensor *input, struct csinn_tensor *output,
493 struct csinn_pool_params *params);
494int shl_rvv_maxpool3x3s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
495 struct csinn_pool_params *params);
496int shl_rvv_maxpool3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
497 struct csinn_pool_params *params);
498int shl_rvv_maxpool3x3s2_int8(struct csinn_tensor *input, struct csinn_tensor *output,
499 struct csinn_pool_params *params);
500int shl_rvv_maxpool3x3s2_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
501 struct csinn_pool_params *params);
502int shl_rvv_maxpool3x3s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
503 struct csinn_pool_params *params);
504int shl_rvv_maxpool3x3s2_p1_int8(struct csinn_tensor *input, struct csinn_tensor *output,
505 struct csinn_pool_params *params);
506int shl_rvv_maxpool3x3s1_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
507 struct csinn_pool_params *params);
508int shl_rvv_maxpool3x3s1_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
509 struct csinn_pool_params *params);
510int shl_rvv_maxpool3x3s1_p1_int8(struct csinn_tensor *input, struct csinn_tensor *output,
511 struct csinn_pool_params *params);
512
/* Global average / max pooling: declared for fp32 and fp16 only in this
 * group. */
513int shl_rvv_global_avgpool2d_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
514 struct csinn_pool_params *params);
515int shl_rvv_global_avgpool2d_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
516 struct csinn_pool_params *params);
517
518int shl_rvv_global_maxpool2d_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
519 struct csinn_pool_params *params);
520int shl_rvv_global_maxpool2d_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
521 struct csinn_pool_params *params);
522
/*
 * packn pooling prototypes with a kernel-size/stride token in the name.
 * Every routine takes an input tensor, an output tensor and
 * csinn_pool_params, and returns int.
 *
 * Max pooling: 2x2s2, 3x3s2 and 3x3s1 are declared for fp32, fp16 and int8.
 * The declarations are not grouped strictly by data type, so read the
 * suffixes carefully.
 */
523int shl_rvv_maxpool2x2s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
524 struct csinn_pool_params *params);
525int shl_rvv_maxpool2x2s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
526 struct csinn_pool_params *params);
527int shl_rvv_maxpool3x3s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
528 struct csinn_pool_params *params);
529int shl_rvv_maxpool3x3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
530 struct csinn_pool_params *params);
531int shl_rvv_maxpool3x3s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
532 struct csinn_pool_params *params);
533int shl_rvv_maxpool3x3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
534 struct csinn_pool_params *params);
535int shl_rvv_maxpool2x2s2_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output,
536 struct csinn_pool_params *params);
537int shl_rvv_maxpool3x3s2_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output,
538 struct csinn_pool_params *params);
539int shl_rvv_maxpool3x3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output,
540 struct csinn_pool_params *params);
541
/* Average pooling: the same three name variants for fp32, fp16 and int8. */
542int shl_rvv_avgpool2x2s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
543 struct csinn_pool_params *params);
544int shl_rvv_avgpool2x2s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
545 struct csinn_pool_params *params);
546int shl_rvv_avgpool2x2s2_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output,
547 struct csinn_pool_params *params);
548int shl_rvv_avgpool3x3s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
549 struct csinn_pool_params *params);
550int shl_rvv_avgpool3x3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
551 struct csinn_pool_params *params);
552int shl_rvv_avgpool3x3s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
553 struct csinn_pool_params *params);
554int shl_rvv_avgpool3x3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
555 struct csinn_pool_params *params);
556int shl_rvv_avgpool3x3s2_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output,
557 struct csinn_pool_params *params);
558int shl_rvv_avgpool3x3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output,
559 struct csinn_pool_params *params);
560
/* Global max / average pooling, packn: declared for fp32, fp16 and int8. */
561int shl_rvv_global_maxpool2d_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
562 struct csinn_pool_params *params);
563int shl_rvv_global_maxpool2d_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
564 struct csinn_pool_params *params);
565int shl_rvv_global_maxpool2d_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output,
566 struct csinn_pool_params *params);
567int shl_rvv_global_avgpool2d_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
568 struct csinn_pool_params *params);
569int shl_rvv_global_avgpool2d_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
570 struct csinn_pool_params *params);
571int shl_rvv_global_avgpool2d_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output,
572 struct csinn_pool_params *params);
573
574int shl_rvv_maxpool_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
575 struct csinn_pool_params *params);
576int shl_rvv_maxpool_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
577 struct csinn_pool_params *params);
578int shl_rvv_maxpool_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output,
579 struct csinn_pool_params *params);
580
581int shl_rvv_avgpool_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
582 struct csinn_pool_params *params);
583int shl_rvv_avgpool_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
584 struct csinn_pool_params *params);
585int shl_rvv_avgpool_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output,
586 struct csinn_pool_params *params);
587
588int shl_rvv_maxpool_nhwc_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
589 struct csinn_pool_params *params);
590int shl_rvv_maxpool_nhwc_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
591 struct csinn_pool_params *params);
592int shl_rvv_maxpool_nhwc_int8(struct csinn_tensor *input, struct csinn_tensor *output,
593 struct csinn_pool_params *params);
594
595int shl_rvv_avgpool_nhwc_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
596 struct csinn_pool_params *params);
597int shl_rvv_avgpool_nhwc_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
598 struct csinn_pool_params *params);
599int shl_rvv_avgpool_nhwc_int8(struct csinn_tensor *input, struct csinn_tensor *output,
600 struct csinn_pool_params *params);
601
602int shl_rvv_global_maxpool2d_nhwc_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
603 struct csinn_pool_params *params);
604int shl_rvv_global_maxpool2d_nhwc_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
605 struct csinn_pool_params *params);
606int shl_rvv_global_maxpool2d_nhwc_int8(struct csinn_tensor *input, struct csinn_tensor *output,
607 struct csinn_pool_params *params);
608int shl_rvv_global_avgpool2d_nhwc_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
609 struct csinn_pool_params *params);
610int shl_rvv_global_avgpool2d_nhwc_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
611 struct csinn_pool_params *params);
612int shl_rvv_global_avgpool2d_nhwc_int8(struct csinn_tensor *input, struct csinn_tensor *output,
613 struct csinn_pool_params *params);
614
615/************************************ fullyconnected *********************************/
616void shl_rvv_fc_gemv_transform_weight_fp32(struct csinn_tensor *weights);
617void shl_rvv_fc_gemv_transform_weight_fp16(struct csinn_tensor *weights);
618void shl_rvv_fc_gemv_transform_weight_int8(struct csinn_tensor *weights);
619void shl_rvv_fc_gemm_transform_weight_int8(struct csinn_tensor *weights);
620
621int shl_rvv_fullyconnected_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
622 struct csinn_tensor *weights, struct csinn_tensor *bias,
623 struct csinn_fc_params *params);
624int shl_rvv_fullyconnected_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
625 struct csinn_tensor *weights, struct csinn_tensor *bias,
626 struct csinn_fc_params *params);
627int shl_rvv_fullyconnected_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output,
628 struct csinn_tensor *weights, struct csinn_tensor *bias,
629 struct csinn_fc_params *params);
630int shl_rvv_fullyconnected_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output,
631 struct csinn_tensor *weights, struct csinn_tensor *bias,
632 struct csinn_fc_params *params);
633
634/************************************ activation *********************************/
635int shl_rvv_relu_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
636 struct csinn_relu_params *params);
637int shl_rvv_relu_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
638 struct csinn_relu_params *params);
639int shl_rvv_relu_int8(struct csinn_tensor *input, struct csinn_tensor *output,
640 struct csinn_relu_params *params);
641
642int shl_rvv_relu6_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
643 struct csinn_relu_params *params);
644int shl_rvv_relu6_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
645 struct csinn_relu_params *params);
646
647int shl_rvv_leaky_relu_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
648 struct csinn_relu_params *params);
649int shl_rvv_leaky_relu_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
650 struct csinn_relu_params *params);
651int shl_rvv_leaky_relu_int8(struct csinn_tensor *input, struct csinn_tensor *output,
652 struct csinn_relu_params *params);
653
654int shl_rvv_sigmoid_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
655 struct csinn_sigmoid_params *params);
656
657int shl_rvv_softmax_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
658 struct csinn_softmax_params *params);
659int shl_rvv_softmax_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
660 struct csinn_softmax_params *params);
661
662int shl_rvv_prelu_fp32(struct csinn_tensor *input, struct csinn_tensor *alpha,
663 struct csinn_tensor *output, struct csinn_prelu_params *params);
664int shl_rvv_prelu_fp16(struct csinn_tensor *input, struct csinn_tensor *alpha,
665 struct csinn_tensor *output, struct csinn_prelu_params *params);
666int shl_rvv_prelu_int8(struct csinn_tensor *input, struct csinn_tensor *alpha,
667 struct csinn_tensor *output, struct csinn_prelu_params *params);
668
669int shl_rvv_clip_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
670 struct csinn_clip_params *params);
671int shl_rvv_clip_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
672 struct csinn_clip_params *params);
673int shl_rvv_clip_int8(struct csinn_tensor *input, struct csinn_tensor *output,
674 struct csinn_clip_params *params);
675
676/************************************ layout/memory transform *********************************/
677int shl_rvv_concat_fp32(struct csinn_tensor **input, struct csinn_tensor *output,
678 struct csinn_concat_params *params);
679int shl_rvv_concat_fp16(struct csinn_tensor **input, struct csinn_tensor *output,
680 struct csinn_concat_params *params);
681int shl_rvv_concat_int8(struct csinn_tensor **input, struct csinn_tensor *output,
682 struct csinn_concat_params *params);
683
684int shl_rvv_reshape_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
685 struct csinn_reshape_params *params);
686int shl_rvv_reshape_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
687 struct csinn_reshape_params *params);
688int shl_rvv_reshape_int8(struct csinn_tensor *input, struct csinn_tensor *output,
689 struct csinn_reshape_params *params);
690
691int shl_rvv_transpose_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
692 struct csinn_transpose_params *params);
693int shl_rvv_transpose_int8(struct csinn_tensor *input, struct csinn_tensor *output,
694 struct csinn_transpose_params *params);
695
696int shl_rvv_gather_fp16(struct csinn_tensor *input, struct csinn_tensor *indices,
697 struct csinn_tensor *output, struct csinn_gather_params *params);
698int shl_rvv_gather_int8(struct csinn_tensor *input, struct csinn_tensor *indices,
699 struct csinn_tensor *output, struct csinn_gather_params *params);
700
701int shl_rvv_strided_slice_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
702 struct csinn_strided_slice_params *params);
703
704/************************************ basic math *********************************/
705int shl_rvv_add_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1,
706 struct csinn_tensor *output, struct csinn_diso_params *params);
707int shl_rvv_add_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1,
708 struct csinn_tensor *output, struct csinn_diso_params *params);
709int shl_rvv_add_int8(struct csinn_tensor *input0, struct csinn_tensor *input1,
710 struct csinn_tensor *output, struct csinn_diso_params *params);
711
712int shl_rvv_mul_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1,
713 struct csinn_tensor *output, struct csinn_diso_params *params);
714int shl_rvv_mul_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1,
715 struct csinn_tensor *output, struct csinn_diso_params *params);
716int shl_rvv_mul_int8(struct csinn_tensor *input0, struct csinn_tensor *input1,
717 struct csinn_tensor *output, struct csinn_diso_params *params);
718
719int shl_rvv_reduce_sum_int8(struct csinn_tensor *input, struct csinn_tensor *output,
720 struct csinn_reduce_params *params);
721
722/******************************** normalization *****************************/
723int shl_rvv_layer_norm_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
724 struct csinn_tensor *gamma, struct csinn_tensor *beta,
725 struct csinn_layer_norm_params *params);
726int shl_rvv_layer_norm_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
727 struct csinn_tensor *gamma, struct csinn_tensor *beta,
728 struct csinn_layer_norm_params *params);
729int shl_rvv_layer_norm_int8(struct csinn_tensor *input, struct csinn_tensor *output,
730 struct csinn_tensor *gamma, struct csinn_tensor *beta,
731 struct csinn_layer_norm_params *params);
732
733/******************************** linear algebra ****************************/
734int shl_rvv_matmul_int8(struct csinn_tensor *mat0, struct csinn_tensor *mat1,
735 struct csinn_tensor *output, struct csinn_matmul_params *params);
736
737/************************************ utils *********************************/
738void shl_rvv_pad_input_fp32(const float *input, float *input_padded, int inc, int inh, int inw,
739 int padded_h, int padded_w, int pad_top, int pad_left);
740void shl_rvv_pad_input_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, int inw,
741 int padded_h, int padded_w, int pad_top, int pad_left);
742void shl_rvv_pad_input_int8(const int8_t *input, int8_t *input_padded, int inc, int inh, int inw,
743 int padded_h, int padded_w, int pad_top, int pad_left,
744 int8_t pad_value);
745
746void shl_rvv_pad_input_packn_fp32(const float *input, float *input_padded, int inc, int inh,
747 int inw, int padded_h, int padded_w, int pad_top, int pad_left);
748void shl_rvv_pad_input_packn_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh,
749 int inw, int padded_h, int padded_w, int pad_top, int pad_left);
750void shl_rvv_pad_input_packn_int8(const int8_t *input, int8_t *input_padded, int inc, int inh,
751 int inw, int padded_h, int padded_w, int pad_top, int pad_left,
752 int8_t pad_value);
753
754void shl_rvv_pad_input_pack1ton_fp32(const float *input, float *input_padded, int inc, int inh,
755 int inw, int padded_h, int padded_w, int pad_top,
756 int pad_left);
757void shl_rvv_pad_input_pack1ton_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh,
758 int inw, int padded_h, int padded_w, int pad_top,
759 int pad_left);
760void shl_rvv_pad_input_pack1ton_int8(const int8_t *input, int8_t *input_padded, int inc, int inh,
761 int inw, int padded_h, int padded_w, int pad_top, int pad_left,
762 int8_t pad_value);
763
764void shl_rvv_pad_input_nhwc_fp32(const float *input, float *input_padded, int inh, int inw, int inc,
765 int padded_h, int padded_w, int pad_top, int pad_left);  // dimension order is (inh, inw, inc) here; the other shl_rvv_pad_input_* prototypes in this header take (inc, inh, inw)
766void shl_rvv_pad_input_nhwc_fp16(const __fp16 *input, __fp16 *input_padded, int inh, int inw,
767 int inc, int padded_h, int padded_w, int pad_top, int pad_left);  // same parameter list as the fp32 variant, with __fp16 buffers
768void shl_rvv_pad_input_nhwc_int8(const int8_t *input, int8_t *input_padded, int inh, int inw,
769 int inc, int padded_h, int padded_w, int pad_top, int pad_left,
770 int8_t pad_value);  // only the int8 variant takes an explicit pad_value
771
772void shl_rvv_reorder_input_pack1ton_fp32(const float *src, float *dst, int inc, int inh, int inw);
773void shl_rvv_reorder_input_pack1ton_fp16(const __fp16 *src, __fp16 *dst, int inc, int inh, int inw);
774void shl_rvv_reorder_input_pack1ton_int8(const int8_t *src, int8_t *dst, int inc, int inh, int inw);
775void shl_rvv_reorder_input_packnto1_fp32(const float *src, float *dst, int inc, int inh, int inw);
776void shl_rvv_reorder_input_packnto1_fp16(const __fp16 *src, __fp16 *dst, int inc, int inh, int inw);
777void shl_rvv_reorder_input_packnto1_int8(const int8_t *src, int8_t *dst, int inc, int inh, int inw);
778
779void shl_rvv_saturated_int8(int32_t *src, int8_t *dst, int32_t out_zp, int size);
780
781void shl_rvv_requantize_fp16(__fp16 *src, __fp16 scale, int size);
782void shl_rvv_sidcso_op_requantize_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
783 struct csinn_tensor *kernel);
784void shl_rvv_siso_op_requantize_fp16(struct csinn_tensor *input, struct csinn_tensor *output);
785void shl_rvv_diso_op_requantize_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1,
786 struct csinn_tensor *output);
787
788void shl_rvv_requantize(int32_t *src, int32_t multiplier, int32_t shift, int channel_size);
789
790void shl_rvv_dequantize_i8_to_f16(int8_t *src, __fp16 *dst, int size, int32_t zp, float scale);
791
792void shl_rvv_reorder_kernel_n8_fp16_w_int8(int8_t *a, int8_t *sa, int m, int k, int ldx);
793
794void shl_rvv_pad_input_int4_trans_int8(const int8_t *input, int8_t *input_padded, int inc, int inh,
795 int inw, int padded_h, int padded_w, int pad_top,
796 int pad_left, int8_t pad_value);
797void shl_rvv_int4_to_int8(int8_t *src, int8_t *dst, int size);
798void shl_rvv_int8_to_int4(int8_t *src, int8_t *dst, int size);
799void shl_rvv_int4_trans_int8(int8_t *src, int8_t *dst, int size);
800void shl_rvv_saturated_int4(int32_t *src, int8_t *dst, int32_t out_zp, int size);
801
802void shl_rvv_i16_to_f32(const int16_t *input, float *output, int32_t offset, float *scale,
803 uint32_t length);  // scale is passed by pointer (non-const float *), not by value
804void shl_rvv_f32_to_i16(const float *input, int16_t *output, int32_t offset, float *scale,
805 uint32_t length);  // mirror of shl_rvv_i16_to_f32: same offset/scale/length parameters, buffer types swapped
806void shl_rvv_f16_to_f32(const __fp16 *input, float *output, float *scale, uint32_t length);  // no offset parameter, unlike the i16 pair
807void shl_rvv_f32_to_f16(const float *input, __fp16 *output, float *scale, uint32_t length);  // NOTE(review): whether scale points to one value or to an array is not visible here -- confirm
808
809int shl_rvv_data_convert_int8_to_int4(struct csinn_tensor *input, struct csinn_tensor *output,
810 struct csinn_siso_params *params);
811int shl_rvv_data_convert_int4_to_int8(struct csinn_tensor *input, struct csinn_tensor *output,
812 struct csinn_siso_params *params);
813
814void shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp32(struct csinn_tensor *t);
815void shl_rvv_tensor_ndarray_to_nc1xc0_replace_fp16(struct csinn_tensor *t);
816void shl_rvv_tensor_ndarray_to_nc1xc0_replace_int8(struct csinn_tensor *t);
817void shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(struct csinn_tensor *t);
818void shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(struct csinn_tensor *t);
819void shl_rvv_tensor_nc1xc0_to_ndarray_replace_int8(struct csinn_tensor *t);
820
821void shl_rvv_tensor_ndarray_to_nc1xc0_inplace_fp32(struct csinn_tensor *t);
822void shl_rvv_tensor_ndarray_to_nc1xc0_inplace_fp16(struct csinn_tensor *t);
823void shl_rvv_tensor_ndarray_to_nc1xc0_inplace_int8(struct csinn_tensor *t);
824void shl_rvv_tensor_nc1xc0_to_ndarray_inplace_fp32(struct csinn_tensor *t);
825void shl_rvv_tensor_nc1xc0_to_ndarray_inplace_fp16(struct csinn_tensor *t);
826void shl_rvv_tensor_nc1xc0_to_ndarray_inplace_int8(struct csinn_tensor *t);
827
828void shl_rvv_nc1xc0_fp16_to_nchw_fp32(struct csinn_tensor *dest, struct csinn_tensor *src);
829
830struct csinn_callback *shl_cb_map_rvv(int op, int dtype);  // op/dtype are plain int here, while shl_rvv_reg_op below takes the enum types
831void shl_rvv_reg_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *init, void *exec,
832 void *est, void *cap);  // init/exec/est/cap are untyped void *; NOTE(review): presumably function pointers stored in a csinn_callback -- confirm
833
834int csrr_vl(void);  // explicit (void) makes this a real prototype in C (empty parens leave the parameters unspecified before C23); NOTE(review): presumably returns the RVV vl CSR -- confirm
835int csrr_vlenb(void);  // same (void) fix; NOTE(review): presumably returns the RVV vlenb CSR -- confirm
836
837enum avgpool_loc_enum {  // passed as `loc` to shl_rvv_avgpool_get_window_size(); NOTE(review): presumably names where the pooling window sits (corner / edge / interior) -- confirm
838 AVGPOOL_LEFT_TOP = 0,  // first enumerator pinned to 0; the rest count up by one
839 AVGPOOL_RIGHT_TOP,  // 1
840 AVGPOOL_LEFT_BOTTOM,  // 2
841 AVGPOOL_RIGHT_BOTTOM,  // 3
842 AVGPOOL_LEFT,  // 4
843 AVGPOOL_RIGHT,  // 5
844 AVGPOOL_TOP,  // 6
845 AVGPOOL_BOTTOM,  // 7
846 AVGPOOL_CENTER,  // 8
847};
848
849int shl_rvv_avgpool_get_window_size(struct csinn_pool_params *params, int idx_h_start,
850 int idx_h_end, int idx_w_start, int idx_w_end,
851 enum avgpool_loc_enum loc);
852
853void shl_rvv_conv1d_gemm_reorder_kernel_int8(struct csinn_tensor *kernel,
854 struct csinn_conv1d_params *params);
855int shl_rvv_conv1d_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output,
856 struct csinn_tensor *kernel, struct csinn_tensor *bias,
857 struct csinn_conv1d_params *params);
858
859int shl_rvv_dwconv1d_int8(struct csinn_tensor *input, struct csinn_tensor *output,
860 struct csinn_tensor *kernel, struct csinn_tensor *bias,
861 struct csinn_conv1d_params *params);
862
863#ifdef SHL_USE_DOT_INT4  // int4 entry points; this header leaves its own #define of SHL_USE_DOT_INT4 commented out, so the macro must be supplied from outside for these to be declared
864int shl_rvv_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output,
865 struct csinn_tensor *kernel, struct csinn_tensor *bias,
866 struct csinn_conv2d_params *params);
867void shl_rvv_conv_im2col_gemm_reorder_kernel_int4(struct csinn_tensor *kernel,
868 struct csinn_conv2d_params *params);
869int shl_rvv_conv_im2col_gemm_int4(struct csinn_tensor *input, struct csinn_tensor *output,
870 struct csinn_tensor *kernel, struct csinn_tensor *bias,
871 struct csinn_conv2d_params *params);
872void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int4(struct csinn_tensor *kernel,
873 struct csinn_conv2d_params *params);
874int shl_rvv_conv_im2col_gemm_packn_int4(struct csinn_tensor *input, struct csinn_tensor *output,
875 struct csinn_tensor *kernel, struct csinn_tensor *bias,
876 struct csinn_conv2d_params *params);
877void shl_rvv_conv1x1s1_gemm_reorder_kernel_int4(struct csinn_tensor *kernel,
878 struct csinn_conv2d_params *params);
879int shl_rvv_conv1x1s1_gemm_int4(struct csinn_tensor *input, struct csinn_tensor *output,
880 struct csinn_tensor *kernel, struct csinn_tensor *bias,
881 struct csinn_conv2d_params *params);
882void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int4(struct csinn_tensor *kernel,
883 struct csinn_conv2d_params *params);
884int shl_rvv_conv1x1s1_gemm_packn_int4(struct csinn_tensor *input, struct csinn_tensor *output,
885 struct csinn_tensor *kernel, struct csinn_tensor *bias,
886 struct csinn_conv2d_params *params);
887void shl_rvv_fc_gemv_transform_weight_int4_dot(struct csinn_tensor *weights);
888int shl_rvv_fullyconnected_packn_int4_dot(struct csinn_tensor *input, struct csinn_tensor *output,
889 struct csinn_tensor *weights, struct csinn_tensor *bias,
890 struct csinn_fc_params *params);
891#endif  // SHL_USE_DOT_INT4
892
893struct shl_rvv_option {  // shl_rvv_get_graph_option(sess) returns a pointer to this struct type
894 bool use_packn_layout;  // NOTE(review): presumably enables the *_packn_* kernels/layout for the graph -- confirm against the callers
895 bool binary_model_op_init;  // NOTE(review): presumably the value reported by shl_rvv_get_binary_model_op_init(sess) -- confirm
896};
897
898struct shl_rvv_option *shl_rvv_get_graph_option(struct csinn_session *sess);
899bool shl_rvv_get_binary_model_op_init(struct csinn_session *sess);
900
901#ifdef __cplusplus
902}
903#endif
904
905#endif // INCLUDE_SHL_RVV_H_
csinn_op_enum
Definition: csinn_data_structure.h:127
csinn_dtype_enum
Definition: csinn_data_structure.h:39
Definition: csinn_data_structure.h:524
int(* est)()
Definition: csinn_data_structure.h:526
int(* init)()
Definition: csinn_data_structure.h:525
int(* exec)()
Definition: csinn_data_structure.h:527
Definition: csinn_data_structure.h:1081
Definition: csinn_data_structure.h:780
Definition: csinn_data_structure.h:1162
Definition: csinn_data_structure.h:553
Definition: csinn_data_structure.h:753
Definition: csinn_data_structure.h:596
Definition: csinn_data_structure.h:919
Definition: csinn_data_structure.h:746
Definition: csinn_data_structure.h:605
Definition: csinn_data_structure.h:686
Definition: csinn_data_structure.h:1033
Definition: csinn_data_structure.h:676
Definition: csinn_data_structure.h:825
Definition: csinn_data_structure.h:502
Definition: csinn_data_structure.h:671
Definition: csinn_data_structure.h:661
Definition: csinn_data_structure.h:692
Definition: csinn_data_structure.h:1088
Definition: csinn_data_structure.h:475
Definition: csinn_data_structure.h:818