/*
 * Generic vectorized operation runtime
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
20 #include "qemu/osdep.h"
21 #include "qemu/host-utils.h"
23 #include "exec/helper-proto.h"
24 #include "tcg-gvec-desc.h"
/* Virtually all hosts support 16-byte vectors.  Those that don't can emulate
 * them via GCC's generic vector extension.  This turns out to be simpler and
 * more reliable than getting the compiler to autovectorize.
 *
 * In tcg-op-gvec.c, we asserted that both the size and alignment of the data
 * are multiples of 16.
 *
 * When the compiler does not support all of the operations we require, the
 * loops are written so that we can always fall back on the base types.
 */
/*
 * Element types and splat initializers shared by every helper in this file.
 *
 * With CONFIG_VECTOR16, vecN/svecN are 16-byte GCC generic vectors of
 * unsigned/signed N-bit lanes and DUPn(X) is a brace initializer that
 * repeats X in each of the n lanes.
 *
 * Without it, the same names are the plain base types, so the helper loops
 * advance one element per iteration instead of one 16-byte vector.
 */
#ifdef CONFIG_VECTOR16
typedef uint8_t  vec8  __attribute__((vector_size(16)));
typedef uint16_t vec16 __attribute__((vector_size(16)));
typedef uint32_t vec32 __attribute__((vector_size(16)));
typedef uint64_t vec64 __attribute__((vector_size(16)));

typedef int8_t   svec8  __attribute__((vector_size(16)));
typedef int16_t  svec16 __attribute__((vector_size(16)));
typedef int32_t  svec32 __attribute__((vector_size(16)));
typedef int64_t  svec64 __attribute__((vector_size(16)));

#define DUP16(X)  { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X }
#define DUP8(X)   { X, X, X, X, X, X, X, X }
#define DUP4(X)   { X, X, X, X }
#define DUP2(X)   { X, X }
#else
typedef uint8_t  vec8;
typedef uint16_t vec16;
typedef uint32_t vec32;
typedef uint64_t vec64;

typedef int8_t   svec8;
typedef int16_t  svec16;
typedef int32_t  svec32;
typedef int64_t  svec64;

/*
 * NOTE(review): the scalar DUPn definitions were not visible in the text
 * this block was restored from.  The helpers use (vecN)DUPn(b) in both
 * configurations, so a pass-through definition is assumed -- confirm.
 */
#define DUP16(X)  X
#define DUP8(X)   X
#define DUP4(X)   X
#define DUP2(X)   X
#endif /* CONFIG_VECTOR16 */
/*
 * clear_high: zero the tail of a destination operand.
 *
 * @d: destination buffer.
 * @oprsz: number of bytes the caller has already produced.
 * @desc: descriptor; simd_maxsz(desc) gives the total size of @d.
 *
 * When the maximum size exceeds @oprsz, the bytes in [oprsz, maxsz) are
 * cleared in uint64_t-sized steps.  The branch is marked unlikely().
 */
static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
{
    intptr_t maxsz = simd_maxsz(desc);
    intptr_t i;

    if (unlikely(maxsz > oprsz)) {
        for (i = oprsz; i < maxsz; i += sizeof(uint64_t)) {
            *(uint64_t *)(d + i) = 0;
        }
    }
}
81 void HELPER(gvec_add8
)(void *d
, void *a
, void *b
, uint32_t desc
)
83 intptr_t oprsz
= simd_oprsz(desc
);
86 for (i
= 0; i
< oprsz
; i
+= sizeof(vec8
)) {
87 *(vec8
*)(d
+ i
) = *(vec8
*)(a
+ i
) + *(vec8
*)(b
+ i
);
89 clear_high(d
, oprsz
, desc
);
92 void HELPER(gvec_add16
)(void *d
, void *a
, void *b
, uint32_t desc
)
94 intptr_t oprsz
= simd_oprsz(desc
);
97 for (i
= 0; i
< oprsz
; i
+= sizeof(vec16
)) {
98 *(vec16
*)(d
+ i
) = *(vec16
*)(a
+ i
) + *(vec16
*)(b
+ i
);
100 clear_high(d
, oprsz
, desc
);
103 void HELPER(gvec_add32
)(void *d
, void *a
, void *b
, uint32_t desc
)
105 intptr_t oprsz
= simd_oprsz(desc
);
108 for (i
= 0; i
< oprsz
; i
+= sizeof(vec32
)) {
109 *(vec32
*)(d
+ i
) = *(vec32
*)(a
+ i
) + *(vec32
*)(b
+ i
);
111 clear_high(d
, oprsz
, desc
);
114 void HELPER(gvec_add64
)(void *d
, void *a
, void *b
, uint32_t desc
)
116 intptr_t oprsz
= simd_oprsz(desc
);
119 for (i
= 0; i
< oprsz
; i
+= sizeof(vec64
)) {
120 *(vec64
*)(d
+ i
) = *(vec64
*)(a
+ i
) + *(vec64
*)(b
+ i
);
122 clear_high(d
, oprsz
, desc
);
125 void HELPER(gvec_adds8
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
127 intptr_t oprsz
= simd_oprsz(desc
);
128 vec8 vecb
= (vec8
)DUP16(b
);
131 for (i
= 0; i
< oprsz
; i
+= sizeof(vec8
)) {
132 *(vec8
*)(d
+ i
) = *(vec8
*)(a
+ i
) + vecb
;
134 clear_high(d
, oprsz
, desc
);
137 void HELPER(gvec_adds16
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
139 intptr_t oprsz
= simd_oprsz(desc
);
140 vec16 vecb
= (vec16
)DUP8(b
);
143 for (i
= 0; i
< oprsz
; i
+= sizeof(vec16
)) {
144 *(vec16
*)(d
+ i
) = *(vec16
*)(a
+ i
) + vecb
;
146 clear_high(d
, oprsz
, desc
);
149 void HELPER(gvec_adds32
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
151 intptr_t oprsz
= simd_oprsz(desc
);
152 vec32 vecb
= (vec32
)DUP4(b
);
155 for (i
= 0; i
< oprsz
; i
+= sizeof(vec32
)) {
156 *(vec32
*)(d
+ i
) = *(vec32
*)(a
+ i
) + vecb
;
158 clear_high(d
, oprsz
, desc
);
161 void HELPER(gvec_adds64
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
163 intptr_t oprsz
= simd_oprsz(desc
);
164 vec64 vecb
= (vec64
)DUP2(b
);
167 for (i
= 0; i
< oprsz
; i
+= sizeof(vec64
)) {
168 *(vec64
*)(d
+ i
) = *(vec64
*)(a
+ i
) + vecb
;
170 clear_high(d
, oprsz
, desc
);
173 void HELPER(gvec_sub8
)(void *d
, void *a
, void *b
, uint32_t desc
)
175 intptr_t oprsz
= simd_oprsz(desc
);
178 for (i
= 0; i
< oprsz
; i
+= sizeof(vec8
)) {
179 *(vec8
*)(d
+ i
) = *(vec8
*)(a
+ i
) - *(vec8
*)(b
+ i
);
181 clear_high(d
, oprsz
, desc
);
184 void HELPER(gvec_sub16
)(void *d
, void *a
, void *b
, uint32_t desc
)
186 intptr_t oprsz
= simd_oprsz(desc
);
189 for (i
= 0; i
< oprsz
; i
+= sizeof(vec16
)) {
190 *(vec16
*)(d
+ i
) = *(vec16
*)(a
+ i
) - *(vec16
*)(b
+ i
);
192 clear_high(d
, oprsz
, desc
);
195 void HELPER(gvec_sub32
)(void *d
, void *a
, void *b
, uint32_t desc
)
197 intptr_t oprsz
= simd_oprsz(desc
);
200 for (i
= 0; i
< oprsz
; i
+= sizeof(vec32
)) {
201 *(vec32
*)(d
+ i
) = *(vec32
*)(a
+ i
) - *(vec32
*)(b
+ i
);
203 clear_high(d
, oprsz
, desc
);
206 void HELPER(gvec_sub64
)(void *d
, void *a
, void *b
, uint32_t desc
)
208 intptr_t oprsz
= simd_oprsz(desc
);
211 for (i
= 0; i
< oprsz
; i
+= sizeof(vec64
)) {
212 *(vec64
*)(d
+ i
) = *(vec64
*)(a
+ i
) - *(vec64
*)(b
+ i
);
214 clear_high(d
, oprsz
, desc
);
217 void HELPER(gvec_subs8
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
219 intptr_t oprsz
= simd_oprsz(desc
);
220 vec8 vecb
= (vec8
)DUP16(b
);
223 for (i
= 0; i
< oprsz
; i
+= sizeof(vec8
)) {
224 *(vec8
*)(d
+ i
) = *(vec8
*)(a
+ i
) - vecb
;
226 clear_high(d
, oprsz
, desc
);
229 void HELPER(gvec_subs16
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
231 intptr_t oprsz
= simd_oprsz(desc
);
232 vec16 vecb
= (vec16
)DUP8(b
);
235 for (i
= 0; i
< oprsz
; i
+= sizeof(vec16
)) {
236 *(vec16
*)(d
+ i
) = *(vec16
*)(a
+ i
) - vecb
;
238 clear_high(d
, oprsz
, desc
);
241 void HELPER(gvec_subs32
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
243 intptr_t oprsz
= simd_oprsz(desc
);
244 vec32 vecb
= (vec32
)DUP4(b
);
247 for (i
= 0; i
< oprsz
; i
+= sizeof(vec32
)) {
248 *(vec32
*)(d
+ i
) = *(vec32
*)(a
+ i
) - vecb
;
250 clear_high(d
, oprsz
, desc
);
253 void HELPER(gvec_subs64
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
255 intptr_t oprsz
= simd_oprsz(desc
);
256 vec64 vecb
= (vec64
)DUP2(b
);
259 for (i
= 0; i
< oprsz
; i
+= sizeof(vec64
)) {
260 *(vec64
*)(d
+ i
) = *(vec64
*)(a
+ i
) - vecb
;
262 clear_high(d
, oprsz
, desc
);
265 void HELPER(gvec_mul8
)(void *d
, void *a
, void *b
, uint32_t desc
)
267 intptr_t oprsz
= simd_oprsz(desc
);
270 for (i
= 0; i
< oprsz
; i
+= sizeof(vec8
)) {
271 *(vec8
*)(d
+ i
) = *(vec8
*)(a
+ i
) * *(vec8
*)(b
+ i
);
273 clear_high(d
, oprsz
, desc
);
276 void HELPER(gvec_mul16
)(void *d
, void *a
, void *b
, uint32_t desc
)
278 intptr_t oprsz
= simd_oprsz(desc
);
281 for (i
= 0; i
< oprsz
; i
+= sizeof(vec16
)) {
282 *(vec16
*)(d
+ i
) = *(vec16
*)(a
+ i
) * *(vec16
*)(b
+ i
);
284 clear_high(d
, oprsz
, desc
);
287 void HELPER(gvec_mul32
)(void *d
, void *a
, void *b
, uint32_t desc
)
289 intptr_t oprsz
= simd_oprsz(desc
);
292 for (i
= 0; i
< oprsz
; i
+= sizeof(vec32
)) {
293 *(vec32
*)(d
+ i
) = *(vec32
*)(a
+ i
) * *(vec32
*)(b
+ i
);
295 clear_high(d
, oprsz
, desc
);
298 void HELPER(gvec_mul64
)(void *d
, void *a
, void *b
, uint32_t desc
)
300 intptr_t oprsz
= simd_oprsz(desc
);
303 for (i
= 0; i
< oprsz
; i
+= sizeof(vec64
)) {
304 *(vec64
*)(d
+ i
) = *(vec64
*)(a
+ i
) * *(vec64
*)(b
+ i
);
306 clear_high(d
, oprsz
, desc
);
309 void HELPER(gvec_muls8
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
311 intptr_t oprsz
= simd_oprsz(desc
);
312 vec8 vecb
= (vec8
)DUP16(b
);
315 for (i
= 0; i
< oprsz
; i
+= sizeof(vec8
)) {
316 *(vec8
*)(d
+ i
) = *(vec8
*)(a
+ i
) * vecb
;
318 clear_high(d
, oprsz
, desc
);
321 void HELPER(gvec_muls16
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
323 intptr_t oprsz
= simd_oprsz(desc
);
324 vec16 vecb
= (vec16
)DUP8(b
);
327 for (i
= 0; i
< oprsz
; i
+= sizeof(vec16
)) {
328 *(vec16
*)(d
+ i
) = *(vec16
*)(a
+ i
) * vecb
;
330 clear_high(d
, oprsz
, desc
);
333 void HELPER(gvec_muls32
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
335 intptr_t oprsz
= simd_oprsz(desc
);
336 vec32 vecb
= (vec32
)DUP4(b
);
339 for (i
= 0; i
< oprsz
; i
+= sizeof(vec32
)) {
340 *(vec32
*)(d
+ i
) = *(vec32
*)(a
+ i
) * vecb
;
342 clear_high(d
, oprsz
, desc
);
345 void HELPER(gvec_muls64
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
347 intptr_t oprsz
= simd_oprsz(desc
);
348 vec64 vecb
= (vec64
)DUP2(b
);
351 for (i
= 0; i
< oprsz
; i
+= sizeof(vec64
)) {
352 *(vec64
*)(d
+ i
) = *(vec64
*)(a
+ i
) * vecb
;
354 clear_high(d
, oprsz
, desc
);
357 void HELPER(gvec_neg8
)(void *d
, void *a
, uint32_t desc
)
359 intptr_t oprsz
= simd_oprsz(desc
);
362 for (i
= 0; i
< oprsz
; i
+= sizeof(vec8
)) {
363 *(vec8
*)(d
+ i
) = -*(vec8
*)(a
+ i
);
365 clear_high(d
, oprsz
, desc
);
368 void HELPER(gvec_neg16
)(void *d
, void *a
, uint32_t desc
)
370 intptr_t oprsz
= simd_oprsz(desc
);
373 for (i
= 0; i
< oprsz
; i
+= sizeof(vec16
)) {
374 *(vec16
*)(d
+ i
) = -*(vec16
*)(a
+ i
);
376 clear_high(d
, oprsz
, desc
);
379 void HELPER(gvec_neg32
)(void *d
, void *a
, uint32_t desc
)
381 intptr_t oprsz
= simd_oprsz(desc
);
384 for (i
= 0; i
< oprsz
; i
+= sizeof(vec32
)) {
385 *(vec32
*)(d
+ i
) = -*(vec32
*)(a
+ i
);
387 clear_high(d
, oprsz
, desc
);
390 void HELPER(gvec_neg64
)(void *d
, void *a
, uint32_t desc
)
392 intptr_t oprsz
= simd_oprsz(desc
);
395 for (i
= 0; i
< oprsz
; i
+= sizeof(vec64
)) {
396 *(vec64
*)(d
+ i
) = -*(vec64
*)(a
+ i
);
398 clear_high(d
, oprsz
, desc
);
401 void HELPER(gvec_mov
)(void *d
, void *a
, uint32_t desc
)
403 intptr_t oprsz
= simd_oprsz(desc
);
406 clear_high(d
, oprsz
, desc
);
409 void HELPER(gvec_dup64
)(void *d
, uint32_t desc
, uint64_t c
)
411 intptr_t oprsz
= simd_oprsz(desc
);
417 for (i
= 0; i
< oprsz
; i
+= sizeof(uint64_t)) {
418 *(uint64_t *)(d
+ i
) = c
;
421 clear_high(d
, oprsz
, desc
);
424 void HELPER(gvec_dup32
)(void *d
, uint32_t desc
, uint32_t c
)
426 intptr_t oprsz
= simd_oprsz(desc
);
432 for (i
= 0; i
< oprsz
; i
+= sizeof(uint32_t)) {
433 *(uint32_t *)(d
+ i
) = c
;
436 clear_high(d
, oprsz
, desc
);
439 void HELPER(gvec_dup16
)(void *d
, uint32_t desc
, uint32_t c
)
441 HELPER(gvec_dup32
)(d
, desc
, 0x00010001 * (c
& 0xffff));
444 void HELPER(gvec_dup8
)(void *d
, uint32_t desc
, uint32_t c
)
446 HELPER(gvec_dup32
)(d
, desc
, 0x01010101 * (c
& 0xff));
449 void HELPER(gvec_not
)(void *d
, void *a
, uint32_t desc
)
451 intptr_t oprsz
= simd_oprsz(desc
);
454 for (i
= 0; i
< oprsz
; i
+= sizeof(vec64
)) {
455 *(vec64
*)(d
+ i
) = ~*(vec64
*)(a
+ i
);
457 clear_high(d
, oprsz
, desc
);
460 void HELPER(gvec_and
)(void *d
, void *a
, void *b
, uint32_t desc
)
462 intptr_t oprsz
= simd_oprsz(desc
);
465 for (i
= 0; i
< oprsz
; i
+= sizeof(vec64
)) {
466 *(vec64
*)(d
+ i
) = *(vec64
*)(a
+ i
) & *(vec64
*)(b
+ i
);
468 clear_high(d
, oprsz
, desc
);
471 void HELPER(gvec_or
)(void *d
, void *a
, void *b
, uint32_t desc
)
473 intptr_t oprsz
= simd_oprsz(desc
);
476 for (i
= 0; i
< oprsz
; i
+= sizeof(vec64
)) {
477 *(vec64
*)(d
+ i
) = *(vec64
*)(a
+ i
) | *(vec64
*)(b
+ i
);
479 clear_high(d
, oprsz
, desc
);
482 void HELPER(gvec_xor
)(void *d
, void *a
, void *b
, uint32_t desc
)
484 intptr_t oprsz
= simd_oprsz(desc
);
487 for (i
= 0; i
< oprsz
; i
+= sizeof(vec64
)) {
488 *(vec64
*)(d
+ i
) = *(vec64
*)(a
+ i
) ^ *(vec64
*)(b
+ i
);
490 clear_high(d
, oprsz
, desc
);
493 void HELPER(gvec_andc
)(void *d
, void *a
, void *b
, uint32_t desc
)
495 intptr_t oprsz
= simd_oprsz(desc
);
498 for (i
= 0; i
< oprsz
; i
+= sizeof(vec64
)) {
499 *(vec64
*)(d
+ i
) = *(vec64
*)(a
+ i
) &~ *(vec64
*)(b
+ i
);
501 clear_high(d
, oprsz
, desc
);
504 void HELPER(gvec_orc
)(void *d
, void *a
, void *b
, uint32_t desc
)
506 intptr_t oprsz
= simd_oprsz(desc
);
509 for (i
= 0; i
< oprsz
; i
+= sizeof(vec64
)) {
510 *(vec64
*)(d
+ i
) = *(vec64
*)(a
+ i
) |~ *(vec64
*)(b
+ i
);
512 clear_high(d
, oprsz
, desc
);
515 void HELPER(gvec_nand
)(void *d
, void *a
, void *b
, uint32_t desc
)
517 intptr_t oprsz
= simd_oprsz(desc
);
520 for (i
= 0; i
< oprsz
; i
+= sizeof(vec64
)) {
521 *(vec64
*)(d
+ i
) = ~(*(vec64
*)(a
+ i
) & *(vec64
*)(b
+ i
));
523 clear_high(d
, oprsz
, desc
);
526 void HELPER(gvec_nor
)(void *d
, void *a
, void *b
, uint32_t desc
)
528 intptr_t oprsz
= simd_oprsz(desc
);
531 for (i
= 0; i
< oprsz
; i
+= sizeof(vec64
)) {
532 *(vec64
*)(d
+ i
) = ~(*(vec64
*)(a
+ i
) | *(vec64
*)(b
+ i
));
534 clear_high(d
, oprsz
, desc
);
537 void HELPER(gvec_eqv
)(void *d
, void *a
, void *b
, uint32_t desc
)
539 intptr_t oprsz
= simd_oprsz(desc
);
542 for (i
= 0; i
< oprsz
; i
+= sizeof(vec64
)) {
543 *(vec64
*)(d
+ i
) = ~(*(vec64
*)(a
+ i
) ^ *(vec64
*)(b
+ i
));
545 clear_high(d
, oprsz
, desc
);
548 void HELPER(gvec_ands
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
550 intptr_t oprsz
= simd_oprsz(desc
);
551 vec64 vecb
= (vec64
)DUP2(b
);
554 for (i
= 0; i
< oprsz
; i
+= sizeof(vec64
)) {
555 *(vec64
*)(d
+ i
) = *(vec64
*)(a
+ i
) & vecb
;
557 clear_high(d
, oprsz
, desc
);
560 void HELPER(gvec_xors
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
562 intptr_t oprsz
= simd_oprsz(desc
);
563 vec64 vecb
= (vec64
)DUP2(b
);
566 for (i
= 0; i
< oprsz
; i
+= sizeof(vec64
)) {
567 *(vec64
*)(d
+ i
) = *(vec64
*)(a
+ i
) ^ vecb
;
569 clear_high(d
, oprsz
, desc
);
572 void HELPER(gvec_ors
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
574 intptr_t oprsz
= simd_oprsz(desc
);
575 vec64 vecb
= (vec64
)DUP2(b
);
578 for (i
= 0; i
< oprsz
; i
+= sizeof(vec64
)) {
579 *(vec64
*)(d
+ i
) = *(vec64
*)(a
+ i
) | vecb
;
581 clear_high(d
, oprsz
, desc
);
584 void HELPER(gvec_shl8i
)(void *d
, void *a
, uint32_t desc
)
586 intptr_t oprsz
= simd_oprsz(desc
);
587 int shift
= simd_data(desc
);
590 for (i
= 0; i
< oprsz
; i
+= sizeof(vec8
)) {
591 *(vec8
*)(d
+ i
) = *(vec8
*)(a
+ i
) << shift
;
593 clear_high(d
, oprsz
, desc
);
596 void HELPER(gvec_shl16i
)(void *d
, void *a
, uint32_t desc
)
598 intptr_t oprsz
= simd_oprsz(desc
);
599 int shift
= simd_data(desc
);
602 for (i
= 0; i
< oprsz
; i
+= sizeof(vec16
)) {
603 *(vec16
*)(d
+ i
) = *(vec16
*)(a
+ i
) << shift
;
605 clear_high(d
, oprsz
, desc
);
608 void HELPER(gvec_shl32i
)(void *d
, void *a
, uint32_t desc
)
610 intptr_t oprsz
= simd_oprsz(desc
);
611 int shift
= simd_data(desc
);
614 for (i
= 0; i
< oprsz
; i
+= sizeof(vec32
)) {
615 *(vec32
*)(d
+ i
) = *(vec32
*)(a
+ i
) << shift
;
617 clear_high(d
, oprsz
, desc
);
620 void HELPER(gvec_shl64i
)(void *d
, void *a
, uint32_t desc
)
622 intptr_t oprsz
= simd_oprsz(desc
);
623 int shift
= simd_data(desc
);
626 for (i
= 0; i
< oprsz
; i
+= sizeof(vec64
)) {
627 *(vec64
*)(d
+ i
) = *(vec64
*)(a
+ i
) << shift
;
629 clear_high(d
, oprsz
, desc
);
632 void HELPER(gvec_shr8i
)(void *d
, void *a
, uint32_t desc
)
634 intptr_t oprsz
= simd_oprsz(desc
);
635 int shift
= simd_data(desc
);
638 for (i
= 0; i
< oprsz
; i
+= sizeof(vec8
)) {
639 *(vec8
*)(d
+ i
) = *(vec8
*)(a
+ i
) >> shift
;
641 clear_high(d
, oprsz
, desc
);
644 void HELPER(gvec_shr16i
)(void *d
, void *a
, uint32_t desc
)
646 intptr_t oprsz
= simd_oprsz(desc
);
647 int shift
= simd_data(desc
);
650 for (i
= 0; i
< oprsz
; i
+= sizeof(vec16
)) {
651 *(vec16
*)(d
+ i
) = *(vec16
*)(a
+ i
) >> shift
;
653 clear_high(d
, oprsz
, desc
);
656 void HELPER(gvec_shr32i
)(void *d
, void *a
, uint32_t desc
)
658 intptr_t oprsz
= simd_oprsz(desc
);
659 int shift
= simd_data(desc
);
662 for (i
= 0; i
< oprsz
; i
+= sizeof(vec32
)) {
663 *(vec32
*)(d
+ i
) = *(vec32
*)(a
+ i
) >> shift
;
665 clear_high(d
, oprsz
, desc
);
668 void HELPER(gvec_shr64i
)(void *d
, void *a
, uint32_t desc
)
670 intptr_t oprsz
= simd_oprsz(desc
);
671 int shift
= simd_data(desc
);
674 for (i
= 0; i
< oprsz
; i
+= sizeof(vec64
)) {
675 *(vec64
*)(d
+ i
) = *(vec64
*)(a
+ i
) >> shift
;
677 clear_high(d
, oprsz
, desc
);
680 void HELPER(gvec_sar8i
)(void *d
, void *a
, uint32_t desc
)
682 intptr_t oprsz
= simd_oprsz(desc
);
683 int shift
= simd_data(desc
);
686 for (i
= 0; i
< oprsz
; i
+= sizeof(vec8
)) {
687 *(svec8
*)(d
+ i
) = *(svec8
*)(a
+ i
) >> shift
;
689 clear_high(d
, oprsz
, desc
);
692 void HELPER(gvec_sar16i
)(void *d
, void *a
, uint32_t desc
)
694 intptr_t oprsz
= simd_oprsz(desc
);
695 int shift
= simd_data(desc
);
698 for (i
= 0; i
< oprsz
; i
+= sizeof(vec16
)) {
699 *(svec16
*)(d
+ i
) = *(svec16
*)(a
+ i
) >> shift
;
701 clear_high(d
, oprsz
, desc
);
704 void HELPER(gvec_sar32i
)(void *d
, void *a
, uint32_t desc
)
706 intptr_t oprsz
= simd_oprsz(desc
);
707 int shift
= simd_data(desc
);
710 for (i
= 0; i
< oprsz
; i
+= sizeof(vec32
)) {
711 *(svec32
*)(d
+ i
) = *(svec32
*)(a
+ i
) >> shift
;
713 clear_high(d
, oprsz
, desc
);
716 void HELPER(gvec_sar64i
)(void *d
, void *a
, uint32_t desc
)
718 intptr_t oprsz
= simd_oprsz(desc
);
719 int shift
= simd_data(desc
);
722 for (i
= 0; i
< oprsz
; i
+= sizeof(vec64
)) {
723 *(svec64
*)(d
+ i
) = *(svec64
*)(a
+ i
) >> shift
;
725 clear_high(d
, oprsz
, desc
);
/* If vectors are enabled, the compiler fills in -1 for true.
   Otherwise, we must take care of this by hand.  */
#ifdef CONFIG_VECTOR16
# define DO_CMP0(X)  (X)
#else
/* A scalar comparison yields 0 or 1; negate it to get 0 or all-ones. */
# define DO_CMP0(X)  -(X)
#endif

/*
 * DO_CMP1(NAME, TYPE, OP): define helper NAME computing d = (a OP b) with
 * both operands accessed as TYPE, TYPE-sized step per iteration.  The
 * comparison result goes through DO_CMP0 so that "true" is stored as -1.
 * The tail of d beyond simd_oprsz(desc) is handed to clear_high().
 */
#define DO_CMP1(NAME, TYPE, OP)                                            \
void HELPER(NAME)(void *d, void *a, void *b, uint32_t desc)                \
{                                                                          \
    intptr_t oprsz = simd_oprsz(desc);                                     \
    intptr_t i;                                                            \
    for (i = 0; i < oprsz; i += sizeof(TYPE)) {                            \
        *(TYPE *)(d + i) = DO_CMP0(*(TYPE *)(a + i) OP *(TYPE *)(b + i));  \
    }                                                                      \
    clear_high(d, oprsz, desc);                                            \
}

/*
 * DO_CMP2(SZ): define the six comparison helpers for element size SZ.
 * eq/ne and the unsigned ltu/leu use vec##SZ; the signed lt/le use svec##SZ.
 *
 * NOTE(review): the DO_CMP2(...) instantiations are not visible in the text
 * this block was restored from; confirm that they follow these definitions.
 */
#define DO_CMP2(SZ) \
    DO_CMP1(gvec_eq##SZ, vec##SZ, ==)    \
    DO_CMP1(gvec_ne##SZ, vec##SZ, !=)    \
    DO_CMP1(gvec_lt##SZ, svec##SZ, <)    \
    DO_CMP1(gvec_le##SZ, svec##SZ, <=)   \
    DO_CMP1(gvec_ltu##SZ, vec##SZ, <)    \
    DO_CMP1(gvec_leu##SZ, vec##SZ, <=)
764 void HELPER(gvec_ssadd8
)(void *d
, void *a
, void *b
, uint32_t desc
)
766 intptr_t oprsz
= simd_oprsz(desc
);
769 for (i
= 0; i
< oprsz
; i
+= sizeof(int8_t)) {
770 int r
= *(int8_t *)(a
+ i
) + *(int8_t *)(b
+ i
);
773 } else if (r
< INT8_MIN
) {
776 *(int8_t *)(d
+ i
) = r
;
778 clear_high(d
, oprsz
, desc
);
781 void HELPER(gvec_ssadd16
)(void *d
, void *a
, void *b
, uint32_t desc
)
783 intptr_t oprsz
= simd_oprsz(desc
);
786 for (i
= 0; i
< oprsz
; i
+= sizeof(int16_t)) {
787 int r
= *(int16_t *)(a
+ i
) + *(int16_t *)(b
+ i
);
790 } else if (r
< INT16_MIN
) {
793 *(int16_t *)(d
+ i
) = r
;
795 clear_high(d
, oprsz
, desc
);
798 void HELPER(gvec_ssadd32
)(void *d
, void *a
, void *b
, uint32_t desc
)
800 intptr_t oprsz
= simd_oprsz(desc
);
803 for (i
= 0; i
< oprsz
; i
+= sizeof(int32_t)) {
804 int32_t ai
= *(int32_t *)(a
+ i
);
805 int32_t bi
= *(int32_t *)(b
+ i
);
806 int32_t di
= ai
+ bi
;
807 if (((di
^ ai
) &~ (ai
^ bi
)) < 0) {
808 /* Signed overflow. */
809 di
= (di
< 0 ? INT32_MAX
: INT32_MIN
);
811 *(int32_t *)(d
+ i
) = di
;
813 clear_high(d
, oprsz
, desc
);
816 void HELPER(gvec_ssadd64
)(void *d
, void *a
, void *b
, uint32_t desc
)
818 intptr_t oprsz
= simd_oprsz(desc
);
821 for (i
= 0; i
< oprsz
; i
+= sizeof(int64_t)) {
822 int64_t ai
= *(int64_t *)(a
+ i
);
823 int64_t bi
= *(int64_t *)(b
+ i
);
824 int64_t di
= ai
+ bi
;
825 if (((di
^ ai
) &~ (ai
^ bi
)) < 0) {
826 /* Signed overflow. */
827 di
= (di
< 0 ? INT64_MAX
: INT64_MIN
);
829 *(int64_t *)(d
+ i
) = di
;
831 clear_high(d
, oprsz
, desc
);
834 void HELPER(gvec_sssub8
)(void *d
, void *a
, void *b
, uint32_t desc
)
836 intptr_t oprsz
= simd_oprsz(desc
);
839 for (i
= 0; i
< oprsz
; i
+= sizeof(uint8_t)) {
840 int r
= *(int8_t *)(a
+ i
) - *(int8_t *)(b
+ i
);
843 } else if (r
< INT8_MIN
) {
846 *(uint8_t *)(d
+ i
) = r
;
848 clear_high(d
, oprsz
, desc
);
851 void HELPER(gvec_sssub16
)(void *d
, void *a
, void *b
, uint32_t desc
)
853 intptr_t oprsz
= simd_oprsz(desc
);
856 for (i
= 0; i
< oprsz
; i
+= sizeof(int16_t)) {
857 int r
= *(int16_t *)(a
+ i
) - *(int16_t *)(b
+ i
);
860 } else if (r
< INT16_MIN
) {
863 *(int16_t *)(d
+ i
) = r
;
865 clear_high(d
, oprsz
, desc
);
868 void HELPER(gvec_sssub32
)(void *d
, void *a
, void *b
, uint32_t desc
)
870 intptr_t oprsz
= simd_oprsz(desc
);
873 for (i
= 0; i
< oprsz
; i
+= sizeof(int32_t)) {
874 int32_t ai
= *(int32_t *)(a
+ i
);
875 int32_t bi
= *(int32_t *)(b
+ i
);
876 int32_t di
= ai
- bi
;
877 if (((di
^ ai
) & (ai
^ bi
)) < 0) {
878 /* Signed overflow. */
879 di
= (di
< 0 ? INT32_MAX
: INT32_MIN
);
881 *(int32_t *)(d
+ i
) = di
;
883 clear_high(d
, oprsz
, desc
);
886 void HELPER(gvec_sssub64
)(void *d
, void *a
, void *b
, uint32_t desc
)
888 intptr_t oprsz
= simd_oprsz(desc
);
891 for (i
= 0; i
< oprsz
; i
+= sizeof(int64_t)) {
892 int64_t ai
= *(int64_t *)(a
+ i
);
893 int64_t bi
= *(int64_t *)(b
+ i
);
894 int64_t di
= ai
- bi
;
895 if (((di
^ ai
) & (ai
^ bi
)) < 0) {
896 /* Signed overflow. */
897 di
= (di
< 0 ? INT64_MAX
: INT64_MIN
);
899 *(int64_t *)(d
+ i
) = di
;
901 clear_high(d
, oprsz
, desc
);
904 void HELPER(gvec_usadd8
)(void *d
, void *a
, void *b
, uint32_t desc
)
906 intptr_t oprsz
= simd_oprsz(desc
);
909 for (i
= 0; i
< oprsz
; i
+= sizeof(uint8_t)) {
910 unsigned r
= *(uint8_t *)(a
+ i
) + *(uint8_t *)(b
+ i
);
914 *(uint8_t *)(d
+ i
) = r
;
916 clear_high(d
, oprsz
, desc
);
919 void HELPER(gvec_usadd16
)(void *d
, void *a
, void *b
, uint32_t desc
)
921 intptr_t oprsz
= simd_oprsz(desc
);
924 for (i
= 0; i
< oprsz
; i
+= sizeof(uint16_t)) {
925 unsigned r
= *(uint16_t *)(a
+ i
) + *(uint16_t *)(b
+ i
);
926 if (r
> UINT16_MAX
) {
929 *(uint16_t *)(d
+ i
) = r
;
931 clear_high(d
, oprsz
, desc
);
934 void HELPER(gvec_usadd32
)(void *d
, void *a
, void *b
, uint32_t desc
)
936 intptr_t oprsz
= simd_oprsz(desc
);
939 for (i
= 0; i
< oprsz
; i
+= sizeof(uint32_t)) {
940 uint32_t ai
= *(uint32_t *)(a
+ i
);
941 uint32_t bi
= *(uint32_t *)(b
+ i
);
942 uint32_t di
= ai
+ bi
;
946 *(uint32_t *)(d
+ i
) = di
;
948 clear_high(d
, oprsz
, desc
);
951 void HELPER(gvec_usadd64
)(void *d
, void *a
, void *b
, uint32_t desc
)
953 intptr_t oprsz
= simd_oprsz(desc
);
956 for (i
= 0; i
< oprsz
; i
+= sizeof(uint64_t)) {
957 uint64_t ai
= *(uint64_t *)(a
+ i
);
958 uint64_t bi
= *(uint64_t *)(b
+ i
);
959 uint64_t di
= ai
+ bi
;
963 *(uint64_t *)(d
+ i
) = di
;
965 clear_high(d
, oprsz
, desc
);
968 void HELPER(gvec_ussub8
)(void *d
, void *a
, void *b
, uint32_t desc
)
970 intptr_t oprsz
= simd_oprsz(desc
);
973 for (i
= 0; i
< oprsz
; i
+= sizeof(uint8_t)) {
974 int r
= *(uint8_t *)(a
+ i
) - *(uint8_t *)(b
+ i
);
978 *(uint8_t *)(d
+ i
) = r
;
980 clear_high(d
, oprsz
, desc
);
983 void HELPER(gvec_ussub16
)(void *d
, void *a
, void *b
, uint32_t desc
)
985 intptr_t oprsz
= simd_oprsz(desc
);
988 for (i
= 0; i
< oprsz
; i
+= sizeof(uint16_t)) {
989 int r
= *(uint16_t *)(a
+ i
) - *(uint16_t *)(b
+ i
);
993 *(uint16_t *)(d
+ i
) = r
;
995 clear_high(d
, oprsz
, desc
);
998 void HELPER(gvec_ussub32
)(void *d
, void *a
, void *b
, uint32_t desc
)
1000 intptr_t oprsz
= simd_oprsz(desc
);
1003 for (i
= 0; i
< oprsz
; i
+= sizeof(uint32_t)) {
1004 uint32_t ai
= *(uint32_t *)(a
+ i
);
1005 uint32_t bi
= *(uint32_t *)(b
+ i
);
1006 uint32_t di
= ai
- bi
;
1010 *(uint32_t *)(d
+ i
) = di
;
1012 clear_high(d
, oprsz
, desc
);
1015 void HELPER(gvec_ussub64
)(void *d
, void *a
, void *b
, uint32_t desc
)
1017 intptr_t oprsz
= simd_oprsz(desc
);
1020 for (i
= 0; i
< oprsz
; i
+= sizeof(uint64_t)) {
1021 uint64_t ai
= *(uint64_t *)(a
+ i
);
1022 uint64_t bi
= *(uint64_t *)(b
+ i
);
1023 uint64_t di
= ai
- bi
;
1027 *(uint64_t *)(d
+ i
) = di
;
1029 clear_high(d
, oprsz
, desc
);
1032 void HELPER(gvec_smin8
)(void *d
, void *a
, void *b
, uint32_t desc
)
1034 intptr_t oprsz
= simd_oprsz(desc
);
1037 for (i
= 0; i
< oprsz
; i
+= sizeof(int8_t)) {
1038 int8_t aa
= *(int8_t *)(a
+ i
);
1039 int8_t bb
= *(int8_t *)(b
+ i
);
1040 int8_t dd
= aa
< bb
? aa
: bb
;
1041 *(int8_t *)(d
+ i
) = dd
;
1043 clear_high(d
, oprsz
, desc
);
1046 void HELPER(gvec_smin16
)(void *d
, void *a
, void *b
, uint32_t desc
)
1048 intptr_t oprsz
= simd_oprsz(desc
);
1051 for (i
= 0; i
< oprsz
; i
+= sizeof(int16_t)) {
1052 int16_t aa
= *(int16_t *)(a
+ i
);
1053 int16_t bb
= *(int16_t *)(b
+ i
);
1054 int16_t dd
= aa
< bb
? aa
: bb
;
1055 *(int16_t *)(d
+ i
) = dd
;
1057 clear_high(d
, oprsz
, desc
);
1060 void HELPER(gvec_smin32
)(void *d
, void *a
, void *b
, uint32_t desc
)
1062 intptr_t oprsz
= simd_oprsz(desc
);
1065 for (i
= 0; i
< oprsz
; i
+= sizeof(int32_t)) {
1066 int32_t aa
= *(int32_t *)(a
+ i
);
1067 int32_t bb
= *(int32_t *)(b
+ i
);
1068 int32_t dd
= aa
< bb
? aa
: bb
;
1069 *(int32_t *)(d
+ i
) = dd
;
1071 clear_high(d
, oprsz
, desc
);
1074 void HELPER(gvec_smin64
)(void *d
, void *a
, void *b
, uint32_t desc
)
1076 intptr_t oprsz
= simd_oprsz(desc
);
1079 for (i
= 0; i
< oprsz
; i
+= sizeof(int64_t)) {
1080 int64_t aa
= *(int64_t *)(a
+ i
);
1081 int64_t bb
= *(int64_t *)(b
+ i
);
1082 int64_t dd
= aa
< bb
? aa
: bb
;
1083 *(int64_t *)(d
+ i
) = dd
;
1085 clear_high(d
, oprsz
, desc
);
1088 void HELPER(gvec_smax8
)(void *d
, void *a
, void *b
, uint32_t desc
)
1090 intptr_t oprsz
= simd_oprsz(desc
);
1093 for (i
= 0; i
< oprsz
; i
+= sizeof(int8_t)) {
1094 int8_t aa
= *(int8_t *)(a
+ i
);
1095 int8_t bb
= *(int8_t *)(b
+ i
);
1096 int8_t dd
= aa
> bb
? aa
: bb
;
1097 *(int8_t *)(d
+ i
) = dd
;
1099 clear_high(d
, oprsz
, desc
);
1102 void HELPER(gvec_smax16
)(void *d
, void *a
, void *b
, uint32_t desc
)
1104 intptr_t oprsz
= simd_oprsz(desc
);
1107 for (i
= 0; i
< oprsz
; i
+= sizeof(int16_t)) {
1108 int16_t aa
= *(int16_t *)(a
+ i
);
1109 int16_t bb
= *(int16_t *)(b
+ i
);
1110 int16_t dd
= aa
> bb
? aa
: bb
;
1111 *(int16_t *)(d
+ i
) = dd
;
1113 clear_high(d
, oprsz
, desc
);
1116 void HELPER(gvec_smax32
)(void *d
, void *a
, void *b
, uint32_t desc
)
1118 intptr_t oprsz
= simd_oprsz(desc
);
1121 for (i
= 0; i
< oprsz
; i
+= sizeof(int32_t)) {
1122 int32_t aa
= *(int32_t *)(a
+ i
);
1123 int32_t bb
= *(int32_t *)(b
+ i
);
1124 int32_t dd
= aa
> bb
? aa
: bb
;
1125 *(int32_t *)(d
+ i
) = dd
;
1127 clear_high(d
, oprsz
, desc
);
1130 void HELPER(gvec_smax64
)(void *d
, void *a
, void *b
, uint32_t desc
)
1132 intptr_t oprsz
= simd_oprsz(desc
);
1135 for (i
= 0; i
< oprsz
; i
+= sizeof(int64_t)) {
1136 int64_t aa
= *(int64_t *)(a
+ i
);
1137 int64_t bb
= *(int64_t *)(b
+ i
);
1138 int64_t dd
= aa
> bb
? aa
: bb
;
1139 *(int64_t *)(d
+ i
) = dd
;
1141 clear_high(d
, oprsz
, desc
);
1144 void HELPER(gvec_umin8
)(void *d
, void *a
, void *b
, uint32_t desc
)
1146 intptr_t oprsz
= simd_oprsz(desc
);
1149 for (i
= 0; i
< oprsz
; i
+= sizeof(uint8_t)) {
1150 uint8_t aa
= *(uint8_t *)(a
+ i
);
1151 uint8_t bb
= *(uint8_t *)(b
+ i
);
1152 uint8_t dd
= aa
< bb
? aa
: bb
;
1153 *(uint8_t *)(d
+ i
) = dd
;
1155 clear_high(d
, oprsz
, desc
);
1158 void HELPER(gvec_umin16
)(void *d
, void *a
, void *b
, uint32_t desc
)
1160 intptr_t oprsz
= simd_oprsz(desc
);
1163 for (i
= 0; i
< oprsz
; i
+= sizeof(uint16_t)) {
1164 uint16_t aa
= *(uint16_t *)(a
+ i
);
1165 uint16_t bb
= *(uint16_t *)(b
+ i
);
1166 uint16_t dd
= aa
< bb
? aa
: bb
;
1167 *(uint16_t *)(d
+ i
) = dd
;
1169 clear_high(d
, oprsz
, desc
);
1172 void HELPER(gvec_umin32
)(void *d
, void *a
, void *b
, uint32_t desc
)
1174 intptr_t oprsz
= simd_oprsz(desc
);
1177 for (i
= 0; i
< oprsz
; i
+= sizeof(uint32_t)) {
1178 uint32_t aa
= *(uint32_t *)(a
+ i
);
1179 uint32_t bb
= *(uint32_t *)(b
+ i
);
1180 uint32_t dd
= aa
< bb
? aa
: bb
;
1181 *(uint32_t *)(d
+ i
) = dd
;
1183 clear_high(d
, oprsz
, desc
);
1186 void HELPER(gvec_umin64
)(void *d
, void *a
, void *b
, uint32_t desc
)
1188 intptr_t oprsz
= simd_oprsz(desc
);
1191 for (i
= 0; i
< oprsz
; i
+= sizeof(uint64_t)) {
1192 uint64_t aa
= *(uint64_t *)(a
+ i
);
1193 uint64_t bb
= *(uint64_t *)(b
+ i
);
1194 uint64_t dd
= aa
< bb
? aa
: bb
;
1195 *(uint64_t *)(d
+ i
) = dd
;
1197 clear_high(d
, oprsz
, desc
);
1200 void HELPER(gvec_umax8
)(void *d
, void *a
, void *b
, uint32_t desc
)
1202 intptr_t oprsz
= simd_oprsz(desc
);
1205 for (i
= 0; i
< oprsz
; i
+= sizeof(uint8_t)) {
1206 uint8_t aa
= *(uint8_t *)(a
+ i
);
1207 uint8_t bb
= *(uint8_t *)(b
+ i
);
1208 uint8_t dd
= aa
> bb
? aa
: bb
;
1209 *(uint8_t *)(d
+ i
) = dd
;
1211 clear_high(d
, oprsz
, desc
);
1214 void HELPER(gvec_umax16
)(void *d
, void *a
, void *b
, uint32_t desc
)
1216 intptr_t oprsz
= simd_oprsz(desc
);
1219 for (i
= 0; i
< oprsz
; i
+= sizeof(uint16_t)) {
1220 uint16_t aa
= *(uint16_t *)(a
+ i
);
1221 uint16_t bb
= *(uint16_t *)(b
+ i
);
1222 uint16_t dd
= aa
> bb
? aa
: bb
;
1223 *(uint16_t *)(d
+ i
) = dd
;
1225 clear_high(d
, oprsz
, desc
);
1228 void HELPER(gvec_umax32
)(void *d
, void *a
, void *b
, uint32_t desc
)
1230 intptr_t oprsz
= simd_oprsz(desc
);
1233 for (i
= 0; i
< oprsz
; i
+= sizeof(uint32_t)) {
1234 uint32_t aa
= *(uint32_t *)(a
+ i
);
1235 uint32_t bb
= *(uint32_t *)(b
+ i
);
1236 uint32_t dd
= aa
> bb
? aa
: bb
;
1237 *(uint32_t *)(d
+ i
) = dd
;
1239 clear_high(d
, oprsz
, desc
);
1242 void HELPER(gvec_umax64
)(void *d
, void *a
, void *b
, uint32_t desc
)
1244 intptr_t oprsz
= simd_oprsz(desc
);
1247 for (i
= 0; i
< oprsz
; i
+= sizeof(uint64_t)) {
1248 uint64_t aa
= *(uint64_t *)(a
+ i
);
1249 uint64_t bb
= *(uint64_t *)(b
+ i
);
1250 uint64_t dd
= aa
> bb
? aa
: bb
;
1251 *(uint64_t *)(d
+ i
) = dd
;
1253 clear_high(d
, oprsz
, desc
);