/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
20 #include "qemu/osdep.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "vec_internal.h"
/*
 * Note that vector data is stored in host-endian 64-bit chunks,
 * so addressing units smaller than that needs a host-endian fixup.
 * On little-endian hosts the sub-word index is used unchanged; on
 * big-endian hosts it is xor'ed to flip the position within the chunk.
 */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x)  ((x) ^ 7)
#define H2(x)  ((x) ^ 3)
#define H4(x)  ((x) ^ 1)
#else
#define H1(x)  (x)
#define H2(x)  (x)
#define H4(x)  (x)
#endif
/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    /* Saturate to the signed 8-bit range. */
    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}
62 void HELPER(sve2_sqrdmlah_b
)(void *vd
, void *vn
, void *vm
,
63 void *va
, uint32_t desc
)
65 intptr_t i
, opr_sz
= simd_oprsz(desc
);
66 int8_t *d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
68 for (i
= 0; i
< opr_sz
; ++i
) {
69 d
[i
] = do_sqrdmlah_b(n
[i
], m
[i
], a
[i
], false, true);
73 void HELPER(sve2_sqrdmlsh_b
)(void *vd
, void *vn
, void *vm
,
74 void *va
, uint32_t desc
)
76 intptr_t i
, opr_sz
= simd_oprsz(desc
);
77 int8_t *d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
79 for (i
= 0; i
< opr_sz
; ++i
) {
80 d
[i
] = do_sqrdmlah_b(n
[i
], m
[i
], a
[i
], true, true);
84 void HELPER(sve2_sqdmulh_b
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
86 intptr_t i
, opr_sz
= simd_oprsz(desc
);
87 int8_t *d
= vd
, *n
= vn
, *m
= vm
;
89 for (i
= 0; i
< opr_sz
; ++i
) {
90 d
[i
] = do_sqrdmlah_b(n
[i
], m
[i
], 0, false, false);
94 void HELPER(sve2_sqrdmulh_b
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
96 intptr_t i
, opr_sz
= simd_oprsz(desc
);
97 int8_t *d
= vd
, *n
= vn
, *m
= vm
;
99 for (i
= 0; i
< opr_sz
; ++i
) {
100 d
[i
] = do_sqrdmlah_b(n
[i
], m
[i
], 0, false, true);
/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    /* Saturate, recording the event in the cumulative-saturation flag. */
    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}
123 uint32_t HELPER(neon_qrdmlah_s16
)(CPUARMState
*env
, uint32_t src1
,
124 uint32_t src2
, uint32_t src3
)
126 uint32_t *sat
= &env
->vfp
.qc
[0];
127 uint16_t e1
= do_sqrdmlah_h(src1
, src2
, src3
, false, true, sat
);
128 uint16_t e2
= do_sqrdmlah_h(src1
>> 16, src2
>> 16, src3
>> 16,
130 return deposit32(e1
, 16, 16, e2
);
133 void HELPER(gvec_qrdmlah_s16
)(void *vd
, void *vn
, void *vm
,
134 void *vq
, uint32_t desc
)
136 uintptr_t opr_sz
= simd_oprsz(desc
);
142 for (i
= 0; i
< opr_sz
/ 2; ++i
) {
143 d
[i
] = do_sqrdmlah_h(n
[i
], m
[i
], d
[i
], false, true, vq
);
145 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
148 uint32_t HELPER(neon_qrdmlsh_s16
)(CPUARMState
*env
, uint32_t src1
,
149 uint32_t src2
, uint32_t src3
)
151 uint32_t *sat
= &env
->vfp
.qc
[0];
152 uint16_t e1
= do_sqrdmlah_h(src1
, src2
, src3
, true, true, sat
);
153 uint16_t e2
= do_sqrdmlah_h(src1
>> 16, src2
>> 16, src3
>> 16,
155 return deposit32(e1
, 16, 16, e2
);
158 void HELPER(gvec_qrdmlsh_s16
)(void *vd
, void *vn
, void *vm
,
159 void *vq
, uint32_t desc
)
161 uintptr_t opr_sz
= simd_oprsz(desc
);
167 for (i
= 0; i
< opr_sz
/ 2; ++i
) {
168 d
[i
] = do_sqrdmlah_h(n
[i
], m
[i
], d
[i
], true, true, vq
);
170 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
173 void HELPER(neon_sqdmulh_h
)(void *vd
, void *vn
, void *vm
,
174 void *vq
, uint32_t desc
)
176 intptr_t i
, opr_sz
= simd_oprsz(desc
);
177 int16_t *d
= vd
, *n
= vn
, *m
= vm
;
179 for (i
= 0; i
< opr_sz
/ 2; ++i
) {
180 d
[i
] = do_sqrdmlah_h(n
[i
], m
[i
], 0, false, false, vq
);
182 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
185 void HELPER(neon_sqrdmulh_h
)(void *vd
, void *vn
, void *vm
,
186 void *vq
, uint32_t desc
)
188 intptr_t i
, opr_sz
= simd_oprsz(desc
);
189 int16_t *d
= vd
, *n
= vn
, *m
= vm
;
191 for (i
= 0; i
< opr_sz
/ 2; ++i
) {
192 d
[i
] = do_sqrdmlah_h(n
[i
], m
[i
], 0, false, true, vq
);
194 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
197 void HELPER(sve2_sqrdmlah_h
)(void *vd
, void *vn
, void *vm
,
198 void *va
, uint32_t desc
)
200 intptr_t i
, opr_sz
= simd_oprsz(desc
);
201 int16_t *d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
204 for (i
= 0; i
< opr_sz
/ 2; ++i
) {
205 d
[i
] = do_sqrdmlah_h(n
[i
], m
[i
], a
[i
], false, true, &discard
);
209 void HELPER(sve2_sqrdmlsh_h
)(void *vd
, void *vn
, void *vm
,
210 void *va
, uint32_t desc
)
212 intptr_t i
, opr_sz
= simd_oprsz(desc
);
213 int16_t *d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
216 for (i
= 0; i
< opr_sz
/ 2; ++i
) {
217 d
[i
] = do_sqrdmlah_h(n
[i
], m
[i
], a
[i
], true, true, &discard
);
221 void HELPER(sve2_sqdmulh_h
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
223 intptr_t i
, opr_sz
= simd_oprsz(desc
);
224 int16_t *d
= vd
, *n
= vn
, *m
= vm
;
227 for (i
= 0; i
< opr_sz
/ 2; ++i
) {
228 d
[i
] = do_sqrdmlah_h(n
[i
], m
[i
], 0, false, false, &discard
);
232 void HELPER(sve2_sqrdmulh_h
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
234 intptr_t i
, opr_sz
= simd_oprsz(desc
);
235 int16_t *d
= vd
, *n
= vn
, *m
= vm
;
238 for (i
= 0; i
< opr_sz
/ 2; ++i
) {
239 d
[i
] = do_sqrdmlah_h(n
[i
], m
[i
], 0, false, true, &discard
);
/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    /* Saturate, recording the event in the cumulative-saturation flag. */
    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}
262 uint32_t HELPER(neon_qrdmlah_s32
)(CPUARMState
*env
, int32_t src1
,
263 int32_t src2
, int32_t src3
)
265 uint32_t *sat
= &env
->vfp
.qc
[0];
266 return do_sqrdmlah_s(src1
, src2
, src3
, false, true, sat
);
269 void HELPER(gvec_qrdmlah_s32
)(void *vd
, void *vn
, void *vm
,
270 void *vq
, uint32_t desc
)
272 uintptr_t opr_sz
= simd_oprsz(desc
);
278 for (i
= 0; i
< opr_sz
/ 4; ++i
) {
279 d
[i
] = do_sqrdmlah_s(n
[i
], m
[i
], d
[i
], false, true, vq
);
281 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
284 uint32_t HELPER(neon_qrdmlsh_s32
)(CPUARMState
*env
, int32_t src1
,
285 int32_t src2
, int32_t src3
)
287 uint32_t *sat
= &env
->vfp
.qc
[0];
288 return do_sqrdmlah_s(src1
, src2
, src3
, true, true, sat
);
291 void HELPER(gvec_qrdmlsh_s32
)(void *vd
, void *vn
, void *vm
,
292 void *vq
, uint32_t desc
)
294 uintptr_t opr_sz
= simd_oprsz(desc
);
300 for (i
= 0; i
< opr_sz
/ 4; ++i
) {
301 d
[i
] = do_sqrdmlah_s(n
[i
], m
[i
], d
[i
], true, true, vq
);
303 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
306 void HELPER(neon_sqdmulh_s
)(void *vd
, void *vn
, void *vm
,
307 void *vq
, uint32_t desc
)
309 intptr_t i
, opr_sz
= simd_oprsz(desc
);
310 int32_t *d
= vd
, *n
= vn
, *m
= vm
;
312 for (i
= 0; i
< opr_sz
/ 4; ++i
) {
313 d
[i
] = do_sqrdmlah_s(n
[i
], m
[i
], 0, false, false, vq
);
315 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
318 void HELPER(neon_sqrdmulh_s
)(void *vd
, void *vn
, void *vm
,
319 void *vq
, uint32_t desc
)
321 intptr_t i
, opr_sz
= simd_oprsz(desc
);
322 int32_t *d
= vd
, *n
= vn
, *m
= vm
;
324 for (i
= 0; i
< opr_sz
/ 4; ++i
) {
325 d
[i
] = do_sqrdmlah_s(n
[i
], m
[i
], 0, false, true, vq
);
327 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
330 void HELPER(sve2_sqrdmlah_s
)(void *vd
, void *vn
, void *vm
,
331 void *va
, uint32_t desc
)
333 intptr_t i
, opr_sz
= simd_oprsz(desc
);
334 int32_t *d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
337 for (i
= 0; i
< opr_sz
/ 4; ++i
) {
338 d
[i
] = do_sqrdmlah_s(n
[i
], m
[i
], a
[i
], false, true, &discard
);
342 void HELPER(sve2_sqrdmlsh_s
)(void *vd
, void *vn
, void *vm
,
343 void *va
, uint32_t desc
)
345 intptr_t i
, opr_sz
= simd_oprsz(desc
);
346 int32_t *d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
349 for (i
= 0; i
< opr_sz
/ 4; ++i
) {
350 d
[i
] = do_sqrdmlah_s(n
[i
], m
[i
], a
[i
], true, true, &discard
);
354 void HELPER(sve2_sqdmulh_s
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
356 intptr_t i
, opr_sz
= simd_oprsz(desc
);
357 int32_t *d
= vd
, *n
= vn
, *m
= vm
;
360 for (i
= 0; i
< opr_sz
/ 4; ++i
) {
361 d
[i
] = do_sqrdmlah_s(n
[i
], m
[i
], 0, false, false, &discard
);
365 void HELPER(sve2_sqrdmulh_s
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
367 intptr_t i
, opr_sz
= simd_oprsz(desc
);
368 int32_t *d
= vd
, *n
= vn
, *m
= vm
;
371 for (i
= 0; i
< opr_sz
/ 4; ++i
) {
372 d
[i
] = do_sqrdmlah_s(n
[i
], m
[i
], 0, false, true, &discard
);
376 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
377 static int64_t do_sat128_d(Int128 r
)
379 int64_t ls
= int128_getlo(r
);
380 int64_t hs
= int128_gethi(r
);
382 if (unlikely(hs
!= (ls
>> 63))) {
383 return hs
< 0 ? INT64_MIN
: INT64_MAX
;
388 int64_t do_sqrdmlah_d(int64_t n
, int64_t m
, int64_t a
, bool neg
, bool round
)
393 /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
394 muls64(&l
, &h
, m
, n
);
395 r
= int128_make128(l
, h
);
400 t
= int128_exts64(a
);
401 t
= int128_lshift(t
, 63);
402 r
= int128_add(r
, t
);
405 t
= int128_exts64(1ll << 62);
406 r
= int128_add(r
, t
);
408 r
= int128_rshift(r
, 63);
410 return do_sat128_d(r
);
413 void HELPER(sve2_sqrdmlah_d
)(void *vd
, void *vn
, void *vm
,
414 void *va
, uint32_t desc
)
416 intptr_t i
, opr_sz
= simd_oprsz(desc
);
417 int64_t *d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
419 for (i
= 0; i
< opr_sz
/ 8; ++i
) {
420 d
[i
] = do_sqrdmlah_d(n
[i
], m
[i
], a
[i
], false, true);
424 void HELPER(sve2_sqrdmlsh_d
)(void *vd
, void *vn
, void *vm
,
425 void *va
, uint32_t desc
)
427 intptr_t i
, opr_sz
= simd_oprsz(desc
);
428 int64_t *d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
430 for (i
= 0; i
< opr_sz
/ 8; ++i
) {
431 d
[i
] = do_sqrdmlah_d(n
[i
], m
[i
], a
[i
], true, true);
435 void HELPER(sve2_sqdmulh_d
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
437 intptr_t i
, opr_sz
= simd_oprsz(desc
);
438 int64_t *d
= vd
, *n
= vn
, *m
= vm
;
440 for (i
= 0; i
< opr_sz
/ 8; ++i
) {
441 d
[i
] = do_sqrdmlah_d(n
[i
], m
[i
], 0, false, false);
445 void HELPER(sve2_sqrdmulh_d
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
447 intptr_t i
, opr_sz
= simd_oprsz(desc
);
448 int64_t *d
= vd
, *n
= vn
, *m
= vm
;
450 for (i
= 0; i
< opr_sz
/ 8; ++i
) {
451 d
[i
] = do_sqrdmlah_d(n
[i
], m
[i
], 0, false, true);
455 /* Integer 8 and 16-bit dot-product.
457 * Note that for the loops herein, host endianness does not matter
458 * with respect to the ordering of data within the 64-bit lanes.
459 * All elements are treated equally, no matter where they are.
462 void HELPER(gvec_sdot_b
)(void *vd
, void *vn
, void *vm
, void *va
, uint32_t desc
)
464 intptr_t i
, opr_sz
= simd_oprsz(desc
);
465 int32_t *d
= vd
, *a
= va
;
466 int8_t *n
= vn
, *m
= vm
;
468 for (i
= 0; i
< opr_sz
/ 4; ++i
) {
470 n
[i
* 4 + 0] * m
[i
* 4 + 0] +
471 n
[i
* 4 + 1] * m
[i
* 4 + 1] +
472 n
[i
* 4 + 2] * m
[i
* 4 + 2] +
473 n
[i
* 4 + 3] * m
[i
* 4 + 3]);
475 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
478 void HELPER(gvec_udot_b
)(void *vd
, void *vn
, void *vm
, void *va
, uint32_t desc
)
480 intptr_t i
, opr_sz
= simd_oprsz(desc
);
481 uint32_t *d
= vd
, *a
= va
;
482 uint8_t *n
= vn
, *m
= vm
;
484 for (i
= 0; i
< opr_sz
/ 4; ++i
) {
486 n
[i
* 4 + 0] * m
[i
* 4 + 0] +
487 n
[i
* 4 + 1] * m
[i
* 4 + 1] +
488 n
[i
* 4 + 2] * m
[i
* 4 + 2] +
489 n
[i
* 4 + 3] * m
[i
* 4 + 3]);
491 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
494 void HELPER(gvec_sdot_h
)(void *vd
, void *vn
, void *vm
, void *va
, uint32_t desc
)
496 intptr_t i
, opr_sz
= simd_oprsz(desc
);
497 int64_t *d
= vd
, *a
= va
;
498 int16_t *n
= vn
, *m
= vm
;
500 for (i
= 0; i
< opr_sz
/ 8; ++i
) {
502 (int64_t)n
[i
* 4 + 0] * m
[i
* 4 + 0] +
503 (int64_t)n
[i
* 4 + 1] * m
[i
* 4 + 1] +
504 (int64_t)n
[i
* 4 + 2] * m
[i
* 4 + 2] +
505 (int64_t)n
[i
* 4 + 3] * m
[i
* 4 + 3]);
507 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
510 void HELPER(gvec_udot_h
)(void *vd
, void *vn
, void *vm
, void *va
, uint32_t desc
)
512 intptr_t i
, opr_sz
= simd_oprsz(desc
);
513 uint64_t *d
= vd
, *a
= va
;
514 uint16_t *n
= vn
, *m
= vm
;
516 for (i
= 0; i
< opr_sz
/ 8; ++i
) {
518 (uint64_t)n
[i
* 4 + 0] * m
[i
* 4 + 0] +
519 (uint64_t)n
[i
* 4 + 1] * m
[i
* 4 + 1] +
520 (uint64_t)n
[i
* 4 + 2] * m
[i
* 4 + 2] +
521 (uint64_t)n
[i
* 4 + 3] * m
[i
* 4 + 3]);
523 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
526 void HELPER(gvec_sdot_idx_b
)(void *vd
, void *vn
, void *vm
,
527 void *va
, uint32_t desc
)
529 intptr_t i
, segend
, opr_sz
= simd_oprsz(desc
), opr_sz_4
= opr_sz
/ 4;
530 intptr_t index
= simd_data(desc
);
531 int32_t *d
= vd
, *a
= va
;
533 int8_t *m_indexed
= (int8_t *)vm
+ H4(index
) * 4;
535 /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd.
536 * Otherwise opr_sz is a multiple of 16.
538 segend
= MIN(4, opr_sz_4
);
541 int8_t m0
= m_indexed
[i
* 4 + 0];
542 int8_t m1
= m_indexed
[i
* 4 + 1];
543 int8_t m2
= m_indexed
[i
* 4 + 2];
544 int8_t m3
= m_indexed
[i
* 4 + 3];
552 } while (++i
< segend
);
554 } while (i
< opr_sz_4
);
556 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
559 void HELPER(gvec_udot_idx_b
)(void *vd
, void *vn
, void *vm
,
560 void *va
, uint32_t desc
)
562 intptr_t i
, segend
, opr_sz
= simd_oprsz(desc
), opr_sz_4
= opr_sz
/ 4;
563 intptr_t index
= simd_data(desc
);
564 uint32_t *d
= vd
, *a
= va
;
566 uint8_t *m_indexed
= (uint8_t *)vm
+ H4(index
) * 4;
568 /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd.
569 * Otherwise opr_sz is a multiple of 16.
571 segend
= MIN(4, opr_sz_4
);
574 uint8_t m0
= m_indexed
[i
* 4 + 0];
575 uint8_t m1
= m_indexed
[i
* 4 + 1];
576 uint8_t m2
= m_indexed
[i
* 4 + 2];
577 uint8_t m3
= m_indexed
[i
* 4 + 3];
585 } while (++i
< segend
);
587 } while (i
< opr_sz_4
);
589 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
592 void HELPER(gvec_sdot_idx_h
)(void *vd
, void *vn
, void *vm
,
593 void *va
, uint32_t desc
)
595 intptr_t i
, opr_sz
= simd_oprsz(desc
), opr_sz_8
= opr_sz
/ 8;
596 intptr_t index
= simd_data(desc
);
597 int64_t *d
= vd
, *a
= va
;
599 int16_t *m_indexed
= (int16_t *)vm
+ index
* 4;
601 /* This is supported by SVE only, so opr_sz is always a multiple of 16.
602 * Process the entire segment all at once, writing back the results
603 * only after we've consumed all of the inputs.
605 for (i
= 0; i
< opr_sz_8
; i
+= 2) {
609 d0
+= n
[i
* 4 + 0] * (int64_t)m_indexed
[i
* 4 + 0];
610 d0
+= n
[i
* 4 + 1] * (int64_t)m_indexed
[i
* 4 + 1];
611 d0
+= n
[i
* 4 + 2] * (int64_t)m_indexed
[i
* 4 + 2];
612 d0
+= n
[i
* 4 + 3] * (int64_t)m_indexed
[i
* 4 + 3];
615 d1
+= n
[i
* 4 + 4] * (int64_t)m_indexed
[i
* 4 + 0];
616 d1
+= n
[i
* 4 + 5] * (int64_t)m_indexed
[i
* 4 + 1];
617 d1
+= n
[i
* 4 + 6] * (int64_t)m_indexed
[i
* 4 + 2];
618 d1
+= n
[i
* 4 + 7] * (int64_t)m_indexed
[i
* 4 + 3];
623 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
626 void HELPER(gvec_udot_idx_h
)(void *vd
, void *vn
, void *vm
,
627 void *va
, uint32_t desc
)
629 intptr_t i
, opr_sz
= simd_oprsz(desc
), opr_sz_8
= opr_sz
/ 8;
630 intptr_t index
= simd_data(desc
);
631 uint64_t *d
= vd
, *a
= va
;
633 uint16_t *m_indexed
= (uint16_t *)vm
+ index
* 4;
635 /* This is supported by SVE only, so opr_sz is always a multiple of 16.
636 * Process the entire segment all at once, writing back the results
637 * only after we've consumed all of the inputs.
639 for (i
= 0; i
< opr_sz_8
; i
+= 2) {
643 d0
+= n
[i
* 4 + 0] * (uint64_t)m_indexed
[i
* 4 + 0];
644 d0
+= n
[i
* 4 + 1] * (uint64_t)m_indexed
[i
* 4 + 1];
645 d0
+= n
[i
* 4 + 2] * (uint64_t)m_indexed
[i
* 4 + 2];
646 d0
+= n
[i
* 4 + 3] * (uint64_t)m_indexed
[i
* 4 + 3];
649 d1
+= n
[i
* 4 + 4] * (uint64_t)m_indexed
[i
* 4 + 0];
650 d1
+= n
[i
* 4 + 5] * (uint64_t)m_indexed
[i
* 4 + 1];
651 d1
+= n
[i
* 4 + 6] * (uint64_t)m_indexed
[i
* 4 + 2];
652 d1
+= n
[i
* 4 + 7] * (uint64_t)m_indexed
[i
* 4 + 3];
657 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
660 void HELPER(gvec_fcaddh
)(void *vd
, void *vn
, void *vm
,
661 void *vfpst
, uint32_t desc
)
663 uintptr_t opr_sz
= simd_oprsz(desc
);
667 float_status
*fpst
= vfpst
;
668 uint32_t neg_real
= extract32(desc
, SIMD_DATA_SHIFT
, 1);
669 uint32_t neg_imag
= neg_real
^ 1;
672 /* Shift boolean to the sign bit so we can xor to negate. */
676 for (i
= 0; i
< opr_sz
/ 2; i
+= 2) {
677 float16 e0
= n
[H2(i
)];
678 float16 e1
= m
[H2(i
+ 1)] ^ neg_imag
;
679 float16 e2
= n
[H2(i
+ 1)];
680 float16 e3
= m
[H2(i
)] ^ neg_real
;
682 d
[H2(i
)] = float16_add(e0
, e1
, fpst
);
683 d
[H2(i
+ 1)] = float16_add(e2
, e3
, fpst
);
685 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
688 void HELPER(gvec_fcadds
)(void *vd
, void *vn
, void *vm
,
689 void *vfpst
, uint32_t desc
)
691 uintptr_t opr_sz
= simd_oprsz(desc
);
695 float_status
*fpst
= vfpst
;
696 uint32_t neg_real
= extract32(desc
, SIMD_DATA_SHIFT
, 1);
697 uint32_t neg_imag
= neg_real
^ 1;
700 /* Shift boolean to the sign bit so we can xor to negate. */
704 for (i
= 0; i
< opr_sz
/ 4; i
+= 2) {
705 float32 e0
= n
[H4(i
)];
706 float32 e1
= m
[H4(i
+ 1)] ^ neg_imag
;
707 float32 e2
= n
[H4(i
+ 1)];
708 float32 e3
= m
[H4(i
)] ^ neg_real
;
710 d
[H4(i
)] = float32_add(e0
, e1
, fpst
);
711 d
[H4(i
+ 1)] = float32_add(e2
, e3
, fpst
);
713 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
716 void HELPER(gvec_fcaddd
)(void *vd
, void *vn
, void *vm
,
717 void *vfpst
, uint32_t desc
)
719 uintptr_t opr_sz
= simd_oprsz(desc
);
723 float_status
*fpst
= vfpst
;
724 uint64_t neg_real
= extract64(desc
, SIMD_DATA_SHIFT
, 1);
725 uint64_t neg_imag
= neg_real
^ 1;
728 /* Shift boolean to the sign bit so we can xor to negate. */
732 for (i
= 0; i
< opr_sz
/ 8; i
+= 2) {
734 float64 e1
= m
[i
+ 1] ^ neg_imag
;
735 float64 e2
= n
[i
+ 1];
736 float64 e3
= m
[i
] ^ neg_real
;
738 d
[i
] = float64_add(e0
, e1
, fpst
);
739 d
[i
+ 1] = float64_add(e2
, e3
, fpst
);
741 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
744 void HELPER(gvec_fcmlah
)(void *vd
, void *vn
, void *vm
, void *va
,
745 void *vfpst
, uint32_t desc
)
747 uintptr_t opr_sz
= simd_oprsz(desc
);
748 float16
*d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
749 float_status
*fpst
= vfpst
;
750 intptr_t flip
= extract32(desc
, SIMD_DATA_SHIFT
, 1);
751 uint32_t neg_imag
= extract32(desc
, SIMD_DATA_SHIFT
+ 1, 1);
752 uint32_t neg_real
= flip
^ neg_imag
;
755 /* Shift boolean to the sign bit so we can xor to negate. */
759 for (i
= 0; i
< opr_sz
/ 2; i
+= 2) {
760 float16 e2
= n
[H2(i
+ flip
)];
761 float16 e1
= m
[H2(i
+ flip
)] ^ neg_real
;
763 float16 e3
= m
[H2(i
+ 1 - flip
)] ^ neg_imag
;
765 d
[H2(i
)] = float16_muladd(e2
, e1
, a
[H2(i
)], 0, fpst
);
766 d
[H2(i
+ 1)] = float16_muladd(e4
, e3
, a
[H2(i
+ 1)], 0, fpst
);
768 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
771 void HELPER(gvec_fcmlah_idx
)(void *vd
, void *vn
, void *vm
, void *va
,
772 void *vfpst
, uint32_t desc
)
774 uintptr_t opr_sz
= simd_oprsz(desc
);
775 float16
*d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
776 float_status
*fpst
= vfpst
;
777 intptr_t flip
= extract32(desc
, SIMD_DATA_SHIFT
, 1);
778 uint32_t neg_imag
= extract32(desc
, SIMD_DATA_SHIFT
+ 1, 1);
779 intptr_t index
= extract32(desc
, SIMD_DATA_SHIFT
+ 2, 2);
780 uint32_t neg_real
= flip
^ neg_imag
;
781 intptr_t elements
= opr_sz
/ sizeof(float16
);
782 intptr_t eltspersegment
= 16 / sizeof(float16
);
785 /* Shift boolean to the sign bit so we can xor to negate. */
789 for (i
= 0; i
< elements
; i
+= eltspersegment
) {
790 float16 mr
= m
[H2(i
+ 2 * index
+ 0)];
791 float16 mi
= m
[H2(i
+ 2 * index
+ 1)];
792 float16 e1
= neg_real
^ (flip
? mi
: mr
);
793 float16 e3
= neg_imag
^ (flip
? mr
: mi
);
795 for (j
= i
; j
< i
+ eltspersegment
; j
+= 2) {
796 float16 e2
= n
[H2(j
+ flip
)];
799 d
[H2(j
)] = float16_muladd(e2
, e1
, a
[H2(j
)], 0, fpst
);
800 d
[H2(j
+ 1)] = float16_muladd(e4
, e3
, a
[H2(j
+ 1)], 0, fpst
);
803 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
806 void HELPER(gvec_fcmlas
)(void *vd
, void *vn
, void *vm
, void *va
,
807 void *vfpst
, uint32_t desc
)
809 uintptr_t opr_sz
= simd_oprsz(desc
);
810 float32
*d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
811 float_status
*fpst
= vfpst
;
812 intptr_t flip
= extract32(desc
, SIMD_DATA_SHIFT
, 1);
813 uint32_t neg_imag
= extract32(desc
, SIMD_DATA_SHIFT
+ 1, 1);
814 uint32_t neg_real
= flip
^ neg_imag
;
817 /* Shift boolean to the sign bit so we can xor to negate. */
821 for (i
= 0; i
< opr_sz
/ 4; i
+= 2) {
822 float32 e2
= n
[H4(i
+ flip
)];
823 float32 e1
= m
[H4(i
+ flip
)] ^ neg_real
;
825 float32 e3
= m
[H4(i
+ 1 - flip
)] ^ neg_imag
;
827 d
[H4(i
)] = float32_muladd(e2
, e1
, a
[H4(i
)], 0, fpst
);
828 d
[H4(i
+ 1)] = float32_muladd(e4
, e3
, a
[H4(i
+ 1)], 0, fpst
);
830 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
833 void HELPER(gvec_fcmlas_idx
)(void *vd
, void *vn
, void *vm
, void *va
,
834 void *vfpst
, uint32_t desc
)
836 uintptr_t opr_sz
= simd_oprsz(desc
);
837 float32
*d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
838 float_status
*fpst
= vfpst
;
839 intptr_t flip
= extract32(desc
, SIMD_DATA_SHIFT
, 1);
840 uint32_t neg_imag
= extract32(desc
, SIMD_DATA_SHIFT
+ 1, 1);
841 intptr_t index
= extract32(desc
, SIMD_DATA_SHIFT
+ 2, 2);
842 uint32_t neg_real
= flip
^ neg_imag
;
843 intptr_t elements
= opr_sz
/ sizeof(float32
);
844 intptr_t eltspersegment
= 16 / sizeof(float32
);
847 /* Shift boolean to the sign bit so we can xor to negate. */
851 for (i
= 0; i
< elements
; i
+= eltspersegment
) {
852 float32 mr
= m
[H4(i
+ 2 * index
+ 0)];
853 float32 mi
= m
[H4(i
+ 2 * index
+ 1)];
854 float32 e1
= neg_real
^ (flip
? mi
: mr
);
855 float32 e3
= neg_imag
^ (flip
? mr
: mi
);
857 for (j
= i
; j
< i
+ eltspersegment
; j
+= 2) {
858 float32 e2
= n
[H4(j
+ flip
)];
861 d
[H4(j
)] = float32_muladd(e2
, e1
, a
[H4(j
)], 0, fpst
);
862 d
[H4(j
+ 1)] = float32_muladd(e4
, e3
, a
[H4(j
+ 1)], 0, fpst
);
865 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
868 void HELPER(gvec_fcmlad
)(void *vd
, void *vn
, void *vm
, void *va
,
869 void *vfpst
, uint32_t desc
)
871 uintptr_t opr_sz
= simd_oprsz(desc
);
872 float64
*d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
873 float_status
*fpst
= vfpst
;
874 intptr_t flip
= extract32(desc
, SIMD_DATA_SHIFT
, 1);
875 uint64_t neg_imag
= extract32(desc
, SIMD_DATA_SHIFT
+ 1, 1);
876 uint64_t neg_real
= flip
^ neg_imag
;
879 /* Shift boolean to the sign bit so we can xor to negate. */
883 for (i
= 0; i
< opr_sz
/ 8; i
+= 2) {
884 float64 e2
= n
[i
+ flip
];
885 float64 e1
= m
[i
+ flip
] ^ neg_real
;
887 float64 e3
= m
[i
+ 1 - flip
] ^ neg_imag
;
889 d
[i
] = float64_muladd(e2
, e1
, a
[i
], 0, fpst
);
890 d
[i
+ 1] = float64_muladd(e4
, e3
, a
[i
+ 1], 0, fpst
);
892 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
896 * Floating point comparisons producing an integer result (all 1s or all 0s).
897 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
898 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
900 static uint16_t float16_ceq(float16 op1
, float16 op2
, float_status
*stat
)
902 return -float16_eq_quiet(op1
, op2
, stat
);
905 static uint32_t float32_ceq(float32 op1
, float32 op2
, float_status
*stat
)
907 return -float32_eq_quiet(op1
, op2
, stat
);
910 static uint16_t float16_cge(float16 op1
, float16 op2
, float_status
*stat
)
912 return -float16_le(op2
, op1
, stat
);
915 static uint32_t float32_cge(float32 op1
, float32 op2
, float_status
*stat
)
917 return -float32_le(op2
, op1
, stat
);
920 static uint16_t float16_cgt(float16 op1
, float16 op2
, float_status
*stat
)
922 return -float16_lt(op2
, op1
, stat
);
925 static uint32_t float32_cgt(float32 op1
, float32 op2
, float_status
*stat
)
927 return -float32_lt(op2
, op1
, stat
);
930 static uint16_t float16_acge(float16 op1
, float16 op2
, float_status
*stat
)
932 return -float16_le(float16_abs(op2
), float16_abs(op1
), stat
);
935 static uint32_t float32_acge(float32 op1
, float32 op2
, float_status
*stat
)
937 return -float32_le(float32_abs(op2
), float32_abs(op1
), stat
);
940 static uint16_t float16_acgt(float16 op1
, float16 op2
, float_status
*stat
)
942 return -float16_lt(float16_abs(op2
), float16_abs(op1
), stat
);
945 static uint32_t float32_acgt(float32 op1
, float32 op2
, float_status
*stat
)
947 return -float32_lt(float32_abs(op2
), float32_abs(op1
), stat
);
950 static int16_t vfp_tosszh(float16 x
, void *fpstp
)
952 float_status
*fpst
= fpstp
;
953 if (float16_is_any_nan(x
)) {
954 float_raise(float_flag_invalid
, fpst
);
957 return float16_to_int16_round_to_zero(x
, fpst
);
960 static uint16_t vfp_touszh(float16 x
, void *fpstp
)
962 float_status
*fpst
= fpstp
;
963 if (float16_is_any_nan(x
)) {
964 float_raise(float_flag_invalid
, fpst
);
967 return float16_to_uint16_round_to_zero(x
, fpst
);
/* Expand an elementwise unary helper over a gvec operand. */
#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}
981 DO_2OP(gvec_frecpe_h
, helper_recpe_f16
, float16
)
982 DO_2OP(gvec_frecpe_s
, helper_recpe_f32
, float32
)
983 DO_2OP(gvec_frecpe_d
, helper_recpe_f64
, float64
)
985 DO_2OP(gvec_frsqrte_h
, helper_rsqrte_f16
, float16
)
986 DO_2OP(gvec_frsqrte_s
, helper_rsqrte_f32
, float32
)
987 DO_2OP(gvec_frsqrte_d
, helper_rsqrte_f64
, float64
)
989 DO_2OP(gvec_vrintx_h
, float16_round_to_int
, float16
)
990 DO_2OP(gvec_vrintx_s
, float32_round_to_int
, float32
)
992 DO_2OP(gvec_sitos
, helper_vfp_sitos
, int32_t)
993 DO_2OP(gvec_uitos
, helper_vfp_uitos
, uint32_t)
994 DO_2OP(gvec_tosizs
, helper_vfp_tosizs
, float32
)
995 DO_2OP(gvec_touizs
, helper_vfp_touizs
, float32
)
996 DO_2OP(gvec_sstoh
, int16_to_float16
, int16_t)
997 DO_2OP(gvec_ustoh
, uint16_to_float16
, uint16_t)
998 DO_2OP(gvec_tosszh
, vfp_tosszh
, float16
)
999 DO_2OP(gvec_touszh
, vfp_touszh
, float16
)
1001 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
1002 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
1004 return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
1007 #define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
1008 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
1010 return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
1013 #define DO_2OP_CMP0(FN, CMPOP, DIRN) \
1014 WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
1015 WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
1016 DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
1017 DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
1019 DO_2OP_CMP0(cgt
, cgt
, FWD
)
1020 DO_2OP_CMP0(cge
, cge
, FWD
)
1021 DO_2OP_CMP0(ceq
, ceq
, FWD
)
1022 DO_2OP_CMP0(clt
, cgt
, REV
)
1023 DO_2OP_CMP0(cle
, cge
, REV
)
1028 /* Floating-point trigonometric starting value.
1029 * See the ARM ARM pseudocode function FPTrigSMul.
1031 static float16
float16_ftsmul(float16 op1
, uint16_t op2
, float_status
*stat
)
1033 float16 result
= float16_mul(op1
, op1
, stat
);
1034 if (!float16_is_any_nan(result
)) {
1035 result
= float16_set_sign(result
, op2
& 1);
1040 static float32
float32_ftsmul(float32 op1
, uint32_t op2
, float_status
*stat
)
1042 float32 result
= float32_mul(op1
, op1
, stat
);
1043 if (!float32_is_any_nan(result
)) {
1044 result
= float32_set_sign(result
, op2
& 1);
1049 static float64
float64_ftsmul(float64 op1
, uint64_t op2
, float_status
*stat
)
1051 float64 result
= float64_mul(op1
, op1
, stat
);
1052 if (!float64_is_any_nan(result
)) {
1053 result
= float64_set_sign(result
, op2
& 1);
1058 static float16
float16_abd(float16 op1
, float16 op2
, float_status
*stat
)
1060 return float16_abs(float16_sub(op1
, op2
, stat
));
1063 static float32
float32_abd(float32 op1
, float32 op2
, float_status
*stat
)
1065 return float32_abs(float32_sub(op1
, op2
, stat
));
1069 * Reciprocal step. These are the AArch32 version which uses a
1070 * non-fused multiply-and-subtract.
1072 static float16
float16_recps_nf(float16 op1
, float16 op2
, float_status
*stat
)
1074 op1
= float16_squash_input_denormal(op1
, stat
);
1075 op2
= float16_squash_input_denormal(op2
, stat
);
1077 if ((float16_is_infinity(op1
) && float16_is_zero(op2
)) ||
1078 (float16_is_infinity(op2
) && float16_is_zero(op1
))) {
1081 return float16_sub(float16_two
, float16_mul(op1
, op2
, stat
), stat
);
1084 static float32
float32_recps_nf(float32 op1
, float32 op2
, float_status
*stat
)
1086 op1
= float32_squash_input_denormal(op1
, stat
);
1087 op2
= float32_squash_input_denormal(op2
, stat
);
1089 if ((float32_is_infinity(op1
) && float32_is_zero(op2
)) ||
1090 (float32_is_infinity(op2
) && float32_is_zero(op1
))) {
1093 return float32_sub(float32_two
, float32_mul(op1
, op2
, stat
), stat
);
1096 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1097 static float16
float16_rsqrts_nf(float16 op1
, float16 op2
, float_status
*stat
)
1099 op1
= float16_squash_input_denormal(op1
, stat
);
1100 op2
= float16_squash_input_denormal(op2
, stat
);
1102 if ((float16_is_infinity(op1
) && float16_is_zero(op2
)) ||
1103 (float16_is_infinity(op2
) && float16_is_zero(op1
))) {
1104 return float16_one_point_five
;
1106 op1
= float16_sub(float16_three
, float16_mul(op1
, op2
, stat
), stat
);
1107 return float16_div(op1
, float16_two
, stat
);
1110 static float32
float32_rsqrts_nf(float32 op1
, float32 op2
, float_status
*stat
)
1112 op1
= float32_squash_input_denormal(op1
, stat
);
1113 op2
= float32_squash_input_denormal(op2
, stat
);
1115 if ((float32_is_infinity(op1
) && float32_is_zero(op2
)) ||
1116 (float32_is_infinity(op2
) && float32_is_zero(op1
))) {
1117 return float32_one_point_five
;
1119 op1
= float32_sub(float32_three
, float32_mul(op1
, op2
, stat
), stat
);
1120 return float32_div(op1
, float32_two
, stat
);
/* Expand an elementwise binary helper over two gvec operands. */
#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}
1134 DO_3OP(gvec_fadd_h
, float16_add
, float16
)
1135 DO_3OP(gvec_fadd_s
, float32_add
, float32
)
1136 DO_3OP(gvec_fadd_d
, float64_add
, float64
)
1138 DO_3OP(gvec_fsub_h
, float16_sub
, float16
)
1139 DO_3OP(gvec_fsub_s
, float32_sub
, float32
)
1140 DO_3OP(gvec_fsub_d
, float64_sub
, float64
)
1142 DO_3OP(gvec_fmul_h
, float16_mul
, float16
)
1143 DO_3OP(gvec_fmul_s
, float32_mul
, float32
)
1144 DO_3OP(gvec_fmul_d
, float64_mul
, float64
)
1146 DO_3OP(gvec_ftsmul_h
, float16_ftsmul
, float16
)
1147 DO_3OP(gvec_ftsmul_s
, float32_ftsmul
, float32
)
1148 DO_3OP(gvec_ftsmul_d
, float64_ftsmul
, float64
)
1150 DO_3OP(gvec_fabd_h
, float16_abd
, float16
)
1151 DO_3OP(gvec_fabd_s
, float32_abd
, float32
)
1153 DO_3OP(gvec_fceq_h
, float16_ceq
, float16
)
1154 DO_3OP(gvec_fceq_s
, float32_ceq
, float32
)
1156 DO_3OP(gvec_fcge_h
, float16_cge
, float16
)
1157 DO_3OP(gvec_fcge_s
, float32_cge
, float32
)
1159 DO_3OP(gvec_fcgt_h
, float16_cgt
, float16
)
1160 DO_3OP(gvec_fcgt_s
, float32_cgt
, float32
)
1162 DO_3OP(gvec_facge_h
, float16_acge
, float16
)
1163 DO_3OP(gvec_facge_s
, float32_acge
, float32
)
1165 DO_3OP(gvec_facgt_h
, float16_acgt
, float16
)
1166 DO_3OP(gvec_facgt_s
, float32_acgt
, float32
)
1168 DO_3OP(gvec_fmax_h
, float16_max
, float16
)
1169 DO_3OP(gvec_fmax_s
, float32_max
, float32
)
1171 DO_3OP(gvec_fmin_h
, float16_min
, float16
)
1172 DO_3OP(gvec_fmin_s
, float32_min
, float32
)
1174 DO_3OP(gvec_fmaxnum_h
, float16_maxnum
, float16
)
1175 DO_3OP(gvec_fmaxnum_s
, float32_maxnum
, float32
)
1177 DO_3OP(gvec_fminnum_h
, float16_minnum
, float16
)
1178 DO_3OP(gvec_fminnum_s
, float32_minnum
, float32
)
1180 DO_3OP(gvec_recps_nf_h
, float16_recps_nf
, float16
)
1181 DO_3OP(gvec_recps_nf_s
, float32_recps_nf
, float32
)
1183 DO_3OP(gvec_rsqrts_nf_h
, float16_rsqrts_nf
, float16
)
1184 DO_3OP(gvec_rsqrts_nf_s
, float32_rsqrts_nf
, float32
)
#ifdef TARGET_AARCH64
/* AArch64-only recip/rsqrt step variants, via the helper_* entry points. */
DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
1199 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1200 static float16
float16_muladd_nf(float16 dest
, float16 op1
, float16 op2
,
1203 return float16_add(dest
, float16_mul(op1
, op2
, stat
), stat
);
1206 static float32
float32_muladd_nf(float32 dest
, float32 op1
, float32 op2
,
1209 return float32_add(dest
, float32_mul(op1
, op2
, stat
), stat
);
1212 static float16
float16_mulsub_nf(float16 dest
, float16 op1
, float16 op2
,
1215 return float16_sub(dest
, float16_mul(op1
, op2
, stat
), stat
);
1218 static float32
float32_mulsub_nf(float32 dest
, float32 op1
, float32 op2
,
1221 return float32_sub(dest
, float32_mul(op1
, op2
, stat
), stat
);
1224 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1225 static float16
float16_muladd_f(float16 dest
, float16 op1
, float16 op2
,
1228 return float16_muladd(op1
, op2
, dest
, 0, stat
);
1231 static float32
float32_muladd_f(float32 dest
, float32 op1
, float32 op2
,
1234 return float32_muladd(op1
, op2
, dest
, 0, stat
);
1237 static float16
float16_mulsub_f(float16 dest
, float16 op1
, float16 op2
,
1240 return float16_muladd(float16_chs(op1
), op2
, dest
, 0, stat
);
1243 static float32
float32_mulsub_f(float32 dest
, float32 op1
, float32 op2
,
1246 return float32_muladd(float32_chs(op1
), op2
, dest
, 0, stat
);
/*
 * Expand a vector multiply-accumulate helper:
 * d[e] = FUNC(d[e], n[e], m[e]) with the float_status passed through.
 */
#define DO_MULADD(NAME, FUNC, TYPE)                                        \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t e, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (e = 0; e < oprsz / sizeof(TYPE); e++) {                           \
        d[e] = FUNC(d[e], n[e], m[e], stat);                               \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}
1260 DO_MULADD(gvec_fmla_h
, float16_muladd_nf
, float16
)
1261 DO_MULADD(gvec_fmla_s
, float32_muladd_nf
, float32
)
1263 DO_MULADD(gvec_fmls_h
, float16_mulsub_nf
, float16
)
1264 DO_MULADD(gvec_fmls_s
, float32_mulsub_nf
, float32
)
1266 DO_MULADD(gvec_vfma_h
, float16_muladd_f
, float16
)
1267 DO_MULADD(gvec_vfma_s
, float32_muladd_f
, float32
)
1269 DO_MULADD(gvec_vfms_h
, float16_mulsub_f
, float16
)
1270 DO_MULADD(gvec_vfms_s
, float32_mulsub_f
, float32
)
/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */

#define DO_MUL_IDX(NAME, TYPE, H)                                          \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)             \
{                                                                          \
    intptr_t e, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (e = 0; e < oprsz / sizeof(TYPE); e += segment) {                  \
        TYPE mm = m[H(e + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[e + j] = n[e + j] * mm;                                      \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}
1292 DO_MUL_IDX(gvec_mul_idx_h
, uint16_t, H2
)
1293 DO_MUL_IDX(gvec_mul_idx_s
, uint32_t, H4
)
1294 DO_MUL_IDX(gvec_mul_idx_d
, uint64_t, )
/*
 * Indexed integer multiply-accumulate: d[e] = a[e] OP n[e] * m[idx],
 * with the index applied per 128-bit segment as for DO_MUL_IDX.
 */
#define DO_MLA_IDX(NAME, TYPE, OP, H)                                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)    \
{                                                                           \
    intptr_t e, j, oprsz = simd_oprsz(desc);                                \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                       \
    intptr_t idx = simd_data(desc);                                         \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                                \
    for (e = 0; e < oprsz / sizeof(TYPE); e += segment) {                   \
        TYPE mm = m[H(e + idx)];                                            \
        for (j = 0; j < segment; j++) {                                     \
            d[e + j] = a[e + j] OP n[e + j] * mm;                           \
        }                                                                   \
    }                                                                       \
    clear_tail(d, oprsz, simd_maxsz(desc));                                 \
}
1314 DO_MLA_IDX(gvec_mla_idx_h
, uint16_t, +, H2
)
1315 DO_MLA_IDX(gvec_mla_idx_s
, uint32_t, +, H4
)
1316 DO_MLA_IDX(gvec_mla_idx_d
, uint64_t, +, )
1318 DO_MLA_IDX(gvec_mls_idx_h
, uint16_t, -, H2
)
1319 DO_MLA_IDX(gvec_mls_idx_s
, uint32_t, -, H4
)
1320 DO_MLA_IDX(gvec_mls_idx_d
, uint64_t, -, )
/*
 * Indexed FP multiply with optional accumulation:
 * d[e] = TYPE##_ADD(d[e], n[e] * m[idx]); ADD may be add, sub, or nop.
 */
#define DO_FMUL_IDX(NAME, ADD, TYPE, H)                                    \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t e, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (e = 0; e < oprsz / sizeof(TYPE); e += segment) {                  \
        TYPE mm = m[H(e + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[e + j] = TYPE##_##ADD(d[e + j],                              \
                                    TYPE##_mul(n[e + j], mm, stat), stat); \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}
1341 #define float16_nop(N, M, S) (M)
1342 #define float32_nop(N, M, S) (M)
1343 #define float64_nop(N, M, S) (M)
1345 DO_FMUL_IDX(gvec_fmul_idx_h
, nop
, float16
, H2
)
1346 DO_FMUL_IDX(gvec_fmul_idx_s
, nop
, float32
, H4
)
1347 DO_FMUL_IDX(gvec_fmul_idx_d
, nop
, float64
, )
1350 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1351 * the fused ops below they assume accumulate both from and into Vd.
1353 DO_FMUL_IDX(gvec_fmla_nf_idx_h
, add
, float16
, H2
)
1354 DO_FMUL_IDX(gvec_fmla_nf_idx_s
, add
, float32
, H4
)
1355 DO_FMUL_IDX(gvec_fmls_nf_idx_h
, sub
, float16
, H2
)
1356 DO_FMUL_IDX(gvec_fmls_nf_idx_s
, sub
, float32
, H4
)
/*
 * Indexed fused multiply-accumulate: d[e] = fma(+/-n[e], m[idx], a[e]).
 * Bit 0 of simd_data selects negation of op1 (applied by XOR-ing the
 * sign bit), the remaining bits are the element index.
 */
#define DO_FMLA_IDX(NAME, TYPE, H)                                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
                  void *stat, uint32_t desc)                               \
{                                                                          \
    intptr_t e, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
    op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
    for (e = 0; e < oprsz / sizeof(TYPE); e += segment) {                  \
        TYPE mm = m[H(e + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[e + j] = TYPE##_muladd(n[e + j] ^ op1_neg,                   \
                                     mm, a[e + j], 0, stat);               \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}
1383 DO_FMLA_IDX(gvec_fmla_idx_h
, float16
, H2
)
1384 DO_FMLA_IDX(gvec_fmla_idx_s
, float32
, H4
)
1385 DO_FMLA_IDX(gvec_fmla_idx_d
, float64
, )
/*
 * Saturating integer add/sub helper: each element is widened to WTYPE,
 * combined with OP, and -- per the visible "else if (dd > MAX)" arm --
 * clamped against MIN/MAX; *vq is the QC (cumulative saturation) word.
 * NOTE(review): the interior of this macro (the clamp bodies, the
 * per-element store and the QC update) is missing from this extract;
 * restore it from the upstream source before this file can compile.
 */
1389 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1390 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \
1392 intptr_t i, oprsz = simd_oprsz(desc); \
1393 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \
1395 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \
1396 WTYPE dd = (WTYPE)n[i] OP m[i]; \
1400 } else if (dd > MAX) { \
1407 uint32_t *qc = vq; \
1410 clear_tail(d, oprsz, simd_maxsz(desc)); \
1413 DO_SAT(gvec_uqadd_b
, int, uint8_t, uint8_t, +, 0, UINT8_MAX
)
1414 DO_SAT(gvec_uqadd_h
, int, uint16_t, uint16_t, +, 0, UINT16_MAX
)
1415 DO_SAT(gvec_uqadd_s
, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX
)
1417 DO_SAT(gvec_sqadd_b
, int, int8_t, int8_t, +, INT8_MIN
, INT8_MAX
)
1418 DO_SAT(gvec_sqadd_h
, int, int16_t, int16_t, +, INT16_MIN
, INT16_MAX
)
1419 DO_SAT(gvec_sqadd_s
, int64_t, int32_t, int32_t, +, INT32_MIN
, INT32_MAX
)
1421 DO_SAT(gvec_uqsub_b
, int, uint8_t, uint8_t, -, 0, UINT8_MAX
)
1422 DO_SAT(gvec_uqsub_h
, int, uint16_t, uint16_t, -, 0, UINT16_MAX
)
1423 DO_SAT(gvec_uqsub_s
, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX
)
1425 DO_SAT(gvec_sqsub_b
, int, int8_t, int8_t, -, INT8_MIN
, INT8_MAX
)
1426 DO_SAT(gvec_sqsub_h
, int, int16_t, int16_t, -, INT16_MIN
, INT16_MAX
)
1427 DO_SAT(gvec_sqsub_s
, int64_t, int32_t, int32_t, -, INT32_MIN
, INT32_MAX
)
/*
 * 64-bit unsigned saturating add: per element, dd = nn + mm; *vq is
 * the QC flag vector (cf. the DO_SAT-generated narrower variants).
 * NOTE(review): the saturation test/store and QC-update lines
 * (orig. 1440-1449) are missing from this extract.
 */
1431 void HELPER(gvec_uqadd_d
)(void *vd
, void *vq
, void *vn
,
1432 void *vm
, uint32_t desc
)
1434 intptr_t i
, oprsz
= simd_oprsz(desc
);
1435 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
1438 for (i
= 0; i
< oprsz
/ 8; i
++) {
1439 uint64_t nn
= n
[i
], mm
= m
[i
], dd
= nn
+ mm
;
1450 clear_tail(d
, oprsz
, simd_maxsz(desc
));
/*
 * 64-bit unsigned saturating subtract: per element, dd = nn - mm;
 * *vq is the QC flag vector.
 * NOTE(review): the underflow test/store and QC-update lines
 * (orig. 1462-1471) are missing from this extract.
 */
1453 void HELPER(gvec_uqsub_d
)(void *vd
, void *vq
, void *vn
,
1454 void *vm
, uint32_t desc
)
1456 intptr_t i
, oprsz
= simd_oprsz(desc
);
1457 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
1460 for (i
= 0; i
< oprsz
/ 8; i
++) {
1461 uint64_t nn
= n
[i
], mm
= m
[i
], dd
= nn
- mm
;
1472 clear_tail(d
, oprsz
, simd_maxsz(desc
));
/*
 * 64-bit signed saturating add.  Overflow occurs iff the result sign
 * differs from the operands' common sign: ((dd^nn) & ~(nn^mm)) < 0.
 * On overflow the result saturates towards the sign of nn:
 * (nn >> 63) ^ ~INT64_MIN yields INT64_MAX for nn >= 0, else INT64_MIN.
 * NOTE(review): the QC-flag update and element-store lines
 * (orig. 1486-1493) are missing from this extract.
 */
1475 void HELPER(gvec_sqadd_d
)(void *vd
, void *vq
, void *vn
,
1476 void *vm
, uint32_t desc
)
1478 intptr_t i
, oprsz
= simd_oprsz(desc
);
1479 int64_t *d
= vd
, *n
= vn
, *m
= vm
;
1482 for (i
= 0; i
< oprsz
/ 8; i
++) {
1483 int64_t nn
= n
[i
], mm
= m
[i
], dd
= nn
+ mm
;
1484 if (((dd
^ nn
) & ~(nn
^ mm
)) & INT64_MIN
) {
1485 dd
= (nn
>> 63) ^ ~INT64_MIN
;
1494 clear_tail(d
, oprsz
, simd_maxsz(desc
));
/*
 * 64-bit signed saturating subtract.  Overflow occurs iff the operand
 * signs differ and the result sign differs from nn:
 * ((dd^nn) & (nn^mm)) < 0.  Saturation value as for gvec_sqadd_d.
 * NOTE(review): the QC-flag update and element-store lines
 * (orig. 1508-1515) are missing from this extract.
 */
1497 void HELPER(gvec_sqsub_d
)(void *vd
, void *vq
, void *vn
,
1498 void *vm
, uint32_t desc
)
1500 intptr_t i
, oprsz
= simd_oprsz(desc
);
1501 int64_t *d
= vd
, *n
= vn
, *m
= vm
;
1504 for (i
= 0; i
< oprsz
/ 8; i
++) {
1505 int64_t nn
= n
[i
], mm
= m
[i
], dd
= nn
- mm
;
1506 if (((dd
^ nn
) & (nn
^ mm
)) & INT64_MIN
) {
1507 dd
= (nn
>> 63) ^ ~INT64_MIN
;
1516 clear_tail(d
, oprsz
, simd_maxsz(desc
));
/* Shift-right-accumulate: d[e] += n[e] >> shift (sign per TYPE). */
#define DO_SRA(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t e, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (e = 0; e < oprsz / sizeof(TYPE); e++) {        \
        d[e] += n[e] >> shift;                          \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}
1532 DO_SRA(gvec_ssra_b
, int8_t)
1533 DO_SRA(gvec_ssra_h
, int16_t)
1534 DO_SRA(gvec_ssra_s
, int32_t)
1535 DO_SRA(gvec_ssra_d
, int64_t)
1537 DO_SRA(gvec_usra_b
, uint8_t)
1538 DO_SRA(gvec_usra_h
, uint16_t)
1539 DO_SRA(gvec_usra_s
, uint32_t)
1540 DO_SRA(gvec_usra_d
, uint64_t)
/*
 * Rounding shift right: shift by (shift-1) first, then the final bit
 * of the shift plus the rounding increment, so that shift == bitwidth
 * does not over-shift.
 */
#define DO_RSHR(NAME, TYPE)                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t e, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (e = 0; e < oprsz / sizeof(TYPE); e++) {        \
        TYPE tmp = n[e] >> (shift - 1);                 \
        d[e] = (tmp >> 1) + (tmp & 1);                  \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}
1557 DO_RSHR(gvec_srshr_b
, int8_t)
1558 DO_RSHR(gvec_srshr_h
, int16_t)
1559 DO_RSHR(gvec_srshr_s
, int32_t)
1560 DO_RSHR(gvec_srshr_d
, int64_t)
1562 DO_RSHR(gvec_urshr_b
, uint8_t)
1563 DO_RSHR(gvec_urshr_h
, uint16_t)
1564 DO_RSHR(gvec_urshr_s
, uint32_t)
1565 DO_RSHR(gvec_urshr_d
, uint64_t)
/* Rounding shift right and accumulate: d[e] += round_shift(n[e]). */
#define DO_RSRA(NAME, TYPE)                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t e, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (e = 0; e < oprsz / sizeof(TYPE); e++) {        \
        TYPE tmp = n[e] >> (shift - 1);                 \
        d[e] += (tmp >> 1) + (tmp & 1);                 \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}
1582 DO_RSRA(gvec_srsra_b
, int8_t)
1583 DO_RSRA(gvec_srsra_h
, int16_t)
1584 DO_RSRA(gvec_srsra_s
, int32_t)
1585 DO_RSRA(gvec_srsra_d
, int64_t)
1587 DO_RSRA(gvec_ursra_b
, uint8_t)
1588 DO_RSRA(gvec_ursra_h
, uint16_t)
1589 DO_RSRA(gvec_ursra_s
, uint32_t)
1590 DO_RSRA(gvec_ursra_d
, uint64_t)
/* Shift right and insert: low (width - shift) bits of d replaced. */
#define DO_SRI(NAME, TYPE)                                                  \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                        \
{                                                                           \
    intptr_t e, oprsz = simd_oprsz(desc);                                   \
    int shift = simd_data(desc);                                            \
    TYPE *d = vd, *n = vn;                                                  \
    for (e = 0; e < oprsz / sizeof(TYPE); e++) {                            \
        d[e] = deposit64(d[e], 0, sizeof(TYPE) * 8 - shift, n[e] >> shift); \
    }                                                                       \
    clear_tail(d, oprsz, simd_maxsz(desc));                                 \
}
1606 DO_SRI(gvec_sri_b
, uint8_t)
1607 DO_SRI(gvec_sri_h
, uint16_t)
1608 DO_SRI(gvec_sri_s
, uint32_t)
1609 DO_SRI(gvec_sri_d
, uint64_t)
/* Shift left and insert: high (width - shift) bits of d replaced. */
#define DO_SLI(NAME, TYPE)                                               \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                     \
{                                                                        \
    intptr_t e, oprsz = simd_oprsz(desc);                                \
    int shift = simd_data(desc);                                         \
    TYPE *d = vd, *n = vn;                                               \
    for (e = 0; e < oprsz / sizeof(TYPE); e++) {                         \
        d[e] = deposit64(d[e], shift, sizeof(TYPE) * 8 - shift, n[e]);   \
    }                                                                    \
    clear_tail(d, oprsz, simd_maxsz(desc));                              \
}
1625 DO_SLI(gvec_sli_b
, uint8_t)
1626 DO_SLI(gvec_sli_h
, uint16_t)
1627 DO_SLI(gvec_sli_s
, uint32_t)
1628 DO_SLI(gvec_sli_d
, uint64_t)
/*
 * NOTE(review): several interior lines of this function are missing
 * from this extract -- at least the exp == 0x1f (Inf/NaN) branch, the
 * fz16/zero handling inside the "Zero or denormal" arm, and the final
 * repositioning of sign/exp/frac into float32 fields.  Restore them
 * from the upstream source; the visible lines are annotated below.
 */
1633 * Convert float16 to float32, raising no exceptions and
1634 * preserving exceptional values, including SNaN.
1635 * This is effectively an unpack+repack operation.
1637 static float32
float16_to_float32_by_bits(uint32_t f16
, bool fz16
)
1639 const int f16_bias
= 15;
1640 const int f32_bias
= 127;
1641 uint32_t sign
= extract32(f16
, 15, 1);
1642 uint32_t exp
= extract32(f16
, 10, 5);
1643 uint32_t frac
= extract32(f16
, 0, 10);
1648 } else if (exp
== 0) {
1649 /* Zero or denormal. */
1655 * Denormal; these are all normal float32.
1656 * Shift the fraction so that the msb is at bit 11,
1657 * then remove bit 11 as the implicit bit of the
1658 * normalized float32. Note that we still go through
1659 * the shift for normal numbers below, to put the
1660 * float32 fraction at the right place.
1662 int shift
= clz32(frac
) - 21;
1663 frac
= (frac
<< shift
) & 0x3ff;
1664 exp
= f32_bias
- f16_bias
- shift
+ 1;
1668 /* Normal number; adjust the bias. */
1669 exp
+= f32_bias
- f16_bias
;
1675 return sign
| exp
| frac
;
/*
 * Branchless load of u32[0], u64[0], u32[1], or u64[1].
 * Load the 2nd qword iff is_q & is_2.
 * Shift to the 2nd dword iff !is_q & is_2.
 * For !is_q & !is_2, the upper bits of the result are garbage.
 */
static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    uint64_t word = ptr[is_q & is_2];
    int shift = (is_2 & ~is_q) << 5;

    return word >> shift;
}
/*
 * Widening f16 -> f32 multiply-accumulate core for FMLAL/FMLSL.
 * NOTE(review): the declarations of n_4/m_4 and the "if (is_s)" guard
 * around the sign-flip (is_s selects FMLSL, per the comment below)
 * appear to be missing from this extract -- restore before compiling.
 */
1690 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
1691 * as there is not yet SVE versions that might use blocking.
1694 static void do_fmlal(float32
*d
, void *vn
, void *vm
, float_status
*fpst
,
1695 uint32_t desc
, bool fz16
)
1697 intptr_t i
, oprsz
= simd_oprsz(desc
);
1698 int is_s
= extract32(desc
, SIMD_DATA_SHIFT
, 1);
1699 int is_2
= extract32(desc
, SIMD_DATA_SHIFT
+ 1, 1);
1700 int is_q
= oprsz
== 16;
1703 /* Pre-load all of the f16 data, avoiding overlap issues. */
1704 n_4
= load4_f16(vn
, is_q
, is_2
);
1705 m_4
= load4_f16(vm
, is_q
, is_2
);
1707 /* Negate all inputs for FMLSL at once. */
1709 n_4
^= 0x8000800080008000ull
;
1712 for (i
= 0; i
< oprsz
/ 4; i
++) {
1713 float32 n_1
= float16_to_float32_by_bits(n_4
>> (i
* 16), fz16
);
1714 float32 m_1
= float16_to_float32_by_bits(m_4
>> (i
* 16), fz16
);
1715 d
[H4(i
)] = float32_muladd(n_1
, m_1
, d
[H4(i
)], 0, fpst
);
1717 clear_tail(d
, oprsz
, simd_maxsz(desc
));
1720 void HELPER(gvec_fmlal_a32
)(void *vd
, void *vn
, void *vm
,
1721 void *venv
, uint32_t desc
)
1723 CPUARMState
*env
= venv
;
1724 do_fmlal(vd
, vn
, vm
, &env
->vfp
.standard_fp_status
, desc
,
1725 get_flush_inputs_to_zero(&env
->vfp
.fp_status_f16
));
1728 void HELPER(gvec_fmlal_a64
)(void *vd
, void *vn
, void *vm
,
1729 void *venv
, uint32_t desc
)
1731 CPUARMState
*env
= venv
;
1732 do_fmlal(vd
, vn
, vm
, &env
->vfp
.fp_status
, desc
,
1733 get_flush_inputs_to_zero(&env
->vfp
.fp_status_f16
));
/*
 * Indexed variant of do_fmlal: one f16 element of vm (selected by
 * "index") multiplies every element of vn.
 * NOTE(review): the declarations of n_4/m_1 and the "if (is_s)" guard
 * around the sign-flip appear to be missing from this extract.
 */
1736 static void do_fmlal_idx(float32
*d
, void *vn
, void *vm
, float_status
*fpst
,
1737 uint32_t desc
, bool fz16
)
1739 intptr_t i
, oprsz
= simd_oprsz(desc
);
1740 int is_s
= extract32(desc
, SIMD_DATA_SHIFT
, 1);
1741 int is_2
= extract32(desc
, SIMD_DATA_SHIFT
+ 1, 1);
1742 int index
= extract32(desc
, SIMD_DATA_SHIFT
+ 2, 3);
1743 int is_q
= oprsz
== 16;
1747 /* Pre-load all of the f16 data, avoiding overlap issues. */
1748 n_4
= load4_f16(vn
, is_q
, is_2
);
1750 /* Negate all inputs for FMLSL at once. */
1752 n_4
^= 0x8000800080008000ull
;
1755 m_1
= float16_to_float32_by_bits(((float16
*)vm
)[H2(index
)], fz16
);
1757 for (i
= 0; i
< oprsz
/ 4; i
++) {
1758 float32 n_1
= float16_to_float32_by_bits(n_4
>> (i
* 16), fz16
);
1759 d
[H4(i
)] = float32_muladd(n_1
, m_1
, d
[H4(i
)], 0, fpst
);
1761 clear_tail(d
, oprsz
, simd_maxsz(desc
));
1764 void HELPER(gvec_fmlal_idx_a32
)(void *vd
, void *vn
, void *vm
,
1765 void *venv
, uint32_t desc
)
1767 CPUARMState
*env
= venv
;
1768 do_fmlal_idx(vd
, vn
, vm
, &env
->vfp
.standard_fp_status
, desc
,
1769 get_flush_inputs_to_zero(&env
->vfp
.fp_status_f16
));
1772 void HELPER(gvec_fmlal_idx_a64
)(void *vd
, void *vn
, void *vm
,
1773 void *venv
, uint32_t desc
)
1775 CPUARMState
*env
= venv
;
1776 do_fmlal_idx(vd
, vn
, vm
, &env
->vfp
.fp_status
, desc
,
1777 get_flush_inputs_to_zero(&env
->vfp
.fp_status_f16
));
/*
 * Signed 8-bit shift by signed per-element shift count (negative counts
 * shift right).  The visible line handles a large negative count by
 * arithmetic-shifting right, capped at 7 bits.
 * NOTE(review): the declarations of nn/mm/res, the positive-shift and
 * in-range-negative cases, and the store to d[i] are missing from this
 * extract -- restore from the upstream source.
 */
1780 void HELPER(gvec_sshl_b
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
1782 intptr_t i
, opr_sz
= simd_oprsz(desc
);
1783 int8_t *d
= vd
, *n
= vn
, *m
= vm
;
1785 for (i
= 0; i
< opr_sz
; ++i
) {
1794 res
= nn
>> (mm
> -8 ? -mm
: 7);
1798 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
/*
 * Signed 16-bit shift by signed per-element count; only the low 8 bits
 * of each 16-bit shift element are significant (see the visible cast).
 * NOTE(review): the declarations of nn/res, the positive-shift and
 * in-range-negative cases, and the store to d[i] are missing from this
 * extract.
 */
1801 void HELPER(gvec_sshl_h
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
1803 intptr_t i
, opr_sz
= simd_oprsz(desc
);
1804 int16_t *d
= vd
, *n
= vn
, *m
= vm
;
1806 for (i
= 0; i
< opr_sz
/ 2; ++i
) {
1807 int8_t mm
= m
[i
]; /* only 8 bits of shift are significant */
1815 res
= nn
>> (mm
> -16 ? -mm
: 15);
1819 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
/*
 * Unsigned 8-bit shift by signed per-element shift count.
 * NOTE(review): the whole loop body (orig. 1828-1841) is missing from
 * this extract -- restore from the upstream source.
 */
1822 void HELPER(gvec_ushl_b
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
1824 intptr_t i
, opr_sz
= simd_oprsz(desc
);
1825 uint8_t *d
= vd
, *n
= vn
, *m
= vm
;
1827 for (i
= 0; i
< opr_sz
; ++i
) {
1842 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
/*
 * Unsigned 16-bit shift by signed per-element count; only the low 8
 * bits of each shift element are significant (see the visible cast).
 * NOTE(review): the loop body after the mm load (orig. 1852-1864) is
 * missing from this extract.
 */
1845 void HELPER(gvec_ushl_h
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
1847 intptr_t i
, opr_sz
= simd_oprsz(desc
);
1848 uint16_t *d
= vd
, *n
= vn
, *m
= vm
;
1850 for (i
= 0; i
< opr_sz
/ 2; ++i
) {
1851 int8_t mm
= m
[i
]; /* only 8 bits of shift are significant */
1865 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
/*
 * NOTE(review): interior lines are missing from this extract -- the
 * declarations of nn/mm and the accumulator, the XOR-accumulate using
 * "mask" (which replicates bit 0 of each byte of nn across that byte),
 * the nn right-shift, and the store to d[i].  The visible mm update
 * shifts each byte lane left by one without crossing lanes.
 */
1869 * 8x8->8 polynomial multiply.
1871 * Polynomial multiplication is like integer multiplication except the
1872 * partial products are XORed, not added.
1874 * TODO: expose this as a generic vector operation, as it is a common
1875 * crypto building block.
1877 void HELPER(gvec_pmul_b
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
1879 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
);
1880 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
1882 for (i
= 0; i
< opr_sz
/ 8; ++i
) {
1887 for (j
= 0; j
< 8; ++j
) {
1888 uint64_t mask
= (nn
& 0x0101010101010101ull
) * 0xff;
1890 mm
= (mm
<< 1) & 0xfefefefefefefefeull
;
1895 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
/*
 * NOTE(review): interior lines are missing from this extract -- the
 * declarations of rhi/rlo, the bit-0 partial product (see the visible
 * comment), and the stores of the 128-bit result into d[i]/d[i+1].
 * The visible loop XORs in partial product j: mm << j into the low
 * half and mm >> (64 - j) into the high half, under a mask replicating
 * bit j of nn.
 */
1899 * 64x64->128 polynomial multiply.
1900 * Because of the lanes are not accessed in strict columns,
1901 * this probably cannot be turned into a generic helper.
1903 void HELPER(gvec_pmull_q
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
1905 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
);
1906 intptr_t hi
= simd_data(desc
);
1907 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
1909 for (i
= 0; i
< opr_sz
/ 8; i
+= 2) {
1910 uint64_t nn
= n
[i
+ hi
];
1911 uint64_t mm
= m
[i
+ hi
];
1915 /* Bit 0 can only influence the low 64-bit result. */
1920 for (j
= 1; j
< 64; ++j
) {
1921 uint64_t mask
= -((nn
>> j
) & 1);
1922 rlo
^= (mm
<< j
) & mask
;
1923 rhi
^= (mm
>> (64 - j
)) & mask
;
1928 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
/*
 * 8x8->16 polynomial multiply.
 *
 * The byte inputs are expanded to (or extracted from) half-words.
 * Note that neon and sve2 get the inputs from different positions.
 * This allows 4 bytes to be processed in parallel with uint64_t.
 */

static uint64_t expand_byte_to_half(uint64_t x)
{
    uint64_t r = 0;
    int b;

    /* Move byte b of the low word into half-word b of the result. */
    for (b = 0; b < 4; b++) {
        r |= ((x >> (8 * b)) & 0xff) << (16 * b);
    }
    return r;
}
/*
 * Carry-less multiply of four 8-bit values, one per 16-bit lane of
 * each operand; each lane's product fits in the 16-bit lane.
 */
static uint64_t pmull_h(uint64_t op1, uint64_t op2)
{
    uint64_t result = 0;
    int bit;

    for (bit = 0; bit < 8; ++bit) {
        /* Replicate bit "bit" of each lane of op1 across that lane. */
        uint64_t mask = ((op1 >> bit) & 0x0001000100010001ull) * 0xffff;
        result ^= (op2 << bit) & mask;
    }
    return result;
}
1961 void HELPER(neon_pmull_h
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
1963 int hi
= simd_data(desc
);
1964 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
1965 uint64_t nn
= n
[hi
], mm
= m
[hi
];
1967 d
[0] = pmull_h(expand_byte_to_half(nn
), expand_byte_to_half(mm
));
1970 d
[1] = pmull_h(expand_byte_to_half(nn
), expand_byte_to_half(mm
));
1972 clear_tail(d
, 16, simd_maxsz(desc
));
1975 #ifdef TARGET_AARCH64
1976 void HELPER(sve2_pmull_h
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
1978 int shift
= simd_data(desc
) * 8;
1979 intptr_t i
, opr_sz
= simd_oprsz(desc
);
1980 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
1982 for (i
= 0; i
< opr_sz
/ 8; ++i
) {
1983 uint64_t nn
= (n
[i
] >> shift
) & 0x00ff00ff00ff00ffull
;
1984 uint64_t mm
= (m
[i
] >> shift
) & 0x00ff00ff00ff00ffull
;
1986 d
[i
] = pmull_h(nn
, mm
);
/* Carry-less multiply using the low 32 bits of op1 against op2. */
static uint64_t pmull_d(uint64_t op1, uint64_t op2)
{
    uint64_t result = 0;
    int bit;

    for (bit = 0; bit < 32; ++bit) {
        uint64_t mask = -((op1 >> bit) & 1);
        result ^= (op2 << bit) & mask;
    }
    return result;
}
2002 void HELPER(sve2_pmull_d
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2004 intptr_t sel
= H4(simd_data(desc
));
2005 intptr_t i
, opr_sz
= simd_oprsz(desc
);
2006 uint32_t *n
= vn
, *m
= vm
;
2009 for (i
= 0; i
< opr_sz
/ 8; ++i
) {
2010 d
[i
] = pmull_d(n
[2 * i
+ sel
], m
[2 * i
+ sel
]);
/* Compare-with-zero: each element becomes all-ones if (nn OP 0), else 0. */
#define DO_CMP0(NAME, TYPE, OP)                         \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);              \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
        TYPE nn = *(TYPE *)(vn + i);                    \
        *(TYPE *)(vd + i) = -(nn OP 0);                 \
    }                                                   \
    clear_tail(vd, opr_sz, simd_maxsz(desc));           \
}
2026 DO_CMP0(gvec_ceq0_b
, int8_t, ==)
2027 DO_CMP0(gvec_clt0_b
, int8_t, <)
2028 DO_CMP0(gvec_cle0_b
, int8_t, <=)
2029 DO_CMP0(gvec_cgt0_b
, int8_t, >)
2030 DO_CMP0(gvec_cge0_b
, int8_t, >=)
2032 DO_CMP0(gvec_ceq0_h
, int16_t, ==)
2033 DO_CMP0(gvec_clt0_h
, int16_t, <)
2034 DO_CMP0(gvec_cle0_h
, int16_t, <=)
2035 DO_CMP0(gvec_cgt0_h
, int16_t, >)
2036 DO_CMP0(gvec_cge0_h
, int16_t, >=)
/* Absolute difference: d[i] = |n[i] - m[i]| in the given TYPE. */
#define DO_ABD(NAME, TYPE)                                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE *d = vd, *n = vn, *m = vm;                             \
                                                                \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
        d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}
2052 DO_ABD(gvec_sabd_b
, int8_t)
2053 DO_ABD(gvec_sabd_h
, int16_t)
2054 DO_ABD(gvec_sabd_s
, int32_t)
2055 DO_ABD(gvec_sabd_d
, int64_t)
2057 DO_ABD(gvec_uabd_b
, uint8_t)
2058 DO_ABD(gvec_uabd_h
, uint16_t)
2059 DO_ABD(gvec_uabd_s
, uint32_t)
2060 DO_ABD(gvec_uabd_d
, uint64_t)
/* Absolute difference and accumulate: d[i] += |n[i] - m[i]|. */
#define DO_ABA(NAME, TYPE)                                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE *d = vd, *n = vn, *m = vm;                             \
                                                                \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
        d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}
2076 DO_ABA(gvec_saba_b
, int8_t)
2077 DO_ABA(gvec_saba_h
, int16_t)
2078 DO_ABA(gvec_saba_s
, int32_t)
2079 DO_ABA(gvec_saba_d
, int64_t)
2081 DO_ABA(gvec_uaba_b
, uint8_t)
2082 DO_ABA(gvec_uaba_h
, uint16_t)
2083 DO_ABA(gvec_uaba_s
, uint32_t)
2084 DO_ABA(gvec_uaba_d
, uint64_t)
/*
 * Neon pairwise FP ops on 64-bit vectors: adjacent element pairs of n
 * then of m are combined with OP into the destination (single-precision
 * NAME##s and half-precision NAME##h bodies).  All inputs are read
 * before any output is written so that vm == vd is safe (see comments).
 * NOTE(review): interior lines of this macro are missing from this
 * extract -- at least the d/n/m pointer declarations, the float32
 * r0/r1 declarations, and the stores of r0..r3 into d[].  Restore
 * from the upstream source before this file can compile.
 */
2088 #define DO_NEON_PAIRWISE(NAME, OP) \
2089 void HELPER(NAME##s)(void *vd, void *vn, void *vm, \
2090 void *stat, uint32_t oprsz) \
2092 float_status *fpst = stat; \
2098 /* Read all inputs before writing outputs in case vm == vd */ \
2099 r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst); \
2100 r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst); \
2106 void HELPER(NAME##h)(void *vd, void *vn, void *vm, \
2107 void *stat, uint32_t oprsz) \
2109 float_status *fpst = stat; \
2113 float16 r0, r1, r2, r3; \
2115 /* Read all inputs before writing outputs in case vm == vd */ \
2116 r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst); \
2117 r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst); \
2118 r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst); \
2119 r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst); \
2127 DO_NEON_PAIRWISE(neon_padd
, add
)
2128 DO_NEON_PAIRWISE(neon_pmax
, max
)
2129 DO_NEON_PAIRWISE(neon_pmin
, min
)
2131 #undef DO_NEON_PAIRWISE
/* Fixed-point <-> FP conversion with the shift count in simd_data. */
#define DO_VCVT_FIXED(NAME, FUNC, TYPE)                          \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
{                                                                \
    intptr_t e, oprsz = simd_oprsz(desc);                        \
    int shift = simd_data(desc);                                 \
    TYPE *d = vd, *n = vn;                                       \
    float_status *fpst = stat;                                   \
    for (e = 0; e < oprsz / sizeof(TYPE); e++) {                 \
        d[e] = FUNC(n[e], shift, fpst);                          \
    }                                                            \
    clear_tail(d, oprsz, simd_maxsz(desc));                      \
}
2146 DO_VCVT_FIXED(gvec_vcvt_sf
, helper_vfp_sltos
, uint32_t)
2147 DO_VCVT_FIXED(gvec_vcvt_uf
, helper_vfp_ultos
, uint32_t)
2148 DO_VCVT_FIXED(gvec_vcvt_fs
, helper_vfp_tosls_round_to_zero
, uint32_t)
2149 DO_VCVT_FIXED(gvec_vcvt_fu
, helper_vfp_touls_round_to_zero
, uint32_t)
2150 DO_VCVT_FIXED(gvec_vcvt_sh
, helper_vfp_shtoh
, uint16_t)
2151 DO_VCVT_FIXED(gvec_vcvt_uh
, helper_vfp_uhtoh
, uint16_t)
2152 DO_VCVT_FIXED(gvec_vcvt_hs
, helper_vfp_toshh_round_to_zero
, uint16_t)
2153 DO_VCVT_FIXED(gvec_vcvt_hu
, helper_vfp_touhh_round_to_zero
, uint16_t)
2155 #undef DO_VCVT_FIXED
/*
 * FP -> int conversion under an explicit rounding mode (in simd_data);
 * the previous rounding mode is saved and restored around the loop.
 */
#define DO_VCVT_RMODE(NAME, FUNC, TYPE)                          \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
{                                                                \
    float_status *fpst = stat;                                   \
    intptr_t e, oprsz = simd_oprsz(desc);                        \
    uint32_t rmode = simd_data(desc);                            \
    uint32_t prev_rmode = get_float_rounding_mode(fpst);         \
    TYPE *d = vd, *n = vn;                                       \
    set_float_rounding_mode(rmode, fpst);                        \
    for (e = 0; e < oprsz / sizeof(TYPE); e++) {                 \
        d[e] = FUNC(n[e], 0, fpst);                              \
    }                                                            \
    set_float_rounding_mode(prev_rmode, fpst);                   \
    clear_tail(d, oprsz, simd_maxsz(desc));                      \
}
2173 DO_VCVT_RMODE(gvec_vcvt_rm_ss
, helper_vfp_tosls
, uint32_t)
2174 DO_VCVT_RMODE(gvec_vcvt_rm_us
, helper_vfp_touls
, uint32_t)
2175 DO_VCVT_RMODE(gvec_vcvt_rm_sh
, helper_vfp_toshh
, uint16_t)
2176 DO_VCVT_RMODE(gvec_vcvt_rm_uh
, helper_vfp_touhh
, uint16_t)
2178 #undef DO_VCVT_RMODE
/*
 * Round-to-integral under an explicit rounding mode (in simd_data);
 * the previous rounding mode is saved and restored around the loop.
 */
#define DO_VRINT_RMODE(NAME, FUNC, TYPE)                         \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
{                                                                \
    float_status *fpst = stat;                                   \
    intptr_t e, oprsz = simd_oprsz(desc);                        \
    uint32_t rmode = simd_data(desc);                            \
    uint32_t prev_rmode = get_float_rounding_mode(fpst);         \
    TYPE *d = vd, *n = vn;                                       \
    set_float_rounding_mode(rmode, fpst);                        \
    for (e = 0; e < oprsz / sizeof(TYPE); e++) {                 \
        d[e] = FUNC(n[e], fpst);                                 \
    }                                                            \
    set_float_rounding_mode(prev_rmode, fpst);                   \
    clear_tail(d, oprsz, simd_maxsz(desc));                      \
}
2196 DO_VRINT_RMODE(gvec_vrint_rm_h
, helper_rinth
, uint16_t)
2197 DO_VRINT_RMODE(gvec_vrint_rm_s
, helper_rints
, uint32_t)
2199 #undef DO_VRINT_RMODE
/*
 * AArch64 TBL/TBX table lookup across up to 32 wrapped vector
 * registers starting at Vrn; out-of-range indices leave the TBX
 * destination byte unchanged (result pre-loaded from vd) or zero it
 * for TBL (result pre-zeroed) -- see the block comment below.
 * NOTE(review): interior lines are missing from this extract -- at
 * least the declaration of "result" (a vector-register temp with a .b
 * byte array) and the if (is_tbx)/else around the memcpy/memset pair.
 */
2201 #ifdef TARGET_AARCH64
2202 void HELPER(simd_tblx
)(void *vd
, void *vm
, void *venv
, uint32_t desc
)
2204 const uint8_t *indices
= vm
;
2205 CPUARMState
*env
= venv
;
2206 size_t oprsz
= simd_oprsz(desc
);
2207 uint32_t rn
= extract32(desc
, SIMD_DATA_SHIFT
, 5);
2208 bool is_tbx
= extract32(desc
, SIMD_DATA_SHIFT
+ 5, 1);
2209 uint32_t table_len
= desc
>> (SIMD_DATA_SHIFT
+ 6);
2216 * We must construct the final result in a temp, lest the output
2217 * overlaps the input table. For TBL, begin with zero; for TBX,
2218 * begin with the original register contents. Note that we always
2219 * copy 16 bytes here to avoid an extra branch; clearing the high
2220 * bits of the register for oprsz == 8 is handled below.
2223 memcpy(&result
, vd
, 16);
2225 memset(&result
, 0, 16);
2228 for (size_t i
= 0; i
< oprsz
; ++i
) {
2229 uint32_t index
= indices
[H1(i
)];
2231 if (index
< table_len
) {
2233 * Convert index (a byte offset into the virtual table
2234 * which is a series of 128-bit vectors concatenated)
2235 * into the correct register element, bearing in mind
2236 * that the table can wrap around from V31 to V0.
2238 const uint8_t *table
= (const uint8_t *)
2239 aa64_vfp_qreg(env
, (rn
+ (index
>> 4)) % 32);
2240 result
.b
[H1(i
)] = table
[H1(index
% 16)];
2244 memcpy(vd
, &result
, 16);
2245 clear_tail(vd
, oprsz
, simd_maxsz(desc
));
2250 * NxN -> N highpart multiply
2252 * TODO: expose this as a generic vector operation.
2255 void HELPER(gvec_smulh_b
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2257 intptr_t i
, opr_sz
= simd_oprsz(desc
);
2258 int8_t *d
= vd
, *n
= vn
, *m
= vm
;
2260 for (i
= 0; i
< opr_sz
; ++i
) {
2261 d
[i
] = ((int32_t)n
[i
] * m
[i
]) >> 8;
2263 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
2266 void HELPER(gvec_smulh_h
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2268 intptr_t i
, opr_sz
= simd_oprsz(desc
);
2269 int16_t *d
= vd
, *n
= vn
, *m
= vm
;
2271 for (i
= 0; i
< opr_sz
/ 2; ++i
) {
2272 d
[i
] = ((int32_t)n
[i
] * m
[i
]) >> 16;
2274 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
2277 void HELPER(gvec_smulh_s
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2279 intptr_t i
, opr_sz
= simd_oprsz(desc
);
2280 int32_t *d
= vd
, *n
= vn
, *m
= vm
;
2282 for (i
= 0; i
< opr_sz
/ 4; ++i
) {
2283 d
[i
] = ((int64_t)n
[i
] * m
[i
]) >> 32;
2285 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
2288 void HELPER(gvec_smulh_d
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2290 intptr_t i
, opr_sz
= simd_oprsz(desc
);
2291 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
2294 for (i
= 0; i
< opr_sz
/ 8; ++i
) {
2295 muls64(&discard
, &d
[i
], n
[i
], m
[i
]);
2297 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
2300 void HELPER(gvec_umulh_b
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2302 intptr_t i
, opr_sz
= simd_oprsz(desc
);
2303 uint8_t *d
= vd
, *n
= vn
, *m
= vm
;
2305 for (i
= 0; i
< opr_sz
; ++i
) {
2306 d
[i
] = ((uint32_t)n
[i
] * m
[i
]) >> 8;
2308 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
2311 void HELPER(gvec_umulh_h
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2313 intptr_t i
, opr_sz
= simd_oprsz(desc
);
2314 uint16_t *d
= vd
, *n
= vn
, *m
= vm
;
2316 for (i
= 0; i
< opr_sz
/ 2; ++i
) {
2317 d
[i
] = ((uint32_t)n
[i
] * m
[i
]) >> 16;
2319 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
2322 void HELPER(gvec_umulh_s
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2324 intptr_t i
, opr_sz
= simd_oprsz(desc
);
2325 uint32_t *d
= vd
, *n
= vn
, *m
= vm
;
2327 for (i
= 0; i
< opr_sz
/ 4; ++i
) {
2328 d
[i
] = ((uint64_t)n
[i
] * m
[i
]) >> 32;
2330 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
2333 void HELPER(gvec_umulh_d
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2335 intptr_t i
, opr_sz
= simd_oprsz(desc
);
2336 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
2339 for (i
= 0; i
< opr_sz
/ 8; ++i
) {
2340 mulu64(&discard
, &d
[i
], n
[i
], m
[i
]);
2342 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
2345 void HELPER(gvec_xar_d
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2347 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2348 int shr
= simd_data(desc
);
2349 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
2351 for (i
= 0; i
< opr_sz
; ++i
) {
2352 d
[i
] = ror64(n
[i
] ^ m
[i
], shr
);
2354 clear_tail(d
, opr_sz
* 8, simd_maxsz(desc
));