/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "vec_internal.h"
/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}
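/*
 * Worked example (illustrative values, not from the original source):
 * do_sqrdmlah_b(0x40, 0x40, 0, false, true) computes 64 * 64 = 4096,
 * adds the rounding constant 64 and shifts right by 7, giving 0x20;
 * do_sqrdmlah_b(-128, -128, 0, false, true) would give +128, which does
 * not fit in int8_t, so the result saturates to INT8_MAX (0x7f).
 */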
void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}
/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}
uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}
void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}
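/*
 * Note for the _idx_ helpers above: the multiplicand is a single element
 * per 128-bit segment.  "m" is pre-offset by the element index, and m[i]
 * re-reads that element once for each 16-byte block of "n", so all eight
 * int16_t lanes in a segment are multiplied by the same value.
 */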
/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}
void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}
void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}
/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}
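/*
 * Worked example (illustrative values): do_sqrdmlah_d(INT64_MIN, INT64_MIN,
 * 0, false, true) forms the 128-bit product 2**126, adds the rounding
 * constant 2**62, and shifts right by 63 to get 2**63.  That value does not
 * fit in int64_t, so do_sat128_d() saturates the result to INT64_MAX.
 */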
void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}
/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */

#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m = vm;                                                        \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
        d[i] = (a[i] +                                                    \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
    }                                                                     \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
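/*
 * In other words, each TYPED lane of d becomes a[i] plus the dot product
 * of four consecutive narrow elements, e.g. for gvec_sdot_b:
 *   d[i] = a[i] + n[4i]*m[4i] + n[4i+1]*m[4i+1]
 *               + n[4i+2]*m[4i+2] + n[4i+3]*m[4i+3]
 */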
#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
    intptr_t index = simd_data(desc);                                     \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
    do {                                                                  \
        TYPED m0 = m_indexed[i * 4 + 0];                                  \
        TYPED m1 = m_indexed[i * 4 + 1];                                  \
        TYPED m2 = m_indexed[i * 4 + 2];                                  \
        TYPED m3 = m_indexed[i * 4 + 3];                                  \
        do {                                                              \
            d[i] = (a[i] +                                                \
                    n[i * 4 + 0] * m0 +                                   \
                    n[i * 4 + 1] * m1 +                                   \
                    n[i * 4 + 2] * m2 +                                   \
                    n[i * 4 + 3] * m3);                                   \
        } while (++i < segend);                                           \
        segend = i + 4;                                                   \
    } while (i < opr_sz_n);                                               \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, )
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, )
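/*
 * The indexed forms differ only in where the second operand comes from:
 * one group of four narrow elements, selected by simd_data(desc) within
 * each 128-bit segment of vm, is loaded into m0..m3 and reused for every
 * group of n elements in that segment.
 */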
void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
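/*
 * Viewed per complex element pair (re, im): with neg_imag set this is
 * (Re(n) - Im(m), Im(n) + Re(m)), i.e. FCADD with rotate #90; with
 * neg_real set it is (Re(n) + Im(m), Im(n) - Re(m)), i.e. rotate #270.
 */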
623 void HELPER(gvec_fcadds
)(void *vd
, void *vn
, void *vm
,
624 void *vfpst
, uint32_t desc
)
626 uintptr_t opr_sz
= simd_oprsz(desc
);
630 float_status
*fpst
= vfpst
;
631 uint32_t neg_real
= extract32(desc
, SIMD_DATA_SHIFT
, 1);
632 uint32_t neg_imag
= neg_real
^ 1;
635 /* Shift boolean to the sign bit so we can xor to negate. */
639 for (i
= 0; i
< opr_sz
/ 4; i
+= 2) {
640 float32 e0
= n
[H4(i
)];
641 float32 e1
= m
[H4(i
+ 1)] ^ neg_imag
;
642 float32 e2
= n
[H4(i
+ 1)];
643 float32 e3
= m
[H4(i
)] ^ neg_real
;
645 d
[H4(i
)] = float32_add(e0
, e1
, fpst
);
646 d
[H4(i
+ 1)] = float32_add(e2
, e3
, fpst
);
648 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
651 void HELPER(gvec_fcaddd
)(void *vd
, void *vn
, void *vm
,
652 void *vfpst
, uint32_t desc
)
654 uintptr_t opr_sz
= simd_oprsz(desc
);
658 float_status
*fpst
= vfpst
;
659 uint64_t neg_real
= extract64(desc
, SIMD_DATA_SHIFT
, 1);
660 uint64_t neg_imag
= neg_real
^ 1;
663 /* Shift boolean to the sign bit so we can xor to negate. */
667 for (i
= 0; i
< opr_sz
/ 8; i
+= 2) {
669 float64 e1
= m
[i
+ 1] ^ neg_imag
;
670 float64 e2
= n
[i
+ 1];
671 float64 e3
= m
[i
] ^ neg_real
;
673 d
[i
] = float64_add(e0
, e1
, fpst
);
674 d
[i
+ 1] = float64_add(e2
, e3
, fpst
);
676 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
679 void HELPER(gvec_fcmlah
)(void *vd
, void *vn
, void *vm
, void *va
,
680 void *vfpst
, uint32_t desc
)
682 uintptr_t opr_sz
= simd_oprsz(desc
);
683 float16
*d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
684 float_status
*fpst
= vfpst
;
685 intptr_t flip
= extract32(desc
, SIMD_DATA_SHIFT
, 1);
686 uint32_t neg_imag
= extract32(desc
, SIMD_DATA_SHIFT
+ 1, 1);
687 uint32_t neg_real
= flip
^ neg_imag
;
690 /* Shift boolean to the sign bit so we can xor to negate. */
694 for (i
= 0; i
< opr_sz
/ 2; i
+= 2) {
695 float16 e2
= n
[H2(i
+ flip
)];
696 float16 e1
= m
[H2(i
+ flip
)] ^ neg_real
;
698 float16 e3
= m
[H2(i
+ 1 - flip
)] ^ neg_imag
;
700 d
[H2(i
)] = float16_muladd(e2
, e1
, a
[H2(i
)], 0, fpst
);
701 d
[H2(i
+ 1)] = float16_muladd(e4
, e3
, a
[H2(i
+ 1)], 0, fpst
);
703 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
706 void HELPER(gvec_fcmlah_idx
)(void *vd
, void *vn
, void *vm
, void *va
,
707 void *vfpst
, uint32_t desc
)
709 uintptr_t opr_sz
= simd_oprsz(desc
);
710 float16
*d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
711 float_status
*fpst
= vfpst
;
712 intptr_t flip
= extract32(desc
, SIMD_DATA_SHIFT
, 1);
713 uint32_t neg_imag
= extract32(desc
, SIMD_DATA_SHIFT
+ 1, 1);
714 intptr_t index
= extract32(desc
, SIMD_DATA_SHIFT
+ 2, 2);
715 uint32_t neg_real
= flip
^ neg_imag
;
716 intptr_t elements
= opr_sz
/ sizeof(float16
);
717 intptr_t eltspersegment
= 16 / sizeof(float16
);
720 /* Shift boolean to the sign bit so we can xor to negate. */
724 for (i
= 0; i
< elements
; i
+= eltspersegment
) {
725 float16 mr
= m
[H2(i
+ 2 * index
+ 0)];
726 float16 mi
= m
[H2(i
+ 2 * index
+ 1)];
727 float16 e1
= neg_real
^ (flip
? mi
: mr
);
728 float16 e3
= neg_imag
^ (flip
? mr
: mi
);
730 for (j
= i
; j
< i
+ eltspersegment
; j
+= 2) {
731 float16 e2
= n
[H2(j
+ flip
)];
734 d
[H2(j
)] = float16_muladd(e2
, e1
, a
[H2(j
)], 0, fpst
);
735 d
[H2(j
+ 1)] = float16_muladd(e4
, e3
, a
[H2(j
+ 1)], 0, fpst
);
738 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
741 void HELPER(gvec_fcmlas
)(void *vd
, void *vn
, void *vm
, void *va
,
742 void *vfpst
, uint32_t desc
)
744 uintptr_t opr_sz
= simd_oprsz(desc
);
745 float32
*d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
746 float_status
*fpst
= vfpst
;
747 intptr_t flip
= extract32(desc
, SIMD_DATA_SHIFT
, 1);
748 uint32_t neg_imag
= extract32(desc
, SIMD_DATA_SHIFT
+ 1, 1);
749 uint32_t neg_real
= flip
^ neg_imag
;
752 /* Shift boolean to the sign bit so we can xor to negate. */
756 for (i
= 0; i
< opr_sz
/ 4; i
+= 2) {
757 float32 e2
= n
[H4(i
+ flip
)];
758 float32 e1
= m
[H4(i
+ flip
)] ^ neg_real
;
760 float32 e3
= m
[H4(i
+ 1 - flip
)] ^ neg_imag
;
762 d
[H4(i
)] = float32_muladd(e2
, e1
, a
[H4(i
)], 0, fpst
);
763 d
[H4(i
+ 1)] = float32_muladd(e4
, e3
, a
[H4(i
+ 1)], 0, fpst
);
765 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
768 void HELPER(gvec_fcmlas_idx
)(void *vd
, void *vn
, void *vm
, void *va
,
769 void *vfpst
, uint32_t desc
)
771 uintptr_t opr_sz
= simd_oprsz(desc
);
772 float32
*d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
773 float_status
*fpst
= vfpst
;
774 intptr_t flip
= extract32(desc
, SIMD_DATA_SHIFT
, 1);
775 uint32_t neg_imag
= extract32(desc
, SIMD_DATA_SHIFT
+ 1, 1);
776 intptr_t index
= extract32(desc
, SIMD_DATA_SHIFT
+ 2, 2);
777 uint32_t neg_real
= flip
^ neg_imag
;
778 intptr_t elements
= opr_sz
/ sizeof(float32
);
779 intptr_t eltspersegment
= 16 / sizeof(float32
);
782 /* Shift boolean to the sign bit so we can xor to negate. */
786 for (i
= 0; i
< elements
; i
+= eltspersegment
) {
787 float32 mr
= m
[H4(i
+ 2 * index
+ 0)];
788 float32 mi
= m
[H4(i
+ 2 * index
+ 1)];
789 float32 e1
= neg_real
^ (flip
? mi
: mr
);
790 float32 e3
= neg_imag
^ (flip
? mr
: mi
);
792 for (j
= i
; j
< i
+ eltspersegment
; j
+= 2) {
793 float32 e2
= n
[H4(j
+ flip
)];
796 d
[H4(j
)] = float32_muladd(e2
, e1
, a
[H4(j
)], 0, fpst
);
797 d
[H4(j
+ 1)] = float32_muladd(e4
, e3
, a
[H4(j
+ 1)], 0, fpst
);
800 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
803 void HELPER(gvec_fcmlad
)(void *vd
, void *vn
, void *vm
, void *va
,
804 void *vfpst
, uint32_t desc
)
806 uintptr_t opr_sz
= simd_oprsz(desc
);
807 float64
*d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
808 float_status
*fpst
= vfpst
;
809 intptr_t flip
= extract32(desc
, SIMD_DATA_SHIFT
, 1);
810 uint64_t neg_imag
= extract32(desc
, SIMD_DATA_SHIFT
+ 1, 1);
811 uint64_t neg_real
= flip
^ neg_imag
;
814 /* Shift boolean to the sign bit so we can xor to negate. */
818 for (i
= 0; i
< opr_sz
/ 8; i
+= 2) {
819 float64 e2
= n
[i
+ flip
];
820 float64 e1
= m
[i
+ flip
] ^ neg_real
;
822 float64 e3
= m
[i
+ 1 - flip
] ^ neg_imag
;
824 d
[i
] = float64_muladd(e2
, e1
, a
[i
], 0, fpst
);
825 d
[i
+ 1] = float64_muladd(e4
, e3
, a
[i
+ 1], 0, fpst
);
827 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
831 * Floating point comparisons producing an integer result (all 1s or all 0s).
832 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
833 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
835 static uint16_t float16_ceq(float16 op1
, float16 op2
, float_status
*stat
)
837 return -float16_eq_quiet(op1
, op2
, stat
);
840 static uint32_t float32_ceq(float32 op1
, float32 op2
, float_status
*stat
)
842 return -float32_eq_quiet(op1
, op2
, stat
);
845 static uint16_t float16_cge(float16 op1
, float16 op2
, float_status
*stat
)
847 return -float16_le(op2
, op1
, stat
);
850 static uint32_t float32_cge(float32 op1
, float32 op2
, float_status
*stat
)
852 return -float32_le(op2
, op1
, stat
);
855 static uint16_t float16_cgt(float16 op1
, float16 op2
, float_status
*stat
)
857 return -float16_lt(op2
, op1
, stat
);
860 static uint32_t float32_cgt(float32 op1
, float32 op2
, float_status
*stat
)
862 return -float32_lt(op2
, op1
, stat
);
865 static uint16_t float16_acge(float16 op1
, float16 op2
, float_status
*stat
)
867 return -float16_le(float16_abs(op2
), float16_abs(op1
), stat
);
870 static uint32_t float32_acge(float32 op1
, float32 op2
, float_status
*stat
)
872 return -float32_le(float32_abs(op2
), float32_abs(op1
), stat
);
875 static uint16_t float16_acgt(float16 op1
, float16 op2
, float_status
*stat
)
877 return -float16_lt(float16_abs(op2
), float16_abs(op1
), stat
);
880 static uint32_t float32_acgt(float32 op1
, float32 op2
, float_status
*stat
)
882 return -float32_lt(float32_abs(op2
), float32_abs(op1
), stat
);
885 static int16_t vfp_tosszh(float16 x
, void *fpstp
)
887 float_status
*fpst
= fpstp
;
888 if (float16_is_any_nan(x
)) {
889 float_raise(float_flag_invalid
, fpst
);
892 return float16_to_int16_round_to_zero(x
, fpst
);
895 static uint16_t vfp_touszh(float16 x
, void *fpstp
)
897 float_status
*fpst
= fpstp
;
898 if (float16_is_any_nan(x
)) {
899 float_raise(float_flag_invalid
, fpst
);
902 return float16_to_uint16_round_to_zero(x
, fpst
);
905 #define DO_2OP(NAME, FUNC, TYPE) \
906 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
908 intptr_t i, oprsz = simd_oprsz(desc); \
909 TYPE *d = vd, *n = vn; \
910 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
911 d[i] = FUNC(n[i], stat); \
913 clear_tail(d, oprsz, simd_maxsz(desc)); \
916 DO_2OP(gvec_frecpe_h
, helper_recpe_f16
, float16
)
917 DO_2OP(gvec_frecpe_s
, helper_recpe_f32
, float32
)
918 DO_2OP(gvec_frecpe_d
, helper_recpe_f64
, float64
)
920 DO_2OP(gvec_frsqrte_h
, helper_rsqrte_f16
, float16
)
921 DO_2OP(gvec_frsqrte_s
, helper_rsqrte_f32
, float32
)
922 DO_2OP(gvec_frsqrte_d
, helper_rsqrte_f64
, float64
)
924 DO_2OP(gvec_vrintx_h
, float16_round_to_int
, float16
)
925 DO_2OP(gvec_vrintx_s
, float32_round_to_int
, float32
)
927 DO_2OP(gvec_sitos
, helper_vfp_sitos
, int32_t)
928 DO_2OP(gvec_uitos
, helper_vfp_uitos
, uint32_t)
929 DO_2OP(gvec_tosizs
, helper_vfp_tosizs
, float32
)
930 DO_2OP(gvec_touizs
, helper_vfp_touizs
, float32
)
931 DO_2OP(gvec_sstoh
, int16_to_float16
, int16_t)
932 DO_2OP(gvec_ustoh
, uint16_to_float16
, uint16_t)
933 DO_2OP(gvec_tosszh
, vfp_tosszh
, float16
)
934 DO_2OP(gvec_touszh
, vfp_touszh
, float16
)
936 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
937 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
939 return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
942 #define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
943 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
945 return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
948 #define DO_2OP_CMP0(FN, CMPOP, DIRN) \
949 WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
950 WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
951 DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
952 DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
954 DO_2OP_CMP0(cgt
, cgt
, FWD
)
955 DO_2OP_CMP0(cge
, cge
, FWD
)
956 DO_2OP_CMP0(ceq
, ceq
, FWD
)
957 DO_2OP_CMP0(clt
, cgt
, REV
)
958 DO_2OP_CMP0(cle
, cge
, REV
)
963 /* Floating-point trigonometric starting value.
964 * See the ARM ARM pseudocode function FPTrigSMul.
966 static float16
float16_ftsmul(float16 op1
, uint16_t op2
, float_status
*stat
)
968 float16 result
= float16_mul(op1
, op1
, stat
);
969 if (!float16_is_any_nan(result
)) {
970 result
= float16_set_sign(result
, op2
& 1);
975 static float32
float32_ftsmul(float32 op1
, uint32_t op2
, float_status
*stat
)
977 float32 result
= float32_mul(op1
, op1
, stat
);
978 if (!float32_is_any_nan(result
)) {
979 result
= float32_set_sign(result
, op2
& 1);
984 static float64
float64_ftsmul(float64 op1
, uint64_t op2
, float_status
*stat
)
986 float64 result
= float64_mul(op1
, op1
, stat
);
987 if (!float64_is_any_nan(result
)) {
988 result
= float64_set_sign(result
, op2
& 1);
993 static float16
float16_abd(float16 op1
, float16 op2
, float_status
*stat
)
995 return float16_abs(float16_sub(op1
, op2
, stat
));
998 static float32
float32_abd(float32 op1
, float32 op2
, float_status
*stat
)
1000 return float32_abs(float32_sub(op1
, op2
, stat
));
1004 * Reciprocal step. These are the AArch32 version which uses a
1005 * non-fused multiply-and-subtract.
1007 static float16
float16_recps_nf(float16 op1
, float16 op2
, float_status
*stat
)
1009 op1
= float16_squash_input_denormal(op1
, stat
);
1010 op2
= float16_squash_input_denormal(op2
, stat
);
1012 if ((float16_is_infinity(op1
) && float16_is_zero(op2
)) ||
1013 (float16_is_infinity(op2
) && float16_is_zero(op1
))) {
1016 return float16_sub(float16_two
, float16_mul(op1
, op2
, stat
), stat
);
1019 static float32
float32_recps_nf(float32 op1
, float32 op2
, float_status
*stat
)
1021 op1
= float32_squash_input_denormal(op1
, stat
);
1022 op2
= float32_squash_input_denormal(op2
, stat
);
1024 if ((float32_is_infinity(op1
) && float32_is_zero(op2
)) ||
1025 (float32_is_infinity(op2
) && float32_is_zero(op1
))) {
1028 return float32_sub(float32_two
, float32_mul(op1
, op2
, stat
), stat
);
1031 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1032 static float16
float16_rsqrts_nf(float16 op1
, float16 op2
, float_status
*stat
)
1034 op1
= float16_squash_input_denormal(op1
, stat
);
1035 op2
= float16_squash_input_denormal(op2
, stat
);
1037 if ((float16_is_infinity(op1
) && float16_is_zero(op2
)) ||
1038 (float16_is_infinity(op2
) && float16_is_zero(op1
))) {
1039 return float16_one_point_five
;
1041 op1
= float16_sub(float16_three
, float16_mul(op1
, op2
, stat
), stat
);
1042 return float16_div(op1
, float16_two
, stat
);
1045 static float32
float32_rsqrts_nf(float32 op1
, float32 op2
, float_status
*stat
)
1047 op1
= float32_squash_input_denormal(op1
, stat
);
1048 op2
= float32_squash_input_denormal(op2
, stat
);
1050 if ((float32_is_infinity(op1
) && float32_is_zero(op2
)) ||
1051 (float32_is_infinity(op2
) && float32_is_zero(op1
))) {
1052 return float32_one_point_five
;
1054 op1
= float32_sub(float32_three
, float32_mul(op1
, op2
, stat
), stat
);
1055 return float32_div(op1
, float32_two
, stat
);
1058 #define DO_3OP(NAME, FUNC, TYPE) \
1059 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1061 intptr_t i, oprsz = simd_oprsz(desc); \
1062 TYPE *d = vd, *n = vn, *m = vm; \
1063 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1064 d[i] = FUNC(n[i], m[i], stat); \
1066 clear_tail(d, oprsz, simd_maxsz(desc)); \
1069 DO_3OP(gvec_fadd_h
, float16_add
, float16
)
1070 DO_3OP(gvec_fadd_s
, float32_add
, float32
)
1071 DO_3OP(gvec_fadd_d
, float64_add
, float64
)
1073 DO_3OP(gvec_fsub_h
, float16_sub
, float16
)
1074 DO_3OP(gvec_fsub_s
, float32_sub
, float32
)
1075 DO_3OP(gvec_fsub_d
, float64_sub
, float64
)
1077 DO_3OP(gvec_fmul_h
, float16_mul
, float16
)
1078 DO_3OP(gvec_fmul_s
, float32_mul
, float32
)
1079 DO_3OP(gvec_fmul_d
, float64_mul
, float64
)
1081 DO_3OP(gvec_ftsmul_h
, float16_ftsmul
, float16
)
1082 DO_3OP(gvec_ftsmul_s
, float32_ftsmul
, float32
)
1083 DO_3OP(gvec_ftsmul_d
, float64_ftsmul
, float64
)
1085 DO_3OP(gvec_fabd_h
, float16_abd
, float16
)
1086 DO_3OP(gvec_fabd_s
, float32_abd
, float32
)
1088 DO_3OP(gvec_fceq_h
, float16_ceq
, float16
)
1089 DO_3OP(gvec_fceq_s
, float32_ceq
, float32
)
1091 DO_3OP(gvec_fcge_h
, float16_cge
, float16
)
1092 DO_3OP(gvec_fcge_s
, float32_cge
, float32
)
1094 DO_3OP(gvec_fcgt_h
, float16_cgt
, float16
)
1095 DO_3OP(gvec_fcgt_s
, float32_cgt
, float32
)
1097 DO_3OP(gvec_facge_h
, float16_acge
, float16
)
1098 DO_3OP(gvec_facge_s
, float32_acge
, float32
)
1100 DO_3OP(gvec_facgt_h
, float16_acgt
, float16
)
1101 DO_3OP(gvec_facgt_s
, float32_acgt
, float32
)
1103 DO_3OP(gvec_fmax_h
, float16_max
, float16
)
1104 DO_3OP(gvec_fmax_s
, float32_max
, float32
)
1106 DO_3OP(gvec_fmin_h
, float16_min
, float16
)
1107 DO_3OP(gvec_fmin_s
, float32_min
, float32
)
1109 DO_3OP(gvec_fmaxnum_h
, float16_maxnum
, float16
)
1110 DO_3OP(gvec_fmaxnum_s
, float32_maxnum
, float32
)
1112 DO_3OP(gvec_fminnum_h
, float16_minnum
, float16
)
1113 DO_3OP(gvec_fminnum_s
, float32_minnum
, float32
)
1115 DO_3OP(gvec_recps_nf_h
, float16_recps_nf
, float16
)
1116 DO_3OP(gvec_recps_nf_s
, float32_recps_nf
, float32
)
1118 DO_3OP(gvec_rsqrts_nf_h
, float16_rsqrts_nf
, float16
)
1119 DO_3OP(gvec_rsqrts_nf_s
, float32_rsqrts_nf
, float32
)
1121 #ifdef TARGET_AARCH64
1123 DO_3OP(gvec_recps_h
, helper_recpsf_f16
, float16
)
1124 DO_3OP(gvec_recps_s
, helper_recpsf_f32
, float32
)
1125 DO_3OP(gvec_recps_d
, helper_recpsf_f64
, float64
)
1127 DO_3OP(gvec_rsqrts_h
, helper_rsqrtsf_f16
, float16
)
1128 DO_3OP(gvec_rsqrts_s
, helper_rsqrtsf_f32
, float32
)
1129 DO_3OP(gvec_rsqrts_d
, helper_rsqrtsf_f64
, float64
)
1134 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1135 static float16
float16_muladd_nf(float16 dest
, float16 op1
, float16 op2
,
1138 return float16_add(dest
, float16_mul(op1
, op2
, stat
), stat
);
1141 static float32
float32_muladd_nf(float32 dest
, float32 op1
, float32 op2
,
1144 return float32_add(dest
, float32_mul(op1
, op2
, stat
), stat
);
1147 static float16
float16_mulsub_nf(float16 dest
, float16 op1
, float16 op2
,
1150 return float16_sub(dest
, float16_mul(op1
, op2
, stat
), stat
);
1153 static float32
float32_mulsub_nf(float32 dest
, float32 op1
, float32 op2
,
1156 return float32_sub(dest
, float32_mul(op1
, op2
, stat
), stat
);
1159 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1160 static float16
float16_muladd_f(float16 dest
, float16 op1
, float16 op2
,
1163 return float16_muladd(op1
, op2
, dest
, 0, stat
);
1166 static float32
float32_muladd_f(float32 dest
, float32 op1
, float32 op2
,
1169 return float32_muladd(op1
, op2
, dest
, 0, stat
);
1172 static float16
float16_mulsub_f(float16 dest
, float16 op1
, float16 op2
,
1175 return float16_muladd(float16_chs(op1
), op2
, dest
, 0, stat
);
1178 static float32
float32_mulsub_f(float32 dest
, float32 op1
, float32 op2
,
1181 return float32_muladd(float32_chs(op1
), op2
, dest
, 0, stat
);
1184 #define DO_MULADD(NAME, FUNC, TYPE) \
1185 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1187 intptr_t i, oprsz = simd_oprsz(desc); \
1188 TYPE *d = vd, *n = vn, *m = vm; \
1189 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1190 d[i] = FUNC(d[i], n[i], m[i], stat); \
1192 clear_tail(d, oprsz, simd_maxsz(desc)); \
1195 DO_MULADD(gvec_fmla_h
, float16_muladd_nf
, float16
)
1196 DO_MULADD(gvec_fmla_s
, float32_muladd_nf
, float32
)
1198 DO_MULADD(gvec_fmls_h
, float16_mulsub_nf
, float16
)
1199 DO_MULADD(gvec_fmls_s
, float32_mulsub_nf
, float32
)
1201 DO_MULADD(gvec_vfma_h
, float16_muladd_f
, float16
)
1202 DO_MULADD(gvec_vfma_s
, float32_muladd_f
, float32
)
1204 DO_MULADD(gvec_vfms_h
, float16_mulsub_f
, float16
)
1205 DO_MULADD(gvec_vfms_s
, float32_mulsub_f
, float32
)
1207 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1208 * For AdvSIMD, there is of course only one such vector segment.
1211 #define DO_MUL_IDX(NAME, TYPE, H) \
1212 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1214 intptr_t i, j, oprsz = simd_oprsz(desc); \
1215 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1216 intptr_t idx = simd_data(desc); \
1217 TYPE *d = vd, *n = vn, *m = vm; \
1218 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1219 TYPE mm = m[H(i + idx)]; \
1220 for (j = 0; j < segment; j++) { \
1221 d[i + j] = n[i + j] * mm; \
1224 clear_tail(d, oprsz, simd_maxsz(desc)); \
1227 DO_MUL_IDX(gvec_mul_idx_h
, uint16_t, H2
)
1228 DO_MUL_IDX(gvec_mul_idx_s
, uint32_t, H4
)
1229 DO_MUL_IDX(gvec_mul_idx_d
, uint64_t, )
1233 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1234 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1236 intptr_t i, j, oprsz = simd_oprsz(desc); \
1237 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1238 intptr_t idx = simd_data(desc); \
1239 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1240 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1241 TYPE mm = m[H(i + idx)]; \
1242 for (j = 0; j < segment; j++) { \
1243 d[i + j] = a[i + j] OP n[i + j] * mm; \
1246 clear_tail(d, oprsz, simd_maxsz(desc)); \
1249 DO_MLA_IDX(gvec_mla_idx_h
, uint16_t, +, H2
)
1250 DO_MLA_IDX(gvec_mla_idx_s
, uint32_t, +, H4
)
1251 DO_MLA_IDX(gvec_mla_idx_d
, uint64_t, +, )
1253 DO_MLA_IDX(gvec_mls_idx_h
, uint16_t, -, H2
)
1254 DO_MLA_IDX(gvec_mls_idx_s
, uint32_t, -, H4
)
1255 DO_MLA_IDX(gvec_mls_idx_d
, uint64_t, -, )
1259 #define DO_FMUL_IDX(NAME, ADD, TYPE, H) \
1260 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1262 intptr_t i, j, oprsz = simd_oprsz(desc); \
1263 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1264 intptr_t idx = simd_data(desc); \
1265 TYPE *d = vd, *n = vn, *m = vm; \
1266 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1267 TYPE mm = m[H(i + idx)]; \
1268 for (j = 0; j < segment; j++) { \
1269 d[i + j] = TYPE##_##ADD(d[i + j], \
1270 TYPE##_mul(n[i + j], mm, stat), stat); \
1273 clear_tail(d, oprsz, simd_maxsz(desc)); \
1276 #define float16_nop(N, M, S) (M)
1277 #define float32_nop(N, M, S) (M)
1278 #define float64_nop(N, M, S) (M)
1280 DO_FMUL_IDX(gvec_fmul_idx_h
, nop
, float16
, H2
)
1281 DO_FMUL_IDX(gvec_fmul_idx_s
, nop
, float32
, H4
)
1282 DO_FMUL_IDX(gvec_fmul_idx_d
, nop
, float64
, )
1285 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1286 * the fused ops below they assume accumulate both from and into Vd.
1288 DO_FMUL_IDX(gvec_fmla_nf_idx_h
, add
, float16
, H2
)
1289 DO_FMUL_IDX(gvec_fmla_nf_idx_s
, add
, float32
, H4
)
1290 DO_FMUL_IDX(gvec_fmls_nf_idx_h
, sub
, float16
, H2
)
1291 DO_FMUL_IDX(gvec_fmls_nf_idx_s
, sub
, float32
, H4
)
1298 #define DO_FMLA_IDX(NAME, TYPE, H) \
1299 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
1300 void *stat, uint32_t desc) \
1302 intptr_t i, j, oprsz = simd_oprsz(desc); \
1303 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1304 TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \
1305 intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \
1306 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1307 op1_neg <<= (8 * sizeof(TYPE) - 1); \
1308 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1309 TYPE mm = m[H(i + idx)]; \
1310 for (j = 0; j < segment; j++) { \
1311 d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \
1312 mm, a[i + j], 0, stat); \
1315 clear_tail(d, oprsz, simd_maxsz(desc)); \
1318 DO_FMLA_IDX(gvec_fmla_idx_h
, float16
, H2
)
1319 DO_FMLA_IDX(gvec_fmla_idx_s
, float32
, H4
)
1320 DO_FMLA_IDX(gvec_fmla_idx_d
, float64
, )
1324 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1325 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \
1327 intptr_t i, oprsz = simd_oprsz(desc); \
1328 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \
1330 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \
1331 WTYPE dd = (WTYPE)n[i] OP m[i]; \
1335 } else if (dd > MAX) { \
1342 uint32_t *qc = vq; \
1345 clear_tail(d, oprsz, simd_maxsz(desc)); \
1348 DO_SAT(gvec_uqadd_b
, int, uint8_t, uint8_t, +, 0, UINT8_MAX
)
1349 DO_SAT(gvec_uqadd_h
, int, uint16_t, uint16_t, +, 0, UINT16_MAX
)
1350 DO_SAT(gvec_uqadd_s
, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX
)
1352 DO_SAT(gvec_sqadd_b
, int, int8_t, int8_t, +, INT8_MIN
, INT8_MAX
)
1353 DO_SAT(gvec_sqadd_h
, int, int16_t, int16_t, +, INT16_MIN
, INT16_MAX
)
1354 DO_SAT(gvec_sqadd_s
, int64_t, int32_t, int32_t, +, INT32_MIN
, INT32_MAX
)
1356 DO_SAT(gvec_uqsub_b
, int, uint8_t, uint8_t, -, 0, UINT8_MAX
)
1357 DO_SAT(gvec_uqsub_h
, int, uint16_t, uint16_t, -, 0, UINT16_MAX
)
1358 DO_SAT(gvec_uqsub_s
, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX
)
1360 DO_SAT(gvec_sqsub_b
, int, int8_t, int8_t, -, INT8_MIN
, INT8_MAX
)
1361 DO_SAT(gvec_sqsub_h
, int, int16_t, int16_t, -, INT16_MIN
, INT16_MAX
)
1362 DO_SAT(gvec_sqsub_s
, int64_t, int32_t, int32_t, -, INT32_MIN
, INT32_MAX
)
1366 void HELPER(gvec_uqadd_d
)(void *vd
, void *vq
, void *vn
,
1367 void *vm
, uint32_t desc
)
1369 intptr_t i
, oprsz
= simd_oprsz(desc
);
1370 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
1373 for (i
= 0; i
< oprsz
/ 8; i
++) {
1374 uint64_t nn
= n
[i
], mm
= m
[i
], dd
= nn
+ mm
;
1385 clear_tail(d
, oprsz
, simd_maxsz(desc
));
1388 void HELPER(gvec_uqsub_d
)(void *vd
, void *vq
, void *vn
,
1389 void *vm
, uint32_t desc
)
1391 intptr_t i
, oprsz
= simd_oprsz(desc
);
1392 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
1395 for (i
= 0; i
< oprsz
/ 8; i
++) {
1396 uint64_t nn
= n
[i
], mm
= m
[i
], dd
= nn
- mm
;
1407 clear_tail(d
, oprsz
, simd_maxsz(desc
));
1410 void HELPER(gvec_sqadd_d
)(void *vd
, void *vq
, void *vn
,
1411 void *vm
, uint32_t desc
)
1413 intptr_t i
, oprsz
= simd_oprsz(desc
);
1414 int64_t *d
= vd
, *n
= vn
, *m
= vm
;
1417 for (i
= 0; i
< oprsz
/ 8; i
++) {
1418 int64_t nn
= n
[i
], mm
= m
[i
], dd
= nn
+ mm
;
1419 if (((dd
^ nn
) & ~(nn
^ mm
)) & INT64_MIN
) {
1420 dd
= (nn
>> 63) ^ ~INT64_MIN
;
1429 clear_tail(d
, oprsz
, simd_maxsz(desc
));
1432 void HELPER(gvec_sqsub_d
)(void *vd
, void *vq
, void *vn
,
1433 void *vm
, uint32_t desc
)
1435 intptr_t i
, oprsz
= simd_oprsz(desc
);
1436 int64_t *d
= vd
, *n
= vn
, *m
= vm
;
1439 for (i
= 0; i
< oprsz
/ 8; i
++) {
1440 int64_t nn
= n
[i
], mm
= m
[i
], dd
= nn
- mm
;
1441 if (((dd
^ nn
) & (nn
^ mm
)) & INT64_MIN
) {
1442 dd
= (nn
>> 63) ^ ~INT64_MIN
;
1451 clear_tail(d
, oprsz
, simd_maxsz(desc
));
#define DO_SRA(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] += n[i] >> shift;                          \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SRA(gvec_ssra_b, int8_t)
DO_SRA(gvec_ssra_h, int16_t)
DO_SRA(gvec_ssra_s, int32_t)
DO_SRA(gvec_ssra_d, int64_t)

DO_SRA(gvec_usra_b, uint8_t)
DO_SRA(gvec_usra_h, uint16_t)
DO_SRA(gvec_usra_s, uint32_t)
DO_SRA(gvec_usra_d, uint64_t)

#undef DO_SRA
#define DO_RSHR(NAME, TYPE)                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        TYPE tmp = n[i] >> (shift - 1);                 \
        d[i] = (tmp >> 1) + (tmp & 1);                  \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_RSHR(gvec_srshr_b, int8_t)
DO_RSHR(gvec_srshr_h, int16_t)
DO_RSHR(gvec_srshr_s, int32_t)
DO_RSHR(gvec_srshr_d, int64_t)

DO_RSHR(gvec_urshr_b, uint8_t)
DO_RSHR(gvec_urshr_h, uint16_t)
DO_RSHR(gvec_urshr_s, uint32_t)
DO_RSHR(gvec_urshr_d, uint64_t)

#undef DO_RSHR
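/*
 * The two-step shift implements rounding toward nearest: e.g. an unsigned
 * right shift of 7 by 2 computes tmp = 7 >> 1 = 3, then (3 >> 1) + (3 & 1)
 * = 2, i.e. round(7 / 4), without needing a wider intermediate type.
 */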
#define DO_RSRA(NAME, TYPE)                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        TYPE tmp = n[i] >> (shift - 1);                 \
        d[i] += (tmp >> 1) + (tmp & 1);                 \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_RSRA(gvec_srsra_b, int8_t)
DO_RSRA(gvec_srsra_h, int16_t)
DO_RSRA(gvec_srsra_s, int32_t)
DO_RSRA(gvec_srsra_d, int64_t)

DO_RSRA(gvec_ursra_b, uint8_t)
DO_RSRA(gvec_ursra_h, uint16_t)
DO_RSRA(gvec_ursra_s, uint32_t)
DO_RSRA(gvec_ursra_d, uint64_t)

#undef DO_RSRA
#define DO_SRI(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SRI(gvec_sri_b, uint8_t)
DO_SRI(gvec_sri_h, uint16_t)
DO_SRI(gvec_sri_s, uint32_t)
DO_SRI(gvec_sri_d, uint64_t)

#undef DO_SRI
#define DO_SLI(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SLI(gvec_sli_b, uint8_t)
DO_SLI(gvec_sli_h, uint16_t)
DO_SLI(gvec_sli_s, uint32_t)
DO_SLI(gvec_sli_d, uint64_t)

#undef DO_SLI
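/*
 * Illustration for 8-bit lanes with shift == 4: SRI keeps the top four
 * bits of d and inserts n >> 4 below them, giving (d & 0xf0) | (n >> 4);
 * SLI keeps the low four bits of d and inserts n above them, giving
 * (d & 0x0f) | ((n << 4) & 0xf0).
 */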
/*
 * Convert float16 to float32, raising no exceptions and
 * preserving exceptional values, including SNaN.
 * This is effectively an unpack+repack operation.
 */
static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal.  */
        if (frac != 0) {
            if (fz16) {
                frac = 0;
            } else {
                /*
                 * Denormal; these are all normal float32.
                 * Shift the fraction so that the msb is at bit 11,
                 * then remove bit 11 as the implicit bit of the
                 * normalized float32.  Note that we still go through
                 * the shift for normal numbers below, to put the
                 * float32 fraction at the right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias.  */
        exp += f32_bias - f16_bias;
    }
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}
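/*
 * Example: 0x3c00 (float16 1.0) maps to 0x3f800000 (float32 1.0), and the
 * signaling NaN 0x7d00 maps to 0x7fa00000, which is still signaling since
 * the quiet bit (now bit 22) remains clear.
 */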
static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /*
     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
     * Load the 2nd qword iff is_q & is_2.
     * Shift to the 2nd dword iff !is_q & is_2.
     * For !is_q & !is_2, the upper bits of the result are garbage.
     */
    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
}
/*
 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
 * as there are not yet SVE versions that might use blocking.
 */
1629 static void do_fmlal(float32
*d
, void *vn
, void *vm
, float_status
*fpst
,
1630 uint32_t desc
, bool fz16
)
1632 intptr_t i
, oprsz
= simd_oprsz(desc
);
1633 int is_s
= extract32(desc
, SIMD_DATA_SHIFT
, 1);
1634 int is_2
= extract32(desc
, SIMD_DATA_SHIFT
+ 1, 1);
1635 int is_q
= oprsz
== 16;
1638 /* Pre-load all of the f16 data, avoiding overlap issues. */
1639 n_4
= load4_f16(vn
, is_q
, is_2
);
1640 m_4
= load4_f16(vm
, is_q
, is_2
);
1642 /* Negate all inputs for FMLSL at once. */
1644 n_4
^= 0x8000800080008000ull
;
1647 for (i
= 0; i
< oprsz
/ 4; i
++) {
1648 float32 n_1
= float16_to_float32_by_bits(n_4
>> (i
* 16), fz16
);
1649 float32 m_1
= float16_to_float32_by_bits(m_4
>> (i
* 16), fz16
);
1650 d
[H4(i
)] = float32_muladd(n_1
, m_1
, d
[H4(i
)], 0, fpst
);
1652 clear_tail(d
, oprsz
, simd_maxsz(desc
));
1655 void HELPER(gvec_fmlal_a32
)(void *vd
, void *vn
, void *vm
,
1656 void *venv
, uint32_t desc
)
1658 CPUARMState
*env
= venv
;
1659 do_fmlal(vd
, vn
, vm
, &env
->vfp
.standard_fp_status
, desc
,
1660 get_flush_inputs_to_zero(&env
->vfp
.fp_status_f16
));
1663 void HELPER(gvec_fmlal_a64
)(void *vd
, void *vn
, void *vm
,
1664 void *venv
, uint32_t desc
)
1666 CPUARMState
*env
= venv
;
1667 do_fmlal(vd
, vn
, vm
, &env
->vfp
.fp_status
, desc
,
1668 get_flush_inputs_to_zero(&env
->vfp
.fp_status_f16
));
1671 void HELPER(sve2_fmlal_zzzw_s
)(void *vd
, void *vn
, void *vm
, void *va
,
1672 void *venv
, uint32_t desc
)
1674 intptr_t i
, oprsz
= simd_oprsz(desc
);
1675 uint16_t negn
= extract32(desc
, SIMD_DATA_SHIFT
, 1) << 15;
1676 intptr_t sel
= extract32(desc
, SIMD_DATA_SHIFT
+ 1, 1) * sizeof(float16
);
1677 CPUARMState
*env
= venv
;
1678 float_status
*status
= &env
->vfp
.fp_status
;
1679 bool fz16
= get_flush_inputs_to_zero(&env
->vfp
.fp_status_f16
);
1681 for (i
= 0; i
< oprsz
; i
+= sizeof(float32
)) {
1682 float16 nn_16
= *(float16
*)(vn
+ H1_2(i
+ sel
)) ^ negn
;
1683 float16 mm_16
= *(float16
*)(vm
+ H1_2(i
+ sel
));
1684 float32 nn
= float16_to_float32_by_bits(nn_16
, fz16
);
1685 float32 mm
= float16_to_float32_by_bits(mm_16
, fz16
);
1686 float32 aa
= *(float32
*)(va
+ H1_4(i
));
1688 *(float32
*)(vd
+ H1_4(i
)) = float32_muladd(nn
, mm
, aa
, 0, status
);
1692 static void do_fmlal_idx(float32
*d
, void *vn
, void *vm
, float_status
*fpst
,
1693 uint32_t desc
, bool fz16
)
1695 intptr_t i
, oprsz
= simd_oprsz(desc
);
1696 int is_s
= extract32(desc
, SIMD_DATA_SHIFT
, 1);
1697 int is_2
= extract32(desc
, SIMD_DATA_SHIFT
+ 1, 1);
1698 int index
= extract32(desc
, SIMD_DATA_SHIFT
+ 2, 3);
1699 int is_q
= oprsz
== 16;
1703 /* Pre-load all of the f16 data, avoiding overlap issues. */
1704 n_4
= load4_f16(vn
, is_q
, is_2
);
1706 /* Negate all inputs for FMLSL at once. */
1708 n_4
^= 0x8000800080008000ull
;
1711 m_1
= float16_to_float32_by_bits(((float16
*)vm
)[H2(index
)], fz16
);
1713 for (i
= 0; i
< oprsz
/ 4; i
++) {
1714 float32 n_1
= float16_to_float32_by_bits(n_4
>> (i
* 16), fz16
);
1715 d
[H4(i
)] = float32_muladd(n_1
, m_1
, d
[H4(i
)], 0, fpst
);
1717 clear_tail(d
, oprsz
, simd_maxsz(desc
));
1720 void HELPER(gvec_fmlal_idx_a32
)(void *vd
, void *vn
, void *vm
,
1721 void *venv
, uint32_t desc
)
1723 CPUARMState
*env
= venv
;
1724 do_fmlal_idx(vd
, vn
, vm
, &env
->vfp
.standard_fp_status
, desc
,
1725 get_flush_inputs_to_zero(&env
->vfp
.fp_status_f16
));
1728 void HELPER(gvec_fmlal_idx_a64
)(void *vd
, void *vn
, void *vm
,
1729 void *venv
, uint32_t desc
)
1731 CPUARMState
*env
= venv
;
1732 do_fmlal_idx(vd
, vn
, vm
, &env
->vfp
.fp_status
, desc
,
1733 get_flush_inputs_to_zero(&env
->vfp
.fp_status_f16
));
1736 void HELPER(sve2_fmlal_zzxw_s
)(void *vd
, void *vn
, void *vm
, void *va
,
1737 void *venv
, uint32_t desc
)
1739 intptr_t i
, j
, oprsz
= simd_oprsz(desc
);
1740 uint16_t negn
= extract32(desc
, SIMD_DATA_SHIFT
, 1) << 15;
1741 intptr_t sel
= extract32(desc
, SIMD_DATA_SHIFT
+ 1, 1) * sizeof(float16
);
1742 intptr_t idx
= extract32(desc
, SIMD_DATA_SHIFT
+ 2, 3) * sizeof(float16
);
1743 CPUARMState
*env
= venv
;
1744 float_status
*status
= &env
->vfp
.fp_status
;
1745 bool fz16
= get_flush_inputs_to_zero(&env
->vfp
.fp_status_f16
);
1747 for (i
= 0; i
< oprsz
; i
+= 16) {
1748 float16 mm_16
= *(float16
*)(vm
+ i
+ idx
);
1749 float32 mm
= float16_to_float32_by_bits(mm_16
, fz16
);
1751 for (j
= 0; j
< 16; j
+= sizeof(float32
)) {
1752 float16 nn_16
= *(float16
*)(vn
+ H1_2(i
+ j
+ sel
)) ^ negn
;
1753 float32 nn
= float16_to_float32_by_bits(nn_16
, fz16
);
1754 float32 aa
= *(float32
*)(va
+ H1_4(i
+ j
));
1756 *(float32
*)(vd
+ H1_4(i
+ j
)) =
1757 float32_muladd(nn
, mm
, aa
, 0, status
);
1762 void HELPER(gvec_sshl_b
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
1764 intptr_t i
, opr_sz
= simd_oprsz(desc
);
1765 int8_t *d
= vd
, *n
= vn
, *m
= vm
;
1767 for (i
= 0; i
< opr_sz
; ++i
) {
1776 res
= nn
>> (mm
> -8 ? -mm
: 7);
1780 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
1783 void HELPER(gvec_sshl_h
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
1785 intptr_t i
, opr_sz
= simd_oprsz(desc
);
1786 int16_t *d
= vd
, *n
= vn
, *m
= vm
;
1788 for (i
= 0; i
< opr_sz
/ 2; ++i
) {
1789 int8_t mm
= m
[i
]; /* only 8 bits of shift are significant */
1797 res
= nn
>> (mm
> -16 ? -mm
: 15);
1801 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
1804 void HELPER(gvec_ushl_b
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
1806 intptr_t i
, opr_sz
= simd_oprsz(desc
);
1807 uint8_t *d
= vd
, *n
= vn
, *m
= vm
;
1809 for (i
= 0; i
< opr_sz
; ++i
) {
1824 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
1827 void HELPER(gvec_ushl_h
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
1829 intptr_t i
, opr_sz
= simd_oprsz(desc
);
1830 uint16_t *d
= vd
, *n
= vn
, *m
= vm
;
1832 for (i
= 0; i
< opr_sz
/ 2; ++i
) {
1833 int8_t mm
= m
[i
]; /* only 8 bits of shift are significant */
1847 clear_tail(d
, opr_sz
, simd_maxsz(desc
));
/*
 * 8x8->8 polynomial multiply.
 *
 * Polynomial multiplication is like integer multiplication except the
 * partial products are XORed, not added.
 *
 * TODO: expose this as a generic vector operation, as it is a common
 * crypto building block.
 */
void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t nn = n[i];
        uint64_t mm = m[i];
        uint64_t rr = 0;

        for (j = 0; j < 8; ++j) {
            uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
            rr ^= mm & mask;
            mm = (mm << 1) & 0xfefefefefefefefeull;
            nn >>= 1;
        }
        d[i] = rr;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
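/*
 * Example: 0x03 * 0x03 as polynomials over GF(2) is 0x03 ^ 0x06 = 0x05,
 * since the partial products are combined with XOR; the integer product
 * would be 9.
 */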
/*
 * 64x64->128 polynomial multiply.
 * Because the lanes are not accessed in strict columns,
 * this probably cannot be turned into a generic helper.
 */
void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        uint64_t nn = n[i + hi];
        uint64_t mm = m[i + hi];
        uint64_t rhi = 0;
        uint64_t rlo = 0;

        /* Bit 0 can only influence the low 64-bit result. */
        if (nn & 1) {
            rlo = mm;
        }

        for (j = 1; j < 64; ++j) {
            uint64_t mask = -((nn >> j) & 1);
            rlo ^= (mm << j) & mask;
            rhi ^= (mm >> (64 - j)) & mask;
        }

        d[i] = rlo;
        d[i + 1] = rhi;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
/*
 * 8x8->16 polynomial multiply.
 *
 * The byte inputs are expanded to (or extracted from) half-words.
 * Note that neon and sve2 get the inputs from different positions.
 * This allows 4 bytes to be processed in parallel with uint64_t.
 */

static uint64_t expand_byte_to_half(uint64_t x)
{
    return  (x & 0x000000ff)
         | ((x & 0x0000ff00) << 8)
         | ((x & 0x00ff0000) << 16)
         | ((x & 0xff000000) << 24);
}

static uint64_t pmull_h(uint64_t op1, uint64_t op2)
{
    uint64_t result = 0;
    int i;

    for (i = 0; i < 8; ++i) {
        uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
        result ^= op2 & mask;
        op1 >>= 1;
        op2 <<= 1;
    }
    return result;
}

void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t nn = n[hi], mm = m[hi];

    d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
    nn >>= 32;
    mm >>= 32;
    d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));

    clear_tail(d, 16, simd_maxsz(desc));
}
1958 void HELPER(sve2_pmull_h
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
1960 int shift
= simd_data(desc
) * 8;
1961 intptr_t i
, opr_sz
= simd_oprsz(desc
);
1962 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
1964 for (i
= 0; i
< opr_sz
/ 8; ++i
) {
1965 uint64_t nn
= (n
[i
] >> shift
) & 0x00ff00ff00ff00ffull
;
1966 uint64_t mm
= (m
[i
] >> shift
) & 0x00ff00ff00ff00ffull
;
1968 d
[i
] = pmull_h(nn
, mm
);
1972 static uint64_t pmull_d(uint64_t op1
, uint64_t op2
)
1974 uint64_t result
= 0;
1977 for (i
= 0; i
< 32; ++i
) {
1978 uint64_t mask
= -((op1
>> i
) & 1);
1979 result
^= (op2
<< i
) & mask
;
1984 void HELPER(sve2_pmull_d
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
1986 intptr_t sel
= H4(simd_data(desc
));
1987 intptr_t i
, opr_sz
= simd_oprsz(desc
);
1988 uint32_t *n
= vn
, *m
= vm
;
1991 for (i
= 0; i
< opr_sz
/ 8; ++i
) {
1992 d
[i
] = pmull_d(n
[2 * i
+ sel
], m
[2 * i
+ sel
]);
1997 #define DO_CMP0(NAME, TYPE, OP) \
1998 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2000 intptr_t i, opr_sz = simd_oprsz(desc); \
2001 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
2002 TYPE nn = *(TYPE *)(vn + i); \
2003 *(TYPE *)(vd + i) = -(nn OP 0); \
2005 clear_tail(vd, opr_sz, simd_maxsz(desc)); \
2008 DO_CMP0(gvec_ceq0_b
, int8_t, ==)
2009 DO_CMP0(gvec_clt0_b
, int8_t, <)
2010 DO_CMP0(gvec_cle0_b
, int8_t, <=)
2011 DO_CMP0(gvec_cgt0_b
, int8_t, >)
2012 DO_CMP0(gvec_cge0_b
, int8_t, >=)
2014 DO_CMP0(gvec_ceq0_h
, int16_t, ==)
2015 DO_CMP0(gvec_clt0_h
, int16_t, <)
2016 DO_CMP0(gvec_cle0_h
, int16_t, <=)
2017 DO_CMP0(gvec_cgt0_h
, int16_t, >)
2018 DO_CMP0(gvec_cge0_h
, int16_t, >=)
2022 #define DO_ABD(NAME, TYPE) \
2023 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2025 intptr_t i, opr_sz = simd_oprsz(desc); \
2026 TYPE *d = vd, *n = vn, *m = vm; \
2028 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
2029 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2031 clear_tail(d, opr_sz, simd_maxsz(desc)); \
2034 DO_ABD(gvec_sabd_b
, int8_t)
2035 DO_ABD(gvec_sabd_h
, int16_t)
2036 DO_ABD(gvec_sabd_s
, int32_t)
2037 DO_ABD(gvec_sabd_d
, int64_t)
2039 DO_ABD(gvec_uabd_b
, uint8_t)
2040 DO_ABD(gvec_uabd_h
, uint16_t)
2041 DO_ABD(gvec_uabd_s
, uint32_t)
2042 DO_ABD(gvec_uabd_d
, uint64_t)
2046 #define DO_ABA(NAME, TYPE) \
2047 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2049 intptr_t i, opr_sz = simd_oprsz(desc); \
2050 TYPE *d = vd, *n = vn, *m = vm; \
2052 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
2053 d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2055 clear_tail(d, opr_sz, simd_maxsz(desc)); \
2058 DO_ABA(gvec_saba_b
, int8_t)
2059 DO_ABA(gvec_saba_h
, int16_t)
2060 DO_ABA(gvec_saba_s
, int32_t)
2061 DO_ABA(gvec_saba_d
, int64_t)
2063 DO_ABA(gvec_uaba_b
, uint8_t)
2064 DO_ABA(gvec_uaba_h
, uint16_t)
2065 DO_ABA(gvec_uaba_s
, uint32_t)
2066 DO_ABA(gvec_uaba_d
, uint64_t)
#define DO_NEON_PAIRWISE(NAME, OP)                                      \
    void HELPER(NAME##s)(void *vd, void *vn, void *vm,                  \
                         void *stat, uint32_t oprsz)                    \
    {                                                                   \
        float_status *fpst = stat;                                      \
        float32 *d = vd, *n = vn, *m = vm;                              \
        float32 r0, r1;                                                 \
        /* Read all inputs before writing outputs in case vm == vd */   \
        r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst);                    \
        r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst);                    \
        d[H4(0)] = r0;                                                  \
        d[H4(1)] = r1;                                                  \
    }                                                                   \
                                                                        \
    void HELPER(NAME##h)(void *vd, void *vn, void *vm,                  \
                         void *stat, uint32_t oprsz)                    \
    {                                                                   \
        float_status *fpst = stat;                                      \
        float16 *d = vd, *n = vn, *m = vm;                              \
        float16 r0, r1, r2, r3;                                         \
        /* Read all inputs before writing outputs in case vm == vd */   \
        r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst);                    \
        r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst);                    \
        r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst);                    \
        r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst);                    \
        d[H2(0)] = r0;                                                  \
        d[H2(1)] = r1;                                                  \
        d[H2(2)] = r2;                                                  \
        d[H2(3)] = r3;                                                  \
    }

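/*
 * These helpers cover only the 64-bit Neon case: the result is the
 * pairwise op of the two elements of n followed by the pairwise op of
 * the two elements of m (four pairs for float16).  The oprsz argument
 * is not consulted by the bodies above.
 */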
DO_NEON_PAIRWISE(neon_padd, add)
DO_NEON_PAIRWISE(neon_pmax, max)
DO_NEON_PAIRWISE(neon_pmin, min)

#undef DO_NEON_PAIRWISE

#define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
    {                                                                   \
        intptr_t i, oprsz = simd_oprsz(desc);                           \
        int shift = simd_data(desc);                                    \
        TYPE *d = vd, *n = vn;                                          \
        float_status *fpst = stat;                                      \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
            d[i] = FUNC(n[i], shift, fpst);                             \
        }                                                               \
        clear_tail(d, oprsz, simd_maxsz(desc));                         \
    }

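/*
 * Fixed-point conversions with 'shift' fractional bits: in effect,
 * gvec_vcvt_sf with shift == 4 converts the integer 24 to 24 / 16 = 1.5,
 * while the *_round_to_zero variants convert float to fixed-point,
 * truncating toward zero.
 */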
DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)

#undef DO_VCVT_FIXED

#define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
    {                                                                   \
        float_status *fpst = stat;                                      \
        intptr_t i, oprsz = simd_oprsz(desc);                           \
        uint32_t rmode = simd_data(desc);                               \
        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
        TYPE *d = vd, *n = vn;                                          \
        set_float_rounding_mode(rmode, fpst);                           \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
            d[i] = FUNC(n[i], 0, fpst);                                 \
        }                                                               \
        set_float_rounding_mode(prev_rmode, fpst);                      \
        clear_tail(d, oprsz, simd_maxsz(desc));                         \
    }

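/*
 * Same conversions, but with an explicit rounding mode taken from
 * simd_data(desc); the previous rounding mode is saved and restored
 * around the loop so the guest's FPCR setting is not disturbed.
 */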
DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)

#undef DO_VCVT_RMODE

#define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
    {                                                                   \
        float_status *fpst = stat;                                      \
        intptr_t i, oprsz = simd_oprsz(desc);                           \
        uint32_t rmode = simd_data(desc);                               \
        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
        TYPE *d = vd, *n = vn;                                          \
        set_float_rounding_mode(rmode, fpst);                           \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
            d[i] = FUNC(n[i], fpst);                                    \
        }                                                               \
        set_float_rounding_mode(prev_rmode, fpst);                      \
        clear_tail(d, oprsz, simd_maxsz(desc));                         \
    }

DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)

#undef DO_VRINT_RMODE

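/*
 * AArch64 TBL/TBX: each result byte is a lookup into a "virtual table"
 * of up to four consecutive vector registers, wrapping from V31 to V0.
 * E.g. with rn == 4, an index byte of 0x13 selects byte 3 of V5.
 * Out-of-range indices give 0 for TBL and leave the destination byte
 * unchanged for TBX.
 */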
#ifdef TARGET_AARCH64
void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
{
    const uint8_t *indices = vm;
    CPUARMState *env = venv;
    size_t oprsz = simd_oprsz(desc);
    uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
    bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
    uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
    union {
        uint8_t b[16];
        uint64_t d[2];
    } result;

    /*
     * We must construct the final result in a temp, lest the output
     * overlaps the input table.  For TBL, begin with zero; for TBX,
     * begin with the original register contents.  Note that we always
     * copy 16 bytes here to avoid an extra branch; clearing the high
     * bits of the register for oprsz == 8 is handled below.
     */
    if (is_tbx) {
        memcpy(&result, vd, 16);
    } else {
        memset(&result, 0, 16);
    }

    for (size_t i = 0; i < oprsz; ++i) {
        uint32_t index = indices[H1(i)];

        if (index < table_len) {
            /*
             * Convert index (a byte offset into the virtual table
             * which is a series of 128-bit vectors concatenated)
             * into the correct register element, bearing in mind
             * that the table can wrap around from V31 to V0.
             */
            const uint8_t *table = (const uint8_t *)
                aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
            result.b[H1(i)] = table[H1(index % 16)];
        }
    }

    memcpy(vd, &result, 16);
    clear_tail(vd, oprsz, simd_maxsz(desc));
}
#endif

/*
 * NxN -> N highpart multiply
 *
 * TODO: expose this as a generic vector operation.
 */

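/*
 * E.g. for gvec_smulh_b, 100 * 100 = 10000 = 0x2710, so the result byte
 * is 0x27 (39).  The 64-bit variants use muls64()/mulu64() to form the
 * full 128-bit product and keep only the high half.
 */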
void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ((int32_t)n[i] * m[i]) >> 8;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = ((int32_t)n[i] * m[i]) >> 16;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = ((int64_t)n[i] * m[i]) >> 32;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t discard;

    for (i = 0; i < opr_sz / 8; ++i) {
        muls64(&discard, &d[i], n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ((uint32_t)n[i] * m[i]) >> 8;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = ((uint32_t)n[i] * m[i]) >> 16;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = ((uint64_t)n[i] * m[i]) >> 32;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t discard;

    for (i = 0; i < opr_sz / 8; ++i) {
        mulu64(&discard, &d[i], n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

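/*
 * XAR: exclusive-or the two operands, then rotate each 64-bit lane
 * right by the immediate, e.g. ror64(0x3, 1) == 0x8000000000000001.
 */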
void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ror64(n[i] ^ m[i], shr);
    }
    clear_tail(d, opr_sz * 8, simd_maxsz(desc));
}

/*
 * Integer matrix-multiply accumulate
 */

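/*
 * SMMLA/UMMLA/USMMLA treat each 128-bit segment of n and m as a 2x8
 * matrix of bytes and accumulate the resulting 2x2 matrix product into
 * the four 32-bit elements of the destination.  Each do_*mmla_b() call
 * below is one 8-element dot product (one row of n against one row of m).
 */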
static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
{
    int8_t *n = vn, *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}

static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *n = vn, *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}

static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *n = vn;
    int8_t *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}

static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
                      uint32_t (*inner_loop)(uint32_t, void *, void *))
{
    intptr_t seg, opr_sz = simd_oprsz(desc);

    for (seg = 0; seg < opr_sz; seg += 16) {
        uint32_t *d = vd + seg;
        uint32_t *a = va + seg;
        uint32_t sum0, sum1, sum2, sum3;

        /*
         * Process the entire segment at once, writing back the
         * results only after we've consumed all of the inputs.
         *
         * Key to indices: sum0..sum3 hold the (row, column) results
         * (0,0), (0,1), (1,0), (1,1); each row of n and m is 8 bytes.
         */
        sum0 = a[H4(0 + 0)];
        sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
        sum1 = a[H4(0 + 1)];
        sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
        sum2 = a[H4(2 + 0)];
        sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
        sum3 = a[H4(2 + 1)];
        sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);

        d[H4(0 + 0)] = sum0;
        d[H4(0 + 1)] = sum1;
        d[H4(2 + 0)] = sum2;
        d[H4(2 + 1)] = sum3;
    }
    clear_tail(vd, opr_sz, simd_maxsz(desc));
}

#define DO_MMLA_B(NAME, INNER) \
    void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
    { do_mmla_b(vd, vn, vm, va, desc, INNER); }

DO_MMLA_B(gvec_smmla_b, do_smmla_b)
DO_MMLA_B(gvec_ummla_b, do_ummla_b)
DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)

/*
 * BFloat16 Dot Product
 */

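/*
 * Each 32-bit element of n and m holds a pair of bfloat16 values.
 * A bfloat16 is the high half of a float32, so shifting it left by 16
 * reinterprets it as a float32 of the same value (e.g. 0x3f80 becomes
 * 0x3f800000 == 1.0f).  The fixed float_status below provides the
 * FPCR-independent behaviour required for BFDOT/BFMMLA.
 */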
static float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
{
    /* FPCR is ignored for BFDOT and BFMMLA. */
    float_status bf_status = {
        .tininess_before_rounding = float_tininess_before_rounding,
        .float_rounding_mode = float_round_to_odd_inf,
        .flush_to_zero = true,
        .flush_inputs_to_zero = true,
        .default_nan_mode = true,
    };
    float32 t1, t2;

    /*
     * Extract each BFloat16 from the element pair, and shift
     * them such that they become float32.
     */
    t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
    t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
    t1 = float32_add(t1, t2, &bf_status);
    t1 = float32_add(sum, t1, &bf_status);

    return t1;
}

void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = bfdotadd(a[i], n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

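/*
 * Indexed BFDOT: the same bfloat16 pair from m, selected by the index
 * in simd_data(desc), is reused for every element within each 128-bit
 * (four-element) segment.
 */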
void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
                            void *va, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t index = simd_data(desc);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;

    for (i = 0; i < elements; i += eltspersegment) {
        uint32_t m_idx = m[i + H4(index)];

        for (j = i; j < i + eltspersegment; j++) {
            d[j] = bfdotadd(a[j], n[j], m_idx);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;

    for (s = 0; s < opr_sz / 4; s += 4) {
        float32 sum00, sum01, sum10, sum11;

        /*
         * Process the entire segment at once, writing back the
         * results only after we've consumed all of the inputs.
         *
         * Key to indices: sum<i><j> accumulates row i of n against
         * row j of m, each row being two 32-bit elements (four
         * bfloat16 values).
         */
        sum00 = a[s + H4(0 + 0)];
        sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
        sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);

        sum01 = a[s + H4(0 + 1)];
        sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
        sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);

        sum10 = a[s + H4(2 + 0)];
        sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
        sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);

        sum11 = a[s + H4(2 + 1)];
        sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
        sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);

        d[s + H4(0 + 0)] = sum00;
        d[s + H4(0 + 1)] = sum01;
        d[s + H4(2 + 0)] = sum10;
        d[s + H4(2 + 1)] = sum11;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
                         void *stat, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    intptr_t sel = simd_data(desc);
    float32 *d = vd, *a = va;
    bfloat16 *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        float32 nn = n[H2(i * 2 + sel)] << 16;
        float32 mm = m[H2(i * 2 + sel)] << 16;
        d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

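/*
 * As in gvec_bfmlal above, 'sel' picks the even (BFMLALB) or odd
 * (BFMLALT) bfloat16 of each pair; the indexed form below additionally
 * broadcasts a single element of m across each 128-bit segment.
 */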
void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
                             void *va, void *stat, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);
    float32 *d = vd, *a = va;
    bfloat16 *n = vn, *m = vm;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 m_idx = m[H2(2 * i + index)] << 16;

        for (j = i; j < i + eltspersegment; j++) {
            float32 n_j = n[H2(2 * j + sel)] << 16;
            d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}