target/arm: Implement SVE2 signed saturating doubling multiply high

target/arm/vec_helper.c
/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "vec_internal.h"

/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that needs a host-endian fixup. */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x) ((x) ^ 7)
#define H2(x) ((x) ^ 3)
#define H4(x) ((x) ^ 1)
#else
#define H1(x) (x)
#define H2(x) (x)
#define H4(x) (x)
#endif
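
/*
 * Illustrative note: on a big-endian host, byte 0 of a little-endian
 * 64-bit lane lives at host offset 7, so e.g. H1(0) == 7 and H2(1) == 2;
 * on little-endian hosts the macros are the identity and the fixup
 * disappears entirely.
 */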

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}
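
/*
 * Worked example: with src1 = src2 = INT8_MIN, src3 = 0 and round true,
 * the intermediate is (16384 + 64) >> 7 = 128, which does not fit in
 * int8_t, so the result saturates to INT8_MAX, as the architecture
 * requires for SQRDMULH of the most negative value by itself.
 */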

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}
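
/*
 * Note: unlike the 8-bit helper, the 16- and 32-bit variants report
 * saturation through *sat.  The neon_* helpers below point it at
 * env->vfp.qc[0] so that the sticky QC flag is set, while the sve2_*
 * helpers pass a local "discard" variable and ignore it.
 */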

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}
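
/*
 * Note: a 64x64-bit product needs up to 127 bits, so unlike the
 * narrower helpers there is no wider native type to fall back on.
 * The accumulate, rounding and doubling shift are therefore done
 * with Int128, and do_sat128_d range-checks the final value.
 */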

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the 64-bit lanes.
 * All elements are treated equally, no matter where they are.
 */

void HELPER(gvec_sdot_b)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *a = va;
    int8_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = (a[i] +
                n[i * 4 + 0] * m[i * 4 + 0] +
                n[i * 4 + 1] * m[i * 4 + 1] +
                n[i * 4 + 2] * m[i * 4 + 2] +
                n[i * 4 + 3] * m[i * 4 + 3]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_b)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *a = va;
    uint8_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = (a[i] +
                n[i * 4 + 0] * m[i * 4 + 0] +
                n[i * 4 + 1] * m[i * 4 + 1] +
                n[i * 4 + 2] * m[i * 4 + 2] +
                n[i * 4 + 3] * m[i * 4 + 3]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sdot_h)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *a = va;
    int16_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = (a[i] +
                (int64_t)n[i * 4 + 0] * m[i * 4 + 0] +
                (int64_t)n[i * 4 + 1] * m[i * 4 + 1] +
                (int64_t)n[i * 4 + 2] * m[i * 4 + 2] +
                (int64_t)n[i * 4 + 3] * m[i * 4 + 3]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_h)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *a = va;
    uint16_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = (a[i] +
                (uint64_t)n[i * 4 + 0] * m[i * 4 + 0] +
                (uint64_t)n[i * 4 + 1] * m[i * 4 + 1] +
                (uint64_t)n[i * 4 + 2] * m[i * 4 + 2] +
                (uint64_t)n[i * 4 + 3] * m[i * 4 + 3]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
    intptr_t index = simd_data(desc);
    int32_t *d = vd, *a = va;
    int8_t *n = vn;
    int8_t *m_indexed = (int8_t *)vm + H4(index) * 4;

    /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd.
     * Otherwise opr_sz is a multiple of 16.
     */
    segend = MIN(4, opr_sz_4);
    i = 0;
    do {
        int8_t m0 = m_indexed[i * 4 + 0];
        int8_t m1 = m_indexed[i * 4 + 1];
        int8_t m2 = m_indexed[i * 4 + 2];
        int8_t m3 = m_indexed[i * 4 + 3];

        do {
            d[i] = (a[i] +
                    n[i * 4 + 0] * m0 +
                    n[i * 4 + 1] * m1 +
                    n[i * 4 + 2] * m2 +
                    n[i * 4 + 3] * m3);
        } while (++i < segend);
        segend = i + 4;
    } while (i < opr_sz_4);

    clear_tail(d, opr_sz, simd_maxsz(desc));
}
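
/*
 * Note: the indexed forms reload m0..m3 at the start of each 16-byte
 * segment, so for SVE the selected 32-bit group of m is taken from
 * within each 128-bit segment, while for AdvSIMD (opr_sz of 8 or 16)
 * there is exactly one segment.
 */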

void HELPER(gvec_udot_idx_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
    intptr_t index = simd_data(desc);
    uint32_t *d = vd, *a = va;
    uint8_t *n = vn;
    uint8_t *m_indexed = (uint8_t *)vm + H4(index) * 4;

    /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd.
     * Otherwise opr_sz is a multiple of 16.
     */
    segend = MIN(4, opr_sz_4);
    i = 0;
    do {
        uint8_t m0 = m_indexed[i * 4 + 0];
        uint8_t m1 = m_indexed[i * 4 + 1];
        uint8_t m2 = m_indexed[i * 4 + 2];
        uint8_t m3 = m_indexed[i * 4 + 3];

        do {
            d[i] = (a[i] +
                    n[i * 4 + 0] * m0 +
                    n[i * 4 + 1] * m1 +
                    n[i * 4 + 2] * m2 +
                    n[i * 4 + 3] * m3);
        } while (++i < segend);
        segend = i + 4;
    } while (i < opr_sz_4);

    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
    intptr_t index = simd_data(desc);
    int64_t *d = vd, *a = va;
    int16_t *n = vn;
    int16_t *m_indexed = (int16_t *)vm + index * 4;

    /* This is supported by SVE only, so opr_sz is always a multiple of 16.
     * Process the entire segment all at once, writing back the results
     * only after we've consumed all of the inputs.
     */
    for (i = 0; i < opr_sz_8; i += 2) {
        int64_t d0, d1;

        d0 = a[i + 0];
        d0 += n[i * 4 + 0] * (int64_t)m_indexed[i * 4 + 0];
        d0 += n[i * 4 + 1] * (int64_t)m_indexed[i * 4 + 1];
        d0 += n[i * 4 + 2] * (int64_t)m_indexed[i * 4 + 2];
        d0 += n[i * 4 + 3] * (int64_t)m_indexed[i * 4 + 3];

        d1 = a[i + 1];
        d1 += n[i * 4 + 4] * (int64_t)m_indexed[i * 4 + 0];
        d1 += n[i * 4 + 5] * (int64_t)m_indexed[i * 4 + 1];
        d1 += n[i * 4 + 6] * (int64_t)m_indexed[i * 4 + 2];
        d1 += n[i * 4 + 7] * (int64_t)m_indexed[i * 4 + 3];

        d[i + 0] = d0;
        d[i + 1] = d1;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_idx_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
    intptr_t index = simd_data(desc);
    uint64_t *d = vd, *a = va;
    uint16_t *n = vn;
    uint16_t *m_indexed = (uint16_t *)vm + index * 4;

    /* This is supported by SVE only, so opr_sz is always a multiple of 16.
     * Process the entire segment all at once, writing back the results
     * only after we've consumed all of the inputs.
     */
    for (i = 0; i < opr_sz_8; i += 2) {
        uint64_t d0, d1;

        d0 = a[i + 0];
        d0 += n[i * 4 + 0] * (uint64_t)m_indexed[i * 4 + 0];
        d0 += n[i * 4 + 1] * (uint64_t)m_indexed[i * 4 + 1];
        d0 += n[i * 4 + 2] * (uint64_t)m_indexed[i * 4 + 2];
        d0 += n[i * 4 + 3] * (uint64_t)m_indexed[i * 4 + 3];

        d1 = a[i + 1];
        d1 += n[i * 4 + 4] * (uint64_t)m_indexed[i * 4 + 0];
        d1 += n[i * 4 + 5] * (uint64_t)m_indexed[i * 4 + 1];
        d1 += n[i * 4 + 6] * (uint64_t)m_indexed[i * 4 + 2];
        d1 += n[i * 4 + 7] * (uint64_t)m_indexed[i * 4 + 3];

        d[i + 0] = d0;
        d[i + 1] = d1;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
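
/*
 * Illustrative note: with neg_imag set (and neg_real clear) each pair
 * computes re' = re(n) - im(m) and im' = im(n) + re(m), i.e. n plus m
 * rotated by +90 degrees; with the bits the other way around the signs
 * swap, giving the +270 degree rotation that FCADD also supports.
 */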

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
744 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
745 void *vfpst, uint32_t desc)
747 uintptr_t opr_sz = simd_oprsz(desc);
748 float16 *d = vd, *n = vn, *m = vm, *a = va;
749 float_status *fpst = vfpst;
750 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
751 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
752 uint32_t neg_real = flip ^ neg_imag;
753 uintptr_t i;
755 /* Shift boolean to the sign bit so we can xor to negate. */
756 neg_real <<= 15;
757 neg_imag <<= 15;
759 for (i = 0; i < opr_sz / 2; i += 2) {
760 float16 e2 = n[H2(i + flip)];
761 float16 e1 = m[H2(i + flip)] ^ neg_real;
762 float16 e4 = e2;
763 float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
765 d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
766 d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
768 clear_tail(d, opr_sz, simd_maxsz(desc));
771 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
772 void *vfpst, uint32_t desc)
774 uintptr_t opr_sz = simd_oprsz(desc);
775 float16 *d = vd, *n = vn, *m = vm, *a = va;
776 float_status *fpst = vfpst;
777 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
778 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
779 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
780 uint32_t neg_real = flip ^ neg_imag;
781 intptr_t elements = opr_sz / sizeof(float16);
782 intptr_t eltspersegment = 16 / sizeof(float16);
783 intptr_t i, j;
785 /* Shift boolean to the sign bit so we can xor to negate. */
786 neg_real <<= 15;
787 neg_imag <<= 15;
789 for (i = 0; i < elements; i += eltspersegment) {
790 float16 mr = m[H2(i + 2 * index + 0)];
791 float16 mi = m[H2(i + 2 * index + 1)];
792 float16 e1 = neg_real ^ (flip ? mi : mr);
793 float16 e3 = neg_imag ^ (flip ? mr : mi);
795 for (j = i; j < i + eltspersegment; j += 2) {
796 float16 e2 = n[H2(j + flip)];
797 float16 e4 = e2;
799 d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
800 d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
803 clear_tail(d, opr_sz, simd_maxsz(desc));
806 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
807 void *vfpst, uint32_t desc)
809 uintptr_t opr_sz = simd_oprsz(desc);
810 float32 *d = vd, *n = vn, *m = vm, *a = va;
811 float_status *fpst = vfpst;
812 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
813 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
814 uint32_t neg_real = flip ^ neg_imag;
815 uintptr_t i;
817 /* Shift boolean to the sign bit so we can xor to negate. */
818 neg_real <<= 31;
819 neg_imag <<= 31;
821 for (i = 0; i < opr_sz / 4; i += 2) {
822 float32 e2 = n[H4(i + flip)];
823 float32 e1 = m[H4(i + flip)] ^ neg_real;
824 float32 e4 = e2;
825 float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
827 d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
828 d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
830 clear_tail(d, opr_sz, simd_maxsz(desc));
833 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
834 void *vfpst, uint32_t desc)
836 uintptr_t opr_sz = simd_oprsz(desc);
837 float32 *d = vd, *n = vn, *m = vm, *a = va;
838 float_status *fpst = vfpst;
839 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
840 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
841 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
842 uint32_t neg_real = flip ^ neg_imag;
843 intptr_t elements = opr_sz / sizeof(float32);
844 intptr_t eltspersegment = 16 / sizeof(float32);
845 intptr_t i, j;
847 /* Shift boolean to the sign bit so we can xor to negate. */
848 neg_real <<= 31;
849 neg_imag <<= 31;
851 for (i = 0; i < elements; i += eltspersegment) {
852 float32 mr = m[H4(i + 2 * index + 0)];
853 float32 mi = m[H4(i + 2 * index + 1)];
854 float32 e1 = neg_real ^ (flip ? mi : mr);
855 float32 e3 = neg_imag ^ (flip ? mr : mi);
857 for (j = i; j < i + eltspersegment; j += 2) {
858 float32 e2 = n[H4(j + flip)];
859 float32 e4 = e2;
861 d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
862 d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
865 clear_tail(d, opr_sz, simd_maxsz(desc));
868 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
869 void *vfpst, uint32_t desc)
871 uintptr_t opr_sz = simd_oprsz(desc);
872 float64 *d = vd, *n = vn, *m = vm, *a = va;
873 float_status *fpst = vfpst;
874 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
875 uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
876 uint64_t neg_real = flip ^ neg_imag;
877 uintptr_t i;
879 /* Shift boolean to the sign bit so we can xor to negate. */
880 neg_real <<= 63;
881 neg_imag <<= 63;
883 for (i = 0; i < opr_sz / 8; i += 2) {
884 float64 e2 = n[i + flip];
885 float64 e1 = m[i + flip] ^ neg_real;
886 float64 e4 = e2;
887 float64 e3 = m[i + 1 - flip] ^ neg_imag;
889 d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
890 d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
892 clear_tail(d, opr_sz, simd_maxsz(desc));
896 * Floating point comparisons producing an integer result (all 1s or all 0s).
897 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
898 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
900 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
902 return -float16_eq_quiet(op1, op2, stat);
905 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
907 return -float32_eq_quiet(op1, op2, stat);
910 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
912 return -float16_le(op2, op1, stat);
915 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
917 return -float32_le(op2, op1, stat);
920 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
922 return -float16_lt(op2, op1, stat);
925 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
927 return -float32_lt(op2, op1, stat);
930 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
932 return -float16_le(float16_abs(op2), float16_abs(op1), stat);
935 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
937 return -float32_le(float32_abs(op2), float32_abs(op1), stat);
940 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
942 return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
945 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
947 return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
950 static int16_t vfp_tosszh(float16 x, void *fpstp)
952 float_status *fpst = fpstp;
953 if (float16_is_any_nan(x)) {
954 float_raise(float_flag_invalid, fpst);
955 return 0;
957 return float16_to_int16_round_to_zero(x, fpst);
960 static uint16_t vfp_touszh(float16 x, void *fpstp)
962 float_status *fpst = fpstp;
963 if (float16_is_any_nan(x)) {
964 float_raise(float_flag_invalid, fpst);
965 return 0;
967 return float16_to_uint16_round_to_zero(x, fpst);
970 #define DO_2OP(NAME, FUNC, TYPE) \
971 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
973 intptr_t i, oprsz = simd_oprsz(desc); \
974 TYPE *d = vd, *n = vn; \
975 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
976 d[i] = FUNC(n[i], stat); \
978 clear_tail(d, oprsz, simd_maxsz(desc)); \
981 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
982 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
983 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
985 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
986 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
987 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
989 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
990 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
992 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
993 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
994 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
995 DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
996 DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
997 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
998 DO_2OP(gvec_tosszh, vfp_tosszh, float16)
999 DO_2OP(gvec_touszh, vfp_touszh, float16)
1001 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
1002 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
1004 return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
1007 #define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
1008 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
1010 return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
1013 #define DO_2OP_CMP0(FN, CMPOP, DIRN) \
1014 WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
1015 WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
1016 DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
1017 DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
1019 DO_2OP_CMP0(cgt, cgt, FWD)
1020 DO_2OP_CMP0(cge, cge, FWD)
1021 DO_2OP_CMP0(ceq, ceq, FWD)
1022 DO_2OP_CMP0(clt, cgt, REV)
1023 DO_2OP_CMP0(cle, cge, REV)
1025 #undef DO_2OP
1026 #undef DO_2OP_CMP0
1028 /* Floating-point trigonometric starting value.
1029 * See the ARM ARM pseudocode function FPTrigSMul.
1031 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1033 float16 result = float16_mul(op1, op1, stat);
1034 if (!float16_is_any_nan(result)) {
1035 result = float16_set_sign(result, op2 & 1);
1037 return result;
1040 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1042 float32 result = float32_mul(op1, op1, stat);
1043 if (!float32_is_any_nan(result)) {
1044 result = float32_set_sign(result, op2 & 1);
1046 return result;
1049 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1051 float64 result = float64_mul(op1, op1, stat);
1052 if (!float64_is_any_nan(result)) {
1053 result = float64_set_sign(result, op2 & 1);
1055 return result;
1058 static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1060 return float16_abs(float16_sub(op1, op2, stat));
1063 static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1065 return float32_abs(float32_sub(op1, op2, stat));
1069 * Reciprocal step. These are the AArch32 version which uses a
1070 * non-fused multiply-and-subtract.
1072 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1074 op1 = float16_squash_input_denormal(op1, stat);
1075 op2 = float16_squash_input_denormal(op2, stat);
1077 if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1078 (float16_is_infinity(op2) && float16_is_zero(op1))) {
1079 return float16_two;
1081 return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1084 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1086 op1 = float32_squash_input_denormal(op1, stat);
1087 op2 = float32_squash_input_denormal(op2, stat);
1089 if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1090 (float32_is_infinity(op2) && float32_is_zero(op1))) {
1091 return float32_two;
1093 return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1096 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1097 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1099 op1 = float16_squash_input_denormal(op1, stat);
1100 op2 = float16_squash_input_denormal(op2, stat);
1102 if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1103 (float16_is_infinity(op2) && float16_is_zero(op1))) {
1104 return float16_one_point_five;
1106 op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1107 return float16_div(op1, float16_two, stat);
1110 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1112 op1 = float32_squash_input_denormal(op1, stat);
1113 op2 = float32_squash_input_denormal(op2, stat);
1115 if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1116 (float32_is_infinity(op2) && float32_is_zero(op1))) {
1117 return float32_one_point_five;
1119 op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1120 return float32_div(op1, float32_two, stat);
1123 #define DO_3OP(NAME, FUNC, TYPE) \
1124 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1126 intptr_t i, oprsz = simd_oprsz(desc); \
1127 TYPE *d = vd, *n = vn, *m = vm; \
1128 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1129 d[i] = FUNC(n[i], m[i], stat); \
1131 clear_tail(d, oprsz, simd_maxsz(desc)); \
1134 DO_3OP(gvec_fadd_h, float16_add, float16)
1135 DO_3OP(gvec_fadd_s, float32_add, float32)
1136 DO_3OP(gvec_fadd_d, float64_add, float64)
1138 DO_3OP(gvec_fsub_h, float16_sub, float16)
1139 DO_3OP(gvec_fsub_s, float32_sub, float32)
1140 DO_3OP(gvec_fsub_d, float64_sub, float64)
1142 DO_3OP(gvec_fmul_h, float16_mul, float16)
1143 DO_3OP(gvec_fmul_s, float32_mul, float32)
1144 DO_3OP(gvec_fmul_d, float64_mul, float64)
1146 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1147 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1148 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1150 DO_3OP(gvec_fabd_h, float16_abd, float16)
1151 DO_3OP(gvec_fabd_s, float32_abd, float32)
1153 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1154 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1156 DO_3OP(gvec_fcge_h, float16_cge, float16)
1157 DO_3OP(gvec_fcge_s, float32_cge, float32)
1159 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1160 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1162 DO_3OP(gvec_facge_h, float16_acge, float16)
1163 DO_3OP(gvec_facge_s, float32_acge, float32)
1165 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1166 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1168 DO_3OP(gvec_fmax_h, float16_max, float16)
1169 DO_3OP(gvec_fmax_s, float32_max, float32)
1171 DO_3OP(gvec_fmin_h, float16_min, float16)
1172 DO_3OP(gvec_fmin_s, float32_min, float32)
1174 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1175 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1177 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1178 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1180 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1181 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1183 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1184 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1186 #ifdef TARGET_AARCH64
1188 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1189 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1190 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1192 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1193 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1194 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1196 #endif
1197 #undef DO_3OP
1199 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1200 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1201 float_status *stat)
1203 return float16_add(dest, float16_mul(op1, op2, stat), stat);
1206 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1207 float_status *stat)
1209 return float32_add(dest, float32_mul(op1, op2, stat), stat);
1212 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1213 float_status *stat)
1215 return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1218 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1219 float_status *stat)
1221 return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1224 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1225 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1226 float_status *stat)
1228 return float16_muladd(op1, op2, dest, 0, stat);
1231 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1232 float_status *stat)
1234 return float32_muladd(op1, op2, dest, 0, stat);
1237 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1238 float_status *stat)
1240 return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1243 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1244 float_status *stat)
1246 return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1249 #define DO_MULADD(NAME, FUNC, TYPE) \
1250 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1252 intptr_t i, oprsz = simd_oprsz(desc); \
1253 TYPE *d = vd, *n = vn, *m = vm; \
1254 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1255 d[i] = FUNC(d[i], n[i], m[i], stat); \
1257 clear_tail(d, oprsz, simd_maxsz(desc)); \
1260 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1261 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1263 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1264 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1266 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1267 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1269 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1270 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1272 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1273 * For AdvSIMD, there is of course only one such vector segment.
1276 #define DO_MUL_IDX(NAME, TYPE, H) \
1277 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1279 intptr_t i, j, oprsz = simd_oprsz(desc); \
1280 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1281 intptr_t idx = simd_data(desc); \
1282 TYPE *d = vd, *n = vn, *m = vm; \
1283 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1284 TYPE mm = m[H(i + idx)]; \
1285 for (j = 0; j < segment; j++) { \
1286 d[i + j] = n[i + j] * mm; \
1289 clear_tail(d, oprsz, simd_maxsz(desc)); \
1292 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1293 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1294 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, )
1296 #undef DO_MUL_IDX
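
/*
 * Worked example: for a 32-bit indexed multiply with idx == 1 and an
 * SVE vector of 32 bytes, "segment" is 4, so elements 0..3 are all
 * multiplied by m[1] and elements 4..7 by m[5], i.e. the index is
 * applied within each 128-bit segment as described in the comment above.
 */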
1298 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1299 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1301 intptr_t i, j, oprsz = simd_oprsz(desc); \
1302 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1303 intptr_t idx = simd_data(desc); \
1304 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1305 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1306 TYPE mm = m[H(i + idx)]; \
1307 for (j = 0; j < segment; j++) { \
1308 d[i + j] = a[i + j] OP n[i + j] * mm; \
1311 clear_tail(d, oprsz, simd_maxsz(desc)); \
1314 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1315 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1316 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, )
1318 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1319 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1320 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, )
1322 #undef DO_MLA_IDX
1324 #define DO_FMUL_IDX(NAME, ADD, TYPE, H) \
1325 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1327 intptr_t i, j, oprsz = simd_oprsz(desc); \
1328 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1329 intptr_t idx = simd_data(desc); \
1330 TYPE *d = vd, *n = vn, *m = vm; \
1331 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1332 TYPE mm = m[H(i + idx)]; \
1333 for (j = 0; j < segment; j++) { \
1334 d[i + j] = TYPE##_##ADD(d[i + j], \
1335 TYPE##_mul(n[i + j], mm, stat), stat); \
1338 clear_tail(d, oprsz, simd_maxsz(desc)); \
1341 #define float16_nop(N, M, S) (M)
1342 #define float32_nop(N, M, S) (M)
1343 #define float64_nop(N, M, S) (M)
1345 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2)
1346 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4)
1347 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, )
1350 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1351 * the fused ops below they assume accumulate both from and into Vd.
1353 DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2)
1354 DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4)
1355 DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2)
1356 DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4)
1358 #undef float16_nop
1359 #undef float32_nop
1360 #undef float64_nop
1361 #undef DO_FMUL_IDX
1363 #define DO_FMLA_IDX(NAME, TYPE, H) \
1364 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
1365 void *stat, uint32_t desc) \
1367 intptr_t i, j, oprsz = simd_oprsz(desc); \
1368 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1369 TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \
1370 intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \
1371 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1372 op1_neg <<= (8 * sizeof(TYPE) - 1); \
1373 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1374 TYPE mm = m[H(i + idx)]; \
1375 for (j = 0; j < segment; j++) { \
1376 d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \
1377 mm, a[i + j], 0, stat); \
1380 clear_tail(d, oprsz, simd_maxsz(desc)); \
1383 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1384 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1385 DO_FMLA_IDX(gvec_fmla_idx_d, float64, )
1387 #undef DO_FMLA_IDX
1389 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1390 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \
1392 intptr_t i, oprsz = simd_oprsz(desc); \
1393 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \
1394 bool q = false; \
1395 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \
1396 WTYPE dd = (WTYPE)n[i] OP m[i]; \
1397 if (dd < MIN) { \
1398 dd = MIN; \
1399 q = true; \
1400 } else if (dd > MAX) { \
1401 dd = MAX; \
1402 q = true; \
1404 d[i] = dd; \
1406 if (q) { \
1407 uint32_t *qc = vq; \
1408 qc[0] = 1; \
1410 clear_tail(d, oprsz, simd_maxsz(desc)); \
1413 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1414 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1415 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1417 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1418 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1419 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1421 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1422 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1423 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1425 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1426 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1427 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1429 #undef DO_SAT
1431 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1432 void *vm, uint32_t desc)
1434 intptr_t i, oprsz = simd_oprsz(desc);
1435 uint64_t *d = vd, *n = vn, *m = vm;
1436 bool q = false;
1438 for (i = 0; i < oprsz / 8; i++) {
1439 uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1440 if (dd < nn) {
1441 dd = UINT64_MAX;
1442 q = true;
1444 d[i] = dd;
1446 if (q) {
1447 uint32_t *qc = vq;
1448 qc[0] = 1;
1450 clear_tail(d, oprsz, simd_maxsz(desc));
1453 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1454 void *vm, uint32_t desc)
1456 intptr_t i, oprsz = simd_oprsz(desc);
1457 uint64_t *d = vd, *n = vn, *m = vm;
1458 bool q = false;
1460 for (i = 0; i < oprsz / 8; i++) {
1461 uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1462 if (nn < mm) {
1463 dd = 0;
1464 q = true;
1466 d[i] = dd;
1468 if (q) {
1469 uint32_t *qc = vq;
1470 qc[0] = 1;
1472 clear_tail(d, oprsz, simd_maxsz(desc));
1475 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1476 void *vm, uint32_t desc)
1478 intptr_t i, oprsz = simd_oprsz(desc);
1479 int64_t *d = vd, *n = vn, *m = vm;
1480 bool q = false;
1482 for (i = 0; i < oprsz / 8; i++) {
1483 int64_t nn = n[i], mm = m[i], dd = nn + mm;
1484 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1485 dd = (nn >> 63) ^ ~INT64_MIN;
1486 q = true;
1488 d[i] = dd;
1490 if (q) {
1491 uint32_t *qc = vq;
1492 qc[0] = 1;
1494 clear_tail(d, oprsz, simd_maxsz(desc));
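
/*
 * Note: signed overflow of nn + mm has occurred iff the operands have
 * the same sign and the result's sign differs, which is what
 * ((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN tests; (nn >> 63) ^ ~INT64_MIN
 * then yields INT64_MAX for positive overflow and INT64_MIN for
 * negative overflow.  For subtraction below, the test drops the
 * negation, since overflow there requires operands of differing sign.
 */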
1497 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1498 void *vm, uint32_t desc)
1500 intptr_t i, oprsz = simd_oprsz(desc);
1501 int64_t *d = vd, *n = vn, *m = vm;
1502 bool q = false;
1504 for (i = 0; i < oprsz / 8; i++) {
1505 int64_t nn = n[i], mm = m[i], dd = nn - mm;
1506 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1507 dd = (nn >> 63) ^ ~INT64_MIN;
1508 q = true;
1510 d[i] = dd;
1512 if (q) {
1513 uint32_t *qc = vq;
1514 qc[0] = 1;
1516 clear_tail(d, oprsz, simd_maxsz(desc));
1520 #define DO_SRA(NAME, TYPE) \
1521 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1523 intptr_t i, oprsz = simd_oprsz(desc); \
1524 int shift = simd_data(desc); \
1525 TYPE *d = vd, *n = vn; \
1526 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1527 d[i] += n[i] >> shift; \
1529 clear_tail(d, oprsz, simd_maxsz(desc)); \
1532 DO_SRA(gvec_ssra_b, int8_t)
1533 DO_SRA(gvec_ssra_h, int16_t)
1534 DO_SRA(gvec_ssra_s, int32_t)
1535 DO_SRA(gvec_ssra_d, int64_t)
1537 DO_SRA(gvec_usra_b, uint8_t)
1538 DO_SRA(gvec_usra_h, uint16_t)
1539 DO_SRA(gvec_usra_s, uint32_t)
1540 DO_SRA(gvec_usra_d, uint64_t)
1542 #undef DO_SRA
1544 #define DO_RSHR(NAME, TYPE) \
1545 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1547 intptr_t i, oprsz = simd_oprsz(desc); \
1548 int shift = simd_data(desc); \
1549 TYPE *d = vd, *n = vn; \
1550 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1551 TYPE tmp = n[i] >> (shift - 1); \
1552 d[i] = (tmp >> 1) + (tmp & 1); \
1554 clear_tail(d, oprsz, simd_maxsz(desc)); \
1557 DO_RSHR(gvec_srshr_b, int8_t)
1558 DO_RSHR(gvec_srshr_h, int16_t)
1559 DO_RSHR(gvec_srshr_s, int32_t)
1560 DO_RSHR(gvec_srshr_d, int64_t)
1562 DO_RSHR(gvec_urshr_b, uint8_t)
1563 DO_RSHR(gvec_urshr_h, uint16_t)
1564 DO_RSHR(gvec_urshr_s, uint32_t)
1565 DO_RSHR(gvec_urshr_d, uint64_t)
1567 #undef DO_RSHR
1569 #define DO_RSRA(NAME, TYPE) \
1570 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1572 intptr_t i, oprsz = simd_oprsz(desc); \
1573 int shift = simd_data(desc); \
1574 TYPE *d = vd, *n = vn; \
1575 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1576 TYPE tmp = n[i] >> (shift - 1); \
1577 d[i] += (tmp >> 1) + (tmp & 1); \
1579 clear_tail(d, oprsz, simd_maxsz(desc)); \
1582 DO_RSRA(gvec_srsra_b, int8_t)
1583 DO_RSRA(gvec_srsra_h, int16_t)
1584 DO_RSRA(gvec_srsra_s, int32_t)
1585 DO_RSRA(gvec_srsra_d, int64_t)
1587 DO_RSRA(gvec_ursra_b, uint8_t)
1588 DO_RSRA(gvec_ursra_h, uint16_t)
1589 DO_RSRA(gvec_ursra_s, uint32_t)
1590 DO_RSRA(gvec_ursra_d, uint64_t)
1592 #undef DO_RSRA
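
/*
 * Note: the rounding shift above is computed as
 * tmp = n >> (shift - 1); (tmp >> 1) + (tmp & 1),
 * i.e. shift right by 'shift' and add back the last bit shifted out.
 * This is equivalent to adding 1 << (shift - 1) before the shift but
 * cannot overflow the element type.
 */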
1594 #define DO_SRI(NAME, TYPE) \
1595 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1597 intptr_t i, oprsz = simd_oprsz(desc); \
1598 int shift = simd_data(desc); \
1599 TYPE *d = vd, *n = vn; \
1600 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1601 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1603 clear_tail(d, oprsz, simd_maxsz(desc)); \
1606 DO_SRI(gvec_sri_b, uint8_t)
1607 DO_SRI(gvec_sri_h, uint16_t)
1608 DO_SRI(gvec_sri_s, uint32_t)
1609 DO_SRI(gvec_sri_d, uint64_t)
1611 #undef DO_SRI
1613 #define DO_SLI(NAME, TYPE) \
1614 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1616 intptr_t i, oprsz = simd_oprsz(desc); \
1617 int shift = simd_data(desc); \
1618 TYPE *d = vd, *n = vn; \
1619 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1620 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1622 clear_tail(d, oprsz, simd_maxsz(desc)); \
1625 DO_SLI(gvec_sli_b, uint8_t)
1626 DO_SLI(gvec_sli_h, uint16_t)
1627 DO_SLI(gvec_sli_s, uint32_t)
1628 DO_SLI(gvec_sli_d, uint64_t)
1630 #undef DO_SLI
1633 * Convert float16 to float32, raising no exceptions and
1634 * preserving exceptional values, including SNaN.
1635 * This is effectively an unpack+repack operation.
1637 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1639 const int f16_bias = 15;
1640 const int f32_bias = 127;
1641 uint32_t sign = extract32(f16, 15, 1);
1642 uint32_t exp = extract32(f16, 10, 5);
1643 uint32_t frac = extract32(f16, 0, 10);
1645 if (exp == 0x1f) {
1646 /* Inf or NaN */
1647 exp = 0xff;
1648 } else if (exp == 0) {
1649 /* Zero or denormal. */
1650 if (frac != 0) {
1651 if (fz16) {
1652 frac = 0;
1653 } else {
1655 * Denormal; these are all normal float32.
1656 * Shift the fraction so that the msb is at bit 11,
1657 * then remove bit 11 as the implicit bit of the
1658 * normalized float32. Note that we still go through
1659 * the shift for normal numbers below, to put the
1660 * float32 fraction at the right place.
1662 int shift = clz32(frac) - 21;
1663 frac = (frac << shift) & 0x3ff;
1664 exp = f32_bias - f16_bias - shift + 1;
1667 } else {
1668 /* Normal number; adjust the bias. */
1669 exp += f32_bias - f16_bias;
1671 sign <<= 31;
1672 exp <<= 23;
1673 frac <<= 23 - 10;
1675 return sign | exp | frac;
1678 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
1681 * Branchless load of u32[0], u64[0], u32[1], or u64[1].
1682 * Load the 2nd qword iff is_q & is_2.
1683 * Shift to the 2nd dword iff !is_q & is_2.
1684 * For !is_q & !is_2, the upper bits of the result are garbage.
1686 return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
1690 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
1691 * as there is not yet SVE versions that might use blocking.
1694 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
1695 uint32_t desc, bool fz16)
1697 intptr_t i, oprsz = simd_oprsz(desc);
1698 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1699 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1700 int is_q = oprsz == 16;
1701 uint64_t n_4, m_4;
1703 /* Pre-load all of the f16 data, avoiding overlap issues. */
1704 n_4 = load4_f16(vn, is_q, is_2);
1705 m_4 = load4_f16(vm, is_q, is_2);
1707 /* Negate all inputs for FMLSL at once. */
1708 if (is_s) {
1709 n_4 ^= 0x8000800080008000ull;
1712 for (i = 0; i < oprsz / 4; i++) {
1713 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1714 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
1715 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1717 clear_tail(d, oprsz, simd_maxsz(desc));
1720 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
1721 void *venv, uint32_t desc)
1723 CPUARMState *env = venv;
1724 do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1725 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1728 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
1729 void *venv, uint32_t desc)
1731 CPUARMState *env = venv;
1732 do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
1733 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1736 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
1737 uint32_t desc, bool fz16)
1739 intptr_t i, oprsz = simd_oprsz(desc);
1740 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1741 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1742 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
1743 int is_q = oprsz == 16;
1744 uint64_t n_4;
1745 float32 m_1;
1747 /* Pre-load all of the f16 data, avoiding overlap issues. */
1748 n_4 = load4_f16(vn, is_q, is_2);
1750 /* Negate all inputs for FMLSL at once. */
1751 if (is_s) {
1752 n_4 ^= 0x8000800080008000ull;
1755 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
1757 for (i = 0; i < oprsz / 4; i++) {
1758 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1759 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1761 clear_tail(d, oprsz, simd_maxsz(desc));
1764 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
1765 void *venv, uint32_t desc)
1767 CPUARMState *env = venv;
1768 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1769 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1772 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
1773 void *venv, uint32_t desc)
1775 CPUARMState *env = venv;
1776 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
1777 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1780 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1782 intptr_t i, opr_sz = simd_oprsz(desc);
1783 int8_t *d = vd, *n = vn, *m = vm;
1785 for (i = 0; i < opr_sz; ++i) {
1786 int8_t mm = m[i];
1787 int8_t nn = n[i];
1788 int8_t res = 0;
1789 if (mm >= 0) {
1790 if (mm < 8) {
1791 res = nn << mm;
1793 } else {
1794 res = nn >> (mm > -8 ? -mm : 7);
1796 d[i] = res;
1798 clear_tail(d, opr_sz, simd_maxsz(desc));
1801 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1803 intptr_t i, opr_sz = simd_oprsz(desc);
1804 int16_t *d = vd, *n = vn, *m = vm;
1806 for (i = 0; i < opr_sz / 2; ++i) {
1807 int8_t mm = m[i]; /* only 8 bits of shift are significant */
1808 int16_t nn = n[i];
1809 int16_t res = 0;
1810 if (mm >= 0) {
1811 if (mm < 16) {
1812 res = nn << mm;
1814 } else {
1815 res = nn >> (mm > -16 ? -mm : 15);
1817 d[i] = res;
1819 clear_tail(d, opr_sz, simd_maxsz(desc));
1822 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1824 intptr_t i, opr_sz = simd_oprsz(desc);
1825 uint8_t *d = vd, *n = vn, *m = vm;
1827 for (i = 0; i < opr_sz; ++i) {
1828 int8_t mm = m[i];
1829 uint8_t nn = n[i];
1830 uint8_t res = 0;
1831 if (mm >= 0) {
1832 if (mm < 8) {
1833 res = nn << mm;
1835 } else {
1836 if (mm > -8) {
1837 res = nn >> -mm;
1840 d[i] = res;
1842 clear_tail(d, opr_sz, simd_maxsz(desc));
1845 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1847 intptr_t i, opr_sz = simd_oprsz(desc);
1848 uint16_t *d = vd, *n = vn, *m = vm;
1850 for (i = 0; i < opr_sz / 2; ++i) {
1851 int8_t mm = m[i]; /* only 8 bits of shift are significant */
1852 uint16_t nn = n[i];
1853 uint16_t res = 0;
1854 if (mm >= 0) {
1855 if (mm < 16) {
1856 res = nn << mm;
1858 } else {
1859 if (mm > -16) {
1860 res = nn >> -mm;
1863 d[i] = res;
1865 clear_tail(d, opr_sz, simd_maxsz(desc));
1869 * 8x8->8 polynomial multiply.
1871 * Polynomial multiplication is like integer multiplication except the
1872 * partial products are XORed, not added.
1874 * TODO: expose this as a generic vector operation, as it is a common
1875 * crypto building block.
1877 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
1879 intptr_t i, j, opr_sz = simd_oprsz(desc);
1880 uint64_t *d = vd, *n = vn, *m = vm;
1882 for (i = 0; i < opr_sz / 8; ++i) {
1883 uint64_t nn = n[i];
1884 uint64_t mm = m[i];
1885 uint64_t rr = 0;
1887 for (j = 0; j < 8; ++j) {
1888 uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
1889 rr ^= mm & mask;
1890 mm = (mm << 1) & 0xfefefefefefefefeull;
1891 nn >>= 1;
1893 d[i] = rr;
1895 clear_tail(d, opr_sz, simd_maxsz(desc));
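
/*
 * Worked example: as a carry-less multiply, 0x03 * 0x03 here is
 * (x + 1) * (x + 1) = x^2 + 1 = 0x05, whereas integer multiplication
 * would give 9; the partial products are combined with XOR rather
 * than addition.
 */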
1899 * 64x64->128 polynomial multiply.
1900 * Because of the lanes are not accessed in strict columns,
1901 * this probably cannot be turned into a generic helper.
1903 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
1905 intptr_t i, j, opr_sz = simd_oprsz(desc);
1906 intptr_t hi = simd_data(desc);
1907 uint64_t *d = vd, *n = vn, *m = vm;
1909 for (i = 0; i < opr_sz / 8; i += 2) {
1910 uint64_t nn = n[i + hi];
1911 uint64_t mm = m[i + hi];
1912 uint64_t rhi = 0;
1913 uint64_t rlo = 0;
1915 /* Bit 0 can only influence the low 64-bit result. */
1916 if (nn & 1) {
1917 rlo = mm;
1920 for (j = 1; j < 64; ++j) {
1921 uint64_t mask = -((nn >> j) & 1);
1922 rlo ^= (mm << j) & mask;
1923 rhi ^= (mm >> (64 - j)) & mask;
1925 d[i] = rlo;
1926 d[i + 1] = rhi;
1928 clear_tail(d, opr_sz, simd_maxsz(desc));
1932 * 8x8->16 polynomial multiply.
1934 * The byte inputs are expanded to (or extracted from) half-words.
1935 * Note that neon and sve2 get the inputs from different positions.
1936 * This allows 4 bytes to be processed in parallel with uint64_t.
1939 static uint64_t expand_byte_to_half(uint64_t x)
1941 return (x & 0x000000ff)
1942 | ((x & 0x0000ff00) << 8)
1943 | ((x & 0x00ff0000) << 16)
1944 | ((x & 0xff000000) << 24);
1947 static uint64_t pmull_h(uint64_t op1, uint64_t op2)
1949 uint64_t result = 0;
1950 int i;
1952 for (i = 0; i < 8; ++i) {
1953 uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
1954 result ^= op2 & mask;
1955 op1 >>= 1;
1956 op2 <<= 1;
1958 return result;
1961 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
1963 int hi = simd_data(desc);
1964 uint64_t *d = vd, *n = vn, *m = vm;
1965 uint64_t nn = n[hi], mm = m[hi];
1967 d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
1968 nn >>= 32;
1969 mm >>= 32;
1970 d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
1972 clear_tail(d, 16, simd_maxsz(desc));
1975 #ifdef TARGET_AARCH64
1976 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
1978 int shift = simd_data(desc) * 8;
1979 intptr_t i, opr_sz = simd_oprsz(desc);
1980 uint64_t *d = vd, *n = vn, *m = vm;
1982 for (i = 0; i < opr_sz / 8; ++i) {
1983 uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
1984 uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
1986 d[i] = pmull_h(nn, mm);
1990 static uint64_t pmull_d(uint64_t op1, uint64_t op2)
1992 uint64_t result = 0;
1993 int i;
1995 for (i = 0; i < 32; ++i) {
1996 uint64_t mask = -((op1 >> i) & 1);
1997 result ^= (op2 << i) & mask;
1999 return result;

void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t sel = H4(simd_data(desc));
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *n = vn, *m = vm;
    uint64_t *d = vd;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]);
    }
}
#endif

#define DO_CMP0(NAME, TYPE, OP)                         \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);              \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
        TYPE nn = *(TYPE *)(vn + i);                    \
        *(TYPE *)(vd + i) = -(nn OP 0);                 \
    }                                                   \
    clear_tail(vd, opr_sz, simd_maxsz(desc));           \
}

DO_CMP0(gvec_ceq0_b, int8_t, ==)
DO_CMP0(gvec_clt0_b, int8_t, <)
DO_CMP0(gvec_cle0_b, int8_t, <=)
DO_CMP0(gvec_cgt0_b, int8_t, >)
DO_CMP0(gvec_cge0_b, int8_t, >=)

DO_CMP0(gvec_ceq0_h, int16_t, ==)
DO_CMP0(gvec_clt0_h, int16_t, <)
DO_CMP0(gvec_cle0_h, int16_t, <=)
DO_CMP0(gvec_cgt0_h, int16_t, >)
DO_CMP0(gvec_cge0_h, int16_t, >=)

#undef DO_CMP0
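
/*
 * Illustrative sketch (editor's addition): what one DO_CMP0 expansion
 * computes per element.  The comparison yields 0 or 1, and negating it
 * produces the all-zeros/all-ones mask that the compare-against-zero
 * instructions require (e.g. -(int8_t)1 == (int8_t)0xff).
 * ceq0_byte_example is a hypothetical name.
 */
static inline int8_t ceq0_byte_example(int8_t nn)
{
    return -(nn == 0);      /* 0x00 if nonzero, 0xff if zero */
}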

#define DO_ABD(NAME, TYPE)                                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE *d = vd, *n = vn, *m = vm;                             \
                                                                \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
        d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

DO_ABD(gvec_sabd_b, int8_t)
DO_ABD(gvec_sabd_h, int16_t)
DO_ABD(gvec_sabd_s, int32_t)
DO_ABD(gvec_sabd_d, int64_t)

DO_ABD(gvec_uabd_b, uint8_t)
DO_ABD(gvec_uabd_h, uint16_t)
DO_ABD(gvec_uabd_s, uint32_t)
DO_ABD(gvec_uabd_d, uint64_t)

#undef DO_ABD

#define DO_ABA(NAME, TYPE)                                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE *d = vd, *n = vn, *m = vm;                             \
                                                                \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
        d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

DO_ABA(gvec_saba_b, int8_t)
DO_ABA(gvec_saba_h, int16_t)
DO_ABA(gvec_saba_s, int32_t)
DO_ABA(gvec_saba_d, int64_t)

DO_ABA(gvec_uaba_b, uint8_t)
DO_ABA(gvec_uaba_h, uint16_t)
DO_ABA(gvec_uaba_s, uint32_t)
DO_ABA(gvec_uaba_d, uint64_t)

#undef DO_ABA
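
/*
 * Illustrative sketch (editor's addition): the relationship between the
 * two macros above, for a single unsigned byte.  DO_ABD stores the
 * absolute difference; DO_ABA adds it to the existing destination
 * element, which is what the accumulating forms require.  The *_example
 * names are hypothetical.
 */
static inline uint8_t uabd_byte_example(uint8_t n, uint8_t m)
{
    return n < m ? m - n : n - m;
}

static inline uint8_t uaba_byte_example(uint8_t d, uint8_t n, uint8_t m)
{
    return d + uabd_byte_example(n, m);     /* accumulate, wrapping mod 256 */
}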

#define DO_NEON_PAIRWISE(NAME, OP)                                      \
    void HELPER(NAME##s)(void *vd, void *vn, void *vm,                  \
                         void *stat, uint32_t oprsz)                    \
    {                                                                   \
        float_status *fpst = stat;                                      \
        float32 *d = vd;                                                \
        float32 *n = vn;                                                \
        float32 *m = vm;                                                \
        float32 r0, r1;                                                 \
                                                                        \
        /* Read all inputs before writing outputs in case vm == vd */   \
        r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst);                    \
        r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst);                    \
                                                                        \
        d[H4(0)] = r0;                                                  \
        d[H4(1)] = r1;                                                  \
    }                                                                   \
                                                                        \
    void HELPER(NAME##h)(void *vd, void *vn, void *vm,                  \
                         void *stat, uint32_t oprsz)                    \
    {                                                                   \
        float_status *fpst = stat;                                      \
        float16 *d = vd;                                                \
        float16 *n = vn;                                                \
        float16 *m = vm;                                                \
        float16 r0, r1, r2, r3;                                         \
                                                                        \
        /* Read all inputs before writing outputs in case vm == vd */   \
        r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst);                    \
        r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst);                    \
        r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst);                    \
        r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst);                    \
                                                                        \
        d[H2(0)] = r0;                                                  \
        d[H2(1)] = r1;                                                  \
        d[H2(2)] = r2;                                                  \
        d[H2(3)] = r3;                                                  \
    }

DO_NEON_PAIRWISE(neon_padd, add)
DO_NEON_PAIRWISE(neon_pmax, max)
DO_NEON_PAIRWISE(neon_pmin, min)

#undef DO_NEON_PAIRWISE
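
/*
 * Illustrative sketch (editor's addition): the output layout produced by
 * the single-precision case above, with plain host additions standing in
 * for the softfloat calls.  The first result element pairs up the
 * elements of the first source; the second pairs up the elements of the
 * second source.  padd_layout_example is a hypothetical name.
 */
static inline void padd_layout_example(float *d, const float *n,
                                       const float *m)
{
    float r0 = n[0] + n[1];     /* pair from the first source */
    float r1 = m[0] + m[1];     /* pair from the second source */

    /* Write only after both reads, in case d aliases n or m. */
    d[0] = r0;
    d[1] = r1;
}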

#define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
    {                                                                   \
        intptr_t i, oprsz = simd_oprsz(desc);                           \
        int shift = simd_data(desc);                                    \
        TYPE *d = vd, *n = vn;                                          \
        float_status *fpst = stat;                                      \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
            d[i] = FUNC(n[i], shift, fpst);                             \
        }                                                               \
        clear_tail(d, oprsz, simd_maxsz(desc));                         \
    }

DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)

#undef DO_VCVT_FIXED

#define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
    {                                                                   \
        float_status *fpst = stat;                                      \
        intptr_t i, oprsz = simd_oprsz(desc);                           \
        uint32_t rmode = simd_data(desc);                               \
        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
        TYPE *d = vd, *n = vn;                                          \
        set_float_rounding_mode(rmode, fpst);                           \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
            d[i] = FUNC(n[i], 0, fpst);                                 \
        }                                                               \
        set_float_rounding_mode(prev_rmode, fpst);                      \
        clear_tail(d, oprsz, simd_maxsz(desc));                         \
    }

DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)

#undef DO_VCVT_RMODE

#define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
    {                                                                   \
        float_status *fpst = stat;                                      \
        intptr_t i, oprsz = simd_oprsz(desc);                           \
        uint32_t rmode = simd_data(desc);                               \
        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
        TYPE *d = vd, *n = vn;                                          \
        set_float_rounding_mode(rmode, fpst);                           \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
            d[i] = FUNC(n[i], fpst);                                    \
        }                                                               \
        set_float_rounding_mode(prev_rmode, fpst);                      \
        clear_tail(d, oprsz, simd_maxsz(desc));                         \
    }

DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)

#undef DO_VRINT_RMODE
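
/*
 * Illustrative sketch (editor's addition): the save/set/restore pattern
 * shared by DO_VCVT_RMODE and DO_VRINT_RMODE above, reduced to a single
 * scalar operation.  The rounding mode taken from the descriptor is only
 * in force for the duration of the helper; the guest-selected mode in
 * float_status is put back afterwards.  with_rmode_example is a
 * hypothetical name.
 */
static inline float32 with_rmode_example(float32 x, uint32_t rmode,
                                         float_status *fpst)
{
    uint32_t prev_rmode = get_float_rounding_mode(fpst);
    float32 r;

    set_float_rounding_mode(rmode, fpst);
    r = float32_round_to_int(x, fpst);
    set_float_rounding_mode(prev_rmode, fpst);
    return r;
}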

#ifdef TARGET_AARCH64
void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
{
    const uint8_t *indices = vm;
    CPUARMState *env = venv;
    size_t oprsz = simd_oprsz(desc);
    uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
    bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
    uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
    union {
        uint8_t b[16];
        uint64_t d[2];
    } result;

    /*
     * We must construct the final result in a temp, lest the output
     * overlaps the input table.  For TBL, begin with zero; for TBX,
     * begin with the original register contents.  Note that we always
     * copy 16 bytes here to avoid an extra branch; clearing the high
     * bits of the register for oprsz == 8 is handled below.
     */
    if (is_tbx) {
        memcpy(&result, vd, 16);
    } else {
        memset(&result, 0, 16);
    }

    for (size_t i = 0; i < oprsz; ++i) {
        uint32_t index = indices[H1(i)];

        if (index < table_len) {
            /*
             * Convert index (a byte offset into the virtual table
             * which is a series of 128-bit vectors concatenated)
             * into the correct register element, bearing in mind
             * that the table can wrap around from V31 to V0.
             */
            const uint8_t *table = (const uint8_t *)
                aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
            result.b[H1(i)] = table[H1(index % 16)];
        }
    }

    memcpy(vd, &result, 16);
    clear_tail(vd, oprsz, simd_maxsz(desc));
}
#endif
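
/*
 * Illustrative sketch (editor's addition): how the TBL/TBX descriptor
 * used above is unpacked.  Bits [4:0] of the data field select the first
 * table register, bit 5 distinguishes TBX from TBL, and the remaining
 * data bits carry the table length used for the out-of-range check.
 * decode_tblx_example is a hypothetical name; the real packing is done
 * by the translator when it builds desc.
 */
static inline void decode_tblx_example(uint32_t desc, uint32_t *rn,
                                       bool *is_tbx, uint32_t *table_len)
{
    *rn = extract32(desc, SIMD_DATA_SHIFT, 5);
    *is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
    *table_len = desc >> (SIMD_DATA_SHIFT + 6);
}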

/*
 * NxN -> N highpart multiply
 *
 * TODO: expose this as a generic vector operation.
 */

void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ((int32_t)n[i] * m[i]) >> 8;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = ((int32_t)n[i] * m[i]) >> 16;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = ((int64_t)n[i] * m[i]) >> 32;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t discard;

    for (i = 0; i < opr_sz / 8; ++i) {
        muls64(&discard, &d[i], n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ((uint32_t)n[i] * m[i]) >> 8;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = ((uint32_t)n[i] * m[i]) >> 16;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = ((uint64_t)n[i] * m[i]) >> 32;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t discard;

    for (i = 0; i < opr_sz / 8; ++i) {
        mulu64(&discard, &d[i], n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
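
/*
 * Illustrative sketch (editor's addition): a single-element version of
 * the 64-bit unsigned case above.  Unlike the narrower element sizes,
 * the 128-bit product does not fit in a standard C integer type, so the
 * host-utils double-width multiply is used and only the high half kept.
 * umulh64_example is a hypothetical name.
 */
static inline uint64_t umulh64_example(uint64_t a, uint64_t b)
{
    uint64_t hi, lo;

    mulu64(&lo, &hi, a, b);     /* lo is discarded, exactly as above */
    return hi;
}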

void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ror64(n[i] ^ m[i], shr);
    }
    clear_tail(d, opr_sz * 8, simd_maxsz(desc));
}
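
/*
 * Illustrative sketch (editor's addition): one element of the XAR
 * operation above, with the rotate written out.  XAR xors the two
 * sources and rotates the 64-bit result right by the immediate; a
 * rotation of zero leaves the xor unchanged.  xar64_example is a
 * hypothetical name.
 */
static inline uint64_t xar64_example(uint64_t n, uint64_t m, unsigned shr)
{
    uint64_t x = n ^ m;

    return shr ? (x >> shr) | (x << (64 - shr)) : x;
}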