pc-bios/vof: Adopt meson style Make output
[qemu.git] / target / arm / vec_helper.c
blobf59d3b26eacf08f80c1e601f221a54a3764fc9ce
1 /*
2 * ARM AdvSIMD / SVE Vector Operations
4 * Copyright (c) 2018 Linaro
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "vec_internal.h"
29 * Data for expanding active predicate bits to bytes, for byte elements.
31 * for (i = 0; i < 256; ++i) {
32 * unsigned long m = 0;
33 * for (j = 0; j < 8; j++) {
34 * if ((i >> j) & 1) {
35 * m |= 0xfful << (j << 3);
36 * }
37 * }
38 * printf("0x%016lx,\n", m);
39 * }
/*
 * Lookup table expanding the low 8 predicate bits of the index to a
 * 64-bit byte mask: byte j of expand_pred_b_data[i] is 0xff iff bit j
 * of i is set.  Generated by the loop in the comment above.
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};
131 * Similarly for half-word elements.
132 * for (i = 0; i < 256; ++i) {
133 * unsigned long m = 0;
134 * if (i & 0xaa) {
135 * continue;
137 * for (j = 0; j < 8; j += 2) {
138 * if ((i >> j) & 1) {
139 * m |= 0xfffful << (j << 3);
142 * printf("[0x%x] = 0x%016lx,\n", i, m);
/*
 * Lookup table expanding predicate bits to 16-bit lane masks.  Only
 * even predicate bits (0, 2, 4, 6) are significant for half-word
 * elements, so only indices with no odd bits set are populated; the
 * rest remain zero.  Generated by the loop in the comment above.
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};
/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t acc = (int32_t)src1 * src2;

    if (neg) {
        acc = -acc;
    }
    acc += ((int32_t)src3 << 7) + (round << 6);
    acc >>= 7;

    /* Saturate to the signed 8-bit range. */
    if (acc != (int8_t)acc) {
        acc = acc < 0 ? INT8_MIN : INT8_MAX;
    }
    return acc;
}
178 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
179 void *va, uint32_t desc)
181 intptr_t i, opr_sz = simd_oprsz(desc);
182 int8_t *d = vd, *n = vn, *m = vm, *a = va;
184 for (i = 0; i < opr_sz; ++i) {
185 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
189 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
190 void *va, uint32_t desc)
192 intptr_t i, opr_sz = simd_oprsz(desc);
193 int8_t *d = vd, *n = vn, *m = vm, *a = va;
195 for (i = 0; i < opr_sz; ++i) {
196 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
200 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
202 intptr_t i, opr_sz = simd_oprsz(desc);
203 int8_t *d = vd, *n = vn, *m = vm;
205 for (i = 0; i < opr_sz; ++i) {
206 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
210 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
212 intptr_t i, opr_sz = simd_oprsz(desc);
213 int8_t *d = vd, *n = vn, *m = vm;
215 for (i = 0; i < opr_sz; ++i) {
216 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int32_t acc = (int32_t)src1 * src2;

    if (neg) {
        acc = -acc;
    }
    acc += ((int32_t)src3 << 15) + (round << 14);
    acc >>= 15;

    /* Saturate, recording the event in *sat (the QC accumulator). */
    if (acc != (int16_t)acc) {
        *sat = 1;
        acc = acc < 0 ? INT16_MIN : INT16_MAX;
    }
    return acc;
}
239 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
240 uint32_t src2, uint32_t src3)
242 uint32_t *sat = &env->vfp.qc[0];
243 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
244 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
245 false, true, sat);
246 return deposit32(e1, 16, 16, e2);
249 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
250 void *vq, uint32_t desc)
252 uintptr_t opr_sz = simd_oprsz(desc);
253 int16_t *d = vd;
254 int16_t *n = vn;
255 int16_t *m = vm;
256 uintptr_t i;
258 for (i = 0; i < opr_sz / 2; ++i) {
259 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
261 clear_tail(d, opr_sz, simd_maxsz(desc));
264 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
265 uint32_t src2, uint32_t src3)
267 uint32_t *sat = &env->vfp.qc[0];
268 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
269 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
270 true, true, sat);
271 return deposit32(e1, 16, 16, e2);
274 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
275 void *vq, uint32_t desc)
277 uintptr_t opr_sz = simd_oprsz(desc);
278 int16_t *d = vd;
279 int16_t *n = vn;
280 int16_t *m = vm;
281 uintptr_t i;
283 for (i = 0; i < opr_sz / 2; ++i) {
284 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
286 clear_tail(d, opr_sz, simd_maxsz(desc));
289 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
290 void *vq, uint32_t desc)
292 intptr_t i, opr_sz = simd_oprsz(desc);
293 int16_t *d = vd, *n = vn, *m = vm;
295 for (i = 0; i < opr_sz / 2; ++i) {
296 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
298 clear_tail(d, opr_sz, simd_maxsz(desc));
301 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
302 void *vq, uint32_t desc)
304 intptr_t i, opr_sz = simd_oprsz(desc);
305 int16_t *d = vd, *n = vn, *m = vm;
307 for (i = 0; i < opr_sz / 2; ++i) {
308 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
310 clear_tail(d, opr_sz, simd_maxsz(desc));
313 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
314 void *va, uint32_t desc)
316 intptr_t i, opr_sz = simd_oprsz(desc);
317 int16_t *d = vd, *n = vn, *m = vm, *a = va;
318 uint32_t discard;
320 for (i = 0; i < opr_sz / 2; ++i) {
321 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
325 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
326 void *va, uint32_t desc)
328 intptr_t i, opr_sz = simd_oprsz(desc);
329 int16_t *d = vd, *n = vn, *m = vm, *a = va;
330 uint32_t discard;
332 for (i = 0; i < opr_sz / 2; ++i) {
333 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
337 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
339 intptr_t i, opr_sz = simd_oprsz(desc);
340 int16_t *d = vd, *n = vn, *m = vm;
341 uint32_t discard;
343 for (i = 0; i < opr_sz / 2; ++i) {
344 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
348 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
350 intptr_t i, opr_sz = simd_oprsz(desc);
351 int16_t *d = vd, *n = vn, *m = vm;
352 uint32_t discard;
354 for (i = 0; i < opr_sz / 2; ++i) {
355 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
359 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
361 intptr_t i, j, opr_sz = simd_oprsz(desc);
362 int idx = simd_data(desc);
363 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
364 uint32_t discard;
366 for (i = 0; i < opr_sz / 2; i += 16 / 2) {
367 int16_t mm = m[i];
368 for (j = 0; j < 16 / 2; ++j) {
369 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
374 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
376 intptr_t i, j, opr_sz = simd_oprsz(desc);
377 int idx = simd_data(desc);
378 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
379 uint32_t discard;
381 for (i = 0; i < opr_sz / 2; i += 16 / 2) {
382 int16_t mm = m[i];
383 for (j = 0; j < 16 / 2; ++j) {
384 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int64_t acc = (int64_t)src1 * src2;

    if (neg) {
        acc = -acc;
    }
    acc += ((int64_t)src3 << 31) + (round << 30);
    acc >>= 31;

    /* Saturate, recording the event in *sat (the QC accumulator). */
    if (acc != (int32_t)acc) {
        *sat = 1;
        acc = acc < 0 ? INT32_MIN : INT32_MAX;
    }
    return acc;
}
408 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
409 int32_t src2, int32_t src3)
411 uint32_t *sat = &env->vfp.qc[0];
412 return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
415 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
416 void *vq, uint32_t desc)
418 uintptr_t opr_sz = simd_oprsz(desc);
419 int32_t *d = vd;
420 int32_t *n = vn;
421 int32_t *m = vm;
422 uintptr_t i;
424 for (i = 0; i < opr_sz / 4; ++i) {
425 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
427 clear_tail(d, opr_sz, simd_maxsz(desc));
430 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
431 int32_t src2, int32_t src3)
433 uint32_t *sat = &env->vfp.qc[0];
434 return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
437 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
438 void *vq, uint32_t desc)
440 uintptr_t opr_sz = simd_oprsz(desc);
441 int32_t *d = vd;
442 int32_t *n = vn;
443 int32_t *m = vm;
444 uintptr_t i;
446 for (i = 0; i < opr_sz / 4; ++i) {
447 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
449 clear_tail(d, opr_sz, simd_maxsz(desc));
452 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
453 void *vq, uint32_t desc)
455 intptr_t i, opr_sz = simd_oprsz(desc);
456 int32_t *d = vd, *n = vn, *m = vm;
458 for (i = 0; i < opr_sz / 4; ++i) {
459 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
461 clear_tail(d, opr_sz, simd_maxsz(desc));
464 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
465 void *vq, uint32_t desc)
467 intptr_t i, opr_sz = simd_oprsz(desc);
468 int32_t *d = vd, *n = vn, *m = vm;
470 for (i = 0; i < opr_sz / 4; ++i) {
471 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
473 clear_tail(d, opr_sz, simd_maxsz(desc));
476 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
477 void *va, uint32_t desc)
479 intptr_t i, opr_sz = simd_oprsz(desc);
480 int32_t *d = vd, *n = vn, *m = vm, *a = va;
481 uint32_t discard;
483 for (i = 0; i < opr_sz / 4; ++i) {
484 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
488 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
489 void *va, uint32_t desc)
491 intptr_t i, opr_sz = simd_oprsz(desc);
492 int32_t *d = vd, *n = vn, *m = vm, *a = va;
493 uint32_t discard;
495 for (i = 0; i < opr_sz / 4; ++i) {
496 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
500 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
502 intptr_t i, opr_sz = simd_oprsz(desc);
503 int32_t *d = vd, *n = vn, *m = vm;
504 uint32_t discard;
506 for (i = 0; i < opr_sz / 4; ++i) {
507 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
511 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
513 intptr_t i, opr_sz = simd_oprsz(desc);
514 int32_t *d = vd, *n = vn, *m = vm;
515 uint32_t discard;
517 for (i = 0; i < opr_sz / 4; ++i) {
518 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
522 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
524 intptr_t i, j, opr_sz = simd_oprsz(desc);
525 int idx = simd_data(desc);
526 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
527 uint32_t discard;
529 for (i = 0; i < opr_sz / 4; i += 16 / 4) {
530 int32_t mm = m[i];
531 for (j = 0; j < 16 / 4; ++j) {
532 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
537 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
539 intptr_t i, j, opr_sz = simd_oprsz(desc);
540 int idx = simd_data(desc);
541 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
542 uint32_t discard;
544 for (i = 0; i < opr_sz / 4; i += 16 / 4) {
545 int32_t mm = m[i];
546 for (j = 0; j < 16 / 4; ++j) {
547 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
552 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
553 static int64_t do_sat128_d(Int128 r)
555 int64_t ls = int128_getlo(r);
556 int64_t hs = int128_gethi(r);
558 if (unlikely(hs != (ls >> 63))) {
559 return hs < 0 ? INT64_MIN : INT64_MAX;
561 return ls;
564 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
566 uint64_t l, h;
567 Int128 r, t;
569 /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
570 muls64(&l, &h, m, n);
571 r = int128_make128(l, h);
572 if (neg) {
573 r = int128_neg(r);
575 if (a) {
576 t = int128_exts64(a);
577 t = int128_lshift(t, 63);
578 r = int128_add(r, t);
580 if (round) {
581 t = int128_exts64(1ll << 62);
582 r = int128_add(r, t);
584 r = int128_rshift(r, 63);
586 return do_sat128_d(r);
589 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
590 void *va, uint32_t desc)
592 intptr_t i, opr_sz = simd_oprsz(desc);
593 int64_t *d = vd, *n = vn, *m = vm, *a = va;
595 for (i = 0; i < opr_sz / 8; ++i) {
596 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
600 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
601 void *va, uint32_t desc)
603 intptr_t i, opr_sz = simd_oprsz(desc);
604 int64_t *d = vd, *n = vn, *m = vm, *a = va;
606 for (i = 0; i < opr_sz / 8; ++i) {
607 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
611 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
613 intptr_t i, opr_sz = simd_oprsz(desc);
614 int64_t *d = vd, *n = vn, *m = vm;
616 for (i = 0; i < opr_sz / 8; ++i) {
617 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
621 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
623 intptr_t i, opr_sz = simd_oprsz(desc);
624 int64_t *d = vd, *n = vn, *m = vm;
626 for (i = 0; i < opr_sz / 8; ++i) {
627 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
631 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
633 intptr_t i, j, opr_sz = simd_oprsz(desc);
634 int idx = simd_data(desc);
635 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
637 for (i = 0; i < opr_sz / 8; i += 16 / 8) {
638 int64_t mm = m[i];
639 for (j = 0; j < 16 / 8; ++j) {
640 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
645 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
647 intptr_t i, j, opr_sz = simd_oprsz(desc);
648 int idx = simd_data(desc);
649 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
651 for (i = 0; i < opr_sz / 8; i += 16 / 8) {
652 int64_t mm = m[i];
653 for (j = 0; j < 16 / 8; ++j) {
654 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 *
 * TYPED is the quad-width destination/accumulator lane type;
 * TYPEN/TYPEM are the quarter-width source element types.
 */

#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m = vm;                                                        \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
        d[i] = (a[i] +                                                    \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
    }                                                                     \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
689 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
690 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
692 intptr_t i = 0, opr_sz = simd_oprsz(desc); \
693 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \
694 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \
695 intptr_t index = simd_data(desc); \
696 TYPED *d = vd, *a = va; \
697 TYPEN *n = vn; \
698 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \
699 do { \
700 TYPED m0 = m_indexed[i * 4 + 0]; \
701 TYPED m1 = m_indexed[i * 4 + 1]; \
702 TYPED m2 = m_indexed[i * 4 + 2]; \
703 TYPED m3 = m_indexed[i * 4 + 3]; \
704 do { \
705 d[i] = (a[i] + \
706 n[i * 4 + 0] * m0 + \
707 n[i * 4 + 1] * m1 + \
708 n[i * 4 + 2] * m2 + \
709 n[i * 4 + 3] * m3); \
710 } while (++i < segend); \
711 segend = i + 4; \
712 } while (i < opr_sz_n); \
713 clear_tail(d, opr_sz, simd_maxsz(desc)); \
716 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
717 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
718 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
719 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
720 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
721 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
723 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
724 void *vfpst, uint32_t desc)
726 uintptr_t opr_sz = simd_oprsz(desc);
727 float16 *d = vd;
728 float16 *n = vn;
729 float16 *m = vm;
730 float_status *fpst = vfpst;
731 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
732 uint32_t neg_imag = neg_real ^ 1;
733 uintptr_t i;
735 /* Shift boolean to the sign bit so we can xor to negate. */
736 neg_real <<= 15;
737 neg_imag <<= 15;
739 for (i = 0; i < opr_sz / 2; i += 2) {
740 float16 e0 = n[H2(i)];
741 float16 e1 = m[H2(i + 1)] ^ neg_imag;
742 float16 e2 = n[H2(i + 1)];
743 float16 e3 = m[H2(i)] ^ neg_real;
745 d[H2(i)] = float16_add(e0, e1, fpst);
746 d[H2(i + 1)] = float16_add(e2, e3, fpst);
748 clear_tail(d, opr_sz, simd_maxsz(desc));
751 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
752 void *vfpst, uint32_t desc)
754 uintptr_t opr_sz = simd_oprsz(desc);
755 float32 *d = vd;
756 float32 *n = vn;
757 float32 *m = vm;
758 float_status *fpst = vfpst;
759 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
760 uint32_t neg_imag = neg_real ^ 1;
761 uintptr_t i;
763 /* Shift boolean to the sign bit so we can xor to negate. */
764 neg_real <<= 31;
765 neg_imag <<= 31;
767 for (i = 0; i < opr_sz / 4; i += 2) {
768 float32 e0 = n[H4(i)];
769 float32 e1 = m[H4(i + 1)] ^ neg_imag;
770 float32 e2 = n[H4(i + 1)];
771 float32 e3 = m[H4(i)] ^ neg_real;
773 d[H4(i)] = float32_add(e0, e1, fpst);
774 d[H4(i + 1)] = float32_add(e2, e3, fpst);
776 clear_tail(d, opr_sz, simd_maxsz(desc));
779 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
780 void *vfpst, uint32_t desc)
782 uintptr_t opr_sz = simd_oprsz(desc);
783 float64 *d = vd;
784 float64 *n = vn;
785 float64 *m = vm;
786 float_status *fpst = vfpst;
787 uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
788 uint64_t neg_imag = neg_real ^ 1;
789 uintptr_t i;
791 /* Shift boolean to the sign bit so we can xor to negate. */
792 neg_real <<= 63;
793 neg_imag <<= 63;
795 for (i = 0; i < opr_sz / 8; i += 2) {
796 float64 e0 = n[i];
797 float64 e1 = m[i + 1] ^ neg_imag;
798 float64 e2 = n[i + 1];
799 float64 e3 = m[i] ^ neg_real;
801 d[i] = float64_add(e0, e1, fpst);
802 d[i + 1] = float64_add(e2, e3, fpst);
804 clear_tail(d, opr_sz, simd_maxsz(desc));
/*
 * FCMLA, fp16: fused complex multiply-add with rotation.
 * desc bit 0 (flip) and bit 1 (neg_imag) together encode the rotation;
 * neg_real is derived so the four rotations are covered.
 */
void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
/*
 * FCMLA, fp16, indexed: as gvec_fcmlah, but the m operand pair is the
 * complex element selected by desc bits [3:2], taken anew from each
 * 128-bit segment.
 */
void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = 16 / sizeof(float16);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        /* Broadcast the indexed (real, imag) pair for this segment. */
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
/* FCMLA, fp32: see gvec_fcmlah for the rotation encoding. */
void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
/*
 * FCMLA, fp32, indexed: the m complex pair selected by desc bits [3:2]
 * is taken anew from each 128-bit segment.
 */
void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = 16 / sizeof(float32);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        /* Broadcast the indexed (real, imag) pair for this segment. */
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
/* FCMLA, fp64: see gvec_fcmlah for the rotation encoding. */
void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

/* GE is implemented as the reversed LE, which signals on NaN. */
static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

/* GT is implemented as the reversed LT, which signals on NaN. */
static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

/* Absolute compares (FACGE/FACGT) compare magnitudes. */
static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}
/*
 * Convert float16 to int16 with round-to-zero.  Arm semantics: a NaN
 * input converts to zero and raises Invalid (rather than saturating).
 */
static int16_t vfp_tosszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

/* As vfp_tosszh, but converting to uint16. */
static uint16_t vfp_touszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}
/*
 * Expand a unary operation FUNC(element, float_status *) across the
 * whole vector; bytes beyond oprsz, up to maxsz, are zeroed.
 */
#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)
/*
 * Wrap a binary comparison as a unary compare-against-zero, with the
 * zero either as second operand (FWD) or first operand (REV).  REV is
 * used to express clt/cle in terms of cgt/cge with swapped operands.
 */
#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)    \
    {                                                          \
        return TYPE##_##CMPOP(op, TYPE##_zero, stat);          \
    }

#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)    \
    {                                                          \
        return TYPE##_##CMPOP(TYPE##_zero, op, stat);          \
    }

/* Emit the wrappers and vector helpers for both f16 and f32. */
#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

#undef DO_2OP
#undef DO_2OP_CMP0
/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul:
 * square op1, then copy bit 0 of op2 into the sign of the
 * result unless the square produced a NaN.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

/* Absolute difference: |op1 - op2|. */
static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}
/*
 * Reciprocal step. These are the AArch32 version which uses a
 * non-fused multiply-and-subtract.  Computes 2 - op1 * op2, with
 * the special case that inf * 0 yields exactly 2 (no InvalidOp).
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics:
 * (3 - op1 * op2) / 2, with inf * 0 yielding exactly 1.5.
 */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}
/*
 * Expand a binary operation FUNC(n, m, float_status *) element-wise
 * across the whole vector; the tail beyond oprsz is zeroed.
 */
#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
/* AArch64 fused reciprocal/rsqrt step helpers. */
DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP
/* Non-fused multiply-add (unlike float16_muladd etc, which are fused):
 * the product is rounded before the addition.
 */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_add(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_add(dest, float32_mul(op1, op2, stat), stat);
}

static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}

/* Fused versions; these have the semantics Neon VFMA/VFMS want. */
static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(op1, op2, dest, 0, stat);
}

static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(op1, op2, dest, 0, stat);
}

/* VFMS negates op1 (not the accumulator) before the fused multiply-add. */
static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}

static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
}

/* Expand an accumulating ternary op: d[i] = FUNC(d[i], n[i], m[i]). */
#define DO_MULADD(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(d[i], n[i], m[i], stat);                               \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)

DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)

DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)

DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */

/* Integer multiply by an indexed element within each segment. */
#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)             \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = n[i + j] * mm;                                      \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)

#undef DO_MUL_IDX

/* Integer multiply-add (OP is + or -) by an indexed element. */
#define DO_MLA_IDX(NAME, TYPE, OP, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = a[i + j] OP n[i + j] * mm;                          \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)

DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)

#undef DO_MLA_IDX
/*
 * Floating-point multiply by indexed element, optionally accumulating
 * into d via ADD (add/sub for non-fused mla/mls, or "nop" for plain mul:
 * the nop macros below simply discard the accumulator and return M).
 */
#define DO_FMUL_IDX(NAME, ADD, TYPE, H)                                    \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = TYPE##_##ADD(d[i + j],                              \
                                    TYPE##_mul(n[i + j], mm, stat), stat); \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

#define float16_nop(N, M, S) (M)
#define float32_nop(N, M, S) (M)
#define float64_nop(N, M, S) (M)

DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2)
DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4)
DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8)

/*
 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
 * the fused ops below they assume accumulate both from and into Vd.
 */
DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2)
DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4)
DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2)
DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4)

#undef float16_nop
#undef float32_nop
#undef float64_nop
#undef DO_FMUL_IDX
/*
 * Fused multiply-add by indexed element, with separate accumulator va.
 * desc bit SIMD_DATA_SHIFT selects negation of op1 (for fmls), done by
 * flipping the sign bit of every n element; remaining desc bits give idx.
 */
#define DO_FMLA_IDX(NAME, TYPE, H)                                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
                  void *stat, uint32_t desc)                               \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
    op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
                                     mm, a[i + j], 0, stat);               \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)

#undef DO_FMLA_IDX
/*
 * Saturating add/sub for 8/16/32-bit elements.  The arithmetic is done
 * in the wider type WTYPE so overflow can be detected by a simple range
 * check against MIN/MAX; any saturation sets the sticky QC flag via vq.
 */
#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
    bool q = false;                                                        \
    for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
        WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
        if (dd < MIN) {                                                    \
            dd = MIN;                                                      \
            q = true;                                                      \
        } else if (dd > MAX) {                                             \
            dd = MAX;                                                      \
            q = true;                                                      \
        }                                                                  \
        d[i] = dd;                                                         \
    }                                                                      \
    if (q) {                                                               \
        uint32_t *qc = vq;                                                 \
        qc[0] = 1;                                                         \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)

DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)

DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)

#undef DO_SAT
/*
 * 64-bit saturating add/sub: no wider type is available, so overflow
 * is detected with bit tricks instead of a widening compare.
 * Saturation sets the sticky QC flag via vq.
 */
void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i], mm = m[i], dd = nn + mm;
        /* Unsigned add overflowed iff the result wrapped below an operand. */
        if (dd < nn) {
            dd = UINT64_MAX;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i], mm = m[i], dd = nn - mm;
        /* Unsigned subtract underflowed iff the subtrahend was larger. */
        if (nn < mm) {
            dd = 0;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn + mm;
        /*
         * Signed overflow iff the operands have the same sign and the
         * result's sign differs; saturate toward the sign of nn:
         * (nn >> 63) ^ ~INT64_MIN is INT64_MAX if nn >= 0, else INT64_MIN.
         */
        if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn - mm;
        /* Signed subtract overflows iff operand signs differ and the
         * result's sign differs from nn; saturate toward nn's sign. */
        if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}
/*
 * Shift-right and accumulate: d[i] += n[i] >> shift.
 * Signed TYPE gives SSRA (arithmetic shift), unsigned gives USRA.
 */
#define DO_SRA(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] += n[i] >> shift;                          \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SRA(gvec_ssra_b, int8_t)
DO_SRA(gvec_ssra_h, int16_t)
DO_SRA(gvec_ssra_s, int32_t)
DO_SRA(gvec_ssra_d, int64_t)

DO_SRA(gvec_usra_b, uint8_t)
DO_SRA(gvec_usra_h, uint16_t)
DO_SRA(gvec_usra_s, uint32_t)
DO_SRA(gvec_usra_d, uint64_t)

#undef DO_SRA
/*
 * Rounding shift right: shift by (shift - 1) first so the last bit
 * shifted out remains available as the rounding increment (tmp & 1).
 */
#define DO_RSHR(NAME, TYPE)                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        TYPE tmp = n[i] >> (shift - 1);                 \
        d[i] = (tmp >> 1) + (tmp & 1);                  \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_RSHR(gvec_srshr_b, int8_t)
DO_RSHR(gvec_srshr_h, int16_t)
DO_RSHR(gvec_srshr_s, int32_t)
DO_RSHR(gvec_srshr_d, int64_t)

DO_RSHR(gvec_urshr_b, uint8_t)
DO_RSHR(gvec_urshr_h, uint16_t)
DO_RSHR(gvec_urshr_s, uint32_t)
DO_RSHR(gvec_urshr_d, uint64_t)

#undef DO_RSHR
/* Rounding shift right and accumulate: as DO_RSHR but += into d. */
#define DO_RSRA(NAME, TYPE)                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        TYPE tmp = n[i] >> (shift - 1);                 \
        d[i] += (tmp >> 1) + (tmp & 1);                 \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_RSRA(gvec_srsra_b, int8_t)
DO_RSRA(gvec_srsra_h, int16_t)
DO_RSRA(gvec_srsra_s, int32_t)
DO_RSRA(gvec_srsra_d, int64_t)

DO_RSRA(gvec_ursra_b, uint8_t)
DO_RSRA(gvec_ursra_h, uint16_t)
DO_RSRA(gvec_ursra_s, uint32_t)
DO_RSRA(gvec_ursra_d, uint64_t)

#undef DO_RSRA
/*
 * Shift right and insert: deposit the shifted source into the low
 * bits of d, preserving the top 'shift' bits of the destination.
 */
#define DO_SRI(NAME, TYPE)                                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
{                                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);                               \
    int shift = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn;                                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                        \
        d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
    }                                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));                             \
}

DO_SRI(gvec_sri_b, uint8_t)
DO_SRI(gvec_sri_h, uint16_t)
DO_SRI(gvec_sri_s, uint32_t)
DO_SRI(gvec_sri_d, uint64_t)

#undef DO_SRI
/*
 * Shift left and insert: deposit the source into the high bits of d,
 * preserving the low 'shift' bits of the destination.
 */
#define DO_SLI(NAME, TYPE)                                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
{                                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);                               \
    int shift = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn;                                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                        \
        d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]);  \
    }                                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));                             \
}

DO_SLI(gvec_sli_b, uint8_t)
DO_SLI(gvec_sli_h, uint16_t)
DO_SLI(gvec_sli_s, uint32_t)
DO_SLI(gvec_sli_d, uint64_t)

#undef DO_SLI
/*
 * Convert float16 to float32, raising no exceptions and
 * preserving exceptional values, including SNaN.
 * This is effectively an unpack+repack operation.
 */
static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN: map to the all-ones f32 exponent, frac preserved. */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal. */
        if (frac != 0) {
            if (fz16) {
                /* Flush f16 denormal input to zero. */
                frac = 0;
            } else {
                /*
                 * Denormal; these are all normal float32.
                 * Shift the fraction so that the msb is at bit 11,
                 * then remove bit 11 as the implicit bit of the
                 * normalized float32. Note that we still go through
                 * the shift for normal numbers below, to put the
                 * float32 fraction at the right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias. */
        exp += f32_bias - f16_bias;
    }
    /* Repack into float32 field positions. */
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}
static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /*
     * Branchless load of u32[0], u64[0], u32[1], or u64[1]:
     * select the 2nd qword only when both is_q and is_2 are set,
     * and shift down to the 2nd dword when is_2 is set but is_q is not.
     * For !is_q && !is_2, the upper bits of the result are garbage.
     */
    int qword = is_q & is_2;
    int shift = (is_2 & ~is_q) << 5;

    return ptr[qword] >> shift;
}
 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
 * as there are not yet SVE versions that might use blocking.
/*
 * FMLAL/FMLSL (vector): widening f16 -> f32 fused multiply-accumulate.
 * desc bit 0 selects subtract (FMLSL), bit 1 selects the upper f16 half.
 * fz16 controls flushing of f16 denormal inputs to zero.
 */
static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
                     uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int is_q = oprsz == 16;
    uint64_t n_4, m_4;

    /* Pre-load all of the f16 data, avoiding overlap issues. */
    n_4 = load4_f16(vn, is_q, is_2);
    m_4 = load4_f16(vm, is_q, is_2);

    /* Negate all inputs for FMLSL at once. */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}
/* AArch32 entry point: uses the Neon "standard FPSCR" float status. */
void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

/* AArch64 entry point: uses the normal FPCR-controlled float status. */
void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}
/*
 * SVE2 FMLALB/FMLALT/FMLSLB/FMLSLT (vector): widening f16 -> f32 fused
 * multiply-add with a separate accumulator va.  desc bit 0 selects
 * negation of n (fmlsl), bit 1 selects top (sel) vs bottom f16 elements.
 */
void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
                               void *venv, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
    CPUARMState *env = venv;
    float_status *status = &env->vfp.fp_status;
    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);

    for (i = 0; i < oprsz; i += sizeof(float32)) {
        /* Negation (for fmlsl) is a sign-bit flip on the f16 input. */
        float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
        float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
        float32 nn = float16_to_float32_by_bits(nn_16, fz16);
        float32 mm = float16_to_float32_by_bits(mm_16, fz16);
        float32 aa = *(float32 *)(va + H1_4(i));

        *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
    }
}
/*
 * FMLAL/FMLSL (by element): as do_fmlal, but the multiplier is a single
 * f16 element of vm selected by the 3-bit index in desc.
 */
static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
                         uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
    int is_q = oprsz == 16;
    uint64_t n_4;
    float32 m_1;

    /* Pre-load all of the f16 data, avoiding overlap issues. */
    n_4 = load4_f16(vn, is_q, is_2);

    /* Negate all inputs for FMLSL at once. */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    /* Widen the single indexed multiplier once, outside the loop. */
    m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}
/* AArch32 by-element entry point: Neon "standard FPSCR" float status. */
void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
                                void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

/* AArch64 by-element entry point: FPCR-controlled float status. */
void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
                                void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}
/*
 * SVE2 FMLALB/FMLALT etc (by element): as sve2_fmlal_zzzw_s, but the
 * multiplier is an indexed f16 element taken per 128-bit segment of vm.
 */
void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
                               void *venv, uint32_t desc)
{
    intptr_t i, j, oprsz = simd_oprsz(desc);
    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
    CPUARMState *env = venv;
    float_status *status = &env->vfp.fp_status;
    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);

    /* Outer loop walks 128-bit segments; multiplier fixed per segment. */
    for (i = 0; i < oprsz; i += 16) {
        float16 mm_16 = *(float16 *)(vm + i + idx);
        float32 mm = float16_to_float32_by_bits(mm_16, fz16);

        for (j = 0; j < 16; j += sizeof(float32)) {
            float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
            float32 nn = float16_to_float32_by_bits(nn_16, fz16);
            float32 aa = *(float32 *)(va + H1_4(i + j));

            *(float32 *)(vd + H1_4(i + j)) =
                float32_muladd(nn, mm, aa, 0, status);
        }
    }
}
/*
 * Signed shift-by-signed-register (SSHL): positive m shifts left,
 * negative m shifts right arithmetically.  Left shifts >= width give 0;
 * right shifts are clamped to width-1 so only the sign remains.
 */
void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        int8_t nn = n[i];
        int8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -8 ? -mm : 7);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        int16_t nn = n[i];
        int16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -16 ? -mm : 15);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
/*
 * Unsigned shift-by-signed-register (USHL): positive m shifts left,
 * negative m shifts right logically.  Any shift >= width yields 0.
 */
void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        uint8_t nn = n[i];
        uint8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            if (mm > -8) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        uint16_t nn = n[i];
        uint16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            if (mm > -16) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
1979 * 8x8->8 polynomial multiply.
1981 * Polynomial multiplication is like integer multiplication except the
1982 * partial products are XORed, not added.
1984 * TODO: expose this as a generic vector operation, as it is a common
1985 * crypto building block.
/*
 * 8x8->8 polynomial multiply, 8 bytes at a time per uint64_t.
 * Per iteration: select (via an all-ones byte mask built from bit j of
 * each nn byte) which shifted copies of mm to XOR into the result.
 * The 0xfefe.. mask keeps each byte's shift from spilling into its
 * neighbour.
 */
void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t nn = n[i];
        uint64_t mm = m[i];
        uint64_t rr = 0;

        for (j = 0; j < 8; ++j) {
            uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
            rr ^= mm & mask;
            mm = (mm << 1) & 0xfefefefefefefefeull;
            nn >>= 1;
        }
        d[i] = rr;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
 * 64x64->128 polynomial multiply.
 * Because the lanes are not accessed in strict columns,
 * this probably cannot be turned into a generic helper.
/*
 * For each 128-bit destination pair, carry-less multiply the selected
 * (low or high, per 'hi') 64-bit source lanes into a 128-bit result
 * split across d[i] (low) and d[i+1] (high).
 */
void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        uint64_t nn = n[i + hi];
        uint64_t mm = m[i + hi];
        uint64_t rhi = 0;
        uint64_t rlo = 0;

        /* Bit 0 can only influence the low 64-bit result.  */
        if (nn & 1) {
            rlo = mm;
        }

        for (j = 1; j < 64; ++j) {
            /* mask is all-ones if bit j of nn is set, else zero. */
            uint64_t mask = -((nn >> j) & 1);
            rlo ^= (mm << j) & mask;
            rhi ^= (mm >> (64 - j)) & mask;
        }
        d[i] = rlo;
        d[i + 1] = rhi;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
2042 * 8x8->16 polynomial multiply.
2044 * The byte inputs are expanded to (or extracted from) half-words.
2045 * Note that neon and sve2 get the inputs from different positions.
2046 * This allows 4 bytes to be processed in parallel with uint64_t.
/* Widen the low four bytes of x into the four half-words of the result. */
static uint64_t expand_byte_to_half(uint64_t x)
{
    uint64_t r = 0;
    int i;

    for (i = 0; i < 4; ++i) {
        r |= ((x >> (i * 8)) & 0xff) << (i * 16);
    }
    return r;
}
/*
 * Carry-less multiply, 16x16 -> 32 bits in each 32-bit lane:
 * for each possible set bit of op1, conditionally (via an
 * all-ones/all-zeros per-lane mask) XOR in a shifted op2.
 */
uint64_t pmull_w(uint64_t op1, uint64_t op2)
{
    uint64_t acc = 0;
    int bit;

    for (bit = 0; bit < 16; ++bit) {
        uint64_t sel = (op1 >> bit) & 0x0000000100000001ull;
        acc ^= (op2 << bit) & (sel * 0xffffffffull);
    }
    return acc;
}
/*
 * Carry-less multiply, 8x8 -> 16 bits in each 16-bit lane:
 * same scheme as pmull_w but with four 16-bit lanes.
 */
uint64_t pmull_h(uint64_t op1, uint64_t op2)
{
    uint64_t acc = 0;
    int bit;

    for (bit = 0; bit < 8; ++bit) {
        uint64_t sel = (op1 >> bit) & 0x0001000100010001ull;
        acc ^= (op2 << bit) & (sel * 0xffff);
    }
    return acc;
}
/*
 * Neon 8x8->16 widening polynomial multiply: expand the selected
 * (low or high, per desc data) 8 bytes of n and m into half-word lanes,
 * then carry-less multiply lane-wise into the 16-byte destination.
 */
void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t nn = n[hi], mm = m[hi];

    d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
    nn >>= 32;
    mm >>= 32;
    d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));

    clear_tail(d, 16, simd_maxsz(desc));
}
2097 #ifdef TARGET_AARCH64
/*
 * SVE2 variant: the even (shift 0) or odd (shift 8) bytes of each
 * 64-bit lane are already laid out one-per-halfword after masking,
 * so no expand step is needed before the lane-wise clmul.
 */
void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int shift = simd_data(desc) * 8;
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
        uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;

        d[i] = pmull_h(nn, mm);
    }
}
/* Carry-less multiply, 32x32 -> 64 bits, of the low halves of op1/op2. */
static uint64_t pmull_d(uint64_t op1, uint64_t op2)
{
    uint64_t acc = 0;
    int i;

    for (i = 0; i < 32; ++i) {
        if ((op1 >> i) & 1) {
            acc ^= op2 << i;
        }
    }
    return acc;
}
/*
 * SVE2 64-bit widening polynomial multiply: take the even (sel == 0)
 * or odd (sel == 1) 32-bit element of each 64-bit lane, clmul to 64 bits.
 */
void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t sel = H4(simd_data(desc));
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *n = vn, *m = vm;
    uint64_t *d = vd;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]);
    }
}
2135 #endif
/*
 * Vector compare-against-zero.  For each element, write all-ones if
 * "elem OP 0" is true, else all-zeros.  NAME is the helper name, TYPE
 * the (signed) element type, OP the comparison operator.
 */
#define DO_CMP0(NAME, TYPE, OP)                         \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);              \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
        TYPE nn = *(TYPE *)(vn + i);                    \
        /* Negating the 0/1 result gives 0 or ~0. */    \
        *(TYPE *)(vd + i) = -(nn OP 0);                 \
    }                                                   \
    clear_tail(vd, opr_sz, simd_maxsz(desc));           \
}

DO_CMP0(gvec_ceq0_b, int8_t, ==)
DO_CMP0(gvec_clt0_b, int8_t, <)
DO_CMP0(gvec_cle0_b, int8_t, <=)
DO_CMP0(gvec_cgt0_b, int8_t, >)
DO_CMP0(gvec_cge0_b, int8_t, >=)

DO_CMP0(gvec_ceq0_h, int16_t, ==)
DO_CMP0(gvec_clt0_h, int16_t, <)
DO_CMP0(gvec_cle0_h, int16_t, <=)
DO_CMP0(gvec_cgt0_h, int16_t, >)
DO_CMP0(gvec_cge0_h, int16_t, >=)

#undef DO_CMP0
2162 #define DO_ABD(NAME, TYPE) \
2163 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2165 intptr_t i, opr_sz = simd_oprsz(desc); \
2166 TYPE *d = vd, *n = vn, *m = vm; \
2168 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
2169 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2171 clear_tail(d, opr_sz, simd_maxsz(desc)); \
2174 DO_ABD(gvec_sabd_b, int8_t)
2175 DO_ABD(gvec_sabd_h, int16_t)
2176 DO_ABD(gvec_sabd_s, int32_t)
2177 DO_ABD(gvec_sabd_d, int64_t)
2179 DO_ABD(gvec_uabd_b, uint8_t)
2180 DO_ABD(gvec_uabd_h, uint16_t)
2181 DO_ABD(gvec_uabd_s, uint32_t)
2182 DO_ABD(gvec_uabd_d, uint64_t)
2184 #undef DO_ABD
2186 #define DO_ABA(NAME, TYPE) \
2187 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2189 intptr_t i, opr_sz = simd_oprsz(desc); \
2190 TYPE *d = vd, *n = vn, *m = vm; \
2192 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
2193 d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2195 clear_tail(d, opr_sz, simd_maxsz(desc)); \
2198 DO_ABA(gvec_saba_b, int8_t)
2199 DO_ABA(gvec_saba_h, int16_t)
2200 DO_ABA(gvec_saba_s, int32_t)
2201 DO_ABA(gvec_saba_d, int64_t)
2203 DO_ABA(gvec_uaba_b, uint8_t)
2204 DO_ABA(gvec_uaba_h, uint16_t)
2205 DO_ABA(gvec_uaba_s, uint32_t)
2206 DO_ABA(gvec_uaba_d, uint64_t)
2208 #undef DO_ABA
/*
 * Neon floating-point pairwise operations on a single 64-bit vector:
 * the "s" helper handles 2 x float32, the "h" helper 4 x float16.
 * All inputs are read into locals before any output is written, so
 * the helpers tolerate overlap between the destination and sources.
 */
#define DO_NEON_PAIRWISE(NAME, OP)                                      \
    void HELPER(NAME##s)(void *vd, void *vn, void *vm,                  \
                         void *stat, uint32_t oprsz)                    \
    {                                                                   \
        float_status *fpst = stat;                                      \
        float32 *d = vd;                                                \
        float32 *n = vn;                                                \
        float32 *m = vm;                                                \
        float32 r0, r1;                                                 \
                                                                        \
        /* Read all inputs before writing outputs in case vm == vd */   \
        r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst);                    \
        r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst);                    \
                                                                        \
        d[H4(0)] = r0;                                                  \
        d[H4(1)] = r1;                                                  \
    }                                                                   \
                                                                        \
    void HELPER(NAME##h)(void *vd, void *vn, void *vm,                  \
                         void *stat, uint32_t oprsz)                    \
    {                                                                   \
        float_status *fpst = stat;                                      \
        float16 *d = vd;                                                \
        float16 *n = vn;                                                \
        float16 *m = vm;                                                \
        float16 r0, r1, r2, r3;                                         \
                                                                        \
        /* Read all inputs before writing outputs in case vm == vd */   \
        r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst);                    \
        r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst);                    \
        r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst);                    \
        r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst);                    \
                                                                        \
        d[H2(0)] = r0;                                                  \
        d[H2(1)] = r1;                                                  \
        d[H2(2)] = r2;                                                  \
        d[H2(3)] = r3;                                                  \
    }

DO_NEON_PAIRWISE(neon_padd, add)
DO_NEON_PAIRWISE(neon_pmax, max)
DO_NEON_PAIRWISE(neon_pmin, min)

#undef DO_NEON_PAIRWISE
/*
 * Fixed-point <-> floating-point conversion with an immediate shift
 * (the number of fractional bits), taken from simd_data().  FUNC is a
 * vfp conversion helper; TYPE is the element container type.
 */
#define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
    {                                                                   \
        intptr_t i, oprsz = simd_oprsz(desc);                           \
        int shift = simd_data(desc);                                    \
        TYPE *d = vd, *n = vn;                                          \
        float_status *fpst = stat;                                      \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
            d[i] = FUNC(n[i], shift, fpst);                             \
        }                                                               \
        clear_tail(d, oprsz, simd_maxsz(desc));                         \
    }

DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)

#undef DO_VCVT_FIXED
/*
 * Float -> integer conversion using an explicit rounding mode taken
 * from simd_data().  The previous rounding mode is saved and restored
 * around the loop so the shared float_status is left unchanged.
 */
#define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
    {                                                                   \
        float_status *fpst = stat;                                      \
        intptr_t i, oprsz = simd_oprsz(desc);                           \
        uint32_t rmode = simd_data(desc);                               \
        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
        TYPE *d = vd, *n = vn;                                          \
        set_float_rounding_mode(rmode, fpst);                           \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
            /* Zero fractional bits: this is a pure conversion. */      \
            d[i] = FUNC(n[i], 0, fpst);                                 \
        }                                                               \
        set_float_rounding_mode(prev_rmode, fpst);                      \
        clear_tail(d, oprsz, simd_maxsz(desc));                         \
    }

DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)

#undef DO_VCVT_RMODE
/*
 * Round-to-integral in floating-point, using an explicit rounding
 * mode from simd_data(); the previous mode is restored afterwards.
 */
#define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
    {                                                                   \
        float_status *fpst = stat;                                      \
        intptr_t i, oprsz = simd_oprsz(desc);                           \
        uint32_t rmode = simd_data(desc);                               \
        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
        TYPE *d = vd, *n = vn;                                          \
        set_float_rounding_mode(rmode, fpst);                           \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
            d[i] = FUNC(n[i], fpst);                                    \
        }                                                               \
        set_float_rounding_mode(prev_rmode, fpst);                      \
        clear_tail(d, oprsz, simd_maxsz(desc));                         \
    }

DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)

#undef DO_VRINT_RMODE
2323 #ifdef TARGET_AARCH64
/*
 * AArch64 TBL/TBX: look up each byte index of vm in a virtual table
 * of table_len bytes formed by concatenating registers starting at
 * Vrn, wrapping from V31 back to V0.  An out-of-range index yields 0
 * for TBL, or leaves the destination byte unchanged for TBX.
 */
void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
{
    const uint8_t *indices = vm;
    CPUARMState *env = venv;
    size_t oprsz = simd_oprsz(desc);
    /* desc packs: 5-bit base register, 1-bit TBX flag, table length. */
    uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
    bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
    uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
    union {
        uint8_t b[16];
        uint64_t d[2];
    } result;

    /*
     * We must construct the final result in a temp, lest the output
     * overlaps the input table.  For TBL, begin with zero; for TBX,
     * begin with the original register contents.  Note that we always
     * copy 16 bytes here to avoid an extra branch; clearing the high
     * bits of the register for oprsz == 8 is handled below.
     */
    if (is_tbx) {
        memcpy(&result, vd, 16);
    } else {
        memset(&result, 0, 16);
    }

    for (size_t i = 0; i < oprsz; ++i) {
        uint32_t index = indices[H1(i)];

        if (index < table_len) {
            /*
             * Convert index (a byte offset into the virtual table
             * which is a series of 128-bit vectors concatenated)
             * into the correct register element, bearing in mind
             * that the table can wrap around from V31 to V0.
             */
            const uint8_t *table = (const uint8_t *)
                aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
            result.b[H1(i)] = table[H1(index % 16)];
        }
    }

    memcpy(vd, &result, 16);
    clear_tail(vd, oprsz, simd_maxsz(desc));
}
2369 #endif
2372 * NxN -> N highpart multiply
2374 * TODO: expose this as a generic vector operation.
2377 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2379 intptr_t i, opr_sz = simd_oprsz(desc);
2380 int8_t *d = vd, *n = vn, *m = vm;
2382 for (i = 0; i < opr_sz; ++i) {
2383 d[i] = ((int32_t)n[i] * m[i]) >> 8;
2385 clear_tail(d, opr_sz, simd_maxsz(desc));
2388 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2390 intptr_t i, opr_sz = simd_oprsz(desc);
2391 int16_t *d = vd, *n = vn, *m = vm;
2393 for (i = 0; i < opr_sz / 2; ++i) {
2394 d[i] = ((int32_t)n[i] * m[i]) >> 16;
2396 clear_tail(d, opr_sz, simd_maxsz(desc));
2399 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2401 intptr_t i, opr_sz = simd_oprsz(desc);
2402 int32_t *d = vd, *n = vn, *m = vm;
2404 for (i = 0; i < opr_sz / 4; ++i) {
2405 d[i] = ((int64_t)n[i] * m[i]) >> 32;
2407 clear_tail(d, opr_sz, simd_maxsz(desc));
2410 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2412 intptr_t i, opr_sz = simd_oprsz(desc);
2413 uint64_t *d = vd, *n = vn, *m = vm;
2414 uint64_t discard;
2416 for (i = 0; i < opr_sz / 8; ++i) {
2417 muls64(&discard, &d[i], n[i], m[i]);
2419 clear_tail(d, opr_sz, simd_maxsz(desc));
2422 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2424 intptr_t i, opr_sz = simd_oprsz(desc);
2425 uint8_t *d = vd, *n = vn, *m = vm;
2427 for (i = 0; i < opr_sz; ++i) {
2428 d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2430 clear_tail(d, opr_sz, simd_maxsz(desc));
2433 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2435 intptr_t i, opr_sz = simd_oprsz(desc);
2436 uint16_t *d = vd, *n = vn, *m = vm;
2438 for (i = 0; i < opr_sz / 2; ++i) {
2439 d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2441 clear_tail(d, opr_sz, simd_maxsz(desc));
2444 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2446 intptr_t i, opr_sz = simd_oprsz(desc);
2447 uint32_t *d = vd, *n = vn, *m = vm;
2449 for (i = 0; i < opr_sz / 4; ++i) {
2450 d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2452 clear_tail(d, opr_sz, simd_maxsz(desc));
2455 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2457 intptr_t i, opr_sz = simd_oprsz(desc);
2458 uint64_t *d = vd, *n = vn, *m = vm;
2459 uint64_t discard;
2461 for (i = 0; i < opr_sz / 8; ++i) {
2462 mulu64(&discard, &d[i], n[i], m[i]);
2464 clear_tail(d, opr_sz, simd_maxsz(desc));
2467 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2469 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2470 int shr = simd_data(desc);
2471 uint64_t *d = vd, *n = vn, *m = vm;
2473 for (i = 0; i < opr_sz; ++i) {
2474 d[i] = ror64(n[i] ^ m[i], shr);
2476 clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2480 * Integer matrix-multiply accumulate
/* Dot product of 8 signed bytes from N and M, accumulated into SUM. */
static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
{
    int8_t *n = vn, *m = vm;
    int32_t dot = 0;

    for (intptr_t k = 0; k < 8; ++k) {
        dot += n[H1(k)] * m[H1(k)];
    }
    return sum + dot;
}
/* Dot product of 8 unsigned bytes from N and M, accumulated into SUM. */
static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *n = vn, *m = vm;
    uint32_t dot = 0;

    for (intptr_t k = 0; k < 8; ++k) {
        dot += n[H1(k)] * m[H1(k)];
    }
    return sum + dot;
}
/* Dot product of 8 unsigned (N) by signed (M) bytes, accumulated into SUM. */
static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *n = vn;
    int8_t *m = vm;
    int32_t dot = 0;

    for (intptr_t k = 0; k < 8; ++k) {
        dot += n[H1(k)] * m[H1(k)];
    }
    return sum + dot;
}
/*
 * 8-bit integer matrix multiply-accumulate, one 16-byte segment at a
 * time: each segment of vn/vm holds two rows of 8 bytes, and the 2x2
 * result of (rows of n) x (rows of m) dot products is accumulated
 * into the four 32-bit elements of va/vd.  inner_loop supplies the
 * signedness-specific 8-byte dot product.
 */
static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
                      uint32_t (*inner_loop)(uint32_t, void *, void *))
{
    intptr_t seg, opr_sz = simd_oprsz(desc);

    for (seg = 0; seg < opr_sz; seg += 16) {
        uint32_t *d = vd + seg;
        uint32_t *a = va + seg;
        uint32_t sum0, sum1, sum2, sum3;

        /*
         * Process the entire segment at once, writing back the
         * results only after we've consumed all of the inputs.
         *
         * Key to indices by column:
         *               i   j                  i             j
         */
        sum0 = a[H4(0 + 0)];
        sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
        sum1 = a[H4(0 + 1)];
        sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
        sum2 = a[H4(2 + 0)];
        sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
        sum3 = a[H4(2 + 1)];
        sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);

        d[H4(0)] = sum0;
        d[H4(1)] = sum1;
        d[H4(2)] = sum2;
        d[H4(3)] = sum3;
    }
    clear_tail(vd, opr_sz, simd_maxsz(desc));
}
/* Instantiate do_mmla_b with a signedness-specific inner dot product. */
#define DO_MMLA_B(NAME, INNER) \
    void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
    { do_mmla_b(vd, vn, vm, va, desc, INNER); }

DO_MMLA_B(gvec_smmla_b, do_smmla_b)
DO_MMLA_B(gvec_ummla_b, do_ummla_b)
DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2557 * BFloat16 Dot Product
2560 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
2562 /* FPCR is ignored for BFDOT and BFMMLA. */
2563 float_status bf_status = {
2564 .tininess_before_rounding = float_tininess_before_rounding,
2565 .float_rounding_mode = float_round_to_odd_inf,
2566 .flush_to_zero = true,
2567 .flush_inputs_to_zero = true,
2568 .default_nan_mode = true,
2570 float32 t1, t2;
2573 * Extract each BFloat16 from the element pair, and shift
2574 * them such that they become float32.
2576 t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
2577 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
2578 t1 = float32_add(t1, t2, &bf_status);
2579 t1 = float32_add(sum, t1, &bf_status);
2581 return t1;
2584 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2586 intptr_t i, opr_sz = simd_oprsz(desc);
2587 float32 *d = vd, *a = va;
2588 uint32_t *n = vn, *m = vm;
2590 for (i = 0; i < opr_sz / 4; ++i) {
2591 d[i] = bfdotadd(a[i], n[i], m[i]);
2593 clear_tail(d, opr_sz, simd_maxsz(desc));
2596 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2597 void *va, uint32_t desc)
2599 intptr_t i, j, opr_sz = simd_oprsz(desc);
2600 intptr_t index = simd_data(desc);
2601 intptr_t elements = opr_sz / 4;
2602 intptr_t eltspersegment = MIN(16 / 4, elements);
2603 float32 *d = vd, *a = va;
2604 uint32_t *n = vn, *m = vm;
2606 for (i = 0; i < elements; i += eltspersegment) {
2607 uint32_t m_idx = m[i + H4(index)];
2609 for (j = i; j < i + eltspersegment; j++) {
2610 d[j] = bfdotadd(a[j], n[j], m_idx);
2613 clear_tail(d, opr_sz, simd_maxsz(desc));
/*
 * BFMMLA: 2x2 matrix multiply-accumulate of bfloat16 element pairs
 * into float32, processed one 16-byte segment at a time.
 */
void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;

    for (s = 0; s < opr_sz / 4; s += 4) {
        float32 sum00, sum01, sum10, sum11;

        /*
         * Process the entire segment at once, writing back the
         * results only after we've consumed all of the inputs.
         *
         * Key to indices by column:
         *               i   j           i   k             j   k
         */
        sum00 = a[s + H4(0 + 0)];
        sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
        sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);

        sum01 = a[s + H4(0 + 1)];
        sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
        sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);

        sum10 = a[s + H4(2 + 0)];
        sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
        sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);

        sum11 = a[s + H4(2 + 1)];
        sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
        sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);

        d[s + H4(0 + 0)] = sum00;
        d[s + H4(0 + 1)] = sum01;
        d[s + H4(2 + 0)] = sum10;
        d[s + H4(2 + 1)] = sum11;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
2656 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
2657 void *stat, uint32_t desc)
2659 intptr_t i, opr_sz = simd_oprsz(desc);
2660 intptr_t sel = simd_data(desc);
2661 float32 *d = vd, *a = va;
2662 bfloat16 *n = vn, *m = vm;
2664 for (i = 0; i < opr_sz / 4; ++i) {
2665 float32 nn = n[H2(i * 2 + sel)] << 16;
2666 float32 mm = m[H2(i * 2 + sel)] << 16;
2667 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
2669 clear_tail(d, opr_sz, simd_maxsz(desc));
2672 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
2673 void *va, void *stat, uint32_t desc)
2675 intptr_t i, j, opr_sz = simd_oprsz(desc);
2676 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
2677 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
2678 intptr_t elements = opr_sz / 4;
2679 intptr_t eltspersegment = MIN(16 / 4, elements);
2680 float32 *d = vd, *a = va;
2681 bfloat16 *n = vn, *m = vm;
2683 for (i = 0; i < elements; i += eltspersegment) {
2684 float32 m_idx = m[H2(2 * i + index)] << 16;
2686 for (j = i; j < i + eltspersegment; j++) {
2687 float32 n_j = n[H2(2 * j + sel)] << 16;
2688 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
2691 clear_tail(d, opr_sz, simd_maxsz(desc));
2694 #define DO_CLAMP(NAME, TYPE) \
2695 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \
2697 intptr_t i, opr_sz = simd_oprsz(desc); \
2698 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
2699 TYPE aa = *(TYPE *)(a + i); \
2700 TYPE nn = *(TYPE *)(n + i); \
2701 TYPE mm = *(TYPE *)(m + i); \
2702 TYPE dd = MIN(MAX(aa, nn), mm); \
2703 *(TYPE *)(d + i) = dd; \
2705 clear_tail(d, opr_sz, simd_maxsz(desc)); \
2708 DO_CLAMP(gvec_sclamp_b, int8_t)
2709 DO_CLAMP(gvec_sclamp_h, int16_t)
2710 DO_CLAMP(gvec_sclamp_s, int32_t)
2711 DO_CLAMP(gvec_sclamp_d, int64_t)
2713 DO_CLAMP(gvec_uclamp_b, uint8_t)
2714 DO_CLAMP(gvec_uclamp_h, uint16_t)
2715 DO_CLAMP(gvec_uclamp_s, uint32_t)
2716 DO_CLAMP(gvec_uclamp_d, uint64_t)