tests/docker: auto-generate ubuntu1804.docker with lcitool
[qemu/armbru.git] / target / arm / vec_helper.c
blob17fb1583622e5bf8c646f91518c23d8860cb131c
1 /*
2 * ARM AdvSIMD / SVE Vector Operations
4 * Copyright (c) 2018 Linaro
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "vec_internal.h"
/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 * Entry i has byte j equal to 0xff exactly when bit j of i is set, and
 * 0x00 otherwise.  The table was generated with:
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};
/*
 * Signed saturating rounding doubling multiply-accumulate high half, 8-bit.
 *
 * @src1, @src2: multiplicands
 * @src3: accumulator
 * @neg: negate the product before accumulating
 * @round: add the rounding constant before the final shift
 *
 * Returns the result saturated to the int8_t range.
 */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    /*
     * Scale the accumulator by multiplying: src3 may be negative, and
     * left-shifting a negative signed value is undefined in ISO C.
     */
    ret += (int32_t)src3 * (1 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}
152 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
153 void *va, uint32_t desc)
155 intptr_t i, opr_sz = simd_oprsz(desc);
156 int8_t *d = vd, *n = vn, *m = vm, *a = va;
158 for (i = 0; i < opr_sz; ++i) {
159 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
163 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
164 void *va, uint32_t desc)
166 intptr_t i, opr_sz = simd_oprsz(desc);
167 int8_t *d = vd, *n = vn, *m = vm, *a = va;
169 for (i = 0; i < opr_sz; ++i) {
170 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
174 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
176 intptr_t i, opr_sz = simd_oprsz(desc);
177 int8_t *d = vd, *n = vn, *m = vm;
179 for (i = 0; i < opr_sz; ++i) {
180 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
184 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
186 intptr_t i, opr_sz = simd_oprsz(desc);
187 int8_t *d = vd, *n = vn, *m = vm;
189 for (i = 0; i < opr_sz; ++i) {
190 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
/*
 * Signed saturating rounding doubling multiply-accumulate high half, 16-bit.
 *
 * @src1, @src2: multiplicands
 * @src3: accumulator
 * @neg: negate the product before accumulating
 * @round: add the rounding constant before the final shift
 * @sat: set to 1 when the result had to be saturated; otherwise untouched
 *
 * Returns the result saturated to the int16_t range.
 */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    /*
     * Scale the accumulator by multiplying: src3 may be negative, and
     * left-shifting a negative signed value is undefined in ISO C.
     */
    ret += (int32_t)src3 * (1 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}
213 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
214 uint32_t src2, uint32_t src3)
216 uint32_t *sat = &env->vfp.qc[0];
217 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
218 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
219 false, true, sat);
220 return deposit32(e1, 16, 16, e2);
223 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
224 void *vq, uint32_t desc)
226 uintptr_t opr_sz = simd_oprsz(desc);
227 int16_t *d = vd;
228 int16_t *n = vn;
229 int16_t *m = vm;
230 uintptr_t i;
232 for (i = 0; i < opr_sz / 2; ++i) {
233 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
235 clear_tail(d, opr_sz, simd_maxsz(desc));
238 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
239 uint32_t src2, uint32_t src3)
241 uint32_t *sat = &env->vfp.qc[0];
242 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
243 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
244 true, true, sat);
245 return deposit32(e1, 16, 16, e2);
248 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
249 void *vq, uint32_t desc)
251 uintptr_t opr_sz = simd_oprsz(desc);
252 int16_t *d = vd;
253 int16_t *n = vn;
254 int16_t *m = vm;
255 uintptr_t i;
257 for (i = 0; i < opr_sz / 2; ++i) {
258 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
260 clear_tail(d, opr_sz, simd_maxsz(desc));
263 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
264 void *vq, uint32_t desc)
266 intptr_t i, opr_sz = simd_oprsz(desc);
267 int16_t *d = vd, *n = vn, *m = vm;
269 for (i = 0; i < opr_sz / 2; ++i) {
270 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
272 clear_tail(d, opr_sz, simd_maxsz(desc));
275 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
276 void *vq, uint32_t desc)
278 intptr_t i, opr_sz = simd_oprsz(desc);
279 int16_t *d = vd, *n = vn, *m = vm;
281 for (i = 0; i < opr_sz / 2; ++i) {
282 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
284 clear_tail(d, opr_sz, simd_maxsz(desc));
287 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
288 void *va, uint32_t desc)
290 intptr_t i, opr_sz = simd_oprsz(desc);
291 int16_t *d = vd, *n = vn, *m = vm, *a = va;
292 uint32_t discard;
294 for (i = 0; i < opr_sz / 2; ++i) {
295 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
299 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
300 void *va, uint32_t desc)
302 intptr_t i, opr_sz = simd_oprsz(desc);
303 int16_t *d = vd, *n = vn, *m = vm, *a = va;
304 uint32_t discard;
306 for (i = 0; i < opr_sz / 2; ++i) {
307 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
311 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
313 intptr_t i, opr_sz = simd_oprsz(desc);
314 int16_t *d = vd, *n = vn, *m = vm;
315 uint32_t discard;
317 for (i = 0; i < opr_sz / 2; ++i) {
318 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
322 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
324 intptr_t i, opr_sz = simd_oprsz(desc);
325 int16_t *d = vd, *n = vn, *m = vm;
326 uint32_t discard;
328 for (i = 0; i < opr_sz / 2; ++i) {
329 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
333 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
335 intptr_t i, j, opr_sz = simd_oprsz(desc);
336 int idx = simd_data(desc);
337 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
338 uint32_t discard;
340 for (i = 0; i < opr_sz / 2; i += 16 / 2) {
341 int16_t mm = m[i];
342 for (j = 0; j < 16 / 2; ++j) {
343 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
348 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
350 intptr_t i, j, opr_sz = simd_oprsz(desc);
351 int idx = simd_data(desc);
352 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
353 uint32_t discard;
355 for (i = 0; i < opr_sz / 2; i += 16 / 2) {
356 int16_t mm = m[i];
357 for (j = 0; j < 16 / 2; ++j) {
358 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
/*
 * Signed saturating rounding doubling multiply-accumulate high half, 32-bit.
 *
 * @src1, @src2: multiplicands
 * @src3: accumulator
 * @neg: negate the product before accumulating
 * @round: add the rounding constant before the final shift
 * @sat: set to 1 when the result had to be saturated; otherwise untouched
 *
 * Returns the result saturated to the int32_t range.
 */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    /*
     * Scale the accumulator by multiplying: src3 may be negative, and
     * left-shifting a negative signed value is undefined in ISO C.
     */
    ret += (int64_t)src3 * ((int64_t)1 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}
382 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
383 int32_t src2, int32_t src3)
385 uint32_t *sat = &env->vfp.qc[0];
386 return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
389 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
390 void *vq, uint32_t desc)
392 uintptr_t opr_sz = simd_oprsz(desc);
393 int32_t *d = vd;
394 int32_t *n = vn;
395 int32_t *m = vm;
396 uintptr_t i;
398 for (i = 0; i < opr_sz / 4; ++i) {
399 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
401 clear_tail(d, opr_sz, simd_maxsz(desc));
404 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
405 int32_t src2, int32_t src3)
407 uint32_t *sat = &env->vfp.qc[0];
408 return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
411 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
412 void *vq, uint32_t desc)
414 uintptr_t opr_sz = simd_oprsz(desc);
415 int32_t *d = vd;
416 int32_t *n = vn;
417 int32_t *m = vm;
418 uintptr_t i;
420 for (i = 0; i < opr_sz / 4; ++i) {
421 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
423 clear_tail(d, opr_sz, simd_maxsz(desc));
426 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
427 void *vq, uint32_t desc)
429 intptr_t i, opr_sz = simd_oprsz(desc);
430 int32_t *d = vd, *n = vn, *m = vm;
432 for (i = 0; i < opr_sz / 4; ++i) {
433 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
435 clear_tail(d, opr_sz, simd_maxsz(desc));
438 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
439 void *vq, uint32_t desc)
441 intptr_t i, opr_sz = simd_oprsz(desc);
442 int32_t *d = vd, *n = vn, *m = vm;
444 for (i = 0; i < opr_sz / 4; ++i) {
445 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
447 clear_tail(d, opr_sz, simd_maxsz(desc));
450 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
451 void *va, uint32_t desc)
453 intptr_t i, opr_sz = simd_oprsz(desc);
454 int32_t *d = vd, *n = vn, *m = vm, *a = va;
455 uint32_t discard;
457 for (i = 0; i < opr_sz / 4; ++i) {
458 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
462 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
463 void *va, uint32_t desc)
465 intptr_t i, opr_sz = simd_oprsz(desc);
466 int32_t *d = vd, *n = vn, *m = vm, *a = va;
467 uint32_t discard;
469 for (i = 0; i < opr_sz / 4; ++i) {
470 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
474 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
476 intptr_t i, opr_sz = simd_oprsz(desc);
477 int32_t *d = vd, *n = vn, *m = vm;
478 uint32_t discard;
480 for (i = 0; i < opr_sz / 4; ++i) {
481 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
485 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
487 intptr_t i, opr_sz = simd_oprsz(desc);
488 int32_t *d = vd, *n = vn, *m = vm;
489 uint32_t discard;
491 for (i = 0; i < opr_sz / 4; ++i) {
492 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
496 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
498 intptr_t i, j, opr_sz = simd_oprsz(desc);
499 int idx = simd_data(desc);
500 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
501 uint32_t discard;
503 for (i = 0; i < opr_sz / 4; i += 16 / 4) {
504 int32_t mm = m[i];
505 for (j = 0; j < 16 / 4; ++j) {
506 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
511 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
513 intptr_t i, j, opr_sz = simd_oprsz(desc);
514 int idx = simd_data(desc);
515 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
516 uint32_t discard;
518 for (i = 0; i < opr_sz / 4; i += 16 / 4) {
519 int32_t mm = m[i];
520 for (j = 0; j < 16 / 4; ++j) {
521 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
526 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
527 static int64_t do_sat128_d(Int128 r)
529 int64_t ls = int128_getlo(r);
530 int64_t hs = int128_gethi(r);
532 if (unlikely(hs != (ls >> 63))) {
533 return hs < 0 ? INT64_MIN : INT64_MAX;
535 return ls;
538 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
540 uint64_t l, h;
541 Int128 r, t;
543 /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
544 muls64(&l, &h, m, n);
545 r = int128_make128(l, h);
546 if (neg) {
547 r = int128_neg(r);
549 if (a) {
550 t = int128_exts64(a);
551 t = int128_lshift(t, 63);
552 r = int128_add(r, t);
554 if (round) {
555 t = int128_exts64(1ll << 62);
556 r = int128_add(r, t);
558 r = int128_rshift(r, 63);
560 return do_sat128_d(r);
563 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
564 void *va, uint32_t desc)
566 intptr_t i, opr_sz = simd_oprsz(desc);
567 int64_t *d = vd, *n = vn, *m = vm, *a = va;
569 for (i = 0; i < opr_sz / 8; ++i) {
570 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
574 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
575 void *va, uint32_t desc)
577 intptr_t i, opr_sz = simd_oprsz(desc);
578 int64_t *d = vd, *n = vn, *m = vm, *a = va;
580 for (i = 0; i < opr_sz / 8; ++i) {
581 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
585 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
587 intptr_t i, opr_sz = simd_oprsz(desc);
588 int64_t *d = vd, *n = vn, *m = vm;
590 for (i = 0; i < opr_sz / 8; ++i) {
591 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
595 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
597 intptr_t i, opr_sz = simd_oprsz(desc);
598 int64_t *d = vd, *n = vn, *m = vm;
600 for (i = 0; i < opr_sz / 8; ++i) {
601 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
605 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
607 intptr_t i, j, opr_sz = simd_oprsz(desc);
608 int idx = simd_data(desc);
609 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
611 for (i = 0; i < opr_sz / 8; i += 16 / 8) {
612 int64_t mm = m[i];
613 for (j = 0; j < 16 / 8; ++j) {
614 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
619 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
621 intptr_t i, j, opr_sz = simd_oprsz(desc);
622 int idx = simd_data(desc);
623 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
625 for (i = 0; i < opr_sz / 8; i += 16 / 8) {
626 int64_t mm = m[i];
627 for (j = 0; j < 16 / 8; ++j) {
628 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
633 /* Integer 8 and 16-bit dot-product.
635 * Note that for the loops herein, host endianness does not matter
636 * with respect to the ordering of data within the quad-width lanes.
637 * All elements are treated equally, no matter where they are.
640 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
641 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
643 intptr_t i, opr_sz = simd_oprsz(desc); \
644 TYPED *d = vd, *a = va; \
645 TYPEN *n = vn; \
646 TYPEM *m = vm; \
647 for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \
648 d[i] = (a[i] + \
649 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \
650 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \
651 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \
652 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \
654 clear_tail(d, opr_sz, simd_maxsz(desc)); \
657 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
658 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
659 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
660 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
661 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
663 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
664 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
666 intptr_t i = 0, opr_sz = simd_oprsz(desc); \
667 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \
668 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \
669 intptr_t index = simd_data(desc); \
670 TYPED *d = vd, *a = va; \
671 TYPEN *n = vn; \
672 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \
673 do { \
674 TYPED m0 = m_indexed[i * 4 + 0]; \
675 TYPED m1 = m_indexed[i * 4 + 1]; \
676 TYPED m2 = m_indexed[i * 4 + 2]; \
677 TYPED m3 = m_indexed[i * 4 + 3]; \
678 do { \
679 d[i] = (a[i] + \
680 n[i * 4 + 0] * m0 + \
681 n[i * 4 + 1] * m1 + \
682 n[i * 4 + 2] * m2 + \
683 n[i * 4 + 3] * m3); \
684 } while (++i < segend); \
685 segend = i + 4; \
686 } while (i < opr_sz_n); \
687 clear_tail(d, opr_sz, simd_maxsz(desc)); \
690 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
691 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
692 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
693 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
694 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
695 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
697 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
698 void *vfpst, uint32_t desc)
700 uintptr_t opr_sz = simd_oprsz(desc);
701 float16 *d = vd;
702 float16 *n = vn;
703 float16 *m = vm;
704 float_status *fpst = vfpst;
705 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
706 uint32_t neg_imag = neg_real ^ 1;
707 uintptr_t i;
709 /* Shift boolean to the sign bit so we can xor to negate. */
710 neg_real <<= 15;
711 neg_imag <<= 15;
713 for (i = 0; i < opr_sz / 2; i += 2) {
714 float16 e0 = n[H2(i)];
715 float16 e1 = m[H2(i + 1)] ^ neg_imag;
716 float16 e2 = n[H2(i + 1)];
717 float16 e3 = m[H2(i)] ^ neg_real;
719 d[H2(i)] = float16_add(e0, e1, fpst);
720 d[H2(i + 1)] = float16_add(e2, e3, fpst);
722 clear_tail(d, opr_sz, simd_maxsz(desc));
725 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
726 void *vfpst, uint32_t desc)
728 uintptr_t opr_sz = simd_oprsz(desc);
729 float32 *d = vd;
730 float32 *n = vn;
731 float32 *m = vm;
732 float_status *fpst = vfpst;
733 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
734 uint32_t neg_imag = neg_real ^ 1;
735 uintptr_t i;
737 /* Shift boolean to the sign bit so we can xor to negate. */
738 neg_real <<= 31;
739 neg_imag <<= 31;
741 for (i = 0; i < opr_sz / 4; i += 2) {
742 float32 e0 = n[H4(i)];
743 float32 e1 = m[H4(i + 1)] ^ neg_imag;
744 float32 e2 = n[H4(i + 1)];
745 float32 e3 = m[H4(i)] ^ neg_real;
747 d[H4(i)] = float32_add(e0, e1, fpst);
748 d[H4(i + 1)] = float32_add(e2, e3, fpst);
750 clear_tail(d, opr_sz, simd_maxsz(desc));
753 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
754 void *vfpst, uint32_t desc)
756 uintptr_t opr_sz = simd_oprsz(desc);
757 float64 *d = vd;
758 float64 *n = vn;
759 float64 *m = vm;
760 float_status *fpst = vfpst;
761 uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
762 uint64_t neg_imag = neg_real ^ 1;
763 uintptr_t i;
765 /* Shift boolean to the sign bit so we can xor to negate. */
766 neg_real <<= 63;
767 neg_imag <<= 63;
769 for (i = 0; i < opr_sz / 8; i += 2) {
770 float64 e0 = n[i];
771 float64 e1 = m[i + 1] ^ neg_imag;
772 float64 e2 = n[i + 1];
773 float64 e3 = m[i] ^ neg_real;
775 d[i] = float64_add(e0, e1, fpst);
776 d[i + 1] = float64_add(e2, e3, fpst);
778 clear_tail(d, opr_sz, simd_maxsz(desc));
781 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
782 void *vfpst, uint32_t desc)
784 uintptr_t opr_sz = simd_oprsz(desc);
785 float16 *d = vd, *n = vn, *m = vm, *a = va;
786 float_status *fpst = vfpst;
787 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
788 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
789 uint32_t neg_real = flip ^ neg_imag;
790 uintptr_t i;
792 /* Shift boolean to the sign bit so we can xor to negate. */
793 neg_real <<= 15;
794 neg_imag <<= 15;
796 for (i = 0; i < opr_sz / 2; i += 2) {
797 float16 e2 = n[H2(i + flip)];
798 float16 e1 = m[H2(i + flip)] ^ neg_real;
799 float16 e4 = e2;
800 float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
802 d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
803 d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
805 clear_tail(d, opr_sz, simd_maxsz(desc));
808 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
809 void *vfpst, uint32_t desc)
811 uintptr_t opr_sz = simd_oprsz(desc);
812 float16 *d = vd, *n = vn, *m = vm, *a = va;
813 float_status *fpst = vfpst;
814 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
815 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
816 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
817 uint32_t neg_real = flip ^ neg_imag;
818 intptr_t elements = opr_sz / sizeof(float16);
819 intptr_t eltspersegment = 16 / sizeof(float16);
820 intptr_t i, j;
822 /* Shift boolean to the sign bit so we can xor to negate. */
823 neg_real <<= 15;
824 neg_imag <<= 15;
826 for (i = 0; i < elements; i += eltspersegment) {
827 float16 mr = m[H2(i + 2 * index + 0)];
828 float16 mi = m[H2(i + 2 * index + 1)];
829 float16 e1 = neg_real ^ (flip ? mi : mr);
830 float16 e3 = neg_imag ^ (flip ? mr : mi);
832 for (j = i; j < i + eltspersegment; j += 2) {
833 float16 e2 = n[H2(j + flip)];
834 float16 e4 = e2;
836 d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
837 d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
840 clear_tail(d, opr_sz, simd_maxsz(desc));
843 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
844 void *vfpst, uint32_t desc)
846 uintptr_t opr_sz = simd_oprsz(desc);
847 float32 *d = vd, *n = vn, *m = vm, *a = va;
848 float_status *fpst = vfpst;
849 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
850 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
851 uint32_t neg_real = flip ^ neg_imag;
852 uintptr_t i;
854 /* Shift boolean to the sign bit so we can xor to negate. */
855 neg_real <<= 31;
856 neg_imag <<= 31;
858 for (i = 0; i < opr_sz / 4; i += 2) {
859 float32 e2 = n[H4(i + flip)];
860 float32 e1 = m[H4(i + flip)] ^ neg_real;
861 float32 e4 = e2;
862 float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
864 d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
865 d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
867 clear_tail(d, opr_sz, simd_maxsz(desc));
870 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
871 void *vfpst, uint32_t desc)
873 uintptr_t opr_sz = simd_oprsz(desc);
874 float32 *d = vd, *n = vn, *m = vm, *a = va;
875 float_status *fpst = vfpst;
876 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
877 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
878 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
879 uint32_t neg_real = flip ^ neg_imag;
880 intptr_t elements = opr_sz / sizeof(float32);
881 intptr_t eltspersegment = 16 / sizeof(float32);
882 intptr_t i, j;
884 /* Shift boolean to the sign bit so we can xor to negate. */
885 neg_real <<= 31;
886 neg_imag <<= 31;
888 for (i = 0; i < elements; i += eltspersegment) {
889 float32 mr = m[H4(i + 2 * index + 0)];
890 float32 mi = m[H4(i + 2 * index + 1)];
891 float32 e1 = neg_real ^ (flip ? mi : mr);
892 float32 e3 = neg_imag ^ (flip ? mr : mi);
894 for (j = i; j < i + eltspersegment; j += 2) {
895 float32 e2 = n[H4(j + flip)];
896 float32 e4 = e2;
898 d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
899 d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
902 clear_tail(d, opr_sz, simd_maxsz(desc));
905 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
906 void *vfpst, uint32_t desc)
908 uintptr_t opr_sz = simd_oprsz(desc);
909 float64 *d = vd, *n = vn, *m = vm, *a = va;
910 float_status *fpst = vfpst;
911 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
912 uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
913 uint64_t neg_real = flip ^ neg_imag;
914 uintptr_t i;
916 /* Shift boolean to the sign bit so we can xor to negate. */
917 neg_real <<= 63;
918 neg_imag <<= 63;
920 for (i = 0; i < opr_sz / 8; i += 2) {
921 float64 e2 = n[i + flip];
922 float64 e1 = m[i + flip] ^ neg_real;
923 float64 e4 = e2;
924 float64 e3 = m[i + 1 - flip] ^ neg_imag;
926 d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
927 d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
929 clear_tail(d, opr_sz, simd_maxsz(desc));
933 * Floating point comparisons producing an integer result (all 1s or all 0s).
934 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
935 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
937 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
939 return -float16_eq_quiet(op1, op2, stat);
942 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
944 return -float32_eq_quiet(op1, op2, stat);
947 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
949 return -float16_le(op2, op1, stat);
952 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
954 return -float32_le(op2, op1, stat);
957 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
959 return -float16_lt(op2, op1, stat);
962 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
964 return -float32_lt(op2, op1, stat);
967 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
969 return -float16_le(float16_abs(op2), float16_abs(op1), stat);
972 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
974 return -float32_le(float32_abs(op2), float32_abs(op1), stat);
977 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
979 return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
982 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
984 return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
987 static int16_t vfp_tosszh(float16 x, void *fpstp)
989 float_status *fpst = fpstp;
990 if (float16_is_any_nan(x)) {
991 float_raise(float_flag_invalid, fpst);
992 return 0;
994 return float16_to_int16_round_to_zero(x, fpst);
997 static uint16_t vfp_touszh(float16 x, void *fpstp)
999 float_status *fpst = fpstp;
1000 if (float16_is_any_nan(x)) {
1001 float_raise(float_flag_invalid, fpst);
1002 return 0;
1004 return float16_to_uint16_round_to_zero(x, fpst);
1007 #define DO_2OP(NAME, FUNC, TYPE) \
1008 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
1010 intptr_t i, oprsz = simd_oprsz(desc); \
1011 TYPE *d = vd, *n = vn; \
1012 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1013 d[i] = FUNC(n[i], stat); \
1015 clear_tail(d, oprsz, simd_maxsz(desc)); \
1018 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1019 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1020 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1022 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1023 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1024 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1026 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1027 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1029 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1030 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1031 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1032 DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1033 DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1034 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1035 DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1036 DO_2OP(gvec_touszh, vfp_touszh, float16)
1038 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
1039 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
1041 return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
1044 #define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
1045 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
1047 return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
1050 #define DO_2OP_CMP0(FN, CMPOP, DIRN) \
1051 WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
1052 WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
1053 DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
1054 DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
1056 DO_2OP_CMP0(cgt, cgt, FWD)
1057 DO_2OP_CMP0(cge, cge, FWD)
1058 DO_2OP_CMP0(ceq, ceq, FWD)
1059 DO_2OP_CMP0(clt, cgt, REV)
1060 DO_2OP_CMP0(cle, cge, REV)
1062 #undef DO_2OP
1063 #undef DO_2OP_CMP0
1065 /* Floating-point trigonometric starting value.
1066 * See the ARM ARM pseudocode function FPTrigSMul.
1068 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1070 float16 result = float16_mul(op1, op1, stat);
1071 if (!float16_is_any_nan(result)) {
1072 result = float16_set_sign(result, op2 & 1);
1074 return result;
1077 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1079 float32 result = float32_mul(op1, op1, stat);
1080 if (!float32_is_any_nan(result)) {
1081 result = float32_set_sign(result, op2 & 1);
1083 return result;
1086 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1088 float64 result = float64_mul(op1, op1, stat);
1089 if (!float64_is_any_nan(result)) {
1090 result = float64_set_sign(result, op2 & 1);
1092 return result;
1095 static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1097 return float16_abs(float16_sub(op1, op2, stat));
1100 static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1102 return float32_abs(float32_sub(op1, op2, stat));
1106 * Reciprocal step. These are the AArch32 version which uses a
1107 * non-fused multiply-and-subtract.
1109 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1111 op1 = float16_squash_input_denormal(op1, stat);
1112 op2 = float16_squash_input_denormal(op2, stat);
1114 if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1115 (float16_is_infinity(op2) && float16_is_zero(op1))) {
1116 return float16_two;
1118 return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1121 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1123 op1 = float32_squash_input_denormal(op1, stat);
1124 op2 = float32_squash_input_denormal(op2, stat);
1126 if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1127 (float32_is_infinity(op2) && float32_is_zero(op1))) {
1128 return float32_two;
1130 return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1133 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1134 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1136 op1 = float16_squash_input_denormal(op1, stat);
1137 op2 = float16_squash_input_denormal(op2, stat);
1139 if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1140 (float16_is_infinity(op2) && float16_is_zero(op1))) {
1141 return float16_one_point_five;
1143 op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1144 return float16_div(op1, float16_two, stat);
1147 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1149 op1 = float32_squash_input_denormal(op1, stat);
1150 op2 = float32_squash_input_denormal(op2, stat);
1152 if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1153 (float32_is_infinity(op2) && float32_is_zero(op1))) {
1154 return float32_one_point_five;
1156 op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1157 return float32_div(op1, float32_two, stat);
1160 #define DO_3OP(NAME, FUNC, TYPE) \
1161 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1163 intptr_t i, oprsz = simd_oprsz(desc); \
1164 TYPE *d = vd, *n = vn, *m = vm; \
1165 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1166 d[i] = FUNC(n[i], m[i], stat); \
1168 clear_tail(d, oprsz, simd_maxsz(desc)); \
1171 DO_3OP(gvec_fadd_h, float16_add, float16)
1172 DO_3OP(gvec_fadd_s, float32_add, float32)
1173 DO_3OP(gvec_fadd_d, float64_add, float64)
1175 DO_3OP(gvec_fsub_h, float16_sub, float16)
1176 DO_3OP(gvec_fsub_s, float32_sub, float32)
1177 DO_3OP(gvec_fsub_d, float64_sub, float64)
1179 DO_3OP(gvec_fmul_h, float16_mul, float16)
1180 DO_3OP(gvec_fmul_s, float32_mul, float32)
1181 DO_3OP(gvec_fmul_d, float64_mul, float64)
1183 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1184 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1185 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1187 DO_3OP(gvec_fabd_h, float16_abd, float16)
1188 DO_3OP(gvec_fabd_s, float32_abd, float32)
1190 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1191 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1193 DO_3OP(gvec_fcge_h, float16_cge, float16)
1194 DO_3OP(gvec_fcge_s, float32_cge, float32)
1196 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1197 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1199 DO_3OP(gvec_facge_h, float16_acge, float16)
1200 DO_3OP(gvec_facge_s, float32_acge, float32)
1202 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1203 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1205 DO_3OP(gvec_fmax_h, float16_max, float16)
1206 DO_3OP(gvec_fmax_s, float32_max, float32)
1208 DO_3OP(gvec_fmin_h, float16_min, float16)
1209 DO_3OP(gvec_fmin_s, float32_min, float32)
1211 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1212 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1214 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1215 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1217 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1218 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1220 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1221 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1223 #ifdef TARGET_AARCH64
1225 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1226 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1227 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1229 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1230 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1231 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1233 #endif
1234 #undef DO_3OP
1236 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1237 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1238 float_status *stat)
1240 return float16_add(dest, float16_mul(op1, op2, stat), stat);
1243 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1244 float_status *stat)
1246 return float32_add(dest, float32_mul(op1, op2, stat), stat);
1249 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1250 float_status *stat)
1252 return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1255 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1256 float_status *stat)
1258 return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1261 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1262 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1263 float_status *stat)
1265 return float16_muladd(op1, op2, dest, 0, stat);
1268 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1269 float_status *stat)
1271 return float32_muladd(op1, op2, dest, 0, stat);
1274 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1275 float_status *stat)
1277 return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1280 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1281 float_status *stat)
1283 return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1286 #define DO_MULADD(NAME, FUNC, TYPE) \
1287 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1289 intptr_t i, oprsz = simd_oprsz(desc); \
1290 TYPE *d = vd, *n = vn, *m = vm; \
1291 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1292 d[i] = FUNC(d[i], n[i], m[i], stat); \
1294 clear_tail(d, oprsz, simd_maxsz(desc)); \
1297 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1298 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1300 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1301 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1303 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1304 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1306 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1307 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1309 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1310 * For AdvSIMD, there is of course only one such vector segment.
1313 #define DO_MUL_IDX(NAME, TYPE, H) \
1314 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1316 intptr_t i, j, oprsz = simd_oprsz(desc); \
1317 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1318 intptr_t idx = simd_data(desc); \
1319 TYPE *d = vd, *n = vn, *m = vm; \
1320 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1321 TYPE mm = m[H(i + idx)]; \
1322 for (j = 0; j < segment; j++) { \
1323 d[i + j] = n[i + j] * mm; \
1326 clear_tail(d, oprsz, simd_maxsz(desc)); \
1329 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1330 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1331 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1333 #undef DO_MUL_IDX
1335 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1336 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1338 intptr_t i, j, oprsz = simd_oprsz(desc); \
1339 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1340 intptr_t idx = simd_data(desc); \
1341 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1342 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1343 TYPE mm = m[H(i + idx)]; \
1344 for (j = 0; j < segment; j++) { \
1345 d[i + j] = a[i + j] OP n[i + j] * mm; \
1348 clear_tail(d, oprsz, simd_maxsz(desc)); \
1351 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1352 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1353 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1355 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1356 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1357 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1359 #undef DO_MLA_IDX
1361 #define DO_FMUL_IDX(NAME, ADD, TYPE, H) \
1362 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1364 intptr_t i, j, oprsz = simd_oprsz(desc); \
1365 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1366 intptr_t idx = simd_data(desc); \
1367 TYPE *d = vd, *n = vn, *m = vm; \
1368 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1369 TYPE mm = m[H(i + idx)]; \
1370 for (j = 0; j < segment; j++) { \
1371 d[i + j] = TYPE##_##ADD(d[i + j], \
1372 TYPE##_mul(n[i + j], mm, stat), stat); \
1375 clear_tail(d, oprsz, simd_maxsz(desc)); \
1378 #define float16_nop(N, M, S) (M)
1379 #define float32_nop(N, M, S) (M)
1380 #define float64_nop(N, M, S) (M)
1382 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2)
1383 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4)
1384 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8)
1387 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1388 * the fused ops below they assume accumulate both from and into Vd.
1390 DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2)
1391 DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4)
1392 DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2)
1393 DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4)
1395 #undef float16_nop
1396 #undef float32_nop
1397 #undef float64_nop
1398 #undef DO_FMUL_IDX
1400 #define DO_FMLA_IDX(NAME, TYPE, H) \
1401 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
1402 void *stat, uint32_t desc) \
1404 intptr_t i, j, oprsz = simd_oprsz(desc); \
1405 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1406 TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \
1407 intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \
1408 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1409 op1_neg <<= (8 * sizeof(TYPE) - 1); \
1410 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1411 TYPE mm = m[H(i + idx)]; \
1412 for (j = 0; j < segment; j++) { \
1413 d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \
1414 mm, a[i + j], 0, stat); \
1417 clear_tail(d, oprsz, simd_maxsz(desc)); \
1420 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1421 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1422 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1424 #undef DO_FMLA_IDX
1426 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1427 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \
1429 intptr_t i, oprsz = simd_oprsz(desc); \
1430 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \
1431 bool q = false; \
1432 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \
1433 WTYPE dd = (WTYPE)n[i] OP m[i]; \
1434 if (dd < MIN) { \
1435 dd = MIN; \
1436 q = true; \
1437 } else if (dd > MAX) { \
1438 dd = MAX; \
1439 q = true; \
1441 d[i] = dd; \
1443 if (q) { \
1444 uint32_t *qc = vq; \
1445 qc[0] = 1; \
1447 clear_tail(d, oprsz, simd_maxsz(desc)); \
1450 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1451 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1452 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1454 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1455 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1456 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1458 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1459 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1460 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1462 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1463 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1464 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1466 #undef DO_SAT
1468 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1469 void *vm, uint32_t desc)
1471 intptr_t i, oprsz = simd_oprsz(desc);
1472 uint64_t *d = vd, *n = vn, *m = vm;
1473 bool q = false;
1475 for (i = 0; i < oprsz / 8; i++) {
1476 uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1477 if (dd < nn) {
1478 dd = UINT64_MAX;
1479 q = true;
1481 d[i] = dd;
1483 if (q) {
1484 uint32_t *qc = vq;
1485 qc[0] = 1;
1487 clear_tail(d, oprsz, simd_maxsz(desc));
1490 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1491 void *vm, uint32_t desc)
1493 intptr_t i, oprsz = simd_oprsz(desc);
1494 uint64_t *d = vd, *n = vn, *m = vm;
1495 bool q = false;
1497 for (i = 0; i < oprsz / 8; i++) {
1498 uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1499 if (nn < mm) {
1500 dd = 0;
1501 q = true;
1503 d[i] = dd;
1505 if (q) {
1506 uint32_t *qc = vq;
1507 qc[0] = 1;
1509 clear_tail(d, oprsz, simd_maxsz(desc));
1512 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1513 void *vm, uint32_t desc)
1515 intptr_t i, oprsz = simd_oprsz(desc);
1516 int64_t *d = vd, *n = vn, *m = vm;
1517 bool q = false;
1519 for (i = 0; i < oprsz / 8; i++) {
1520 int64_t nn = n[i], mm = m[i], dd = nn + mm;
1521 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1522 dd = (nn >> 63) ^ ~INT64_MIN;
1523 q = true;
1525 d[i] = dd;
1527 if (q) {
1528 uint32_t *qc = vq;
1529 qc[0] = 1;
1531 clear_tail(d, oprsz, simd_maxsz(desc));
1534 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1535 void *vm, uint32_t desc)
1537 intptr_t i, oprsz = simd_oprsz(desc);
1538 int64_t *d = vd, *n = vn, *m = vm;
1539 bool q = false;
1541 for (i = 0; i < oprsz / 8; i++) {
1542 int64_t nn = n[i], mm = m[i], dd = nn - mm;
1543 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1544 dd = (nn >> 63) ^ ~INT64_MIN;
1545 q = true;
1547 d[i] = dd;
1549 if (q) {
1550 uint32_t *qc = vq;
1551 qc[0] = 1;
1553 clear_tail(d, oprsz, simd_maxsz(desc));
1557 #define DO_SRA(NAME, TYPE) \
1558 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1560 intptr_t i, oprsz = simd_oprsz(desc); \
1561 int shift = simd_data(desc); \
1562 TYPE *d = vd, *n = vn; \
1563 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1564 d[i] += n[i] >> shift; \
1566 clear_tail(d, oprsz, simd_maxsz(desc)); \
1569 DO_SRA(gvec_ssra_b, int8_t)
1570 DO_SRA(gvec_ssra_h, int16_t)
1571 DO_SRA(gvec_ssra_s, int32_t)
1572 DO_SRA(gvec_ssra_d, int64_t)
1574 DO_SRA(gvec_usra_b, uint8_t)
1575 DO_SRA(gvec_usra_h, uint16_t)
1576 DO_SRA(gvec_usra_s, uint32_t)
1577 DO_SRA(gvec_usra_d, uint64_t)
1579 #undef DO_SRA
1581 #define DO_RSHR(NAME, TYPE) \
1582 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1584 intptr_t i, oprsz = simd_oprsz(desc); \
1585 int shift = simd_data(desc); \
1586 TYPE *d = vd, *n = vn; \
1587 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1588 TYPE tmp = n[i] >> (shift - 1); \
1589 d[i] = (tmp >> 1) + (tmp & 1); \
1591 clear_tail(d, oprsz, simd_maxsz(desc)); \
1594 DO_RSHR(gvec_srshr_b, int8_t)
1595 DO_RSHR(gvec_srshr_h, int16_t)
1596 DO_RSHR(gvec_srshr_s, int32_t)
1597 DO_RSHR(gvec_srshr_d, int64_t)
1599 DO_RSHR(gvec_urshr_b, uint8_t)
1600 DO_RSHR(gvec_urshr_h, uint16_t)
1601 DO_RSHR(gvec_urshr_s, uint32_t)
1602 DO_RSHR(gvec_urshr_d, uint64_t)
1604 #undef DO_RSHR
1606 #define DO_RSRA(NAME, TYPE) \
1607 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1609 intptr_t i, oprsz = simd_oprsz(desc); \
1610 int shift = simd_data(desc); \
1611 TYPE *d = vd, *n = vn; \
1612 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1613 TYPE tmp = n[i] >> (shift - 1); \
1614 d[i] += (tmp >> 1) + (tmp & 1); \
1616 clear_tail(d, oprsz, simd_maxsz(desc)); \
1619 DO_RSRA(gvec_srsra_b, int8_t)
1620 DO_RSRA(gvec_srsra_h, int16_t)
1621 DO_RSRA(gvec_srsra_s, int32_t)
1622 DO_RSRA(gvec_srsra_d, int64_t)
1624 DO_RSRA(gvec_ursra_b, uint8_t)
1625 DO_RSRA(gvec_ursra_h, uint16_t)
1626 DO_RSRA(gvec_ursra_s, uint32_t)
1627 DO_RSRA(gvec_ursra_d, uint64_t)
1629 #undef DO_RSRA
1631 #define DO_SRI(NAME, TYPE) \
1632 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1634 intptr_t i, oprsz = simd_oprsz(desc); \
1635 int shift = simd_data(desc); \
1636 TYPE *d = vd, *n = vn; \
1637 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1638 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1640 clear_tail(d, oprsz, simd_maxsz(desc)); \
1643 DO_SRI(gvec_sri_b, uint8_t)
1644 DO_SRI(gvec_sri_h, uint16_t)
1645 DO_SRI(gvec_sri_s, uint32_t)
1646 DO_SRI(gvec_sri_d, uint64_t)
1648 #undef DO_SRI
1650 #define DO_SLI(NAME, TYPE) \
1651 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1653 intptr_t i, oprsz = simd_oprsz(desc); \
1654 int shift = simd_data(desc); \
1655 TYPE *d = vd, *n = vn; \
1656 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1657 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1659 clear_tail(d, oprsz, simd_maxsz(desc)); \
1662 DO_SLI(gvec_sli_b, uint8_t)
1663 DO_SLI(gvec_sli_h, uint16_t)
1664 DO_SLI(gvec_sli_s, uint32_t)
1665 DO_SLI(gvec_sli_d, uint64_t)
1667 #undef DO_SLI
1670 * Convert float16 to float32, raising no exceptions and
1671 * preserving exceptional values, including SNaN.
1672 * This is effectively an unpack+repack operation.
1674 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1676 const int f16_bias = 15;
1677 const int f32_bias = 127;
1678 uint32_t sign = extract32(f16, 15, 1);
1679 uint32_t exp = extract32(f16, 10, 5);
1680 uint32_t frac = extract32(f16, 0, 10);
1682 if (exp == 0x1f) {
1683 /* Inf or NaN */
1684 exp = 0xff;
1685 } else if (exp == 0) {
1686 /* Zero or denormal. */
1687 if (frac != 0) {
1688 if (fz16) {
1689 frac = 0;
1690 } else {
1692 * Denormal; these are all normal float32.
1693 * Shift the fraction so that the msb is at bit 11,
1694 * then remove bit 11 as the implicit bit of the
1695 * normalized float32. Note that we still go through
1696 * the shift for normal numbers below, to put the
1697 * float32 fraction at the right place.
1699 int shift = clz32(frac) - 21;
1700 frac = (frac << shift) & 0x3ff;
1701 exp = f32_bias - f16_bias - shift + 1;
1704 } else {
1705 /* Normal number; adjust the bias. */
1706 exp += f32_bias - f16_bias;
1708 sign <<= 31;
1709 exp <<= 23;
1710 frac <<= 23 - 10;
1712 return sign | exp | frac;
/*
 * Branchless load of u32[0], u64[0], u32[1], or u64[1] from @ptr.
 *
 * @is_q and @is_2 must each be 0 or 1:
 *   - the 2nd qword is loaded iff is_q & is_2;
 *   - the loaded qword is shifted down to its 2nd dword iff !is_q & is_2.
 * For !is_q & !is_2 the upper 32 bits of the result are whatever
 * follows the wanted dword in memory, i.e. garbage to the caller.
 */
static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    int qword = is_q & is_2;
    int shift = (is_2 & ~is_q) << 5;    /* 0 or 32 */

    return ptr[qword] >> shift;
}
1727 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
1728 * as there is not yet SVE versions that might use blocking.
1731 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
1732 uint32_t desc, bool fz16)
1734 intptr_t i, oprsz = simd_oprsz(desc);
1735 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1736 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1737 int is_q = oprsz == 16;
1738 uint64_t n_4, m_4;
1740 /* Pre-load all of the f16 data, avoiding overlap issues. */
1741 n_4 = load4_f16(vn, is_q, is_2);
1742 m_4 = load4_f16(vm, is_q, is_2);
1744 /* Negate all inputs for FMLSL at once. */
1745 if (is_s) {
1746 n_4 ^= 0x8000800080008000ull;
1749 for (i = 0; i < oprsz / 4; i++) {
1750 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1751 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
1752 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1754 clear_tail(d, oprsz, simd_maxsz(desc));
1757 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
1758 void *venv, uint32_t desc)
1760 CPUARMState *env = venv;
1761 do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1762 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1765 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
1766 void *venv, uint32_t desc)
1768 CPUARMState *env = venv;
1769 do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
1770 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1773 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
1774 void *venv, uint32_t desc)
1776 intptr_t i, oprsz = simd_oprsz(desc);
1777 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1778 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1779 CPUARMState *env = venv;
1780 float_status *status = &env->vfp.fp_status;
1781 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1783 for (i = 0; i < oprsz; i += sizeof(float32)) {
1784 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
1785 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
1786 float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1787 float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1788 float32 aa = *(float32 *)(va + H1_4(i));
1790 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
1794 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
1795 uint32_t desc, bool fz16)
1797 intptr_t i, oprsz = simd_oprsz(desc);
1798 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1799 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1800 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
1801 int is_q = oprsz == 16;
1802 uint64_t n_4;
1803 float32 m_1;
1805 /* Pre-load all of the f16 data, avoiding overlap issues. */
1806 n_4 = load4_f16(vn, is_q, is_2);
1808 /* Negate all inputs for FMLSL at once. */
1809 if (is_s) {
1810 n_4 ^= 0x8000800080008000ull;
1813 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
1815 for (i = 0; i < oprsz / 4; i++) {
1816 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1817 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1819 clear_tail(d, oprsz, simd_maxsz(desc));
1822 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
1823 void *venv, uint32_t desc)
1825 CPUARMState *env = venv;
1826 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1827 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1830 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
1831 void *venv, uint32_t desc)
1833 CPUARMState *env = venv;
1834 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
1835 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1838 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
1839 void *venv, uint32_t desc)
1841 intptr_t i, j, oprsz = simd_oprsz(desc);
1842 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1843 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1844 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
1845 CPUARMState *env = venv;
1846 float_status *status = &env->vfp.fp_status;
1847 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1849 for (i = 0; i < oprsz; i += 16) {
1850 float16 mm_16 = *(float16 *)(vm + i + idx);
1851 float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1853 for (j = 0; j < 16; j += sizeof(float32)) {
1854 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
1855 float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1856 float32 aa = *(float32 *)(va + H1_4(i + j));
1858 *(float32 *)(vd + H1_4(i + j)) =
1859 float32_muladd(nn, mm, aa, 0, status);
1864 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1866 intptr_t i, opr_sz = simd_oprsz(desc);
1867 int8_t *d = vd, *n = vn, *m = vm;
1869 for (i = 0; i < opr_sz; ++i) {
1870 int8_t mm = m[i];
1871 int8_t nn = n[i];
1872 int8_t res = 0;
1873 if (mm >= 0) {
1874 if (mm < 8) {
1875 res = nn << mm;
1877 } else {
1878 res = nn >> (mm > -8 ? -mm : 7);
1880 d[i] = res;
1882 clear_tail(d, opr_sz, simd_maxsz(desc));
1885 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1887 intptr_t i, opr_sz = simd_oprsz(desc);
1888 int16_t *d = vd, *n = vn, *m = vm;
1890 for (i = 0; i < opr_sz / 2; ++i) {
1891 int8_t mm = m[i]; /* only 8 bits of shift are significant */
1892 int16_t nn = n[i];
1893 int16_t res = 0;
1894 if (mm >= 0) {
1895 if (mm < 16) {
1896 res = nn << mm;
1898 } else {
1899 res = nn >> (mm > -16 ? -mm : 15);
1901 d[i] = res;
1903 clear_tail(d, opr_sz, simd_maxsz(desc));
1906 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1908 intptr_t i, opr_sz = simd_oprsz(desc);
1909 uint8_t *d = vd, *n = vn, *m = vm;
1911 for (i = 0; i < opr_sz; ++i) {
1912 int8_t mm = m[i];
1913 uint8_t nn = n[i];
1914 uint8_t res = 0;
1915 if (mm >= 0) {
1916 if (mm < 8) {
1917 res = nn << mm;
1919 } else {
1920 if (mm > -8) {
1921 res = nn >> -mm;
1924 d[i] = res;
1926 clear_tail(d, opr_sz, simd_maxsz(desc));
1929 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1931 intptr_t i, opr_sz = simd_oprsz(desc);
1932 uint16_t *d = vd, *n = vn, *m = vm;
1934 for (i = 0; i < opr_sz / 2; ++i) {
1935 int8_t mm = m[i]; /* only 8 bits of shift are significant */
1936 uint16_t nn = n[i];
1937 uint16_t res = 0;
1938 if (mm >= 0) {
1939 if (mm < 16) {
1940 res = nn << mm;
1942 } else {
1943 if (mm > -16) {
1944 res = nn >> -mm;
1947 d[i] = res;
1949 clear_tail(d, opr_sz, simd_maxsz(desc));
1953 * 8x8->8 polynomial multiply.
1955 * Polynomial multiplication is like integer multiplication except the
1956 * partial products are XORed, not added.
1958 * TODO: expose this as a generic vector operation, as it is a common
1959 * crypto building block.
1961 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
1963 intptr_t i, j, opr_sz = simd_oprsz(desc);
1964 uint64_t *d = vd, *n = vn, *m = vm;
1966 for (i = 0; i < opr_sz / 8; ++i) {
1967 uint64_t nn = n[i];
1968 uint64_t mm = m[i];
1969 uint64_t rr = 0;
1971 for (j = 0; j < 8; ++j) {
1972 uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
1973 rr ^= mm & mask;
1974 mm = (mm << 1) & 0xfefefefefefefefeull;
1975 nn >>= 1;
1977 d[i] = rr;
1979 clear_tail(d, opr_sz, simd_maxsz(desc));
1983 * 64x64->128 polynomial multiply.
1984 * Because of the lanes are not accessed in strict columns,
1985 * this probably cannot be turned into a generic helper.
1987 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
1989 intptr_t i, j, opr_sz = simd_oprsz(desc);
1990 intptr_t hi = simd_data(desc);
1991 uint64_t *d = vd, *n = vn, *m = vm;
1993 for (i = 0; i < opr_sz / 8; i += 2) {
1994 uint64_t nn = n[i + hi];
1995 uint64_t mm = m[i + hi];
1996 uint64_t rhi = 0;
1997 uint64_t rlo = 0;
1999 /* Bit 0 can only influence the low 64-bit result. */
2000 if (nn & 1) {
2001 rlo = mm;
2004 for (j = 1; j < 64; ++j) {
2005 uint64_t mask = -((nn >> j) & 1);
2006 rlo ^= (mm << j) & mask;
2007 rhi ^= (mm >> (64 - j)) & mask;
2009 d[i] = rlo;
2010 d[i + 1] = rhi;
2012 clear_tail(d, opr_sz, simd_maxsz(desc));
/*
 * 8x8->16 polynomial multiply.
 *
 * The byte inputs are expanded to (or extracted from) half-words.
 * Note that neon and sve2 get the inputs from different positions.
 * This allows 4 bytes to be processed in parallel with uint64_t.
 */

/*
 * Spread the four low bytes of @x into the low byte of each of the
 * four 16-bit lanes of the result.  Bits [63:32] of @x are ignored.
 */
static uint64_t expand_byte_to_half(uint64_t x)
{
    return  (x & 0x000000ff)
         | ((x & 0x0000ff00) << 8)
         | ((x & 0x00ff0000) << 16)
         | ((x & 0xff000000) << 24);
}
/*
 * Two parallel 16x16->32 polynomial multiplies.
 *
 * Each 32-bit lane of @op1 and @op2 is expected to hold a 16-bit value
 * in its low half.  On each of the 16 steps, bit 0 of every op1 lane is
 * expanded into a full-lane mask that gates the current (shifted) op2
 * lane into the XOR accumulator.
 */
uint64_t pmull_w(uint64_t op1, uint64_t op2)
{
    uint64_t result = 0;
    int i;

    for (i = 0; i < 16; ++i) {
        uint64_t mask = (op1 & 0x0000000100000001ull) * 0xffffffff;
        result ^= op2 & mask;
        op1 >>= 1;
        op2 <<= 1;
    }
    return result;
}
/*
 * Four parallel 8x8->16 polynomial multiplies.
 *
 * Each 16-bit lane of @op1 and @op2 is expected to hold an 8-bit value
 * in its low byte.  On each of the 8 steps, bit 0 of every op1 lane is
 * expanded into a full-lane mask that gates the current (shifted) op2
 * lane into the XOR accumulator.
 */
uint64_t pmull_h(uint64_t op1, uint64_t op2)
{
    uint64_t result = 0;
    int i;

    for (i = 0; i < 8; ++i) {
        uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
        result ^= op2 & mask;
        op1 >>= 1;
        op2 <<= 1;
    }
    return result;
}
2057 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2059 int hi = simd_data(desc);
2060 uint64_t *d = vd, *n = vn, *m = vm;
2061 uint64_t nn = n[hi], mm = m[hi];
2063 d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
2064 nn >>= 32;
2065 mm >>= 32;
2066 d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
2068 clear_tail(d, 16, simd_maxsz(desc));
2071 #ifdef TARGET_AARCH64
2072 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2074 int shift = simd_data(desc) * 8;
2075 intptr_t i, opr_sz = simd_oprsz(desc);
2076 uint64_t *d = vd, *n = vn, *m = vm;
2078 for (i = 0; i < opr_sz / 8; ++i) {
2079 uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
2080 uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
2082 d[i] = pmull_h(nn, mm);
/*
 * 32x32->64 polynomial multiply.
 *
 * Only bits [31:0] of @op1 are examined.  For each set bit i, @op2
 * shifted left by i is XORed into the result; @op2 is expected to be
 * a 32-bit value so that no partial product overflows 64 bits.
 */
static uint64_t pmull_d(uint64_t op1, uint64_t op2)
{
    uint64_t result = 0;
    int i;

    for (i = 0; i < 32; ++i) {
        uint64_t mask = -((op1 >> i) & 1);
        result ^= (op2 << i) & mask;
    }
    return result;
}
2098 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2100 intptr_t sel = H4(simd_data(desc));
2101 intptr_t i, opr_sz = simd_oprsz(desc);
2102 uint32_t *n = vn, *m = vm;
2103 uint64_t *d = vd;
2105 for (i = 0; i < opr_sz / 8; ++i) {
2106 d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]);
2109 #endif
2111 #define DO_CMP0(NAME, TYPE, OP) \
2112 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2114 intptr_t i, opr_sz = simd_oprsz(desc); \
2115 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
2116 TYPE nn = *(TYPE *)(vn + i); \
2117 *(TYPE *)(vd + i) = -(nn OP 0); \
2119 clear_tail(vd, opr_sz, simd_maxsz(desc)); \
2122 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2123 DO_CMP0(gvec_clt0_b, int8_t, <)
2124 DO_CMP0(gvec_cle0_b, int8_t, <=)
2125 DO_CMP0(gvec_cgt0_b, int8_t, >)
2126 DO_CMP0(gvec_cge0_b, int8_t, >=)
2128 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2129 DO_CMP0(gvec_clt0_h, int16_t, <)
2130 DO_CMP0(gvec_cle0_h, int16_t, <=)
2131 DO_CMP0(gvec_cgt0_h, int16_t, >)
2132 DO_CMP0(gvec_cge0_h, int16_t, >=)
2134 #undef DO_CMP0
2136 #define DO_ABD(NAME, TYPE) \
2137 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2139 intptr_t i, opr_sz = simd_oprsz(desc); \
2140 TYPE *d = vd, *n = vn, *m = vm; \
2142 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
2143 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2145 clear_tail(d, opr_sz, simd_maxsz(desc)); \
2148 DO_ABD(gvec_sabd_b, int8_t)
2149 DO_ABD(gvec_sabd_h, int16_t)
2150 DO_ABD(gvec_sabd_s, int32_t)
2151 DO_ABD(gvec_sabd_d, int64_t)
2153 DO_ABD(gvec_uabd_b, uint8_t)
2154 DO_ABD(gvec_uabd_h, uint16_t)
2155 DO_ABD(gvec_uabd_s, uint32_t)
2156 DO_ABD(gvec_uabd_d, uint64_t)
2158 #undef DO_ABD
2160 #define DO_ABA(NAME, TYPE) \
2161 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2163 intptr_t i, opr_sz = simd_oprsz(desc); \
2164 TYPE *d = vd, *n = vn, *m = vm; \
2166 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
2167 d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2169 clear_tail(d, opr_sz, simd_maxsz(desc)); \
2172 DO_ABA(gvec_saba_b, int8_t)
2173 DO_ABA(gvec_saba_h, int16_t)
2174 DO_ABA(gvec_saba_s, int32_t)
2175 DO_ABA(gvec_saba_d, int64_t)
2177 DO_ABA(gvec_uaba_b, uint8_t)
2178 DO_ABA(gvec_uaba_h, uint16_t)
2179 DO_ABA(gvec_uaba_s, uint32_t)
2180 DO_ABA(gvec_uaba_d, uint64_t)
2182 #undef DO_ABA
2184 #define DO_NEON_PAIRWISE(NAME, OP) \
2185 void HELPER(NAME##s)(void *vd, void *vn, void *vm, \
2186 void *stat, uint32_t oprsz) \
2188 float_status *fpst = stat; \
2189 float32 *d = vd; \
2190 float32 *n = vn; \
2191 float32 *m = vm; \
2192 float32 r0, r1; \
2194 /* Read all inputs before writing outputs in case vm == vd */ \
2195 r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst); \
2196 r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst); \
2198 d[H4(0)] = r0; \
2199 d[H4(1)] = r1; \
2202 void HELPER(NAME##h)(void *vd, void *vn, void *vm, \
2203 void *stat, uint32_t oprsz) \
2205 float_status *fpst = stat; \
2206 float16 *d = vd; \
2207 float16 *n = vn; \
2208 float16 *m = vm; \
2209 float16 r0, r1, r2, r3; \
2211 /* Read all inputs before writing outputs in case vm == vd */ \
2212 r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst); \
2213 r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst); \
2214 r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst); \
2215 r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst); \
2217 d[H2(0)] = r0; \
2218 d[H2(1)] = r1; \
2219 d[H2(2)] = r2; \
2220 d[H2(3)] = r3; \
2223 DO_NEON_PAIRWISE(neon_padd, add)
2224 DO_NEON_PAIRWISE(neon_pmax, max)
2225 DO_NEON_PAIRWISE(neon_pmin, min)
2227 #undef DO_NEON_PAIRWISE
2229 #define DO_VCVT_FIXED(NAME, FUNC, TYPE) \
2230 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
2232 intptr_t i, oprsz = simd_oprsz(desc); \
2233 int shift = simd_data(desc); \
2234 TYPE *d = vd, *n = vn; \
2235 float_status *fpst = stat; \
2236 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2237 d[i] = FUNC(n[i], shift, fpst); \
2239 clear_tail(d, oprsz, simd_maxsz(desc)); \
2242 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2243 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2244 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2245 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
2246 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2247 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2248 DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2249 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2251 #undef DO_VCVT_FIXED
2253 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \
2254 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
2256 float_status *fpst = stat; \
2257 intptr_t i, oprsz = simd_oprsz(desc); \
2258 uint32_t rmode = simd_data(desc); \
2259 uint32_t prev_rmode = get_float_rounding_mode(fpst); \
2260 TYPE *d = vd, *n = vn; \
2261 set_float_rounding_mode(rmode, fpst); \
2262 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2263 d[i] = FUNC(n[i], 0, fpst); \
2265 set_float_rounding_mode(prev_rmode, fpst); \
2266 clear_tail(d, oprsz, simd_maxsz(desc)); \
2269 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2270 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2271 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2272 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2274 #undef DO_VCVT_RMODE
2276 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \
2277 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
2279 float_status *fpst = stat; \
2280 intptr_t i, oprsz = simd_oprsz(desc); \
2281 uint32_t rmode = simd_data(desc); \
2282 uint32_t prev_rmode = get_float_rounding_mode(fpst); \
2283 TYPE *d = vd, *n = vn; \
2284 set_float_rounding_mode(rmode, fpst); \
2285 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2286 d[i] = FUNC(n[i], fpst); \
2288 set_float_rounding_mode(prev_rmode, fpst); \
2289 clear_tail(d, oprsz, simd_maxsz(desc)); \
2292 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2293 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2295 #undef DO_VRINT_RMODE
2297 #ifdef TARGET_AARCH64
2298 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
2300 const uint8_t *indices = vm;
2301 CPUARMState *env = venv;
2302 size_t oprsz = simd_oprsz(desc);
2303 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2304 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2305 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2306 union {
2307 uint8_t b[16];
2308 uint64_t d[2];
2309 } result;
2312 * We must construct the final result in a temp, lest the output
2313 * overlaps the input table. For TBL, begin with zero; for TBX,
2314 * begin with the original register contents. Note that we always
2315 * copy 16 bytes here to avoid an extra branch; clearing the high
2316 * bits of the register for oprsz == 8 is handled below.
2318 if (is_tbx) {
2319 memcpy(&result, vd, 16);
2320 } else {
2321 memset(&result, 0, 16);
2324 for (size_t i = 0; i < oprsz; ++i) {
2325 uint32_t index = indices[H1(i)];
2327 if (index < table_len) {
2329 * Convert index (a byte offset into the virtual table
2330 * which is a series of 128-bit vectors concatenated)
2331 * into the correct register element, bearing in mind
2332 * that the table can wrap around from V31 to V0.
2334 const uint8_t *table = (const uint8_t *)
2335 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2336 result.b[H1(i)] = table[H1(index % 16)];
2340 memcpy(vd, &result, 16);
2341 clear_tail(vd, oprsz, simd_maxsz(desc));
2343 #endif
2346 * NxN -> N highpart multiply
2348 * TODO: expose this as a generic vector operation.
2351 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2353 intptr_t i, opr_sz = simd_oprsz(desc);
2354 int8_t *d = vd, *n = vn, *m = vm;
2356 for (i = 0; i < opr_sz; ++i) {
2357 d[i] = ((int32_t)n[i] * m[i]) >> 8;
2359 clear_tail(d, opr_sz, simd_maxsz(desc));
2362 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2364 intptr_t i, opr_sz = simd_oprsz(desc);
2365 int16_t *d = vd, *n = vn, *m = vm;
2367 for (i = 0; i < opr_sz / 2; ++i) {
2368 d[i] = ((int32_t)n[i] * m[i]) >> 16;
2370 clear_tail(d, opr_sz, simd_maxsz(desc));
2373 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2375 intptr_t i, opr_sz = simd_oprsz(desc);
2376 int32_t *d = vd, *n = vn, *m = vm;
2378 for (i = 0; i < opr_sz / 4; ++i) {
2379 d[i] = ((int64_t)n[i] * m[i]) >> 32;
2381 clear_tail(d, opr_sz, simd_maxsz(desc));
2384 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2386 intptr_t i, opr_sz = simd_oprsz(desc);
2387 uint64_t *d = vd, *n = vn, *m = vm;
2388 uint64_t discard;
2390 for (i = 0; i < opr_sz / 8; ++i) {
2391 muls64(&discard, &d[i], n[i], m[i]);
2393 clear_tail(d, opr_sz, simd_maxsz(desc));
2396 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2398 intptr_t i, opr_sz = simd_oprsz(desc);
2399 uint8_t *d = vd, *n = vn, *m = vm;
2401 for (i = 0; i < opr_sz; ++i) {
2402 d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2404 clear_tail(d, opr_sz, simd_maxsz(desc));
2407 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2409 intptr_t i, opr_sz = simd_oprsz(desc);
2410 uint16_t *d = vd, *n = vn, *m = vm;
2412 for (i = 0; i < opr_sz / 2; ++i) {
2413 d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2415 clear_tail(d, opr_sz, simd_maxsz(desc));
2418 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2420 intptr_t i, opr_sz = simd_oprsz(desc);
2421 uint32_t *d = vd, *n = vn, *m = vm;
2423 for (i = 0; i < opr_sz / 4; ++i) {
2424 d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2426 clear_tail(d, opr_sz, simd_maxsz(desc));
2429 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2431 intptr_t i, opr_sz = simd_oprsz(desc);
2432 uint64_t *d = vd, *n = vn, *m = vm;
2433 uint64_t discard;
2435 for (i = 0; i < opr_sz / 8; ++i) {
2436 mulu64(&discard, &d[i], n[i], m[i]);
2438 clear_tail(d, opr_sz, simd_maxsz(desc));
2441 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2443 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2444 int shr = simd_data(desc);
2445 uint64_t *d = vd, *n = vn, *m = vm;
2447 for (i = 0; i < opr_sz; ++i) {
2448 d[i] = ror64(n[i] ^ m[i], shr);
2450 clear_tail(d, opr_sz * 8, simd_maxsz(desc));
/*
 * Integer matrix-multiply accumulate
 */

/*
 * Add to sum the dot product of eight signed bytes at vn with eight
 * signed bytes at vm, and return the new sum.  Elements are addressed
 * through H1().
 */
static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
{
    int8_t *n = vn, *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}
/*
 * Add to sum the dot product of eight unsigned bytes at vn with eight
 * unsigned bytes at vm, and return the new sum.  Elements are addressed
 * through H1().
 */
static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *n = vn, *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}
/*
 * Add to sum the dot product of eight unsigned bytes at vn with eight
 * signed bytes at vm, and return the new sum.  Elements are addressed
 * through H1().
 */
static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *n = vn;
    int8_t *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}
/*
 * Common driver for the byte matrix-multiply-accumulate helpers.
 *
 * The operands are processed in 16-byte segments.  Within a segment,
 * vn and vm each supply two 8-byte rows and va/vd hold four 32-bit
 * accumulators; accumulator (i, j) receives the inner_loop() dot
 * product of row i of vn with row j of vm.  inner_loop selects the
 * signedness of the byte operands.  clear_tail() is then invoked for
 * the range from opr_sz up to simd_maxsz(desc).
 *
 * Note: the byte offsets are applied to void pointers, which relies on
 * the GNU C extension treating sizeof(void) as 1.
 */
static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
                      uint32_t (*inner_loop)(uint32_t, void *, void *))
{
    intptr_t seg, opr_sz = simd_oprsz(desc);

    for (seg = 0; seg < opr_sz; seg += 16) {
        uint32_t *d = vd + seg;
        uint32_t *a = va + seg;
        uint32_t sum0, sum1, sum2, sum3;

        /*
         * Process the entire segment at once, writing back the
         * results only after we've consumed all of the inputs.
         *
         * Key to indices by column:
         *          i   j                  i             j
         */
        sum0 = a[H4(0 + 0)];
        sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
        sum1 = a[H4(0 + 1)];
        sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
        sum2 = a[H4(2 + 0)];
        sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
        sum3 = a[H4(2 + 1)];
        sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);

        d[H4(0)] = sum0;
        d[H4(1)] = sum1;
        d[H4(2)] = sum2;
        d[H4(3)] = sum3;
    }
    clear_tail(vd, opr_sz, simd_maxsz(desc));
}
2522 #define DO_MMLA_B(NAME, INNER) \
2523 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2524 { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2526 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2527 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2528 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2531 * BFloat16 Dot Product
2534 static float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
2536 /* FPCR is ignored for BFDOT and BFMMLA. */
2537 float_status bf_status = {
2538 .tininess_before_rounding = float_tininess_before_rounding,
2539 .float_rounding_mode = float_round_to_odd_inf,
2540 .flush_to_zero = true,
2541 .flush_inputs_to_zero = true,
2542 .default_nan_mode = true,
2544 float32 t1, t2;
2547 * Extract each BFloat16 from the element pair, and shift
2548 * them such that they become float32.
2550 t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
2551 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
2552 t1 = float32_add(t1, t2, &bf_status);
2553 t1 = float32_add(sum, t1, &bf_status);
2555 return t1;
2558 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2560 intptr_t i, opr_sz = simd_oprsz(desc);
2561 float32 *d = vd, *a = va;
2562 uint32_t *n = vn, *m = vm;
2564 for (i = 0; i < opr_sz / 4; ++i) {
2565 d[i] = bfdotadd(a[i], n[i], m[i]);
2567 clear_tail(d, opr_sz, simd_maxsz(desc));
2570 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2571 void *va, uint32_t desc)
2573 intptr_t i, j, opr_sz = simd_oprsz(desc);
2574 intptr_t index = simd_data(desc);
2575 intptr_t elements = opr_sz / 4;
2576 intptr_t eltspersegment = MIN(16 / 4, elements);
2577 float32 *d = vd, *a = va;
2578 uint32_t *n = vn, *m = vm;
2580 for (i = 0; i < elements; i += eltspersegment) {
2581 uint32_t m_idx = m[i + H4(index)];
2583 for (j = i; j < i + eltspersegment; j++) {
2584 d[j] = bfdotadd(a[j], n[j], m_idx);
2587 clear_tail(d, opr_sz, simd_maxsz(desc));
2590 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2592 intptr_t s, opr_sz = simd_oprsz(desc);
2593 float32 *d = vd, *a = va;
2594 uint32_t *n = vn, *m = vm;
2596 for (s = 0; s < opr_sz / 4; s += 4) {
2597 float32 sum00, sum01, sum10, sum11;
2600 * Process the entire segment at once, writing back the
2601 * results only after we've consumed all of the inputs.
2603 * Key to indicies by column:
2604 * i j i k j k
2606 sum00 = a[s + H4(0 + 0)];
2607 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
2608 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
2610 sum01 = a[s + H4(0 + 1)];
2611 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
2612 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);
2614 sum10 = a[s + H4(2 + 0)];
2615 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
2616 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);
2618 sum11 = a[s + H4(2 + 1)];
2619 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
2620 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);
2622 d[s + H4(0 + 0)] = sum00;
2623 d[s + H4(0 + 1)] = sum01;
2624 d[s + H4(2 + 0)] = sum10;
2625 d[s + H4(2 + 1)] = sum11;
2627 clear_tail(d, opr_sz, simd_maxsz(desc));
2630 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
2631 void *stat, uint32_t desc)
2633 intptr_t i, opr_sz = simd_oprsz(desc);
2634 intptr_t sel = simd_data(desc);
2635 float32 *d = vd, *a = va;
2636 bfloat16 *n = vn, *m = vm;
2638 for (i = 0; i < opr_sz / 4; ++i) {
2639 float32 nn = n[H2(i * 2 + sel)] << 16;
2640 float32 mm = m[H2(i * 2 + sel)] << 16;
2641 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
2643 clear_tail(d, opr_sz, simd_maxsz(desc));
2646 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
2647 void *va, void *stat, uint32_t desc)
2649 intptr_t i, j, opr_sz = simd_oprsz(desc);
2650 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
2651 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
2652 intptr_t elements = opr_sz / 4;
2653 intptr_t eltspersegment = MIN(16 / 4, elements);
2654 float32 *d = vd, *a = va;
2655 bfloat16 *n = vn, *m = vm;
2657 for (i = 0; i < elements; i += eltspersegment) {
2658 float32 m_idx = m[H2(2 * i + index)] << 16;
2660 for (j = i; j < i + eltspersegment; j++) {
2661 float32 n_j = n[H2(2 * j + sel)] << 16;
2662 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
2665 clear_tail(d, opr_sz, simd_maxsz(desc));