target/arm: Move expand_pred_b() data to vec_helper.c
[qemu/ar7.git] / target/arm/sve_helper.c
blob 321098e26515cad71137a13f972f902a858e2728
1 /*
2 * ARM SVE Operations
4 * Copyright (c) 2018 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/cpu_ldst.h"
25 #include "exec/helper-proto.h"
26 #include "tcg/tcg-gvec-desc.h"
27 #include "fpu/softfloat.h"
28 #include "tcg/tcg.h"
29 #include "vec_internal.h"
32 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
34 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
35 * and bit 0 set if C is set. Compare the definitions of these variables
36 * within CPUARMState.
39 /* For no G bits set, NZCV = C. */
40 #define PREDTEST_INIT 1
42 /* This is an iterative function, called for each Pd and Pg word
43 * moving forward.
45 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
47 if (likely(g)) {
48 /* Compute N from first D & G.
49 Use bit 2 to signal first G bit seen. */
50 if (!(flags & 4)) {
51 flags |= ((d & (g & -g)) != 0) << 31;
52 flags |= 4;
55 /* Accumulate Z from each D & G. */
56 flags |= ((d & g) != 0) << 1;
58 /* Compute C from last !(D & G). Replace previous. */
59 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
61 return flags;
64 /* This is an iterative function, called for each Pd and Pg word
65 * moving backward.
67 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
69 if (likely(g)) {
70 /* Compute C from first (i.e. last) !(D & G).
71 Use bit 2 to signal first G bit seen. */
72 if (!(flags & 4)) {
73 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
74 flags |= (d & pow2floor(g)) == 0;
77 /* Accumulate Z from each D & G. */
78 flags |= ((d & g) != 0) << 1;
80 /* Compute N from last (i.e. first) D & G. Replace previous. */
81 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
83 return flags;
86 /* The same for a single word predicate. */
87 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
89 return iter_predtest_fwd(d, g, PREDTEST_INIT);
92 /* The same for a multi-word predicate. */
93 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
95 uint32_t flags = PREDTEST_INIT;
96 uint64_t *d = vd, *g = vg;
97 uintptr_t i = 0;
99 do {
100 flags = iter_predtest_fwd(d[i], g[i], flags);
101 } while (++i < words);
103 return flags;
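/*
 * Illustrative worked example (not part of the original file): for a
 * single predicate word with d = g = 1 -- one active element, and that
 * element set -- iter_predtest_fwd(1, 1, PREDTEST_INIT) returns
 * 0x80000006: bit 31 (N) because the first active element is set,
 * bit 2 is only the internal "first G bit seen" marker, bit 1 because
 * Z must end up clear (some active element is set), and bit 0 (C)
 * clear because the last active element is set.
 */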
107 * Expand active predicate bits to bytes, for byte elements.
108 * (The data table itself is in vec_helper.c as MVE also needs it.)
110 static inline uint64_t expand_pred_b(uint8_t byte)
112 return expand_pred_b_data[byte];
115 /* Similarly for half-word elements.
116 * for (i = 0; i < 256; ++i) {
117 * unsigned long m = 0;
118 * if (i & 0xaa) {
119 * continue;
121 * for (j = 0; j < 8; j += 2) {
122 * if ((i >> j) & 1) {
123 * m |= 0xfffful << (j << 3);
126 * printf("[0x%x] = 0x%016lx,\n", i, m);
129 static inline uint64_t expand_pred_h(uint8_t byte)
131 static const uint64_t word[] = {
132 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
133 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
134 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
135 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
136 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
137 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
138 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
139 [0x55] = 0xffffffffffffffff,
141 return word[byte & 0x55];
144 /* Similarly for single word elements. */
145 static inline uint64_t expand_pred_s(uint8_t byte)
147 static const uint64_t word[] = {
148 [0x01] = 0x00000000ffffffffull,
149 [0x10] = 0xffffffff00000000ull,
150 [0x11] = 0xffffffffffffffffull,
152 return word[byte & 0x11];
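/*
 * Standalone sketch (not part of the original file): the generator loop
 * from the expand_pred_h comment above, made compilable on its own using
 * uint64_t instead of unsigned long.  Entry [0x15] prints as
 * 0x0000ffffffffffff, matching the table in expand_pred_h().
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    for (int i = 0; i < 256; ++i) {
        uint64_t m = 0;
        if (i & 0xaa) {
            continue;               /* only even predicate bits matter */
        }
        for (int j = 0; j < 8; j += 2) {
            if ((i >> j) & 1) {
                m |= 0xffffull << (j << 3);
            }
        }
        printf("[0x%02x] = 0x%016" PRIx64 ",\n", (unsigned)i, m);
    }
    return 0;
}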
155 /* Swap 16-bit words within a 32-bit word. */
156 static inline uint32_t hswap32(uint32_t h)
158 return rol32(h, 16);
161 /* Swap 16-bit words within a 64-bit word. */
162 static inline uint64_t hswap64(uint64_t h)
164 uint64_t m = 0x0000ffff0000ffffull;
165 h = rol64(h, 32);
166 return ((h & m) << 16) | ((h >> 16) & m);
169 /* Swap 32-bit words within a 64-bit word. */
170 static inline uint64_t wswap64(uint64_t h)
172 return rol64(h, 32);
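/*
 * Illustrative note (not part of the original file): for example,
 * hswap64(0x0011223344556677) == 0x6677445522330011 (the four 16-bit
 * lanes reversed), while wswap64 of the same value is
 * 0x4455667700112233 (the two 32-bit halves swapped).
 */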
175 #define LOGICAL_PPPP(NAME, FUNC) \
176 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
178 uintptr_t opr_sz = simd_oprsz(desc); \
179 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
180 uintptr_t i; \
181 for (i = 0; i < opr_sz / 8; ++i) { \
182 d[i] = FUNC(n[i], m[i], g[i]); \
186 #define DO_AND(N, M, G) (((N) & (M)) & (G))
187 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
188 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
189 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
190 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
191 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
192 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
193 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
195 LOGICAL_PPPP(sve_and_pppp, DO_AND)
196 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
197 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
198 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
199 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
200 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
201 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
202 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
204 #undef DO_AND
205 #undef DO_BIC
206 #undef DO_EOR
207 #undef DO_ORR
208 #undef DO_ORN
209 #undef DO_NOR
210 #undef DO_NAND
211 #undef DO_SEL
212 #undef LOGICAL_PPPP
214 /* Fully general three-operand expander, controlled by a predicate.
215 * This is complicated by the host-endian storage of the register file.
217 /* ??? I don't expect the compiler could ever vectorize this itself.
218 * With some tables we can convert bit masks to byte masks, and with
219 * extra care wrt byte/word ordering we could use gcc generic vectors
220 * and do 16 bytes at a time.
222 #define DO_ZPZZ(NAME, TYPE, H, OP) \
223 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
225 intptr_t i, opr_sz = simd_oprsz(desc); \
226 for (i = 0; i < opr_sz; ) { \
227 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
228 do { \
229 if (pg & 1) { \
230 TYPE nn = *(TYPE *)(vn + H(i)); \
231 TYPE mm = *(TYPE *)(vm + H(i)); \
232 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
234 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
235 } while (i & 15); \
239 /* Similarly, specialized for 64-bit operands. */
240 #define DO_ZPZZ_D(NAME, TYPE, OP) \
241 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
243 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
244 TYPE *d = vd, *n = vn, *m = vm; \
245 uint8_t *pg = vg; \
246 for (i = 0; i < opr_sz; i += 1) { \
247 if (pg[H1(i)] & 1) { \
248 TYPE nn = n[i], mm = m[i]; \
249 d[i] = OP(nn, mm); \
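/*
 * Illustrative note (not part of the original file): an SVE predicate has
 * one bit per vector byte, so stepping "pg >>= sizeof(TYPE)" advances to
 * the bit governing the next element.  For 32-bit elements only bits
 * 0, 4, 8 and 12 of each 16-bit predicate chunk are consulted; the bits
 * in between are ignored.
 */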
254 #define DO_AND(N, M) (N & M)
255 #define DO_EOR(N, M) (N ^ M)
256 #define DO_ORR(N, M) (N | M)
257 #define DO_BIC(N, M) (N & ~M)
258 #define DO_ADD(N, M) (N + M)
259 #define DO_SUB(N, M) (N - M)
260 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
261 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
262 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
263 #define DO_MUL(N, M) (N * M)
267 * We must avoid the C undefined behaviour cases: division by
268 * zero and signed division of INT_MIN by -1. Both of these
269 * have architecturally defined required results for Arm.
270 * We special case all signed divisions by -1 to avoid having
271 * to deduce the minimum integer for the type involved.
273 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
274 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
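/*
 * Illustrative note (not part of the original file): DO_SDIV(INT32_MIN, -1)
 * takes the M == -1 arm and returns -N, which wraps back to INT32_MIN --
 * exactly the overflow result the architecture specifies.  Division by
 * zero returns 0 for both the signed and unsigned forms, also as
 * architected.
 */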
276 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
277 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
278 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
279 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
281 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
282 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
283 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
284 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
286 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
287 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
288 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
289 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
291 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
292 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
293 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
294 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
296 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
297 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
298 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
299 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
301 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
302 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
303 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
304 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
306 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
307 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
308 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
309 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
311 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
312 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
313 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
314 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
316 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
317 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
318 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
319 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
321 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
322 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
323 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
324 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
326 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
327 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
328 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
329 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
331 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
332 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
333 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
334 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
336 /* Because the computation type is at least twice as large as required,
337 these work for both signed and unsigned source types. */
338 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
340 return (n * m) >> 8;
343 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
345 return (n * m) >> 16;
348 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
350 return (n * m) >> 32;
353 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
355 uint64_t lo, hi;
356 muls64(&lo, &hi, n, m);
357 return hi;
360 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
362 uint64_t lo, hi;
363 mulu64(&lo, &hi, n, m);
364 return hi;
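/*
 * Illustrative note (not part of the original file): the signedness is
 * decided by the TYPE used in the DO_ZPZZ instantiation, which widens the
 * operands before the do_mulh_* helper multiplies them.  The byte 0xc8
 * widens to 200 for sve_umulh_zpzz_b (200 * 100 = 20000, high byte 0x4e)
 * but to -56 for sve_smulh_zpzz_b (-56 * 100 = -5600, high byte 0xea).
 */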
367 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
368 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
369 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
370 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
372 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
373 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
374 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
375 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
377 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
378 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
379 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
380 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
382 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
383 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
385 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
386 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
388 /* Note that all bits of the shift are significant
389 and not modulo the element size. */
390 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
391 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
392 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
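/*
 * Illustrative note (not part of the original file): shift counts are not
 * taken modulo the element size.  With 8-bit elements, DO_ASR(-4, 100)
 * clamps the count to 7 and yields -1 (all sign bits), while DO_LSR and
 * DO_LSL with a count of 8 or more yield 0.
 */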
394 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
395 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
396 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
398 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
399 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
400 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
402 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
403 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
404 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
406 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
407 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
408 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
410 static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
412 int8_t n1 = n, n2 = n >> 8;
413 return m + n1 + n2;
416 static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
418 int16_t n1 = n, n2 = n >> 16;
419 return m + n1 + n2;
422 static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
424 int32_t n1 = n, n2 = n >> 32;
425 return m + n1 + n2;
428 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
429 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
430 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
432 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
434 uint8_t n1 = n, n2 = n >> 8;
435 return m + n1 + n2;
438 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
440 uint16_t n1 = n, n2 = n >> 16;
441 return m + n1 + n2;
444 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
446 uint32_t n1 = n, n2 = n >> 32;
447 return m + n1 + n2;
450 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
451 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
452 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
454 #define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
455 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
456 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
457 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)
459 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
460 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
461 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
462 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
464 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
465 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
466 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
467 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)
469 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
470 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
471 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
472 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
475 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
476 * We pass in a pointer to a dummy saturation field to trigger
477 * the saturating arithmetic but discard the information about
478 * whether it has occurred.
480 #define do_sqshl_b(n, m) \
481 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
482 #define do_sqshl_h(n, m) \
483 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
484 #define do_sqshl_s(n, m) \
485 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
486 #define do_sqshl_d(n, m) \
487 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
489 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
490 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
491 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
492 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
494 #define do_uqshl_b(n, m) \
495 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
496 #define do_uqshl_h(n, m) \
497 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
498 #define do_uqshl_s(n, m) \
499 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
500 #define do_uqshl_d(n, m) \
501 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
503 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
504 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
505 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
506 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
508 #define do_sqrshl_b(n, m) \
509 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
510 #define do_sqrshl_h(n, m) \
511 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
512 #define do_sqrshl_s(n, m) \
513 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
514 #define do_sqrshl_d(n, m) \
515 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
517 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
518 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
519 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
520 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
522 #undef do_sqrshl_d
524 #define do_uqrshl_b(n, m) \
525 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
526 #define do_uqrshl_h(n, m) \
527 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
528 #define do_uqrshl_s(n, m) \
529 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
530 #define do_uqrshl_d(n, m) \
531 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
533 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
534 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
535 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
536 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
538 #undef do_uqrshl_d
540 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
541 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))
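/*
 * Standalone sketch (not part of the original file): the identity behind
 * DO_HADD_D, checked with plain C.  hadd64() is a hypothetical local name
 * used only for this illustration.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t hadd64(uint64_t n, uint64_t m)
{
    /* Halve each operand, then re-add the carry of the two low bits. */
    return (n >> 1) + (m >> 1) + (n & m & 1);
}

int main(void)
{
    assert(hadd64(UINT64_MAX, UINT64_MAX) == UINT64_MAX); /* no overflow */
    assert(hadd64(3, 2) == 2);                            /* (3 + 2) >> 1 */
    return 0;
}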
543 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
544 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
545 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
546 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
548 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
549 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
550 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
551 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
553 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
554 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))
556 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
557 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
558 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
559 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
561 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
562 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
563 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
564 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
566 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
567 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))
569 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
570 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
571 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
572 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
574 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
575 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
576 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
577 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
579 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
581 return val >= max ? max : val <= min ? min : val;
584 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
585 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
586 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
588 static inline int64_t do_sqadd_d(int64_t n, int64_t m)
590 int64_t r = n + m;
591 if (((r ^ n) & ~(n ^ m)) < 0) {
592 /* Signed overflow. */
593 return r < 0 ? INT64_MAX : INT64_MIN;
595 return r;
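/*
 * Illustrative note (not part of the original file): ((r ^ n) & ~(n ^ m))
 * is negative exactly when the operands share a sign and the sum's sign
 * differs, i.e. on signed overflow.  E.g. adding 0x4000000000000000 to
 * itself makes r negative, so the helper returns INT64_MAX.
 */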
598 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
599 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
600 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
601 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
603 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
604 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
605 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
607 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
609 uint64_t r = n + m;
610 return r < n ? UINT64_MAX : r;
613 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
614 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
615 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
616 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
618 #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
619 #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
620 #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
622 static inline int64_t do_sqsub_d(int64_t n, int64_t m)
624 int64_t r = n - m;
625 if (((r ^ n) & (n ^ m)) < 0) {
626 /* Signed overflow. */
627 return r < 0 ? INT64_MAX : INT64_MIN;
629 return r;
632 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
633 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
634 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
635 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
637 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
638 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
639 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
641 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
643 return n > m ? n - m : 0;
646 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
647 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
648 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
649 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
651 #define DO_SUQADD_B(n, m) \
652 do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
653 #define DO_SUQADD_H(n, m) \
654 do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
655 #define DO_SUQADD_S(n, m) \
656 do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
658 static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
660 uint64_t r = n + m;
662 if (n < 0) {
663 /* Note that m - abs(n) cannot underflow. */
664 if (r > INT64_MAX) {
665 /* Result is either very large positive or negative. */
666 if (m > -n) {
667 /* m > abs(n), so r is a very large positive. */
668 return INT64_MAX;
670 /* Result is negative. */
672 } else {
673 /* Both inputs are positive: check for overflow. */
674 if (r < m || r > INT64_MAX) {
675 return INT64_MAX;
678 return r;
681 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
682 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
683 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
684 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
686 #define DO_USQADD_B(n, m) \
687 do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
688 #define DO_USQADD_H(n, m) \
689 do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
690 #define DO_USQADD_S(n, m) \
691 do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
693 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
695 uint64_t r = n + m;
697 if (m < 0) {
698 return n < -m ? 0 : r;
700 return r < n ? UINT64_MAX : r;
703 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
704 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
705 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
706 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
708 #undef DO_ZPZZ
709 #undef DO_ZPZZ_D
712 * Three operand expander, operating on element pairs.
713 * If the slot I is even, the elements are from VN {I, I+1}.
714 * If the slot I is odd, the elements are from VM {I-1, I}.
715 * Load all of the input elements in each pair before overwriting output.
717 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
718 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
720 intptr_t i, opr_sz = simd_oprsz(desc); \
721 for (i = 0; i < opr_sz; ) { \
722 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
723 do { \
724 TYPE n0 = *(TYPE *)(vn + H(i)); \
725 TYPE m0 = *(TYPE *)(vm + H(i)); \
726 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
727 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
728 if (pg & 1) { \
729 *(TYPE *)(vd + H(i)) = OP(n0, n1); \
731 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
732 if (pg & 1) { \
733 *(TYPE *)(vd + H(i)) = OP(m0, m1); \
735 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
736 } while (i & 15); \
740 /* Similarly, specialized for 64-bit operands. */
741 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
742 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
744 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
745 TYPE *d = vd, *n = vn, *m = vm; \
746 uint8_t *pg = vg; \
747 for (i = 0; i < opr_sz; i += 2) { \
748 TYPE n0 = n[i], n1 = n[i + 1]; \
749 TYPE m0 = m[i], m1 = m[i + 1]; \
750 if (pg[H1(i)] & 1) { \
751 d[i] = OP(n0, n1); \
753 if (pg[H1(i + 1)] & 1) { \
754 d[i + 1] = OP(m0, m1); \
759 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
760 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
761 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
762 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
764 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
765 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
766 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
767 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
769 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
770 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
771 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
772 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
774 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
775 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
776 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
777 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
779 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
780 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
781 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
782 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
784 #undef DO_ZPZZ_PAIR
785 #undef DO_ZPZZ_PAIR_D
787 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
788 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
789 void *status, uint32_t desc) \
791 intptr_t i, opr_sz = simd_oprsz(desc); \
792 for (i = 0; i < opr_sz; ) { \
793 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
794 do { \
795 TYPE n0 = *(TYPE *)(vn + H(i)); \
796 TYPE m0 = *(TYPE *)(vm + H(i)); \
797 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
798 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
799 if (pg & 1) { \
800 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
802 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
803 if (pg & 1) { \
804 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
806 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
807 } while (i & 15); \
811 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
812 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
813 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
815 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
816 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
817 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
819 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
820 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
821 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
823 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
824 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
825 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
827 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
828 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
829 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
831 #undef DO_ZPZZ_PAIR_FP
833 /* Three-operand expander, controlled by a predicate, in which the
834 * third operand is "wide". That is, for D = N op M, the same 64-bit
835 * value of M is used with all of the narrower values of N.
837 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
838 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
840 intptr_t i, opr_sz = simd_oprsz(desc); \
841 for (i = 0; i < opr_sz; ) { \
842 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
843 TYPEW mm = *(TYPEW *)(vm + i); \
844 do { \
845 if (pg & 1) { \
846 TYPE nn = *(TYPE *)(vn + H(i)); \
847 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
849 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
850 } while (i & 7); \
854 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
855 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
856 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
858 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
859 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
860 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
862 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
863 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
864 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
866 #undef DO_ZPZW
868 /* Fully general two-operand expander, controlled by a predicate.
870 #define DO_ZPZ(NAME, TYPE, H, OP) \
871 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
873 intptr_t i, opr_sz = simd_oprsz(desc); \
874 for (i = 0; i < opr_sz; ) { \
875 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
876 do { \
877 if (pg & 1) { \
878 TYPE nn = *(TYPE *)(vn + H(i)); \
879 *(TYPE *)(vd + H(i)) = OP(nn); \
881 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
882 } while (i & 15); \
886 /* Similarly, specialized for 64-bit operands. */
887 #define DO_ZPZ_D(NAME, TYPE, OP) \
888 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
890 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
891 TYPE *d = vd, *n = vn; \
892 uint8_t *pg = vg; \
893 for (i = 0; i < opr_sz; i += 1) { \
894 if (pg[H1(i)] & 1) { \
895 TYPE nn = n[i]; \
896 d[i] = OP(nn); \
901 #define DO_CLS_B(N) (clrsb32(N) - 24)
902 #define DO_CLS_H(N) (clrsb32(N) - 16)
904 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
905 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
906 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
907 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
909 #define DO_CLZ_B(N) (clz32(N) - 24)
910 #define DO_CLZ_H(N) (clz32(N) - 16)
912 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
913 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
914 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
915 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
917 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
918 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
919 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
920 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
922 #define DO_CNOT(N) (N == 0)
924 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
925 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
926 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
927 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
929 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
931 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
932 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
933 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
935 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
937 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
938 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
939 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
941 #define DO_NOT(N) (~N)
943 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
944 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
945 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
946 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
948 #define DO_SXTB(N) ((int8_t)N)
949 #define DO_SXTH(N) ((int16_t)N)
950 #define DO_SXTS(N) ((int32_t)N)
951 #define DO_UXTB(N) ((uint8_t)N)
952 #define DO_UXTH(N) ((uint16_t)N)
953 #define DO_UXTS(N) ((uint32_t)N)
955 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
956 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
957 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
958 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
959 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
960 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
962 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
963 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
964 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
965 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
966 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
967 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
969 #define DO_ABS(N) (N < 0 ? -N : N)
971 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
972 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
973 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
974 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
976 #define DO_NEG(N) (-N)
978 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
979 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
980 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
981 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
983 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
984 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
985 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
987 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
988 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
990 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
992 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
993 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
994 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
995 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
997 #define DO_SQABS(X) \
998 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
999 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
1001 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
1002 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
1003 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
1004 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
1006 #define DO_SQNEG(X) \
1007 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
1008 x_ == min_ ? -min_ - 1 : -x_; })
1010 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
1011 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
1012 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
1013 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
1015 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
1016 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
1018 /* Three-operand expander, unpredicated, in which the third operand is "wide".
1020 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
1021 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1023 intptr_t i, opr_sz = simd_oprsz(desc); \
1024 for (i = 0; i < opr_sz; ) { \
1025 TYPEW mm = *(TYPEW *)(vm + i); \
1026 do { \
1027 TYPE nn = *(TYPE *)(vn + H(i)); \
1028 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
1029 i += sizeof(TYPE); \
1030 } while (i & 7); \
1034 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
1035 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
1036 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
1038 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
1039 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
1040 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1042 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1043 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1044 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1046 #undef DO_ZZW
1048 #undef DO_CLS_B
1049 #undef DO_CLS_H
1050 #undef DO_CLZ_B
1051 #undef DO_CLZ_H
1052 #undef DO_CNOT
1053 #undef DO_FABS
1054 #undef DO_FNEG
1055 #undef DO_ABS
1056 #undef DO_NEG
1057 #undef DO_ZPZ
1058 #undef DO_ZPZ_D
1061 * Three-operand expander, unpredicated, in which the two inputs are
1062 * selected from the top or bottom half of the wide column.
1064 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1065 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1067 intptr_t i, opr_sz = simd_oprsz(desc); \
1068 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1069 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1070 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1071 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1072 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1073 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1077 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1078 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1079 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1081 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1082 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1083 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1085 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1086 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1087 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1089 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1090 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1091 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1093 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1094 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1095 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1097 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1098 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1099 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1101 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1102 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1103 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1105 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1106 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1107 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1109 /* Note that the multiply cannot overflow, but the doubling can. */
1110 static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1112 int16_t val = n * m;
1113 return DO_SQADD_H(val, val);
1116 static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1118 int32_t val = n * m;
1119 return DO_SQADD_S(val, val);
1122 static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1124 int64_t val = n * m;
1125 return do_sqadd_d(val, val);
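/*
 * Illustrative note (not part of the original file): the widened product
 * always fits, but the doubling can saturate.  For the _h flavour, narrow
 * inputs of -128 * -128 give a product of 16384, and doubling it
 * saturates to INT16_MAX (32767) instead of wrapping to -32768.
 */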
1128 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1129 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1130 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1132 #undef DO_ZZZ_TB
1134 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1135 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1137 intptr_t i, opr_sz = simd_oprsz(desc); \
1138 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1139 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1140 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
1141 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1142 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1146 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1147 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1148 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1150 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1151 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1152 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1154 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1155 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1156 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1158 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1159 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1160 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1162 #undef DO_ZZZ_WTB
1164 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
1165 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1167 intptr_t i, opr_sz = simd_oprsz(desc); \
1168 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1169 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1170 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1171 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
1172 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
1173 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
1177 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1178 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1179 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1180 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
1182 #undef DO_ZZZ_NTB
1184 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1185 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1187 intptr_t i, opr_sz = simd_oprsz(desc); \
1188 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
1189 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1190 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1191 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
1192 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1193 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
1197 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1198 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1199 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1201 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1202 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1203 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1205 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1206 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1207 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1209 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1210 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1211 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1213 #define DO_NMUL(N, M) -(N * M)
1215 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1216 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1217 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
1219 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1220 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1221 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
1223 #undef DO_ZZZW_ACC
1225 #define DO_XTNB(NAME, TYPE, OP) \
1226 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1228 intptr_t i, opr_sz = simd_oprsz(desc); \
1229 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1230 TYPE nn = *(TYPE *)(vn + i); \
1231 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
1232 *(TYPE *)(vd + i) = nn; \
1236 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
1237 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1239 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
1240 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1241 TYPE nn = *(TYPE *)(vn + i); \
1242 *(TYPEN *)(vd + i + odd) = OP(nn); \
1246 #define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
1247 #define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
1248 #define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)
1250 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1251 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1252 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1254 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1255 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1256 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1258 #define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
1259 #define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
1260 #define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)
1262 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1263 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1264 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1266 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1267 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1268 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1270 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1271 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1272 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1274 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1275 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1276 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1278 #undef DO_XTNB
1279 #undef DO_XTNT
1281 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1283 intptr_t i, opr_sz = simd_oprsz(desc);
1284 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1285 uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1286 uint32_t *a = va, *n = vn;
1287 uint64_t *d = vd, *m = vm;
1289 for (i = 0; i < opr_sz / 8; ++i) {
1290 uint32_t e1 = a[2 * i + H4(0)];
1291 uint32_t e2 = n[2 * i + sel] ^ inv;
1292 uint64_t c = extract64(m[i], 32, 1);
1293 /* Compute and store the entire 33-bit result at once. */
1294 d[i] = c + e1 + e2;
1298 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1300 intptr_t i, opr_sz = simd_oprsz(desc);
1301 int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1302 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1303 uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1305 for (i = 0; i < opr_sz / 8; i += 2) {
1306 Int128 e1 = int128_make64(a[i]);
1307 Int128 e2 = int128_make64(n[i + sel] ^ inv);
1308 Int128 c = int128_make64(m[i + 1] & 1);
1309 Int128 r = int128_add(int128_add(e1, e2), c);
1310 d[i + 0] = int128_getlo(r);
1311 d[i + 1] = int128_gethi(r);
1315 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1316 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1318 intptr_t i, opr_sz = simd_oprsz(desc); \
1319 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1320 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1321 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1322 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1323 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1324 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1325 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
1329 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1330 do_sqdmull_h, DO_SQADD_H)
1331 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1332 do_sqdmull_s, DO_SQADD_S)
1333 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1334 do_sqdmull_d, do_sqadd_d)
1336 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1337 do_sqdmull_h, DO_SQSUB_H)
1338 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1339 do_sqdmull_s, DO_SQSUB_S)
1340 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1341 do_sqdmull_d, do_sqsub_d)
1343 #undef DO_SQDMLAL
1345 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1346 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1348 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1349 int rot = simd_data(desc); \
1350 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1351 bool sub_r = rot == 1 || rot == 2; \
1352 bool sub_i = rot >= 2; \
1353 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1354 for (i = 0; i < opr_sz; i += 2) { \
1355 TYPE elt1_a = n[H(i + sel_a)]; \
1356 TYPE elt2_a = m[H(i + sel_a)]; \
1357 TYPE elt2_b = m[H(i + sel_b)]; \
1358 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \
1359 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \
1363 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
1365 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1366 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1367 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1368 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1370 #define DO_SQRDMLAH_B(N, M, A, S) \
1371 do_sqrdmlah_b(N, M, A, S, true)
1372 #define DO_SQRDMLAH_H(N, M, A, S) \
1373 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1374 #define DO_SQRDMLAH_S(N, M, A, S) \
1375 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1376 #define DO_SQRDMLAH_D(N, M, A, S) \
1377 do_sqrdmlah_d(N, M, A, S, true)
1379 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1380 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1381 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1382 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1384 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1385 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1387 intptr_t i, j, oprsz = simd_oprsz(desc); \
1388 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \
1389 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \
1390 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1391 bool sub_r = rot == 1 || rot == 2; \
1392 bool sub_i = rot >= 2; \
1393 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1394 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \
1395 TYPE elt2_a = m[H(i + idx + sel_a)]; \
1396 TYPE elt2_b = m[H(i + idx + sel_b)]; \
1397 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \
1398 TYPE elt1_a = n[H(i + j + sel_a)]; \
1399 d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \
1400 d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \
1405 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1406 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1408 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1409 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1411 #undef DO_CMLA
1412 #undef DO_CMLA_FUNC
1413 #undef DO_CMLA_IDX_FUNC
1414 #undef DO_SQRDMLAH_B
1415 #undef DO_SQRDMLAH_H
1416 #undef DO_SQRDMLAH_S
1417 #undef DO_SQRDMLAH_D
1419 /* Note N and M are 4 elements bundled into one unit. */
1420 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1421 int sel_a, int sel_b, int sub_i)
1423 for (int i = 0; i <= 1; i++) {
1424 int32_t elt1_r = (int8_t)(n >> (16 * i));
1425 int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1426 int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1427 int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1429 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1431 return a;
1434 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1435 int sel_a, int sel_b, int sub_i)
1437 for (int i = 0; i <= 1; i++) {
1438 int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1439 int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1440 int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1441 int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1443 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1445 return a;
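/*
 * Illustrative note (not part of the original file): each 32-bit unit of
 * N and M packs two complex numbers for do_cdot_s (real part in the even
 * byte, imaginary part in the odd byte); do_cdot_d does the same with
 * 16-bit parts.  The rotation selects, via sel_a/sel_b, which component
 * of M multiplies the real and imaginary parts of N, and sub_i is +1 or
 * -1 so that the same expression covers both the added and the
 * subtracted cross term.
 */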
1448 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1449 void *va, uint32_t desc)
1451 int opr_sz = simd_oprsz(desc);
1452 int rot = simd_data(desc);
1453 int sel_a = rot & 1;
1454 int sel_b = sel_a ^ 1;
1455 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1456 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1458 for (int e = 0; e < opr_sz / 4; e++) {
1459 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1463 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1464 void *va, uint32_t desc)
1466 int opr_sz = simd_oprsz(desc);
1467 int rot = simd_data(desc);
1468 int sel_a = rot & 1;
1469 int sel_b = sel_a ^ 1;
1470 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1471 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1473 for (int e = 0; e < opr_sz / 8; e++) {
1474 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1478 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1479 void *va, uint32_t desc)
1481 int opr_sz = simd_oprsz(desc);
1482 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1483 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1484 int sel_a = rot & 1;
1485 int sel_b = sel_a ^ 1;
1486 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1487 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1489 for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1490 uint32_t seg_m = m[seg + idx];
1491 for (int e = 0; e < 4; e++) {
1492 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1493 sel_a, sel_b, sub_i);
1498 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1499 void *va, uint32_t desc)
1501 int seg, opr_sz = simd_oprsz(desc);
1502 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1503 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1504 int sel_a = rot & 1;
1505 int sel_b = sel_a ^ 1;
1506 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1507 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1509 for (seg = 0; seg < opr_sz / 8; seg += 2) {
1510 uint64_t seg_m = m[seg + idx];
1511 for (int e = 0; e < 2; e++) {
1512 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1513 sel_a, sel_b, sub_i);
1518 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1519 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1521 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
1522 intptr_t i, j, idx = simd_data(desc); \
1523 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \
1524 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1525 TYPE mm = m[i]; \
1526 for (j = 0; j < segment; j++) { \
1527 d[i + j] = OP(n[i + j], mm, a[i + j]); \
1532 #define DO_SQRDMLAH_H(N, M, A) \
1533 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1534 #define DO_SQRDMLAH_S(N, M, A) \
1535 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1536 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1538 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1539 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1540 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1542 #define DO_SQRDMLSH_H(N, M, A) \
1543 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1544 #define DO_SQRDMLSH_S(N, M, A) \
1545 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1546 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1548 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1549 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1550 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1552 #undef DO_ZZXZ
1554 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1555 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1557 intptr_t i, j, oprsz = simd_oprsz(desc); \
1558 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1559 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1560 for (i = 0; i < oprsz; i += 16) { \
1561 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1562 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1563 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1564 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \
1565 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \
1570 #define DO_MLA(N, M, A) (A + N * M)
1572 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1573 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1574 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1575 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1577 #define DO_MLS(N, M, A) (A - N * M)
1579 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1580 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1581 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1582 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1584 #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M))
1585 #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M))
1587 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1588 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1590 #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M))
1591 #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M))
1593 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1594 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1596 #undef DO_MLA
1597 #undef DO_MLS
1598 #undef DO_ZZXW
1600 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1601 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1603 intptr_t i, j, oprsz = simd_oprsz(desc); \
1604 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1605 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1606 for (i = 0; i < oprsz; i += 16) { \
1607 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1608 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1609 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1610 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \
1615 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1616 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1618 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1619 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1621 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1622 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1624 #undef DO_ZZX
1626 #define DO_BITPERM(NAME, TYPE, OP) \
1627 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1629 intptr_t i, opr_sz = simd_oprsz(desc); \
1630 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1631 TYPE nn = *(TYPE *)(vn + i); \
1632 TYPE mm = *(TYPE *)(vm + i); \
1633 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
1637 static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1639 uint64_t res = 0;
1640 int db, rb = 0;
1642 for (db = 0; db < n; ++db) {
1643 if ((mask >> db) & 1) {
1644 res |= ((data >> db) & 1) << rb;
1645 ++rb;
1648 return res;
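/*
 * Editor's note, worked example (not in the original source): bitextract()
 * gathers the DATA bits selected by MASK into a contiguous low-order field,
 * as per SVE2 BEXT.  For instance, with n == 8:
 *
 *   bitextract(0xb4, 0xcc, 8) == 0x9
 *
 * MASK 0xcc selects bits 2, 3, 6, 7 of DATA 0xb4 (values 1, 0, 0, 1),
 * which are packed into result bits 0..3 as 0b1001.
 */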
1651 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1652 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1653 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1654 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1656 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1658 uint64_t res = 0;
1659 int rb, db = 0;
1661 for (rb = 0; rb < n; ++rb) {
1662 if ((mask >> rb) & 1) {
1663 res |= ((data >> db) & 1) << rb;
1664 ++db;
1667 return res;
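/*
 * Editor's note: bitdeposit() is the inverse operation (SVE2 BDEP),
 * scattering the low-order DATA bits out to the positions selected by MASK.
 * Continuing the example above, bitdeposit(0x9, 0xcc, 8) == 0x84, which is
 * 0xb4 & 0xcc, i.e. BEXT followed by BDEP recovers the masked bits.
 */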
1670 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1671 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1672 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1673 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1675 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1677 uint64_t resm = 0, resu = 0;
1678 int db, rbm = 0, rbu = 0;
1680 for (db = 0; db < n; ++db) {
1681 uint64_t val = (data >> db) & 1;
1682 if ((mask >> db) & 1) {
1683 resm |= val << rbm++;
1684 } else {
1685 resu |= val << rbu++;
1689 return resm | (resu << rbm);
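/*
 * Editor's note, worked example for SVE2 BGRP: the bits selected by MASK
 * are packed at the bottom of the result and the remaining bits on top.
 * With the same inputs as above, bitgroup(0xb4, 0xcc, 8) == 0xc9: the
 * selected bits give 0x9 in bits 0..3, and the unselected bits
 * (0, 0, 1, 1 from positions 0, 1, 4, 5) give 0xc in bits 4..7.
 */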
1692 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1693 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1694 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1695 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1697 #undef DO_BITPERM
1699 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \
1700 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1702 intptr_t i, opr_sz = simd_oprsz(desc); \
1703 int sub_r = simd_data(desc); \
1704 if (sub_r) { \
1705 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1706 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1707 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1708 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1709 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1710 acc_r = ADD_OP(acc_r, el2_i); \
1711 acc_i = SUB_OP(acc_i, el2_r); \
1712 *(TYPE *)(vd + H(i)) = acc_r; \
1713 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1715 } else { \
1716 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1717 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1718 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1719 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1720 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1721 acc_r = SUB_OP(acc_r, el2_i); \
1722 acc_i = ADD_OP(acc_i, el2_r); \
1723 *(TYPE *)(vd + H(i)) = acc_r; \
1724 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1729 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1730 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1731 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1732 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1734 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1735 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1736 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1737 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1739 #undef DO_CADD
1741 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1742 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1744 intptr_t i, opr_sz = simd_oprsz(desc); \
1745 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
1746 int shift = simd_data(desc) >> 1; \
1747 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1748 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
1749 *(TYPEW *)(vd + HW(i)) = nn << shift; \
1753 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1754 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1755 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1757 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1758 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1759 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1761 #undef DO_ZZI_SHLL
1763 /* Two-operand reduction expander, controlled by a predicate.
1764 * The difference between TYPERED and TYPERET has to do with
1765 * sign-extension. E.g. for SMAX, TYPERED must be signed,
1766 * but TYPERET must be unsigned so that e.g. a 32-bit value
1767 * is not sign-extended to the ABI uint64_t return type.
1769 /* ??? If we were to vectorize this by hand the reduction ordering
1770 * would change. For integer operands, this is perfectly fine.
1772 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1773 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1775 intptr_t i, opr_sz = simd_oprsz(desc); \
1776 TYPERED ret = INIT; \
1777 for (i = 0; i < opr_sz; ) { \
1778 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1779 do { \
1780 if (pg & 1) { \
1781 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
1782 ret = OP(ret, nn); \
1784 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
1785 } while (i & 15); \
1787 return (TYPERET)ret; \
1790 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
1791 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1793 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1794 TYPEE *n = vn; \
1795 uint8_t *pg = vg; \
1796 TYPER ret = INIT; \
1797 for (i = 0; i < opr_sz; i += 1) { \
1798 if (pg[H1(i)] & 1) { \
1799 TYPEE nn = n[i]; \
1800 ret = OP(ret, nn); \
1803 return ret; \
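/*
 * Editor's note, a concrete case of the TYPERED/TYPERET distinction above:
 * for sve_smaxv_s below, the reduction is done in int32_t so that DO_MAX
 * compares signed values, but the result is returned as (uint32_t), so that
 * e.g. INT32_MIN comes back through the uint64_t ABI return type as
 * 0x0000000080000000 rather than sign-extended to 0xffffffff80000000.
 */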
1806 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1807 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1808 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1809 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1811 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1812 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1813 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1814 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1816 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1817 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1818 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1819 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1821 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1822 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1823 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1825 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1826 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1827 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1828 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1830 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1831 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1832 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1833 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1835 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1836 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1837 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1838 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1840 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1841 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1842 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1843 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1845 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1846 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1847 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1848 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1850 #undef DO_VPZ
1851 #undef DO_VPZ_D
1853 /* Two vector operand, one scalar operand, unpredicated. */
1854 #define DO_ZZI(NAME, TYPE, OP) \
1855 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
1857 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1858 TYPE s = s64, *d = vd, *n = vn; \
1859 for (i = 0; i < opr_sz; ++i) { \
1860 d[i] = OP(n[i], s); \
1864 #define DO_SUBR(X, Y) (Y - X)
1866 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1867 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1868 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1869 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1871 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1872 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1873 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1874 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1876 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1877 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1878 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1879 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1881 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1882 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1883 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1884 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1886 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1887 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1888 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1889 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1891 #undef DO_ZZI
1893 #undef DO_AND
1894 #undef DO_ORR
1895 #undef DO_EOR
1896 #undef DO_BIC
1897 #undef DO_ADD
1898 #undef DO_SUB
1899 #undef DO_MAX
1900 #undef DO_MIN
1901 #undef DO_ABD
1902 #undef DO_MUL
1903 #undef DO_DIV
1904 #undef DO_ASR
1905 #undef DO_LSR
1906 #undef DO_LSL
1907 #undef DO_SUBR
1909 /* Similar to the ARM LastActiveElement pseudocode function, except the
1910 result is multiplied by the element size. This includes the not found
1911 indication; e.g. not found for esz=3 is -8. */
1912 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1914 uint64_t mask = pred_esz_masks[esz];
1915 intptr_t i = words;
1917 do {
1918 uint64_t this_g = g[--i] & mask;
1919 if (this_g) {
1920 return i * 64 + (63 - clz64(this_g));
1922 } while (i > 0);
1923 return (intptr_t)-1 << esz;
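/*
 * Editor's note, worked example: with esz == 2 (word elements) the active
 * flags are spaced four predicate bits apart, so if the highest active bit
 * is bit 20 of word 0 the function returns 20, i.e. the byte offset of
 * element 5.  With no active bits at all the result is -(1 << esz),
 * e.g. -8 for esz == 3 as noted above.
 */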
1926 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1928 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1929 uint32_t flags = PREDTEST_INIT;
1930 uint64_t *d = vd, *g = vg;
1931 intptr_t i = 0;
1933 do {
1934 uint64_t this_d = d[i];
1935 uint64_t this_g = g[i];
1937 if (this_g) {
1938 if (!(flags & 4)) {
1939 /* Set in D the first bit of G. */
1940 this_d |= this_g & -this_g;
1941 d[i] = this_d;
1943 flags = iter_predtest_fwd(this_d, this_g, flags);
1945 } while (++i < words);
1947 return flags;
1950 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1952 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1953 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1954 uint32_t flags = PREDTEST_INIT;
1955 uint64_t *d = vd, *g = vg, esz_mask;
1956 intptr_t i, next;
1958 next = last_active_element(vd, words, esz) + (1 << esz);
1959 esz_mask = pred_esz_masks[esz];
1961 /* Similar to the pseudocode for pnext, but scaled by ESZ
1962 so that we find the correct bit. */
1963 if (next < words * 64) {
1964 uint64_t mask = -1;
1966 if (next & 63) {
1967 mask = ~((1ull << (next & 63)) - 1);
1968 next &= -64;
1970 do {
1971 uint64_t this_g = g[next / 64] & esz_mask & mask;
1972 if (this_g != 0) {
1973 next = (next & -64) + ctz64(this_g);
1974 break;
1976 next += 64;
1977 mask = -1;
1978 } while (next < words * 64);
1981 i = 0;
1982 do {
1983 uint64_t this_d = 0;
1984 if (i == next / 64) {
1985 this_d = 1ull << (next & 63);
1987 d[i] = this_d;
1988 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1989 } while (++i < words);
1991 return flags;
1995 * Copy Zn into Zd, and store zero into inactive elements.
1996 * If inv, store zeros into the active elements.
1998 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
2000 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2001 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2002 uint64_t *d = vd, *n = vn;
2003 uint8_t *pg = vg;
2005 for (i = 0; i < opr_sz; i += 1) {
2006 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
2010 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
2012 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2013 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2014 uint64_t *d = vd, *n = vn;
2015 uint8_t *pg = vg;
2017 for (i = 0; i < opr_sz; i += 1) {
2018 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
2022 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
2024 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2025 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2026 uint64_t *d = vd, *n = vn;
2027 uint8_t *pg = vg;
2029 for (i = 0; i < opr_sz; i += 1) {
2030 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
2034 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
2036 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2037 uint64_t *d = vd, *n = vn;
2038 uint8_t *pg = vg;
2039 uint8_t inv = simd_data(desc);
2041 for (i = 0; i < opr_sz; i += 1) {
2042 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
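/*
 * Editor's note on the expand_pred_*() ^ inv trick used by the _b/_h/_s
 * forms above: per 64-bit word, e.g. for the _b form with pg[i] == 0x05,
 *
 *   mask = expand_pred_b(0x05) = 0x0000000000ff00ff;
 *   d[i] = n[i] & (inv ? ~mask : mask);
 *
 * i.e. inv == 0 keeps the active byte elements and inv == -1 zeroes them.
 */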
2046 /* Three-operand expander, immediate operand, controlled by a predicate.
2048 #define DO_ZPZI(NAME, TYPE, H, OP) \
2049 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2051 intptr_t i, opr_sz = simd_oprsz(desc); \
2052 TYPE imm = simd_data(desc); \
2053 for (i = 0; i < opr_sz; ) { \
2054 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2055 do { \
2056 if (pg & 1) { \
2057 TYPE nn = *(TYPE *)(vn + H(i)); \
2058 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
2060 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2061 } while (i & 15); \
2065 /* Similarly, specialized for 64-bit operands. */
2066 #define DO_ZPZI_D(NAME, TYPE, OP) \
2067 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2069 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2070 TYPE *d = vd, *n = vn; \
2071 TYPE imm = simd_data(desc); \
2072 uint8_t *pg = vg; \
2073 for (i = 0; i < opr_sz; i += 1) { \
2074 if (pg[H1(i)] & 1) { \
2075 TYPE nn = n[i]; \
2076 d[i] = OP(nn, imm); \
2081 #define DO_SHR(N, M) (N >> M)
2082 #define DO_SHL(N, M) (N << M)
2084 /* Arithmetic shift right for division. This rounds negative numbers
2085 toward zero as per signed division. Therefore before shifting,
2086 when N is negative, add 2**M-1. */
2087 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
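/*
 * Editor's note, worked example: DO_ASRD(-7, 2) first adds (1 << 2) - 1 = 3,
 * giving -4, then shifts right by 2 to get -1, which matches -7 / 4 truncated
 * toward zero.  A plain arithmetic shift would give -7 >> 2 == -2.
 */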
2089 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
2091 if (likely(sh < 64)) {
2092 return (x >> sh) + ((x >> (sh - 1)) & 1);
2093 } else if (sh == 64) {
2094 return x >> 63;
2095 } else {
2096 return 0;
2100 static inline int64_t do_srshr(int64_t x, unsigned sh)
2102 if (likely(sh < 64)) {
2103 return (x >> sh) + ((x >> (sh - 1)) & 1);
2104 } else {
2105 /* Rounding the sign bit always produces 0. */
2106 return 0;
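/*
 * Editor's note, worked example of the rounding shifts above:
 * do_urshr(7, 2) == (7 >> 2) + ((7 >> 1) & 1) == 1 + 1 == 2, i.e. 7/4
 * rounded to nearest.  For sh == 64 only the rounding bit (bit 63) can
 * contribute, hence the "x >> 63" case in do_urshr().
 */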
2110 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2111 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2112 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2113 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2115 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2116 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2117 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2118 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2120 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2121 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2122 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2123 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2125 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2126 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2127 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2128 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2130 /* SVE2 bitwise shift by immediate */
2131 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2132 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2133 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2134 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2136 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2137 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2138 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2139 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2141 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2142 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2143 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2144 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2146 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2147 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2148 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2149 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2151 #define do_suqrshl_b(n, m) \
2152 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2153 #define do_suqrshl_h(n, m) \
2154 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2155 #define do_suqrshl_s(n, m) \
2156 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2157 #define do_suqrshl_d(n, m) \
2158 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2160 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2161 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2162 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2163 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2165 #undef DO_ASRD
2166 #undef DO_ZPZI
2167 #undef DO_ZPZI_D
2169 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2170 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2172 intptr_t i, opr_sz = simd_oprsz(desc); \
2173 int shift = simd_data(desc); \
2174 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2175 TYPEW nn = *(TYPEW *)(vn + i); \
2176 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \
2180 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
2181 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2183 intptr_t i, opr_sz = simd_oprsz(desc); \
2184 int shift = simd_data(desc); \
2185 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2186 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2187 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \
2191 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2192 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2193 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2195 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2196 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2197 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
2199 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2200 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2201 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2203 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2204 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2205 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2207 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2208 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2209 #define DO_SQSHRUN_D(x, sh) \
2210 do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2212 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2213 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2214 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2216 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2217 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2218 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2220 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2221 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2222 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2224 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2225 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2226 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2228 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2229 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2230 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2232 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2233 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2234 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2236 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2237 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2238 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2240 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2241 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2242 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2244 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2245 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2246 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2248 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2249 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2250 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2252 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2253 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2254 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2256 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2257 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2258 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2260 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2261 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2262 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2264 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2265 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2266 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2268 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2269 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2270 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2272 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2273 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2274 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2276 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2277 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2278 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2280 #undef DO_SHRNB
2281 #undef DO_SHRNT
2283 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \
2284 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2286 intptr_t i, opr_sz = simd_oprsz(desc); \
2287 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2288 TYPEW nn = *(TYPEW *)(vn + i); \
2289 TYPEW mm = *(TYPEW *)(vm + i); \
2290 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \
2294 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \
2295 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2297 intptr_t i, opr_sz = simd_oprsz(desc); \
2298 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2299 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2300 TYPEW mm = *(TYPEW *)(vm + HW(i)); \
2301 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \
2305 #define DO_ADDHN(N, M, SH) ((N + M) >> SH)
2306 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2307 #define DO_SUBHN(N, M, SH) ((N - M) >> SH)
2308 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
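/*
 * Editor's note, worked example: for the 16-to-8-bit forms SH == 8, so
 * DO_RADDHN(0x1234, 0x0101, 8) == (0x1335 + 0x80) >> 8 == 0x13, i.e. the
 * high half of the sum with rounding; DO_ADDHN omits the rounding constant.
 */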
2310 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2311 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2312 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2314 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2315 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2316 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2318 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2319 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2320 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2322 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2323 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2324 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2326 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2327 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2328 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2330 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2331 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2332 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2334 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2335 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2336 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2338 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2339 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2340 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2342 #undef DO_RSUBHN
2343 #undef DO_SUBHN
2344 #undef DO_RADDHN
2345 #undef DO_ADDHN
2347 #undef DO_BINOPNB
2349 /* Fully general four-operand expander, controlled by a predicate.
2351 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
2352 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2353 void *vg, uint32_t desc) \
2355 intptr_t i, opr_sz = simd_oprsz(desc); \
2356 for (i = 0; i < opr_sz; ) { \
2357 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2358 do { \
2359 if (pg & 1) { \
2360 TYPE nn = *(TYPE *)(vn + H(i)); \
2361 TYPE mm = *(TYPE *)(vm + H(i)); \
2362 TYPE aa = *(TYPE *)(va + H(i)); \
2363 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
2365 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2366 } while (i & 15); \
2370 /* Similarly, specialized for 64-bit operands. */
2371 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
2372 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2373 void *vg, uint32_t desc) \
2375 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2376 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
2377 uint8_t *pg = vg; \
2378 for (i = 0; i < opr_sz; i += 1) { \
2379 if (pg[H1(i)] & 1) { \
2380 TYPE aa = a[i], nn = n[i], mm = m[i]; \
2381 d[i] = OP(aa, nn, mm); \
2386 #define DO_MLA(A, N, M) (A + N * M)
2387 #define DO_MLS(A, N, M) (A - N * M)
2389 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2390 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2392 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2393 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2395 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2396 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2398 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2399 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2401 #undef DO_MLA
2402 #undef DO_MLS
2403 #undef DO_ZPZZZ
2404 #undef DO_ZPZZZ_D
2406 void HELPER(sve_index_b)(void *vd, uint32_t start,
2407 uint32_t incr, uint32_t desc)
2409 intptr_t i, opr_sz = simd_oprsz(desc);
2410 uint8_t *d = vd;
2411 for (i = 0; i < opr_sz; i += 1) {
2412 d[H1(i)] = start + i * incr;
2416 void HELPER(sve_index_h)(void *vd, uint32_t start,
2417 uint32_t incr, uint32_t desc)
2419 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2420 uint16_t *d = vd;
2421 for (i = 0; i < opr_sz; i += 1) {
2422 d[H2(i)] = start + i * incr;
2426 void HELPER(sve_index_s)(void *vd, uint32_t start,
2427 uint32_t incr, uint32_t desc)
2429 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2430 uint32_t *d = vd;
2431 for (i = 0; i < opr_sz; i += 1) {
2432 d[H4(i)] = start + i * incr;
2436 void HELPER(sve_index_d)(void *vd, uint64_t start,
2437 uint64_t incr, uint32_t desc)
2439 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2440 uint64_t *d = vd;
2441 for (i = 0; i < opr_sz; i += 1) {
2442 d[i] = start + i * incr;
2446 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2448 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2449 uint32_t sh = simd_data(desc);
2450 uint32_t *d = vd, *n = vn, *m = vm;
2451 for (i = 0; i < opr_sz; i += 1) {
2452 d[i] = n[i] + (m[i] << sh);
2456 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2458 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2459 uint64_t sh = simd_data(desc);
2460 uint64_t *d = vd, *n = vn, *m = vm;
2461 for (i = 0; i < opr_sz; i += 1) {
2462 d[i] = n[i] + (m[i] << sh);
2466 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2468 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2469 uint64_t sh = simd_data(desc);
2470 uint64_t *d = vd, *n = vn, *m = vm;
2471 for (i = 0; i < opr_sz; i += 1) {
2472 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2476 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2478 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2479 uint64_t sh = simd_data(desc);
2480 uint64_t *d = vd, *n = vn, *m = vm;
2481 for (i = 0; i < opr_sz; i += 1) {
2482 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2486 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2488 /* These constants are cut and pasted directly from the ARM pseudocode. */
2489 static const uint16_t coeff[] = {
2490 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2491 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2492 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2493 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2495 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2496 uint16_t *d = vd, *n = vn;
2498 for (i = 0; i < opr_sz; i++) {
2499 uint16_t nn = n[i];
2500 intptr_t idx = extract32(nn, 0, 5);
2501 uint16_t exp = extract32(nn, 5, 5);
2502 d[i] = coeff[idx] | (exp << 10);
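/*
 * Editor's note, worked example for the half-precision form: nn == 0x0063
 * has idx == 3 and exp == 3, so the result is coeff[3] | (3 << 10)
 * == 0x0045 | 0x0c00 == 0x0c45; the table supplies the fraction bits and
 * the input's upper field becomes the exponent.
 */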
2506 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2508 /* These constants are cut and pasted directly from the ARM pseudocode. */
2509 static const uint32_t coeff[] = {
2510 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2511 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2512 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2513 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2514 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2515 0x1ef532, 0x20b051, 0x227043, 0x243516,
2516 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2517 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2518 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2519 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2520 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2521 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2522 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2523 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2524 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2525 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2527 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2528 uint32_t *d = vd, *n = vn;
2530 for (i = 0; i < opr_sz; i++) {
2531 uint32_t nn = n[i];
2532 intptr_t idx = extract32(nn, 0, 6);
2533 uint32_t exp = extract32(nn, 6, 8);
2534 d[i] = coeff[idx] | (exp << 23);
2538 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2540 /* These constants are cut and pasted directly from the ARM pseudocode. */
2541 static const uint64_t coeff[] = {
2542 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2543 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2544 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2545 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2546 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2547 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2548 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2549 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2550 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2551 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2552 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2553 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2554 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2555 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2556 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2557 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2558 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2559 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2560 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2561 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2562 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2563 0xFA7C1819E90D8ull,
2565 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2566 uint64_t *d = vd, *n = vn;
2568 for (i = 0; i < opr_sz; i++) {
2569 uint64_t nn = n[i];
2570 intptr_t idx = extract32(nn, 0, 6);
2571 uint64_t exp = extract32(nn, 6, 11);
2572 d[i] = coeff[idx] | (exp << 52);
2576 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2578 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2579 uint16_t *d = vd, *n = vn, *m = vm;
2580 for (i = 0; i < opr_sz; i += 1) {
2581 uint16_t nn = n[i];
2582 uint16_t mm = m[i];
2583 if (mm & 1) {
2584 nn = float16_one;
2586 d[i] = nn ^ (mm & 2) << 14;
2590 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2592 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2593 uint32_t *d = vd, *n = vn, *m = vm;
2594 for (i = 0; i < opr_sz; i += 1) {
2595 uint32_t nn = n[i];
2596 uint32_t mm = m[i];
2597 if (mm & 1) {
2598 nn = float32_one;
2600 d[i] = nn ^ (mm & 2) << 30;
2604 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2606 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2607 uint64_t *d = vd, *n = vn, *m = vm;
2608 for (i = 0; i < opr_sz; i += 1) {
2609 uint64_t nn = n[i];
2610 uint64_t mm = m[i];
2611 if (mm & 1) {
2612 nn = float64_one;
2614 d[i] = nn ^ (mm & 2) << 62;
2619 * Signed saturating addition with scalar operand.
2622 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2624 intptr_t i, oprsz = simd_oprsz(desc);
2626 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2627 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2631 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2633 intptr_t i, oprsz = simd_oprsz(desc);
2635 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2636 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2640 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2642 intptr_t i, oprsz = simd_oprsz(desc);
2644 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2645 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2649 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2651 intptr_t i, oprsz = simd_oprsz(desc);
2653 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2654 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2659 * Unsigned saturating addition with scalar operand.
2662 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2664 intptr_t i, oprsz = simd_oprsz(desc);
2666 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2667 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2671 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2673 intptr_t i, oprsz = simd_oprsz(desc);
2675 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2676 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2680 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2682 intptr_t i, oprsz = simd_oprsz(desc);
2684 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2685 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2689 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2691 intptr_t i, oprsz = simd_oprsz(desc);
2693 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2694 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2698 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2700 intptr_t i, oprsz = simd_oprsz(desc);
2702 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2703 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2707 /* Two operand predicated copy immediate with merge. All valid immediates
2708 * can fit within 17 signed bits in the simd_data field.
2710 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2711 uint64_t mm, uint32_t desc)
2713 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2714 uint64_t *d = vd, *n = vn;
2715 uint8_t *pg = vg;
2717 mm = dup_const(MO_8, mm);
2718 for (i = 0; i < opr_sz; i += 1) {
2719 uint64_t nn = n[i];
2720 uint64_t pp = expand_pred_b(pg[H1(i)]);
2721 d[i] = (mm & pp) | (nn & ~pp);
2725 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2726 uint64_t mm, uint32_t desc)
2728 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2729 uint64_t *d = vd, *n = vn;
2730 uint8_t *pg = vg;
2732 mm = dup_const(MO_16, mm);
2733 for (i = 0; i < opr_sz; i += 1) {
2734 uint64_t nn = n[i];
2735 uint64_t pp = expand_pred_h(pg[H1(i)]);
2736 d[i] = (mm & pp) | (nn & ~pp);
2740 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2741 uint64_t mm, uint32_t desc)
2743 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2744 uint64_t *d = vd, *n = vn;
2745 uint8_t *pg = vg;
2747 mm = dup_const(MO_32, mm);
2748 for (i = 0; i < opr_sz; i += 1) {
2749 uint64_t nn = n[i];
2750 uint64_t pp = expand_pred_s(pg[H1(i)]);
2751 d[i] = (mm & pp) | (nn & ~pp);
2755 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2756 uint64_t mm, uint32_t desc)
2758 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2759 uint64_t *d = vd, *n = vn;
2760 uint8_t *pg = vg;
2762 for (i = 0; i < opr_sz; i += 1) {
2763 uint64_t nn = n[i];
2764 d[i] = (pg[H1(i)] & 1 ? mm : nn);
2768 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2770 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2771 uint64_t *d = vd;
2772 uint8_t *pg = vg;
2774 val = dup_const(MO_8, val);
2775 for (i = 0; i < opr_sz; i += 1) {
2776 d[i] = val & expand_pred_b(pg[H1(i)]);
2780 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2782 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2783 uint64_t *d = vd;
2784 uint8_t *pg = vg;
2786 val = dup_const(MO_16, val);
2787 for (i = 0; i < opr_sz; i += 1) {
2788 d[i] = val & expand_pred_h(pg[H1(i)]);
2792 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2794 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2795 uint64_t *d = vd;
2796 uint8_t *pg = vg;
2798 val = dup_const(MO_32, val);
2799 for (i = 0; i < opr_sz; i += 1) {
2800 d[i] = val & expand_pred_s(pg[H1(i)]);
2804 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2806 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2807 uint64_t *d = vd;
2808 uint8_t *pg = vg;
2810 for (i = 0; i < opr_sz; i += 1) {
2811 d[i] = (pg[H1(i)] & 1 ? val : 0);
2815 /* Big-endian hosts need to frob the byte indices. If the copy
2816 * happens to be 8-byte aligned, then no frobbing is necessary.
2818 static void swap_memmove(void *vd, void *vs, size_t n)
2820 uintptr_t d = (uintptr_t)vd;
2821 uintptr_t s = (uintptr_t)vs;
2822 uintptr_t o = (d | s | n) & 7;
2823 size_t i;
2825 #ifndef HOST_WORDS_BIGENDIAN
2826 o = 0;
2827 #endif
2828 switch (o) {
2829 case 0:
2830 memmove(vd, vs, n);
2831 break;
2833 case 4:
2834 if (d < s || d >= s + n) {
2835 for (i = 0; i < n; i += 4) {
2836 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2838 } else {
2839 for (i = n; i > 0; ) {
2840 i -= 4;
2841 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2844 break;
2846 case 2:
2847 case 6:
2848 if (d < s || d >= s + n) {
2849 for (i = 0; i < n; i += 2) {
2850 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2852 } else {
2853 for (i = n; i > 0; ) {
2854 i -= 2;
2855 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2858 break;
2860 default:
2861 if (d < s || d >= s + n) {
2862 for (i = 0; i < n; i++) {
2863 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2865 } else {
2866 for (i = n; i > 0; ) {
2867 i -= 1;
2868 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2871 break;
2875 /* Similarly for memset of 0. */
2876 static void swap_memzero(void *vd, size_t n)
2878 uintptr_t d = (uintptr_t)vd;
2879 uintptr_t o = (d | n) & 7;
2880 size_t i;
2882 /* Usually, the first bit of a predicate is set, so N is 0. */
2883 if (likely(n == 0)) {
2884 return;
2887 #ifndef HOST_WORDS_BIGENDIAN
2888 o = 0;
2889 #endif
2890 switch (o) {
2891 case 0:
2892 memset(vd, 0, n);
2893 break;
2895 case 4:
2896 for (i = 0; i < n; i += 4) {
2897 *(uint32_t *)H1_4(d + i) = 0;
2899 break;
2901 case 2:
2902 case 6:
2903 for (i = 0; i < n; i += 2) {
2904 *(uint16_t *)H1_2(d + i) = 0;
2906 break;
2908 default:
2909 for (i = 0; i < n; i++) {
2910 *(uint8_t *)H1(d + i) = 0;
2912 break;
2916 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2918 intptr_t opr_sz = simd_oprsz(desc);
2919 size_t n_ofs = simd_data(desc);
2920 size_t n_siz = opr_sz - n_ofs;
2922 if (vd != vm) {
2923 swap_memmove(vd, vn + n_ofs, n_siz);
2924 swap_memmove(vd + n_siz, vm, n_ofs);
2925 } else if (vd != vn) {
2926 swap_memmove(vd + n_siz, vd, n_ofs);
2927 swap_memmove(vd, vn + n_ofs, n_siz);
2928 } else {
2929 /* vd == vn == vm. Need temp space. */
2930 ARMVectorReg tmp;
2931 swap_memmove(&tmp, vm, n_ofs);
2932 swap_memmove(vd, vd + n_ofs, n_siz);
2933 memcpy(vd + n_siz, &tmp, n_ofs);
2937 #define DO_INSR(NAME, TYPE, H) \
2938 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2940 intptr_t opr_sz = simd_oprsz(desc); \
2941 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
2942 *(TYPE *)(vd + H(0)) = val; \
2945 DO_INSR(sve_insr_b, uint8_t, H1)
2946 DO_INSR(sve_insr_h, uint16_t, H1_2)
2947 DO_INSR(sve_insr_s, uint32_t, H1_4)
2948 DO_INSR(sve_insr_d, uint64_t, H1_8)
2950 #undef DO_INSR
2952 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2954 intptr_t i, j, opr_sz = simd_oprsz(desc);
2955 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2956 uint64_t f = *(uint64_t *)(vn + i);
2957 uint64_t b = *(uint64_t *)(vn + j);
2958 *(uint64_t *)(vd + i) = bswap64(b);
2959 *(uint64_t *)(vd + j) = bswap64(f);
2963 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2965 intptr_t i, j, opr_sz = simd_oprsz(desc);
2966 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2967 uint64_t f = *(uint64_t *)(vn + i);
2968 uint64_t b = *(uint64_t *)(vn + j);
2969 *(uint64_t *)(vd + i) = hswap64(b);
2970 *(uint64_t *)(vd + j) = hswap64(f);
2974 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2976 intptr_t i, j, opr_sz = simd_oprsz(desc);
2977 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2978 uint64_t f = *(uint64_t *)(vn + i);
2979 uint64_t b = *(uint64_t *)(vn + j);
2980 *(uint64_t *)(vd + i) = rol64(b, 32);
2981 *(uint64_t *)(vd + j) = rol64(f, 32);
2985 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2987 intptr_t i, j, opr_sz = simd_oprsz(desc);
2988 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2989 uint64_t f = *(uint64_t *)(vn + i);
2990 uint64_t b = *(uint64_t *)(vn + j);
2991 *(uint64_t *)(vd + i) = b;
2992 *(uint64_t *)(vd + j) = f;
2996 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2998 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2999 bool is_tbx, tb_impl_fn *fn)
3001 ARMVectorReg scratch;
3002 uintptr_t oprsz = simd_oprsz(desc);
3004 if (unlikely(vd == vn)) {
3005 vn = memcpy(&scratch, vn, oprsz);
3008 fn(vd, vn, NULL, vm, oprsz, is_tbx);
3011 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
3012 uint32_t desc, bool is_tbx, tb_impl_fn *fn)
3014 ARMVectorReg scratch;
3015 uintptr_t oprsz = simd_oprsz(desc);
3017 if (unlikely(vd == vn0)) {
3018 vn0 = memcpy(&scratch, vn0, oprsz);
3019 if (vd == vn1) {
3020 vn1 = vn0;
3022 } else if (unlikely(vd == vn1)) {
3023 vn1 = memcpy(&scratch, vn1, oprsz);
3026 fn(vd, vn0, vn1, vm, oprsz, is_tbx);
3029 #define DO_TB(SUFF, TYPE, H) \
3030 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \
3031 void *vm, uintptr_t oprsz, bool is_tbx) \
3033 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \
3034 uintptr_t i, nelem = oprsz / sizeof(TYPE); \
3035 for (i = 0; i < nelem; ++i) { \
3036 TYPE index = indexes[H1(i)], val = 0; \
3037 if (index < nelem) { \
3038 val = tbl0[H(index)]; \
3039 } else { \
3040 index -= nelem; \
3041 if (tbl1 && index < nelem) { \
3042 val = tbl1[H(index)]; \
3043 } else if (is_tbx) { \
3044 continue; \
3047 d[H(i)] = val; \
3050 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3052 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \
3054 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \
3055 void *vm, uint32_t desc) \
3057 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \
3059 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3061 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \
3064 DO_TB(b, uint8_t, H1)
3065 DO_TB(h, uint16_t, H2)
3066 DO_TB(s, uint32_t, H4)
3067 DO_TB(d, uint64_t, H8)
3069 #undef DO_TB
3071 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3072 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
3074 intptr_t i, opr_sz = simd_oprsz(desc); \
3075 TYPED *d = vd; \
3076 TYPES *n = vn; \
3077 ARMVectorReg tmp; \
3078 if (unlikely(vn - vd < opr_sz)) { \
3079 n = memcpy(&tmp, n, opr_sz / 2); \
3081 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
3082 d[HD(i)] = n[HS(i)]; \
3086 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3087 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3088 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3090 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3091 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3092 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3094 #undef DO_UNPK
3096 /* Mask of bits included in the even numbered predicates of width esz.
3097 * We also use this for expand_bits/compress_bits, and so extend the
3098 * same pattern out to 16-bit units.
3100 static const uint64_t even_bit_esz_masks[5] = {
3101 0x5555555555555555ull,
3102 0x3333333333333333ull,
3103 0x0f0f0f0f0f0f0f0full,
3104 0x00ff00ff00ff00ffull,
3105 0x0000ffff0000ffffull,
3108 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3109 * For N==0, this corresponds to the operation that in qemu/bitops.h
3110 * we call half_shuffle64; this algorithm is from Hacker's Delight,
3111 * section 7-2 Shuffling Bits.
3113 static uint64_t expand_bits(uint64_t x, int n)
3115 int i;
3117 x &= 0xffffffffu;
3118 for (i = 4; i >= n; i--) {
3119 int sh = 1 << i;
3120 x = ((x << sh) | x) & even_bit_esz_masks[i];
3122 return x;
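/*
 * Editor's note: an equivalent (but slower) formulation of expand_bits(),
 * in the style of the generator-loop comments elsewhere in this file:
 *
 *   for (j = 0; j < 32; j += 1 << n) {
 *       r |= ((x >> j) & ((1ull << (1 << n)) - 1)) << (2 * j);
 *   }
 *
 * e.g. expand_bits(0xb, 0) == 0x45: bit i of the input lands in bit 2*i.
 */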
3125 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3126 * For N==0, this corresponds to the operation that in qemu/bitops.h
3127 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3128 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3130 static uint64_t compress_bits(uint64_t x, int n)
3132 int i;
3134 for (i = n; i <= 4; i++) {
3135 int sh = 1 << i;
3136 x &= even_bit_esz_masks[i];
3137 x = (x >> sh) | x;
3139 return x & 0xffffffffu;
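/*
 * Editor's note: compress_bits() inverts the above, e.g.
 * compress_bits(0x45, 0) == 0xb.  Stray bits in the discarded (odd)
 * positions are harmless, since the first loop iteration masks them away.
 */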
3142 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3144 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3145 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3146 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3147 int esize = 1 << esz;
3148 uint64_t *d = vd;
3149 intptr_t i;
3151 if (oprsz <= 8) {
3152 uint64_t nn = *(uint64_t *)vn;
3153 uint64_t mm = *(uint64_t *)vm;
3154 int half = 4 * oprsz;
3156 nn = extract64(nn, high * half, half);
3157 mm = extract64(mm, high * half, half);
3158 nn = expand_bits(nn, esz);
3159 mm = expand_bits(mm, esz);
3160 d[0] = nn | (mm << esize);
3161 } else {
3162 ARMPredicateReg tmp;
3164 /* We produce output faster than we consume input.
3165 Therefore we must be mindful of possible overlap. */
3166 if (vd == vn) {
3167 vn = memcpy(&tmp, vn, oprsz);
3168 if (vd == vm) {
3169 vm = vn;
3171 } else if (vd == vm) {
3172 vm = memcpy(&tmp, vm, oprsz);
3174 if (high) {
3175 high = oprsz >> 1;
3178 if ((oprsz & 7) == 0) {
3179 uint32_t *n = vn, *m = vm;
3180 high >>= 2;
3182 for (i = 0; i < oprsz / 8; i++) {
3183 uint64_t nn = n[H4(high + i)];
3184 uint64_t mm = m[H4(high + i)];
3186 nn = expand_bits(nn, esz);
3187 mm = expand_bits(mm, esz);
3188 d[i] = nn | (mm << esize);
3190 } else {
3191 uint8_t *n = vn, *m = vm;
3192 uint16_t *d16 = vd;
3194 for (i = 0; i < oprsz / 2; i++) {
3195 uint16_t nn = n[H1(high + i)];
3196 uint16_t mm = m[H1(high + i)];
3198 nn = expand_bits(nn, esz);
3199 mm = expand_bits(mm, esz);
3200 d16[H2(i)] = nn | (mm << esize);
3206 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3208 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3209 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3210 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3211 uint64_t *d = vd, *n = vn, *m = vm;
3212 uint64_t l, h;
3213 intptr_t i;
3215 if (oprsz <= 8) {
3216 l = compress_bits(n[0] >> odd, esz);
3217 h = compress_bits(m[0] >> odd, esz);
3218 d[0] = l | (h << (4 * oprsz));
3219 } else {
3220 ARMPredicateReg tmp_m;
3221 intptr_t oprsz_16 = oprsz / 16;
3223 if ((vm - vd) < (uintptr_t)oprsz) {
3224 m = memcpy(&tmp_m, vm, oprsz);
3227 for (i = 0; i < oprsz_16; i++) {
3228 l = n[2 * i + 0];
3229 h = n[2 * i + 1];
3230 l = compress_bits(l >> odd, esz);
3231 h = compress_bits(h >> odd, esz);
3232 d[i] = l | (h << 32);
3236 * For a VL which is not a multiple of 512, the results from M do not
3237 * align nicely with the uint64_t for D. Put the aligned results
3238 * from M into TMP_M and then copy it into place afterward.
3240 if (oprsz & 15) {
3241 int final_shift = (oprsz & 15) * 2;
3243 l = n[2 * i + 0];
3244 h = n[2 * i + 1];
3245 l = compress_bits(l >> odd, esz);
3246 h = compress_bits(h >> odd, esz);
3247 d[i] = l | (h << final_shift);
3249 for (i = 0; i < oprsz_16; i++) {
3250 l = m[2 * i + 0];
3251 h = m[2 * i + 1];
3252 l = compress_bits(l >> odd, esz);
3253 h = compress_bits(h >> odd, esz);
3254 tmp_m.p[i] = l | (h << 32);
3256 l = m[2 * i + 0];
3257 h = m[2 * i + 1];
3258 l = compress_bits(l >> odd, esz);
3259 h = compress_bits(h >> odd, esz);
3260 tmp_m.p[i] = l | (h << final_shift);
3262 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3263 } else {
3264 for (i = 0; i < oprsz_16; i++) {
3265 l = m[2 * i + 0];
3266 h = m[2 * i + 1];
3267 l = compress_bits(l >> odd, esz);
3268 h = compress_bits(h >> odd, esz);
3269 d[oprsz_16 + i] = l | (h << 32);
3275 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3277 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3278 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3279 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3280 uint64_t *d = vd, *n = vn, *m = vm;
3281 uint64_t mask;
3282 int shr, shl;
3283 intptr_t i;
3285 shl = 1 << esz;
3286 shr = 0;
3287 mask = even_bit_esz_masks[esz];
3288 if (odd) {
3289 mask <<= shl;
3290 shr = shl;
3291 shl = 0;
3294 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3295 uint64_t nn = (n[i] & mask) >> shr;
3296 uint64_t mm = (m[i] & mask) << shl;
3297 d[i] = nn + mm;
3301 /* Reverse units of 2**N bits. */
3302 static uint64_t reverse_bits_64(uint64_t x, int n)
3304 int i, sh;
3306 x = bswap64(x);
3307 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3308 uint64_t mask = even_bit_esz_masks[i];
3309 x = ((x & mask) << sh) | ((x >> sh) & mask);
3311 return x;
3314 static uint8_t reverse_bits_8(uint8_t x, int n)
3316 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3317 int i, sh;
3319 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3320 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3322 return x;
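/*
 * Editor's note, worked examples: reverse_bits_8(0x01, 0) == 0x80 (full bit
 * reversal), while reverse_bits_8(0x12, 2) == 0x21 (only the two 4-bit
 * units are swapped).  reverse_bits_64() follows the same pattern after an
 * initial byte swap.
 */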
3325 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3327 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3328 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3329 intptr_t i, oprsz_2 = oprsz / 2;
3331 if (oprsz <= 8) {
3332 uint64_t l = *(uint64_t *)vn;
3333 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3334 *(uint64_t *)vd = l;
3335 } else if ((oprsz & 15) == 0) {
3336 for (i = 0; i < oprsz_2; i += 8) {
3337 intptr_t ih = oprsz - 8 - i;
3338 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3339 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3340 *(uint64_t *)(vd + i) = h;
3341 *(uint64_t *)(vd + ih) = l;
3343 } else {
3344 for (i = 0; i < oprsz_2; i += 1) {
3345 intptr_t il = H1(i);
3346 intptr_t ih = H1(oprsz - 1 - i);
3347 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3348 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3349 *(uint8_t *)(vd + il) = h;
3350 *(uint8_t *)(vd + ih) = l;
3355 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3357 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3358 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3359 uint64_t *d = vd;
3360 intptr_t i;
3362 if (oprsz <= 8) {
3363 uint64_t nn = *(uint64_t *)vn;
3364 int half = 4 * oprsz;
3366 nn = extract64(nn, high * half, half);
3367 nn = expand_bits(nn, 0);
3368 d[0] = nn;
3369 } else {
3370 ARMPredicateReg tmp_n;
3372 /* We produce output faster than we consume input.
3373 Therefore we must be mindful of possible overlap. */
3374 if ((vn - vd) < (uintptr_t)oprsz) {
3375 vn = memcpy(&tmp_n, vn, oprsz);
3377 if (high) {
3378 high = oprsz >> 1;
3381 if ((oprsz & 7) == 0) {
3382 uint32_t *n = vn;
3383 high >>= 2;
3385 for (i = 0; i < oprsz / 8; i++) {
3386 uint64_t nn = n[H4(high + i)];
3387 d[i] = expand_bits(nn, 0);
3389 } else {
3390 uint16_t *d16 = vd;
3391 uint8_t *n = vn;
3393 for (i = 0; i < oprsz / 2; i++) {
3394 uint16_t nn = n[H1(high + i)];
3395 d16[H2(i)] = expand_bits(nn, 0);
3401 #define DO_ZIP(NAME, TYPE, H) \
3402 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3404 intptr_t oprsz = simd_oprsz(desc); \
3405 intptr_t i, oprsz_2 = oprsz / 2; \
3406 ARMVectorReg tmp_n, tmp_m; \
3407 /* We produce output faster than we consume input. \
3408 Therefore we must be mindful of possible overlap. */ \
3409 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
3410 vn = memcpy(&tmp_n, vn, oprsz_2); \
3412 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3413 vm = memcpy(&tmp_m, vm, oprsz_2); \
3415 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3416 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
3417 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
3419 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3420 memset(vd + oprsz - 16, 0, 16); \
3424 DO_ZIP(sve_zip_b, uint8_t, H1)
3425 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3426 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3427 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3428 DO_ZIP(sve2_zip_q, Int128, )
3430 #define DO_UZP(NAME, TYPE, H) \
3431 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3433 intptr_t oprsz = simd_oprsz(desc); \
3434 intptr_t odd_ofs = simd_data(desc); \
3435 intptr_t i, p; \
3436 ARMVectorReg tmp_m; \
3437 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3438 vm = memcpy(&tmp_m, vm, oprsz); \
3440 i = 0, p = odd_ofs; \
3441 do { \
3442 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \
3443 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
3444 } while (p < oprsz); \
3445 p -= oprsz; \
3446 do { \
3447 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \
3448 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
3449 } while (p < oprsz); \
3450 tcg_debug_assert(i == oprsz); \
3453 DO_UZP(sve_uzp_b, uint8_t, H1)
3454 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3455 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3456 DO_UZP(sve_uzp_d, uint64_t, H1_8)
3457 DO_UZP(sve2_uzp_q, Int128, )
3459 #define DO_TRN(NAME, TYPE, H) \
3460 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3462 intptr_t oprsz = simd_oprsz(desc); \
3463 intptr_t odd_ofs = simd_data(desc); \
3464 intptr_t i; \
3465 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
3466 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
3467 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
3468 *(TYPE *)(vd + H(i + 0)) = ae; \
3469 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
3471 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3472 memset(vd + oprsz - 16, 0, 16); \
3476 DO_TRN(sve_trn_b, uint8_t, H1)
3477 DO_TRN(sve_trn_h, uint16_t, H1_2)
3478 DO_TRN(sve_trn_s, uint32_t, H1_4)
3479 DO_TRN(sve_trn_d, uint64_t, H1_8)
3480 DO_TRN(sve2_trn_q, Int128, )
3482 #undef DO_ZIP
3483 #undef DO_UZP
3484 #undef DO_TRN
3486 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3488 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3489 uint32_t *d = vd, *n = vn;
3490 uint8_t *pg = vg;
3492 for (i = j = 0; i < opr_sz; i++) {
3493 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3494 d[H4(j)] = n[H4(i)];
3495 j++;
3498 for (; j < opr_sz; j++) {
3499 d[H4(j)] = 0;
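/*
 * Editor's note: a predicate carries one bit per byte of the vector, so
 * for 32-bit elements the flag for element i sits in predicate byte i / 2
 * at bit 0 or bit 4, which is what the 0x01 / 0x10 test above selects.
 * For 64-bit elements (below) bit 0 of predicate byte i is sufficient.
 */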
3503 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3505 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3506 uint64_t *d = vd, *n = vn;
3507 uint8_t *pg = vg;
3509 for (i = j = 0; i < opr_sz; i++) {
3510 if (pg[H1(i)] & 1) {
3511 d[j] = n[i];
3512 j++;
3515 for (; j < opr_sz; j++) {
3516 d[j] = 0;
3520 /* Similar to the ARM LastActiveElement pseudocode function, except the
3521 * result is multiplied by the element size. This includes the not found
3522 * indication; e.g. not found for esz=3 is -8.
3524 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3526 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3527 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3529 return last_active_element(vg, words, esz);
3532 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3534 intptr_t opr_sz = simd_oprsz(desc) / 8;
3535 int esz = simd_data(desc);
3536 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3537 intptr_t i, first_i, last_i;
3538 ARMVectorReg tmp;
3540 first_i = last_i = 0;
3541 first_g = last_g = 0;
3543 /* Find the extent of the active elements within VG. */
3544 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3545 pg = *(uint64_t *)(vg + i) & mask;
3546 if (pg) {
3547 if (last_g == 0) {
3548 last_g = pg;
3549 last_i = i;
3551 first_g = pg;
3552 first_i = i;
3556 len = 0;
3557 if (first_g != 0) {
3558 first_i = first_i * 8 + ctz64(first_g);
3559 last_i = last_i * 8 + 63 - clz64(last_g);
3560 len = last_i - first_i + (1 << esz);
3561 if (vd == vm) {
3562 vm = memcpy(&tmp, vm, opr_sz * 8);
3564 swap_memmove(vd, vn + first_i, len);
3566 swap_memmove(vd + len, vm, opr_sz * 8 - len);
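/* SEL: for each element choose Zn where the governing predicate is true,
 * else Zm.  For sub-doubleword elements the per-element predicate bits are
 * first expanded to a full byte mask (expand_pred_b/h/s), which reduces
 * the select to a bitwise mux on 64-bit words:
 *
 *     d = (n & mask) | (m & ~mask);
 */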
3569 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3570 void *vg, uint32_t desc)
3572 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3573 uint64_t *d = vd, *n = vn, *m = vm;
3574 uint8_t *pg = vg;
3576 for (i = 0; i < opr_sz; i += 1) {
3577 uint64_t nn = n[i], mm = m[i];
3578 uint64_t pp = expand_pred_b(pg[H1(i)]);
3579 d[i] = (nn & pp) | (mm & ~pp);
3583 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3584 void *vg, uint32_t desc)
3586 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3587 uint64_t *d = vd, *n = vn, *m = vm;
3588 uint8_t *pg = vg;
3590 for (i = 0; i < opr_sz; i += 1) {
3591 uint64_t nn = n[i], mm = m[i];
3592 uint64_t pp = expand_pred_h(pg[H1(i)]);
3593 d[i] = (nn & pp) | (mm & ~pp);
3597 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3598 void *vg, uint32_t desc)
3600 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3601 uint64_t *d = vd, *n = vn, *m = vm;
3602 uint8_t *pg = vg;
3604 for (i = 0; i < opr_sz; i += 1) {
3605 uint64_t nn = n[i], mm = m[i];
3606 uint64_t pp = expand_pred_s(pg[H1(i)]);
3607 d[i] = (nn & pp) | (mm & ~pp);
3611 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3612 void *vg, uint32_t desc)
3614 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3615 uint64_t *d = vd, *n = vn, *m = vm;
3616 uint8_t *pg = vg;
3618 for (i = 0; i < opr_sz; i += 1) {
3619 uint64_t nn = n[i], mm = m[i];
3620 d[i] = (pg[H1(i)] & 1 ? nn : mm);
3624 /* Two operand comparison controlled by a predicate.
3625  * ??? It is very tempting to expand this inline
3626 * with x86 instructions, e.g.
3628 * vcmpeqw zm, zn, %ymm0
3629 * vpmovmskb %ymm0, %eax
3630 * and $0x5555, %eax
3631 * and pg, %eax
3633 * or even aarch64, e.g.
3635 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3636 * cmeq v0.8h, zn, zm
3637 * and v0.8h, v0.8h, mask
3638 * addv h0, v0.8h
3639 * and v0.8b, pg
3641 * However, coming up with an abstraction that allows vector inputs and
3642 * a scalar output, and also handles the byte-ordering of sub-uint64_t
3643 * scalar outputs, is tricky.
3645 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
3646 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3648 intptr_t opr_sz = simd_oprsz(desc); \
3649 uint32_t flags = PREDTEST_INIT; \
3650 intptr_t i = opr_sz; \
3651 do { \
3652 uint64_t out = 0, pg; \
3653 do { \
3654 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3655 TYPE nn = *(TYPE *)(vn + H(i)); \
3656 TYPE mm = *(TYPE *)(vm + H(i)); \
3657 out |= nn OP mm; \
3658 } while (i & 63); \
3659 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3660 out &= pg; \
3661 *(uint64_t *)(vd + (i >> 3)) = out; \
3662 flags = iter_predtest_bwd(out, pg, flags); \
3663 } while (i > 0); \
3664 return flags; \
3667 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3668 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3669 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3670 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3671 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3672 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3673 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3674 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
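/* The MASK constants above reflect the predicate layout: a predicate
 * register holds one bit per byte of the vector, so for 2-, 4- and 8-byte
 * elements only every 2nd, 4th or 8th bit is significant (0x5555...,
 * 0x1111..., 0x0101...).  The "out <<= sizeof(TYPE)" in the loop above
 * accumulates the comparison results in that same one-bit-per-byte layout.
 */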
3676 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
3677 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3678 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3679 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3681 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
3682 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3683 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3684 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3686 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
3687 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3688 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3689 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3691 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
3692 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3693 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3694 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3696 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
3697 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3698 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3699 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3701 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
3702 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3703 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3704 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3706 #undef DO_CMP_PPZZ_B
3707 #undef DO_CMP_PPZZ_H
3708 #undef DO_CMP_PPZZ_S
3709 #undef DO_CMP_PPZZ_D
3710 #undef DO_CMP_PPZZ
3712 /* Similar, but the second source is "wide". */
3713 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
3714 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3716 intptr_t opr_sz = simd_oprsz(desc); \
3717 uint32_t flags = PREDTEST_INIT; \
3718 intptr_t i = opr_sz; \
3719 do { \
3720 uint64_t out = 0, pg; \
3721 do { \
3722 TYPEW mm = *(TYPEW *)(vm + i - 8); \
3723 do { \
3724 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3725 TYPE nn = *(TYPE *)(vn + H(i)); \
3726 out |= nn OP mm; \
3727 } while (i & 7); \
3728 } while (i & 63); \
3729 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3730 out &= pg; \
3731 *(uint64_t *)(vd + (i >> 3)) = out; \
3732 flags = iter_predtest_bwd(out, pg, flags); \
3733 } while (i > 0); \
3734 return flags; \
3737 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3738 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
3739 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3740 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3741 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3742 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
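/* For the wide forms, each 64-bit element of Zm is compared against every
 * TYPE-sized element of Zn that shares the same 64-bit lane, hence the
 * extra inner loop bounded by (i & 7).
 */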
3744 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
3745 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3746 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3748 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
3749 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3750 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3752 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
3753 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
3754 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
3756 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
3757 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
3758 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
3760 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
3761 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3762 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3764 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
3765 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3766 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3768 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
3769 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
3770 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
3772 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
3773 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
3774 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
3776 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
3777 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3778 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3780 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
3781 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3782 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3784 #undef DO_CMP_PPZW_B
3785 #undef DO_CMP_PPZW_H
3786 #undef DO_CMP_PPZW_S
3787 #undef DO_CMP_PPZW
3789 /* Similar, but the second source is immediate. */
3790 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
3791 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
3793 intptr_t opr_sz = simd_oprsz(desc); \
3794 uint32_t flags = PREDTEST_INIT; \
3795 TYPE mm = simd_data(desc); \
3796 intptr_t i = opr_sz; \
3797 do { \
3798 uint64_t out = 0, pg; \
3799 do { \
3800 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3801 TYPE nn = *(TYPE *)(vn + H(i)); \
3802 out |= nn OP mm; \
3803 } while (i & 63); \
3804 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3805 out &= pg; \
3806 *(uint64_t *)(vd + (i >> 3)) = out; \
3807 flags = iter_predtest_bwd(out, pg, flags); \
3808 } while (i > 0); \
3809 return flags; \
3812 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3813 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3814 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3815 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3816 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3817 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3818 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3819 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3821 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
3822 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3823 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3824 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3826 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
3827 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3828 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3829 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3831 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
3832 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3833 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3834 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3836 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
3837 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3838 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3839 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3841 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
3842 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3843 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3844 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3846 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
3847 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3848 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3849 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3851 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
3852 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3853 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3854 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3856 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
3857 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3858 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3859 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3861 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
3862 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3863 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3864 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3866 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
3867 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3868 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3869 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3871 #undef DO_CMP_PPZI_B
3872 #undef DO_CMP_PPZI_H
3873 #undef DO_CMP_PPZI_S
3874 #undef DO_CMP_PPZI_D
3875 #undef DO_CMP_PPZI
3877 /* Similar to the ARM LastActive pseudocode function. */
3878 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3880 intptr_t i;
3882 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3883 uint64_t pg = *(uint64_t *)(vg + i);
3884 if (pg) {
3885 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3888 return 0;
3891 /* Compute a mask into RETB that is true for all G, up to and including
3892 * (if after) or excluding (if !after) the first G & N.
3893 * Return true if BRK found.
3895 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3896 bool brk, bool after)
3898 uint64_t b;
3900 if (brk) {
3901 b = 0;
3902 } else if ((g & n) == 0) {
3903 /* For all G, no N are set; break not found. */
3904 b = g;
3905 } else {
3906 /* Break somewhere in N. Locate it. */
3907 b = g & n; /* guard true, pred true */
3908 b = b & -b; /* first such */
3909 if (after) {
3910 b = b | (b - 1); /* break after same */
3911 } else {
3912 b = b - 1; /* break before same */
3914 brk = true;
3917 *retb = b;
3918 return brk;
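/* For example, with g = 0xff and n = 0x10: b = g & n = 0x10, and b & -b
 * leaves that lowest set bit in place; "after" then gives b | (b - 1) =
 * 0x1f (break element included), while "!after" gives b - 1 = 0x0f
 * (break element excluded).
 */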
3921 /* Compute a zeroing BRK. */
3922 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3923 intptr_t oprsz, bool after)
3925 bool brk = false;
3926 intptr_t i;
3928 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3929 uint64_t this_b, this_g = g[i];
3931 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3932 d[i] = this_b & this_g;
3936 /* Likewise, but also compute flags. */
3937 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3938 intptr_t oprsz, bool after)
3940 uint32_t flags = PREDTEST_INIT;
3941 bool brk = false;
3942 intptr_t i;
3944 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3945 uint64_t this_b, this_d, this_g = g[i];
3947 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3948 d[i] = this_d = this_b & this_g;
3949 flags = iter_predtest_fwd(this_d, this_g, flags);
3951 return flags;
3954 /* Compute a merging BRK. */
3955 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3956 intptr_t oprsz, bool after)
3958 bool brk = false;
3959 intptr_t i;
3961 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3962 uint64_t this_b, this_g = g[i];
3964 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3965 d[i] = (this_b & this_g) | (d[i] & ~this_g);
3969 /* Likewise, but also compute flags. */
3970 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3971 intptr_t oprsz, bool after)
3973 uint32_t flags = PREDTEST_INIT;
3974 bool brk = false;
3975 intptr_t i;
3977 for (i = 0; i < oprsz / 8; ++i) {
3978 uint64_t this_b, this_d = d[i], this_g = g[i];
3980 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3981 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3982 flags = iter_predtest_fwd(this_d, this_g, flags);
3984 return flags;
3987 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3989 /* It is quicker to zero the whole predicate than to loop over OPRSZ.
3990 * The compiler should turn this into 4 64-bit integer stores.
3992 memset(d, 0, sizeof(ARMPredicateReg));
3993 return PREDTEST_INIT;
3996 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3997 uint32_t pred_desc)
3999 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4000 if (last_active_pred(vn, vg, oprsz)) {
4001 compute_brk_z(vd, vm, vg, oprsz, true);
4002 } else {
4003 do_zero(vd, oprsz);
4007 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
4008 uint32_t pred_desc)
4010 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4011 if (last_active_pred(vn, vg, oprsz)) {
4012 return compute_brks_z(vd, vm, vg, oprsz, true);
4013 } else {
4014 return do_zero(vd, oprsz);
4018 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
4019 uint32_t pred_desc)
4021 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4022 if (last_active_pred(vn, vg, oprsz)) {
4023 compute_brk_z(vd, vm, vg, oprsz, false);
4024 } else {
4025 do_zero(vd, oprsz);
4029 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4030 uint32_t pred_desc)
4032 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4033 if (last_active_pred(vn, vg, oprsz)) {
4034 return compute_brks_z(vd, vm, vg, oprsz, false);
4035 } else {
4036 return do_zero(vd, oprsz);
4040 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4042 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4043 compute_brk_z(vd, vn, vg, oprsz, true);
4046 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4048 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4049 return compute_brks_z(vd, vn, vg, oprsz, true);
4052 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4054 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4055 compute_brk_z(vd, vn, vg, oprsz, false);
4058 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4060 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4061 return compute_brks_z(vd, vn, vg, oprsz, false);
4064 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4066 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4067 compute_brk_m(vd, vn, vg, oprsz, true);
4070 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4072 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4073 return compute_brks_m(vd, vn, vg, oprsz, true);
4076 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4078 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4079 compute_brk_m(vd, vn, vg, oprsz, false);
4082 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4084 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4085 return compute_brks_m(vd, vn, vg, oprsz, false);
4088 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4090 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4091 if (!last_active_pred(vn, vg, oprsz)) {
4092 do_zero(vd, oprsz);
4096 /* As if PredTest(Ones(PL), D, esz). */
4097 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4098 uint64_t esz_mask)
4100 uint32_t flags = PREDTEST_INIT;
4101 intptr_t i;
4103 for (i = 0; i < oprsz / 8; i++) {
4104 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4106 if (oprsz & 7) {
4107 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4108 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4110 return flags;
4113 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4115 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4116 if (last_active_pred(vn, vg, oprsz)) {
4117 return predtest_ones(vd, oprsz, -1);
4118 } else {
4119 return do_zero(vd, oprsz);
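/* CNTP: count the elements that are active in both Pn and the governing
 * predicate Pg, i.e. the population count of n & g restricted to the
 * predicate bits that are significant for this element size.
 */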
4123 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4125 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4126 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4127 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4128 intptr_t i;
4130 for (i = 0; i < words; ++i) {
4131 uint64_t t = n[i] & g[i] & mask;
4132 sum += ctpop64(t);
4134 return sum;
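/* WHILE (incrementing): construct a predicate with the first COUNT
 * predicate bits set, masked to the bits significant for the element size,
 * and return the PredTest flags as if tested against an all-true governing
 * predicate.  The decrementing variant below instead sets the last COUNT
 * predicate bits, counting back from the vector length.
 */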
4137 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4139 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4140 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4141 uint64_t esz_mask = pred_esz_masks[esz];
4142 ARMPredicateReg *d = vd;
4143 uint32_t flags;
4144 intptr_t i;
4146 /* Begin with a zero predicate register. */
4147 flags = do_zero(d, oprsz);
4148 if (count == 0) {
4149 return flags;
4152 /* Set all of the requested bits. */
4153 for (i = 0; i < count / 64; ++i) {
4154 d->p[i] = esz_mask;
4156 if (count & 63) {
4157 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4160 return predtest_ones(d, oprsz, esz_mask);
4163 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4165 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4166 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4167 uint64_t esz_mask = pred_esz_masks[esz];
4168 ARMPredicateReg *d = vd;
4169 intptr_t i, invcount, oprbits;
4170 uint64_t bits;
4172 if (count == 0) {
4173 return do_zero(d, oprsz);
4176 oprbits = oprsz * 8;
4177 tcg_debug_assert(count <= oprbits);
4179 bits = esz_mask;
4180 if (oprbits & 63) {
4181 bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4184 invcount = oprbits - count;
4185 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4186 d->p[i] = bits;
4187 bits = esz_mask;
4190 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4192 while (--i >= 0) {
4193 d->p[i] = 0;
4196 return predtest_ones(d, oprsz, esz_mask);
4199 /* Recursive reduction using a binary function FUNC;
4200  * cf. the ARM ARM pseudocode function ReducePredicated.
4202 * While it would be possible to write this without the DATA temporary,
4203 * it is much simpler to process the predicate register this way.
4204 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4205 * little to gain with a more complex non-recursive form.
4207 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
4208 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4210 if (n == 1) { \
4211 return *data; \
4212 } else { \
4213 uintptr_t half = n / 2; \
4214 TYPE lo = NAME##_reduce(data, status, half); \
4215 TYPE hi = NAME##_reduce(data + half, status, half); \
4216 return TYPE##_##FUNC(lo, hi, status); \
4219 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
4221 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
4222 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
4223 for (i = 0; i < oprsz; ) { \
4224 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4225 do { \
4226 TYPE nn = *(TYPE *)(vn + H(i)); \
4227 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
4228 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
4229 } while (i & 15); \
4231 for (; i < maxsz; i += sizeof(TYPE)) { \
4232 *(TYPE *)((void *)data + i) = IDENT; \
4234 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
4237 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
4238 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
4239 DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)
4241 /* Identity is floatN_default_nan, without the function call. */
4242 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
4243 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
4244 DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)
4246 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
4247 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
4248 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)
4250 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
4251 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
4252 DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)
4254 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
4255 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
4256 DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))
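/* Note that NAME##_reduce halves the element count at each step, so it
 * only visits every element when that count is a power of two; the tail
 * of the temporary is padded with IDENT up to MAXSZ, which the translator
 * presumably rounds up accordingly.
 */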
4258 #undef DO_REDUCE
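/* FADDA is a strictly ordered accumulation: NN is the scalar start value
 * and each active element is added in element order, so, unlike the tree
 * reductions above, the result depends on that ordering.
 */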
4260 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4261 void *status, uint32_t desc)
4263 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4264 float16 result = nn;
4266 do {
4267 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4268 do {
4269 if (pg & 1) {
4270 float16 mm = *(float16 *)(vm + H1_2(i));
4271 result = float16_add(result, mm, status);
4273 i += sizeof(float16), pg >>= sizeof(float16);
4274 } while (i & 15);
4275 } while (i < opr_sz);
4277 return result;
4280 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4281 void *status, uint32_t desc)
4283 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4284 float32 result = nn;
4286 do {
4287 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4288 do {
4289 if (pg & 1) {
4290 float32 mm = *(float32 *)(vm + H1_2(i));
4291 result = float32_add(result, mm, status);
4293 i += sizeof(float32), pg >>= sizeof(float32);
4294 } while (i & 15);
4295 } while (i < opr_sz);
4297 return result;
4300 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4301 void *status, uint32_t desc)
4303 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4304 uint64_t *m = vm;
4305 uint8_t *pg = vg;
4307 for (i = 0; i < opr_sz; i++) {
4308 if (pg[H1(i)] & 1) {
4309 nn = float64_add(nn, m[i], status);
4313 return nn;
4316 /* Fully general three-operand expander, controlled by a predicate,
4317  * with the extra float_status parameter.
4319 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
4320 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4321 void *status, uint32_t desc) \
4323 intptr_t i = simd_oprsz(desc); \
4324 uint64_t *g = vg; \
4325 do { \
4326 uint64_t pg = g[(i - 1) >> 6]; \
4327 do { \
4328 i -= sizeof(TYPE); \
4329 if (likely((pg >> (i & 63)) & 1)) { \
4330 TYPE nn = *(TYPE *)(vn + H(i)); \
4331 TYPE mm = *(TYPE *)(vm + H(i)); \
4332 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4334 } while (i & 63); \
4335 } while (i != 0); \
4338 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4339 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4340 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
4342 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4343 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4344 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
4346 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4347 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4348 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
4350 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4351 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4352 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
4354 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4355 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4356 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
4358 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4359 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4360 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
4362 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4363 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4364 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
4366 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4367 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4368 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4370 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4372 return float16_abs(float16_sub(a, b, s));
4375 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4377 return float32_abs(float32_sub(a, b, s));
4380 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4382 return float64_abs(float64_sub(a, b, s));
4385 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4386 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4387 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4389 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4391 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4392 return float64_scalbn(a, b_int, s);
4395 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4396 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4397 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
4399 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4400 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4401 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
4403 #undef DO_ZPZZ_FP
4405 /* Three-operand expander, with one scalar operand, controlled by
4406 * a predicate, with the extra float_status parameter.
4408 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4409 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
4410 void *status, uint32_t desc) \
4412 intptr_t i = simd_oprsz(desc); \
4413 uint64_t *g = vg; \
4414 TYPE mm = scalar; \
4415 do { \
4416 uint64_t pg = g[(i - 1) >> 6]; \
4417 do { \
4418 i -= sizeof(TYPE); \
4419 if (likely((pg >> (i & 63)) & 1)) { \
4420 TYPE nn = *(TYPE *)(vn + H(i)); \
4421 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4423 } while (i & 63); \
4424 } while (i != 0); \
4427 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4428 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4429 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
4431 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4432 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4433 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
4435 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4436 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4437 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4439 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4441 return float16_sub(b, a, s);
4444 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4446 return float32_sub(b, a, s);
4449 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4451 return float64_sub(b, a, s);
4454 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4455 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4456 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
4458 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4459 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4460 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
4462 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4463 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4464 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
4466 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4467 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4468 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
4470 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4471 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4472 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4474 /* Fully general two-operand expander, controlled by a predicate,
4475  * with the extra float_status parameter.
4477 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
4478 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
4480 intptr_t i = simd_oprsz(desc); \
4481 uint64_t *g = vg; \
4482 do { \
4483 uint64_t pg = g[(i - 1) >> 6]; \
4484 do { \
4485 i -= sizeof(TYPE); \
4486 if (likely((pg >> (i & 63)) & 1)) { \
4487 TYPE nn = *(TYPE *)(vn + H(i)); \
4488 *(TYPE *)(vd + H(i)) = OP(nn, status); \
4490 } while (i & 63); \
4491 } while (i != 0); \
4494 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
4495 * FZ16. When converting from fp16, this affects flushing input denormals;
4496 * when converting to fp16, this affects flushing output denormals.
4498 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4500 bool save = get_flush_inputs_to_zero(fpst);
4501 float32 ret;
4503 set_flush_inputs_to_zero(false, fpst);
4504 ret = float16_to_float32(f, true, fpst);
4505 set_flush_inputs_to_zero(save, fpst);
4506 return ret;
4509 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4511 bool save = get_flush_inputs_to_zero(fpst);
4512 float64 ret;
4514 set_flush_inputs_to_zero(false, fpst);
4515 ret = float16_to_float64(f, true, fpst);
4516 set_flush_inputs_to_zero(save, fpst);
4517 return ret;
4520 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4522 bool save = get_flush_to_zero(fpst);
4523 float16 ret;
4525 set_flush_to_zero(false, fpst);
4526 ret = float32_to_float16(f, true, fpst);
4527 set_flush_to_zero(save, fpst);
4528 return ret;
4531 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4533 bool save = get_flush_to_zero(fpst);
4534 float16 ret;
4536 set_flush_to_zero(false, fpst);
4537 ret = float64_to_float16(f, true, fpst);
4538 set_flush_to_zero(save, fpst);
4539 return ret;
4542 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4544 if (float16_is_any_nan(f)) {
4545 float_raise(float_flag_invalid, s);
4546 return 0;
4548 return float16_to_int16_round_to_zero(f, s);
4551 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4553 if (float16_is_any_nan(f)) {
4554 float_raise(float_flag_invalid, s);
4555 return 0;
4557 return float16_to_int64_round_to_zero(f, s);
4560 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4562 if (float32_is_any_nan(f)) {
4563 float_raise(float_flag_invalid, s);
4564 return 0;
4566 return float32_to_int64_round_to_zero(f, s);
4569 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4571 if (float64_is_any_nan(f)) {
4572 float_raise(float_flag_invalid, s);
4573 return 0;
4575 return float64_to_int64_round_to_zero(f, s);
4578 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4580 if (float16_is_any_nan(f)) {
4581 float_raise(float_flag_invalid, s);
4582 return 0;
4584 return float16_to_uint16_round_to_zero(f, s);
4587 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4589 if (float16_is_any_nan(f)) {
4590 float_raise(float_flag_invalid, s);
4591 return 0;
4593 return float16_to_uint64_round_to_zero(f, s);
4596 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4598 if (float32_is_any_nan(f)) {
4599 float_raise(float_flag_invalid, s);
4600 return 0;
4602 return float32_to_uint64_round_to_zero(f, s);
4605 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4607 if (float64_is_any_nan(f)) {
4608 float_raise(float_flag_invalid, s);
4609 return 0;
4611 return float64_to_uint64_round_to_zero(f, s);
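/* The wrappers above filter NaN inputs so that the conversion result is
 * zero with only the Invalid flag raised, as the architecture requires,
 * rather than whatever saturated value the underlying softfloat routine
 * would return.
 */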
4614 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4615 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4616 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16)
4617 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4618 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4619 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4620 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4622 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4623 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4624 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4625 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4626 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4627 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4628 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4630 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4631 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4632 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4633 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4634 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4635 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4636 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4638 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4639 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4640 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4642 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4643 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4644 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4646 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4647 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4648 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4650 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4651 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4652 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
4654 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4655 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4656 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4657 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
4658 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
4659 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
4660 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
4662 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4663 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4664 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4665 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
4666 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
4667 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
4668 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
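/* FLOGB: return the signed base-2 exponent of the operand as an integer
 * of the same width.  Denormals are handled explicitly unless input
 * flush-to-zero is enabled (in which case they are treated as zero);
 * infinity returns INT_MAX, and NaN or zero returns INT_MIN with the
 * Invalid flag raised.
 */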
4670 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
4672 /* Extract frac to the top of the uint32_t. */
4673 uint32_t frac = (uint32_t)a << (16 + 6);
4674 int16_t exp = extract32(a, 10, 5);
4676 if (unlikely(exp == 0)) {
4677 if (frac != 0) {
4678 if (!get_flush_inputs_to_zero(s)) {
4679 /* denormal: bias - fractional_zeros */
4680 return -15 - clz32(frac);
4682 /* flush to zero */
4683 float_raise(float_flag_input_denormal, s);
4685 } else if (unlikely(exp == 0x1f)) {
4686 if (frac == 0) {
4687 return INT16_MAX; /* infinity */
4689 } else {
4690 /* normal: exp - bias */
4691 return exp - 15;
4693 /* nan or zero */
4694 float_raise(float_flag_invalid, s);
4695 return INT16_MIN;
4698 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
4700 /* Extract frac to the top of the uint32_t. */
4701 uint32_t frac = a << 9;
4702 int32_t exp = extract32(a, 23, 8);
4704 if (unlikely(exp == 0)) {
4705 if (frac != 0) {
4706 if (!get_flush_inputs_to_zero(s)) {
4707 /* denormal: bias - fractional_zeros */
4708 return -127 - clz32(frac);
4710 /* flush to zero */
4711 float_raise(float_flag_input_denormal, s);
4713 } else if (unlikely(exp == 0xff)) {
4714 if (frac == 0) {
4715 return INT32_MAX; /* infinity */
4717 } else {
4718 /* normal: exp - bias */
4719 return exp - 127;
4721 /* nan or zero */
4722 float_raise(float_flag_invalid, s);
4723 return INT32_MIN;
4726 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
4728 /* Extract frac to the top of the uint64_t. */
4729 uint64_t frac = a << 12;
4730 int64_t exp = extract64(a, 52, 11);
4732 if (unlikely(exp == 0)) {
4733 if (frac != 0) {
4734 if (!get_flush_inputs_to_zero(s)) {
4735 /* denormal: bias - fractional_zeros */
4736 return -1023 - clz64(frac);
4738 /* flush to zero */
4739 float_raise(float_flag_input_denormal, s);
4741 } else if (unlikely(exp == 0x7ff)) {
4742 if (frac == 0) {
4743 return INT64_MAX; /* infinity */
4745 } else {
4746 /* normal: exp - bias */
4747 return exp - 1023;
4749 /* nan or zero */
4750 float_raise(float_flag_invalid, s);
4751 return INT64_MIN;
4754 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
4755 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
4756 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
4758 #undef DO_ZPZ_FP
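/* The predicated fused multiply-add variants differ only in which operands
 * have their sign bit flipped: NEG1 is xored into the multiplicand from Zn
 * and NEG3 into the addend from Za, giving FMLA, FMLS, FNMLA and FNMLS
 * from a single loop body.
 */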
4760 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4761 float_status *status, uint32_t desc,
4762 uint16_t neg1, uint16_t neg3)
4764 intptr_t i = simd_oprsz(desc);
4765 uint64_t *g = vg;
4767 do {
4768 uint64_t pg = g[(i - 1) >> 6];
4769 do {
4770 i -= 2;
4771 if (likely((pg >> (i & 63)) & 1)) {
4772 float16 e1, e2, e3, r;
4774 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4775 e2 = *(uint16_t *)(vm + H1_2(i));
4776 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4777 r = float16_muladd(e1, e2, e3, 0, status);
4778 *(uint16_t *)(vd + H1_2(i)) = r;
4780 } while (i & 63);
4781 } while (i != 0);
4784 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4785 void *vg, void *status, uint32_t desc)
4787 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
4790 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4791 void *vg, void *status, uint32_t desc)
4793 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
4796 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4797 void *vg, void *status, uint32_t desc)
4799 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
4802 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4803 void *vg, void *status, uint32_t desc)
4805 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
4808 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4809 float_status *status, uint32_t desc,
4810 uint32_t neg1, uint32_t neg3)
4812 intptr_t i = simd_oprsz(desc);
4813 uint64_t *g = vg;
4815 do {
4816 uint64_t pg = g[(i - 1) >> 6];
4817 do {
4818 i -= 4;
4819 if (likely((pg >> (i & 63)) & 1)) {
4820 float32 e1, e2, e3, r;
4822 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4823 e2 = *(uint32_t *)(vm + H1_4(i));
4824 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4825 r = float32_muladd(e1, e2, e3, 0, status);
4826 *(uint32_t *)(vd + H1_4(i)) = r;
4828 } while (i & 63);
4829 } while (i != 0);
4832 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4833 void *vg, void *status, uint32_t desc)
4835 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
4838 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4839 void *vg, void *status, uint32_t desc)
4841 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
4844 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4845 void *vg, void *status, uint32_t desc)
4847 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
4850 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4851 void *vg, void *status, uint32_t desc)
4853 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
4856 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4857 float_status *status, uint32_t desc,
4858 uint64_t neg1, uint64_t neg3)
4860 intptr_t i = simd_oprsz(desc);
4861 uint64_t *g = vg;
4863 do {
4864 uint64_t pg = g[(i - 1) >> 6];
4865 do {
4866 i -= 8;
4867 if (likely((pg >> (i & 63)) & 1)) {
4868 float64 e1, e2, e3, r;
4870 e1 = *(uint64_t *)(vn + i) ^ neg1;
4871 e2 = *(uint64_t *)(vm + i);
4872 e3 = *(uint64_t *)(va + i) ^ neg3;
4873 r = float64_muladd(e1, e2, e3, 0, status);
4874 *(uint64_t *)(vd + i) = r;
4876 } while (i & 63);
4877 } while (i != 0);
4880 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4881 void *vg, void *status, uint32_t desc)
4883 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
4886 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4887 void *vg, void *status, uint32_t desc)
4889 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
4892 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4893 void *vg, void *status, uint32_t desc)
4895 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
4898 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4899 void *vg, void *status, uint32_t desc)
4901 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
4904 /* Two operand floating-point comparison controlled by a predicate.
4905 * Unlike the integer version, we are not allowed to optimistically
4906 * compare operands, since the comparison may have side effects wrt
4907 * the FPSR.
4909 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
4910 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4911 void *status, uint32_t desc) \
4913 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4914 uint64_t *d = vd, *g = vg; \
4915 do { \
4916 uint64_t out = 0, pg = g[j]; \
4917 do { \
4918 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4919 if (likely((pg >> (i & 63)) & 1)) { \
4920 TYPE nn = *(TYPE *)(vn + H(i)); \
4921 TYPE mm = *(TYPE *)(vm + H(i)); \
4922 out |= OP(TYPE, nn, mm, status); \
4924 } while (i & 63); \
4925 d[j--] = out; \
4926 } while (i > 0); \
4929 #define DO_FPCMP_PPZZ_H(NAME, OP) \
4930 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
4931 #define DO_FPCMP_PPZZ_S(NAME, OP) \
4932 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
4933 #define DO_FPCMP_PPZZ_D(NAME, OP) \
4934 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
4936 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
4937 DO_FPCMP_PPZZ_H(NAME, OP) \
4938 DO_FPCMP_PPZZ_S(NAME, OP) \
4939 DO_FPCMP_PPZZ_D(NAME, OP)
4941 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
4942 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
4943 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
4944 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
4945 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
4946 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
4947 #define DO_FCMUO(TYPE, X, Y, ST) \
4948 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
4949 #define DO_FACGE(TYPE, X, Y, ST) \
4950 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
4951 #define DO_FACGT(TYPE, X, Y, ST) \
4952 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
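/* GE and GT (and the absolute-value forms FACGE/FACGT) are implemented
 * with the operands swapped and the sense of the test inverted, i.e.
 * compare(Y, X) <= 0 rather than compare(X, Y) >= 0, so that an unordered
 * result compares false.  EQ, NE and UO use the quiet compare, which does
 * not raise Invalid for quiet NaN operands.
 */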
4954 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
4955 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
4956 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
4957 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
4958 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
4959 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
4960 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
4962 #undef DO_FPCMP_PPZZ_ALL
4963 #undef DO_FPCMP_PPZZ_D
4964 #undef DO_FPCMP_PPZZ_S
4965 #undef DO_FPCMP_PPZZ_H
4966 #undef DO_FPCMP_PPZZ
4968 /* One operand floating-point comparison against zero, controlled
4969 * by a predicate.
4971 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
4972 void HELPER(NAME)(void *vd, void *vn, void *vg, \
4973 void *status, uint32_t desc) \
4975 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4976 uint64_t *d = vd, *g = vg; \
4977 do { \
4978 uint64_t out = 0, pg = g[j]; \
4979 do { \
4980 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4981 if ((pg >> (i & 63)) & 1) { \
4982 TYPE nn = *(TYPE *)(vn + H(i)); \
4983 out |= OP(TYPE, nn, 0, status); \
4985 } while (i & 63); \
4986 d[j--] = out; \
4987 } while (i > 0); \
4990 #define DO_FPCMP_PPZ0_H(NAME, OP) \
4991 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
4992 #define DO_FPCMP_PPZ0_S(NAME, OP) \
4993 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
4994 #define DO_FPCMP_PPZ0_D(NAME, OP) \
4995 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
4997 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
4998 DO_FPCMP_PPZ0_H(NAME, OP) \
4999 DO_FPCMP_PPZ0_S(NAME, OP) \
5000 DO_FPCMP_PPZ0_D(NAME, OP)
5002 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
5003 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
5004 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
5005 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
5006 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
5007 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
5009 /* FP Trig Multiply-Add. */
5011 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5013 static const float16 coeff[16] = {
5014 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5015 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5017 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
5018 intptr_t x = simd_data(desc);
5019 float16 *d = vd, *n = vn, *m = vm;
5020 for (i = 0; i < opr_sz; i++) {
5021 float16 mm = m[i];
5022 intptr_t xx = x;
5023 if (float16_is_neg(mm)) {
5024 mm = float16_abs(mm);
5025 xx += 8;
5027 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
5031 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5033 static const float32 coeff[16] = {
5034 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5035 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5036 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5037 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5039 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5040 intptr_t x = simd_data(desc);
5041 float32 *d = vd, *n = vn, *m = vm;
5042 for (i = 0; i < opr_sz; i++) {
5043 float32 mm = m[i];
5044 intptr_t xx = x;
5045 if (float32_is_neg(mm)) {
5046 mm = float32_abs(mm);
5047 xx += 8;
5049 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
5053 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5055 static const float64 coeff[16] = {
5056 0x3ff0000000000000ull, 0xbfc5555555555543ull,
5057 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5058 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5059 0x3de5d8408868552full, 0x0000000000000000ull,
5060 0x3ff0000000000000ull, 0xbfe0000000000000ull,
5061 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5062 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5063 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5065 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5066 intptr_t x = simd_data(desc);
5067 float64 *d = vd, *n = vn, *m = vm;
5068 for (i = 0; i < opr_sz; i++) {
5069 float64 mm = m[i];
5070 intptr_t xx = x;
5071 if (float64_is_neg(mm)) {
5072 mm = float64_abs(mm);
5073 xx += 8;
5075 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
5080 * FP Complex Add
5083 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5084 void *vs, uint32_t desc)
5086 intptr_t j, i = simd_oprsz(desc);
5087 uint64_t *g = vg;
5088 float16 neg_imag = float16_set_sign(0, simd_data(desc));
5089 float16 neg_real = float16_chs(neg_imag);
5091 do {
5092 uint64_t pg = g[(i - 1) >> 6];
5093 do {
5094 float16 e0, e1, e2, e3;
5096 /* I holds the real index; J holds the imag index. */
5097 j = i - sizeof(float16);
5098 i -= 2 * sizeof(float16);
5100 e0 = *(float16 *)(vn + H1_2(i));
5101 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
5102 e2 = *(float16 *)(vn + H1_2(j));
5103 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
5105 if (likely((pg >> (i & 63)) & 1)) {
5106 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
5108 if (likely((pg >> (j & 63)) & 1)) {
5109 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
5111 } while (i & 63);
5112 } while (i != 0);
5115 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5116 void *vs, uint32_t desc)
5118 intptr_t j, i = simd_oprsz(desc);
5119 uint64_t *g = vg;
5120 float32 neg_imag = float32_set_sign(0, simd_data(desc));
5121 float32 neg_real = float32_chs(neg_imag);
5123 do {
5124 uint64_t pg = g[(i - 1) >> 6];
5125 do {
5126 float32 e0, e1, e2, e3;
5128 /* I holds the real index; J holds the imag index. */
5129 j = i - sizeof(float32);
5130 i -= 2 * sizeof(float32);
5132 e0 = *(float32 *)(vn + H1_2(i));
5133 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
5134 e2 = *(float32 *)(vn + H1_2(j));
5135 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
5137 if (likely((pg >> (i & 63)) & 1)) {
5138 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
5140 if (likely((pg >> (j & 63)) & 1)) {
5141 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
5143 } while (i & 63);
5144 } while (i != 0);
5147 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5148 void *vs, uint32_t desc)
5150 intptr_t j, i = simd_oprsz(desc);
5151 uint64_t *g = vg;
5152 float64 neg_imag = float64_set_sign(0, simd_data(desc));
5153 float64 neg_real = float64_chs(neg_imag);
5155 do {
5156 uint64_t pg = g[(i - 1) >> 6];
5157 do {
5158 float64 e0, e1, e2, e3;
5160 /* I holds the real index; J holds the imag index. */
5161 j = i - sizeof(float64);
5162 i -= 2 * sizeof(float64);
5164 e0 = *(float64 *)(vn + H1_2(i));
5165 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
5166 e2 = *(float64 *)(vn + H1_2(j));
5167 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
5169 if (likely((pg >> (i & 63)) & 1)) {
5170 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
5172 if (likely((pg >> (j & 63)) & 1)) {
5173 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
5175 } while (i & 63);
5176 } while (i != 0);
5180 * FP Complex Multiply
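/* ROT (0-3) encodes the rotation in 90-degree steps: bit 0 ("flip")
 * selects whether the real or imaginary elements feed each product, and
 * neg_real/neg_imag supply the sign inversions required for that rotation,
 * applied by xoring the sign bit into the Zm operand.
 */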
5183 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5184 void *vg, void *status, uint32_t desc)
5186 intptr_t j, i = simd_oprsz(desc);
5187 unsigned rot = simd_data(desc);
5188 bool flip = rot & 1;
5189 float16 neg_imag, neg_real;
5190 uint64_t *g = vg;
5192 neg_imag = float16_set_sign(0, (rot & 2) != 0);
5193 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
5195 do {
5196 uint64_t pg = g[(i - 1) >> 6];
5197 do {
5198 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5200 /* I holds the real index; J holds the imag index. */
5201 j = i - sizeof(float16);
5202 i -= 2 * sizeof(float16);
5204 nr = *(float16 *)(vn + H1_2(i));
5205 ni = *(float16 *)(vn + H1_2(j));
5206 mr = *(float16 *)(vm + H1_2(i));
5207 mi = *(float16 *)(vm + H1_2(j));
5209 e2 = (flip ? ni : nr);
5210 e1 = (flip ? mi : mr) ^ neg_real;
5211 e4 = e2;
5212 e3 = (flip ? mr : mi) ^ neg_imag;
5214 if (likely((pg >> (i & 63)) & 1)) {
5215 d = *(float16 *)(va + H1_2(i));
5216 d = float16_muladd(e2, e1, d, 0, status);
5217 *(float16 *)(vd + H1_2(i)) = d;
5219 if (likely((pg >> (j & 63)) & 1)) {
5220 d = *(float16 *)(va + H1_2(j));
5221 d = float16_muladd(e4, e3, d, 0, status);
5222 *(float16 *)(vd + H1_2(j)) = d;
5224 } while (i & 63);
5225 } while (i != 0);
5228 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5229 void *vg, void *status, uint32_t desc)
5231 intptr_t j, i = simd_oprsz(desc);
5232 unsigned rot = simd_data(desc);
5233 bool flip = rot & 1;
5234 float32 neg_imag, neg_real;
5235 uint64_t *g = vg;
5237 neg_imag = float32_set_sign(0, (rot & 2) != 0);
5238 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
5240 do {
5241 uint64_t pg = g[(i - 1) >> 6];
5242 do {
5243 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5245 /* I holds the real index; J holds the imag index. */
5246 j = i - sizeof(float32);
5247 i -= 2 * sizeof(float32);
5249 nr = *(float32 *)(vn + H1_2(i));
5250 ni = *(float32 *)(vn + H1_2(j));
5251 mr = *(float32 *)(vm + H1_2(i));
5252 mi = *(float32 *)(vm + H1_2(j));
5254 e2 = (flip ? ni : nr);
5255 e1 = (flip ? mi : mr) ^ neg_real;
5256 e4 = e2;
5257 e3 = (flip ? mr : mi) ^ neg_imag;
5259 if (likely((pg >> (i & 63)) & 1)) {
5260 d = *(float32 *)(va + H1_2(i));
5261 d = float32_muladd(e2, e1, d, 0, status);
5262 *(float32 *)(vd + H1_2(i)) = d;
5264 if (likely((pg >> (j & 63)) & 1)) {
5265 d = *(float32 *)(va + H1_2(j));
5266 d = float32_muladd(e4, e3, d, 0, status);
5267 *(float32 *)(vd + H1_2(j)) = d;
5269 } while (i & 63);
5270 } while (i != 0);
5273 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5274 void *vg, void *status, uint32_t desc)
5276 intptr_t j, i = simd_oprsz(desc);
5277 unsigned rot = simd_data(desc);
5278 bool flip = rot & 1;
5279 float64 neg_imag, neg_real;
5280 uint64_t *g = vg;
5282 neg_imag = float64_set_sign(0, (rot & 2) != 0);
5283 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
5285 do {
5286 uint64_t pg = g[(i - 1) >> 6];
5287 do {
5288 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5290 /* I holds the real index; J holds the imag index. */
5291 j = i - sizeof(float64);
5292 i -= 2 * sizeof(float64);
5294 nr = *(float64 *)(vn + H1_2(i));
5295 ni = *(float64 *)(vn + H1_2(j));
5296 mr = *(float64 *)(vm + H1_2(i));
5297 mi = *(float64 *)(vm + H1_2(j));
5299 e2 = (flip ? ni : nr);
5300 e1 = (flip ? mi : mr) ^ neg_real;
5301 e4 = e2;
5302 e3 = (flip ? mr : mi) ^ neg_imag;
5304 if (likely((pg >> (i & 63)) & 1)) {
5305 d = *(float64 *)(va + H1_2(i));
5306 d = float64_muladd(e2, e1, d, 0, status);
5307 *(float64 *)(vd + H1_2(i)) = d;
5309 if (likely((pg >> (j & 63)) & 1)) {
5310 d = *(float64 *)(va + H1_2(j));
5311 d = float64_muladd(e4, e3, d, 0, status);
5312 *(float64 *)(vd + H1_2(j)) = d;
5314 } while (i & 63);
5315 } while (i != 0);
5319 * Load contiguous data, protected by a governing predicate.
5323 * Load one element into @vd + @reg_off from @host.
5324 * The controlling predicate is known to be true.
5326 typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);
5329 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
5330 * The controlling predicate is known to be true.
5332 typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
5333 target_ulong vaddr, uintptr_t retaddr);
5336 * Generate the above primitives.
5339 #define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
5340 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
5342 TYPEM val = HOST(host); \
5343 *(TYPEE *)(vd + H(reg_off)) = val; \
5346 #define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
5347 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
5348 { HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }
5350 #define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
5351 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
5352 target_ulong addr, uintptr_t ra) \
5354 *(TYPEE *)(vd + H(reg_off)) = \
5355 (TYPEM)TLB(env, useronly_clean_ptr(addr), ra); \
5358 #define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
5359 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
5360 target_ulong addr, uintptr_t ra) \
5362 TLB(env, useronly_clean_ptr(addr), \
5363 (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \
5366 #define DO_LD_PRIM_1(NAME, H, TE, TM) \
5367 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
5368 DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
5370 DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
5371 DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
5372 DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
5373 DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
5374 DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
5375 DO_LD_PRIM_1(ld1bdu, H1_8, uint64_t, uint8_t)
5376 DO_LD_PRIM_1(ld1bds, H1_8, uint64_t, int8_t)
5378 #define DO_ST_PRIM_1(NAME, H, TE, TM) \
5379 DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \
5380 DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
5382 DO_ST_PRIM_1(bb, H1, uint8_t, uint8_t)
5383 DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
5384 DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
5385 DO_ST_PRIM_1(bd, H1_8, uint64_t, uint8_t)
5387 #define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
5388 DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \
5389 DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \
5390 DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
5391 DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
5393 #define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
5394 DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p) \
5395 DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p) \
5396 DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
5397 DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
5399 DO_LD_PRIM_2(hh, H1_2, uint16_t, uint16_t, lduw)
5400 DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
5401 DO_LD_PRIM_2(hss, H1_4, uint32_t, int16_t, lduw)
5402 DO_LD_PRIM_2(hdu, H1_8, uint64_t, uint16_t, lduw)
5403 DO_LD_PRIM_2(hds, H1_8, uint64_t, int16_t, lduw)
5405 DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
5406 DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
5407 DO_ST_PRIM_2(hd, H1_8, uint64_t, uint16_t, stw)
5409 DO_LD_PRIM_2(ss, H1_4, uint32_t, uint32_t, ldl)
5410 DO_LD_PRIM_2(sdu, H1_8, uint64_t, uint32_t, ldl)
5411 DO_LD_PRIM_2(sds, H1_8, uint64_t, int32_t, ldl)
5413 DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
5414 DO_ST_PRIM_2(sd, H1_8, uint64_t, uint32_t, stl)
5416 DO_LD_PRIM_2(dd, H1_8, uint64_t, uint64_t, ldq)
5417 DO_ST_PRIM_2(dd, H1_8, uint64_t, uint64_t, stq)
5419 #undef DO_LD_TLB
5420 #undef DO_ST_TLB
5421 #undef DO_LD_HOST
5422 #undef DO_LD_PRIM_1
5423 #undef DO_ST_PRIM_1
5424 #undef DO_LD_PRIM_2
5425 #undef DO_ST_PRIM_2
5428 * Skip through a sequence of inactive elements in the guarding predicate @vg,
5429  * beginning at @reg_off, bounded by @reg_max.  Return the offset of the first active
5430 * element >= @reg_off, or @reg_max if there were no active elements at all.
5432 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5433 intptr_t reg_max, int esz)
5435 uint64_t pg_mask = pred_esz_masks[esz];
5436 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5438 /* In normal usage, the first element is active. */
5439 if (likely(pg & 1)) {
5440 return reg_off;
5443 if (pg == 0) {
5444 reg_off &= -64;
5445 do {
5446 reg_off += 64;
5447 if (unlikely(reg_off >= reg_max)) {
5448 /* The entire predicate was false. */
5449 return reg_max;
5451 pg = vg[reg_off >> 6] & pg_mask;
5452 } while (pg == 0);
5454 reg_off += ctz64(pg);
5456 /* We should never see an out of range predicate bit set. */
5457 tcg_debug_assert(reg_off < reg_max);
5458 return reg_off;
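/*
 * For illustration only: one way a caller could visit every active element
 * with find_next_active().  The function and the visit() callback are
 * hypothetical and not used elsewhere in this file; the real callers below
 * inline equivalent loops over the predicate words.
 */
#if 0
static void walk_active_elements(uint64_t *vg, intptr_t reg_max, int esz,
                                 void (*visit)(intptr_t reg_off))
{
    intptr_t reg_off = find_next_active(vg, 0, reg_max, esz);

    while (reg_off < reg_max) {
        visit(reg_off);                 /* reg_off is an active element */
        reg_off += 1 << esz;
        if (reg_off >= reg_max) {
            break;
        }
        reg_off = find_next_active(vg, reg_off, reg_max, esz);
    }
}
#endif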
5462 * Resolve the guest virtual address to info->host and info->flags.
5463 * If @nofault, return false if the page is invalid, otherwise
5464 * exit via page fault exception.
5467 typedef struct {
5468 void *host;
5469 int flags;
5470 MemTxAttrs attrs;
5471 } SVEHostPage;
5473 static bool sve_probe_page(SVEHostPage *info, bool nofault,
5474 CPUARMState *env, target_ulong addr,
5475 int mem_off, MMUAccessType access_type,
5476 int mmu_idx, uintptr_t retaddr)
5478 int flags;
5480 addr += mem_off;
5483 * User-only currently always issues with TBI. See the comment
5484 * above useronly_clean_ptr. Usually we clean this top byte away
5485 * during translation, but we can't do that for e.g. vector + imm
5486 * addressing modes.
5488 * We currently always enable TBI for user-only, and do not provide
5489 * a way to turn it off. So clean the pointer unconditionally here,
5490 * rather than look it up here, or pass it down from above.
5492 addr = useronly_clean_ptr(addr);
5494 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
5495 &info->host, retaddr);
5496 info->flags = flags;
5498 if (flags & TLB_INVALID_MASK) {
5499 g_assert(nofault);
5500 return false;
5503 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5504 info->host -= mem_off;
5506 #ifdef CONFIG_USER_ONLY
5507 memset(&info->attrs, 0, sizeof(info->attrs));
5508 #else
5510 * Find the iotlbentry for addr and return the transaction attributes.
5511 * This *must* be present in the TLB because we just found the mapping.
5514 uintptr_t index = tlb_index(env, mmu_idx, addr);
5516 # ifdef CONFIG_DEBUG_TCG
5517 CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
5518 target_ulong comparator = (access_type == MMU_DATA_LOAD
5519 ? entry->addr_read
5520 : tlb_addr_write(entry));
5521 g_assert(tlb_hit(comparator, addr));
5522 # endif
5524 CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
5525 info->attrs = iotlbentry->attrs;
5527 #endif
5529 return true;
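/*
 * For illustration only: the typical pattern for using sve_probe_page() on
 * a single element, as the gather/scatter helpers later in this file do.
 * The function name and the fixed 32-bit access are hypothetical; faults
 * are taken normally because nofault is false.
 */
#if 0
static uint32_t probe_then_load_u32(CPUARMState *env, target_ulong addr,
                                    int mmu_idx, uintptr_t retaddr)
{
    SVEHostPage info;

    sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
                   mmu_idx, retaddr);

    if (likely(!(info.flags & (TLB_MMIO | TLB_WATCHPOINT)))) {
        /* Plain RAM: read directly through the host pointer. */
        return ldl_le_p(info.host);
    }
    /* MMIO or watchpoint: fall back to the full load path. */
    return cpu_ldl_le_data_ra(env, useronly_clean_ptr(addr), retaddr);
}
#endif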
5534 * Analyse contiguous data, protected by a governing predicate.
5537 typedef enum {
5538 FAULT_NO,
5539 FAULT_FIRST,
5540 FAULT_ALL,
5541 } SVEContFault;
5543 typedef struct {
5545 * First and last element wholly contained within the two pages.
5546 * mem_off_first[0] and reg_off_first[0] are always set >= 0.
5547 * reg_off_last[0] may be < 0 if the first element crosses pages.
5548 * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
5549 * are set >= 0 only if there are complete elements on a second page.
5551 * The reg_off_* offsets are relative to the internal vector register.
5552 * The mem_off_first offset is relative to the memory address; the
5553 * two offsets are different when a load operation extends, a store
5554 * operation truncates, or for multi-register operations.
5556 int16_t mem_off_first[2];
5557 int16_t reg_off_first[2];
5558 int16_t reg_off_last[2];
5561 * One element that is misaligned and spans both pages,
5562 * or -1 if there is no such active element.
5564 int16_t mem_off_split;
5565 int16_t reg_off_split;
5568 * The byte offset at which the entire operation crosses a page boundary.
5569 * Set >= 0 if and only if the entire operation spans two pages.
5571 int16_t page_split;
5573 /* TLB data for the two pages. */
5574 SVEHostPage page[2];
5575 } SVEContLdSt;
5578 * Find first active element on each page, and a loose bound for the
5579 * final element on each page. Identify any single element that spans
5580 * the page boundary. Return true if there are any active elements.
5582 static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
5583 uint64_t *vg, intptr_t reg_max,
5584 int esz, int msize)
5586 const int esize = 1 << esz;
5587 const uint64_t pg_mask = pred_esz_masks[esz];
5588 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5589 intptr_t mem_off_last, mem_off_split;
5590 intptr_t page_split, elt_split;
5591 intptr_t i;
5593 /* Set all of the element indices to -1, and the TLB data to 0. */
5594 memset(info, -1, offsetof(SVEContLdSt, page));
5595 memset(info->page, 0, sizeof(info->page));
5597 /* Gross scan over the entire predicate to find bounds. */
5598 i = 0;
5599 do {
5600 uint64_t pg = vg[i] & pg_mask;
5601 if (pg) {
5602 reg_off_last = i * 64 + 63 - clz64(pg);
5603 if (reg_off_first < 0) {
5604 reg_off_first = i * 64 + ctz64(pg);
5607 } while (++i * 64 < reg_max);
5609 if (unlikely(reg_off_first < 0)) {
5610 /* No active elements, no pages touched. */
5611 return false;
5613 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5615 info->reg_off_first[0] = reg_off_first;
5616 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5617 mem_off_last = (reg_off_last >> esz) * msize;
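    /* Number of bytes from addr up to the end of its page. */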
5619 page_split = -(addr | TARGET_PAGE_MASK);
5620 if (likely(mem_off_last + msize <= page_split)) {
5621 /* The entire operation fits within a single page. */
5622 info->reg_off_last[0] = reg_off_last;
5623 return true;
5626 info->page_split = page_split;
5627 elt_split = page_split / msize;
5628 reg_off_split = elt_split << esz;
5629 mem_off_split = elt_split * msize;
5632 * This is the last full element on the first page, but it is not
5633 * necessarily active. If there is no full element, i.e. the first
5634 * active element is the one that's split, this value remains -1.
5635  * It is useful as an iteration bound.
5637 if (elt_split != 0) {
5638 info->reg_off_last[0] = reg_off_split - esize;
5641 /* Determine if an unaligned element spans the pages. */
5642 if (page_split % msize != 0) {
5643 /* It is helpful to know if the split element is active. */
5644 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5645 info->reg_off_split = reg_off_split;
5646 info->mem_off_split = mem_off_split;
5648 if (reg_off_split == reg_off_last) {
5649 /* The page crossing element is last. */
5650 return true;
5653 reg_off_split += esize;
5654 mem_off_split += msize;
5658 * We do want the first active element on the second page, because
5659 * this may affect the address reported in an exception.
5661 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5662 tcg_debug_assert(reg_off_split <= reg_off_last);
5663 info->reg_off_first[1] = reg_off_split;
5664 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5665 info->reg_off_last[1] = reg_off_last;
5666 return true;
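/*
 * Worked example (assuming 4 KiB pages): a fully-active 16-byte vector of
 * 32-bit elements (esz == 2, msize == 4) starting 8 bytes before a page
 * boundary.  page_split = 8, so elements 0-1 land on the first page and
 * elements 2-3 on the second, with nothing straddling the boundary:
 *   reg_off_first = { 0, 8 }, reg_off_last = { 4, 12 },
 *   mem_off_first = { 0, 8 }, mem_off_split = reg_off_split = -1.
 * Had the address been only 6 bytes before the boundary, element 1 would
 * straddle it and mem_off_split/reg_off_split would both be 4 instead.
 */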
5670 * Resolve the guest virtual addresses to info->page[].
5671 * Control the generation of page faults with @fault. Return false if
5672 * there is no work to do, which can only happen with @fault == FAULT_NO.
5674 static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5675 CPUARMState *env, target_ulong addr,
5676 MMUAccessType access_type, uintptr_t retaddr)
5678 int mmu_idx = cpu_mmu_index(env, false);
5679 int mem_off = info->mem_off_first[0];
5680 bool nofault = fault == FAULT_NO;
5681 bool have_work = true;
5683 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5684 access_type, mmu_idx, retaddr)) {
5685 /* No work to be done. */
5686 return false;
5689 if (likely(info->page_split < 0)) {
5690 /* The entire operation was on the one page. */
5691 return true;
5695 * If the second page is invalid, then we want the fault address to be
5696 * the first byte on that page which is accessed.
5698 if (info->mem_off_split >= 0) {
5700 * There is an element split across the pages. The fault address
5701 * should be the first byte of the second page.
5703 mem_off = info->page_split;
5705 * If the split element is also the first active element
5706 * of the vector, then: For first-fault we should continue
5707 * to generate faults for the second page. For no-fault,
5708 * we have work only if the second page is valid.
5710 if (info->mem_off_first[0] < info->mem_off_split) {
5711 nofault = FAULT_FIRST;
5712 have_work = false;
5714 } else {
5716 * There is no element split across the pages. The fault address
5717 * should be the first active element on the second page.
5719 mem_off = info->mem_off_first[1];
5721 * There must have been one active element on the first page,
5722 * so we're out of first-fault territory.
5724 nofault = fault != FAULT_ALL;
5727 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5728 access_type, mmu_idx, retaddr);
5729 return have_work;
5732 static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5733 uint64_t *vg, target_ulong addr,
5734 int esize, int msize, int wp_access,
5735 uintptr_t retaddr)
5737 #ifndef CONFIG_USER_ONLY
5738 intptr_t mem_off, reg_off, reg_last;
5739 int flags0 = info->page[0].flags;
5740 int flags1 = info->page[1].flags;
5742 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5743 return;
5746 /* Indicate that watchpoints are handled. */
5747 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5748 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5750 if (flags0 & TLB_WATCHPOINT) {
5751 mem_off = info->mem_off_first[0];
5752 reg_off = info->reg_off_first[0];
5753 reg_last = info->reg_off_last[0];
5755 while (reg_off <= reg_last) {
5756 uint64_t pg = vg[reg_off >> 6];
5757 do {
5758 if ((pg >> (reg_off & 63)) & 1) {
5759 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5760 msize, info->page[0].attrs,
5761 wp_access, retaddr);
5763 reg_off += esize;
5764 mem_off += msize;
5765 } while (reg_off <= reg_last && (reg_off & 63));
5769 mem_off = info->mem_off_split;
5770 if (mem_off >= 0) {
5771 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5772 info->page[0].attrs, wp_access, retaddr);
5775 mem_off = info->mem_off_first[1];
5776 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5777 reg_off = info->reg_off_first[1];
5778 reg_last = info->reg_off_last[1];
5780 do {
5781 uint64_t pg = vg[reg_off >> 6];
5782 do {
5783 if ((pg >> (reg_off & 63)) & 1) {
5784 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5785 msize, info->page[1].attrs,
5786 wp_access, retaddr);
5788 reg_off += esize;
5789 mem_off += msize;
5790 } while (reg_off & 63);
5791 } while (reg_off <= reg_last);
5793 #endif
5796 static void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5797 uint64_t *vg, target_ulong addr, int esize,
5798 int msize, uint32_t mtedesc, uintptr_t ra)
5800 intptr_t mem_off, reg_off, reg_last;
5802 /* Process the page only if MemAttr == Tagged. */
5803 if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
5804 mem_off = info->mem_off_first[0];
5805 reg_off = info->reg_off_first[0];
5806 reg_last = info->reg_off_split;
5807 if (reg_last < 0) {
5808 reg_last = info->reg_off_last[0];
5811 do {
5812 uint64_t pg = vg[reg_off >> 6];
5813 do {
5814 if ((pg >> (reg_off & 63)) & 1) {
5815                     mte_check(env, mtedesc, addr + mem_off, ra);
5817 reg_off += esize;
5818 mem_off += msize;
5819 } while (reg_off <= reg_last && (reg_off & 63));
5820 } while (reg_off <= reg_last);
5823 mem_off = info->mem_off_first[1];
5824 if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
5825 reg_off = info->reg_off_first[1];
5826 reg_last = info->reg_off_last[1];
5828 do {
5829 uint64_t pg = vg[reg_off >> 6];
5830 do {
5831 if ((pg >> (reg_off & 63)) & 1) {
5832                     mte_check(env, mtedesc, addr + mem_off, ra);
5834 reg_off += esize;
5835 mem_off += msize;
5836 } while (reg_off & 63);
5837 } while (reg_off <= reg_last);
5842  * Common helper for all contiguous 1,2,3,4-register predicated loads.
5844 static inline QEMU_ALWAYS_INLINE
5845 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5846 uint32_t desc, const uintptr_t retaddr,
5847 const int esz, const int msz, const int N, uint32_t mtedesc,
5848 sve_ldst1_host_fn *host_fn,
5849 sve_ldst1_tlb_fn *tlb_fn)
5851 const unsigned rd = simd_data(desc);
5852 const intptr_t reg_max = simd_oprsz(desc);
5853 intptr_t reg_off, reg_last, mem_off;
5854 SVEContLdSt info;
5855 void *host;
5856 int flags, i;
5858 /* Find the active elements. */
5859 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5860 /* The entire predicate was false; no load occurs. */
5861 for (i = 0; i < N; ++i) {
5862 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5864 return;
5867 /* Probe the page(s). Exit with exception for any invalid page. */
5868 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5870 /* Handle watchpoints for all active elements. */
5871 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5872 BP_MEM_READ, retaddr);
5875 * Handle mte checks for all active elements.
5876 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5878 if (mtedesc) {
5879 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5880 mtedesc, retaddr);
5883 flags = info.page[0].flags | info.page[1].flags;
5884 if (unlikely(flags != 0)) {
5885 #ifdef CONFIG_USER_ONLY
5886 g_assert_not_reached();
5887 #else
5889 * At least one page includes MMIO.
5890 * Any bus operation can fail with cpu_transaction_failed,
5891 * which for ARM will raise SyncExternal. Perform the load
5892 * into scratch memory to preserve register state until the end.
5894 ARMVectorReg scratch[4] = { };
5896 mem_off = info.mem_off_first[0];
5897 reg_off = info.reg_off_first[0];
5898 reg_last = info.reg_off_last[1];
5899 if (reg_last < 0) {
5900 reg_last = info.reg_off_split;
5901 if (reg_last < 0) {
5902 reg_last = info.reg_off_last[0];
5906 do {
5907 uint64_t pg = vg[reg_off >> 6];
5908 do {
5909 if ((pg >> (reg_off & 63)) & 1) {
5910 for (i = 0; i < N; ++i) {
5911 tlb_fn(env, &scratch[i], reg_off,
5912 addr + mem_off + (i << msz), retaddr);
5915 reg_off += 1 << esz;
5916 mem_off += N << msz;
5917 } while (reg_off & 63);
5918 } while (reg_off <= reg_last);
5920 for (i = 0; i < N; ++i) {
5921 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5923 return;
5924 #endif
5927 /* The entire operation is in RAM, on valid pages. */
5929 for (i = 0; i < N; ++i) {
5930 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5933 mem_off = info.mem_off_first[0];
5934 reg_off = info.reg_off_first[0];
5935 reg_last = info.reg_off_last[0];
5936 host = info.page[0].host;
5938 while (reg_off <= reg_last) {
5939 uint64_t pg = vg[reg_off >> 6];
5940 do {
5941 if ((pg >> (reg_off & 63)) & 1) {
5942 for (i = 0; i < N; ++i) {
5943 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5944 host + mem_off + (i << msz));
5947 reg_off += 1 << esz;
5948 mem_off += N << msz;
5949 } while (reg_off <= reg_last && (reg_off & 63));
5953 * Use the slow path to manage the cross-page misalignment.
5954 * But we know this is RAM and cannot trap.
5956 mem_off = info.mem_off_split;
5957 if (unlikely(mem_off >= 0)) {
5958 reg_off = info.reg_off_split;
5959 for (i = 0; i < N; ++i) {
5960 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5961 addr + mem_off + (i << msz), retaddr);
5965 mem_off = info.mem_off_first[1];
5966 if (unlikely(mem_off >= 0)) {
5967 reg_off = info.reg_off_first[1];
5968 reg_last = info.reg_off_last[1];
5969 host = info.page[1].host;
5971 do {
5972 uint64_t pg = vg[reg_off >> 6];
5973 do {
5974 if ((pg >> (reg_off & 63)) & 1) {
5975 for (i = 0; i < N; ++i) {
5976 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5977 host + mem_off + (i << msz));
5980 reg_off += 1 << esz;
5981 mem_off += N << msz;
5982 } while (reg_off & 63);
5983 } while (reg_off <= reg_last);
5987 static inline QEMU_ALWAYS_INLINE
5988 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5989 uint32_t desc, const uintptr_t ra,
5990 const int esz, const int msz, const int N,
5991 sve_ldst1_host_fn *host_fn,
5992 sve_ldst1_tlb_fn *tlb_fn)
5994 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5995 int bit55 = extract64(addr, 55, 1);
5997 /* Remove mtedesc from the normal sve descriptor. */
5998 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6000 /* Perform gross MTE suppression early. */
6001 if (!tbi_check(desc, bit55) ||
6002 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6003 mtedesc = 0;
6006 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6009 #define DO_LD1_1(NAME, ESZ) \
6010 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
6011 target_ulong addr, uint32_t desc) \
6013 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
6014 sve_##NAME##_host, sve_##NAME##_tlb); \
6016 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
6017 target_ulong addr, uint32_t desc) \
6019 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
6020 sve_##NAME##_host, sve_##NAME##_tlb); \
6023 #define DO_LD1_2(NAME, ESZ, MSZ) \
6024 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
6025 target_ulong addr, uint32_t desc) \
6027 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
6028 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
6030 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
6031 target_ulong addr, uint32_t desc) \
6033 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
6034 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
6036 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6037 target_ulong addr, uint32_t desc) \
6039 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
6040 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
6042 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6043 target_ulong addr, uint32_t desc) \
6045 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
6046 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
6049 DO_LD1_1(ld1bb, MO_8)
6050 DO_LD1_1(ld1bhu, MO_16)
6051 DO_LD1_1(ld1bhs, MO_16)
6052 DO_LD1_1(ld1bsu, MO_32)
6053 DO_LD1_1(ld1bss, MO_32)
6054 DO_LD1_1(ld1bdu, MO_64)
6055 DO_LD1_1(ld1bds, MO_64)
6057 DO_LD1_2(ld1hh, MO_16, MO_16)
6058 DO_LD1_2(ld1hsu, MO_32, MO_16)
6059 DO_LD1_2(ld1hss, MO_32, MO_16)
6060 DO_LD1_2(ld1hdu, MO_64, MO_16)
6061 DO_LD1_2(ld1hds, MO_64, MO_16)
6063 DO_LD1_2(ld1ss, MO_32, MO_32)
6064 DO_LD1_2(ld1sdu, MO_64, MO_32)
6065 DO_LD1_2(ld1sds, MO_64, MO_32)
6067 DO_LD1_2(ld1dd, MO_64, MO_64)
6069 #undef DO_LD1_1
6070 #undef DO_LD1_2
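/*
 * For reference, each DO_LD1_2 instantiation above produces four helpers;
 * e.g. DO_LD1_2(ld1hh, MO_16, MO_16) defines the sve_ld1hh_le_r,
 * sve_ld1hh_be_r, sve_ld1hh_le_r_mte and sve_ld1hh_be_r_mte helpers, all
 * thin wrappers that call sve_ldN_r or sve_ldN_r_mte with N == 1 and the
 * matching host/tlb primitives.
 */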
6072 #define DO_LDN_1(N) \
6073 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
6074 target_ulong addr, uint32_t desc) \
6076 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
6077 sve_ld1bb_host, sve_ld1bb_tlb); \
6079 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
6080 target_ulong addr, uint32_t desc) \
6082 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
6083 sve_ld1bb_host, sve_ld1bb_tlb); \
6086 #define DO_LDN_2(N, SUFF, ESZ) \
6087 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
6088 target_ulong addr, uint32_t desc) \
6090 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
6091 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
6093 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
6094 target_ulong addr, uint32_t desc) \
6096 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
6097 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
6099 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
6100 target_ulong addr, uint32_t desc) \
6102 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
6103 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
6105 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
6106 target_ulong addr, uint32_t desc) \
6108 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
6109 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
6112 DO_LDN_1(2)
6113 DO_LDN_1(3)
6114 DO_LDN_1(4)
6116 DO_LDN_2(2, hh, MO_16)
6117 DO_LDN_2(3, hh, MO_16)
6118 DO_LDN_2(4, hh, MO_16)
6120 DO_LDN_2(2, ss, MO_32)
6121 DO_LDN_2(3, ss, MO_32)
6122 DO_LDN_2(4, ss, MO_32)
6124 DO_LDN_2(2, dd, MO_64)
6125 DO_LDN_2(3, dd, MO_64)
6126 DO_LDN_2(4, dd, MO_64)
6128 #undef DO_LDN_1
6129 #undef DO_LDN_2
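/*
 * The N-register forms above de-interleave: mem_off advances by N elements
 * per vector element while register (rd + i) takes the element at byte
 * offset mem_off + (i << msz).  E.g. for LD2H, an active element j loads
 * Zd.H[j] from addr + 4*j and Z(d+1).H[j] from addr + 4*j + 2.
 */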
6132 * Load contiguous data, first-fault and no-fault.
6134 * For user-only, one could argue that we should hold the mmap_lock during
6135 * the operation so that there is no race between page_check_range and the
6136 * load operation. However, unmapping pages out from under a running thread
6137 * is extraordinarily unlikely. This theoretical race condition also affects
6138 * linux-user/ in its get_user/put_user macros.
6140 * TODO: Construct some helpers, written in assembly, that interact with
6141 * handle_cpu_signal to produce memory ops which can properly report errors
6142 * without racing.
6145 /* Fault on byte I. All bits in FFR from I are cleared. The vector
6146 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
6147 * option, which leaves subsequent data unchanged.
6149 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
6151 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
6153 if (i & 63) {
6154 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
6155 i = ROUND_UP(i, 64);
6157 for (; i < oprsz; i += 64) {
6158 ffr[i / 64] = 0;
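/*
 * For example, record_fault(env, 20, 128) keeps FFR bits [19:0] and clears
 * bits 20..127, marking the element at byte offset 20 as the first one
 * that did not load.
 */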
6163 * Common helper for all contiguous no-fault and first-fault loads.
6165 static inline QEMU_ALWAYS_INLINE
6166 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
6167 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
6168 const int esz, const int msz, const SVEContFault fault,
6169 sve_ldst1_host_fn *host_fn,
6170 sve_ldst1_tlb_fn *tlb_fn)
6172 const unsigned rd = simd_data(desc);
6173 void *vd = &env->vfp.zregs[rd];
6174 const intptr_t reg_max = simd_oprsz(desc);
6175 intptr_t reg_off, mem_off, reg_last;
6176 SVEContLdSt info;
6177 int flags;
6178 void *host;
6180 /* Find the active elements. */
6181 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
6182 /* The entire predicate was false; no load occurs. */
6183 memset(vd, 0, reg_max);
6184 return;
6186 reg_off = info.reg_off_first[0];
6188 /* Probe the page(s). */
6189 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
6190 /* Fault on first element. */
6191 tcg_debug_assert(fault == FAULT_NO);
6192 memset(vd, 0, reg_max);
6193 goto do_fault;
6196 mem_off = info.mem_off_first[0];
6197 flags = info.page[0].flags;
6200 * Disable MTE checking if the Tagged bit is not set. Since TBI must
6201 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6203     if (!arm_tlb_mte_tagged(&info.page[0].attrs)) {
6204 mtedesc = 0;
6207 if (fault == FAULT_FIRST) {
6208 /* Trapping mte check for the first-fault element. */
6209 if (mtedesc) {
6210 mte_check(env, mtedesc, addr + mem_off, retaddr);
6214 * Special handling of the first active element,
6215 * if it crosses a page boundary or is MMIO.
6217 bool is_split = mem_off == info.mem_off_split;
6218 if (unlikely(flags != 0) || unlikely(is_split)) {
6220 * Use the slow path for cross-page handling.
6221 * Might trap for MMIO or watchpoints.
6223 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6225 /* After any fault, zero the other elements. */
6226 swap_memzero(vd, reg_off);
6227 reg_off += 1 << esz;
6228 mem_off += 1 << msz;
6229 swap_memzero(vd + reg_off, reg_max - reg_off);
6231 if (is_split) {
6232 goto second_page;
6234 } else {
6235 memset(vd, 0, reg_max);
6237 } else {
6238 memset(vd, 0, reg_max);
6239 if (unlikely(mem_off == info.mem_off_split)) {
6240 /* The first active element crosses a page boundary. */
6241 flags |= info.page[1].flags;
6242 if (unlikely(flags & TLB_MMIO)) {
6243 /* Some page is MMIO, see below. */
6244 goto do_fault;
6246 if (unlikely(flags & TLB_WATCHPOINT) &&
6247 (cpu_watchpoint_address_matches
6248 (env_cpu(env), addr + mem_off, 1 << msz)
6249 & BP_MEM_READ)) {
6250 /* Watchpoint hit, see below. */
6251 goto do_fault;
6253 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6254 goto do_fault;
6257 * Use the slow path for cross-page handling.
6258 * This is RAM, without a watchpoint, and will not trap.
6260 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6261 goto second_page;
6266 * From this point on, all memory operations are MemSingleNF.
6268 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6269 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6271      * Unfortunately we do not have access to the memory attributes from the
6272 * PTE to tell Device memory from Normal memory. So we make a mostly
6273 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6274 * This gives the right answer for the common cases of "Normal memory,
6275 * backed by host RAM" and "Device memory, backed by MMIO".
6276 * The architecture allows us to suppress an NF load and return
6277 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6278 * case of "Normal memory, backed by MMIO" is permitted. The case we
6279 * get wrong is "Device memory, backed by host RAM", for which we
6280      * should return (UNKNOWN, FAULT) but do not.
6282 * Similarly, CPU_BP breakpoints would raise exceptions, and so
6283 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
6284 * architectural breakpoints the same.
6286 if (unlikely(flags & TLB_MMIO)) {
6287 goto do_fault;
6290 reg_last = info.reg_off_last[0];
6291 host = info.page[0].host;
6293 do {
6294 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6295 do {
6296 if ((pg >> (reg_off & 63)) & 1) {
6297 if (unlikely(flags & TLB_WATCHPOINT) &&
6298 (cpu_watchpoint_address_matches
6299 (env_cpu(env), addr + mem_off, 1 << msz)
6300 & BP_MEM_READ)) {
6301 goto do_fault;
6303 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6304 goto do_fault;
6306 host_fn(vd, reg_off, host + mem_off);
6308 reg_off += 1 << esz;
6309 mem_off += 1 << msz;
6310 } while (reg_off <= reg_last && (reg_off & 63));
6311 } while (reg_off <= reg_last);
6314 * MemSingleNF is allowed to fail for any reason. We have special
6315 * code above to handle the first element crossing a page boundary.
6316 * As an implementation choice, decline to handle a cross-page element
6317 * in any other position.
6319 reg_off = info.reg_off_split;
6320 if (reg_off >= 0) {
6321 goto do_fault;
6324 second_page:
6325 reg_off = info.reg_off_first[1];
6326 if (likely(reg_off < 0)) {
6327 /* No active elements on the second page. All done. */
6328 return;
6332 * MemSingleNF is allowed to fail for any reason. As an implementation
6333 * choice, decline to handle elements on the second page. This should
6334 * be low frequency as the guest walks through memory -- the next
6335 * iteration of the guest's loop should be aligned on the page boundary,
6336 * and then all following iterations will stay aligned.
6339 do_fault:
6340 record_fault(env, reg_off, reg_max);
6343 static inline QEMU_ALWAYS_INLINE
6344 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6345 uint32_t desc, const uintptr_t retaddr,
6346 const int esz, const int msz, const SVEContFault fault,
6347 sve_ldst1_host_fn *host_fn,
6348 sve_ldst1_tlb_fn *tlb_fn)
6350 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6351 int bit55 = extract64(addr, 55, 1);
6353 /* Remove mtedesc from the normal sve descriptor. */
6354 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6356 /* Perform gross MTE suppression early. */
6357 if (!tbi_check(desc, bit55) ||
6358 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6359 mtedesc = 0;
6362 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6363 esz, msz, fault, host_fn, tlb_fn);
6366 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
6367 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
6368 target_ulong addr, uint32_t desc) \
6370 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6371 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6373 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
6374 target_ulong addr, uint32_t desc) \
6376 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6377 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6379 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
6380 target_ulong addr, uint32_t desc) \
6382 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6383 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6385 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
6386 target_ulong addr, uint32_t desc) \
6388 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6389 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6392 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
6393 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
6394 target_ulong addr, uint32_t desc) \
6396 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6397 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6399 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
6400 target_ulong addr, uint32_t desc) \
6402 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6403 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6405 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
6406 target_ulong addr, uint32_t desc) \
6408 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6409 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6411 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
6412 target_ulong addr, uint32_t desc) \
6414 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6415 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6417 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6418 target_ulong addr, uint32_t desc) \
6420 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6421 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6423 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6424 target_ulong addr, uint32_t desc) \
6426 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6427 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6429 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6430 target_ulong addr, uint32_t desc) \
6432 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6433 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6435 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6436 target_ulong addr, uint32_t desc) \
6438 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6439 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6442 DO_LDFF1_LDNF1_1(bb, MO_8)
6443 DO_LDFF1_LDNF1_1(bhu, MO_16)
6444 DO_LDFF1_LDNF1_1(bhs, MO_16)
6445 DO_LDFF1_LDNF1_1(bsu, MO_32)
6446 DO_LDFF1_LDNF1_1(bss, MO_32)
6447 DO_LDFF1_LDNF1_1(bdu, MO_64)
6448 DO_LDFF1_LDNF1_1(bds, MO_64)
6450 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
6451 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6452 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6453 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6454 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6456 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
6457 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6458 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6460 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
6462 #undef DO_LDFF1_LDNF1_1
6463 #undef DO_LDFF1_LDNF1_2
6466 * Common helper for all contiguous 1,2,3,4-register predicated stores.
6469 static inline QEMU_ALWAYS_INLINE
6470 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6471 uint32_t desc, const uintptr_t retaddr,
6472 const int esz, const int msz, const int N, uint32_t mtedesc,
6473 sve_ldst1_host_fn *host_fn,
6474 sve_ldst1_tlb_fn *tlb_fn)
6476 const unsigned rd = simd_data(desc);
6477 const intptr_t reg_max = simd_oprsz(desc);
6478 intptr_t reg_off, reg_last, mem_off;
6479 SVEContLdSt info;
6480 void *host;
6481 int i, flags;
6483 /* Find the active elements. */
6484 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6485 /* The entire predicate was false; no store occurs. */
6486 return;
6489 /* Probe the page(s). Exit with exception for any invalid page. */
6490 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6492 /* Handle watchpoints for all active elements. */
6493 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6494 BP_MEM_WRITE, retaddr);
6497 * Handle mte checks for all active elements.
6498 * Since TBI must be set for MTE, !mtedesc => !mte_active.
6500 if (mtedesc) {
6501 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6502 mtedesc, retaddr);
6505 flags = info.page[0].flags | info.page[1].flags;
6506 if (unlikely(flags != 0)) {
6507 #ifdef CONFIG_USER_ONLY
6508 g_assert_not_reached();
6509 #else
6511 * At least one page includes MMIO.
6512 * Any bus operation can fail with cpu_transaction_failed,
6513 * which for ARM will raise SyncExternal. We cannot avoid
6514 * this fault and will leave with the store incomplete.
6516 mem_off = info.mem_off_first[0];
6517 reg_off = info.reg_off_first[0];
6518 reg_last = info.reg_off_last[1];
6519 if (reg_last < 0) {
6520 reg_last = info.reg_off_split;
6521 if (reg_last < 0) {
6522 reg_last = info.reg_off_last[0];
6526 do {
6527 uint64_t pg = vg[reg_off >> 6];
6528 do {
6529 if ((pg >> (reg_off & 63)) & 1) {
6530 for (i = 0; i < N; ++i) {
6531 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6532 addr + mem_off + (i << msz), retaddr);
6535 reg_off += 1 << esz;
6536 mem_off += N << msz;
6537 } while (reg_off & 63);
6538 } while (reg_off <= reg_last);
6539 return;
6540 #endif
6543 mem_off = info.mem_off_first[0];
6544 reg_off = info.reg_off_first[0];
6545 reg_last = info.reg_off_last[0];
6546 host = info.page[0].host;
6548 while (reg_off <= reg_last) {
6549 uint64_t pg = vg[reg_off >> 6];
6550 do {
6551 if ((pg >> (reg_off & 63)) & 1) {
6552 for (i = 0; i < N; ++i) {
6553 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6554 host + mem_off + (i << msz));
6557 reg_off += 1 << esz;
6558 mem_off += N << msz;
6559 } while (reg_off <= reg_last && (reg_off & 63));
6563 * Use the slow path to manage the cross-page misalignment.
6564 * But we know this is RAM and cannot trap.
6566 mem_off = info.mem_off_split;
6567 if (unlikely(mem_off >= 0)) {
6568 reg_off = info.reg_off_split;
6569 for (i = 0; i < N; ++i) {
6570 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6571 addr + mem_off + (i << msz), retaddr);
6575 mem_off = info.mem_off_first[1];
6576 if (unlikely(mem_off >= 0)) {
6577 reg_off = info.reg_off_first[1];
6578 reg_last = info.reg_off_last[1];
6579 host = info.page[1].host;
6581 do {
6582 uint64_t pg = vg[reg_off >> 6];
6583 do {
6584 if ((pg >> (reg_off & 63)) & 1) {
6585 for (i = 0; i < N; ++i) {
6586 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6587 host + mem_off + (i << msz));
6590 reg_off += 1 << esz;
6591 mem_off += N << msz;
6592 } while (reg_off & 63);
6593 } while (reg_off <= reg_last);
6597 static inline QEMU_ALWAYS_INLINE
6598 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6599 uint32_t desc, const uintptr_t ra,
6600 const int esz, const int msz, const int N,
6601 sve_ldst1_host_fn *host_fn,
6602 sve_ldst1_tlb_fn *tlb_fn)
6604 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6605 int bit55 = extract64(addr, 55, 1);
6607 /* Remove mtedesc from the normal sve descriptor. */
6608 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6610 /* Perform gross MTE suppression early. */
6611 if (!tbi_check(desc, bit55) ||
6612 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6613 mtedesc = 0;
6616 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6619 #define DO_STN_1(N, NAME, ESZ) \
6620 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
6621 target_ulong addr, uint32_t desc) \
6623 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
6624 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6626 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
6627 target_ulong addr, uint32_t desc) \
6629 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
6630 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6633 #define DO_STN_2(N, NAME, ESZ, MSZ) \
6634 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
6635 target_ulong addr, uint32_t desc) \
6637 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6638 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6640 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
6641 target_ulong addr, uint32_t desc) \
6643 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6644 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6646 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6647 target_ulong addr, uint32_t desc) \
6649 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6650 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6652 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6653 target_ulong addr, uint32_t desc) \
6655 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6656 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6659 DO_STN_1(1, bb, MO_8)
6660 DO_STN_1(1, bh, MO_16)
6661 DO_STN_1(1, bs, MO_32)
6662 DO_STN_1(1, bd, MO_64)
6663 DO_STN_1(2, bb, MO_8)
6664 DO_STN_1(3, bb, MO_8)
6665 DO_STN_1(4, bb, MO_8)
6667 DO_STN_2(1, hh, MO_16, MO_16)
6668 DO_STN_2(1, hs, MO_32, MO_16)
6669 DO_STN_2(1, hd, MO_64, MO_16)
6670 DO_STN_2(2, hh, MO_16, MO_16)
6671 DO_STN_2(3, hh, MO_16, MO_16)
6672 DO_STN_2(4, hh, MO_16, MO_16)
6674 DO_STN_2(1, ss, MO_32, MO_32)
6675 DO_STN_2(1, sd, MO_64, MO_32)
6676 DO_STN_2(2, ss, MO_32, MO_32)
6677 DO_STN_2(3, ss, MO_32, MO_32)
6678 DO_STN_2(4, ss, MO_32, MO_32)
6680 DO_STN_2(1, dd, MO_64, MO_64)
6681 DO_STN_2(2, dd, MO_64, MO_64)
6682 DO_STN_2(3, dd, MO_64, MO_64)
6683 DO_STN_2(4, dd, MO_64, MO_64)
6685 #undef DO_STN_1
6686 #undef DO_STN_2
6689 * Loads with a vector index.
6693 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6695 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6697 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6699 return *(uint32_t *)(reg + H1_4(reg_ofs));
6702 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6704 return *(int32_t *)(reg + H1_4(reg_ofs));
6707 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6709 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6712 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6714 return (int32_t)*(uint64_t *)(reg + reg_ofs);
6717 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6719 return *(uint64_t *)(reg + reg_ofs);
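/*
 * For illustration only: how the gather/scatter helpers below turn a vector
 * of offsets into target addresses.  collect_gather_addrs() is a
 * hypothetical name; the real helpers fold this computation into their
 * element loops.
 */
#if 0
static void collect_gather_addrs(void *vm, target_ulong base, int scale,
                                 intptr_t reg_max, int esize,
                                 zreg_off_fn *off_fn, target_ulong *out)
{
    intptr_t reg_off;

    for (reg_off = 0; reg_off < reg_max; reg_off += esize) {
        /* Same computation as sve_ld1_z/sve_ldff1_z/sve_st1_z below. */
        out[reg_off / esize] = base + (off_fn(vm, reg_off) << scale);
    }
}
#endif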
6722 static inline QEMU_ALWAYS_INLINE
6723 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6724 target_ulong base, uint32_t desc, uintptr_t retaddr,
6725 uint32_t mtedesc, int esize, int msize,
6726 zreg_off_fn *off_fn,
6727 sve_ldst1_host_fn *host_fn,
6728 sve_ldst1_tlb_fn *tlb_fn)
6730 const int mmu_idx = cpu_mmu_index(env, false);
6731 const intptr_t reg_max = simd_oprsz(desc);
6732 const int scale = simd_data(desc);
6733 ARMVectorReg scratch;
6734 intptr_t reg_off;
6735 SVEHostPage info, info2;
6737 memset(&scratch, 0, reg_max);
6738 reg_off = 0;
6739 do {
6740 uint64_t pg = vg[reg_off >> 6];
6741 do {
6742 if (likely(pg & 1)) {
6743 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6744 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6746 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6747 mmu_idx, retaddr);
6749 if (likely(in_page >= msize)) {
6750 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6751 cpu_check_watchpoint(env_cpu(env), addr, msize,
6752 info.attrs, BP_MEM_READ, retaddr);
6754 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
6755 mte_check(env, mtedesc, addr, retaddr);
6757 host_fn(&scratch, reg_off, info.host);
6758 } else {
6759 /* Element crosses the page boundary. */
6760 sve_probe_page(&info2, false, env, addr + in_page, 0,
6761 MMU_DATA_LOAD, mmu_idx, retaddr);
6762 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6763 cpu_check_watchpoint(env_cpu(env), addr,
6764 msize, info.attrs,
6765 BP_MEM_READ, retaddr);
6767 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
6768 mte_check(env, mtedesc, addr, retaddr);
6770 tlb_fn(env, &scratch, reg_off, addr, retaddr);
6773 reg_off += esize;
6774 pg >>= esize;
6775 } while (reg_off & 63);
6776 } while (reg_off < reg_max);
6778 /* Wait until all exceptions have been raised to write back. */
6779 memcpy(vd, &scratch, reg_max);
6782 static inline QEMU_ALWAYS_INLINE
6783 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6784 target_ulong base, uint32_t desc, uintptr_t retaddr,
6785 int esize, int msize, zreg_off_fn *off_fn,
6786 sve_ldst1_host_fn *host_fn,
6787 sve_ldst1_tlb_fn *tlb_fn)
6789 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6790 /* Remove mtedesc from the normal sve descriptor. */
6791 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6794 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6795 * offset base entirely over the address space hole to change the
6796 * pointer tag, or change the bit55 selector. So we could here
6797 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6799 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6800 esize, msize, off_fn, host_fn, tlb_fn);
6803 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6804 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6805 void *vm, target_ulong base, uint32_t desc) \
6807 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
6808 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6810 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6811 void *vm, target_ulong base, uint32_t desc) \
6813 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
6814 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6817 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6818 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6819 void *vm, target_ulong base, uint32_t desc) \
6821 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
6822 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6824 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6825 void *vm, target_ulong base, uint32_t desc) \
6827 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
6828 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6831 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6832 DO_LD1_ZPZ_S(bsu, zss, MO_8)
6833 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6834 DO_LD1_ZPZ_D(bdu, zss, MO_8)
6835 DO_LD1_ZPZ_D(bdu, zd, MO_8)
6837 DO_LD1_ZPZ_S(bss, zsu, MO_8)
6838 DO_LD1_ZPZ_S(bss, zss, MO_8)
6839 DO_LD1_ZPZ_D(bds, zsu, MO_8)
6840 DO_LD1_ZPZ_D(bds, zss, MO_8)
6841 DO_LD1_ZPZ_D(bds, zd, MO_8)
6843 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6844 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6845 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6846 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6847 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6849 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6850 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6851 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6852 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6853 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6855 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6856 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6857 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6858 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6859 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6861 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6862 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6863 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6864 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6865 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6867 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6868 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6869 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6870 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6871 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6873 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6874 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6875 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6876 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6877 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6879 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6880 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6881 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6883 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6884 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6885 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6887 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6888 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6889 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6891 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6892 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6893 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6895 #undef DO_LD1_ZPZ_S
6896 #undef DO_LD1_ZPZ_D
6898 /* First fault loads with a vector index. */
6901 * Common helpers for all gather first-faulting loads.
6904 static inline QEMU_ALWAYS_INLINE
6905 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6906 target_ulong base, uint32_t desc, uintptr_t retaddr,
6907 uint32_t mtedesc, const int esz, const int msz,
6908 zreg_off_fn *off_fn,
6909 sve_ldst1_host_fn *host_fn,
6910 sve_ldst1_tlb_fn *tlb_fn)
6912 const int mmu_idx = cpu_mmu_index(env, false);
6913 const intptr_t reg_max = simd_oprsz(desc);
6914 const int scale = simd_data(desc);
6915 const int esize = 1 << esz;
6916 const int msize = 1 << msz;
6917 intptr_t reg_off;
6918 SVEHostPage info;
6919 target_ulong addr, in_page;
6921 /* Skip to the first true predicate. */
6922 reg_off = find_next_active(vg, 0, reg_max, esz);
6923 if (unlikely(reg_off >= reg_max)) {
6924 /* The entire predicate was false; no load occurs. */
6925 memset(vd, 0, reg_max);
6926 return;
6930 * Probe the first element, allowing faults.
6932 addr = base + (off_fn(vm, reg_off) << scale);
6933 if (mtedesc) {
6934 mte_check(env, mtedesc, addr, retaddr);
6936 tlb_fn(env, vd, reg_off, addr, retaddr);
6938 /* After any fault, zero the other elements. */
6939 swap_memzero(vd, reg_off);
6940 reg_off += esize;
6941 swap_memzero(vd + reg_off, reg_max - reg_off);
6944 * Probe the remaining elements, not allowing faults.
6946 while (reg_off < reg_max) {
6947 uint64_t pg = vg[reg_off >> 6];
6948 do {
6949 if (likely((pg >> (reg_off & 63)) & 1)) {
6950 addr = base + (off_fn(vm, reg_off) << scale);
6951 in_page = -(addr | TARGET_PAGE_MASK);
6953 if (unlikely(in_page < msize)) {
6954 /* Stop if the element crosses a page boundary. */
6955 goto fault;
6958 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6959 mmu_idx, retaddr);
6960 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6961 goto fault;
6963 if (unlikely(info.flags & TLB_WATCHPOINT) &&
6964 (cpu_watchpoint_address_matches
6965 (env_cpu(env), addr, msize) & BP_MEM_READ)) {
6966 goto fault;
6968 if (mtedesc &&
6969 arm_tlb_mte_tagged(&info.attrs) &&
6970 !mte_probe(env, mtedesc, addr)) {
6971 goto fault;
6974 host_fn(vd, reg_off, info.host);
6976 reg_off += esize;
6977 } while (reg_off & 63);
6979 return;
6981 fault:
6982 record_fault(env, reg_off, reg_max);
6985 static inline QEMU_ALWAYS_INLINE
6986 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6987 target_ulong base, uint32_t desc, uintptr_t retaddr,
6988 const int esz, const int msz,
6989 zreg_off_fn *off_fn,
6990 sve_ldst1_host_fn *host_fn,
6991 sve_ldst1_tlb_fn *tlb_fn)
6993 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6994 /* Remove mtedesc from the normal sve descriptor. */
6995 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6998 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6999 * offset base entirely over the address space hole to change the
7000 * pointer tag, or change the bit55 selector. So we could here
7001 * examine TBI + TCMA like we do for sve_ldN_r_mte().
7003 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7004 esz, msz, off_fn, host_fn, tlb_fn);
7007 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
7008 void HELPER(sve_ldff##MEM##_##OFS) \
7009 (CPUARMState *env, void *vd, void *vg, \
7010 void *vm, target_ulong base, uint32_t desc) \
7012 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
7013 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7015 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
7016 (CPUARMState *env, void *vd, void *vg, \
7017 void *vm, target_ulong base, uint32_t desc) \
7019 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
7020 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7023 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
7024 void HELPER(sve_ldff##MEM##_##OFS) \
7025 (CPUARMState *env, void *vd, void *vg, \
7026 void *vm, target_ulong base, uint32_t desc) \
7028 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
7029 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7031 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
7032 (CPUARMState *env, void *vd, void *vg, \
7033 void *vm, target_ulong base, uint32_t desc) \
7035 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
7036 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7039 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
7040 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
7041 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
7042 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
7043 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
7045 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
7046 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
7047 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
7048 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
7049 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
7051 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
7052 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
7053 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
7054 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
7055 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
7057 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
7058 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
7059 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
7060 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
7061 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
7063 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
7064 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
7065 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
7066 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
7067 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
7069 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
7070 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
7071 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
7072 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
7073 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
7075 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
7076 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
7077 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
7078 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
7079 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
7081 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
7082 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
7083 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
7084 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
7085 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
7087 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
7088 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
7089 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
7091 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
7092 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
7093 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
7095 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
7096 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
7097 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
7099 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
7100 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
7101 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
7103 /* Stores with a vector index. */
7105 static inline QEMU_ALWAYS_INLINE
7106 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7107 target_ulong base, uint32_t desc, uintptr_t retaddr,
7108 uint32_t mtedesc, int esize, int msize,
7109 zreg_off_fn *off_fn,
7110 sve_ldst1_host_fn *host_fn,
7111 sve_ldst1_tlb_fn *tlb_fn)
7113 const int mmu_idx = cpu_mmu_index(env, false);
7114 const intptr_t reg_max = simd_oprsz(desc);
7115 const int scale = simd_data(desc);
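    /*
     * One slot per element at the architectural maximum vector length;
     * the 32-bit-element forms need all ARM_MAX_VQ * 4 entries, the
     * 64-bit-element forms at most half of them.
     */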
7116 void *host[ARM_MAX_VQ * 4];
7117 intptr_t reg_off, i;
7118 SVEHostPage info, info2;
7121 * Probe all of the elements for host addresses and flags.
7123 i = reg_off = 0;
7124 do {
7125 uint64_t pg = vg[reg_off >> 6];
7126 do {
7127 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7128 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
7130 host[i] = NULL;
7131 if (likely((pg >> (reg_off & 63)) & 1)) {
7132 if (likely(in_page >= msize)) {
7133 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
7134 mmu_idx, retaddr);
7135 host[i] = info.host;
7136 } else {
7138 * Element crosses the page boundary.
7139 * Probe both pages, but do not record the host address,
7140 * so that we use the slow path.
7142 sve_probe_page(&info, false, env, addr, 0,
7143 MMU_DATA_STORE, mmu_idx, retaddr);
7144 sve_probe_page(&info2, false, env, addr + in_page, 0,
7145 MMU_DATA_STORE, mmu_idx, retaddr);
7146 info.flags |= info2.flags;
7149 if (unlikely(info.flags & TLB_WATCHPOINT)) {
7150 cpu_check_watchpoint(env_cpu(env), addr, msize,
7151 info.attrs, BP_MEM_WRITE, retaddr);
7154 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
7155 mte_check(env, mtedesc, addr, retaddr);
7158 i += 1;
7159 reg_off += esize;
7160 } while (reg_off & 63);
7161 } while (reg_off < reg_max);
7164 * Now that we have recognized all exceptions except SyncExternal
7165 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
7167 * Note for the common case of an element in RAM, not crossing a page
7168 * boundary, we have stored the host address in host[]. This doubles
7169 * as a first-level check against the predicate, since only enabled
7170 * elements have non-null host addresses.
7172 i = reg_off = 0;
7173 do {
7174 void *h = host[i];
7175 if (likely(h != NULL)) {
7176 host_fn(vd, reg_off, h);
7177 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
7178 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7179 tlb_fn(env, vd, reg_off, addr, retaddr);
7181 i += 1;
7182 reg_off += esize;
7183 } while (reg_off < reg_max);
7186 static inline QEMU_ALWAYS_INLINE
7187 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7188 target_ulong base, uint32_t desc, uintptr_t retaddr,
7189 int esize, int msize, zreg_off_fn *off_fn,
7190 sve_ldst1_host_fn *host_fn,
7191 sve_ldst1_tlb_fn *tlb_fn)
7193 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7194 /* Remove mtedesc from the normal sve descriptor. */
7195 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
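/*
 * The descriptor packs two items: the low SIMD_DATA_SHIFT +
 * SVE_MTEDESC_SHIFT bits are the ordinary simd descriptor consumed by
 * sve_st1_z(), and the bits above them are the MTE descriptor.  The
 * non-MTE helpers pass mtedesc == 0, which makes sve_st1_z() skip its
 * mte_check() calls.
 */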
7198 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7199 * offset base entirely over the address space hole to change the
7200 * pointer tag, or change the bit55 selector. So we could here
7201 * examine TBI + TCMA like we do for sve_ldN_r_mte().
7203 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7204 esize, msize, off_fn, host_fn, tlb_fn);
7207 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
7208 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7209 void *vm, target_ulong base, uint32_t desc) \
7211 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
7212 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7214 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7215 void *vm, target_ulong base, uint32_t desc) \
7217 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
7218 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7221 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
7222 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7223 void *vm, target_ulong base, uint32_t desc) \
7225 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
7226 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7228 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7229 void *vm, target_ulong base, uint32_t desc) \
7231 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
7232 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7235 DO_ST1_ZPZ_S(bs, zsu, MO_8)
7236 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7237 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7238 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7239 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7241 DO_ST1_ZPZ_S(bs, zss, MO_8)
7242 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7243 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7244 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7245 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7247 DO_ST1_ZPZ_D(bd, zsu, MO_8)
7248 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7249 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7250 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7251 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7252 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7253 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7255 DO_ST1_ZPZ_D(bd, zss, MO_8)
7256 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7257 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7258 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7259 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7260 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7261 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7263 DO_ST1_ZPZ_D(bd, zd, MO_8)
7264 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7265 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7266 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7267 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7268 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7269 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
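/*
 * Naming of the helpers generated above mirrors the gather-load
 * helpers earlier in the file: the MEM tag encodes the memory size,
 * the vector element size and the endianness (e.g. "sd_le" stores a
 * 32-bit value taken from a 64-bit element, little-endian), and the
 * OFS tag selects how the index vector is read -- zsu/zss for 32-bit
 * unsigned/signed offsets, zd for 64-bit offsets.
 */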
7271 #undef DO_ST1_ZPZ_S
7272 #undef DO_ST1_ZPZ_D
7274 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7276 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7277 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7279 for (i = 0; i < opr_sz; ++i) {
7280 d[i] = n[i] ^ m[i] ^ k[i];
7284 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7286 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7287 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7289 for (i = 0; i < opr_sz; ++i) {
7290 d[i] = n[i] ^ (m[i] & ~k[i]);
7294 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7296 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7297 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7299 for (i = 0; i < opr_sz; ++i) {
7300 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7304 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7306 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7307 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7309 for (i = 0; i < opr_sz; ++i) {
7310 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7314 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7316 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7317 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7319 for (i = 0; i < opr_sz; ++i) {
7320 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
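/*
 * Summary of the lanewise operations implemented above:
 *   EOR3:  d = n ^ m ^ k
 *   BCAX:  d = n ^ (m & ~k)
 *   BSL1N: d = (~n & k) | (m & ~k)
 *   BSL2N: d = (n & k) | (~m & ~k)
 *   NBSL:  d = ~((n & k) | (m & ~k))
 * The last three are the plain bitwise select (n & k) | (m & ~k) with
 * the first operand, the second operand, or the entire result
 * inverted, respectively.
 */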
7325 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
7326 * See hasless(v,1) from
7327 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
7329 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
7331 int bits = 8 << esz;
7332 uint64_t ones = dup_const(esz, 1);
7333 uint64_t signs = ones << (bits - 1);
7334 uint64_t cmp0, cmp1;
7336 cmp1 = dup_const(esz, n);
7337 cmp0 = cmp1 ^ m0;
7338 cmp1 = cmp1 ^ m1;
7339 cmp0 = (cmp0 - ones) & ~cmp0;
7340 cmp1 = (cmp1 - ones) & ~cmp1;
7341 return (cmp0 | cmp1) & signs;
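/*
 * Worked example for esz == MO_8: ones == 0x0101...01 and signs ==
 * 0x8080...80.  A byte of cmp0/cmp1 is zero exactly when the
 * corresponding byte of m0/m1 equals the low byte of n, and
 * (cmp - ones) & ~cmp sets the sign bit of any zero byte.  Borrows
 * out of a zero byte may set further sign bits above it, which is
 * harmless here because only a true/false answer is required.
 */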
7344 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7345 uint32_t desc, int esz, bool nmatch)
7347 uint16_t esz_mask = pred_esz_masks[esz];
7348 intptr_t opr_sz = simd_oprsz(desc);
7349 uint32_t flags = PREDTEST_INIT;
7350 intptr_t i, j, k;
7352 for (i = 0; i < opr_sz; i += 16) {
7353 uint64_t m0 = *(uint64_t *)(vm + i);
7354 uint64_t m1 = *(uint64_t *)(vm + i + 8);
7355 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7356 uint16_t out = 0;
7358 for (j = 0; j < 16; j += 8) {
7359 uint64_t n = *(uint64_t *)(vn + i + j);
7361 for (k = 0; k < 8; k += 1 << esz) {
7362 if (pg & (1 << (j + k))) {
7363 bool o = do_match2(n >> (k * 8), m0, m1, esz);
7364 out |= (o ^ nmatch) << (j + k);
7368 *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7369 flags = iter_predtest_fwd(out, pg, flags);
7371 return flags;
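/*
 * For MATCH, each destination predicate bit is set when the active
 * element of vn occurs anywhere in the corresponding 128-bit segment
 * of vm (m0/m1 above); NMATCH inverts the per-element result.  The
 * NZCV flags are accumulated with iter_predtest_fwd just as for the
 * other flag-setting predicate operations.
 */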
7374 #define DO_PPZZ_MATCH(NAME, ESZ, INV) \
7375 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
7377 return do_match(vd, vn, vm, vg, desc, ESZ, INV); \
7380 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
7381 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
7383 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
7384 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
7386 #undef DO_PPZZ_MATCH
7388 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
7389 uint32_t desc)
7391 ARMVectorReg scratch;
7392 intptr_t i, j;
7393 intptr_t opr_sz = simd_oprsz(desc);
7394 uint32_t *d = vd, *n = vn, *m = vm;
7395 uint8_t *pg = vg;
7397 if (d == n) {
7398 n = memcpy(&scratch, n, opr_sz);
7399 if (d == m) {
7400 m = n;
7402 } else if (d == m) {
7403 m = memcpy(&scratch, m, opr_sz);
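/*
 * The scratch copies above guard in-place operation: d[] is written
 * as the outer loop advances while the inner loop re-reads earlier
 * elements of m[], so inputs that alias the destination are copied
 * up front.
 */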
7406 for (i = 0; i < opr_sz; i += 4) {
7407 uint64_t count = 0;
7408 uint8_t pred;
7410 pred = pg[H1(i >> 3)] >> (i & 7);
7411 if (pred & 1) {
7412 uint32_t nn = n[H4(i >> 2)];
7414 for (j = 0; j <= i; j += 4) {
7415 pred = pg[H1(j >> 3)] >> (j & 7);
7416 if ((pred & 1) && nn == m[H4(j >> 2)]) {
7417 ++count;
7421 d[H4(i >> 2)] = count;
7425 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
7426 uint32_t desc)
7428 ARMVectorReg scratch;
7429 intptr_t i, j;
7430 intptr_t opr_sz = simd_oprsz(desc);
7431 uint64_t *d = vd, *n = vn, *m = vm;
7432 uint8_t *pg = vg;
7434 if (d == n) {
7435 n = memcpy(&scratch, n, opr_sz);
7436 if (d == m) {
7437 m = n;
7439 } else if (d == m) {
7440 m = memcpy(&scratch, m, opr_sz);
7443 for (i = 0; i < opr_sz / 8; ++i) {
7444 uint64_t count = 0;
7445 if (pg[H1(i)] & 1) {
7446 uint64_t nn = n[i];
7447 for (j = 0; j <= i; ++j) {
7448 if ((pg[H1(j)] & 1) && nn == m[j]) {
7449 ++count;
7453 d[i] = count;
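/*
 * Worked example, all elements active: with n == m == { 1, 2, 1, 1 }
 * the loops above yield d == { 1, 1, 2, 3 } -- each result counts the
 * elements of m at index <= i that compare equal to n[i].
 */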
7458 * Returns the number of bytes in m0 and m1 that match n.
7459 * Unlike do_match2 we don't just need true/false, we need an exact count.
7460 * This requires two extra logical operations.
7462 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
7464 const uint64_t mask = dup_const(MO_8, 0x7f);
7465 uint64_t cmp0, cmp1;
7467 cmp1 = dup_const(MO_8, n);
7468 cmp0 = cmp1 ^ m0;
7469 cmp1 = cmp1 ^ m1;
7472 * 1: clear msb of each byte to avoid carry to next byte (& mask)
7473 * 2: carry in to msb if byte != 0 (+ mask)
7474 * 3: set msb if cmp has msb set (| cmp)
7475 * 4: set ~msb to ignore them (| mask)
7476 * We now have 0xff for byte != 0 or 0x7f for byte == 0.
7477 * 5: invert, resulting in 0x80 if and only if byte == 0.
7479 cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
7480 cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
7483 * Combine the two compares in a way that the bits do
7484 * not overlap, and so preserves the count of set bits.
7485 * If the host has an efficient instruction for ctpop,
7486 * then ctpop(x) + ctpop(y) has the same number of
7487 * operations as ctpop(x | (y >> 1)). If the host does
7488 * not have an efficient ctpop, then we only want to
7489 * use it once.
7491 return ctpop64(cmp0 | (cmp1 >> 1));
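/*
 * Per-byte trace of the steps above for a byte b of cmp: steps 1-4
 * give 0xff when b != 0 and 0x7f when b == 0 (masking bit 7 before
 * the add keeps carries from crossing into the next byte); the final
 * inversion leaves exactly 0x80 for each matching byte.  Shifting
 * cmp1 right by 1 moves its markers to bit 6 of each byte, so they
 * cannot collide with cmp0's bit 7 and one ctpop64 counts both.
 *
 * For reference only, a naive scalar equivalent (kept under #if 0,
 * never built) would be:
 */
#if 0
static uint64_t do_histseg_cnt_ref(uint8_t n, uint64_t m0, uint64_t m1)
{
    uint64_t count = 0;
    for (int i = 0; i < 64; i += 8) {
        count += ((m0 >> i) & 0xff) == n;
        count += ((m1 >> i) & 0xff) == n;
    }
    return count;
}
#endif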
7494 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
7496 intptr_t i, j;
7497 intptr_t opr_sz = simd_oprsz(desc);
7499 for (i = 0; i < opr_sz; i += 16) {
7500 uint64_t n0 = *(uint64_t *)(vn + i);
7501 uint64_t m0 = *(uint64_t *)(vm + i);
7502 uint64_t n1 = *(uint64_t *)(vn + i + 8);
7503 uint64_t m1 = *(uint64_t *)(vm + i + 8);
7504 uint64_t out0 = 0;
7505 uint64_t out1 = 0;
7507 for (j = 0; j < 64; j += 8) {
7508 uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
7509 uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
7510 out0 |= cnt0 << j;
7511 out1 |= cnt1 << j;
7514 *(uint64_t *)(vd + i) = out0;
7515 *(uint64_t *)(vd + i + 8) = out1;
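/*
 * Each byte of out0/out1 holds a count in the range [0, 16] (8 bytes
 * of m0 plus 8 of m1), so OR-ing the shifted counts together cannot
 * disturb a neighbouring element.
 */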
7519 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7521 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7522 int shr = simd_data(desc);
7523 int shl = 8 - shr;
7524 uint64_t mask = dup_const(MO_8, 0xff >> shr);
7525 uint64_t *d = vd, *n = vn, *m = vm;
7527 for (i = 0; i < opr_sz; ++i) {
7528 uint64_t t = n[i] ^ m[i];
7529 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
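/*
 * The rotate is performed on whole 64-bit words: "mask" keeps, within
 * each 8-bit lane, only the bits that legitimately arrived from the
 * right shift, while the left-shifted term supplies the wrapped-around
 * top bits and ~mask discards anything that leaked across a lane
 * boundary.  sve2_xar_h below uses the same scheme with 16-bit lanes;
 * the 32-bit form can simply use the host ror32.
 */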
7533 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7535 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7536 int shr = simd_data(desc);
7537 int shl = 16 - shr;
7538 uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7539 uint64_t *d = vd, *n = vn, *m = vm;
7541 for (i = 0; i < opr_sz; ++i) {
7542 uint64_t t = n[i] ^ m[i];
7543 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7547 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7549 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7550 int shr = simd_data(desc);
7551 uint32_t *d = vd, *n = vn, *m = vm;
7553 for (i = 0; i < opr_sz; ++i) {
7554 d[i] = ror32(n[i] ^ m[i], shr);
7558 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
7559 void *status, uint32_t desc)
7561 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
7563 for (s = 0; s < opr_sz; ++s) {
7564 float32 *n = vn + s * sizeof(float32) * 4;
7565 float32 *m = vm + s * sizeof(float32) * 4;
7566 float32 *a = va + s * sizeof(float32) * 4;
7567 float32 *d = vd + s * sizeof(float32) * 4;
7568 float32 n00 = n[H4(0)], n01 = n[H4(1)];
7569 float32 n10 = n[H4(2)], n11 = n[H4(3)];
7570 float32 m00 = m[H4(0)], m01 = m[H4(1)];
7571 float32 m10 = m[H4(2)], m11 = m[H4(3)];
7572 float32 p0, p1;
7574 /* i = 0, j = 0 */
7575 p0 = float32_mul(n00, m00, status);
7576 p1 = float32_mul(n01, m01, status);
7577 d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
7579 /* i = 0, j = 1 */
7580 p0 = float32_mul(n00, m10, status);
7581 p1 = float32_mul(n01, m11, status);
7582 d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
7584 /* i = 1, j = 0 */
7585 p0 = float32_mul(n10, m00, status);
7586 p1 = float32_mul(n11, m01, status);
7587 d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
7589 /* i = 1, j = 1 */
7590 p0 = float32_mul(n10, m10, status);
7591 p1 = float32_mul(n11, m11, status);
7592 d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
7596 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
7597 void *status, uint32_t desc)
7599 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
7601 for (s = 0; s < opr_sz; ++s) {
7602 float64 *n = vn + s * sizeof(float64) * 4;
7603 float64 *m = vm + s * sizeof(float64) * 4;
7604 float64 *a = va + s * sizeof(float64) * 4;
7605 float64 *d = vd + s * sizeof(float64) * 4;
7606 float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
7607 float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
7608 float64 p0, p1;
7610 /* i = 0, j = 0 */
7611 p0 = float64_mul(n00, m00, status);
7612 p1 = float64_mul(n01, m01, status);
7613 d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
7615 /* i = 0, j = 1 */
7616 p0 = float64_mul(n00, m10, status);
7617 p1 = float64_mul(n01, m11, status);
7618 d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
7620 /* i = 1, j = 0 */
7621 p0 = float64_mul(n10, m00, status);
7622 p1 = float64_mul(n11, m01, status);
7623 d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
7625 /* i = 1, j = 1 */
7626 p0 = float64_mul(n10, m10, status);
7627 p1 = float64_mul(n11, m11, status);
7628 d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
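/*
 * Both FMMLA helpers compute, independently for each segment (16
 * bytes of float32 or 32 bytes of float64, holding a row-major 2x2
 * tile), the update
 *     d[i][j] = a[i][j] + n[i][0] * m[j][0] + n[i][1] * m[j][1]
 * i.e. D = A + N * M^T, with each multiply and add performed as a
 * separate (non-fused) softfloat operation in the order shown.
 */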
7632 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
7633 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
7635 intptr_t i = simd_oprsz(desc); \
7636 uint64_t *g = vg; \
7637 do { \
7638 uint64_t pg = g[(i - 1) >> 6]; \
7639 do { \
7640 i -= sizeof(TYPEW); \
7641 if (likely((pg >> (i & 63)) & 1)) { \
7642 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
7643 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \
7645 } while (i & 63); \
7646 } while (i != 0); \
7649 DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
7650 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
7651 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
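/*
 * The "+ sizeof(TYPEN)" in DO_FCVTNT places each narrowed result in
 * the upper half of its wide element slot, leaving the lower-half
 * narrow elements of vd untouched; DO_FCVTLT below is the converse,
 * reading the upper-half narrow element of each wide slot and
 * widening it into place.  The HW/HN macros keep this addressing
 * correct on big-endian hosts.
 */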
7653 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \
7654 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
7656 intptr_t i = simd_oprsz(desc); \
7657 uint64_t *g = vg; \
7658 do { \
7659 uint64_t pg = g[(i - 1) >> 6]; \
7660 do { \
7661 i -= sizeof(TYPEW); \
7662 if (likely((pg >> (i & 63)) & 1)) { \
7663 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \
7664 *(TYPEW *)(vd + HW(i)) = OP(nn, status); \
7666 } while (i & 63); \
7667 } while (i != 0); \
7670 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
7671 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
7673 #undef DO_FCVTLT
7674 #undef DO_FCVTNT