bitops.h: Provide hswap32(), hswap64(), wswap64() swapping operations
[qemu/ar7.git] / target/arm/sve_helper.c
/*
 * ARM SVE Operations
 *
 * Copyright (c) 2018 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "internals.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "tcg/tcg.h"
#include "vec_internal.h"


/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set.  Compare the definitions of these variables
 * within CPUARMState.
 */

/* For no G bits set, NZCV = C.  */
#define PREDTEST_INIT  1

/* This is an iterative function, called for each Pd and Pg word
 * moving forward.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from last !(D & G).  Replace previous.  */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}

/* This is an iterative function, called for each Pd and Pg word
 * moving backward.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from first (i.e last) !(D & G).
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from last (i.e first) D & G.  Replace previous.  */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}

/* The same for a single word predicate.  */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}

/* The same for a multi-word predicate.  */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    uintptr_t i = 0;

    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}
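/*
 * Illustrative example: for a single predicate word with d == 1 and
 * g == 1, the only governed element is active and true, so
 * iter_predtest_fwd(1, 1, PREDTEST_INIT) returns 0x80000006: bit 31 (N)
 * set because the first active element is true, bit 1 set (Z clear)
 * because some active element is true, and bit 0 clear (C clear)
 * because the last active element is true; bit 2 is only the internal
 * "first G bit seen" marker.
 */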
/*
 * Expand active predicate bits to bytes, for byte elements.
 * (The data table itself is in vec_helper.c as MVE also needs it.)
 */
static inline uint64_t expand_pred_b(uint8_t byte)
{
    return expand_pred_b_data[byte];
}

/* Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
static inline uint64_t expand_pred_h(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
        [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
        [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
        [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
        [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
        [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
        [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
        [0x55] = 0xffffffffffffffff,
    };
    return word[byte & 0x55];
}

/* Similarly for single word elements.  */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x00000000ffffffffull,
        [0x10] = 0xffffffff00000000ull,
        [0x11] = 0xffffffffffffffffull,
    };
    return word[byte & 0x11];
}
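/*
 * Predicate bits are allocated one per byte of vector data, so for the
 * wider element sizes only every second (or fourth) bit of each
 * predicate byte is significant.  For example, expand_pred_h(0x05)
 * is 0x00000000ffffffff, selecting the two low halfwords of a 64-bit
 * chunk, and expand_pred_s(0x10) is 0xffffffff00000000, selecting the
 * high word.
 */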
#define LOGICAL_PPPP(NAME, FUNC) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    uintptr_t opr_sz = simd_oprsz(desc); \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
    uintptr_t i; \
    for (i = 0; i < opr_sz / 8; ++i) { \
        d[i] = FUNC(n[i], m[i], g[i]); \
    } \
}

#define DO_AND(N, M, G)  (((N) & (M)) & (G))
#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP

/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 */
/* ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                TYPE mm = *(TYPE *)(vm + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}
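/*
 * Note on the loop structure: the governing predicate has one bit per
 * byte of vector data, so the 16-bit predicate chunk loaded above covers
 * 16 bytes of data.  Stepping with "i += sizeof(TYPE), pg >>= sizeof(TYPE)"
 * moves to the predicate bit for the next element, and "while (i & 15)"
 * closes each 16-byte block before the next predicate chunk is loaded.
 */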
/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn, *m = vm; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i], mm = m[i]; \
            d[i] = OP(nn, mm); \
        } \
    } \
}

#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)

/*
 * We must avoid the C undefined behaviour cases: division by
 * zero and signed division of INT_MIN by -1.  Both of these
 * have architecturally defined required results for Arm.
 * We special case all signed divisions by -1 to avoid having
 * to deduce the minimum integer for the type involved.
 */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
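/*
 * For example, DO_SDIV(x, 0) and DO_UDIV(x, 0) yield 0, and
 * DO_SDIV(INT32_MIN, -1) relies on wrapping negation to yield INT32_MIN,
 * matching the results the Arm architecture requires for SDIV and UDIV.
 */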
DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)

/* Because the computation type is at least twice as large as required,
   these work for both signed and unsigned source types.  */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    return (n * m) >> 8;
}

static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    return (n * m) >> 16;
}

static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    return (n * m) >> 32;
}

static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}
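/*
 * Illustrative check: when instantiated for unsigned inputs the uint8_t
 * arguments of do_mulh_b are zero-extended into int32_t, so 0x80 * 0x80
 * = 0x4000 returns the high byte 0x40; for signed inputs (-128) * (-128)
 * = 16384 returns the same 0x40.  The wider intermediate always holds
 * the exact product, so one helper serves both signednesses.
 */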
DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)

/* Note that all bits of the shift are significant
   and not modulo the element size.  */
#define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
#define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
#define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)
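/*
 * For example, DO_ASR of an int8_t element by 10 is clamped to a shift
 * by 7 and so yields 0 or -1 according to the sign, while DO_LSR and
 * DO_LSL of an 8-bit element by 8 or more yield 0.
 */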
DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)

DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)

static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
{
    int8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
{
    int16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
{
    int32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}

DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)

static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
{
    uint8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
{
    uint16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
{
    uint32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}

DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)

#define do_srshl_b(n, m)  do_sqrshl_bhs(n, m, 8, true, NULL)
#define do_srshl_h(n, m)  do_sqrshl_bhs(n, m, 16, true, NULL)
#define do_srshl_s(n, m)  do_sqrshl_bhs(n, m, 32, true, NULL)
#define do_srshl_d(n, m)  do_sqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)

#define do_urshl_b(n, m)  do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
#define do_urshl_h(n, m)  do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
#define do_urshl_s(n, m)  do_uqrshl_bhs(n, m, 32, true, NULL)
#define do_urshl_d(n, m)  do_uqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)

/*
 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
 * We pass in a pointer to a dummy saturation field to trigger
 * the saturating arithmetic but discard the information about
 * whether it has occurred.
 */
#define do_sqshl_b(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
#define do_sqshl_h(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
#define do_sqshl_s(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
#define do_sqshl_d(n, m) \
    ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })

DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)

#define do_uqshl_b(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
#define do_uqshl_h(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
#define do_uqshl_s(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
#define do_uqshl_d(n, m) \
    ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })

DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)

#define do_sqrshl_b(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
#define do_sqrshl_h(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
#define do_sqrshl_s(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
#define do_sqrshl_d(n, m) \
    ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })

DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)

#undef do_sqrshl_d

#define do_uqrshl_b(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
#define do_uqrshl_h(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
#define do_uqrshl_s(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
#define do_uqrshl_d(n, m) \
    ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })

DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)

#undef do_uqrshl_d

#define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
#define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))
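/*
 * DO_HADD_D computes (n + m) >> 1 without needing a type wider than
 * 64 bits: the per-operand shifts drop the two low bits, whose
 * contribution to the average is restored by the carry term (n & m & 1).
 * DO_RHADD_D below likewise uses ((n | m) & 1) to fold in the rounding
 * increment.
 */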
DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)

DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)

#define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
#define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))

DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)

DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)

#define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
#define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))

DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)

DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)

static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
{
    return val >= max ? max : val <= min ? min : val;
}

#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)

static inline int64_t do_sqadd_d(int64_t n, int64_t m)
{
    int64_t r = n + m;
    if (((r ^ n) & ~(n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}
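/*
 * The test ((r ^ n) & ~(n ^ m)) < 0 is the usual two's-complement check:
 * overflow is only possible when the operands have the same sign
 * (~(n ^ m) has the sign bit set) and the result's sign differs from
 * theirs (r ^ n has the sign bit set); the saturated value then takes
 * the sign opposite to the overflowed result.
 */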
DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)

#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)

static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
{
    uint64_t r = n + m;
    return r < n ? UINT64_MAX : r;
}

DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)

#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)

static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    int64_t r = n - m;
    if (((r ^ n) & (n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}

DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)

#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)

static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
    return n > m ? n - m : 0;
}

DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)

#define DO_SUQADD_B(n, m) \
    do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SUQADD_H(n, m) \
    do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SUQADD_S(n, m) \
    do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)

static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /* Note that m - abs(n) cannot underflow.  */
        if (r > INT64_MAX) {
            /* Result is either very large positive or negative.  */
            if (m > -n) {
                /* m > abs(n), so r is a very large positive.  */
                return INT64_MAX;
            }
            /* Result is negative.  */
        }
    } else {
        /* Both inputs are positive: check for overflow.  */
        if (r < m || r > INT64_MAX) {
            return INT64_MAX;
        }
    }
    return r;
}

DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)

#define DO_USQADD_B(n, m) \
    do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
#define DO_USQADD_H(n, m) \
    do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
#define DO_USQADD_S(n, m) \
    do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)

static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t r = n + m;

    if (m < 0) {
        return n < -m ? 0 : r;
    }
    return r < n ? UINT64_MAX : r;
}

DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)

#undef DO_ZPZZ
#undef DO_ZPZZ_D
/*
 * Three operand expander, operating on element pairs.
 * If the slot I is even, the elements are from VN {I, I+1}.
 * If the slot I is odd, the elements are from VM {I-1, I}.
 * Load all of the input elements in each pair before overwriting output.
 */
#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            TYPE n0 = *(TYPE *)(vn + H(i)); \
            TYPE m0 = *(TYPE *)(vm + H(i)); \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
            if (pg & 1) { \
                *(TYPE *)(vd + H(i)) = OP(n0, n1); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
            if (pg & 1) { \
                *(TYPE *)(vd + H(i)) = OP(m0, m1); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn, *m = vm; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 2) { \
        TYPE n0 = n[i], n1 = n[i + 1]; \
        TYPE m0 = m[i], m1 = m[i + 1]; \
        if (pg[H1(i)] & 1) { \
            d[i] = OP(n0, n1); \
        } \
        if (pg[H1(i + 1)] & 1) { \
            d[i + 1] = OP(m0, m1); \
        } \
    } \
}

DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)

#undef DO_ZPZZ_PAIR
#undef DO_ZPZZ_PAIR_D
#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
                  void *status, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            TYPE n0 = *(TYPE *)(vn + H(i)); \
            TYPE m0 = *(TYPE *)(vm + H(i)); \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
            if (pg & 1) { \
                *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
            if (pg & 1) { \
                *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)

DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)

DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)

DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)

DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)

#undef DO_ZPZZ_PAIR_FP

/* Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide".  That is, for D = N op M, the same 64-bit
 * value of M is used with all of the narrower values of N.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
        TYPEW mm = *(TYPEW *)(vm + i); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 7); \
    } \
}

DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW
/* Fully general two-operand expander, controlled by a predicate.
 */
#define DO_ZPZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i]; \
            d[i] = OP(nn); \
        } \
    } \
}

#define DO_CLS_B(N)   (clrsb32(N) - 24)
#define DO_CLS_H(N)   (clrsb32(N) - 16)

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

#define DO_CLZ_B(N)   (clz32(N) - 24)
#define DO_CLZ_H(N)   (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)

#define DO_CNOT(N)    (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

#define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

#define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

#define DO_NOT(N)    (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

#define DO_SXTB(N)    ((int8_t)N)
#define DO_SXTH(N)    ((int16_t)N)
#define DO_SXTS(N)    ((int32_t)N)
#define DO_UXTB(N)    ((uint8_t)N)
#define DO_UXTH(N)    ((uint16_t)N)
#define DO_UXTS(N)    ((uint32_t)N)

DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)

#define DO_ABS(N)    (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

#define DO_NEG(N)    (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)

DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)

DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)

DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
#define DO_SQABS(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)

#define DO_SQNEG(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)

DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)

/* Three-operand expander, unpredicated, in which the third operand is "wide".
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        TYPEW mm = *(TYPEW *)(vm + i); \
        do { \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            i += sizeof(TYPE); \
        } while (i & 7); \
    } \
}

DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW

#undef DO_CLS_B
#undef DO_CLS_H
#undef DO_CLZ_B
#undef DO_CLZ_H
#undef DO_CNOT
#undef DO_FABS
#undef DO_FNEG
#undef DO_ABS
#undef DO_NEG
#undef DO_ZPZ
#undef DO_ZPZ_D
/*
 * Three-operand expander, unpredicated, in which the two inputs are
 * selected from the top or bottom half of the wide column.
 */
#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
    } \
}

DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

/* Note that the multiply cannot overflow, but the doubling can.  */
static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
{
    int16_t val = n * m;
    return DO_SQADD_H(val, val);
}

static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
{
    int32_t val = n * m;
    return DO_SQADD_S(val, val);
}

static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
{
    int64_t val = n * m;
    return do_sqadd_d(val, val);
}

DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)

#undef DO_ZZZ_TB

#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEW *)(vn + HW(i)); \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
    } \
}

DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

#undef DO_ZZZ_WTB

#define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
    intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
    for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
        TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
        TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
        *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
    } \
}

DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)

#undef DO_ZZZ_NTB
#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
        TYPEW aa = *(TYPEW *)(va + HW(i)); \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
    } \
}

DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

#define DO_NMUL(N, M)  -(N * M)

DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)

DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)

#undef DO_ZZZW_ACC

#define DO_XTNB(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
        TYPE nn = *(TYPE *)(vn + i); \
        nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
        *(TYPE *)(vd + i) = nn; \
    } \
}

#define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
        TYPE nn = *(TYPE *)(vn + i); \
        *(TYPEN *)(vd + i + odd) = OP(nn); \
    } \
}
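/*
 * DO_XTNB narrows each wide element in place: the saturated value lands
 * in the low half of the element and the high half is zeroed (the
 * "bottom" form).  DO_XTNT writes only the high-half (odd) narrow slot
 * of each wide element and leaves the rest of the destination unchanged
 * (the "top" form).
 */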
#define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
#define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
#define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)

DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)

DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)

#define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
#define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
#define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)

DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)

DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)

DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)

DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)

#undef DO_XTNB
#undef DO_XTNT

void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
    uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t *a = va, *n = vn;
    uint64_t *d = vd, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint32_t e1 = a[2 * i + H4(0)];
        uint32_t e2 = n[2 * i + sel] ^ inv;
        uint64_t c = extract64(m[i], 32, 1);
        /* Compute and store the entire 33-bit result at once.  */
        d[i] = c + e1 + e2;
    }
}

void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t *d = vd, *a = va, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        Int128 e1 = int128_make64(a[i]);
        Int128 e2 = int128_make64(n[i + sel] ^ inv);
        Int128 c = int128_make64(m[i + 1] & 1);
        Int128 r = int128_add(int128_add(e1, e2), c);
        d[i + 0] = int128_getlo(r);
        d[i + 1] = int128_gethi(r);
    }
}
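/*
 * For the 64-bit form each even/odd pair is computed with Int128
 * arithmetic so that the full 65-bit sum is available: the low 64 bits
 * go to the even destination element and the carry-out (0 or 1) to the
 * odd one, while the carry-in is taken from bit 0 of the odd element
 * of M (m[i + 1] & 1).
 */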
#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
        TYPEW aa = *(TYPEW *)(va + HW(i)); \
        *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
    } \
}

DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQADD_H)
DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQADD_S)
DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
           do_sqdmull_d, do_sqadd_d)

DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQSUB_H)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQSUB_S)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
           do_sqdmull_d, do_sqsub_d)

#undef DO_SQDMLAL

#define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
    int rot = simd_data(desc); \
    int sel_a = rot & 1, sel_b = sel_a ^ 1; \
    bool sub_r = rot == 1 || rot == 2; \
    bool sub_i = rot >= 2; \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    for (i = 0; i < opr_sz; i += 2) { \
        TYPE elt1_a = n[H(i + sel_a)]; \
        TYPE elt2_a = m[H(i + sel_a)]; \
        TYPE elt2_b = m[H(i + sel_b)]; \
        d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \
        d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \
    } \
}

#define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))

DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)

#define DO_SQRDMLAH_B(N, M, A, S) \
    do_sqrdmlah_b(N, M, A, S, true)
#define DO_SQRDMLAH_H(N, M, A, S) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
#define DO_SQRDMLAH_S(N, M, A, S) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
#define DO_SQRDMLAH_D(N, M, A, S) \
    do_sqrdmlah_d(N, M, A, S, true)

DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)

#define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \
    int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \
    int sel_a = rot & 1, sel_b = sel_a ^ 1; \
    bool sub_r = rot == 1 || rot == 2; \
    bool sub_i = rot >= 2; \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \
        TYPE elt2_a = m[H(i + idx + sel_a)]; \
        TYPE elt2_b = m[H(i + idx + sel_b)]; \
        for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \
            TYPE elt1_a = n[H(i + j + sel_a)]; \
            d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \
            d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \
        } \
    } \
}

DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)

DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)

#undef DO_CMLA
#undef DO_CMLA_FUNC
#undef DO_CMLA_IDX_FUNC
#undef DO_SQRDMLAH_B
#undef DO_SQRDMLAH_H
#undef DO_SQRDMLAH_S
#undef DO_SQRDMLAH_D
/* Note N and M are 4 elements bundled into one unit.  */
static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
                         int sel_a, int sel_b, int sub_i)
{
    for (int i = 0; i <= 1; i++) {
        int32_t elt1_r = (int8_t)(n >> (16 * i));
        int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
        int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
        int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));

        a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
    }
    return a;
}

static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
                         int sel_a, int sel_b, int sub_i)
{
    for (int i = 0; i <= 1; i++) {
        int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
        int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
        int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
        int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));

        a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
    }
    return a;
}

void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
                              void *va, uint32_t desc)
{
    int opr_sz = simd_oprsz(desc);
    int rot = simd_data(desc);
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint32_t *d = vd, *n = vn, *m = vm, *a = va;

    for (int e = 0; e < opr_sz / 4; e++) {
        d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
    }
}

void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
                              void *va, uint32_t desc)
{
    int opr_sz = simd_oprsz(desc);
    int rot = simd_data(desc);
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (int e = 0; e < opr_sz / 8; e++) {
        d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
    }
}

void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    int opr_sz = simd_oprsz(desc);
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
    int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint32_t *d = vd, *n = vn, *m = vm, *a = va;

    for (int seg = 0; seg < opr_sz / 4; seg += 4) {
        uint32_t seg_m = m[seg + idx];
        for (int e = 0; e < 4; e++) {
            d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
                                   sel_a, sel_b, sub_i);
        }
    }
}

void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    int seg, opr_sz = simd_oprsz(desc);
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
    int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (seg = 0; seg < opr_sz / 8; seg += 2) {
        uint64_t seg_m = m[seg + idx];
        for (int e = 0; e < 2; e++) {
            d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
                                   sel_a, sel_b, sub_i);
        }
    }
}
#define DO_ZZXZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
    intptr_t i, j, idx = simd_data(desc); \
    TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[i]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = OP(n[i + j], mm, a[i + j]); \
        } \
    } \
}

#define DO_SQRDMLAH_H(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
#define DO_SQRDMLAH_S(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
#define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)

DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)

#define DO_SQRDMLSH_H(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
#define DO_SQRDMLSH_S(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
#define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)

DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)

#undef DO_ZZXZ

#define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
    for (i = 0; i < oprsz; i += 16) { \
        TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
        for (j = 0; j < 16; j += sizeof(TYPEW)) { \
            TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
            TYPEW aa = *(TYPEW *)(va + HW(i + j)); \
            *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \
        } \
    } \
}

#define DO_MLA(N, M, A)  (A + N * M)

DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)

#define DO_MLS(N, M, A)  (A - N * M)

DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)

#define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
#define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))

DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)

#define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
#define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))

DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZZXW

#define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
    for (i = 0; i < oprsz; i += 16) { \
        TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
        for (j = 0; j < 16; j += sizeof(TYPEW)) { \
            TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
            *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \
        } \
    } \
}

DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)

DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

#undef DO_ZZX
#define DO_BITPERM(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
        TYPE nn = *(TYPE *)(vn + i); \
        TYPE mm = *(TYPE *)(vm + i); \
        *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
    } \
}

static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
{
    uint64_t res = 0;
    int db, rb = 0;

    for (db = 0; db < n; ++db) {
        if ((mask >> db) & 1) {
            res |= ((data >> db) & 1) << rb;
            ++rb;
        }
    }
    return res;
}
1631 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1632 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1633 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1634 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1636 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1638 uint64_t res = 0;
1639 int rb, db = 0;
1641 for (rb = 0; rb < n; ++rb) {
1642 if ((mask >> rb) & 1) {
1643 res |= ((data >> db) & 1) << rb;
1644 ++db;
1647 return res;
1650 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1651 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1652 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1653 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1655 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1657 uint64_t resm = 0, resu = 0;
1658 int db, rbm = 0, rbu = 0;
1660 for (db = 0; db < n; ++db) {
1661 uint64_t val = (data >> db) & 1;
1662 if ((mask >> db) & 1) {
1663 resm |= val << rbm++;
1664 } else {
1665 resu |= val << rbu++;
1669 return resm | (resu << rbm);
1672 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1673 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1674 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1675 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
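/*
 * As an illustration of the three bit-permute primitives on one byte,
 * with data = 0b11011010 and mask = 0b00111100 (values chosen only for
 * this example):
 *   bitextract(data, mask, 8) = 0b00000110  (data bits 2..5 packed low)
 *   bitdeposit(data, mask, 8) = 0b00101000  (low data bits scattered to
 *                                            mask bits 2..5)
 *   bitgroup(data, mask, 8)   = 0b11100110  (selected bits packed low,
 *                                            the rest packed above them)
 */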
1677 #undef DO_BITPERM
1679 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \
1680 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1682 intptr_t i, opr_sz = simd_oprsz(desc); \
1683 int sub_r = simd_data(desc); \
1684 if (sub_r) { \
1685 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1686 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1687 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1688 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1689 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1690 acc_r = ADD_OP(acc_r, el2_i); \
1691 acc_i = SUB_OP(acc_i, el2_r); \
1692 *(TYPE *)(vd + H(i)) = acc_r; \
1693 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1695 } else { \
1696 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1697 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1698 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1699 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1700 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1701 acc_r = SUB_OP(acc_r, el2_i); \
1702 acc_i = ADD_OP(acc_i, el2_r); \
1703 *(TYPE *)(vd + H(i)) = acc_r; \
1704 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1709 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1710 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1711 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1712 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1714 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1715 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1716 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1717 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1719 #undef DO_CADD
1721 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1722 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1724 intptr_t i, opr_sz = simd_oprsz(desc); \
1725 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
1726 int shift = simd_data(desc) >> 1; \
1727 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1728 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
1729 *(TYPEW *)(vd + HW(i)) = nn << shift; \
1733 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1734 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1735 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1737 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1738 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1739 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1741 #undef DO_ZZI_SHLL
1743 /* Two-operand reduction expander, controlled by a predicate.
1744 * The difference between TYPERED and TYPERET has to do with
1745 * sign-extension. E.g. for SMAX, TYPERED must be signed,
1746 * but TYPERET must be unsigned so that e.g. a 32-bit value
1747 * is not sign-extended to the ABI uint64_t return type. */
1749 /* ??? If we were to vectorize this by hand the reduction ordering
1750 * would change. For integer operands, this is perfectly fine. */
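/*
 * For example, SMAXV on 32-bit elements reduces with TYPERED = int32_t so
 * that the comparisons are signed, but returns through TYPERET = uint32_t:
 * a result of -1 leaves the helper as (uint64_t)(uint32_t)-1 =
 * 0x00000000ffffffff rather than the sign-extended 0xffffffffffffffff.
 */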
1752 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1753 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1755 intptr_t i, opr_sz = simd_oprsz(desc); \
1756 TYPERED ret = INIT; \
1757 for (i = 0; i < opr_sz; ) { \
1758 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1759 do { \
1760 if (pg & 1) { \
1761 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
1762 ret = OP(ret, nn); \
1764 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
1765 } while (i & 15); \
1767 return (TYPERET)ret; \
1770 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
1771 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1773 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1774 TYPEE *n = vn; \
1775 uint8_t *pg = vg; \
1776 TYPER ret = INIT; \
1777 for (i = 0; i < opr_sz; i += 1) { \
1778 if (pg[H1(i)] & 1) { \
1779 TYPEE nn = n[i]; \
1780 ret = OP(ret, nn); \
1783 return ret; \
1786 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1787 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1788 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1789 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1791 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1792 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1793 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1794 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1796 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1797 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1798 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1799 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1801 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1802 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1803 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1805 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1806 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1807 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1808 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1810 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1811 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1812 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1813 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1815 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1816 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1817 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1818 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1820 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1821 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1822 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1823 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1825 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1826 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1827 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1828 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1830 #undef DO_VPZ
1831 #undef DO_VPZ_D
1833 /* Two vector operand, one scalar operand, unpredicated. */
1834 #define DO_ZZI(NAME, TYPE, OP) \
1835 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
1837 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1838 TYPE s = s64, *d = vd, *n = vn; \
1839 for (i = 0; i < opr_sz; ++i) { \
1840 d[i] = OP(n[i], s); \
1844 #define DO_SUBR(X, Y) (Y - X)
1846 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1847 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1848 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1849 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1851 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1852 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1853 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1854 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1856 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1857 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1858 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1859 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1861 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1862 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1863 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1864 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1866 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1867 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1868 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1869 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1871 #undef DO_ZZI
1873 #undef DO_AND
1874 #undef DO_ORR
1875 #undef DO_EOR
1876 #undef DO_BIC
1877 #undef DO_ADD
1878 #undef DO_SUB
1879 #undef DO_MAX
1880 #undef DO_MIN
1881 #undef DO_ABD
1882 #undef DO_MUL
1883 #undef DO_DIV
1884 #undef DO_ASR
1885 #undef DO_LSR
1886 #undef DO_LSL
1887 #undef DO_SUBR
1889 /* Similar to the ARM LastActiveElement pseudocode function, except the
1890 result is multiplied by the element size. This includes the not found
1891 indication; e.g. not found for esz=3 is -8. */
1892 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1894 uint64_t mask = pred_esz_masks[esz];
1895 intptr_t i = words;
1897 do {
1898 uint64_t this_g = g[--i] & mask;
1899 if (this_g) {
1900 return i * 64 + (63 - clz64(this_g));
1902 } while (i > 0);
1903 return (intptr_t)-1 << esz;
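/*
 * PFIRST: set the first active element of Pd (the first element whose
 * governing bit in Pg is set), leave all other bits of Pd unchanged,
 * and return the PTEST flags for the updated predicate.
 */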
1906 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1908 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1909 uint32_t flags = PREDTEST_INIT;
1910 uint64_t *d = vd, *g = vg;
1911 intptr_t i = 0;
1913 do {
1914 uint64_t this_d = d[i];
1915 uint64_t this_g = g[i];
1917 if (this_g) {
1918 if (!(flags & 4)) {
1919 /* Set in D the first bit of G. */
1920 this_d |= this_g & -this_g;
1921 d[i] = this_d;
1923 flags = iter_predtest_fwd(this_d, this_g, flags);
1925 } while (++i < words);
1927 return flags;
1930 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1932 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1933 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1934 uint32_t flags = PREDTEST_INIT;
1935 uint64_t *d = vd, *g = vg, esz_mask;
1936 intptr_t i, next;
1938 next = last_active_element(vd, words, esz) + (1 << esz);
1939 esz_mask = pred_esz_masks[esz];
1941 /* Similar to the pseudocode for pnext, but scaled by ESZ
1942 so that we find the correct bit. */
1943 if (next < words * 64) {
1944 uint64_t mask = -1;
1946 if (next & 63) {
1947 mask = ~((1ull << (next & 63)) - 1);
1948 next &= -64;
1950 do {
1951 uint64_t this_g = g[next / 64] & esz_mask & mask;
1952 if (this_g != 0) {
1953 next = (next & -64) + ctz64(this_g);
1954 break;
1956 next += 64;
1957 mask = -1;
1958 } while (next < words * 64);
1961 i = 0;
1962 do {
1963 uint64_t this_d = 0;
1964 if (i == next / 64) {
1965 this_d = 1ull << (next & 63);
1967 d[i] = this_d;
1968 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1969 } while (++i < words);
1971 return flags;
1975 /* Copy Zn into Zd, and store zero into inactive elements.
1976 * If inv, store zeros into the active elements. */
1978 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1980 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1981 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1982 uint64_t *d = vd, *n = vn;
1983 uint8_t *pg = vg;
1985 for (i = 0; i < opr_sz; i += 1) {
1986 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1990 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1992 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1993 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1994 uint64_t *d = vd, *n = vn;
1995 uint8_t *pg = vg;
1997 for (i = 0; i < opr_sz; i += 1) {
1998 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
2002 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
2004 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2005 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2006 uint64_t *d = vd, *n = vn;
2007 uint8_t *pg = vg;
2009 for (i = 0; i < opr_sz; i += 1) {
2010 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
2014 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
2016 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2017 uint64_t *d = vd, *n = vn;
2018 uint8_t *pg = vg;
2019 uint8_t inv = simd_data(desc);
2021 for (i = 0; i < opr_sz; i += 1) {
2022 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2026 /* Three-operand expander, immediate operand, controlled by a predicate. */
2028 #define DO_ZPZI(NAME, TYPE, H, OP) \
2029 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2031 intptr_t i, opr_sz = simd_oprsz(desc); \
2032 TYPE imm = simd_data(desc); \
2033 for (i = 0; i < opr_sz; ) { \
2034 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2035 do { \
2036 if (pg & 1) { \
2037 TYPE nn = *(TYPE *)(vn + H(i)); \
2038 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
2040 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2041 } while (i & 15); \
2045 /* Similarly, specialized for 64-bit operands. */
2046 #define DO_ZPZI_D(NAME, TYPE, OP) \
2047 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2049 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2050 TYPE *d = vd, *n = vn; \
2051 TYPE imm = simd_data(desc); \
2052 uint8_t *pg = vg; \
2053 for (i = 0; i < opr_sz; i += 1) { \
2054 if (pg[H1(i)] & 1) { \
2055 TYPE nn = n[i]; \
2056 d[i] = OP(nn, imm); \
2061 #define DO_SHR(N, M) (N >> M)
2062 #define DO_SHL(N, M) (N << M)
2064 /* Arithmetic shift right for division. This rounds negative numbers
2065 toward zero as per signed division. Therefore before shifting,
2066 when N is negative, add 2**M-1. */
2067 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
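/*
 * Worked example: a plain arithmetic shift gives -7 >> 2 = -2 (rounding
 * toward minus infinity), whereas DO_ASRD(-7, 2) = (-7 + 3) >> 2 = -1,
 * matching the truncating signed division -7 / 4.
 */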
2069 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
2071 if (likely(sh < 64)) {
2072 return (x >> sh) + ((x >> (sh - 1)) & 1);
2073 } else if (sh == 64) {
2074 return x >> 63;
2075 } else {
2076 return 0;
2080 static inline int64_t do_srshr(int64_t x, unsigned sh)
2082 if (likely(sh < 64)) {
2083 return (x >> sh) + ((x >> (sh - 1)) & 1);
2084 } else {
2085 /* Rounding the sign bit always produces 0. */
2086 return 0;
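/*
 * Both helpers above compute a rounding ("round half up") right shift,
 * conceptually (x + (1 << (sh - 1))) >> sh, written in two terms so the
 * intermediate sum cannot overflow.  E.g. do_urshr(10, 2) is
 * (10 >> 2) + ((10 >> 1) & 1) = 2 + 1 = 3, i.e. 10/4 = 2.5 rounded up.
 */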
2090 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2091 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2092 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2093 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2095 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2096 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2097 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2098 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2100 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2101 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2102 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2103 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2105 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2106 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2107 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2108 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2110 /* SVE2 bitwise shift by immediate */
2111 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2112 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2113 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2114 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2116 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2117 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2118 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2119 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2121 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2122 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2123 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2124 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2126 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2127 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2128 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2129 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2131 #define do_suqrshl_b(n, m) \
2132 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2133 #define do_suqrshl_h(n, m) \
2134 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2135 #define do_suqrshl_s(n, m) \
2136 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2137 #define do_suqrshl_d(n, m) \
2138 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2140 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2141 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2142 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2143 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2145 #undef DO_ASRD
2146 #undef DO_ZPZI
2147 #undef DO_ZPZI_D
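/*
 * Narrowing shift expanders.  The "B" (bottom) forms write each narrowed
 * result to the low half of the wide destination element and zero the
 * high half; the "T" (top) forms write only the high half and leave the
 * low half of the destination unchanged.
 */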
2149 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2150 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2152 intptr_t i, opr_sz = simd_oprsz(desc); \
2153 int shift = simd_data(desc); \
2154 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2155 TYPEW nn = *(TYPEW *)(vn + i); \
2156 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \
2160 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
2161 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2163 intptr_t i, opr_sz = simd_oprsz(desc); \
2164 int shift = simd_data(desc); \
2165 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2166 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2167 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \
2171 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2172 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2173 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2175 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2176 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2177 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
2179 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2180 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2181 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2183 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2184 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2185 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2187 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2188 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2189 #define DO_SQSHRUN_D(x, sh) \
2190 do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2192 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2193 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2194 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2196 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2197 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2198 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2200 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2201 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2202 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2204 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2205 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2206 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2208 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2209 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2210 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2212 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2213 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2214 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2216 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2217 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2218 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2220 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2221 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2222 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2224 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2225 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2226 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2228 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2229 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2230 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2232 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2233 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2234 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2236 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2237 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2238 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2240 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2241 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2242 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2244 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2245 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2246 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2248 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2249 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2250 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2252 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2253 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2254 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2256 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2257 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2258 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2260 #undef DO_SHRNB
2261 #undef DO_SHRNT
2263 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \
2264 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2266 intptr_t i, opr_sz = simd_oprsz(desc); \
2267 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2268 TYPEW nn = *(TYPEW *)(vn + i); \
2269 TYPEW mm = *(TYPEW *)(vm + i); \
2270 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \
2274 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \
2275 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2277 intptr_t i, opr_sz = simd_oprsz(desc); \
2278 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2279 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2280 TYPEW mm = *(TYPEW *)(vm + HW(i)); \
2281 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \
2285 #define DO_ADDHN(N, M, SH) ((N + M) >> SH)
2286 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2287 #define DO_SUBHN(N, M, SH) ((N - M) >> SH)
2288 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
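/*
 * For example, narrowing 16-bit sums to their high byte:
 *   DO_ADDHN(0x1234, 0x0101, 8)  = 0x1335 >> 8          = 0x13
 *   DO_RADDHN(0x1234, 0x0101, 8) = (0x1335 + 0x80) >> 8 = 0x13
 *   DO_RADDHN(0x12b4, 0x0101, 8) = (0x13b5 + 0x80) >> 8 = 0x14
 * i.e. the "R" forms round to nearest rather than truncate.
 */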
2290 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2291 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2292 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2294 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2295 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2296 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2298 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2299 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2300 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2302 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2303 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2304 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2306 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2307 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2308 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2310 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2311 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2312 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2314 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2315 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2316 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2318 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2319 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2320 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2322 #undef DO_RSUBHN
2323 #undef DO_SUBHN
2324 #undef DO_RADDHN
2325 #undef DO_ADDHN
2327 #undef DO_BINOPNB
2329 /* Fully general four-operand expander, controlled by a predicate. */
2331 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
2332 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2333 void *vg, uint32_t desc) \
2335 intptr_t i, opr_sz = simd_oprsz(desc); \
2336 for (i = 0; i < opr_sz; ) { \
2337 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2338 do { \
2339 if (pg & 1) { \
2340 TYPE nn = *(TYPE *)(vn + H(i)); \
2341 TYPE mm = *(TYPE *)(vm + H(i)); \
2342 TYPE aa = *(TYPE *)(va + H(i)); \
2343 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
2345 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2346 } while (i & 15); \
2350 /* Similarly, specialized for 64-bit operands. */
2351 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
2352 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2353 void *vg, uint32_t desc) \
2355 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2356 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
2357 uint8_t *pg = vg; \
2358 for (i = 0; i < opr_sz; i += 1) { \
2359 if (pg[H1(i)] & 1) { \
2360 TYPE aa = a[i], nn = n[i], mm = m[i]; \
2361 d[i] = OP(aa, nn, mm); \
2366 #define DO_MLA(A, N, M) (A + N * M)
2367 #define DO_MLS(A, N, M) (A - N * M)
2369 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2370 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2372 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2373 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2375 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2376 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2378 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2379 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2381 #undef DO_MLA
2382 #undef DO_MLS
2383 #undef DO_ZPZZZ
2384 #undef DO_ZPZZZ_D
2386 void HELPER(sve_index_b)(void *vd, uint32_t start,
2387 uint32_t incr, uint32_t desc)
2389 intptr_t i, opr_sz = simd_oprsz(desc);
2390 uint8_t *d = vd;
2391 for (i = 0; i < opr_sz; i += 1) {
2392 d[H1(i)] = start + i * incr;
2396 void HELPER(sve_index_h)(void *vd, uint32_t start,
2397 uint32_t incr, uint32_t desc)
2399 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2400 uint16_t *d = vd;
2401 for (i = 0; i < opr_sz; i += 1) {
2402 d[H2(i)] = start + i * incr;
2406 void HELPER(sve_index_s)(void *vd, uint32_t start,
2407 uint32_t incr, uint32_t desc)
2409 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2410 uint32_t *d = vd;
2411 for (i = 0; i < opr_sz; i += 1) {
2412 d[H4(i)] = start + i * incr;
2416 void HELPER(sve_index_d)(void *vd, uint64_t start,
2417 uint64_t incr, uint32_t desc)
2419 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2420 uint64_t *d = vd;
2421 for (i = 0; i < opr_sz; i += 1) {
2422 d[i] = start + i * incr;
2426 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2428 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2429 uint32_t sh = simd_data(desc);
2430 uint32_t *d = vd, *n = vn, *m = vm;
2431 for (i = 0; i < opr_sz; i += 1) {
2432 d[i] = n[i] + (m[i] << sh);
2436 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2438 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2439 uint64_t sh = simd_data(desc);
2440 uint64_t *d = vd, *n = vn, *m = vm;
2441 for (i = 0; i < opr_sz; i += 1) {
2442 d[i] = n[i] + (m[i] << sh);
2446 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2448 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2449 uint64_t sh = simd_data(desc);
2450 uint64_t *d = vd, *n = vn, *m = vm;
2451 for (i = 0; i < opr_sz; i += 1) {
2452 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2456 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2458 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2459 uint64_t sh = simd_data(desc);
2460 uint64_t *d = vd, *n = vn, *m = vm;
2461 for (i = 0; i < opr_sz; i += 1) {
2462 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
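/*
 * FEXPA builds 2^(n/64) (2^(n/32) for half precision) from the bit
 * pattern of each input element: the low bits index a table holding the
 * fraction field of 2^(i/64), and the following bits are copied into the
 * exponent field of the result.  E.g. for single precision,
 * coeff[1] = 0x0164d2 is the 23-bit fraction of 2^(1/64) ~ 1.0109.
 */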
2466 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2468 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2469 static const uint16_t coeff[] = {
2470 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2471 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2472 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2473 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2475 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2476 uint16_t *d = vd, *n = vn;
2478 for (i = 0; i < opr_sz; i++) {
2479 uint16_t nn = n[i];
2480 intptr_t idx = extract32(nn, 0, 5);
2481 uint16_t exp = extract32(nn, 5, 5);
2482 d[i] = coeff[idx] | (exp << 10);
2486 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2488 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2489 static const uint32_t coeff[] = {
2490 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2491 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2492 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2493 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2494 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2495 0x1ef532, 0x20b051, 0x227043, 0x243516,
2496 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2497 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2498 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2499 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2500 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2501 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2502 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2503 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2504 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2505 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2507 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2508 uint32_t *d = vd, *n = vn;
2510 for (i = 0; i < opr_sz; i++) {
2511 uint32_t nn = n[i];
2512 intptr_t idx = extract32(nn, 0, 6);
2513 uint32_t exp = extract32(nn, 6, 8);
2514 d[i] = coeff[idx] | (exp << 23);
2518 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2520 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2521 static const uint64_t coeff[] = {
2522 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2523 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2524 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2525 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2526 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2527 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2528 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2529 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2530 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2531 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2532 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2533 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2534 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2535 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2536 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2537 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2538 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2539 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2540 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2541 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2542 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2543 0xFA7C1819E90D8ull,
2545 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2546 uint64_t *d = vd, *n = vn;
2548 for (i = 0; i < opr_sz; i++) {
2549 uint64_t nn = n[i];
2550 intptr_t idx = extract32(nn, 0, 6);
2551 uint64_t exp = extract32(nn, 6, 11);
2552 d[i] = coeff[idx] | (exp << 52);
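/*
 * FTSSEL: if bit 0 of the second source element is set, the first source
 * element is replaced with +1.0; if bit 1 is set, the sign of the result
 * is additionally inverted.
 */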
2556 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2558 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2559 uint16_t *d = vd, *n = vn, *m = vm;
2560 for (i = 0; i < opr_sz; i += 1) {
2561 uint16_t nn = n[i];
2562 uint16_t mm = m[i];
2563 if (mm & 1) {
2564 nn = float16_one;
2566 d[i] = nn ^ (mm & 2) << 14;
2570 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2572 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2573 uint32_t *d = vd, *n = vn, *m = vm;
2574 for (i = 0; i < opr_sz; i += 1) {
2575 uint32_t nn = n[i];
2576 uint32_t mm = m[i];
2577 if (mm & 1) {
2578 nn = float32_one;
2580 d[i] = nn ^ (mm & 2) << 30;
2584 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2586 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2587 uint64_t *d = vd, *n = vn, *m = vm;
2588 for (i = 0; i < opr_sz; i += 1) {
2589 uint64_t nn = n[i];
2590 uint64_t mm = m[i];
2591 if (mm & 1) {
2592 nn = float64_one;
2594 d[i] = nn ^ (mm & 2) << 62;
2599 /* Signed saturating addition with scalar operand. */
2602 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2604 intptr_t i, oprsz = simd_oprsz(desc);
2606 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2607 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2611 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2613 intptr_t i, oprsz = simd_oprsz(desc);
2615 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2616 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2620 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2622 intptr_t i, oprsz = simd_oprsz(desc);
2624 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2625 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2629 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2631 intptr_t i, oprsz = simd_oprsz(desc);
2633 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2634 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2639 /* Unsigned saturating addition with scalar operand. */
2642 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2644 intptr_t i, oprsz = simd_oprsz(desc);
2646 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2647 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2651 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2653 intptr_t i, oprsz = simd_oprsz(desc);
2655 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2656 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2660 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2662 intptr_t i, oprsz = simd_oprsz(desc);
2664 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2665 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2669 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2671 intptr_t i, oprsz = simd_oprsz(desc);
2673 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2674 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2678 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2680 intptr_t i, oprsz = simd_oprsz(desc);
2682 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2683 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2687 /* Two operand predicated copy immediate with merge. All valid immediates
2688 * can fit within 17 signed bits in the simd_data field. */
2690 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2691 uint64_t mm, uint32_t desc)
2693 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2694 uint64_t *d = vd, *n = vn;
2695 uint8_t *pg = vg;
2697 mm = dup_const(MO_8, mm);
2698 for (i = 0; i < opr_sz; i += 1) {
2699 uint64_t nn = n[i];
2700 uint64_t pp = expand_pred_b(pg[H1(i)]);
2701 d[i] = (mm & pp) | (nn & ~pp);
2705 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2706 uint64_t mm, uint32_t desc)
2708 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2709 uint64_t *d = vd, *n = vn;
2710 uint8_t *pg = vg;
2712 mm = dup_const(MO_16, mm);
2713 for (i = 0; i < opr_sz; i += 1) {
2714 uint64_t nn = n[i];
2715 uint64_t pp = expand_pred_h(pg[H1(i)]);
2716 d[i] = (mm & pp) | (nn & ~pp);
2720 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2721 uint64_t mm, uint32_t desc)
2723 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2724 uint64_t *d = vd, *n = vn;
2725 uint8_t *pg = vg;
2727 mm = dup_const(MO_32, mm);
2728 for (i = 0; i < opr_sz; i += 1) {
2729 uint64_t nn = n[i];
2730 uint64_t pp = expand_pred_s(pg[H1(i)]);
2731 d[i] = (mm & pp) | (nn & ~pp);
2735 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2736 uint64_t mm, uint32_t desc)
2738 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2739 uint64_t *d = vd, *n = vn;
2740 uint8_t *pg = vg;
2742 for (i = 0; i < opr_sz; i += 1) {
2743 uint64_t nn = n[i];
2744 d[i] = (pg[H1(i)] & 1 ? mm : nn);
2748 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2750 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2751 uint64_t *d = vd;
2752 uint8_t *pg = vg;
2754 val = dup_const(MO_8, val);
2755 for (i = 0; i < opr_sz; i += 1) {
2756 d[i] = val & expand_pred_b(pg[H1(i)]);
2760 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2762 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2763 uint64_t *d = vd;
2764 uint8_t *pg = vg;
2766 val = dup_const(MO_16, val);
2767 for (i = 0; i < opr_sz; i += 1) {
2768 d[i] = val & expand_pred_h(pg[H1(i)]);
2772 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2774 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2775 uint64_t *d = vd;
2776 uint8_t *pg = vg;
2778 val = dup_const(MO_32, val);
2779 for (i = 0; i < opr_sz; i += 1) {
2780 d[i] = val & expand_pred_s(pg[H1(i)]);
2784 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2786 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2787 uint64_t *d = vd;
2788 uint8_t *pg = vg;
2790 for (i = 0; i < opr_sz; i += 1) {
2791 d[i] = (pg[H1(i)] & 1 ? val : 0);
2795 /* Big-endian hosts need to frob the byte indices. If the copy
2796 * happens to be 8-byte aligned, then no frobbing necessary. */
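/*
 * The switch below copies in the largest unit (8, 4, 2 or 1 bytes) for
 * which destination, source and length are all aligned, applying the
 * matching H1_4()/H1_2()/H1() byte-index adjustment; on little-endian
 * hosts o is forced to 0 and this reduces to a plain memmove().
 */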
2798 static void swap_memmove(void *vd, void *vs, size_t n)
2800 uintptr_t d = (uintptr_t)vd;
2801 uintptr_t s = (uintptr_t)vs;
2802 uintptr_t o = (d | s | n) & 7;
2803 size_t i;
2805 #ifndef HOST_WORDS_BIGENDIAN
2806 o = 0;
2807 #endif
2808 switch (o) {
2809 case 0:
2810 memmove(vd, vs, n);
2811 break;
2813 case 4:
2814 if (d < s || d >= s + n) {
2815 for (i = 0; i < n; i += 4) {
2816 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2818 } else {
2819 for (i = n; i > 0; ) {
2820 i -= 4;
2821 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2824 break;
2826 case 2:
2827 case 6:
2828 if (d < s || d >= s + n) {
2829 for (i = 0; i < n; i += 2) {
2830 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2832 } else {
2833 for (i = n; i > 0; ) {
2834 i -= 2;
2835 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2838 break;
2840 default:
2841 if (d < s || d >= s + n) {
2842 for (i = 0; i < n; i++) {
2843 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2845 } else {
2846 for (i = n; i > 0; ) {
2847 i -= 1;
2848 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2851 break;
2855 /* Similarly for memset of 0. */
2856 static void swap_memzero(void *vd, size_t n)
2858 uintptr_t d = (uintptr_t)vd;
2859 uintptr_t o = (d | n) & 7;
2860 size_t i;
2862 /* Usually, the first bit of a predicate is set, so N is 0. */
2863 if (likely(n == 0)) {
2864 return;
2867 #ifndef HOST_WORDS_BIGENDIAN
2868 o = 0;
2869 #endif
2870 switch (o) {
2871 case 0:
2872 memset(vd, 0, n);
2873 break;
2875 case 4:
2876 for (i = 0; i < n; i += 4) {
2877 *(uint32_t *)H1_4(d + i) = 0;
2879 break;
2881 case 2:
2882 case 6:
2883 for (i = 0; i < n; i += 2) {
2884 *(uint16_t *)H1_2(d + i) = 0;
2886 break;
2888 default:
2889 for (i = 0; i < n; i++) {
2890 *(uint8_t *)H1(d + i) = 0;
2892 break;
2896 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2898 intptr_t opr_sz = simd_oprsz(desc);
2899 size_t n_ofs = simd_data(desc);
2900 size_t n_siz = opr_sz - n_ofs;
2902 if (vd != vm) {
2903 swap_memmove(vd, vn + n_ofs, n_siz);
2904 swap_memmove(vd + n_siz, vm, n_ofs);
2905 } else if (vd != vn) {
2906 swap_memmove(vd + n_siz, vd, n_ofs);
2907 swap_memmove(vd, vn + n_ofs, n_siz);
2908 } else {
2909 /* vd == vn == vm. Need temp space. */
2910 ARMVectorReg tmp;
2911 swap_memmove(&tmp, vm, n_ofs);
2912 swap_memmove(vd, vd + n_ofs, n_siz);
2913 memcpy(vd + n_siz, &tmp, n_ofs);
2917 #define DO_INSR(NAME, TYPE, H) \
2918 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2920 intptr_t opr_sz = simd_oprsz(desc); \
2921 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
2922 *(TYPE *)(vd + H(0)) = val; \
2925 DO_INSR(sve_insr_b, uint8_t, H1)
2926 DO_INSR(sve_insr_h, uint16_t, H1_2)
2927 DO_INSR(sve_insr_s, uint32_t, H1_4)
2928 DO_INSR(sve_insr_d, uint64_t, H1_8)
2930 #undef DO_INSR
2932 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2934 intptr_t i, j, opr_sz = simd_oprsz(desc);
2935 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2936 uint64_t f = *(uint64_t *)(vn + i);
2937 uint64_t b = *(uint64_t *)(vn + j);
2938 *(uint64_t *)(vd + i) = bswap64(b);
2939 *(uint64_t *)(vd + j) = bswap64(f);
2943 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2945 intptr_t i, j, opr_sz = simd_oprsz(desc);
2946 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2947 uint64_t f = *(uint64_t *)(vn + i);
2948 uint64_t b = *(uint64_t *)(vn + j);
2949 *(uint64_t *)(vd + i) = hswap64(b);
2950 *(uint64_t *)(vd + j) = hswap64(f);
2954 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2956 intptr_t i, j, opr_sz = simd_oprsz(desc);
2957 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2958 uint64_t f = *(uint64_t *)(vn + i);
2959 uint64_t b = *(uint64_t *)(vn + j);
2960 *(uint64_t *)(vd + i) = rol64(b, 32);
2961 *(uint64_t *)(vd + j) = rol64(f, 32);
2965 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2967 intptr_t i, j, opr_sz = simd_oprsz(desc);
2968 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2969 uint64_t f = *(uint64_t *)(vn + i);
2970 uint64_t b = *(uint64_t *)(vn + j);
2971 *(uint64_t *)(vd + i) = b;
2972 *(uint64_t *)(vd + j) = f;
2976 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2978 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2979 bool is_tbx, tb_impl_fn *fn)
2981 ARMVectorReg scratch;
2982 uintptr_t oprsz = simd_oprsz(desc);
2984 if (unlikely(vd == vn)) {
2985 vn = memcpy(&scratch, vn, oprsz);
2988 fn(vd, vn, NULL, vm, oprsz, is_tbx);
2991 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
2992 uint32_t desc, bool is_tbx, tb_impl_fn *fn)
2994 ARMVectorReg scratch;
2995 uintptr_t oprsz = simd_oprsz(desc);
2997 if (unlikely(vd == vn0)) {
2998 vn0 = memcpy(&scratch, vn0, oprsz);
2999 if (vd == vn1) {
3000 vn1 = vn0;
3002 } else if (unlikely(vd == vn1)) {
3003 vn1 = memcpy(&scratch, vn1, oprsz);
3006 fn(vd, vn0, vn1, vm, oprsz, is_tbx);
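/*
 * Table lookups: each index element selects from the concatenation of the
 * one or two table vectors.  For an out-of-range index, TBL (is_tbx false)
 * writes zero to the destination element, while TBX (is_tbx true) leaves
 * the existing destination element unchanged.
 */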
3009 #define DO_TB(SUFF, TYPE, H) \
3010 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \
3011 void *vm, uintptr_t oprsz, bool is_tbx) \
3013 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \
3014 uintptr_t i, nelem = oprsz / sizeof(TYPE); \
3015 for (i = 0; i < nelem; ++i) { \
3016 TYPE index = indexes[H1(i)], val = 0; \
3017 if (index < nelem) { \
3018 val = tbl0[H(index)]; \
3019 } else { \
3020 index -= nelem; \
3021 if (tbl1 && index < nelem) { \
3022 val = tbl1[H(index)]; \
3023 } else if (is_tbx) { \
3024 continue; \
3027 d[H(i)] = val; \
3030 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3032 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \
3034 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \
3035 void *vm, uint32_t desc) \
3037 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \
3039 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3041 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \
3044 DO_TB(b, uint8_t, H1)
3045 DO_TB(h, uint16_t, H2)
3046 DO_TB(s, uint32_t, H4)
3047 DO_TB(d, uint64_t, H8)
3049 #undef DO_TB
3051 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3052 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
3054 intptr_t i, opr_sz = simd_oprsz(desc); \
3055 TYPED *d = vd; \
3056 TYPES *n = vn; \
3057 ARMVectorReg tmp; \
3058 if (unlikely(vn - vd < opr_sz)) { \
3059 n = memcpy(&tmp, n, opr_sz / 2); \
3061 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
3062 d[HD(i)] = n[HS(i)]; \
3066 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3067 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3068 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3070 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3071 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3072 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3074 #undef DO_UNPK
3076 /* Mask of bits included in the even numbered predicates of width esz.
3077 * We also use this for expand_bits/compress_bits, and so extend the
3078 * same pattern out to 16-bit units. */
3080 static const uint64_t even_bit_esz_masks[5] = {
3081 0x5555555555555555ull,
3082 0x3333333333333333ull,
3083 0x0f0f0f0f0f0f0f0full,
3084 0x00ff00ff00ff00ffull,
3085 0x0000ffff0000ffffull,
3088 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3089 * For N==0, this corresponds to the operation that in qemu/bitops.h
3090 * we call half_shuffle64; this algorithm is from Hacker's Delight,
3091 * section 7-2 Shuffling Bits. */
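/* E.g. expand_bits(0b1011, 0) = 0b01000101: each input bit becomes the
   low bit of a zero-extended 2-bit unit.  */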
3093 static uint64_t expand_bits(uint64_t x, int n)
3095 int i;
3097 x &= 0xffffffffu;
3098 for (i = 4; i >= n; i--) {
3099 int sh = 1 << i;
3100 x = ((x << sh) | x) & even_bit_esz_masks[i];
3102 return x;
3105 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3106 * For N==0, this corresponds to the operation that in qemu/bitops.h
3107 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3108 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle. */
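/* E.g. compress_bits(0b01000101, 0) = 0b1011, inverting the expand_bits()
   example above.  */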
3110 static uint64_t compress_bits(uint64_t x, int n)
3112 int i;
3114 for (i = n; i <= 4; i++) {
3115 int sh = 1 << i;
3116 x &= even_bit_esz_masks[i];
3117 x = (x >> sh) | x;
3119 return x & 0xffffffffu;
3122 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3124 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3125 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3126 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3127 int esize = 1 << esz;
3128 uint64_t *d = vd;
3129 intptr_t i;
3131 if (oprsz <= 8) {
3132 uint64_t nn = *(uint64_t *)vn;
3133 uint64_t mm = *(uint64_t *)vm;
3134 int half = 4 * oprsz;
3136 nn = extract64(nn, high * half, half);
3137 mm = extract64(mm, high * half, half);
3138 nn = expand_bits(nn, esz);
3139 mm = expand_bits(mm, esz);
3140 d[0] = nn | (mm << esize);
3141 } else {
3142 ARMPredicateReg tmp;
3144 /* We produce output faster than we consume input.
3145 Therefore we must be mindful of possible overlap. */
3146 if (vd == vn) {
3147 vn = memcpy(&tmp, vn, oprsz);
3148 if (vd == vm) {
3149 vm = vn;
3151 } else if (vd == vm) {
3152 vm = memcpy(&tmp, vm, oprsz);
3154 if (high) {
3155 high = oprsz >> 1;
3158 if ((oprsz & 7) == 0) {
3159 uint32_t *n = vn, *m = vm;
3160 high >>= 2;
3162 for (i = 0; i < oprsz / 8; i++) {
3163 uint64_t nn = n[H4(high + i)];
3164 uint64_t mm = m[H4(high + i)];
3166 nn = expand_bits(nn, esz);
3167 mm = expand_bits(mm, esz);
3168 d[i] = nn | (mm << esize);
3170 } else {
3171 uint8_t *n = vn, *m = vm;
3172 uint16_t *d16 = vd;
3174 for (i = 0; i < oprsz / 2; i++) {
3175 uint16_t nn = n[H1(high + i)];
3176 uint16_t mm = m[H1(high + i)];
3178 nn = expand_bits(nn, esz);
3179 mm = expand_bits(mm, esz);
3180 d16[H2(i)] = nn | (mm << esize);
3186 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3188 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3189 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3190 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3191 uint64_t *d = vd, *n = vn, *m = vm;
3192 uint64_t l, h;
3193 intptr_t i;
3195 if (oprsz <= 8) {
3196 l = compress_bits(n[0] >> odd, esz);
3197 h = compress_bits(m[0] >> odd, esz);
3198 d[0] = l | (h << (4 * oprsz));
3199 } else {
3200 ARMPredicateReg tmp_m;
3201 intptr_t oprsz_16 = oprsz / 16;
3203 if ((vm - vd) < (uintptr_t)oprsz) {
3204 m = memcpy(&tmp_m, vm, oprsz);
3207 for (i = 0; i < oprsz_16; i++) {
3208 l = n[2 * i + 0];
3209 h = n[2 * i + 1];
3210 l = compress_bits(l >> odd, esz);
3211 h = compress_bits(h >> odd, esz);
3212 d[i] = l | (h << 32);
3216 /* For VL which is not a multiple of 512, the results from M do not
3217 * align nicely with the uint64_t for D. Put the aligned results
3218 * from M into TMP_M and then copy it into place afterward. */
3220 if (oprsz & 15) {
3221 int final_shift = (oprsz & 15) * 2;
3223 l = n[2 * i + 0];
3224 h = n[2 * i + 1];
3225 l = compress_bits(l >> odd, esz);
3226 h = compress_bits(h >> odd, esz);
3227 d[i] = l | (h << final_shift);
3229 for (i = 0; i < oprsz_16; i++) {
3230 l = m[2 * i + 0];
3231 h = m[2 * i + 1];
3232 l = compress_bits(l >> odd, esz);
3233 h = compress_bits(h >> odd, esz);
3234 tmp_m.p[i] = l | (h << 32);
3236 l = m[2 * i + 0];
3237 h = m[2 * i + 1];
3238 l = compress_bits(l >> odd, esz);
3239 h = compress_bits(h >> odd, esz);
3240 tmp_m.p[i] = l | (h << final_shift);
3242 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3243 } else {
3244 for (i = 0; i < oprsz_16; i++) {
3245 l = m[2 * i + 0];
3246 h = m[2 * i + 1];
3247 l = compress_bits(l >> odd, esz);
3248 h = compress_bits(h >> odd, esz);
3249 d[oprsz_16 + i] = l | (h << 32);
3255 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3257 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3258 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3259 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3260 uint64_t *d = vd, *n = vn, *m = vm;
3261 uint64_t mask;
3262 int shr, shl;
3263 intptr_t i;
3265 shl = 1 << esz;
3266 shr = 0;
3267 mask = even_bit_esz_masks[esz];
3268 if (odd) {
3269 mask <<= shl;
3270 shr = shl;
3271 shl = 0;
3274 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3275 uint64_t nn = (n[i] & mask) >> shr;
3276 uint64_t mm = (m[i] & mask) << shl;
3277 d[i] = nn + mm;
3281 /* Reverse units of 2**N bits. */
3282 static uint64_t reverse_bits_64(uint64_t x, int n)
3284 int i, sh;
3286 x = bswap64(x);
3287 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3288 uint64_t mask = even_bit_esz_masks[i];
3289 x = ((x & mask) << sh) | ((x >> sh) & mask);
3291 return x;
3294 static uint8_t reverse_bits_8(uint8_t x, int n)
3296 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3297 int i, sh;
3299 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3300 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3302 return x;
3305 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3307 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3308 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3309 intptr_t i, oprsz_2 = oprsz / 2;
3311 if (oprsz <= 8) {
3312 uint64_t l = *(uint64_t *)vn;
3313 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3314 *(uint64_t *)vd = l;
3315 } else if ((oprsz & 15) == 0) {
3316 for (i = 0; i < oprsz_2; i += 8) {
3317 intptr_t ih = oprsz - 8 - i;
3318 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3319 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3320 *(uint64_t *)(vd + i) = h;
3321 *(uint64_t *)(vd + ih) = l;
3323 } else {
3324 for (i = 0; i < oprsz_2; i += 1) {
3325 intptr_t il = H1(i);
3326 intptr_t ih = H1(oprsz - 1 - i);
3327 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3328 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3329 *(uint8_t *)(vd + il) = h;
3330 *(uint8_t *)(vd + ih) = l;
3335 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3337 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3338 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3339 uint64_t *d = vd;
3340 intptr_t i;
3342 if (oprsz <= 8) {
3343 uint64_t nn = *(uint64_t *)vn;
3344 int half = 4 * oprsz;
3346 nn = extract64(nn, high * half, half);
3347 nn = expand_bits(nn, 0);
3348 d[0] = nn;
3349 } else {
3350 ARMPredicateReg tmp_n;
3352 /* We produce output faster than we consume input.
3353 Therefore we must be mindful of possible overlap. */
3354 if ((vn - vd) < (uintptr_t)oprsz) {
3355 vn = memcpy(&tmp_n, vn, oprsz);
3357 if (high) {
3358 high = oprsz >> 1;
3361 if ((oprsz & 7) == 0) {
3362 uint32_t *n = vn;
3363 high >>= 2;
3365 for (i = 0; i < oprsz / 8; i++) {
3366 uint64_t nn = n[H4(high + i)];
3367 d[i] = expand_bits(nn, 0);
3369 } else {
3370 uint16_t *d16 = vd;
3371 uint8_t *n = vn;
3373 for (i = 0; i < oprsz / 2; i++) {
3374 uint16_t nn = n[H1(high + i)];
3375 d16[H2(i)] = expand_bits(nn, 0);
3381 #define DO_ZIP(NAME, TYPE, H) \
3382 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3384 intptr_t oprsz = simd_oprsz(desc); \
3385 intptr_t i, oprsz_2 = oprsz / 2; \
3386 ARMVectorReg tmp_n, tmp_m; \
3387 /* We produce output faster than we consume input. \
3388 Therefore we must be mindful of possible overlap. */ \
3389 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
3390 vn = memcpy(&tmp_n, vn, oprsz_2); \
3392 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3393 vm = memcpy(&tmp_m, vm, oprsz_2); \
3395 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3396 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
3397 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
3399 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3400 memset(vd + oprsz - 16, 0, 16); \
3404 DO_ZIP(sve_zip_b, uint8_t, H1)
3405 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3406 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3407 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3408 DO_ZIP(sve2_zip_q, Int128, )
3410 #define DO_UZP(NAME, TYPE, H) \
3411 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3413 intptr_t oprsz = simd_oprsz(desc); \
3414 intptr_t odd_ofs = simd_data(desc); \
3415 intptr_t i, p; \
3416 ARMVectorReg tmp_m; \
3417 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3418 vm = memcpy(&tmp_m, vm, oprsz); \
3420 i = 0, p = odd_ofs; \
3421 do { \
3422 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \
3423 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
3424 } while (p < oprsz); \
3425 p -= oprsz; \
3426 do { \
3427 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \
3428 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
3429 } while (p < oprsz); \
3430 tcg_debug_assert(i == oprsz); \
3433 DO_UZP(sve_uzp_b, uint8_t, H1)
3434 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3435 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3436 DO_UZP(sve_uzp_d, uint64_t, H1_8)
3437 DO_UZP(sve2_uzp_q, Int128, )
3439 #define DO_TRN(NAME, TYPE, H) \
3440 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3442 intptr_t oprsz = simd_oprsz(desc); \
3443 intptr_t odd_ofs = simd_data(desc); \
3444 intptr_t i; \
3445 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
3446 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
3447 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
3448 *(TYPE *)(vd + H(i + 0)) = ae; \
3449 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
3451 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3452 memset(vd + oprsz - 16, 0, 16); \
3456 DO_TRN(sve_trn_b, uint8_t, H1)
3457 DO_TRN(sve_trn_h, uint16_t, H1_2)
3458 DO_TRN(sve_trn_s, uint32_t, H1_4)
3459 DO_TRN(sve_trn_d, uint64_t, H1_8)
3460 DO_TRN(sve2_trn_q, Int128, )
3462 #undef DO_ZIP
3463 #undef DO_UZP
3464 #undef DO_TRN
3466 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3468 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3469 uint32_t *d = vd, *n = vn;
3470 uint8_t *pg = vg;
3472 for (i = j = 0; i < opr_sz; i++) {
3473 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3474 d[H4(j)] = n[H4(i)];
3475 j++;
3478 for (; j < opr_sz; j++) {
3479 d[H4(j)] = 0;
3483 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3485 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3486 uint64_t *d = vd, *n = vn;
3487 uint8_t *pg = vg;
3489 for (i = j = 0; i < opr_sz; i++) {
3490 if (pg[H1(i)] & 1) {
3491 d[j] = n[i];
3492 j++;
3495 for (; j < opr_sz; j++) {
3496 d[j] = 0;
3500 /* Similar to the ARM LastActiveElement pseudocode function, except the
3501 * result is multiplied by the element size. This includes the not found
3502 * indication; e.g. not found for esz=3 is -8. */
3504 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3506 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3507 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3509 return last_active_element(vg, words, esz);
3512 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3514 intptr_t opr_sz = simd_oprsz(desc) / 8;
3515 int esz = simd_data(desc);
3516 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3517 intptr_t i, first_i, last_i;
3518 ARMVectorReg tmp;
3520 first_i = last_i = 0;
3521 first_g = last_g = 0;
3523 /* Find the extent of the active elements within VG. */
3524 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3525 pg = *(uint64_t *)(vg + i) & mask;
3526 if (pg) {
3527 if (last_g == 0) {
3528 last_g = pg;
3529 last_i = i;
3531 first_g = pg;
3532 first_i = i;
3536 len = 0;
3537 if (first_g != 0) {
3538 first_i = first_i * 8 + ctz64(first_g);
3539 last_i = last_i * 8 + 63 - clz64(last_g);
3540 len = last_i - first_i + (1 << esz);
3541 if (vd == vm) {
3542 vm = memcpy(&tmp, vm, opr_sz * 8);
3544 swap_memmove(vd, vn + first_i, len);
3546 swap_memmove(vd + len, vm, opr_sz * 8 - len);
3549 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3550 void *vg, uint32_t desc)
3552 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3553 uint64_t *d = vd, *n = vn, *m = vm;
3554 uint8_t *pg = vg;
3556 for (i = 0; i < opr_sz; i += 1) {
3557 uint64_t nn = n[i], mm = m[i];
3558 uint64_t pp = expand_pred_b(pg[H1(i)]);
3559 d[i] = (nn & pp) | (mm & ~pp);
3563 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3564 void *vg, uint32_t desc)
3566 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3567 uint64_t *d = vd, *n = vn, *m = vm;
3568 uint8_t *pg = vg;
3570 for (i = 0; i < opr_sz; i += 1) {
3571 uint64_t nn = n[i], mm = m[i];
3572 uint64_t pp = expand_pred_h(pg[H1(i)]);
3573 d[i] = (nn & pp) | (mm & ~pp);
3577 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3578 void *vg, uint32_t desc)
3580 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3581 uint64_t *d = vd, *n = vn, *m = vm;
3582 uint8_t *pg = vg;
3584 for (i = 0; i < opr_sz; i += 1) {
3585 uint64_t nn = n[i], mm = m[i];
3586 uint64_t pp = expand_pred_s(pg[H1(i)]);
3587 d[i] = (nn & pp) | (mm & ~pp);
3591 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3592 void *vg, uint32_t desc)
3594 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3595 uint64_t *d = vd, *n = vn, *m = vm;
3596 uint8_t *pg = vg;
3598 for (i = 0; i < opr_sz; i += 1) {
3599 uint64_t nn = n[i], mm = m[i];
3600 d[i] = (pg[H1(i)] & 1 ? nn : mm);
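/*
 * A minimal standalone sketch of the predicated select used above for
 * byte elements: each predicate bit is widened to a full byte of ones
 * and the result is a bitwise merge.  The function below is an
 * illustrative re-implementation of the expand_pred_b table lookup,
 * not part of the helpers themselves.
 */
static inline uint64_t example_expand_pred_b(uint8_t pg)
{
    uint64_t mask = 0;
    for (int i = 0; i < 8; ++i) {
        if (pg & (1u << i)) {
            mask |= 0xffull << (8 * i);
        }
    }
    return mask;
}
/* SEL then reduces to: d = (n & mask) | (m & ~mask). */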
3604 /* Two-operand comparison controlled by a predicate.
3605  * ??? It is tempting to expand this inline with host vector
3606  * instructions, e.g. for x86:
3608 * vcmpeqw zm, zn, %ymm0
3609 * vpmovmskb %ymm0, %eax
3610 * and $0x5555, %eax
3611 * and pg, %eax
3613 * or even aarch64, e.g.
3615 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3616 * cmeq v0.8h, zn, zm
3617 * and v0.8h, v0.8h, mask
3618 * addv h0, v0.8h
3619 * and v0.8b, pg
3621 * However, coming up with an abstraction that allows vector inputs and
3622 * a scalar output, and also handles the byte-ordering of sub-uint64_t
3623 * scalar outputs, is tricky.
3625 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
3626 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3628 intptr_t opr_sz = simd_oprsz(desc); \
3629 uint32_t flags = PREDTEST_INIT; \
3630 intptr_t i = opr_sz; \
3631 do { \
3632 uint64_t out = 0, pg; \
3633 do { \
3634 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3635 TYPE nn = *(TYPE *)(vn + H(i)); \
3636 TYPE mm = *(TYPE *)(vm + H(i)); \
3637 out |= nn OP mm; \
3638 } while (i & 63); \
3639 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3640 out &= pg; \
3641 *(uint64_t *)(vd + (i >> 3)) = out; \
3642 flags = iter_predtest_bwd(out, pg, flags); \
3643 } while (i > 0); \
3644 return flags; \
3647 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3648 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3649 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3650 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3651 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3652 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3653 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3654 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3656 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
3657 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3658 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3659 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3661 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
3662 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3663 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3664 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3666 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
3667 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3668 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3669 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3671 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
3672 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3673 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3674 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3676 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
3677 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3678 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3679 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3681 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
3682 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3683 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3684 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3686 #undef DO_CMP_PPZZ_B
3687 #undef DO_CMP_PPZZ_H
3688 #undef DO_CMP_PPZZ_S
3689 #undef DO_CMP_PPZZ_D
3690 #undef DO_CMP_PPZZ
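/*
 * A worked example of the predicate packing above, for 16-bit elements:
 * each inner iteration shifts OUT left by sizeof(TYPE) == 2 and ORs in
 * a 0/1 comparison result, so the result for element k of the 64-byte
 * chunk lands at predicate bit 2*k of the stored word, matching the
 * 0x5555... mask used by the _H expansion.  E.g. for a 128-bit vector
 * with per-element results {1,0,1,1,0,0,1,0} (element 0 first), OUT is
 * 0x1051 before it is ANDed with the governing predicate.
 */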
3692 /* Similar, but the second source is "wide". */
3693 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
3694 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3696 intptr_t opr_sz = simd_oprsz(desc); \
3697 uint32_t flags = PREDTEST_INIT; \
3698 intptr_t i = opr_sz; \
3699 do { \
3700 uint64_t out = 0, pg; \
3701 do { \
3702 TYPEW mm = *(TYPEW *)(vm + i - 8); \
3703 do { \
3704 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3705 TYPE nn = *(TYPE *)(vn + H(i)); \
3706 out |= nn OP mm; \
3707 } while (i & 7); \
3708 } while (i & 63); \
3709 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3710 out &= pg; \
3711 *(uint64_t *)(vd + (i >> 3)) = out; \
3712 flags = iter_predtest_bwd(out, pg, flags); \
3713 } while (i > 0); \
3714 return flags; \
3717 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3718 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
3719 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3720 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3721 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3722 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3724 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
3725 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3726 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3728 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
3729 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3730 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3732 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
3733 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
3734 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
3736 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
3737 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
3738 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
3740 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
3741 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3742 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3744 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
3745 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3746 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3748 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
3749 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
3750 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
3752 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
3753 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
3754 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
3756 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
3757 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3758 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3760 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
3761 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3762 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3764 #undef DO_CMP_PPZW_B
3765 #undef DO_CMP_PPZW_H
3766 #undef DO_CMP_PPZW_S
3767 #undef DO_CMP_PPZW
3769 /* Similar, but the second source is immediate. */
3770 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
3771 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
3773 intptr_t opr_sz = simd_oprsz(desc); \
3774 uint32_t flags = PREDTEST_INIT; \
3775 TYPE mm = simd_data(desc); \
3776 intptr_t i = opr_sz; \
3777 do { \
3778 uint64_t out = 0, pg; \
3779 do { \
3780 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3781 TYPE nn = *(TYPE *)(vn + H(i)); \
3782 out |= nn OP mm; \
3783 } while (i & 63); \
3784 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3785 out &= pg; \
3786 *(uint64_t *)(vd + (i >> 3)) = out; \
3787 flags = iter_predtest_bwd(out, pg, flags); \
3788 } while (i > 0); \
3789 return flags; \
3792 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3793 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3794 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3795 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3796 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3797 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3798 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3799 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3801 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
3802 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3803 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3804 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3806 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
3807 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3808 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3809 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3811 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
3812 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3813 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3814 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3816 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
3817 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3818 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3819 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3821 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
3822 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3823 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3824 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3826 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
3827 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3828 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3829 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3831 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
3832 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3833 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3834 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3836 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
3837 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3838 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3839 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3841 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
3842 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3843 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3844 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3846 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
3847 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3848 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3849 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3851 #undef DO_CMP_PPZI_B
3852 #undef DO_CMP_PPZI_H
3853 #undef DO_CMP_PPZI_S
3854 #undef DO_CMP_PPZI_D
3855 #undef DO_CMP_PPZI
3857 /* Similar to the ARM LastActive pseudocode function. */
3858 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3860 intptr_t i;
3862 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3863 uint64_t pg = *(uint64_t *)(vg + i);
3864 if (pg) {
3865 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3868 return 0;
3871 /* Compute a mask into RETB that is true for all G, up to and including
3872 * (if after) or excluding (if !after) the first G & N.
3873  * Return true if a break was found.
3875 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3876 bool brk, bool after)
3878 uint64_t b;
3880 if (brk) {
3881 b = 0;
3882 } else if ((g & n) == 0) {
3883         /* No N bits are set within G; break not found. */
3884 b = g;
3885 } else {
3886 /* Break somewhere in N. Locate it. */
3887 b = g & n; /* guard true, pred true */
3888 b = b & -b; /* first such */
3889 if (after) {
3890 b = b | (b - 1); /* break after same */
3891 } else {
3892 b = b - 1; /* break before same */
3894 brk = true;
3897 *retb = b;
3898 return brk;
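/*
 * A worked example of compute_brk above, assuming all guard bits set
 * (g = 0xff) and n = 0x10 (first true predicate bit at bit 4):
 *
 *   b = g & n      = 0x10
 *   b = b & -b     = 0x10    (isolate the first such bit)
 *   after:  b |= b - 1  ->  0x1f   (break after: bit 4 still included)
 *   !after: b  = b - 1  ->  0x0f   (break before: bit 4 excluded)
 *
 * and brk becomes true, so all subsequent words produce b = 0.
 */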
3901 /* Compute a zeroing BRK. */
3902 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3903 intptr_t oprsz, bool after)
3905 bool brk = false;
3906 intptr_t i;
3908 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3909 uint64_t this_b, this_g = g[i];
3911 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3912 d[i] = this_b & this_g;
3916 /* Likewise, but also compute flags. */
3917 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3918 intptr_t oprsz, bool after)
3920 uint32_t flags = PREDTEST_INIT;
3921 bool brk = false;
3922 intptr_t i;
3924 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3925 uint64_t this_b, this_d, this_g = g[i];
3927 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3928 d[i] = this_d = this_b & this_g;
3929 flags = iter_predtest_fwd(this_d, this_g, flags);
3931 return flags;
3934 /* Compute a merging BRK. */
3935 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3936 intptr_t oprsz, bool after)
3938 bool brk = false;
3939 intptr_t i;
3941 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3942 uint64_t this_b, this_g = g[i];
3944 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3945 d[i] = (this_b & this_g) | (d[i] & ~this_g);
3949 /* Likewise, but also compute flags. */
3950 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3951 intptr_t oprsz, bool after)
3953 uint32_t flags = PREDTEST_INIT;
3954 bool brk = false;
3955 intptr_t i;
3957     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3958 uint64_t this_b, this_d = d[i], this_g = g[i];
3960 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3961 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3962 flags = iter_predtest_fwd(this_d, this_g, flags);
3964 return flags;
3967 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3969     /* It is quicker to zero the whole predicate than to loop over OPRSZ.
3970      * The compiler should turn this into four 64-bit integer stores.
3972 memset(d, 0, sizeof(ARMPredicateReg));
3973 return PREDTEST_INIT;
3976 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3977 uint32_t pred_desc)
3979 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3980 if (last_active_pred(vn, vg, oprsz)) {
3981 compute_brk_z(vd, vm, vg, oprsz, true);
3982 } else {
3983 do_zero(vd, oprsz);
3987 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
3988 uint32_t pred_desc)
3990 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3991 if (last_active_pred(vn, vg, oprsz)) {
3992 return compute_brks_z(vd, vm, vg, oprsz, true);
3993 } else {
3994 return do_zero(vd, oprsz);
3998 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
3999 uint32_t pred_desc)
4001 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4002 if (last_active_pred(vn, vg, oprsz)) {
4003 compute_brk_z(vd, vm, vg, oprsz, false);
4004 } else {
4005 do_zero(vd, oprsz);
4009 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4010 uint32_t pred_desc)
4012 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4013 if (last_active_pred(vn, vg, oprsz)) {
4014 return compute_brks_z(vd, vm, vg, oprsz, false);
4015 } else {
4016 return do_zero(vd, oprsz);
4020 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4022 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4023 compute_brk_z(vd, vn, vg, oprsz, true);
4026 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4028 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4029 return compute_brks_z(vd, vn, vg, oprsz, true);
4032 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4034 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4035 compute_brk_z(vd, vn, vg, oprsz, false);
4038 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4040 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4041 return compute_brks_z(vd, vn, vg, oprsz, false);
4044 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4046 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4047 compute_brk_m(vd, vn, vg, oprsz, true);
4050 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4052 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4053 return compute_brks_m(vd, vn, vg, oprsz, true);
4056 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4058 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4059 compute_brk_m(vd, vn, vg, oprsz, false);
4062 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4064 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4065 return compute_brks_m(vd, vn, vg, oprsz, false);
4068 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4070 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4071 if (!last_active_pred(vn, vg, oprsz)) {
4072 do_zero(vd, oprsz);
4076 /* As if PredTest(Ones(PL), D, esz). */
4077 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4078 uint64_t esz_mask)
4080 uint32_t flags = PREDTEST_INIT;
4081 intptr_t i;
4083 for (i = 0; i < oprsz / 8; i++) {
4084 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4086 if (oprsz & 7) {
4087 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4088 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4090 return flags;
4093 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4095 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4096 if (last_active_pred(vn, vg, oprsz)) {
4097 return predtest_ones(vd, oprsz, -1);
4098 } else {
4099 return do_zero(vd, oprsz);
4103 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4105 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4106 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4107 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4108 intptr_t i;
4110 for (i = 0; i < words; ++i) {
4111 uint64_t t = n[i] & g[i] & mask;
4112 sum += ctpop64(t);
4114 return sum;
4117 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4119 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4120 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4121 uint64_t esz_mask = pred_esz_masks[esz];
4122 ARMPredicateReg *d = vd;
4123 uint32_t flags;
4124 intptr_t i;
4126 /* Begin with a zero predicate register. */
4127 flags = do_zero(d, oprsz);
4128 if (count == 0) {
4129 return flags;
4132 /* Set all of the requested bits. */
4133 for (i = 0; i < count / 64; ++i) {
4134 d->p[i] = esz_mask;
4136 if (count & 63) {
4137 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4140 return predtest_ones(d, oprsz, esz_mask);
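/*
 * A worked example of sve_whilel above, assuming a 256-bit vector with
 * 32-bit elements (esz_mask = 0x1111111111111111ull) and count = 12
 * predicate bits, i.e. the first three elements active:
 *
 *   d->p[0] = MAKE_64BIT_MASK(0, 12) & esz_mask = 0x111
 *
 * so element bits 0, 4 and 8 are set and the remaining words stay zero.
 */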
4143 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4145 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4146 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4147 uint64_t esz_mask = pred_esz_masks[esz];
4148 ARMPredicateReg *d = vd;
4149 intptr_t i, invcount, oprbits;
4150 uint64_t bits;
4152 if (count == 0) {
4153 return do_zero(d, oprsz);
4156 oprbits = oprsz * 8;
4157 tcg_debug_assert(count <= oprbits);
4159 bits = esz_mask;
4160 if (oprbits & 63) {
4161 bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4164 invcount = oprbits - count;
4165 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4166 d->p[i] = bits;
4167 bits = esz_mask;
4170 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4172 while (--i >= 0) {
4173 d->p[i] = 0;
4176 return predtest_ones(d, oprsz, esz_mask);
4179 /* Recursive reduction of a vector by a binary function;
4180  * cf. the ARM ARM pseudocode function ReducePredicated.
4182  * While it would be possible to write this without the DATA temporary,
4183  * it is much simpler to process the predicate register this way.
4184  * The recursion is bounded at depth 7 (128 fp16 elements), so there is
4185  * little to gain from a more complex non-recursive form.
4187 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
4188 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4190 if (n == 1) { \
4191 return *data; \
4192 } else { \
4193 uintptr_t half = n / 2; \
4194 TYPE lo = NAME##_reduce(data, status, half); \
4195 TYPE hi = NAME##_reduce(data + half, status, half); \
4196 return TYPE##_##FUNC(lo, hi, status); \
4199 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
4201 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
4202 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
4203 for (i = 0; i < oprsz; ) { \
4204 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4205 do { \
4206 TYPE nn = *(TYPE *)(vn + H(i)); \
4207 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
4208 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
4209 } while (i & 15); \
4211 for (; i < maxsz; i += sizeof(TYPE)) { \
4212 *(TYPE *)((void *)data + i) = IDENT; \
4214 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
4217 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
4218 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
4219 DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)
4221 /* Identity is floatN_default_nan, without the function call. */
4222 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
4223 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
4224 DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)
4226 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
4227 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
4228 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)
4230 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
4231 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
4232 DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)
4234 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
4235 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
4236 DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))
4238 #undef DO_REDUCE
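/*
 * A minimal standalone sketch of the reduction tree generated above,
 * using a plain float accumulator rather than the softfloat types; the
 * function name is illustrative only.  N is a power of two and inactive
 * lanes have already been filled with the identity value, so they do
 * not change the result.
 */
static inline float example_reduce_add(const float *data, size_t n)
{
    if (n == 1) {
        return *data;
    }
    return example_reduce_add(data, n / 2)
         + example_reduce_add(data + n / 2, n / 2);
}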
4240 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4241 void *status, uint32_t desc)
4243 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4244 float16 result = nn;
4246 do {
4247 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4248 do {
4249 if (pg & 1) {
4250 float16 mm = *(float16 *)(vm + H1_2(i));
4251 result = float16_add(result, mm, status);
4253 i += sizeof(float16), pg >>= sizeof(float16);
4254 } while (i & 15);
4255 } while (i < opr_sz);
4257 return result;
4260 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4261 void *status, uint32_t desc)
4263 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4264 float32 result = nn;
4266 do {
4267 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4268 do {
4269 if (pg & 1) {
4270 float32 mm = *(float32 *)(vm + H1_2(i));
4271 result = float32_add(result, mm, status);
4273 i += sizeof(float32), pg >>= sizeof(float32);
4274 } while (i & 15);
4275 } while (i < opr_sz);
4277 return result;
4280 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4281 void *status, uint32_t desc)
4283 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4284 uint64_t *m = vm;
4285 uint8_t *pg = vg;
4287 for (i = 0; i < opr_sz; i++) {
4288 if (pg[H1(i)] & 1) {
4289 nn = float64_add(nn, m[i], status);
4293 return nn;
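/*
 * Note that FADDA is architecturally a strictly ordered accumulation:
 * the scalar is threaded through the active elements in element order,
 * so no reduction tree of the kind used for FADDV above is possible.
 */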
4296 /* Fully general three-operand expander, controlled by a predicate,
4297  * with the extra float_status parameter.
4299 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
4300 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4301 void *status, uint32_t desc) \
4303 intptr_t i = simd_oprsz(desc); \
4304 uint64_t *g = vg; \
4305 do { \
4306 uint64_t pg = g[(i - 1) >> 6]; \
4307 do { \
4308 i -= sizeof(TYPE); \
4309 if (likely((pg >> (i & 63)) & 1)) { \
4310 TYPE nn = *(TYPE *)(vn + H(i)); \
4311 TYPE mm = *(TYPE *)(vm + H(i)); \
4312 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4314 } while (i & 63); \
4315 } while (i != 0); \
4318 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4319 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4320 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
4322 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4323 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4324 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
4326 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4327 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4328 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
4330 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4331 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4332 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
4334 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4335 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4336 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
4338 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4339 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4340 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
4342 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4343 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4344 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
4346 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4347 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4348 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4350 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4352 return float16_abs(float16_sub(a, b, s));
4355 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4357 return float32_abs(float32_sub(a, b, s));
4360 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4362 return float64_abs(float64_sub(a, b, s));
4365 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4366 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4367 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4369 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4371 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4372 return float64_scalbn(a, b_int, s);
4375 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4376 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4377 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
4379 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4380 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4381 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
4383 #undef DO_ZPZZ_FP
4385 /* Three-operand expander, with one scalar operand, controlled by
4386 * a predicate, with the extra float_status parameter.
4388 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4389 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
4390 void *status, uint32_t desc) \
4392 intptr_t i = simd_oprsz(desc); \
4393 uint64_t *g = vg; \
4394 TYPE mm = scalar; \
4395 do { \
4396 uint64_t pg = g[(i - 1) >> 6]; \
4397 do { \
4398 i -= sizeof(TYPE); \
4399 if (likely((pg >> (i & 63)) & 1)) { \
4400 TYPE nn = *(TYPE *)(vn + H(i)); \
4401 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4403 } while (i & 63); \
4404 } while (i != 0); \
4407 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4408 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4409 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
4411 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4412 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4413 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
4415 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4416 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4417 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4419 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4421 return float16_sub(b, a, s);
4424 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4426 return float32_sub(b, a, s);
4429 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4431 return float64_sub(b, a, s);
4434 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4435 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4436 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
4438 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4439 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4440 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
4442 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4443 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4444 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
4446 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4447 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4448 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
4450 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4451 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4452 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4454 /* Fully general two-operand expander, controlled by a predicate,
4455  * with the extra float_status parameter.
4457 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
4458 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
4460 intptr_t i = simd_oprsz(desc); \
4461 uint64_t *g = vg; \
4462 do { \
4463 uint64_t pg = g[(i - 1) >> 6]; \
4464 do { \
4465 i -= sizeof(TYPE); \
4466 if (likely((pg >> (i & 63)) & 1)) { \
4467 TYPE nn = *(TYPE *)(vn + H(i)); \
4468 *(TYPE *)(vd + H(i)) = OP(nn, status); \
4470 } while (i & 63); \
4471 } while (i != 0); \
4474 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
4475 * FZ16. When converting from fp16, this affects flushing input denormals;
4476 * when converting to fp16, this affects flushing output denormals.
4478 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4480 bool save = get_flush_inputs_to_zero(fpst);
4481 float32 ret;
4483 set_flush_inputs_to_zero(false, fpst);
4484 ret = float16_to_float32(f, true, fpst);
4485 set_flush_inputs_to_zero(save, fpst);
4486 return ret;
4489 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4491 bool save = get_flush_inputs_to_zero(fpst);
4492 float64 ret;
4494 set_flush_inputs_to_zero(false, fpst);
4495 ret = float16_to_float64(f, true, fpst);
4496 set_flush_inputs_to_zero(save, fpst);
4497 return ret;
4500 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4502 bool save = get_flush_to_zero(fpst);
4503 float16 ret;
4505 set_flush_to_zero(false, fpst);
4506 ret = float32_to_float16(f, true, fpst);
4507 set_flush_to_zero(save, fpst);
4508 return ret;
4511 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4513 bool save = get_flush_to_zero(fpst);
4514 float16 ret;
4516 set_flush_to_zero(false, fpst);
4517 ret = float64_to_float16(f, true, fpst);
4518 set_flush_to_zero(save, fpst);
4519 return ret;
4522 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4524 if (float16_is_any_nan(f)) {
4525 float_raise(float_flag_invalid, s);
4526 return 0;
4528 return float16_to_int16_round_to_zero(f, s);
4531 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4533 if (float16_is_any_nan(f)) {
4534 float_raise(float_flag_invalid, s);
4535 return 0;
4537 return float16_to_int64_round_to_zero(f, s);
4540 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4542 if (float32_is_any_nan(f)) {
4543 float_raise(float_flag_invalid, s);
4544 return 0;
4546 return float32_to_int64_round_to_zero(f, s);
4549 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4551 if (float64_is_any_nan(f)) {
4552 float_raise(float_flag_invalid, s);
4553 return 0;
4555 return float64_to_int64_round_to_zero(f, s);
4558 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4560 if (float16_is_any_nan(f)) {
4561 float_raise(float_flag_invalid, s);
4562 return 0;
4564 return float16_to_uint16_round_to_zero(f, s);
4567 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4569 if (float16_is_any_nan(f)) {
4570 float_raise(float_flag_invalid, s);
4571 return 0;
4573 return float16_to_uint64_round_to_zero(f, s);
4576 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4578 if (float32_is_any_nan(f)) {
4579 float_raise(float_flag_invalid, s);
4580 return 0;
4582 return float32_to_uint64_round_to_zero(f, s);
4585 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4587 if (float64_is_any_nan(f)) {
4588 float_raise(float_flag_invalid, s);
4589 return 0;
4591 return float64_to_uint64_round_to_zero(f, s);
4594 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4595 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4596 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16)
4597 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4598 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4599 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4600 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4602 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4603 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4604 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4605 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4606 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4607 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4608 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4610 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4611 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4612 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4613 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4614 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4615 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4616 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4618 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4619 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4620 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4622 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4623 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4624 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4626 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4627 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4628 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4630 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4631 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4632 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
4634 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4635 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4636 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4637 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
4638 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
4639 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
4640 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
4642 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4643 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4644 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4645 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
4646 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
4647 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
4648 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4650 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
4652 /* Extract frac to the top of the uint32_t. */
4653 uint32_t frac = (uint32_t)a << (16 + 6);
4654 int16_t exp = extract32(a, 10, 5);
4656 if (unlikely(exp == 0)) {
4657 if (frac != 0) {
4658 if (!get_flush_inputs_to_zero(s)) {
4659 /* denormal: bias - fractional_zeros */
4660 return -15 - clz32(frac);
4662 /* flush to zero */
4663 float_raise(float_flag_input_denormal, s);
4665 } else if (unlikely(exp == 0x1f)) {
4666 if (frac == 0) {
4667 return INT16_MAX; /* infinity */
4669 } else {
4670 /* normal: exp - bias */
4671 return exp - 15;
4673 /* nan or zero */
4674 float_raise(float_flag_invalid, s);
4675 return INT16_MIN;
4678 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
4680 /* Extract frac to the top of the uint32_t. */
4681 uint32_t frac = a << 9;
4682 int32_t exp = extract32(a, 23, 8);
4684 if (unlikely(exp == 0)) {
4685 if (frac != 0) {
4686 if (!get_flush_inputs_to_zero(s)) {
4687 /* denormal: bias - fractional_zeros */
4688 return -127 - clz32(frac);
4690 /* flush to zero */
4691 float_raise(float_flag_input_denormal, s);
4693 } else if (unlikely(exp == 0xff)) {
4694 if (frac == 0) {
4695 return INT32_MAX; /* infinity */
4697 } else {
4698 /* normal: exp - bias */
4699 return exp - 127;
4701 /* nan or zero */
4702 float_raise(float_flag_invalid, s);
4703 return INT32_MIN;
4706 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
4708 /* Extract frac to the top of the uint64_t. */
4709 uint64_t frac = a << 12;
4710 int64_t exp = extract64(a, 52, 11);
4712 if (unlikely(exp == 0)) {
4713 if (frac != 0) {
4714 if (!get_flush_inputs_to_zero(s)) {
4715 /* denormal: bias - fractional_zeros */
4716 return -1023 - clz64(frac);
4718 /* flush to zero */
4719 float_raise(float_flag_input_denormal, s);
4721 } else if (unlikely(exp == 0x7ff)) {
4722 if (frac == 0) {
4723 return INT64_MAX; /* infinity */
4725 } else {
4726 /* normal: exp - bias */
4727 return exp - 1023;
4729 /* nan or zero */
4730 float_raise(float_flag_invalid, s);
4731 return INT64_MIN;
4734 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
4735 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
4736 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
4738 #undef DO_ZPZ_FP
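/*
 * A worked example of the FLOGB computation above for float32 inputs
 * (values are illustrative only):
 *
 *   a = 8.0f    -> exp field 130          -> 130 - 127 = 3
 *   a = 0.75f   -> exp field 126          -> 126 - 127 = -1
 *   a = 2^-149  -> exp 0, frac = 1 << 9   -> -127 - clz32(1 << 9) = -149
 *   a = 0 or NaN                          -> INT32_MIN, invalid raised
 */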
4740 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4741 float_status *status, uint32_t desc,
4742 uint16_t neg1, uint16_t neg3)
4744 intptr_t i = simd_oprsz(desc);
4745 uint64_t *g = vg;
4747 do {
4748 uint64_t pg = g[(i - 1) >> 6];
4749 do {
4750 i -= 2;
4751 if (likely((pg >> (i & 63)) & 1)) {
4752 float16 e1, e2, e3, r;
4754 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4755 e2 = *(uint16_t *)(vm + H1_2(i));
4756 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4757 r = float16_muladd(e1, e2, e3, 0, status);
4758 *(uint16_t *)(vd + H1_2(i)) = r;
4760 } while (i & 63);
4761 } while (i != 0);
4764 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4765 void *vg, void *status, uint32_t desc)
4767 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
4770 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4771 void *vg, void *status, uint32_t desc)
4773 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
4776 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4777 void *vg, void *status, uint32_t desc)
4779 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
4782 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4783 void *vg, void *status, uint32_t desc)
4785 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
4788 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4789 float_status *status, uint32_t desc,
4790 uint32_t neg1, uint32_t neg3)
4792 intptr_t i = simd_oprsz(desc);
4793 uint64_t *g = vg;
4795 do {
4796 uint64_t pg = g[(i - 1) >> 6];
4797 do {
4798 i -= 4;
4799 if (likely((pg >> (i & 63)) & 1)) {
4800 float32 e1, e2, e3, r;
4802 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4803 e2 = *(uint32_t *)(vm + H1_4(i));
4804 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4805 r = float32_muladd(e1, e2, e3, 0, status);
4806 *(uint32_t *)(vd + H1_4(i)) = r;
4808 } while (i & 63);
4809 } while (i != 0);
4812 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4813 void *vg, void *status, uint32_t desc)
4815 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
4818 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4819 void *vg, void *status, uint32_t desc)
4821 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
4824 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4825 void *vg, void *status, uint32_t desc)
4827 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
4830 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4831 void *vg, void *status, uint32_t desc)
4833 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
4836 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4837 float_status *status, uint32_t desc,
4838 uint64_t neg1, uint64_t neg3)
4840 intptr_t i = simd_oprsz(desc);
4841 uint64_t *g = vg;
4843 do {
4844 uint64_t pg = g[(i - 1) >> 6];
4845 do {
4846 i -= 8;
4847 if (likely((pg >> (i & 63)) & 1)) {
4848 float64 e1, e2, e3, r;
4850 e1 = *(uint64_t *)(vn + i) ^ neg1;
4851 e2 = *(uint64_t *)(vm + i);
4852 e3 = *(uint64_t *)(va + i) ^ neg3;
4853 r = float64_muladd(e1, e2, e3, 0, status);
4854 *(uint64_t *)(vd + i) = r;
4856 } while (i & 63);
4857 } while (i != 0);
4860 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4861 void *vg, void *status, uint32_t desc)
4863 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
4866 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4867 void *vg, void *status, uint32_t desc)
4869 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
4872 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4873 void *vg, void *status, uint32_t desc)
4875 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
4878 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4879 void *vg, void *status, uint32_t desc)
4881 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
4884 /* Two-operand floating-point comparison controlled by a predicate.
4885  * Unlike the integer version, we are not allowed to optimistically
4886  * compare operands, since the comparison may have side effects on
4887  * the FPSR.
4889 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
4890 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4891 void *status, uint32_t desc) \
4893 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4894 uint64_t *d = vd, *g = vg; \
4895 do { \
4896 uint64_t out = 0, pg = g[j]; \
4897 do { \
4898 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4899 if (likely((pg >> (i & 63)) & 1)) { \
4900 TYPE nn = *(TYPE *)(vn + H(i)); \
4901 TYPE mm = *(TYPE *)(vm + H(i)); \
4902 out |= OP(TYPE, nn, mm, status); \
4904 } while (i & 63); \
4905 d[j--] = out; \
4906 } while (i > 0); \
4909 #define DO_FPCMP_PPZZ_H(NAME, OP) \
4910 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
4911 #define DO_FPCMP_PPZZ_S(NAME, OP) \
4912 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
4913 #define DO_FPCMP_PPZZ_D(NAME, OP) \
4914 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
4916 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
4917 DO_FPCMP_PPZZ_H(NAME, OP) \
4918 DO_FPCMP_PPZZ_S(NAME, OP) \
4919 DO_FPCMP_PPZZ_D(NAME, OP)
4921 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
4922 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
4923 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
4924 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
4925 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
4926 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
4927 #define DO_FCMUO(TYPE, X, Y, ST) \
4928 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
4929 #define DO_FACGE(TYPE, X, Y, ST) \
4930 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
4931 #define DO_FACGT(TYPE, X, Y, ST) \
4932 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
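/*
 * Note that the softfloat compare functions return
 * float_relation_unordered (2) when either operand is a NaN, so e.g.
 * DO_FCMGE (X >= Y), written as compare(Y, X) <= 0, correctly yields
 * false for unordered inputs, while DO_FCMUO tests for the unordered
 * result explicitly.
 */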
4934 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
4935 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
4936 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
4937 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
4938 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
4939 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
4940 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
4942 #undef DO_FPCMP_PPZZ_ALL
4943 #undef DO_FPCMP_PPZZ_D
4944 #undef DO_FPCMP_PPZZ_S
4945 #undef DO_FPCMP_PPZZ_H
4946 #undef DO_FPCMP_PPZZ
4948 /* One-operand floating-point comparison against zero, controlled
4949 * by a predicate.
4951 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
4952 void HELPER(NAME)(void *vd, void *vn, void *vg, \
4953 void *status, uint32_t desc) \
4955 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4956 uint64_t *d = vd, *g = vg; \
4957 do { \
4958 uint64_t out = 0, pg = g[j]; \
4959 do { \
4960 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4961 if ((pg >> (i & 63)) & 1) { \
4962 TYPE nn = *(TYPE *)(vn + H(i)); \
4963 out |= OP(TYPE, nn, 0, status); \
4965 } while (i & 63); \
4966 d[j--] = out; \
4967 } while (i > 0); \
4970 #define DO_FPCMP_PPZ0_H(NAME, OP) \
4971 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
4972 #define DO_FPCMP_PPZ0_S(NAME, OP) \
4973 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
4974 #define DO_FPCMP_PPZ0_D(NAME, OP) \
4975 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
4977 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
4978 DO_FPCMP_PPZ0_H(NAME, OP) \
4979 DO_FPCMP_PPZ0_S(NAME, OP) \
4980 DO_FPCMP_PPZ0_D(NAME, OP)
4982 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
4983 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
4984 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
4985 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
4986 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
4987 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4989 /* FP Trig Multiply-Add. */
4991 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4993 static const float16 coeff[16] = {
4994 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4995 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4997 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
4998 intptr_t x = simd_data(desc);
4999 float16 *d = vd, *n = vn, *m = vm;
5000 for (i = 0; i < opr_sz; i++) {
5001 float16 mm = m[i];
5002 intptr_t xx = x;
5003 if (float16_is_neg(mm)) {
5004 mm = float16_abs(mm);
5005 xx += 8;
5007 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
5011 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5013 static const float32 coeff[16] = {
5014 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5015 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5016 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5017 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5019 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5020 intptr_t x = simd_data(desc);
5021 float32 *d = vd, *n = vn, *m = vm;
5022 for (i = 0; i < opr_sz; i++) {
5023 float32 mm = m[i];
5024 intptr_t xx = x;
5025 if (float32_is_neg(mm)) {
5026 mm = float32_abs(mm);
5027 xx += 8;
5029 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
5033 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5035 static const float64 coeff[16] = {
5036 0x3ff0000000000000ull, 0xbfc5555555555543ull,
5037 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5038 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5039 0x3de5d8408868552full, 0x0000000000000000ull,
5040 0x3ff0000000000000ull, 0xbfe0000000000000ull,
5041 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5042 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5043 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5045 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5046 intptr_t x = simd_data(desc);
5047 float64 *d = vd, *n = vn, *m = vm;
5048 for (i = 0; i < opr_sz; i++) {
5049 float64 mm = m[i];
5050 intptr_t xx = x;
5051 if (float64_is_neg(mm)) {
5052 mm = float64_abs(mm);
5053 xx += 8;
5055 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
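/*
 * The coefficient tables above hold the per-term constants of the sine
 * series (first eight entries: 1, -1/3!, 1/5!, ...) and the cosine
 * series (second eight entries: 1, -1/2!, 1/4!, ...), with small
 * adjustments to the exact factorials as architected; the cosine half
 * is selected by adding 8 to the state index when M is negative.
 */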
5060 * FP Complex Add
5063 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5064 void *vs, uint32_t desc)
5066 intptr_t j, i = simd_oprsz(desc);
5067 uint64_t *g = vg;
5068 float16 neg_imag = float16_set_sign(0, simd_data(desc));
5069 float16 neg_real = float16_chs(neg_imag);
5071 do {
5072 uint64_t pg = g[(i - 1) >> 6];
5073 do {
5074 float16 e0, e1, e2, e3;
5076 /* I holds the real index; J holds the imag index. */
5077 j = i - sizeof(float16);
5078 i -= 2 * sizeof(float16);
5080 e0 = *(float16 *)(vn + H1_2(i));
5081 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
5082 e2 = *(float16 *)(vn + H1_2(j));
5083 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
5085 if (likely((pg >> (i & 63)) & 1)) {
5086 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
5088 if (likely((pg >> (j & 63)) & 1)) {
5089 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
5091 } while (i & 63);
5092 } while (i != 0);
5095 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5096 void *vs, uint32_t desc)
5098 intptr_t j, i = simd_oprsz(desc);
5099 uint64_t *g = vg;
5100 float32 neg_imag = float32_set_sign(0, simd_data(desc));
5101 float32 neg_real = float32_chs(neg_imag);
5103 do {
5104 uint64_t pg = g[(i - 1) >> 6];
5105 do {
5106 float32 e0, e1, e2, e3;
5108 /* I holds the real index; J holds the imag index. */
5109 j = i - sizeof(float32);
5110 i -= 2 * sizeof(float32);
5112 e0 = *(float32 *)(vn + H1_2(i));
5113 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
5114 e2 = *(float32 *)(vn + H1_2(j));
5115 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
5117 if (likely((pg >> (i & 63)) & 1)) {
5118 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
5120 if (likely((pg >> (j & 63)) & 1)) {
5121 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
5123 } while (i & 63);
5124 } while (i != 0);
5127 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5128 void *vs, uint32_t desc)
5130 intptr_t j, i = simd_oprsz(desc);
5131 uint64_t *g = vg;
5132 float64 neg_imag = float64_set_sign(0, simd_data(desc));
5133 float64 neg_real = float64_chs(neg_imag);
5135 do {
5136 uint64_t pg = g[(i - 1) >> 6];
5137 do {
5138 float64 e0, e1, e2, e3;
5140 /* I holds the real index; J holds the imag index. */
5141 j = i - sizeof(float64);
5142 i -= 2 * sizeof(float64);
5144 e0 = *(float64 *)(vn + H1_2(i));
5145 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
5146 e2 = *(float64 *)(vn + H1_2(j));
5147 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
5149 if (likely((pg >> (i & 63)) & 1)) {
5150 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
5152 if (likely((pg >> (j & 63)) & 1)) {
5153 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
5155 } while (i & 63);
5156 } while (i != 0);
5160 * FP Complex Multiply
5163 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5164 void *vg, void *status, uint32_t desc)
5166 intptr_t j, i = simd_oprsz(desc);
5167 unsigned rot = simd_data(desc);
5168 bool flip = rot & 1;
5169 float16 neg_imag, neg_real;
5170 uint64_t *g = vg;
5172 neg_imag = float16_set_sign(0, (rot & 2) != 0);
5173 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
5175 do {
5176 uint64_t pg = g[(i - 1) >> 6];
5177 do {
5178 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5180 /* I holds the real index; J holds the imag index. */
5181 j = i - sizeof(float16);
5182 i -= 2 * sizeof(float16);
5184 nr = *(float16 *)(vn + H1_2(i));
5185 ni = *(float16 *)(vn + H1_2(j));
5186 mr = *(float16 *)(vm + H1_2(i));
5187 mi = *(float16 *)(vm + H1_2(j));
5189 e2 = (flip ? ni : nr);
5190 e1 = (flip ? mi : mr) ^ neg_real;
5191 e4 = e2;
5192 e3 = (flip ? mr : mi) ^ neg_imag;
5194 if (likely((pg >> (i & 63)) & 1)) {
5195 d = *(float16 *)(va + H1_2(i));
5196 d = float16_muladd(e2, e1, d, 0, status);
5197 *(float16 *)(vd + H1_2(i)) = d;
5199 if (likely((pg >> (j & 63)) & 1)) {
5200 d = *(float16 *)(va + H1_2(j));
5201 d = float16_muladd(e4, e3, d, 0, status);
5202 *(float16 *)(vd + H1_2(j)) = d;
5204 } while (i & 63);
5205 } while (i != 0);
5208 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5209 void *vg, void *status, uint32_t desc)
5211 intptr_t j, i = simd_oprsz(desc);
5212 unsigned rot = simd_data(desc);
5213 bool flip = rot & 1;
5214 float32 neg_imag, neg_real;
5215 uint64_t *g = vg;
5217 neg_imag = float32_set_sign(0, (rot & 2) != 0);
5218 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
5220 do {
5221 uint64_t pg = g[(i - 1) >> 6];
5222 do {
5223 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5225 /* I holds the real index; J holds the imag index. */
5226 j = i - sizeof(float32);
5227 i -= 2 * sizeof(float32);
5229 nr = *(float32 *)(vn + H1_2(i));
5230 ni = *(float32 *)(vn + H1_2(j));
5231 mr = *(float32 *)(vm + H1_2(i));
5232 mi = *(float32 *)(vm + H1_2(j));
5234 e2 = (flip ? ni : nr);
5235 e1 = (flip ? mi : mr) ^ neg_real;
5236 e4 = e2;
5237 e3 = (flip ? mr : mi) ^ neg_imag;
5239 if (likely((pg >> (i & 63)) & 1)) {
5240 d = *(float32 *)(va + H1_2(i));
5241 d = float32_muladd(e2, e1, d, 0, status);
5242 *(float32 *)(vd + H1_2(i)) = d;
5244 if (likely((pg >> (j & 63)) & 1)) {
5245 d = *(float32 *)(va + H1_2(j));
5246 d = float32_muladd(e4, e3, d, 0, status);
5247 *(float32 *)(vd + H1_2(j)) = d;
5249 } while (i & 63);
5250 } while (i != 0);
5253 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5254 void *vg, void *status, uint32_t desc)
5256 intptr_t j, i = simd_oprsz(desc);
5257 unsigned rot = simd_data(desc);
5258 bool flip = rot & 1;
5259 float64 neg_imag, neg_real;
5260 uint64_t *g = vg;
5262 neg_imag = float64_set_sign(0, (rot & 2) != 0);
5263 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
5265 do {
5266 uint64_t pg = g[(i - 1) >> 6];
5267 do {
5268 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5270 /* I holds the real index; J holds the imag index. */
5271 j = i - sizeof(float64);
5272 i -= 2 * sizeof(float64);
5274 nr = *(float64 *)(vn + H1_2(i));
5275 ni = *(float64 *)(vn + H1_2(j));
5276 mr = *(float64 *)(vm + H1_2(i));
5277 mi = *(float64 *)(vm + H1_2(j));
5279 e2 = (flip ? ni : nr);
5280 e1 = (flip ? mi : mr) ^ neg_real;
5281 e4 = e2;
5282 e3 = (flip ? mr : mi) ^ neg_imag;
5284 if (likely((pg >> (i & 63)) & 1)) {
5285 d = *(float64 *)(va + H1_2(i));
5286 d = float64_muladd(e2, e1, d, 0, status);
5287 *(float64 *)(vd + H1_2(i)) = d;
5289 if (likely((pg >> (j & 63)) & 1)) {
5290 d = *(float64 *)(va + H1_2(j));
5291 d = float64_muladd(e4, e3, d, 0, status);
5292 *(float64 *)(vd + H1_2(j)) = d;
5294 } while (i & 63);
5295 } while (i != 0);
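/*
 * A worked summary of the rotation encoding used by the three helpers
 * above (rot = 0..3 in 90-degree steps): FLIP selects whether the real
 * or imaginary part of N feeds both multiplies, and neg_real/neg_imag
 * flip the sign of the M inputs, giving:
 *
 *   rot 0:  d.re += n.re * m.re;   d.im += n.re * m.im
 *   rot 1:  d.re -= n.im * m.im;   d.im += n.im * m.re
 *   rot 2:  d.re -= n.re * m.re;   d.im -= n.re * m.im
 *   rot 3:  d.re += n.im * m.im;   d.im -= n.im * m.re
 */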
5299 * Load contiguous data, protected by a governing predicate.
5303 * Load one element into @vd + @reg_off from @host.
5304 * The controlling predicate is known to be true.
5306 typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);
5309 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
5310 * The controlling predicate is known to be true.
5312 typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
5313 target_ulong vaddr, uintptr_t retaddr);
5316 * Generate the above primitives.
5319 #define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
5320 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
5322 TYPEM val = HOST(host); \
5323 *(TYPEE *)(vd + H(reg_off)) = val; \
5326 #define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
5327 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
5328 { HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }
5330 #define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
5331 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
5332 target_ulong addr, uintptr_t ra) \
5334 *(TYPEE *)(vd + H(reg_off)) = \
5335 (TYPEM)TLB(env, useronly_clean_ptr(addr), ra); \
5338 #define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
5339 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
5340 target_ulong addr, uintptr_t ra) \
5342 TLB(env, useronly_clean_ptr(addr), \
5343 (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \
5346 #define DO_LD_PRIM_1(NAME, H, TE, TM) \
5347 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
5348 DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
5350 DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
5351 DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
5352 DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
5353 DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
5354 DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
5355 DO_LD_PRIM_1(ld1bdu, H1_8, uint64_t, uint8_t)
5356 DO_LD_PRIM_1(ld1bds, H1_8, uint64_t, int8_t)
5358 #define DO_ST_PRIM_1(NAME, H, TE, TM) \
5359 DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \
5360 DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
5362 DO_ST_PRIM_1(bb, H1, uint8_t, uint8_t)
5363 DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
5364 DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
5365 DO_ST_PRIM_1(bd, H1_8, uint64_t, uint8_t)
5367 #define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
5368 DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \
5369 DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \
5370 DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
5371 DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
5373 #define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
5374 DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p) \
5375 DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p) \
5376 DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
5377 DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
5379 DO_LD_PRIM_2(hh, H1_2, uint16_t, uint16_t, lduw)
5380 DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
5381 DO_LD_PRIM_2(hss, H1_4, uint32_t, int16_t, lduw)
5382 DO_LD_PRIM_2(hdu, H1_8, uint64_t, uint16_t, lduw)
5383 DO_LD_PRIM_2(hds, H1_8, uint64_t, int16_t, lduw)
5385 DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
5386 DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
5387 DO_ST_PRIM_2(hd, H1_8, uint64_t, uint16_t, stw)
5389 DO_LD_PRIM_2(ss, H1_4, uint32_t, uint32_t, ldl)
5390 DO_LD_PRIM_2(sdu, H1_8, uint64_t, uint32_t, ldl)
5391 DO_LD_PRIM_2(sds, H1_8, uint64_t, int32_t, ldl)
5393 DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
5394 DO_ST_PRIM_2(sd, H1_8, uint64_t, uint32_t, stl)
5396 DO_LD_PRIM_2(dd, H1_8, uint64_t, uint64_t, ldq)
5397 DO_ST_PRIM_2(dd, H1_8, uint64_t, uint64_t, stq)
5399 #undef DO_LD_TLB
5400 #undef DO_ST_TLB
5401 #undef DO_LD_HOST
5402 #undef DO_LD_PRIM_1
5403 #undef DO_ST_PRIM_1
5404 #undef DO_LD_PRIM_2
5405 #undef DO_ST_PRIM_2
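/*
 * As a concrete illustration of the macros above, the single line
 * DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t) expands to roughly the
 * following pair of functions (shown under #if 0 only, since the real
 * definitions are already generated above):
 */
#if 0
static void sve_ld1bhu_host(void *vd, intptr_t reg_off, void *host)
{
    uint8_t val = ldub_p(host);
    *(uint16_t *)(vd + H1_2(reg_off)) = val;
}

static void sve_ld1bhu_tlb(CPUARMState *env, void *vd, intptr_t reg_off,
                           target_ulong addr, uintptr_t ra)
{
    *(uint16_t *)(vd + H1_2(reg_off)) =
        (uint8_t)cpu_ldub_data_ra(env, useronly_clean_ptr(addr), ra);
}
#endif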
5408 * Skip through a sequence of inactive elements in the guarding predicate @vg,
5409 * beginning at @reg_off bounded by @reg_max. Return the offset of the first
5410 * active element >= @reg_off, or @reg_max if there is no such active element.
5412 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5413 intptr_t reg_max, int esz)
5415 uint64_t pg_mask = pred_esz_masks[esz];
5416 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5418 /* In normal usage, the first element is active. */
5419 if (likely(pg & 1)) {
5420 return reg_off;
5423 if (pg == 0) {
5424 reg_off &= -64;
5425 do {
5426 reg_off += 64;
5427 if (unlikely(reg_off >= reg_max)) {
5428 /* The entire predicate was false. */
5429 return reg_max;
5431 pg = vg[reg_off >> 6] & pg_mask;
5432 } while (pg == 0);
5434 reg_off += ctz64(pg);
5436 /* We should never see an out of range predicate bit set. */
5437 tcg_debug_assert(reg_off < reg_max);
5438 return reg_off;
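/*
 * A minimal usage sketch (with placeholder variables vg, reg_max and esz;
 * not one of the real callers below): visiting every active element in
 * ascending order with find_next_active().
 */
#if 0
{
    intptr_t reg_off = find_next_active(vg, 0, reg_max, esz);

    while (reg_off < reg_max) {
        /* ... operate on the element at offset reg_off ... */
        reg_off += 1 << esz;
        if (reg_off < reg_max) {
            reg_off = find_next_active(vg, reg_off, reg_max, esz);
        }
    }
}
#endif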
5442 * Resolve the guest virtual address to info->host and info->flags.
5443 * If @nofault, return false if the page is invalid, otherwise
5444 * exit via page fault exception.
5447 typedef struct {
5448 void *host;
5449 int flags;
5450 MemTxAttrs attrs;
5451 } SVEHostPage;
5453 static bool sve_probe_page(SVEHostPage *info, bool nofault,
5454 CPUARMState *env, target_ulong addr,
5455 int mem_off, MMUAccessType access_type,
5456 int mmu_idx, uintptr_t retaddr)
5458 int flags;
5460 addr += mem_off;
5463 * User-only currently always issues with TBI. See the comment
5464 * above useronly_clean_ptr. Usually we clean this top byte away
5465 * during translation, but we can't do that for e.g. vector + imm
5466 * addressing modes.
5468 * We currently always enable TBI for user-only, and do not provide
5469 * a way to turn it off. So clean the pointer unconditionally here,
5470 * rather than look it up here, or pass it down from above.
5472 addr = useronly_clean_ptr(addr);
5474 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
5475 &info->host, retaddr);
5476 info->flags = flags;
5478 if (flags & TLB_INVALID_MASK) {
5479 g_assert(nofault);
5480 return false;
5483 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5484 info->host -= mem_off;
5486 #ifdef CONFIG_USER_ONLY
5487 memset(&info->attrs, 0, sizeof(info->attrs));
5488 #else
5490 * Find the iotlbentry for addr and return the transaction attributes.
5491 * This *must* be present in the TLB because we just found the mapping.
5494 uintptr_t index = tlb_index(env, mmu_idx, addr);
5496 # ifdef CONFIG_DEBUG_TCG
5497 CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
5498 target_ulong comparator = (access_type == MMU_DATA_LOAD
5499 ? entry->addr_read
5500 : tlb_addr_write(entry));
5501 g_assert(tlb_hit(comparator, addr));
5502 # endif
5504 CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
5505 info->attrs = iotlbentry->attrs;
5507 #endif
5509 return true;
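/*
 * A minimal usage sketch of sve_probe_page(), with placeholder variables
 * (env, addr, mem_off, retaddr), mirroring the callers below: probe
 * without allowing faults, then either access RAM through the host
 * pointer or fall back to the slow/no-fault path.
 */
#if 0
{
    SVEHostPage pg;

    if (!sve_probe_page(&pg, true, env, addr, mem_off, MMU_DATA_LOAD,
                        cpu_mmu_index(env, false), retaddr)) {
        /* Invalid page and nofault was set: no access is performed. */
    } else if (pg.flags == 0) {
        /* Plain RAM: pg.host + mem_off addresses the element directly. */
    } else {
        /* MMIO, watchpoint, etc.: use the tlb (slow) accessors instead. */
    }
}
#endif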
5514 * Analyse contiguous data, protected by a governing predicate.
5517 typedef enum {
5518 FAULT_NO,
5519 FAULT_FIRST,
5520 FAULT_ALL,
5521 } SVEContFault;
5523 typedef struct {
5525 * First and last element wholly contained within the two pages.
5526 * mem_off_first[0] and reg_off_first[0] are always set >= 0.
5527 * reg_off_last[0] may be < 0 if the first element crosses pages.
5528 * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
5529 * are set >= 0 only if there are complete elements on a second page.
5531 * The reg_off_* offsets are relative to the internal vector register.
5532 * The mem_off_first offset is relative to the memory address; the
5533 * two offsets differ when a load zero/sign-extends, a store truncates,
5534 * or for multi-register operations.
5536 int16_t mem_off_first[2];
5537 int16_t reg_off_first[2];
5538 int16_t reg_off_last[2];
5541 * One element that is misaligned and spans both pages,
5542 * or -1 if there is no such active element.
5544 int16_t mem_off_split;
5545 int16_t reg_off_split;
5548 * The byte offset at which the entire operation crosses a page boundary.
5549 * Set >= 0 if and only if the entire operation spans two pages.
5551 int16_t page_split;
5553 /* TLB data for the two pages. */
5554 SVEHostPage page[2];
5555 } SVEContLdSt;
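/*
 * A worked example of the fields above: a 32-byte operation on 4-byte
 * elements (esz == 2, msize == 4, e.g. ld1ss) with every element active
 * and only 12 bytes left before the next page boundary is described as
 *
 *   page_split    = 12               (bytes remaining on the first page)
 *   reg_off_first = { 0, 12 }        reg_off_last = { 8, 28 }
 *   mem_off_first = { 0, 12 }
 *   reg_off_split = mem_off_split = -1   (the boundary is element aligned)
 */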
5558 * Find first active element on each page, and a loose bound for the
5559 * final element on each page. Identify any single element that spans
5560 * the page boundary. Return true if there are any active elements.
5562 static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
5563 uint64_t *vg, intptr_t reg_max,
5564 int esz, int msize)
5566 const int esize = 1 << esz;
5567 const uint64_t pg_mask = pred_esz_masks[esz];
5568 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5569 intptr_t mem_off_last, mem_off_split;
5570 intptr_t page_split, elt_split;
5571 intptr_t i;
5573 /* Set all of the element indices to -1, and the TLB data to 0. */
5574 memset(info, -1, offsetof(SVEContLdSt, page));
5575 memset(info->page, 0, sizeof(info->page));
5577 /* Gross scan over the entire predicate to find bounds. */
5578 i = 0;
5579 do {
5580 uint64_t pg = vg[i] & pg_mask;
5581 if (pg) {
5582 reg_off_last = i * 64 + 63 - clz64(pg);
5583 if (reg_off_first < 0) {
5584 reg_off_first = i * 64 + ctz64(pg);
5587 } while (++i * 64 < reg_max);
5589 if (unlikely(reg_off_first < 0)) {
5590 /* No active elements, no pages touched. */
5591 return false;
5593 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5595 info->reg_off_first[0] = reg_off_first;
5596 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5597 mem_off_last = (reg_off_last >> esz) * msize;
5599 page_split = -(addr | TARGET_PAGE_MASK);
5600 if (likely(mem_off_last + msize <= page_split)) {
5601 /* The entire operation fits within a single page. */
5602 info->reg_off_last[0] = reg_off_last;
5603 return true;
5606 info->page_split = page_split;
5607 elt_split = page_split / msize;
5608 reg_off_split = elt_split << esz;
5609 mem_off_split = elt_split * msize;
5612 * This is the last full element on the first page, but it is not
5613 * necessarily active. If there is no full element, i.e. the first
5614 * active element is the one that's split, this value remains -1.
5615 * It is useful as an iteration bound.
5617 if (elt_split != 0) {
5618 info->reg_off_last[0] = reg_off_split - esize;
5621 /* Determine if an unaligned element spans the pages. */
5622 if (page_split % msize != 0) {
5623 /* It is helpful to know if the split element is active. */
5624 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5625 info->reg_off_split = reg_off_split;
5626 info->mem_off_split = mem_off_split;
5628 if (reg_off_split == reg_off_last) {
5629 /* The page crossing element is last. */
5630 return true;
5633 reg_off_split += esize;
5634 mem_off_split += msize;
5638 * We do want the first active element on the second page, because
5639 * this may affect the address reported in an exception.
5641 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5642 tcg_debug_assert(reg_off_split <= reg_off_last);
5643 info->reg_off_first[1] = reg_off_split;
5644 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5645 info->reg_off_last[1] = reg_off_last;
5646 return true;
5650 * Resolve the guest virtual addresses to info->page[].
5651 * Control the generation of page faults with @fault. Return false if
5652 * there is no work to do, which can only happen with @fault == FAULT_NO.
5654 static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5655 CPUARMState *env, target_ulong addr,
5656 MMUAccessType access_type, uintptr_t retaddr)
5658 int mmu_idx = cpu_mmu_index(env, false);
5659 int mem_off = info->mem_off_first[0];
5660 bool nofault = fault == FAULT_NO;
5661 bool have_work = true;
5663 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5664 access_type, mmu_idx, retaddr)) {
5665 /* No work to be done. */
5666 return false;
5669 if (likely(info->page_split < 0)) {
5670 /* The entire operation was on the one page. */
5671 return true;
5675 * If the second page is invalid, then we want the fault address to be
5676 * the first byte on that page which is accessed.
5678 if (info->mem_off_split >= 0) {
5680 * There is an element split across the pages. The fault address
5681 * should be the first byte of the second page.
5683 mem_off = info->page_split;
5685 * If the split element is also the first active element
5686 * of the vector, then: For first-fault we should continue
5687 * to generate faults for the second page. For no-fault,
5688 * we have work only if the second page is valid.
5690 if (info->mem_off_first[0] < info->mem_off_split) {
5691 nofault = FAULT_FIRST;
5692 have_work = false;
5694 } else {
5696 * There is no element split across the pages. The fault address
5697 * should be the first active element on the second page.
5699 mem_off = info->mem_off_first[1];
5701 * There must have been one active element on the first page,
5702 * so we're out of first-fault territory.
5704 nofault = fault != FAULT_ALL;
5707 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5708 access_type, mmu_idx, retaddr);
5709 return have_work;
5712 static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5713 uint64_t *vg, target_ulong addr,
5714 int esize, int msize, int wp_access,
5715 uintptr_t retaddr)
5717 #ifndef CONFIG_USER_ONLY
5718 intptr_t mem_off, reg_off, reg_last;
5719 int flags0 = info->page[0].flags;
5720 int flags1 = info->page[1].flags;
5722 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5723 return;
5726 /* Indicate that watchpoints are handled. */
5727 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5728 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5730 if (flags0 & TLB_WATCHPOINT) {
5731 mem_off = info->mem_off_first[0];
5732 reg_off = info->reg_off_first[0];
5733 reg_last = info->reg_off_last[0];
5735 while (reg_off <= reg_last) {
5736 uint64_t pg = vg[reg_off >> 6];
5737 do {
5738 if ((pg >> (reg_off & 63)) & 1) {
5739 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5740 msize, info->page[0].attrs,
5741 wp_access, retaddr);
5743 reg_off += esize;
5744 mem_off += msize;
5745 } while (reg_off <= reg_last && (reg_off & 63));
5749 mem_off = info->mem_off_split;
5750 if (mem_off >= 0) {
5751 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5752 info->page[0].attrs, wp_access, retaddr);
5755 mem_off = info->mem_off_first[1];
5756 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5757 reg_off = info->reg_off_first[1];
5758 reg_last = info->reg_off_last[1];
5760 do {
5761 uint64_t pg = vg[reg_off >> 6];
5762 do {
5763 if ((pg >> (reg_off & 63)) & 1) {
5764 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5765 msize, info->page[1].attrs,
5766 wp_access, retaddr);
5768 reg_off += esize;
5769 mem_off += msize;
5770 } while (reg_off & 63);
5771 } while (reg_off <= reg_last);
5773 #endif
5776 static void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5777 uint64_t *vg, target_ulong addr, int esize,
5778 int msize, uint32_t mtedesc, uintptr_t ra)
5780 intptr_t mem_off, reg_off, reg_last;
5782 /* Process the page only if MemAttr == Tagged. */
5783 if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
5784 mem_off = info->mem_off_first[0];
5785 reg_off = info->reg_off_first[0];
5786 reg_last = info->reg_off_split;
5787 if (reg_last < 0) {
5788 reg_last = info->reg_off_last[0];
5791 do {
5792 uint64_t pg = vg[reg_off >> 6];
5793 do {
5794 if ((pg >> (reg_off & 63)) & 1) {
5795 mte_check(env, mtedesc, addr, ra);
5797 reg_off += esize;
5798 mem_off += msize;
5799 } while (reg_off <= reg_last && (reg_off & 63));
5800 } while (reg_off <= reg_last);
5803 mem_off = info->mem_off_first[1];
5804 if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
5805 reg_off = info->reg_off_first[1];
5806 reg_last = info->reg_off_last[1];
5808 do {
5809 uint64_t pg = vg[reg_off >> 6];
5810 do {
5811 if ((pg >> (reg_off & 63)) & 1) {
5812 mte_check(env, mtedesc, addr, ra);
5814 reg_off += esize;
5815 mem_off += msize;
5816 } while (reg_off & 63);
5817 } while (reg_off <= reg_last);
5822 * Common helper for all contiguous 1,2,3,4-register predicated loads.
5824 static inline QEMU_ALWAYS_INLINE
5825 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5826 uint32_t desc, const uintptr_t retaddr,
5827 const int esz, const int msz, const int N, uint32_t mtedesc,
5828 sve_ldst1_host_fn *host_fn,
5829 sve_ldst1_tlb_fn *tlb_fn)
5831 const unsigned rd = simd_data(desc);
5832 const intptr_t reg_max = simd_oprsz(desc);
5833 intptr_t reg_off, reg_last, mem_off;
5834 SVEContLdSt info;
5835 void *host;
5836 int flags, i;
5838 /* Find the active elements. */
5839 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5840 /* The entire predicate was false; no load occurs. */
5841 for (i = 0; i < N; ++i) {
5842 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5844 return;
5847 /* Probe the page(s). Exit with exception for any invalid page. */
5848 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5850 /* Handle watchpoints for all active elements. */
5851 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5852 BP_MEM_READ, retaddr);
5855 * Handle mte checks for all active elements.
5856 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5858 if (mtedesc) {
5859 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5860 mtedesc, retaddr);
5863 flags = info.page[0].flags | info.page[1].flags;
5864 if (unlikely(flags != 0)) {
5865 #ifdef CONFIG_USER_ONLY
5866 g_assert_not_reached();
5867 #else
5869 * At least one page includes MMIO.
5870 * Any bus operation can fail with cpu_transaction_failed,
5871 * which for ARM will raise SyncExternal. Perform the load
5872 * into scratch memory to preserve register state until the end.
5874 ARMVectorReg scratch[4] = { };
5876 mem_off = info.mem_off_first[0];
5877 reg_off = info.reg_off_first[0];
5878 reg_last = info.reg_off_last[1];
5879 if (reg_last < 0) {
5880 reg_last = info.reg_off_split;
5881 if (reg_last < 0) {
5882 reg_last = info.reg_off_last[0];
5886 do {
5887 uint64_t pg = vg[reg_off >> 6];
5888 do {
5889 if ((pg >> (reg_off & 63)) & 1) {
5890 for (i = 0; i < N; ++i) {
5891 tlb_fn(env, &scratch[i], reg_off,
5892 addr + mem_off + (i << msz), retaddr);
5895 reg_off += 1 << esz;
5896 mem_off += N << msz;
5897 } while (reg_off & 63);
5898 } while (reg_off <= reg_last);
5900 for (i = 0; i < N; ++i) {
5901 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5903 return;
5904 #endif
5907 /* The entire operation is in RAM, on valid pages. */
5909 for (i = 0; i < N; ++i) {
5910 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5913 mem_off = info.mem_off_first[0];
5914 reg_off = info.reg_off_first[0];
5915 reg_last = info.reg_off_last[0];
5916 host = info.page[0].host;
5918 while (reg_off <= reg_last) {
5919 uint64_t pg = vg[reg_off >> 6];
5920 do {
5921 if ((pg >> (reg_off & 63)) & 1) {
5922 for (i = 0; i < N; ++i) {
5923 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5924 host + mem_off + (i << msz));
5927 reg_off += 1 << esz;
5928 mem_off += N << msz;
5929 } while (reg_off <= reg_last && (reg_off & 63));
5933 * Use the slow path to manage the cross-page misalignment.
5934 * But we know this is RAM and cannot trap.
5936 mem_off = info.mem_off_split;
5937 if (unlikely(mem_off >= 0)) {
5938 reg_off = info.reg_off_split;
5939 for (i = 0; i < N; ++i) {
5940 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5941 addr + mem_off + (i << msz), retaddr);
5945 mem_off = info.mem_off_first[1];
5946 if (unlikely(mem_off >= 0)) {
5947 reg_off = info.reg_off_first[1];
5948 reg_last = info.reg_off_last[1];
5949 host = info.page[1].host;
5951 do {
5952 uint64_t pg = vg[reg_off >> 6];
5953 do {
5954 if ((pg >> (reg_off & 63)) & 1) {
5955 for (i = 0; i < N; ++i) {
5956 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5957 host + mem_off + (i << msz));
5960 reg_off += 1 << esz;
5961 mem_off += N << msz;
5962 } while (reg_off & 63);
5963 } while (reg_off <= reg_last);
5967 static inline QEMU_ALWAYS_INLINE
5968 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5969 uint32_t desc, const uintptr_t ra,
5970 const int esz, const int msz, const int N,
5971 sve_ldst1_host_fn *host_fn,
5972 sve_ldst1_tlb_fn *tlb_fn)
5974 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5975 int bit55 = extract64(addr, 55, 1);
5977 /* Remove mtedesc from the normal sve descriptor. */
5978 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5980 /* Perform gross MTE suppression early. */
5981 if (!tbi_check(desc, bit55) ||
5982 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5983 mtedesc = 0;
5986 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
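/*
 * For reference, the two extractions above imply the following layout of
 * the descriptor used by the *_mte entry points:
 *
 *   low  SIMD_DATA_SHIFT bits        simd_desc oprsz/maxsz fields
 *   next SVE_MTEDESC_SHIFT bits      SVE data (simd_data; here the
 *                                    destination register number)
 *   remaining high bits              mtedesc for the MTE checks
 */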
5989 #define DO_LD1_1(NAME, ESZ) \
5990 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
5991 target_ulong addr, uint32_t desc) \
5993 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
5994 sve_##NAME##_host, sve_##NAME##_tlb); \
5996 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
5997 target_ulong addr, uint32_t desc) \
5999 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
6000 sve_##NAME##_host, sve_##NAME##_tlb); \
6003 #define DO_LD1_2(NAME, ESZ, MSZ) \
6004 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
6005 target_ulong addr, uint32_t desc) \
6007 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
6008 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
6010 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
6011 target_ulong addr, uint32_t desc) \
6013 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
6014 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
6016 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6017 target_ulong addr, uint32_t desc) \
6019 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
6020 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
6022 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6023 target_ulong addr, uint32_t desc) \
6025 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
6026 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
6029 DO_LD1_1(ld1bb, MO_8)
6030 DO_LD1_1(ld1bhu, MO_16)
6031 DO_LD1_1(ld1bhs, MO_16)
6032 DO_LD1_1(ld1bsu, MO_32)
6033 DO_LD1_1(ld1bss, MO_32)
6034 DO_LD1_1(ld1bdu, MO_64)
6035 DO_LD1_1(ld1bds, MO_64)
6037 DO_LD1_2(ld1hh, MO_16, MO_16)
6038 DO_LD1_2(ld1hsu, MO_32, MO_16)
6039 DO_LD1_2(ld1hss, MO_32, MO_16)
6040 DO_LD1_2(ld1hdu, MO_64, MO_16)
6041 DO_LD1_2(ld1hds, MO_64, MO_16)
6043 DO_LD1_2(ld1ss, MO_32, MO_32)
6044 DO_LD1_2(ld1sdu, MO_64, MO_32)
6045 DO_LD1_2(ld1sds, MO_64, MO_32)
6047 DO_LD1_2(ld1dd, MO_64, MO_64)
6049 #undef DO_LD1_1
6050 #undef DO_LD1_2
6052 #define DO_LDN_1(N) \
6053 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
6054 target_ulong addr, uint32_t desc) \
6056 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
6057 sve_ld1bb_host, sve_ld1bb_tlb); \
6059 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
6060 target_ulong addr, uint32_t desc) \
6062 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
6063 sve_ld1bb_host, sve_ld1bb_tlb); \
6066 #define DO_LDN_2(N, SUFF, ESZ) \
6067 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
6068 target_ulong addr, uint32_t desc) \
6070 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
6071 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
6073 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
6074 target_ulong addr, uint32_t desc) \
6076 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
6077 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
6079 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
6080 target_ulong addr, uint32_t desc) \
6082 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
6083 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
6085 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
6086 target_ulong addr, uint32_t desc) \
6088 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
6089 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
6092 DO_LDN_1(2)
6093 DO_LDN_1(3)
6094 DO_LDN_1(4)
6096 DO_LDN_2(2, hh, MO_16)
6097 DO_LDN_2(3, hh, MO_16)
6098 DO_LDN_2(4, hh, MO_16)
6100 DO_LDN_2(2, ss, MO_32)
6101 DO_LDN_2(3, ss, MO_32)
6102 DO_LDN_2(4, ss, MO_32)
6104 DO_LDN_2(2, dd, MO_64)
6105 DO_LDN_2(3, dd, MO_64)
6106 DO_LDN_2(4, dd, MO_64)
6108 #undef DO_LDN_1
6109 #undef DO_LDN_2
6112 * Load contiguous data, first-fault and no-fault.
6114 * For user-only, one could argue that we should hold the mmap_lock during
6115 * the operation so that there is no race between page_check_range and the
6116 * load operation. However, unmapping pages out from under a running thread
6117 * is extraordinarily unlikely. This theoretical race condition also affects
6118 * linux-user/ in its get_user/put_user macros.
6120 * TODO: Construct some helpers, written in assembly, that interact with
6121 * handle_cpu_signal to produce memory ops which can properly report errors
6122 * without racing.
6125 /* Fault on byte I. All bits in FFR from I are cleared. The vector
6126 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
6127 * option, which leaves subsequent data unchanged.
6129 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
6131 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
6133 if (i & 63) {
6134 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
6135 i = ROUND_UP(i, 64);
6137 for (; i < oprsz; i += 64) {
6138 ffr[i / 64] = 0;
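/*
 * For example, with oprsz == 128 and a fault recorded at byte i == 20,
 * ffr[0] keeps only bits 0..19 and ffr[1] is cleared, so every element at
 * or beyond byte offset 20 is flagged as not loaded.
 */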
6143 * Common helper for all contiguous no-fault and first-fault loads.
6145 static inline QEMU_ALWAYS_INLINE
6146 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
6147 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
6148 const int esz, const int msz, const SVEContFault fault,
6149 sve_ldst1_host_fn *host_fn,
6150 sve_ldst1_tlb_fn *tlb_fn)
6152 const unsigned rd = simd_data(desc);
6153 void *vd = &env->vfp.zregs[rd];
6154 const intptr_t reg_max = simd_oprsz(desc);
6155 intptr_t reg_off, mem_off, reg_last;
6156 SVEContLdSt info;
6157 int flags;
6158 void *host;
6160 /* Find the active elements. */
6161 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
6162 /* The entire predicate was false; no load occurs. */
6163 memset(vd, 0, reg_max);
6164 return;
6166 reg_off = info.reg_off_first[0];
6168 /* Probe the page(s). */
6169 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
6170 /* Fault on first element. */
6171 tcg_debug_assert(fault == FAULT_NO);
6172 memset(vd, 0, reg_max);
6173 goto do_fault;
6176 mem_off = info.mem_off_first[0];
6177 flags = info.page[0].flags;
6180 * Disable MTE checking if the Tagged bit is not set. Since TBI must
6181 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6183 if (!arm_tlb_mte_tagged(&info.page[0].attrs)) {
6184 mtedesc = 0;
6187 if (fault == FAULT_FIRST) {
6188 /* Trapping mte check for the first-fault element. */
6189 if (mtedesc) {
6190 mte_check(env, mtedesc, addr + mem_off, retaddr);
6194 * Special handling of the first active element,
6195 * if it crosses a page boundary or is MMIO.
6197 bool is_split = mem_off == info.mem_off_split;
6198 if (unlikely(flags != 0) || unlikely(is_split)) {
6200 * Use the slow path for cross-page handling.
6201 * Might trap for MMIO or watchpoints.
6203 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6205 /* After any fault, zero the other elements. */
6206 swap_memzero(vd, reg_off);
6207 reg_off += 1 << esz;
6208 mem_off += 1 << msz;
6209 swap_memzero(vd + reg_off, reg_max - reg_off);
6211 if (is_split) {
6212 goto second_page;
6214 } else {
6215 memset(vd, 0, reg_max);
6217 } else {
6218 memset(vd, 0, reg_max);
6219 if (unlikely(mem_off == info.mem_off_split)) {
6220 /* The first active element crosses a page boundary. */
6221 flags |= info.page[1].flags;
6222 if (unlikely(flags & TLB_MMIO)) {
6223 /* Some page is MMIO, see below. */
6224 goto do_fault;
6226 if (unlikely(flags & TLB_WATCHPOINT) &&
6227 (cpu_watchpoint_address_matches
6228 (env_cpu(env), addr + mem_off, 1 << msz)
6229 & BP_MEM_READ)) {
6230 /* Watchpoint hit, see below. */
6231 goto do_fault;
6233 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6234 goto do_fault;
6237 * Use the slow path for cross-page handling.
6238 * This is RAM, without a watchpoint, and will not trap.
6240 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6241 goto second_page;
6246 * From this point on, all memory operations are MemSingleNF.
6248 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6249 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6251 * Unfortunately we do not have access to the memory attributes from the
6252 * PTE to tell Device memory from Normal memory. So we make a mostly
6253 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6254 * This gives the right answer for the common cases of "Normal memory,
6255 * backed by host RAM" and "Device memory, backed by MMIO".
6256 * The architecture allows us to suppress an NF load and return
6257 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6258 * case of "Normal memory, backed by MMIO" is permitted. The case we
6259 * get wrong is "Device memory, backed by host RAM", for which we
6260 * should return (UNKNOWN, FAULT) but do not.
6262 * Similarly, CPU_BP breakpoints would raise exceptions, and so
6263 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
6264 * architectural breakpoints the same.
6266 if (unlikely(flags & TLB_MMIO)) {
6267 goto do_fault;
6270 reg_last = info.reg_off_last[0];
6271 host = info.page[0].host;
6273 do {
6274 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6275 do {
6276 if ((pg >> (reg_off & 63)) & 1) {
6277 if (unlikely(flags & TLB_WATCHPOINT) &&
6278 (cpu_watchpoint_address_matches
6279 (env_cpu(env), addr + mem_off, 1 << msz)
6280 & BP_MEM_READ)) {
6281 goto do_fault;
6283 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6284 goto do_fault;
6286 host_fn(vd, reg_off, host + mem_off);
6288 reg_off += 1 << esz;
6289 mem_off += 1 << msz;
6290 } while (reg_off <= reg_last && (reg_off & 63));
6291 } while (reg_off <= reg_last);
6294 * MemSingleNF is allowed to fail for any reason. We have special
6295 * code above to handle the first element crossing a page boundary.
6296 * As an implementation choice, decline to handle a cross-page element
6297 * in any other position.
6299 reg_off = info.reg_off_split;
6300 if (reg_off >= 0) {
6301 goto do_fault;
6304 second_page:
6305 reg_off = info.reg_off_first[1];
6306 if (likely(reg_off < 0)) {
6307 /* No active elements on the second page. All done. */
6308 return;
6312 * MemSingleNF is allowed to fail for any reason. As an implementation
6313 * choice, decline to handle elements on the second page. This should
6314 * be low frequency as the guest walks through memory -- the next
6315 * iteration of the guest's loop should be aligned on the page boundary,
6316 * and then all following iterations will stay aligned.
6319 do_fault:
6320 record_fault(env, reg_off, reg_max);
6323 static inline QEMU_ALWAYS_INLINE
6324 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6325 uint32_t desc, const uintptr_t retaddr,
6326 const int esz, const int msz, const SVEContFault fault,
6327 sve_ldst1_host_fn *host_fn,
6328 sve_ldst1_tlb_fn *tlb_fn)
6330 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6331 int bit55 = extract64(addr, 55, 1);
6333 /* Remove mtedesc from the normal sve descriptor. */
6334 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6336 /* Perform gross MTE suppression early. */
6337 if (!tbi_check(desc, bit55) ||
6338 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6339 mtedesc = 0;
6342 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6343 esz, msz, fault, host_fn, tlb_fn);
6346 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
6347 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
6348 target_ulong addr, uint32_t desc) \
6350 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6351 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6353 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
6354 target_ulong addr, uint32_t desc) \
6356 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6357 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6359 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
6360 target_ulong addr, uint32_t desc) \
6362 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6363 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6365 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
6366 target_ulong addr, uint32_t desc) \
6368 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6369 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6372 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
6373 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
6374 target_ulong addr, uint32_t desc) \
6376 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6377 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6379 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
6380 target_ulong addr, uint32_t desc) \
6382 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6383 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6385 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
6386 target_ulong addr, uint32_t desc) \
6388 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6389 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6391 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
6392 target_ulong addr, uint32_t desc) \
6394 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6395 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6397 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6398 target_ulong addr, uint32_t desc) \
6400 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6401 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6403 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6404 target_ulong addr, uint32_t desc) \
6406 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6407 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6409 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6410 target_ulong addr, uint32_t desc) \
6412 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6413 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6415 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6416 target_ulong addr, uint32_t desc) \
6418 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6419 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6422 DO_LDFF1_LDNF1_1(bb, MO_8)
6423 DO_LDFF1_LDNF1_1(bhu, MO_16)
6424 DO_LDFF1_LDNF1_1(bhs, MO_16)
6425 DO_LDFF1_LDNF1_1(bsu, MO_32)
6426 DO_LDFF1_LDNF1_1(bss, MO_32)
6427 DO_LDFF1_LDNF1_1(bdu, MO_64)
6428 DO_LDFF1_LDNF1_1(bds, MO_64)
6430 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
6431 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6432 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6433 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6434 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6436 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
6437 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6438 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6440 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
6442 #undef DO_LDFF1_LDNF1_1
6443 #undef DO_LDFF1_LDNF1_2
6446 * Common helper for all contiguous 1,2,3,4-register predicated stores.
6449 static inline QEMU_ALWAYS_INLINE
6450 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6451 uint32_t desc, const uintptr_t retaddr,
6452 const int esz, const int msz, const int N, uint32_t mtedesc,
6453 sve_ldst1_host_fn *host_fn,
6454 sve_ldst1_tlb_fn *tlb_fn)
6456 const unsigned rd = simd_data(desc);
6457 const intptr_t reg_max = simd_oprsz(desc);
6458 intptr_t reg_off, reg_last, mem_off;
6459 SVEContLdSt info;
6460 void *host;
6461 int i, flags;
6463 /* Find the active elements. */
6464 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6465 /* The entire predicate was false; no store occurs. */
6466 return;
6469 /* Probe the page(s). Exit with exception for any invalid page. */
6470 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6472 /* Handle watchpoints for all active elements. */
6473 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6474 BP_MEM_WRITE, retaddr);
6477 * Handle mte checks for all active elements.
6478 * Since TBI must be set for MTE, !mtedesc => !mte_active.
6480 if (mtedesc) {
6481 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6482 mtedesc, retaddr);
6485 flags = info.page[0].flags | info.page[1].flags;
6486 if (unlikely(flags != 0)) {
6487 #ifdef CONFIG_USER_ONLY
6488 g_assert_not_reached();
6489 #else
6491 * At least one page includes MMIO.
6492 * Any bus operation can fail with cpu_transaction_failed,
6493 * which for ARM will raise SyncExternal. We cannot avoid
6494 * this fault and will leave with the store incomplete.
6496 mem_off = info.mem_off_first[0];
6497 reg_off = info.reg_off_first[0];
6498 reg_last = info.reg_off_last[1];
6499 if (reg_last < 0) {
6500 reg_last = info.reg_off_split;
6501 if (reg_last < 0) {
6502 reg_last = info.reg_off_last[0];
6506 do {
6507 uint64_t pg = vg[reg_off >> 6];
6508 do {
6509 if ((pg >> (reg_off & 63)) & 1) {
6510 for (i = 0; i < N; ++i) {
6511 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6512 addr + mem_off + (i << msz), retaddr);
6515 reg_off += 1 << esz;
6516 mem_off += N << msz;
6517 } while (reg_off & 63);
6518 } while (reg_off <= reg_last);
6519 return;
6520 #endif
6523 mem_off = info.mem_off_first[0];
6524 reg_off = info.reg_off_first[0];
6525 reg_last = info.reg_off_last[0];
6526 host = info.page[0].host;
6528 while (reg_off <= reg_last) {
6529 uint64_t pg = vg[reg_off >> 6];
6530 do {
6531 if ((pg >> (reg_off & 63)) & 1) {
6532 for (i = 0; i < N; ++i) {
6533 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6534 host + mem_off + (i << msz));
6537 reg_off += 1 << esz;
6538 mem_off += N << msz;
6539 } while (reg_off <= reg_last && (reg_off & 63));
6543 * Use the slow path to manage the cross-page misalignment.
6544 * But we know this is RAM and cannot trap.
6546 mem_off = info.mem_off_split;
6547 if (unlikely(mem_off >= 0)) {
6548 reg_off = info.reg_off_split;
6549 for (i = 0; i < N; ++i) {
6550 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6551 addr + mem_off + (i << msz), retaddr);
6555 mem_off = info.mem_off_first[1];
6556 if (unlikely(mem_off >= 0)) {
6557 reg_off = info.reg_off_first[1];
6558 reg_last = info.reg_off_last[1];
6559 host = info.page[1].host;
6561 do {
6562 uint64_t pg = vg[reg_off >> 6];
6563 do {
6564 if ((pg >> (reg_off & 63)) & 1) {
6565 for (i = 0; i < N; ++i) {
6566 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6567 host + mem_off + (i << msz));
6570 reg_off += 1 << esz;
6571 mem_off += N << msz;
6572 } while (reg_off & 63);
6573 } while (reg_off <= reg_last);
6577 static inline QEMU_ALWAYS_INLINE
6578 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6579 uint32_t desc, const uintptr_t ra,
6580 const int esz, const int msz, const int N,
6581 sve_ldst1_host_fn *host_fn,
6582 sve_ldst1_tlb_fn *tlb_fn)
6584 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6585 int bit55 = extract64(addr, 55, 1);
6587 /* Remove mtedesc from the normal sve descriptor. */
6588 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6590 /* Perform gross MTE suppression early. */
6591 if (!tbi_check(desc, bit55) ||
6592 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6593 mtedesc = 0;
6596 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6599 #define DO_STN_1(N, NAME, ESZ) \
6600 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
6601 target_ulong addr, uint32_t desc) \
6603 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
6604 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6606 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
6607 target_ulong addr, uint32_t desc) \
6609 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
6610 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6613 #define DO_STN_2(N, NAME, ESZ, MSZ) \
6614 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
6615 target_ulong addr, uint32_t desc) \
6617 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6618 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6620 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
6621 target_ulong addr, uint32_t desc) \
6623 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6624 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6626 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6627 target_ulong addr, uint32_t desc) \
6629 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6630 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6632 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6633 target_ulong addr, uint32_t desc) \
6635 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6636 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6639 DO_STN_1(1, bb, MO_8)
6640 DO_STN_1(1, bh, MO_16)
6641 DO_STN_1(1, bs, MO_32)
6642 DO_STN_1(1, bd, MO_64)
6643 DO_STN_1(2, bb, MO_8)
6644 DO_STN_1(3, bb, MO_8)
6645 DO_STN_1(4, bb, MO_8)
6647 DO_STN_2(1, hh, MO_16, MO_16)
6648 DO_STN_2(1, hs, MO_32, MO_16)
6649 DO_STN_2(1, hd, MO_64, MO_16)
6650 DO_STN_2(2, hh, MO_16, MO_16)
6651 DO_STN_2(3, hh, MO_16, MO_16)
6652 DO_STN_2(4, hh, MO_16, MO_16)
6654 DO_STN_2(1, ss, MO_32, MO_32)
6655 DO_STN_2(1, sd, MO_64, MO_32)
6656 DO_STN_2(2, ss, MO_32, MO_32)
6657 DO_STN_2(3, ss, MO_32, MO_32)
6658 DO_STN_2(4, ss, MO_32, MO_32)
6660 DO_STN_2(1, dd, MO_64, MO_64)
6661 DO_STN_2(2, dd, MO_64, MO_64)
6662 DO_STN_2(3, dd, MO_64, MO_64)
6663 DO_STN_2(4, dd, MO_64, MO_64)
6665 #undef DO_STN_1
6666 #undef DO_STN_2
6669 * Loads with a vector index.
6673 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6675 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6677 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6679 return *(uint32_t *)(reg + H1_4(reg_ofs));
6682 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6684 return *(int32_t *)(reg + H1_4(reg_ofs));
6687 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6689 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6692 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6694 return (int32_t)*(uint64_t *)(reg + reg_ofs);
6697 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6699 return *(uint64_t *)(reg + reg_ofs);
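/*
 * In the gather/scatter helpers below, each active element's address is
 * formed as base + (off_fn(vm, reg_off) << scale).  For example, with
 * off_zss_d above and a scaled form with scale == 2, the low 32 bits of
 * each 64-bit offset element are sign-extended and then shifted, giving
 * base + (sign_extend32(offset) << 2).
 */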
6702 static inline QEMU_ALWAYS_INLINE
6703 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6704 target_ulong base, uint32_t desc, uintptr_t retaddr,
6705 uint32_t mtedesc, int esize, int msize,
6706 zreg_off_fn *off_fn,
6707 sve_ldst1_host_fn *host_fn,
6708 sve_ldst1_tlb_fn *tlb_fn)
6710 const int mmu_idx = cpu_mmu_index(env, false);
6711 const intptr_t reg_max = simd_oprsz(desc);
6712 const int scale = simd_data(desc);
6713 ARMVectorReg scratch;
6714 intptr_t reg_off;
6715 SVEHostPage info, info2;
6717 memset(&scratch, 0, reg_max);
6718 reg_off = 0;
6719 do {
6720 uint64_t pg = vg[reg_off >> 6];
6721 do {
6722 if (likely(pg & 1)) {
6723 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6724 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6726 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6727 mmu_idx, retaddr);
6729 if (likely(in_page >= msize)) {
6730 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6731 cpu_check_watchpoint(env_cpu(env), addr, msize,
6732 info.attrs, BP_MEM_READ, retaddr);
6734 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
6735 mte_check(env, mtedesc, addr, retaddr);
6737 host_fn(&scratch, reg_off, info.host);
6738 } else {
6739 /* Element crosses the page boundary. */
6740 sve_probe_page(&info2, false, env, addr + in_page, 0,
6741 MMU_DATA_LOAD, mmu_idx, retaddr);
6742 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6743 cpu_check_watchpoint(env_cpu(env), addr,
6744 msize, info.attrs,
6745 BP_MEM_READ, retaddr);
6747 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
6748 mte_check(env, mtedesc, addr, retaddr);
6750 tlb_fn(env, &scratch, reg_off, addr, retaddr);
6753 reg_off += esize;
6754 pg >>= esize;
6755 } while (reg_off & 63);
6756 } while (reg_off < reg_max);
6758 /* Wait until all exceptions have been raised to write back. */
6759 memcpy(vd, &scratch, reg_max);
6762 static inline QEMU_ALWAYS_INLINE
6763 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6764 target_ulong base, uint32_t desc, uintptr_t retaddr,
6765 int esize, int msize, zreg_off_fn *off_fn,
6766 sve_ldst1_host_fn *host_fn,
6767 sve_ldst1_tlb_fn *tlb_fn)
6769 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6770 /* Remove mtedesc from the normal sve descriptor. */
6771 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6774 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6775 * offset base entirely over the address space hole to change the
6776 * pointer tag, or change the bit55 selector. So we could
6777 * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
6779 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6780 esize, msize, off_fn, host_fn, tlb_fn);
6783 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6784 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6785 void *vm, target_ulong base, uint32_t desc) \
6787 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
6788 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6790 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6791 void *vm, target_ulong base, uint32_t desc) \
6793 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
6794 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6797 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6798 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6799 void *vm, target_ulong base, uint32_t desc) \
6801 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
6802 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6804 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6805 void *vm, target_ulong base, uint32_t desc) \
6807 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
6808 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6811 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6812 DO_LD1_ZPZ_S(bsu, zss, MO_8)
6813 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6814 DO_LD1_ZPZ_D(bdu, zss, MO_8)
6815 DO_LD1_ZPZ_D(bdu, zd, MO_8)
6817 DO_LD1_ZPZ_S(bss, zsu, MO_8)
6818 DO_LD1_ZPZ_S(bss, zss, MO_8)
6819 DO_LD1_ZPZ_D(bds, zsu, MO_8)
6820 DO_LD1_ZPZ_D(bds, zss, MO_8)
6821 DO_LD1_ZPZ_D(bds, zd, MO_8)
6823 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6824 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6825 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6826 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6827 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6829 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6830 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6831 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6832 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6833 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6835 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6836 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6837 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6838 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6839 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6841 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6842 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6843 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6844 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6845 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6847 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6848 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6849 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6850 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6851 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6853 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6854 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6855 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6856 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6857 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6859 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6860 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6861 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6863 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6864 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6865 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6867 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6868 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6869 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6871 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6872 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6873 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6875 #undef DO_LD1_ZPZ_S
6876 #undef DO_LD1_ZPZ_D
6878 /* First fault loads with a vector index. */
6881 * Common helpers for all gather first-faulting loads.
6884 static inline QEMU_ALWAYS_INLINE
6885 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6886 target_ulong base, uint32_t desc, uintptr_t retaddr,
6887 uint32_t mtedesc, const int esz, const int msz,
6888 zreg_off_fn *off_fn,
6889 sve_ldst1_host_fn *host_fn,
6890 sve_ldst1_tlb_fn *tlb_fn)
6892 const int mmu_idx = cpu_mmu_index(env, false);
6893 const intptr_t reg_max = simd_oprsz(desc);
6894 const int scale = simd_data(desc);
6895 const int esize = 1 << esz;
6896 const int msize = 1 << msz;
6897 intptr_t reg_off;
6898 SVEHostPage info;
6899 target_ulong addr, in_page;
6901 /* Skip to the first true predicate. */
6902 reg_off = find_next_active(vg, 0, reg_max, esz);
6903 if (unlikely(reg_off >= reg_max)) {
6904 /* The entire predicate was false; no load occurs. */
6905 memset(vd, 0, reg_max);
6906 return;
6910 * Probe the first element, allowing faults.
6912 addr = base + (off_fn(vm, reg_off) << scale);
6913 if (mtedesc) {
6914 mte_check(env, mtedesc, addr, retaddr);
6916 tlb_fn(env, vd, reg_off, addr, retaddr);
6918 /* After any fault, zero the other elements. */
6919 swap_memzero(vd, reg_off);
6920 reg_off += esize;
6921 swap_memzero(vd + reg_off, reg_max - reg_off);
6924 * Probe the remaining elements, not allowing faults.
6926 while (reg_off < reg_max) {
6927 uint64_t pg = vg[reg_off >> 6];
6928 do {
6929 if (likely((pg >> (reg_off & 63)) & 1)) {
6930 addr = base + (off_fn(vm, reg_off) << scale);
6931 in_page = -(addr | TARGET_PAGE_MASK);
6933 if (unlikely(in_page < msize)) {
6934 /* Stop if the element crosses a page boundary. */
6935 goto fault;
6938 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6939 mmu_idx, retaddr);
6940 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6941 goto fault;
6943 if (unlikely(info.flags & TLB_WATCHPOINT) &&
6944 (cpu_watchpoint_address_matches
6945 (env_cpu(env), addr, msize) & BP_MEM_READ)) {
6946 goto fault;
6948 if (mtedesc &&
6949 arm_tlb_mte_tagged(&info.attrs) &&
6950 !mte_probe(env, mtedesc, addr)) {
6951 goto fault;
6954 host_fn(vd, reg_off, info.host);
6956 reg_off += esize;
6957 } while (reg_off & 63);
6959 return;
6961 fault:
6962 record_fault(env, reg_off, reg_max);
6965 static inline QEMU_ALWAYS_INLINE
6966 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6967 target_ulong base, uint32_t desc, uintptr_t retaddr,
6968 const int esz, const int msz,
6969 zreg_off_fn *off_fn,
6970 sve_ldst1_host_fn *host_fn,
6971 sve_ldst1_tlb_fn *tlb_fn)
6973 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6974 /* Remove mtedesc from the normal sve descriptor. */
6975 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6978 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6979 * offset base entirely over the address space hole to change the
6980 * pointer tag, or change the bit55 selector. So we could
6981 * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
6983 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6984 esz, msz, off_fn, host_fn, tlb_fn);
6987 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
6988 void HELPER(sve_ldff##MEM##_##OFS) \
6989 (CPUARMState *env, void *vd, void *vg, \
6990 void *vm, target_ulong base, uint32_t desc) \
6992 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
6993 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6995 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6996 (CPUARMState *env, void *vd, void *vg, \
6997 void *vm, target_ulong base, uint32_t desc) \
6999 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
7000 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7003 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
7004 void HELPER(sve_ldff##MEM##_##OFS) \
7005 (CPUARMState *env, void *vd, void *vg, \
7006 void *vm, target_ulong base, uint32_t desc) \
7008 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
7009 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7011 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
7012 (CPUARMState *env, void *vd, void *vg, \
7013 void *vm, target_ulong base, uint32_t desc) \
7015 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
7016 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7019 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
7020 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
7021 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
7022 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
7023 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
7025 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
7026 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
7027 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
7028 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
7029 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
7031 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
7032 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
7033 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
7034 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
7035 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
7037 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
7038 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
7039 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
7040 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
7041 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
7043 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
7044 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
7045 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
7046 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
7047 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
7049 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
7050 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
7051 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
7052 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
7053 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
7055 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
7056 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
7057 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
7058 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
7059 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
7061 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
7062 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
7063 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
7064 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
7065 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
7067 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
7068 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
7069 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
7071 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
7072 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
7073 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
7075 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
7076 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
7077 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
7079 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
7080 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
7081 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
7083 /* Stores with a vector index. */
7085 static inline QEMU_ALWAYS_INLINE
7086 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7087 target_ulong base, uint32_t desc, uintptr_t retaddr,
7088 uint32_t mtedesc, int esize, int msize,
7089 zreg_off_fn *off_fn,
7090 sve_ldst1_host_fn *host_fn,
7091 sve_ldst1_tlb_fn *tlb_fn)
7093 const int mmu_idx = cpu_mmu_index(env, false);
7094 const intptr_t reg_max = simd_oprsz(desc);
7095 const int scale = simd_data(desc);
7096 void *host[ARM_MAX_VQ * 4];
7097 intptr_t reg_off, i;
7098 SVEHostPage info, info2;
7101 * Probe all of the elements for host addresses and flags.
7103 i = reg_off = 0;
7104 do {
7105 uint64_t pg = vg[reg_off >> 6];
7106 do {
7107 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7108 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
7110 host[i] = NULL;
7111 if (likely((pg >> (reg_off & 63)) & 1)) {
7112 if (likely(in_page >= msize)) {
7113 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
7114 mmu_idx, retaddr);
7115 host[i] = info.host;
7116 } else {
7118 * Element crosses the page boundary.
7119 * Probe both pages, but do not record the host address,
7120 * so that we use the slow path.
7122 sve_probe_page(&info, false, env, addr, 0,
7123 MMU_DATA_STORE, mmu_idx, retaddr);
7124 sve_probe_page(&info2, false, env, addr + in_page, 0,
7125 MMU_DATA_STORE, mmu_idx, retaddr);
7126 info.flags |= info2.flags;
7129 if (unlikely(info.flags & TLB_WATCHPOINT)) {
7130 cpu_check_watchpoint(env_cpu(env), addr, msize,
7131 info.attrs, BP_MEM_WRITE, retaddr);
7134 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
7135 mte_check(env, mtedesc, addr, retaddr);
7138 i += 1;
7139 reg_off += esize;
7140 } while (reg_off & 63);
7141 } while (reg_off < reg_max);
7144 * Now that we have recognized all exceptions except SyncExternal
7145 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
7147 * Note for the common case of an element in RAM, not crossing a page
7148 * boundary, we have stored the host address in host[]. This doubles
7149 * as a first-level check against the predicate, since only enabled
7150 * elements have non-null host addresses.
7152 i = reg_off = 0;
7153 do {
7154 void *h = host[i];
7155 if (likely(h != NULL)) {
7156 host_fn(vd, reg_off, h);
7157 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
7158 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7159 tlb_fn(env, vd, reg_off, addr, retaddr);
7161 i += 1;
7162 reg_off += esize;
7163 } while (reg_off < reg_max);
7166 static inline QEMU_ALWAYS_INLINE
7167 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7168 target_ulong base, uint32_t desc, uintptr_t retaddr,
7169 int esize, int msize, zreg_off_fn *off_fn,
7170 sve_ldst1_host_fn *host_fn,
7171 sve_ldst1_tlb_fn *tlb_fn)
7173 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7174 /* Remove mtedesc from the normal sve descriptor. */
7175 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7178 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7179 * offset base entirely over the address space hole to change the
7180 * pointer tag, or change the bit55 selector. So we could
7181 * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
7183 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7184 esize, msize, off_fn, host_fn, tlb_fn);
7187 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
7188 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7189 void *vm, target_ulong base, uint32_t desc) \
7191 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
7192 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7194 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7195 void *vm, target_ulong base, uint32_t desc) \
7197 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
7198 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7201 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
7202 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7203 void *vm, target_ulong base, uint32_t desc) \
7205 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
7206 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7208 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7209 void *vm, target_ulong base, uint32_t desc) \
7211 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
7212 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
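/*
 * In the instantiations below, MEM names the memory and register
 * element sizes plus endianness (e.g. "hs_le" stores the low halfword
 * of each 32-bit element, little-endian), OFS selects the offset
 * extraction (zsu/zss = 32-bit unsigned/signed offsets, zd = 64-bit
 * offsets), and MSZ is the log2 of the memory access size
 * (msize = 1 << MSZ), used for the page-crossing test and the
 * watchpoint length.
 */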
7215 DO_ST1_ZPZ_S(bs, zsu, MO_8)
7216 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7217 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7218 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7219 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7221 DO_ST1_ZPZ_S(bs, zss, MO_8)
7222 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7223 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7224 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7225 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7227 DO_ST1_ZPZ_D(bd, zsu, MO_8)
7228 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7229 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7230 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7231 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7232 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7233 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7235 DO_ST1_ZPZ_D(bd, zss, MO_8)
7236 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7237 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7238 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7239 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7240 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7241 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7243 DO_ST1_ZPZ_D(bd, zd, MO_8)
7244 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7245 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7246 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7247 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7248 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7249 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7251 #undef DO_ST1_ZPZ_S
7252 #undef DO_ST1_ZPZ_D
7254 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7256 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7257 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7259 for (i = 0; i < opr_sz; ++i) {
7260 d[i] = n[i] ^ m[i] ^ k[i];
7264 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7266 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7267 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7269 for (i = 0; i < opr_sz; ++i) {
7270 d[i] = n[i] ^ (m[i] & ~k[i]);
7274 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7276 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7277 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7279 for (i = 0; i < opr_sz; ++i) {
7280 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7284 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7286 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7287 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7289 for (i = 0; i < opr_sz; ++i) {
7290 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7294 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7296 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7297 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7299 for (i = 0; i < opr_sz; ++i) {
7300 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
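/*
 * Summary of the SVE2 bitwise ternary helpers above, with k as the
 * select mask:  EOR3 is d = n ^ m ^ k;  BCAX is d = n ^ (m & ~k);
 * BSL1N and BSL2N are bitwise selects that take bits from n (inverted
 * for BSL1N) where k is set and from m (inverted for BSL2N) where k is
 * clear;  NBSL is the plain select, inverted.
 */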
7305 * Return true if the low uint8_t/uint16_t element of n occurs anywhere in m0 or m1.
7306 * See hasless(v,1) from
7307 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
7309 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
7311 int bits = 8 << esz;
7312 uint64_t ones = dup_const(esz, 1);
7313 uint64_t signs = ones << (bits - 1);
7314 uint64_t cmp0, cmp1;
7316 cmp1 = dup_const(esz, n);
7317 cmp0 = cmp1 ^ m0;
7318 cmp1 = cmp1 ^ m1;
7319 cmp0 = (cmp0 - ones) & ~cmp0;
7320 cmp1 = (cmp1 - ones) & ~cmp1;
7321 return (cmp0 | cmp1) & signs;
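/*
 * Worked example of the zero-in-word test above, for esz == MO_8:
 * ones == 0x0101...01 and signs == 0x8080...80.  Each byte of
 * cmp0/cmp1 is zero exactly where that element of m0/m1 equals the low
 * byte of n.  (x - ones) & ~x sets the sign bit of every zero byte and
 * of no non-zero byte, so the final "& signs" is non-zero iff some
 * element matched.
 */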
7324 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7325 uint32_t desc, int esz, bool nmatch)
7327 uint16_t esz_mask = pred_esz_masks[esz];
7328 intptr_t opr_sz = simd_oprsz(desc);
7329 uint32_t flags = PREDTEST_INIT;
7330 intptr_t i, j, k;
7332 for (i = 0; i < opr_sz; i += 16) {
7333 uint64_t m0 = *(uint64_t *)(vm + i);
7334 uint64_t m1 = *(uint64_t *)(vm + i + 8);
7335 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7336 uint16_t out = 0;
7338 for (j = 0; j < 16; j += 8) {
7339 uint64_t n = *(uint64_t *)(vn + i + j);
7341 for (k = 0; k < 8; k += 1 << esz) {
7342 if (pg & (1 << (j + k))) {
7343 bool o = do_match2(n >> (k * 8), m0, m1, esz);
7344 out |= (o ^ nmatch) << (j + k);
7348 *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7349 flags = iter_predtest_fwd(out, pg, flags);
7351 return flags;
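/*
 * do_match above implements MATCH/NMATCH:  within each 16-byte
 * segment, an active element of vn sets its predicate result bit if
 * its value does (or, for nmatch, does not) occur anywhere in the
 * corresponding 16-byte segment of vm (m0/m1).  The result predicate
 * is also folded into NZCV via iter_predtest_fwd, since these
 * instructions set the condition flags.
 */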
7354 #define DO_PPZZ_MATCH(NAME, ESZ, INV) \
7355 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
7357 return do_match(vd, vn, vm, vg, desc, ESZ, INV); \
7360 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
7361 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
7363 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
7364 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
7366 #undef DO_PPZZ_MATCH
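/*
 * HISTCNT:  for each active element i, count the active elements
 * j <= i of the second operand whose value equals n[i]; inactive
 * destination elements are written as zero.  The double loop is
 * quadratic in the number of elements, mirroring the element-by-element
 * structure of the code.
 */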
7368 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
7369 uint32_t desc)
7371 ARMVectorReg scratch;
7372 intptr_t i, j;
7373 intptr_t opr_sz = simd_oprsz(desc);
7374 uint32_t *d = vd, *n = vn, *m = vm;
7375 uint8_t *pg = vg;
7377 if (d == n) {
7378 n = memcpy(&scratch, n, opr_sz);
7379 if (d == m) {
7380 m = n;
7382 } else if (d == m) {
7383 m = memcpy(&scratch, m, opr_sz);
7386 for (i = 0; i < opr_sz; i += 4) {
7387 uint64_t count = 0;
7388 uint8_t pred;
7390 pred = pg[H1(i >> 3)] >> (i & 7);
7391 if (pred & 1) {
7392 uint32_t nn = n[H4(i >> 2)];
7394 for (j = 0; j <= i; j += 4) {
7395 pred = pg[H1(j >> 3)] >> (j & 7);
7396 if ((pred & 1) && nn == m[H4(j >> 2)]) {
7397 ++count;
7401 d[H4(i >> 2)] = count;
7405 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
7406 uint32_t desc)
7408 ARMVectorReg scratch;
7409 intptr_t i, j;
7410 intptr_t opr_sz = simd_oprsz(desc);
7411 uint64_t *d = vd, *n = vn, *m = vm;
7412 uint8_t *pg = vg;
7414 if (d == n) {
7415 n = memcpy(&scratch, n, opr_sz);
7416 if (d == m) {
7417 m = n;
7419 } else if (d == m) {
7420 m = memcpy(&scratch, m, opr_sz);
7423 for (i = 0; i < opr_sz / 8; ++i) {
7424 uint64_t count = 0;
7425 if (pg[H1(i)] & 1) {
7426 uint64_t nn = n[i];
7427 for (j = 0; j <= i; ++j) {
7428 if ((pg[H1(j)] & 1) && nn == m[j]) {
7429 ++count;
7433 d[i] = count;
7438 * Returns the number of bytes in m0 and m1 that match n.
7439 * Unlike do_match2 we don't just need true/false, we need an exact count.
7440 * This requires two extra logical operations.
7442 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
7444 const uint64_t mask = dup_const(MO_8, 0x7f);
7445 uint64_t cmp0, cmp1;
7447 cmp1 = dup_const(MO_8, n);
7448 cmp0 = cmp1 ^ m0;
7449 cmp1 = cmp1 ^ m1;
7452 * 1: clear msb of each byte to avoid carry to next byte (& mask)
7453 * 2: carry in to msb if byte != 0 (+ mask)
7454 * 3: set msb if cmp has msb set (| cmp)
7455 * 4: set all the non-msb bits so that only the msb matters (| mask)
7456 * We now have 0xff for byte != 0 or 0x7f for byte == 0.
7457 * 5: invert, resulting in 0x80 if and only if byte == 0.
7459 cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
7460 cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
7463 * Combine the two compares so that their set bits do
7464 * not overlap, which preserves the total count of set bits.
7465 * If the host has an efficient instruction for ctpop,
7466 * then ctpop(x) + ctpop(y) has the same number of
7467 * operations as ctpop(x | (y >> 1)). If the host does
7468 * not have an efficient ctpop, then we only want to
7469 * use it once.
7471 return ctpop64(cmp0 | (cmp1 >> 1));
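/*
 * Worked example of the combine above:  cmp0 and cmp1 each have only
 * per-byte 0x80 markers set, one for every byte of m0/m1 that equals n.
 * Shifting cmp1 right by one moves its markers to the 0x40 positions,
 * so the OR keeps both sets of markers disjoint and a single ctpop64
 * yields the total number of matching bytes (at most 16).
 */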
7474 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
7476 intptr_t i, j;
7477 intptr_t opr_sz = simd_oprsz(desc);
7479 for (i = 0; i < opr_sz; i += 16) {
7480 uint64_t n0 = *(uint64_t *)(vn + i);
7481 uint64_t m0 = *(uint64_t *)(vm + i);
7482 uint64_t n1 = *(uint64_t *)(vn + i + 8);
7483 uint64_t m1 = *(uint64_t *)(vm + i + 8);
7484 uint64_t out0 = 0;
7485 uint64_t out1 = 0;
7487 for (j = 0; j < 64; j += 8) {
7488 uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
7489 uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
7490 out0 |= cnt0 << j;
7491 out1 |= cnt1 << j;
7494 *(uint64_t *)(vd + i) = out0;
7495 *(uint64_t *)(vd + i + 8) = out1;
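/*
 * HISTSEG (above) works on independent 16-byte segments:  each result
 * byte is the number of bytes in the segment of vm that equal the
 * corresponding byte of vn.  The count is at most 16, so shifting it
 * into place with "cnt << j" never spills into the neighbouring byte.
 */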
7499 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7501 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7502 int shr = simd_data(desc);
7503 int shl = 8 - shr;
7504 uint64_t mask = dup_const(MO_8, 0xff >> shr);
7505 uint64_t *d = vd, *n = vn, *m = vm;
7507 for (i = 0; i < opr_sz; ++i) {
7508 uint64_t t = n[i] ^ m[i];
7509 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
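/*
 * XAR on byte elements:  t is rotated right by shr within each byte
 * lane.  (t >> shr) & mask keeps the bits that stay within their own
 * byte, and (t << shl) & ~mask re-inserts the bits that wrap around,
 * while discarding anything shifted in from a neighbouring lane.
 * sve2_xar_h below uses the same trick with 16-bit lanes; the 32-bit
 * variant can simply use ror32.
 */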
7513 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7515 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7516 int shr = simd_data(desc);
7517 int shl = 16 - shr;
7518 uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7519 uint64_t *d = vd, *n = vn, *m = vm;
7521 for (i = 0; i < opr_sz; ++i) {
7522 uint64_t t = n[i] ^ m[i];
7523 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7527 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7529 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7530 int shr = simd_data(desc);
7531 uint32_t *d = vd, *n = vn, *m = vm;
7533 for (i = 0; i < opr_sz; ++i) {
7534 d[i] = ror32(n[i] ^ m[i], shr);
7538 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
7539 void *status, uint32_t desc)
7541 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
7543 for (s = 0; s < opr_sz; ++s) {
7544 float32 *n = vn + s * sizeof(float32) * 4;
7545 float32 *m = vm + s * sizeof(float32) * 4;
7546 float32 *a = va + s * sizeof(float32) * 4;
7547 float32 *d = vd + s * sizeof(float32) * 4;
7548 float32 n00 = n[H4(0)], n01 = n[H4(1)];
7549 float32 n10 = n[H4(2)], n11 = n[H4(3)];
7550 float32 m00 = m[H4(0)], m01 = m[H4(1)];
7551 float32 m10 = m[H4(2)], m11 = m[H4(3)];
7552 float32 p0, p1;
7554 /* i = 0, j = 0 */
7555 p0 = float32_mul(n00, m00, status);
7556 p1 = float32_mul(n01, m01, status);
7557 d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
7559 /* i = 0, j = 1 */
7560 p0 = float32_mul(n00, m10, status);
7561 p1 = float32_mul(n01, m11, status);
7562 d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
7564 /* i = 1, j = 0 */
7565 p0 = float32_mul(n10, m00, status);
7566 p1 = float32_mul(n11, m01, status);
7567 d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
7569 /* i = 1, j = 1 */
7570 p0 = float32_mul(n10, m10, status);
7571 p1 = float32_mul(n11, m11, status);
7572 d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
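/*
 * FMMLA layout used above (and by the float64 version below):  each
 * group of four elements holds a 2x2 matrix in row-major order, and
 * the result is d = a + n * transpose(m):  m00/m01 pair with row 0 of
 * n and m10/m11 with row 1, so the second operand is consumed
 * transposed.
 */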
7576 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
7577 void *status, uint32_t desc)
7579 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
7581 for (s = 0; s < opr_sz; ++s) {
7582 float64 *n = vn + s * sizeof(float64) * 4;
7583 float64 *m = vm + s * sizeof(float64) * 4;
7584 float64 *a = va + s * sizeof(float64) * 4;
7585 float64 *d = vd + s * sizeof(float64) * 4;
7586 float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
7587 float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
7588 float64 p0, p1;
7590 /* i = 0, j = 0 */
7591 p0 = float64_mul(n00, m00, status);
7592 p1 = float64_mul(n01, m01, status);
7593 d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
7595 /* i = 0, j = 1 */
7596 p0 = float64_mul(n00, m10, status);
7597 p1 = float64_mul(n01, m11, status);
7598 d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
7600 /* i = 1, j = 0 */
7601 p0 = float64_mul(n10, m00, status);
7602 p1 = float64_mul(n11, m01, status);
7603 d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
7605 /* i = 1, j = 1 */
7606 p0 = float64_mul(n10, m10, status);
7607 p1 = float64_mul(n11, m11, status);
7608 d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
7612 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
7613 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
7615 intptr_t i = simd_oprsz(desc); \
7616 uint64_t *g = vg; \
7617 do { \
7618 uint64_t pg = g[(i - 1) >> 6]; \
7619 do { \
7620 i -= sizeof(TYPEW); \
7621 if (likely((pg >> (i & 63)) & 1)) { \
7622 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
7623 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \
7625 } while (i & 63); \
7626 } while (i != 0); \
7629 DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
7630 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
7631 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
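/*
 * The FCVTNT helpers above walk the vector from the top down, convert
 * each active wide element of vn, and merge the narrowed result into
 * the high half of the corresponding slot of vd, leaving the low half
 * untouched.  The FCVTLT helpers below do the inverse:  they take the
 * narrow value from the high half of each slot of vn and widen it into
 * the full wide element of vd.
 */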
7633 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \
7634 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
7636 intptr_t i = simd_oprsz(desc); \
7637 uint64_t *g = vg; \
7638 do { \
7639 uint64_t pg = g[(i - 1) >> 6]; \
7640 do { \
7641 i -= sizeof(TYPEW); \
7642 if (likely((pg >> (i & 63)) & 1)) { \
7643 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \
7644 *(TYPEW *)(vd + HW(i)) = OP(nn, status); \
7646 } while (i & 63); \
7647 } while (i != 0); \
7650 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
7651 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
7653 #undef DO_FCVTLT
7654 #undef DO_FCVTNT