target/arm: Move expand_pred_b to vec_internal.h
target/arm/sve_helper.c
1 /*
2  * ARM SVE Operations
3  *
4  * Copyright (c) 2018 Linaro, Ltd.
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/helper-proto.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg.h"
28 #include "vec_internal.h"
29 #include "sve_ldst_internal.h"
32 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
33  *
34  * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
35  * and bit 0 set if C is set. Compare the definitions of these variables
36  * within CPUARMState.
37  */
39 /* For no G bits set, NZCV = C. */
40 #define PREDTEST_INIT 1
42 /* This is an iterative function, called for each Pd and Pg word
43  * moving forward.
44  */
45 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
47 if (likely(g)) {
48 /* Compute N from first D & G.
49 Use bit 2 to signal first G bit seen. */
50 if (!(flags & 4)) {
51 flags |= ((d & (g & -g)) != 0) << 31;
52 flags |= 4;
55 /* Accumulate Z from each D & G. */
56 flags |= ((d & g) != 0) << 1;
58 /* Compute C from last !(D & G). Replace previous. */
59 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
61 return flags;
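/* Worked example: for a single active element that is true,
 * iter_predtest_fwd(1, 1, PREDTEST_INIT) returns 0x80000006:
 * bit 31 (N) set because the first active element of D is set,
 * bit 1 set because Z is clear (some active element is set), and
 * bit 0 clear because the last active element is set (C clear).
 * Bit 2 is only the internal "first G bit seen" marker.
 */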
64 /* This is an iterative function, called for each Pd and Pg word
65  * moving backward.
66  */
67 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
69 if (likely(g)) {
70 /* Compute C from first (i.e last) !(D & G).
71 Use bit 2 to signal first G bit seen. */
72 if (!(flags & 4)) {
73 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
74 flags |= (d & pow2floor(g)) == 0;
77 /* Accumulate Z from each D & G. */
78 flags |= ((d & g) != 0) << 1;
80 /* Compute N from last (i.e first) D & G. Replace previous. */
81 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
83 return flags;
86 /* The same for a single word predicate. */
87 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
89 return iter_predtest_fwd(d, g, PREDTEST_INIT);
92 /* The same for a multi-word predicate. */
93 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
95 uint32_t flags = PREDTEST_INIT;
96 uint64_t *d = vd, *g = vg;
97 uintptr_t i = 0;
99 do {
100 flags = iter_predtest_fwd(d[i], g[i], flags);
101 } while (++i < words);
103 return flags;
106 /* Similarly for half-word elements.
107  *  for (i = 0; i < 256; ++i) {
108  *      unsigned long m = 0;
109  *      if (i & 0xaa) {
110  *          continue;
111  *      }
112  *      for (j = 0; j < 8; j += 2) {
113  *          if ((i >> j) & 1) {
114  *              m |= 0xfffful << (j << 3);
115  *          }
116  *      }
117  *      printf("[0x%x] = 0x%016lx,\n", i, m);
118  *  }
119  */
120 static inline uint64_t expand_pred_h(uint8_t byte)
122 static const uint64_t word[] = {
123 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
124 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
125 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
126 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
127 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
128 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
129 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
130 [0x55] = 0xffffffffffffffff,
132 return word[byte & 0x55];
135 /* Similarly for single word elements. */
136 static inline uint64_t expand_pred_s(uint8_t byte)
138 static const uint64_t word[] = {
139 [0x01] = 0x00000000ffffffffull,
140 [0x10] = 0xffffffff00000000ull,
141 [0x11] = 0xffffffffffffffffull,
143 return word[byte & 0x11];
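/* For the 4-bit predicate groups that govern 32-bit elements, only
 * bits 0 and 4 of the predicate byte matter, which is why the index
 * is masked with 0x11 (0x55 for expand_pred_h). E.g.
 * expand_pred_s(0x10) yields 0xffffffff00000000: only the second
 * 32-bit element covered by that predicate byte is active.
 */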
146 #define LOGICAL_PPPP(NAME, FUNC) \
147 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
149 uintptr_t opr_sz = simd_oprsz(desc); \
150 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
151 uintptr_t i; \
152 for (i = 0; i < opr_sz / 8; ++i) { \
153 d[i] = FUNC(n[i], m[i], g[i]); \
157 #define DO_AND(N, M, G) (((N) & (M)) & (G))
158 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
159 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
160 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
161 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
162 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
163 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
164 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
166 LOGICAL_PPPP(sve_and_pppp, DO_AND)
167 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
168 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
169 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
170 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
171 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
172 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
173 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
175 #undef DO_AND
176 #undef DO_BIC
177 #undef DO_EOR
178 #undef DO_ORR
179 #undef DO_ORN
180 #undef DO_NOR
181 #undef DO_NAND
182 #undef DO_SEL
183 #undef LOGICAL_PPPP
185 /* Fully general three-operand expander, controlled by a predicate.
186  * This is complicated by the host-endian storage of the register file.
187  */
188 /* ??? I don't expect the compiler could ever vectorize this itself.
189  * With some tables we can convert bit masks to byte masks, and with
190  * extra care wrt byte/word ordering we could use gcc generic vectors
191  * and do 16 bytes at a time.
192  */
193 #define DO_ZPZZ(NAME, TYPE, H, OP) \
194 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
196 intptr_t i, opr_sz = simd_oprsz(desc); \
197 for (i = 0; i < opr_sz; ) { \
198 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
199 do { \
200 if (pg & 1) { \
201 TYPE nn = *(TYPE *)(vn + H(i)); \
202 TYPE mm = *(TYPE *)(vm + H(i)); \
203 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
205 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
206 } while (i & 15); \
210 /* Similarly, specialized for 64-bit operands. */
211 #define DO_ZPZZ_D(NAME, TYPE, OP) \
212 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
214 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
215 TYPE *d = vd, *n = vn, *m = vm; \
216 uint8_t *pg = vg; \
217 for (i = 0; i < opr_sz; i += 1) { \
218 if (pg[H1(i)] & 1) { \
219 TYPE nn = n[i], mm = m[i]; \
220 d[i] = OP(nn, mm); \
225 #define DO_AND(N, M) (N & M)
226 #define DO_EOR(N, M) (N ^ M)
227 #define DO_ORR(N, M) (N | M)
228 #define DO_BIC(N, M) (N & ~M)
229 #define DO_ADD(N, M) (N + M)
230 #define DO_SUB(N, M) (N - M)
231 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
232 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
233 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
234 #define DO_MUL(N, M) (N * M)
237 /*
238  * We must avoid the C undefined behaviour cases: division by
239  * zero and signed division of INT_MIN by -1. Both of these
240  * have architecturally defined required results for Arm.
241  * We special case all signed divisions by -1 to avoid having
242  * to deduce the minimum integer for the type involved.
243  */
244 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
245 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
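/* E.g. DO_SDIV(INT32_MIN, -1) takes the M == -1 arm and computes -N;
 * relying on two's-complement wrap (QEMU builds with -fwrapv), that
 * is INT32_MIN again, which is the architecturally required result.
 * Division by zero returns 0 for both forms.
 */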
247 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
248 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
249 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
250 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
252 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
253 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
254 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
255 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
257 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
258 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
259 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
260 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
262 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
263 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
264 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
265 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
267 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
268 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
269 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
270 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
272 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
273 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
274 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
275 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
277 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
278 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
279 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
280 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
282 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
283 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
284 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
285 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
287 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
288 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
289 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
290 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
292 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
293 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
294 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
295 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
297 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
298 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
299 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
300 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
302 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
303 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
304 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
305 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
307 /* Because the computation type is at least twice as large as required,
308 these work for both signed and unsigned source types. */
309 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
311 return (n * m) >> 8;
314 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
316 return (n * m) >> 16;
319 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
321 return (n * m) >> 32;
324 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
326 uint64_t lo, hi;
327 muls64(&lo, &hi, n, m);
328 return hi;
331 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
333 uint64_t lo, hi;
334 mulu64(&lo, &hi, n, m);
335 return hi;
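/* E.g. for the byte forms: sign-extended inputs give
 * do_mulh_b(-1, -1) = (1 >> 8) = 0 for SMULH, while the same helper
 * with zero-extended uint8_t inputs gives do_mulh_b(0xff, 0xff)
 * = (0xfe01 >> 8) = 0xfe for UMULH; the int32_t computation type is
 * wide enough for either interpretation.
 */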
338 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
339 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
340 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
341 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
343 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
344 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
345 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
346 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
348 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
349 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
350 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
351 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
353 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
354 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
356 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
357 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
359 /* Note that all bits of the shift are significant
360 and not modulo the element size. */
361 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
362 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
363 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
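/* E.g. for byte elements, DO_LSL(1, 8) is 0 rather than the 0x100 a
 * plain shift of the promoted value would produce, and DO_ASR(-1, 100)
 * clamps the count to 7 and yields -1 instead of hitting the undefined
 * behaviour of an out-of-range C shift.
 */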
365 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
366 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
367 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
369 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
370 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
371 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
373 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
374 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
375 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
377 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
378 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
379 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
381 static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
383 int8_t n1 = n, n2 = n >> 8;
384 return m + n1 + n2;
387 static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
389 int16_t n1 = n, n2 = n >> 16;
390 return m + n1 + n2;
393 static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
395 int32_t n1 = n, n2 = n >> 32;
396 return m + n1 + n2;
399 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
400 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
401 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
403 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
405 uint8_t n1 = n, n2 = n >> 8;
406 return m + n1 + n2;
409 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
411 uint16_t n1 = n, n2 = n >> 16;
412 return m + n1 + n2;
415 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
417 uint32_t n1 = n, n2 = n >> 32;
418 return m + n1 + n2;
421 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
422 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
423 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
425 #define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
426 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
427 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
428 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)
430 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
431 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
432 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
433 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
435 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
436 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
437 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
438 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)
440 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
441 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
442 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
443 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
445 /*
446  * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
447  * We pass in a pointer to a dummy saturation field to trigger
448  * the saturating arithmetic but discard the information about
449  * whether it has occurred.
450  */
451 #define do_sqshl_b(n, m) \
452 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
453 #define do_sqshl_h(n, m) \
454 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
455 #define do_sqshl_s(n, m) \
456 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
457 #define do_sqshl_d(n, m) \
458 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
460 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
461 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
462 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
463 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
465 #define do_uqshl_b(n, m) \
466 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
467 #define do_uqshl_h(n, m) \
468 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
469 #define do_uqshl_s(n, m) \
470 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
471 #define do_uqshl_d(n, m) \
472 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
474 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
475 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
476 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
477 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
479 #define do_sqrshl_b(n, m) \
480 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
481 #define do_sqrshl_h(n, m) \
482 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
483 #define do_sqrshl_s(n, m) \
484 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
485 #define do_sqrshl_d(n, m) \
486 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
488 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
489 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
490 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
491 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
493 #undef do_sqrshl_d
495 #define do_uqrshl_b(n, m) \
496 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
497 #define do_uqrshl_h(n, m) \
498 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
499 #define do_uqrshl_s(n, m) \
500 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
501 #define do_uqrshl_d(n, m) \
502 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
504 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
505 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
506 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
507 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
509 #undef do_uqrshl_d
511 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
512 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))
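/* DO_HADD_D relies on the identity
 *   (n + m) >> 1 == (n >> 1) + (m >> 1) + (n & m & 1)
 * so the 64-bit halving add never needs a 65-bit intermediate;
 * e.g. n = m = INT64_MAX correctly yields INT64_MAX.
 */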
514 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
515 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
516 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
517 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
519 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
520 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
521 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
522 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
524 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
525 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))
527 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
528 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
529 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
530 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
532 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
533 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
534 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
535 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
537 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
538 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))
540 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
541 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
542 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
543 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
545 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
546 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
547 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
548 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
550 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
552 return val >= max ? max : val <= min ? min : val;
555 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
556 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
557 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
559 static inline int64_t do_sqadd_d(int64_t n, int64_t m)
561 int64_t r = n + m;
562 if (((r ^ n) & ~(n ^ m)) < 0) {
563 /* Signed overflow. */
564 return r < 0 ? INT64_MAX : INT64_MIN;
566 return r;
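/* The check ((r ^ n) & ~(n ^ m)) < 0 is the usual two's-complement
 * test: overflow is only possible when n and m have the same sign
 * (~(n ^ m) has its sign bit set) and has happened when the result's
 * sign differs from n's (r ^ n negative); the sign of r then says
 * which bound was crossed.
 */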
569 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
570 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
571 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
572 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
574 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
575 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
576 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
578 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
580 uint64_t r = n + m;
581 return r < n ? UINT64_MAX : r;
584 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
585 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
586 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
587 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
589 #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
590 #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
591 #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
593 static inline int64_t do_sqsub_d(int64_t n, int64_t m)
595 int64_t r = n - m;
596 if (((r ^ n) & (n ^ m)) < 0) {
597 /* Signed overflow. */
598 return r < 0 ? INT64_MAX : INT64_MIN;
600 return r;
603 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
604 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
605 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
606 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
608 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
609 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
610 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
612 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
614 return n > m ? n - m : 0;
617 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
618 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
619 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
620 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
622 #define DO_SUQADD_B(n, m) \
623 do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
624 #define DO_SUQADD_H(n, m) \
625 do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
626 #define DO_SUQADD_S(n, m) \
627 do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
629 static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
631 uint64_t r = n + m;
633 if (n < 0) {
634 /* Note that m - abs(n) cannot underflow. */
635 if (r > INT64_MAX) {
636 /* Result is either very large positive or negative. */
637 if (m > -n) {
638 /* m > abs(n), so r is a very large positive. */
639 return INT64_MAX;
641 /* Result is negative. */
643 } else {
644 /* Both inputs are positive: check for overflow. */
645 if (r < m || r > INT64_MAX) {
646 return INT64_MAX;
649 return r;
652 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
653 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
654 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
655 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
657 #define DO_USQADD_B(n, m) \
658 do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
659 #define DO_USQADD_H(n, m) \
660 do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
661 #define DO_USQADD_S(n, m) \
662 do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
664 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
666 uint64_t r = n + m;
668 if (m < 0) {
669 return n < -m ? 0 : r;
671 return r < n ? UINT64_MAX : r;
674 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
675 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
676 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
677 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
679 #undef DO_ZPZZ
680 #undef DO_ZPZZ_D
682 /*
683  * Three operand expander, operating on element pairs.
684  * If the slot I is even, the elements come from VN {I, I+1}.
685  * If the slot I is odd, the elements come from VM {I-1, I}.
686  * Load all of the input elements in each pair before overwriting output.
687  */
688 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
689 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
691 intptr_t i, opr_sz = simd_oprsz(desc); \
692 for (i = 0; i < opr_sz; ) { \
693 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
694 do { \
695 TYPE n0 = *(TYPE *)(vn + H(i)); \
696 TYPE m0 = *(TYPE *)(vm + H(i)); \
697 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
698 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
699 if (pg & 1) { \
700 *(TYPE *)(vd + H(i)) = OP(n0, n1); \
702 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
703 if (pg & 1) { \
704 *(TYPE *)(vd + H(i)) = OP(m0, m1); \
706 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
707 } while (i & 15); \
711 /* Similarly, specialized for 64-bit operands. */
712 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
713 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
715 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
716 TYPE *d = vd, *n = vn, *m = vm; \
717 uint8_t *pg = vg; \
718 for (i = 0; i < opr_sz; i += 2) { \
719 TYPE n0 = n[i], n1 = n[i + 1]; \
720 TYPE m0 = m[i], m1 = m[i + 1]; \
721 if (pg[H1(i)] & 1) { \
722 d[i] = OP(n0, n1); \
724 if (pg[H1(i + 1)] & 1) { \
725 d[i + 1] = OP(m0, m1); \
730 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
731 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
732 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
733 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
735 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
736 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
737 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
738 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
740 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
741 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
742 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
743 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
745 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
746 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
747 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
748 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
750 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
751 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
752 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
753 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
755 #undef DO_ZPZZ_PAIR
756 #undef DO_ZPZZ_PAIR_D
758 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
759 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
760 void *status, uint32_t desc) \
762 intptr_t i, opr_sz = simd_oprsz(desc); \
763 for (i = 0; i < opr_sz; ) { \
764 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
765 do { \
766 TYPE n0 = *(TYPE *)(vn + H(i)); \
767 TYPE m0 = *(TYPE *)(vm + H(i)); \
768 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
769 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
770 if (pg & 1) { \
771 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
773 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
774 if (pg & 1) { \
775 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
777 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
778 } while (i & 15); \
782 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
783 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
784 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
786 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
787 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
788 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
790 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
791 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
792 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
794 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
795 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
796 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
798 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
799 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
800 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
802 #undef DO_ZPZZ_PAIR_FP
804 /* Three-operand expander, controlled by a predicate, in which the
805  * third operand is "wide". That is, for D = N op M, the same 64-bit
806  * value of M is used with all of the narrower values of N.
807  */
808 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
809 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
811 intptr_t i, opr_sz = simd_oprsz(desc); \
812 for (i = 0; i < opr_sz; ) { \
813 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
814 TYPEW mm = *(TYPEW *)(vm + i); \
815 do { \
816 if (pg & 1) { \
817 TYPE nn = *(TYPE *)(vn + H(i)); \
818 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
820 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
821 } while (i & 7); \
825 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
826 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
827 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
829 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
830 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
831 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
833 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
834 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
835 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
837 #undef DO_ZPZW
839 /* Fully general two-operand expander, controlled by a predicate.
840  */
841 #define DO_ZPZ(NAME, TYPE, H, OP) \
842 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
844 intptr_t i, opr_sz = simd_oprsz(desc); \
845 for (i = 0; i < opr_sz; ) { \
846 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
847 do { \
848 if (pg & 1) { \
849 TYPE nn = *(TYPE *)(vn + H(i)); \
850 *(TYPE *)(vd + H(i)) = OP(nn); \
852 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
853 } while (i & 15); \
857 /* Similarly, specialized for 64-bit operands. */
858 #define DO_ZPZ_D(NAME, TYPE, OP) \
859 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
861 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
862 TYPE *d = vd, *n = vn; \
863 uint8_t *pg = vg; \
864 for (i = 0; i < opr_sz; i += 1) { \
865 if (pg[H1(i)] & 1) { \
866 TYPE nn = n[i]; \
867 d[i] = OP(nn); \
872 #define DO_CLS_B(N) (clrsb32(N) - 24)
873 #define DO_CLS_H(N) (clrsb32(N) - 16)
875 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
876 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
877 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
878 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
880 #define DO_CLZ_B(N) (clz32(N) - 24)
881 #define DO_CLZ_H(N) (clz32(N) - 16)
883 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
884 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
885 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
886 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
888 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
889 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
890 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
891 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
893 #define DO_CNOT(N) (N == 0)
895 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
896 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
897 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
898 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
900 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
902 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
903 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
904 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
906 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
908 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
909 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
910 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
912 #define DO_NOT(N) (~N)
914 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
915 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
916 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
917 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
919 #define DO_SXTB(N) ((int8_t)N)
920 #define DO_SXTH(N) ((int16_t)N)
921 #define DO_SXTS(N) ((int32_t)N)
922 #define DO_UXTB(N) ((uint8_t)N)
923 #define DO_UXTH(N) ((uint16_t)N)
924 #define DO_UXTS(N) ((uint32_t)N)
926 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
927 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
928 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
929 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
930 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
931 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
933 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
934 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
935 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
936 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
937 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
938 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
940 #define DO_ABS(N) (N < 0 ? -N : N)
942 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
943 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
944 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
945 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
947 #define DO_NEG(N) (-N)
949 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
950 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
951 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
952 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
954 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
955 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
956 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
958 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
959 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
961 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
963 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
964 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
965 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
966 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
968 #define DO_SQABS(X) \
969 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
970 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
972 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
973 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
974 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
975 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
977 #define DO_SQNEG(X) \
978 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
979 x_ == min_ ? -min_ - 1 : -x_; })
981 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
982 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
983 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
984 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
986 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
987 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
989 /* Three-operand expander, unpredicated, in which the third operand is "wide".
990  */
991 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
992 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
994 intptr_t i, opr_sz = simd_oprsz(desc); \
995 for (i = 0; i < opr_sz; ) { \
996 TYPEW mm = *(TYPEW *)(vm + i); \
997 do { \
998 TYPE nn = *(TYPE *)(vn + H(i)); \
999 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
1000 i += sizeof(TYPE); \
1001 } while (i & 7); \
1005 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
1006 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
1007 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
1009 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
1010 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
1011 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1013 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1014 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1015 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1017 #undef DO_ZZW
1019 #undef DO_CLS_B
1020 #undef DO_CLS_H
1021 #undef DO_CLZ_B
1022 #undef DO_CLZ_H
1023 #undef DO_CNOT
1024 #undef DO_FABS
1025 #undef DO_FNEG
1026 #undef DO_ABS
1027 #undef DO_NEG
1028 #undef DO_ZPZ
1029 #undef DO_ZPZ_D
1031 /*
1032  * Three-operand expander, unpredicated, in which the two inputs are
1033  * selected from the top or bottom half of the wide column.
1034  */
1035 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1036 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1038 intptr_t i, opr_sz = simd_oprsz(desc); \
1039 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1040 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1041 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1042 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1043 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1044 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
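/* The two desc bits select the even ("bottom") or odd ("top") TYPEN
 * elements of VN and VM independently: e.g. with both bits clear,
 * sve2_saddl_h widens and adds bytes 0, 2, 4, ... of each operand,
 * while a set bit shifts the corresponding read to bytes 1, 3, 5, ...
 */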
1048 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1049 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1050 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1052 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1053 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1054 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1056 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1057 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1058 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1060 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1061 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1062 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1064 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1065 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1066 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1068 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1069 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1070 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1072 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1073 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1074 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1076 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1077 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1078 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1080 /* Note that the multiply cannot overflow, but the doubling can. */
1081 static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1083 int16_t val = n * m;
1084 return DO_SQADD_H(val, val);
1087 static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1089 int32_t val = n * m;
1090 return DO_SQADD_S(val, val);
1093 static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1095 int64_t val = n * m;
1096 return do_sqadd_d(val, val);
1099 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1100 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1101 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1103 #undef DO_ZZZ_TB
1105 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1106 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1108 intptr_t i, opr_sz = simd_oprsz(desc); \
1109 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1110 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1111 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
1112 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1113 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1117 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1118 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1119 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1121 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1122 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1123 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1125 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1126 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1127 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1129 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1130 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1131 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1133 #undef DO_ZZZ_WTB
1135 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
1136 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1138 intptr_t i, opr_sz = simd_oprsz(desc); \
1139 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1140 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1141 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1142 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
1143 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
1144 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
1148 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1149 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1150 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1151 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
1153 #undef DO_ZZZ_NTB
1155 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1156 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1158 intptr_t i, opr_sz = simd_oprsz(desc); \
1159 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
1160 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1161 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1162 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
1163 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1164 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
1168 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1169 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1170 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1172 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1173 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1174 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1176 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1177 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1178 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1180 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1181 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1182 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1184 #define DO_NMUL(N, M) -(N * M)
1186 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1187 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1188 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
1190 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1191 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1192 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
1194 #undef DO_ZZZW_ACC
1196 #define DO_XTNB(NAME, TYPE, OP) \
1197 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1199 intptr_t i, opr_sz = simd_oprsz(desc); \
1200 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1201 TYPE nn = *(TYPE *)(vn + i); \
1202 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
1203 *(TYPE *)(vd + i) = nn; \
1207 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
1208 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1210 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
1211 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1212 TYPE nn = *(TYPE *)(vn + i); \
1213 *(TYPEN *)(vd + i + odd) = OP(nn); \
1217 #define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
1218 #define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
1219 #define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)
1221 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1222 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1223 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1225 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1226 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1227 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1229 #define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
1230 #define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
1231 #define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)
1233 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1234 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1235 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1237 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1238 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1239 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1241 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1242 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1243 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1245 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1246 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1247 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1249 #undef DO_XTNB
1250 #undef DO_XTNT
1252 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1254 intptr_t i, opr_sz = simd_oprsz(desc);
1255 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1256 uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1257 uint32_t *a = va, *n = vn;
1258 uint64_t *d = vd, *m = vm;
1260 for (i = 0; i < opr_sz / 8; ++i) {
1261 uint32_t e1 = a[2 * i + H4(0)];
1262 uint32_t e2 = n[2 * i + sel] ^ inv;
1263 uint64_t c = extract64(m[i], 32, 1);
1264 /* Compute and store the entire 33-bit result at once. */
1265 d[i] = c + e1 + e2;
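/* Each 64-bit lane of D thus holds the 32-bit sum in bits [31:0] and
 * the carry-out in bit 32, which is where the next ADCLB/ADCLT reads
 * its carry-in (the extract64(m[i], 32, 1) above); e.g.
 * 0xffffffff + 1 with carry-in 0 stores 0x1_0000_0000.
 */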
1269 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1271 intptr_t i, opr_sz = simd_oprsz(desc);
1272 int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1273 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1274 uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1276 for (i = 0; i < opr_sz / 8; i += 2) {
1277 Int128 e1 = int128_make64(a[i]);
1278 Int128 e2 = int128_make64(n[i + sel] ^ inv);
1279 Int128 c = int128_make64(m[i + 1] & 1);
1280 Int128 r = int128_add(int128_add(e1, e2), c);
1281 d[i + 0] = int128_getlo(r);
1282 d[i + 1] = int128_gethi(r);
1286 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1287 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1289 intptr_t i, opr_sz = simd_oprsz(desc); \
1290 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1291 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1292 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1293 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1294 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1295 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1296 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
1300 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1301 do_sqdmull_h, DO_SQADD_H)
1302 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1303 do_sqdmull_s, DO_SQADD_S)
1304 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1305 do_sqdmull_d, do_sqadd_d)
1307 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1308 do_sqdmull_h, DO_SQSUB_H)
1309 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1310 do_sqdmull_s, DO_SQSUB_S)
1311 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1312 do_sqdmull_d, do_sqsub_d)
1314 #undef DO_SQDMLAL
1316 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1317 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1319 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1320 int rot = simd_data(desc); \
1321 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1322 bool sub_r = rot == 1 || rot == 2; \
1323 bool sub_i = rot >= 2; \
1324 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1325 for (i = 0; i < opr_sz; i += 2) { \
1326 TYPE elt1_a = n[H(i + sel_a)]; \
1327 TYPE elt2_a = m[H(i + sel_a)]; \
1328 TYPE elt2_b = m[H(i + sel_b)]; \
1329 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \
1330 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \
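/* For the plain CMLA expansion (OP = DO_CMLA below) the rotation
 * works out, per complex element pair, to:
 *   rot 0:  d_r = a_r + n_r * m_r;  d_i = a_i + n_r * m_i
 *   rot 1:  d_r = a_r - n_i * m_i;  d_i = a_i + n_i * m_r
 *   rot 2:  d_r = a_r - n_r * m_r;  d_i = a_i - n_r * m_i
 *   rot 3:  d_r = a_r + n_i * m_i;  d_i = a_i - n_i * m_r
 * which is what sel_a/sel_b and the sub_r/sub_i signs select.
 */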
1334 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
1336 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1337 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1338 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1339 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1341 #define DO_SQRDMLAH_B(N, M, A, S) \
1342 do_sqrdmlah_b(N, M, A, S, true)
1343 #define DO_SQRDMLAH_H(N, M, A, S) \
1344 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1345 #define DO_SQRDMLAH_S(N, M, A, S) \
1346 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1347 #define DO_SQRDMLAH_D(N, M, A, S) \
1348 do_sqrdmlah_d(N, M, A, S, true)
1350 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1351 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1352 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1353 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1355 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1356 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1358 intptr_t i, j, oprsz = simd_oprsz(desc); \
1359 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \
1360 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \
1361 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1362 bool sub_r = rot == 1 || rot == 2; \
1363 bool sub_i = rot >= 2; \
1364 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1365 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \
1366 TYPE elt2_a = m[H(i + idx + sel_a)]; \
1367 TYPE elt2_b = m[H(i + idx + sel_b)]; \
1368 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \
1369 TYPE elt1_a = n[H(i + j + sel_a)]; \
1370 d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \
1371 d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \
1376 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1377 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1379 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1380 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1382 #undef DO_CMLA
1383 #undef DO_CMLA_FUNC
1384 #undef DO_CMLA_IDX_FUNC
1385 #undef DO_SQRDMLAH_B
1386 #undef DO_SQRDMLAH_H
1387 #undef DO_SQRDMLAH_S
1388 #undef DO_SQRDMLAH_D
1390 /* Note N and M are 4 elements bundled into one unit. */
1391 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1392 int sel_a, int sel_b, int sub_i)
1394 for (int i = 0; i <= 1; i++) {
1395 int32_t elt1_r = (int8_t)(n >> (16 * i));
1396 int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1397 int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1398 int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1400 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1402 return a;
1405 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1406 int sel_a, int sel_b, int sub_i)
1408 for (int i = 0; i <= 1; i++) {
1409 int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1410 int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1411 int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1412 int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1414 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1416 return a;
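/* The callers below pass sel_a = rot & 1 and sub_i = -1 for rot 0
 * and 3 (+1 otherwise), so e.g. rot 0 accumulates
 * n_r * m_r - n_i * m_i and rot 1 accumulates n_r * m_i + n_i * m_r
 * for each complex pair, matching the CDOT rotation semantics.
 */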
1419 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1420 void *va, uint32_t desc)
1422 int opr_sz = simd_oprsz(desc);
1423 int rot = simd_data(desc);
1424 int sel_a = rot & 1;
1425 int sel_b = sel_a ^ 1;
1426 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1427 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1429 for (int e = 0; e < opr_sz / 4; e++) {
1430 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1434 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1435 void *va, uint32_t desc)
1437 int opr_sz = simd_oprsz(desc);
1438 int rot = simd_data(desc);
1439 int sel_a = rot & 1;
1440 int sel_b = sel_a ^ 1;
1441 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1442 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1444 for (int e = 0; e < opr_sz / 8; e++) {
1445 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1449 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1450 void *va, uint32_t desc)
1452 int opr_sz = simd_oprsz(desc);
1453 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1454 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1455 int sel_a = rot & 1;
1456 int sel_b = sel_a ^ 1;
1457 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1458 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1460 for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1461 uint32_t seg_m = m[seg + idx];
1462 for (int e = 0; e < 4; e++) {
1463 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1464 sel_a, sel_b, sub_i);
1469 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1470 void *va, uint32_t desc)
1472 int seg, opr_sz = simd_oprsz(desc);
1473 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1474 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1475 int sel_a = rot & 1;
1476 int sel_b = sel_a ^ 1;
1477 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1478 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1480 for (seg = 0; seg < opr_sz / 8; seg += 2) {
1481 uint64_t seg_m = m[seg + idx];
1482 for (int e = 0; e < 2; e++) {
1483 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1484 sel_a, sel_b, sub_i);
1489 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1490 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1492 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
1493 intptr_t i, j, idx = simd_data(desc); \
1494 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \
1495 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1496 TYPE mm = m[i]; \
1497 for (j = 0; j < segment; j++) { \
1498 d[i + j] = OP(n[i + j], mm, a[i + j]); \
1503 #define DO_SQRDMLAH_H(N, M, A) \
1504 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1505 #define DO_SQRDMLAH_S(N, M, A) \
1506 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1507 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1509 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1510 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1511 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1513 #define DO_SQRDMLSH_H(N, M, A) \
1514 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1515 #define DO_SQRDMLSH_S(N, M, A) \
1516 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1517 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1519 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1520 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1521 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1523 #undef DO_ZZXZ
1525 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1526 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1528 intptr_t i, j, oprsz = simd_oprsz(desc); \
1529 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1530 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1531 for (i = 0; i < oprsz; i += 16) { \
1532 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1533 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1534 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1535 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \
1536 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \
1541 #define DO_MLA(N, M, A) (A + N * M)
1543 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1544 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1545 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1546 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1548 #define DO_MLS(N, M, A) (A - N * M)
1550 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1551 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1552 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1553 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1555 #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M))
1556 #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M))
1558 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1559 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1561 #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M))
1562 #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M))
1564 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1565 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1567 #undef DO_MLA
1568 #undef DO_MLS
1569 #undef DO_ZZXW
1571 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1572 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1574 intptr_t i, j, oprsz = simd_oprsz(desc); \
1575 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1576 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1577 for (i = 0; i < oprsz; i += 16) { \
1578 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1579 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1580 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1581 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \
1586 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1587 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1589 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1590 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1592 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1593 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1595 #undef DO_ZZX
1597 #define DO_BITPERM(NAME, TYPE, OP) \
1598 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1600 intptr_t i, opr_sz = simd_oprsz(desc); \
1601 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1602 TYPE nn = *(TYPE *)(vn + i); \
1603 TYPE mm = *(TYPE *)(vm + i); \
1604 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
1608 static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1610 uint64_t res = 0;
1611 int db, rb = 0;
1613 for (db = 0; db < n; ++db) {
1614 if ((mask >> db) & 1) {
1615 res |= ((data >> db) & 1) << rb;
1616 ++rb;
1619 return res;
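/*
 * Worked example (illustrative): bitextract(0xb4, 0xca, 8).
 * The mask 0xca selects bit positions 1, 3, 6 and 7; the corresponding
 * data bits of 0xb4 (0b10110100) are 0, 0, 0, 1, packed LSB-first into
 * 0b1000 == 0x08.  This is the BEXT behaviour: gather the selected data
 * bits into the low end of the result.
 */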
1622 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1623 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1624 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1625 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1627 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1629 uint64_t res = 0;
1630 int rb, db = 0;
1632 for (rb = 0; rb < n; ++rb) {
1633 if ((mask >> rb) & 1) {
1634 res |= ((data >> db) & 1) << rb;
1635 ++db;
1638 return res;
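/*
 * Worked example (illustrative): bitdeposit(0x0b, 0xca, 8).
 * The low data bits 1, 1, 0, 1 are scattered into the set mask positions
 * 1, 3, 6 and 7, giving 0b10001010 == 0x8a.  BDEP is thus the scatter
 * counterpart of the BEXT gather above.
 */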
1641 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1642 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1643 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1644 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1646 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1648 uint64_t resm = 0, resu = 0;
1649 int db, rbm = 0, rbu = 0;
1651 for (db = 0; db < n; ++db) {
1652 uint64_t val = (data >> db) & 1;
1653 if ((mask >> db) & 1) {
1654 resm |= val << rbm++;
1655 } else {
1656 resu |= val << rbu++;
1660 return resm | (resu << rbm);
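/*
 * Worked example (illustrative): bitgroup(0xb4, 0xca, 8).
 * The data bits selected by the mask (positions 1, 3, 6, 7) compress to
 * 0b1000; the remaining bits (positions 0, 2, 4, 5) compress to 0b1110
 * and are placed above them, giving 0xe8.  This matches BGRP: masked
 * bits grouped at the bottom, unmasked bits at the top.
 */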
1663 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1664 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1665 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1666 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1668 #undef DO_BITPERM
1670 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \
1671 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1673 intptr_t i, opr_sz = simd_oprsz(desc); \
1674 int sub_r = simd_data(desc); \
1675 if (sub_r) { \
1676 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1677 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1678 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1679 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1680 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1681 acc_r = ADD_OP(acc_r, el2_i); \
1682 acc_i = SUB_OP(acc_i, el2_r); \
1683 *(TYPE *)(vd + H(i)) = acc_r; \
1684 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1686 } else { \
1687 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1688 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1689 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1690 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1691 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1692 acc_r = SUB_OP(acc_r, el2_i); \
1693 acc_i = ADD_OP(acc_i, el2_r); \
1694 *(TYPE *)(vd + H(i)) = acc_r; \
1695 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1700 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1701 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1702 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1703 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1705 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1706 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1707 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1708 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1710 #undef DO_CADD
1712 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1713 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1715 intptr_t i, opr_sz = simd_oprsz(desc); \
1716 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
1717 int shift = simd_data(desc) >> 1; \
1718 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1719 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
1720 *(TYPEW *)(vd + HW(i)) = nn << shift; \
1724 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1725 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1726 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1728 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1729 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1730 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1732 #undef DO_ZZI_SHLL
1734 /* Two-operand reduction expander, controlled by a predicate.
1735 * The difference between TYPERED and TYPERET has to do with
1736 * sign-extension. E.g. for SMAX, TYPERED must be signed,
1737 * but TYPERET must be unsigned so that e.g. a 32-bit value
1738 * is not sign-extended to the ABI uint64_t return type.
1740 /* ??? If we were to vectorize this by hand the reduction ordering
1741 * would change. For integer operands, this is perfectly fine.
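/*
 * Concrete illustration of the TYPERET choice: if SMAXV on 32-bit
 * elements reduces to -1, returning it as (uint32_t) yields
 * 0x00000000ffffffff in the uint64_t ABI return slot, whereas returning
 * it as (int32_t) would sign-extend to 0xffffffffffffffff.
 */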
1743 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1744 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1746 intptr_t i, opr_sz = simd_oprsz(desc); \
1747 TYPERED ret = INIT; \
1748 for (i = 0; i < opr_sz; ) { \
1749 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1750 do { \
1751 if (pg & 1) { \
1752 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
1753 ret = OP(ret, nn); \
1755 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
1756 } while (i & 15); \
1758 return (TYPERET)ret; \
1761 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
1762 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1764 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1765 TYPEE *n = vn; \
1766 uint8_t *pg = vg; \
1767 TYPER ret = INIT; \
1768 for (i = 0; i < opr_sz; i += 1) { \
1769 if (pg[H1(i)] & 1) { \
1770 TYPEE nn = n[i]; \
1771 ret = OP(ret, nn); \
1774 return ret; \
1777 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1778 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1779 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1780 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1782 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1783 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1784 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1785 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1787 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1788 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1789 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1790 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1792 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1793 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1794 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1796 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1797 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1798 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1799 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1801 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1802 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1803 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1804 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1806 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1807 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1808 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1809 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1811 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1812 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1813 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1814 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1816 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1817 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1818 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1819 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1821 #undef DO_VPZ
1822 #undef DO_VPZ_D
1824 /* Two vector operand, one scalar operand, unpredicated. */
1825 #define DO_ZZI(NAME, TYPE, OP) \
1826 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
1828 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1829 TYPE s = s64, *d = vd, *n = vn; \
1830 for (i = 0; i < opr_sz; ++i) { \
1831 d[i] = OP(n[i], s); \
1835 #define DO_SUBR(X, Y) (Y - X)
1837 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1838 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1839 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1840 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1842 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1843 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1844 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1845 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1847 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1848 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1849 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1850 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1852 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1853 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1854 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1855 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1857 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1858 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1859 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1860 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1862 #undef DO_ZZI
1864 #undef DO_AND
1865 #undef DO_ORR
1866 #undef DO_EOR
1867 #undef DO_BIC
1868 #undef DO_ADD
1869 #undef DO_SUB
1870 #undef DO_MAX
1871 #undef DO_MIN
1872 #undef DO_ABD
1873 #undef DO_MUL
1874 #undef DO_DIV
1875 #undef DO_ASR
1876 #undef DO_LSR
1877 #undef DO_LSL
1878 #undef DO_SUBR
1880 /* Similar to the ARM LastActiveElement pseudocode function, except the
1881 result is multiplied by the element size. This includes the not found
1882 indication; e.g. not found for esz=3 is -8. */
1883 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1885 uint64_t mask = pred_esz_masks[esz];
1886 intptr_t i = words;
1888 do {
1889 uint64_t this_g = g[--i] & mask;
1890 if (this_g) {
1891 return i * 64 + (63 - clz64(this_g));
1893 } while (i > 0);
1894 return (intptr_t)-1 << esz;
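/*
 * E.g. for esz == 2 (32-bit elements) the significant predicate bits sit
 * at positions 0, 4, 8, ...; a result of 12 therefore means the last
 * active element is element 3 (12 >> esz), i.e. byte offset 12, and the
 * "not found" result is (intptr_t)-1 << 2 == -4.
 */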
1897 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1899 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1900 uint32_t flags = PREDTEST_INIT;
1901 uint64_t *d = vd, *g = vg;
1902 intptr_t i = 0;
1904 do {
1905 uint64_t this_d = d[i];
1906 uint64_t this_g = g[i];
1908 if (this_g) {
1909 if (!(flags & 4)) {
1910 /* Set in D the first bit of G. */
1911 this_d |= this_g & -this_g;
1912 d[i] = this_d;
1914 flags = iter_predtest_fwd(this_d, this_g, flags);
1916 } while (++i < words);
1918 return flags;
1921 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1923 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1924 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1925 uint32_t flags = PREDTEST_INIT;
1926 uint64_t *d = vd, *g = vg, esz_mask;
1927 intptr_t i, next;
1929 next = last_active_element(vd, words, esz) + (1 << esz);
1930 esz_mask = pred_esz_masks[esz];
1932 /* Similar to the pseudocode for pnext, but scaled by ESZ
1933 so that we find the correct bit. */
1934 if (next < words * 64) {
1935 uint64_t mask = -1;
1937 if (next & 63) {
1938 mask = ~((1ull << (next & 63)) - 1);
1939 next &= -64;
1941 do {
1942 uint64_t this_g = g[next / 64] & esz_mask & mask;
1943 if (this_g != 0) {
1944 next = (next & -64) + ctz64(this_g);
1945 break;
1947 next += 64;
1948 mask = -1;
1949 } while (next < words * 64);
1952 i = 0;
1953 do {
1954 uint64_t this_d = 0;
1955 if (i == next / 64) {
1956 this_d = 1ull << (next & 63);
1958 d[i] = this_d;
1959 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1960 } while (++i < words);
1962 return flags;
1966  * Copy Zn into Zd, storing zeros into the inactive elements.
1967  * If inv, instead store zeros into the active elements.
1969 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1971 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1972 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1973 uint64_t *d = vd, *n = vn;
1974 uint8_t *pg = vg;
1976 for (i = 0; i < opr_sz; i += 1) {
1977 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1981 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1983 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1984 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1985 uint64_t *d = vd, *n = vn;
1986 uint8_t *pg = vg;
1988 for (i = 0; i < opr_sz; i += 1) {
1989 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
1993 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1995 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1996 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1997 uint64_t *d = vd, *n = vn;
1998 uint8_t *pg = vg;
2000 for (i = 0; i < opr_sz; i += 1) {
2001 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
2005 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
2007 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2008 uint64_t *d = vd, *n = vn;
2009 uint8_t *pg = vg;
2010 uint8_t inv = simd_data(desc);
2012 for (i = 0; i < opr_sz; i += 1) {
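/*
 * (pg & 1) ^ inv is 1 for the elements to keep: the active elements
 * when inv == 0, the inactive ones when inv == 1.  Negating that 0/1
 * value produces an all-ones or all-zeros 64-bit mask.
 */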
2013 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2017 /* Three-operand expander, immediate operand, controlled by a predicate.
2019 #define DO_ZPZI(NAME, TYPE, H, OP) \
2020 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2022 intptr_t i, opr_sz = simd_oprsz(desc); \
2023 TYPE imm = simd_data(desc); \
2024 for (i = 0; i < opr_sz; ) { \
2025 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2026 do { \
2027 if (pg & 1) { \
2028 TYPE nn = *(TYPE *)(vn + H(i)); \
2029 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
2031 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2032 } while (i & 15); \
2036 /* Similarly, specialized for 64-bit operands. */
2037 #define DO_ZPZI_D(NAME, TYPE, OP) \
2038 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2040 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2041 TYPE *d = vd, *n = vn; \
2042 TYPE imm = simd_data(desc); \
2043 uint8_t *pg = vg; \
2044 for (i = 0; i < opr_sz; i += 1) { \
2045 if (pg[H1(i)] & 1) { \
2046 TYPE nn = n[i]; \
2047 d[i] = OP(nn, imm); \
2052 #define DO_SHR(N, M) (N >> M)
2053 #define DO_SHL(N, M) (N << M)
2055 /* Arithmetic shift right for division. This rounds negative numbers
2056 toward zero as per signed division. Therefore before shifting,
2057 when N is negative, add 2**M-1. */
2058 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
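/*
 * E.g. DO_ASRD(-7, 2) computes (-7 + 3) >> 2 == -1, matching C's -7 / 4,
 * whereas the plain arithmetic shift -7 >> 2 == -2 rounds toward
 * negative infinity.
 */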
2060 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
2062 if (likely(sh < 64)) {
2063 return (x >> sh) + ((x >> (sh - 1)) & 1);
2064 } else if (sh == 64) {
2065 return x >> 63;
2066 } else {
2067 return 0;
2071 static inline int64_t do_srshr(int64_t x, unsigned sh)
2073 if (likely(sh < 64)) {
2074 return (x >> sh) + ((x >> (sh - 1)) & 1);
2075 } else {
2076 /* Rounding the sign bit always produces 0. */
2077 return 0;
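/*
 * Rounding examples (illustrative): do_urshr(44, 3) == 6 since
 * 44/8 == 5.5 rounds up, while do_urshr(43, 3) == 5.  Likewise
 * do_srshr(-44, 3) == -5: (-44 >> 3) == -6 plus the rounding bit
 * ((-44 >> 2) & 1) == 1.
 */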
2081 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2082 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2083 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2084 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2086 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2087 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2088 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2089 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2091 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2092 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2093 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2094 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2096 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2097 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2098 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2099 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2101 /* SVE2 bitwise shift by immediate */
2102 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2103 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2104 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2105 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2107 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2108 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2109 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2110 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2112 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2113 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2114 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2115 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2117 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2118 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2119 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2120 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2122 #define do_suqrshl_b(n, m) \
2123 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2124 #define do_suqrshl_h(n, m) \
2125 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2126 #define do_suqrshl_s(n, m) \
2127 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2128 #define do_suqrshl_d(n, m) \
2129 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
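/*
 * Note that the do_suqrshl_d macro expands to a call to the
 * four-argument function of the same name (declared with the other
 * saturating-shift helpers in vec_internal.h); the preprocessor does not
 * re-expand a macro name within its own expansion, so this is not
 * recursive.
 */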
2131 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2132 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2133 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2134 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2136 #undef DO_ASRD
2137 #undef DO_ZPZI
2138 #undef DO_ZPZI_D
2140 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2141 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2143 intptr_t i, opr_sz = simd_oprsz(desc); \
2144 int shift = simd_data(desc); \
2145 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2146 TYPEW nn = *(TYPEW *)(vn + i); \
2147 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \
2151 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
2152 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2154 intptr_t i, opr_sz = simd_oprsz(desc); \
2155 int shift = simd_data(desc); \
2156 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2157 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2158 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \
2162 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2163 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2164 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2166 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2167 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2168 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
2170 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2171 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2172 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2174 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2175 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2176 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2178 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2179 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2180 #define DO_SQSHRUN_D(x, sh) \
2181 do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
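/*
 * E.g. DO_SQSHRUN_H(2000, 2) shifts to 500 and saturates to 255, while
 * DO_SQSHRUN_H(-100, 2) yields -25 and saturates to 0; do_sat_bhs()
 * clamps the shifted value into the given [min, max] range.  The _D
 * form limits the shift count to 63 so the C shift stays well defined.
 */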
2183 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2184 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2185 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2187 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2188 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2189 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2191 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2192 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2193 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2195 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2196 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2197 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2199 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2200 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2201 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2203 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2204 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2205 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2207 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2208 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2209 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2211 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2212 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2213 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2215 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2216 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2217 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2219 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2220 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2221 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2223 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2224 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2225 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2227 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2228 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2229 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2231 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2232 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2233 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2235 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2236 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2237 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2239 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2240 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2241 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2243 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2244 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2245 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2247 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2248 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2249 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2251 #undef DO_SHRNB
2252 #undef DO_SHRNT
2254 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \
2255 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2257 intptr_t i, opr_sz = simd_oprsz(desc); \
2258 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2259 TYPEW nn = *(TYPEW *)(vn + i); \
2260 TYPEW mm = *(TYPEW *)(vm + i); \
2261 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \
2265 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \
2266 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2268 intptr_t i, opr_sz = simd_oprsz(desc); \
2269 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2270 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2271 TYPEW mm = *(TYPEW *)(vm + HW(i)); \
2272 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \
2276 #define DO_ADDHN(N, M, SH) ((N + M) >> SH)
2277 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2278 #define DO_SUBHN(N, M, SH) ((N - M) >> SH)
2279 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
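/*
 * E.g. narrowing 16 bits to 8 with SH == 8: DO_ADDHN(0x1234, 0x0101, 8)
 * returns the high half of 0x1335, i.e. 0x13.  DO_RADDHN first adds the
 * rounding constant 0x80, so a sum of 0x1380 rounds up to 0x14 where
 * DO_ADDHN would return 0x13.
 */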
2281 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2282 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2283 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2285 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2286 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2287 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2289 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2290 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2291 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2293 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2294 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2295 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2297 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2298 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2299 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2301 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2302 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2303 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2305 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2306 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2307 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2309 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2310 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2311 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2313 #undef DO_RSUBHN
2314 #undef DO_SUBHN
2315 #undef DO_RADDHN
2316 #undef DO_ADDHN
2318 #undef DO_BINOPNB
2320 /* Fully general four-operand expander, controlled by a predicate.
2322 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
2323 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2324 void *vg, uint32_t desc) \
2326 intptr_t i, opr_sz = simd_oprsz(desc); \
2327 for (i = 0; i < opr_sz; ) { \
2328 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2329 do { \
2330 if (pg & 1) { \
2331 TYPE nn = *(TYPE *)(vn + H(i)); \
2332 TYPE mm = *(TYPE *)(vm + H(i)); \
2333 TYPE aa = *(TYPE *)(va + H(i)); \
2334 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
2336 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2337 } while (i & 15); \
2341 /* Similarly, specialized for 64-bit operands. */
2342 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
2343 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2344 void *vg, uint32_t desc) \
2346 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2347 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
2348 uint8_t *pg = vg; \
2349 for (i = 0; i < opr_sz; i += 1) { \
2350 if (pg[H1(i)] & 1) { \
2351 TYPE aa = a[i], nn = n[i], mm = m[i]; \
2352 d[i] = OP(aa, nn, mm); \
2357 #define DO_MLA(A, N, M) (A + N * M)
2358 #define DO_MLS(A, N, M) (A - N * M)
2360 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2361 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2363 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2364 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2366 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2367 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2369 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2370 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2372 #undef DO_MLA
2373 #undef DO_MLS
2374 #undef DO_ZPZZZ
2375 #undef DO_ZPZZZ_D
2377 void HELPER(sve_index_b)(void *vd, uint32_t start,
2378 uint32_t incr, uint32_t desc)
2380 intptr_t i, opr_sz = simd_oprsz(desc);
2381 uint8_t *d = vd;
2382 for (i = 0; i < opr_sz; i += 1) {
2383 d[H1(i)] = start + i * incr;
2387 void HELPER(sve_index_h)(void *vd, uint32_t start,
2388 uint32_t incr, uint32_t desc)
2390 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2391 uint16_t *d = vd;
2392 for (i = 0; i < opr_sz; i += 1) {
2393 d[H2(i)] = start + i * incr;
2397 void HELPER(sve_index_s)(void *vd, uint32_t start,
2398 uint32_t incr, uint32_t desc)
2400 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2401 uint32_t *d = vd;
2402 for (i = 0; i < opr_sz; i += 1) {
2403 d[H4(i)] = start + i * incr;
2407 void HELPER(sve_index_d)(void *vd, uint64_t start,
2408 uint64_t incr, uint32_t desc)
2410 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2411 uint64_t *d = vd;
2412 for (i = 0; i < opr_sz; i += 1) {
2413 d[i] = start + i * incr;
2417 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2419 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2420 uint32_t sh = simd_data(desc);
2421 uint32_t *d = vd, *n = vn, *m = vm;
2422 for (i = 0; i < opr_sz; i += 1) {
2423 d[i] = n[i] + (m[i] << sh);
2427 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2429 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2430 uint64_t sh = simd_data(desc);
2431 uint64_t *d = vd, *n = vn, *m = vm;
2432 for (i = 0; i < opr_sz; i += 1) {
2433 d[i] = n[i] + (m[i] << sh);
2437 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2439 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2440 uint64_t sh = simd_data(desc);
2441 uint64_t *d = vd, *n = vn, *m = vm;
2442 for (i = 0; i < opr_sz; i += 1) {
2443 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2447 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2449 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2450 uint64_t sh = simd_data(desc);
2451 uint64_t *d = vd, *n = vn, *m = vm;
2452 for (i = 0; i < opr_sz; i += 1) {
2453 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2457 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2459 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2460 static const uint16_t coeff[] = {
2461 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2462 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2463 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2464 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2466 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2467 uint16_t *d = vd, *n = vn;
2469 for (i = 0; i < opr_sz; i++) {
2470 uint16_t nn = n[i];
2471 intptr_t idx = extract32(nn, 0, 5);
2472 uint16_t exp = extract32(nn, 5, 5);
2473 d[i] = coeff[idx] | (exp << 10);
2477 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2479 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2480 static const uint32_t coeff[] = {
2481 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2482 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2483 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2484 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2485 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2486 0x1ef532, 0x20b051, 0x227043, 0x243516,
2487 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2488 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2489 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2490 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2491 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2492 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2493 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2494 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2495 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2496 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2498 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2499 uint32_t *d = vd, *n = vn;
2501 for (i = 0; i < opr_sz; i++) {
2502 uint32_t nn = n[i];
2503 intptr_t idx = extract32(nn, 0, 6);
2504 uint32_t exp = extract32(nn, 6, 8);
2505 d[i] = coeff[idx] | (exp << 23);
2509 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2511 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2512 static const uint64_t coeff[] = {
2513 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2514 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2515 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2516 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2517 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2518 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2519 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2520 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2521 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2522 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2523 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2524 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2525 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2526 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2527 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2528 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2529 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2530 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2531 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2532 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2533 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2534 0xFA7C1819E90D8ull,
2536 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2537 uint64_t *d = vd, *n = vn;
2539 for (i = 0; i < opr_sz; i++) {
2540 uint64_t nn = n[i];
2541 intptr_t idx = extract32(nn, 0, 6);
2542 uint64_t exp = extract32(nn, 6, 11);
2543 d[i] = coeff[idx] | (exp << 52);
2547 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2549 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2550 uint16_t *d = vd, *n = vn, *m = vm;
2551 for (i = 0; i < opr_sz; i += 1) {
2552 uint16_t nn = n[i];
2553 uint16_t mm = m[i];
2554 if (mm & 1) {
2555 nn = float16_one;
2557 d[i] = nn ^ (mm & 2) << 14;
2561 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2563 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2564 uint32_t *d = vd, *n = vn, *m = vm;
2565 for (i = 0; i < opr_sz; i += 1) {
2566 uint32_t nn = n[i];
2567 uint32_t mm = m[i];
2568 if (mm & 1) {
2569 nn = float32_one;
2571 d[i] = nn ^ (mm & 2) << 30;
2575 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2577 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2578 uint64_t *d = vd, *n = vn, *m = vm;
2579 for (i = 0; i < opr_sz; i += 1) {
2580 uint64_t nn = n[i];
2581 uint64_t mm = m[i];
2582 if (mm & 1) {
2583 nn = float64_one;
2585 d[i] = nn ^ (mm & 2) << 62;
2590 * Signed saturating addition with scalar operand.
2593 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2595 intptr_t i, oprsz = simd_oprsz(desc);
2597 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2598 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2602 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2604 intptr_t i, oprsz = simd_oprsz(desc);
2606 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2607 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2611 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2613 intptr_t i, oprsz = simd_oprsz(desc);
2615 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2616 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2620 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2622 intptr_t i, oprsz = simd_oprsz(desc);
2624 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2625 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2630 * Unsigned saturating addition with scalar operand.
2633 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2635 intptr_t i, oprsz = simd_oprsz(desc);
2637 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2638 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2642 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2644 intptr_t i, oprsz = simd_oprsz(desc);
2646 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2647 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2651 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2653 intptr_t i, oprsz = simd_oprsz(desc);
2655 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2656 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2660 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2662 intptr_t i, oprsz = simd_oprsz(desc);
2664 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2665 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2669 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2671 intptr_t i, oprsz = simd_oprsz(desc);
2673 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2674 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2678 /* Two operand predicated copy immediate with merge. All valid immediates
2679 * can fit within 17 signed bits in the simd_data field.
2681 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2682 uint64_t mm, uint32_t desc)
2684 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2685 uint64_t *d = vd, *n = vn;
2686 uint8_t *pg = vg;
2688 mm = dup_const(MO_8, mm);
2689 for (i = 0; i < opr_sz; i += 1) {
2690 uint64_t nn = n[i];
2691 uint64_t pp = expand_pred_b(pg[H1(i)]);
2692 d[i] = (mm & pp) | (nn & ~pp);
2696 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2697 uint64_t mm, uint32_t desc)
2699 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2700 uint64_t *d = vd, *n = vn;
2701 uint8_t *pg = vg;
2703 mm = dup_const(MO_16, mm);
2704 for (i = 0; i < opr_sz; i += 1) {
2705 uint64_t nn = n[i];
2706 uint64_t pp = expand_pred_h(pg[H1(i)]);
2707 d[i] = (mm & pp) | (nn & ~pp);
2711 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2712 uint64_t mm, uint32_t desc)
2714 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2715 uint64_t *d = vd, *n = vn;
2716 uint8_t *pg = vg;
2718 mm = dup_const(MO_32, mm);
2719 for (i = 0; i < opr_sz; i += 1) {
2720 uint64_t nn = n[i];
2721 uint64_t pp = expand_pred_s(pg[H1(i)]);
2722 d[i] = (mm & pp) | (nn & ~pp);
2726 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2727 uint64_t mm, uint32_t desc)
2729 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2730 uint64_t *d = vd, *n = vn;
2731 uint8_t *pg = vg;
2733 for (i = 0; i < opr_sz; i += 1) {
2734 uint64_t nn = n[i];
2735 d[i] = (pg[H1(i)] & 1 ? mm : nn);
2739 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2741 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2742 uint64_t *d = vd;
2743 uint8_t *pg = vg;
2745 val = dup_const(MO_8, val);
2746 for (i = 0; i < opr_sz; i += 1) {
2747 d[i] = val & expand_pred_b(pg[H1(i)]);
2751 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2753 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2754 uint64_t *d = vd;
2755 uint8_t *pg = vg;
2757 val = dup_const(MO_16, val);
2758 for (i = 0; i < opr_sz; i += 1) {
2759 d[i] = val & expand_pred_h(pg[H1(i)]);
2763 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2765 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2766 uint64_t *d = vd;
2767 uint8_t *pg = vg;
2769 val = dup_const(MO_32, val);
2770 for (i = 0; i < opr_sz; i += 1) {
2771 d[i] = val & expand_pred_s(pg[H1(i)]);
2775 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2777 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2778 uint64_t *d = vd;
2779 uint8_t *pg = vg;
2781 for (i = 0; i < opr_sz; i += 1) {
2782 d[i] = (pg[H1(i)] & 1 ? val : 0);
2786 /* Big-endian hosts need to frob the byte indices. If the copy
2787  * happens to be 8-byte aligned, then no frobbing is necessary.
2789 static void swap_memmove(void *vd, void *vs, size_t n)
2791 uintptr_t d = (uintptr_t)vd;
2792 uintptr_t s = (uintptr_t)vs;
2793 uintptr_t o = (d | s | n) & 7;
2794 size_t i;
2796 #if !HOST_BIG_ENDIAN
2797 o = 0;
2798 #endif
2799 switch (o) {
2800 case 0:
2801 memmove(vd, vs, n);
2802 break;
2804 case 4:
2805 if (d < s || d >= s + n) {
2806 for (i = 0; i < n; i += 4) {
2807 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2809 } else {
2810 for (i = n; i > 0; ) {
2811 i -= 4;
2812 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2815 break;
2817 case 2:
2818 case 6:
2819 if (d < s || d >= s + n) {
2820 for (i = 0; i < n; i += 2) {
2821 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2823 } else {
2824 for (i = n; i > 0; ) {
2825 i -= 2;
2826 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2829 break;
2831 default:
2832 if (d < s || d >= s + n) {
2833 for (i = 0; i < n; i++) {
2834 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2836 } else {
2837 for (i = n; i > 0; ) {
2838 i -= 1;
2839 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2842 break;
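/*
 * The granule is chosen from (d | s | n) & 7: e.g. if both pointers and
 * the length are multiples of 4 but not of 8, o == 4 and the copy runs
 * in 32-bit units, which together with the H1_4() index adjustment keeps
 * the guest element layout correct on big-endian hosts.
 */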
2846 /* Similarly for memset of 0. */
2847 static void swap_memzero(void *vd, size_t n)
2849 uintptr_t d = (uintptr_t)vd;
2850 uintptr_t o = (d | n) & 7;
2851 size_t i;
2853 /* Usually, the first bit of a predicate is set, so N is 0. */
2854 if (likely(n == 0)) {
2855 return;
2858 #if !HOST_BIG_ENDIAN
2859 o = 0;
2860 #endif
2861 switch (o) {
2862 case 0:
2863 memset(vd, 0, n);
2864 break;
2866 case 4:
2867 for (i = 0; i < n; i += 4) {
2868 *(uint32_t *)H1_4(d + i) = 0;
2870 break;
2872 case 2:
2873 case 6:
2874 for (i = 0; i < n; i += 2) {
2875 *(uint16_t *)H1_2(d + i) = 0;
2877 break;
2879 default:
2880 for (i = 0; i < n; i++) {
2881 *(uint8_t *)H1(d + i) = 0;
2883 break;
2887 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2889 intptr_t opr_sz = simd_oprsz(desc);
2890 size_t n_ofs = simd_data(desc);
2891 size_t n_siz = opr_sz - n_ofs;
2893 if (vd != vm) {
2894 swap_memmove(vd, vn + n_ofs, n_siz);
2895 swap_memmove(vd + n_siz, vm, n_ofs);
2896 } else if (vd != vn) {
2897 swap_memmove(vd + n_siz, vd, n_ofs);
2898 swap_memmove(vd, vn + n_ofs, n_siz);
2899 } else {
2900 /* vd == vn == vm. Need temp space. */
2901 ARMVectorReg tmp;
2902 swap_memmove(&tmp, vm, n_ofs);
2903 swap_memmove(vd, vd + n_ofs, n_siz);
2904 memcpy(vd + n_siz, &tmp, n_ofs);
2908 #define DO_INSR(NAME, TYPE, H) \
2909 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2911 intptr_t opr_sz = simd_oprsz(desc); \
2912 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
2913 *(TYPE *)(vd + H(0)) = val; \
2916 DO_INSR(sve_insr_b, uint8_t, H1)
2917 DO_INSR(sve_insr_h, uint16_t, H1_2)
2918 DO_INSR(sve_insr_s, uint32_t, H1_4)
2919 DO_INSR(sve_insr_d, uint64_t, H1_8)
2921 #undef DO_INSR
2923 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2925 intptr_t i, j, opr_sz = simd_oprsz(desc);
2926 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2927 uint64_t f = *(uint64_t *)(vn + i);
2928 uint64_t b = *(uint64_t *)(vn + j);
2929 *(uint64_t *)(vd + i) = bswap64(b);
2930 *(uint64_t *)(vd + j) = bswap64(f);
2934 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2936 intptr_t i, j, opr_sz = simd_oprsz(desc);
2937 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2938 uint64_t f = *(uint64_t *)(vn + i);
2939 uint64_t b = *(uint64_t *)(vn + j);
2940 *(uint64_t *)(vd + i) = hswap64(b);
2941 *(uint64_t *)(vd + j) = hswap64(f);
2945 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2947 intptr_t i, j, opr_sz = simd_oprsz(desc);
2948 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2949 uint64_t f = *(uint64_t *)(vn + i);
2950 uint64_t b = *(uint64_t *)(vn + j);
2951 *(uint64_t *)(vd + i) = rol64(b, 32);
2952 *(uint64_t *)(vd + j) = rol64(f, 32);
2956 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2958 intptr_t i, j, opr_sz = simd_oprsz(desc);
2959 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2960 uint64_t f = *(uint64_t *)(vn + i);
2961 uint64_t b = *(uint64_t *)(vn + j);
2962 *(uint64_t *)(vd + i) = b;
2963 *(uint64_t *)(vd + j) = f;
2967 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2969 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2970 bool is_tbx, tb_impl_fn *fn)
2972 ARMVectorReg scratch;
2973 uintptr_t oprsz = simd_oprsz(desc);
2975 if (unlikely(vd == vn)) {
2976 vn = memcpy(&scratch, vn, oprsz);
2979 fn(vd, vn, NULL, vm, oprsz, is_tbx);
2982 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
2983 uint32_t desc, bool is_tbx, tb_impl_fn *fn)
2985 ARMVectorReg scratch;
2986 uintptr_t oprsz = simd_oprsz(desc);
2988 if (unlikely(vd == vn0)) {
2989 vn0 = memcpy(&scratch, vn0, oprsz);
2990 if (vd == vn1) {
2991 vn1 = vn0;
2993 } else if (unlikely(vd == vn1)) {
2994 vn1 = memcpy(&scratch, vn1, oprsz);
2997 fn(vd, vn0, vn1, vm, oprsz, is_tbx);
3000 #define DO_TB(SUFF, TYPE, H) \
3001 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \
3002 void *vm, uintptr_t oprsz, bool is_tbx) \
3004 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \
3005 uintptr_t i, nelem = oprsz / sizeof(TYPE); \
3006 for (i = 0; i < nelem; ++i) { \
3007 TYPE index = indexes[H1(i)], val = 0; \
3008 if (index < nelem) { \
3009 val = tbl0[H(index)]; \
3010 } else { \
3011 index -= nelem; \
3012 if (tbl1 && index < nelem) { \
3013 val = tbl1[H(index)]; \
3014 } else if (is_tbx) { \
3015 continue; \
3018 d[H(i)] = val; \
3021 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3023 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \
3025 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \
3026 void *vm, uint32_t desc) \
3028 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \
3030 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3032 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \
3035 DO_TB(b, uint8_t, H1)
3036 DO_TB(h, uint16_t, H2)
3037 DO_TB(s, uint32_t, H4)
3038 DO_TB(d, uint64_t, H8)
3040 #undef DO_TB
3042 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3043 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
3045 intptr_t i, opr_sz = simd_oprsz(desc); \
3046 TYPED *d = vd; \
3047 TYPES *n = vn; \
3048 ARMVectorReg tmp; \
3049 if (unlikely(vn - vd < opr_sz)) { \
3050 n = memcpy(&tmp, n, opr_sz / 2); \
3052 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
3053 d[HD(i)] = n[HS(i)]; \
3057 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3058 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3059 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3061 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3062 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3063 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3065 #undef DO_UNPK
3067 /* Mask of bits included in the even numbered predicates of width esz.
3068 * We also use this for expand_bits/compress_bits, and so extend the
3069 * same pattern out to 16-bit units.
3071 static const uint64_t even_bit_esz_masks[5] = {
3072 0x5555555555555555ull,
3073 0x3333333333333333ull,
3074 0x0f0f0f0f0f0f0f0full,
3075 0x00ff00ff00ff00ffull,
3076 0x0000ffff0000ffffull,
3079 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3080 * For N==0, this corresponds to the operation that in qemu/bitops.h
3081 * we call half_shuffle64; this algorithm is from Hacker's Delight,
3082 * section 7-2 Shuffling Bits.
3084 static uint64_t expand_bits(uint64_t x, int n)
3086 int i;
3088 x &= 0xffffffffu;
3089 for (i = 4; i >= n; i--) {
3090 int sh = 1 << i;
3091 x = ((x << sh) | x) & even_bit_esz_masks[i];
3093 return x;
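/*
 * E.g. expand_bits(0b1011, 0) == 0b01000101 (0x45): input bit i lands
 * at output bit 2*i with a zero above it.  For n == 1 the same widening
 * is applied to 2-bit units, and so on.
 */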
3096 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3097 * For N==0, this corresponds to the operation that in qemu/bitops.h
3098 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3099 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3101 static uint64_t compress_bits(uint64_t x, int n)
3103 int i;
3105 for (i = n; i <= 4; i++) {
3106 int sh = 1 << i;
3107 x &= even_bit_esz_masks[i];
3108 x = (x >> sh) | x;
3110 return x & 0xffffffffu;
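/*
 * This is the inverse of expand_bits: e.g. compress_bits(0x45, 0)
 * recovers 0b1011.  Bits in the discarded half of each unit are masked
 * away by the first iteration, and only the low 32 bits of the result
 * are significant.
 */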
3113 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3115 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3116 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3117 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3118 int esize = 1 << esz;
3119 uint64_t *d = vd;
3120 intptr_t i;
3122 if (oprsz <= 8) {
3123 uint64_t nn = *(uint64_t *)vn;
3124 uint64_t mm = *(uint64_t *)vm;
3125 int half = 4 * oprsz;
3127 nn = extract64(nn, high * half, half);
3128 mm = extract64(mm, high * half, half);
3129 nn = expand_bits(nn, esz);
3130 mm = expand_bits(mm, esz);
3131 d[0] = nn | (mm << esize);
3132 } else {
3133 ARMPredicateReg tmp;
3135 /* We produce output faster than we consume input.
3136 Therefore we must be mindful of possible overlap. */
3137 if (vd == vn) {
3138 vn = memcpy(&tmp, vn, oprsz);
3139 if (vd == vm) {
3140 vm = vn;
3142 } else if (vd == vm) {
3143 vm = memcpy(&tmp, vm, oprsz);
3145 if (high) {
3146 high = oprsz >> 1;
3149 if ((oprsz & 7) == 0) {
3150 uint32_t *n = vn, *m = vm;
3151 high >>= 2;
3153 for (i = 0; i < oprsz / 8; i++) {
3154 uint64_t nn = n[H4(high + i)];
3155 uint64_t mm = m[H4(high + i)];
3157 nn = expand_bits(nn, esz);
3158 mm = expand_bits(mm, esz);
3159 d[i] = nn | (mm << esize);
3161 } else {
3162 uint8_t *n = vn, *m = vm;
3163 uint16_t *d16 = vd;
3165 for (i = 0; i < oprsz / 2; i++) {
3166 uint16_t nn = n[H1(high + i)];
3167 uint16_t mm = m[H1(high + i)];
3169 nn = expand_bits(nn, esz);
3170 mm = expand_bits(mm, esz);
3171 d16[H2(i)] = nn | (mm << esize);
3177 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3179 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3180 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3181 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3182 uint64_t *d = vd, *n = vn, *m = vm;
3183 uint64_t l, h;
3184 intptr_t i;
3186 if (oprsz <= 8) {
3187 l = compress_bits(n[0] >> odd, esz);
3188 h = compress_bits(m[0] >> odd, esz);
3189 d[0] = l | (h << (4 * oprsz));
3190 } else {
3191 ARMPredicateReg tmp_m;
3192 intptr_t oprsz_16 = oprsz / 16;
3194 if ((vm - vd) < (uintptr_t)oprsz) {
3195 m = memcpy(&tmp_m, vm, oprsz);
3198 for (i = 0; i < oprsz_16; i++) {
3199 l = n[2 * i + 0];
3200 h = n[2 * i + 1];
3201 l = compress_bits(l >> odd, esz);
3202 h = compress_bits(h >> odd, esz);
3203 d[i] = l | (h << 32);
3207 * For VL which is not a multiple of 512, the results from M do not
3208 * align nicely with the uint64_t for D. Put the aligned results
3209 * from M into TMP_M and then copy it into place afterward.
3211 if (oprsz & 15) {
3212 int final_shift = (oprsz & 15) * 2;
3214 l = n[2 * i + 0];
3215 h = n[2 * i + 1];
3216 l = compress_bits(l >> odd, esz);
3217 h = compress_bits(h >> odd, esz);
3218 d[i] = l | (h << final_shift);
3220 for (i = 0; i < oprsz_16; i++) {
3221 l = m[2 * i + 0];
3222 h = m[2 * i + 1];
3223 l = compress_bits(l >> odd, esz);
3224 h = compress_bits(h >> odd, esz);
3225 tmp_m.p[i] = l | (h << 32);
3227 l = m[2 * i + 0];
3228 h = m[2 * i + 1];
3229 l = compress_bits(l >> odd, esz);
3230 h = compress_bits(h >> odd, esz);
3231 tmp_m.p[i] = l | (h << final_shift);
3233 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3234 } else {
3235 for (i = 0; i < oprsz_16; i++) {
3236 l = m[2 * i + 0];
3237 h = m[2 * i + 1];
3238 l = compress_bits(l >> odd, esz);
3239 h = compress_bits(h >> odd, esz);
3240 d[oprsz_16 + i] = l | (h << 32);
3246 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3248 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3249 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3250 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3251 uint64_t *d = vd, *n = vn, *m = vm;
3252 uint64_t mask;
3253 int shr, shl;
3254 intptr_t i;
3256 shl = 1 << esz;
3257 shr = 0;
3258 mask = even_bit_esz_masks[esz];
3259 if (odd) {
3260 mask <<= shl;
3261 shr = shl;
3262 shl = 0;
3265 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3266 uint64_t nn = (n[i] & mask) >> shr;
3267 uint64_t mm = (m[i] & mask) << shl;
3268 d[i] = nn + mm;
3272 /* Reverse units of 2**N bits. */
3273 static uint64_t reverse_bits_64(uint64_t x, int n)
3275 int i, sh;
3277 x = bswap64(x);
3278 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3279 uint64_t mask = even_bit_esz_masks[i];
3280 x = ((x & mask) << sh) | ((x >> sh) & mask);
3282 return x;
3285 static uint8_t reverse_bits_8(uint8_t x, int n)
3287 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3288 int i, sh;
3290 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3291 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3293 return x;
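/*
 * E.g. reverse_bits_8(0x12, 2) swaps only the two 4-bit units, giving
 * 0x21, while reverse_bits_8(0x01, 0) performs a full bit reversal to
 * 0x80; reverse_bits_64() is the same construction on a 64-bit word,
 * seeded with a byte swap.
 */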
3296 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3298 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3299 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3300 intptr_t i, oprsz_2 = oprsz / 2;
3302 if (oprsz <= 8) {
3303 uint64_t l = *(uint64_t *)vn;
3304 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3305 *(uint64_t *)vd = l;
3306 } else if ((oprsz & 15) == 0) {
3307 for (i = 0; i < oprsz_2; i += 8) {
3308 intptr_t ih = oprsz - 8 - i;
3309 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3310 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3311 *(uint64_t *)(vd + i) = h;
3312 *(uint64_t *)(vd + ih) = l;
3314 } else {
3315 for (i = 0; i < oprsz_2; i += 1) {
3316 intptr_t il = H1(i);
3317 intptr_t ih = H1(oprsz - 1 - i);
3318 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3319 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3320 *(uint8_t *)(vd + il) = h;
3321 *(uint8_t *)(vd + ih) = l;
3326 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3328 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3329 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3330 uint64_t *d = vd;
3331 intptr_t i;
3333 if (oprsz <= 8) {
3334 uint64_t nn = *(uint64_t *)vn;
3335 int half = 4 * oprsz;
3337 nn = extract64(nn, high * half, half);
3338 nn = expand_bits(nn, 0);
3339 d[0] = nn;
3340 } else {
3341 ARMPredicateReg tmp_n;
3343 /* We produce output faster than we consume input.
3344 Therefore we must be mindful of possible overlap. */
3345 if ((vn - vd) < (uintptr_t)oprsz) {
3346 vn = memcpy(&tmp_n, vn, oprsz);
3348 if (high) {
3349 high = oprsz >> 1;
3352 if ((oprsz & 7) == 0) {
3353 uint32_t *n = vn;
3354 high >>= 2;
3356 for (i = 0; i < oprsz / 8; i++) {
3357 uint64_t nn = n[H4(high + i)];
3358 d[i] = expand_bits(nn, 0);
3360 } else {
3361 uint16_t *d16 = vd;
3362 uint8_t *n = vn;
3364 for (i = 0; i < oprsz / 2; i++) {
3365 uint16_t nn = n[H1(high + i)];
3366 d16[H2(i)] = expand_bits(nn, 0);
3372 #define DO_ZIP(NAME, TYPE, H) \
3373 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3375 intptr_t oprsz = simd_oprsz(desc); \
3376 intptr_t odd_ofs = simd_data(desc); \
3377 intptr_t i, oprsz_2 = oprsz / 2; \
3378 ARMVectorReg tmp_n, tmp_m; \
3379 /* We produce output faster than we consume input. \
3380 Therefore we must be mindful of possible overlap. */ \
3381 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
3382 vn = memcpy(&tmp_n, vn, oprsz_2); \
3384 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3385 vm = memcpy(&tmp_m, vm, oprsz_2); \
3387 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3388 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3389 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \
3390 *(TYPE *)(vm + odd_ofs + H(i)); \
3392 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3393 memset(vd + oprsz - 16, 0, 16); \
3397 DO_ZIP(sve_zip_b, uint8_t, H1)
3398 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3399 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3400 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3401 DO_ZIP(sve2_zip_q, Int128, )
3403 #define DO_UZP(NAME, TYPE, H) \
3404 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3406 intptr_t oprsz = simd_oprsz(desc); \
3407 intptr_t odd_ofs = simd_data(desc); \
3408 intptr_t i, p; \
3409 ARMVectorReg tmp_m; \
3410 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3411 vm = memcpy(&tmp_m, vm, oprsz); \
3413 i = 0, p = odd_ofs; \
3414 do { \
3415 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \
3416 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
3417 } while (p < oprsz); \
3418 p -= oprsz; \
3419 do { \
3420 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \
3421 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
3422 } while (p < oprsz); \
3423 tcg_debug_assert(i == oprsz); \
3426 DO_UZP(sve_uzp_b, uint8_t, H1)
3427 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3428 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3429 DO_UZP(sve_uzp_d, uint64_t, H1_8)
3430 DO_UZP(sve2_uzp_q, Int128, )
3432 #define DO_TRN(NAME, TYPE, H) \
3433 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3435 intptr_t oprsz = simd_oprsz(desc); \
3436 intptr_t odd_ofs = simd_data(desc); \
3437 intptr_t i; \
3438 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
3439 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
3440 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
3441 *(TYPE *)(vd + H(i + 0)) = ae; \
3442 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
3444 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3445 memset(vd + oprsz - 16, 0, 16); \
3449 DO_TRN(sve_trn_b, uint8_t, H1)
3450 DO_TRN(sve_trn_h, uint16_t, H1_2)
3451 DO_TRN(sve_trn_s, uint32_t, H1_4)
3452 DO_TRN(sve_trn_d, uint64_t, H1_8)
3453 DO_TRN(sve2_trn_q, Int128, )
3455 #undef DO_ZIP
3456 #undef DO_UZP
3457 #undef DO_TRN
3459 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3461 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3462 uint32_t *d = vd, *n = vn;
3463 uint8_t *pg = vg;
3465 for (i = j = 0; i < opr_sz; i++) {
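/*
 * Each predicate byte covers two 32-bit elements; their governing bits
 * are bits 0 and 4 of that byte.
 */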
3466 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3467 d[H4(j)] = n[H4(i)];
3468 j++;
3471 for (; j < opr_sz; j++) {
3472 d[H4(j)] = 0;
3476 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3478 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3479 uint64_t *d = vd, *n = vn;
3480 uint8_t *pg = vg;
3482 for (i = j = 0; i < opr_sz; i++) {
3483 if (pg[H1(i)] & 1) {
3484 d[j] = n[i];
3485 j++;
3488 for (; j < opr_sz; j++) {
3489 d[j] = 0;
3493 /* Similar to the ARM LastActiveElement pseudocode function, except the
3494 * result is multiplied by the element size. This includes the not found
3495 * indication; e.g. not found for esz=3 is -8.
3497 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3499 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3500 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3502 return last_active_element(vg, words, esz);
3505 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3507 intptr_t opr_sz = simd_oprsz(desc) / 8;
3508 int esz = simd_data(desc);
3509 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3510 intptr_t i, first_i, last_i;
3511 ARMVectorReg tmp;
3513 first_i = last_i = 0;
3514 first_g = last_g = 0;
3516 /* Find the extent of the active elements within VG. */
3517 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3518 pg = *(uint64_t *)(vg + i) & mask;
3519 if (pg) {
3520 if (last_g == 0) {
3521 last_g = pg;
3522 last_i = i;
3524 first_g = pg;
3525 first_i = i;
3529 len = 0;
3530 if (first_g != 0) {
3531 first_i = first_i * 8 + ctz64(first_g);
3532 last_i = last_i * 8 + 63 - clz64(last_g);
3533 len = last_i - first_i + (1 << esz);
3534 if (vd == vm) {
3535 vm = memcpy(&tmp, vm, opr_sz * 8);
3537         swap_memmove(vd, vn + first_i, len);
3538     }
3539     swap_memmove(vd + len, vm, opr_sz * 8 - len);
3540 }
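/*
 * Example for SPLICE with .s elements: if the first and last active
 * predicate bits cover elements 2 and 4 of Zn, then first_i = 8,
 * last_i = 16 and len = 12 bytes, so the result is
 * { Zn[2], Zn[3], Zn[4], Zm[0], Zm[1], ... } -- the byte range from the
 * first through the last active element of Zn (including any inactive
 * elements inside it), followed by the leading elements of Zm.
 */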
3542 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3543 void *vg, uint32_t desc)
3545 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3546 uint64_t *d = vd, *n = vn, *m = vm;
3547 uint8_t *pg = vg;
3549 for (i = 0; i < opr_sz; i += 1) {
3550 uint64_t nn = n[i], mm = m[i];
3551 uint64_t pp = expand_pred_b(pg[H1(i)]);
3552 d[i] = (nn & pp) | (mm & ~pp);
3556 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3557 void *vg, uint32_t desc)
3559 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3560 uint64_t *d = vd, *n = vn, *m = vm;
3561 uint8_t *pg = vg;
3563 for (i = 0; i < opr_sz; i += 1) {
3564 uint64_t nn = n[i], mm = m[i];
3565 uint64_t pp = expand_pred_h(pg[H1(i)]);
3566 d[i] = (nn & pp) | (mm & ~pp);
3570 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3571 void *vg, uint32_t desc)
3573 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3574 uint64_t *d = vd, *n = vn, *m = vm;
3575 uint8_t *pg = vg;
3577 for (i = 0; i < opr_sz; i += 1) {
3578 uint64_t nn = n[i], mm = m[i];
3579 uint64_t pp = expand_pred_s(pg[H1(i)]);
3580 d[i] = (nn & pp) | (mm & ~pp);
3584 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3585 void *vg, uint32_t desc)
3587 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3588 uint64_t *d = vd, *n = vn, *m = vm;
3589 uint8_t *pg = vg;
3591 for (i = 0; i < opr_sz; i += 1) {
3592 uint64_t nn = n[i], mm = m[i];
3593         d[i] = (pg[H1(i)] & 1 ? nn : mm);
3594     }
3595 }
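/*
 * The byte/half/word forms of SEL above rely on expanding one predicate
 * byte into a 64-bit lane mask.  For example expand_pred_b(0x0f) is
 * 0x00000000ffffffff, so with that guard byte the low four bytes of the
 * 64-bit lane come from Zn and the high four from Zm.
 */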
3597 /* Two operand comparison controlled by a predicate.
3598  * ??? It is very tempting to expand this inline
3599  * with x86 instructions, e.g.
3601 * vcmpeqw zm, zn, %ymm0
3602 * vpmovmskb %ymm0, %eax
3603 * and $0x5555, %eax
3604 * and pg, %eax
3606 * or even aarch64, e.g.
3608 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3609 * cmeq v0.8h, zn, zm
3610 * and v0.8h, v0.8h, mask
3611 * addv h0, v0.8h
3612 * and v0.8b, pg
3614 * However, coming up with an abstraction that allows vector inputs and
3615 * a scalar output, and also handles the byte-ordering of sub-uint64_t
3616  * scalar outputs, is tricky.
3617  */
3618 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
3619 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3621 intptr_t opr_sz = simd_oprsz(desc); \
3622 uint32_t flags = PREDTEST_INIT; \
3623 intptr_t i = opr_sz; \
3624 do { \
3625 uint64_t out = 0, pg; \
3626 do { \
3627 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3628 TYPE nn = *(TYPE *)(vn + H(i)); \
3629 TYPE mm = *(TYPE *)(vm + H(i)); \
3630 out |= nn OP mm; \
3631 } while (i & 63); \
3632 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3633 out &= pg; \
3634 *(uint64_t *)(vd + (i >> 3)) = out; \
3635 flags = iter_predtest_bwd(out, pg, flags); \
3636 } while (i > 0); \
3637 return flags; \
3640 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3641 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3642 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3643 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3644 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3645 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3646 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3647 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
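/*
 * The MASK constants above mirror the predicate encoding: there is one
 * predicate bit per vector byte, and only the bit for the least
 * significant byte of each element is significant.  In the helpers a
 * true comparison for the element at byte offset b sets bit b of the
 * output word, so e.g. for .h elements element k sets bit 2*k, and
 * 0x5555... keeps exactly those positions of the governing predicate.
 */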
3649 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
3650 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3651 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3652 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3654 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
3655 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3656 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3657 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3659 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
3660 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3661 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3662 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3664 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
3665 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3666 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3667 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3669 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
3670 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3671 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3672 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3674 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
3675 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3676 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3677 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3679 #undef DO_CMP_PPZZ_B
3680 #undef DO_CMP_PPZZ_H
3681 #undef DO_CMP_PPZZ_S
3682 #undef DO_CMP_PPZZ_D
3683 #undef DO_CMP_PPZZ
3685 /* Similar, but the second source is "wide". */
3686 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
3687 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3689 intptr_t opr_sz = simd_oprsz(desc); \
3690 uint32_t flags = PREDTEST_INIT; \
3691 intptr_t i = opr_sz; \
3692 do { \
3693 uint64_t out = 0, pg; \
3694 do { \
3695 TYPEW mm = *(TYPEW *)(vm + i - 8); \
3696 do { \
3697 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3698 TYPE nn = *(TYPE *)(vn + H(i)); \
3699 out |= nn OP mm; \
3700 } while (i & 7); \
3701 } while (i & 63); \
3702 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3703 out &= pg; \
3704 *(uint64_t *)(vd + (i >> 3)) = out; \
3705 flags = iter_predtest_bwd(out, pg, flags); \
3706 } while (i > 0); \
3707 return flags; \
3710 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3711 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
3712 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3713 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3714 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3715 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3717 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
3718 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3719 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3721 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
3722 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3723 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3725 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
3726 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
3727 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
3729 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
3730 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
3731 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
3733 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
3734 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3735 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3737 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
3738 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3739 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3741 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
3742 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
3743 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
3745 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
3746 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
3747 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
3749 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
3750 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3751 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3753 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
3754 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3755 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3757 #undef DO_CMP_PPZW_B
3758 #undef DO_CMP_PPZW_H
3759 #undef DO_CMP_PPZW_S
3760 #undef DO_CMP_PPZW
3762 /* Similar, but the second source is immediate. */
3763 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
3764 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
3766 intptr_t opr_sz = simd_oprsz(desc); \
3767 uint32_t flags = PREDTEST_INIT; \
3768 TYPE mm = simd_data(desc); \
3769 intptr_t i = opr_sz; \
3770 do { \
3771 uint64_t out = 0, pg; \
3772 do { \
3773 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3774 TYPE nn = *(TYPE *)(vn + H(i)); \
3775 out |= nn OP mm; \
3776 } while (i & 63); \
3777 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3778 out &= pg; \
3779 *(uint64_t *)(vd + (i >> 3)) = out; \
3780 flags = iter_predtest_bwd(out, pg, flags); \
3781 } while (i > 0); \
3782 return flags; \
3785 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3786 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3787 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3788 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3789 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3790 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3791 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3792 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3794 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
3795 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3796 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3797 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3799 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
3800 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3801 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3802 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3804 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
3805 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3806 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3807 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3809 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
3810 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3811 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3812 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3814 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
3815 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3816 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3817 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3819 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
3820 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3821 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3822 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3824 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
3825 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3826 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3827 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3829 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
3830 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3831 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3832 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3834 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
3835 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3836 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3837 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3839 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
3840 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3841 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3842 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3844 #undef DO_CMP_PPZI_B
3845 #undef DO_CMP_PPZI_H
3846 #undef DO_CMP_PPZI_S
3847 #undef DO_CMP_PPZI_D
3848 #undef DO_CMP_PPZI
3850 /* Similar to the ARM LastActive pseudocode function. */
3851 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3853 intptr_t i;
3855 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3856 uint64_t pg = *(uint64_t *)(vg + i);
3857 if (pg) {
3858 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3861 return 0;
3864 /* Compute a mask into RETB that is true for all G, up to and including
3865 * (if after) or excluding (if !after) the first G & N.
3866 * Return true if BRK found.
3868 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3869                         bool brk, bool after)
3870 {
3871     uint64_t b;
3873     if (brk) {
3874         b = 0;
3875     } else if ((g & n) == 0) {
3876         /* For all G, no N are set; break not found.  */
3877         b = g;
3878     } else {
3879         /* Break somewhere in N.  Locate it.  */
3880         b = g & n;            /* guard true, pred true */
3881         b = b & -b;           /* first such */
3882         if (after) {
3883             b = b | (b - 1);  /* break after same */
3884         } else {
3885             b = b - 1;        /* break before same */
3886         }
3887         brk = true;
3888     }
3890     *retb = b;
3891     return brk;
3892 }
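/*
 * Worked example: g = 0xff, n = 0x10, brk = false.  The first predicated
 * true element is bit 4, so b = 0x10 after isolating it.  With after =
 * true (BRKA) the result mask is 0x1f, i.e. the matching element itself
 * stays active; with after = false (BRKB) it is 0x0f.  If instead
 * (g & n) == 0, every guarded element stays active (b = g) and no break
 * is reported; once brk is true, all subsequent words are cleared.
 */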
3894 /* Compute a zeroing BRK. */
3895 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3896 intptr_t oprsz, bool after)
3898 bool brk = false;
3899 intptr_t i;
3901 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3902 uint64_t this_b, this_g = g[i];
3904 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3905 d[i] = this_b & this_g;
3909 /* Likewise, but also compute flags. */
3910 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3911 intptr_t oprsz, bool after)
3913 uint32_t flags = PREDTEST_INIT;
3914 bool brk = false;
3915 intptr_t i;
3917 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3918 uint64_t this_b, this_d, this_g = g[i];
3920 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3921 d[i] = this_d = this_b & this_g;
3922 flags = iter_predtest_fwd(this_d, this_g, flags);
3924 return flags;
3927 /* Compute a merging BRK. */
3928 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3929 intptr_t oprsz, bool after)
3931 bool brk = false;
3932 intptr_t i;
3934 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3935 uint64_t this_b, this_g = g[i];
3937 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3938 d[i] = (this_b & this_g) | (d[i] & ~this_g);
3942 /* Likewise, but also compute flags. */
3943 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3944 intptr_t oprsz, bool after)
3946 uint32_t flags = PREDTEST_INIT;
3947 bool brk = false;
3948 intptr_t i;
3950 for (i = 0; i < oprsz / 8; ++i) {
3951 uint64_t this_b, this_d = d[i], this_g = g[i];
3953 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3954 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3955 flags = iter_predtest_fwd(this_d, this_g, flags);
3957 return flags;
3960 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3962 /* It is quicker to zero the whole predicate than loop on OPRSZ.
3963 * The compiler should turn this into 4 64-bit integer stores.
3965 memset(d, 0, sizeof(ARMPredicateReg));
3966 return PREDTEST_INIT;
3969 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3970 uint32_t pred_desc)
3972 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3973 if (last_active_pred(vn, vg, oprsz)) {
3974 compute_brk_z(vd, vm, vg, oprsz, true);
3975 } else {
3976 do_zero(vd, oprsz);
3980 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
3981 uint32_t pred_desc)
3983 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3984 if (last_active_pred(vn, vg, oprsz)) {
3985 return compute_brks_z(vd, vm, vg, oprsz, true);
3986 } else {
3987 return do_zero(vd, oprsz);
3991 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
3992 uint32_t pred_desc)
3994 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3995 if (last_active_pred(vn, vg, oprsz)) {
3996 compute_brk_z(vd, vm, vg, oprsz, false);
3997 } else {
3998 do_zero(vd, oprsz);
4002 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4003 uint32_t pred_desc)
4005 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4006 if (last_active_pred(vn, vg, oprsz)) {
4007 return compute_brks_z(vd, vm, vg, oprsz, false);
4008 } else {
4009 return do_zero(vd, oprsz);
4013 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4015 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4016 compute_brk_z(vd, vn, vg, oprsz, true);
4019 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4021 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4022 return compute_brks_z(vd, vn, vg, oprsz, true);
4025 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4027 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4028 compute_brk_z(vd, vn, vg, oprsz, false);
4031 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4033 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4034 return compute_brks_z(vd, vn, vg, oprsz, false);
4037 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4039 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4040 compute_brk_m(vd, vn, vg, oprsz, true);
4043 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4045 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4046 return compute_brks_m(vd, vn, vg, oprsz, true);
4049 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4051 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4052 compute_brk_m(vd, vn, vg, oprsz, false);
4055 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4057 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4058 return compute_brks_m(vd, vn, vg, oprsz, false);
4061 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4063 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4064 if (!last_active_pred(vn, vg, oprsz)) {
4065 do_zero(vd, oprsz);
4069 /* As if PredTest(Ones(PL), D, esz). */
4070 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4071 uint64_t esz_mask)
4073 uint32_t flags = PREDTEST_INIT;
4074 intptr_t i;
4076 for (i = 0; i < oprsz / 8; i++) {
4077 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4079 if (oprsz & 7) {
4080 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4081 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4083 return flags;
4086 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4088 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4089 if (last_active_pred(vn, vg, oprsz)) {
4090 return predtest_ones(vd, oprsz, -1);
4091 } else {
4092 return do_zero(vd, oprsz);
4096 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4098 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4099 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4100 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4101 intptr_t i;
4103 for (i = 0; i < words; ++i) {
4104 uint64_t t = n[i] & g[i] & mask;
4105 sum += ctpop64(t);
4107 return sum;
4110 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4112 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4113 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4114 uint64_t esz_mask = pred_esz_masks[esz];
4115 ARMPredicateReg *d = vd;
4116 uint32_t flags;
4117 intptr_t i;
4119 /* Begin with a zero predicate register. */
4120 flags = do_zero(d, oprsz);
4121 if (count == 0) {
4122 return flags;
4125 /* Set all of the requested bits. */
4126 for (i = 0; i < count / 64; ++i) {
4127 d->p[i] = esz_mask;
4129 if (count & 63) {
4130 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4133     return predtest_ones(d, oprsz, esz_mask);
4134 }
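/*
 * Example: for .s elements (esz_mask = 0x1111...) and count = 12
 * predicate bits, d->p[0] = MAKE_64BIT_MASK(0, 12) & esz_mask = 0x111,
 * i.e. the first three 32-bit elements are active, and the returned
 * NZCV is as if PTEST of that result against an all-true governor.
 */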
4136 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4138 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4139 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4140 uint64_t esz_mask = pred_esz_masks[esz];
4141 ARMPredicateReg *d = vd;
4142 intptr_t i, invcount, oprbits;
4143 uint64_t bits;
4145 if (count == 0) {
4146 return do_zero(d, oprsz);
4149 oprbits = oprsz * 8;
4150 tcg_debug_assert(count <= oprbits);
4152 bits = esz_mask;
4153 if (oprbits & 63) {
4154 bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4157 invcount = oprbits - count;
4158 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4159 d->p[i] = bits;
4160 bits = esz_mask;
4163 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4165 while (--i >= 0) {
4166 d->p[i] = 0;
4169 return predtest_ones(d, oprsz, esz_mask);
4172 /* Recursive reduction on a function;
4173  * cf. the ARM ARM function ReducePredicated.
4175 * While it would be possible to write this without the DATA temporary,
4176 * it is much simpler to process the predicate register this way.
4177 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4178 * little to gain with a more complex non-recursive form.
4180 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
4181 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4183 if (n == 1) { \
4184 return *data; \
4185 } else { \
4186 uintptr_t half = n / 2; \
4187 TYPE lo = NAME##_reduce(data, status, half); \
4188 TYPE hi = NAME##_reduce(data + half, status, half); \
4189 return TYPE##_##FUNC(lo, hi, status); \
4192 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
4194 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
4195 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
4196 for (i = 0; i < oprsz; ) { \
4197 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4198 do { \
4199 TYPE nn = *(TYPE *)(vn + H(i)); \
4200 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
4201 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
4202 } while (i & 15); \
4204 for (; i < maxsz; i += sizeof(TYPE)) { \
4205 *(TYPE *)((void *)data + i) = IDENT; \
4207 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
4210 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
4211 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
4212 DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)
4214 /* Identity is floatN_default_nan, without the function call. */
4215 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
4216 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
4217 DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)
4219 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
4220 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
4221 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)
4223 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
4224 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
4225 DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)
4227 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
4228 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
4229 DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))
4231 #undef DO_REDUCE
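/*
 * The reduction above is a balanced binary tree: e.g. for eight data
 * lanes d0..d7 of sve_faddv_s the result is
 *   ((d0+d1) + (d2+d3)) + ((d4+d5) + (d6+d7)).
 * Inactive lanes and the tail up to maxsz are seeded with IDENT (zero
 * for add, the default NaN for minnum/maxnum, +/-Inf for min/max),
 * chosen so that the padding lanes act as neutral inputs.
 */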
4233 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4234 void *status, uint32_t desc)
4236 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4237 float16 result = nn;
4239 do {
4240 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4241 do {
4242 if (pg & 1) {
4243 float16 mm = *(float16 *)(vm + H1_2(i));
4244 result = float16_add(result, mm, status);
4246 i += sizeof(float16), pg >>= sizeof(float16);
4247 } while (i & 15);
4248 } while (i < opr_sz);
4250 return result;
4253 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4254 void *status, uint32_t desc)
4256 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4257 float32 result = nn;
4259 do {
4260 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4261 do {
4262 if (pg & 1) {
4263 float32 mm = *(float32 *)(vm + H1_2(i));
4264 result = float32_add(result, mm, status);
4266 i += sizeof(float32), pg >>= sizeof(float32);
4267 } while (i & 15);
4268 } while (i < opr_sz);
4270 return result;
4273 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4274 void *status, uint32_t desc)
4276 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4277 uint64_t *m = vm;
4278 uint8_t *pg = vg;
4280 for (i = 0; i < opr_sz; i++) {
4281 if (pg[H1(i)] & 1) {
4282 nn = float64_add(nn, m[i], status);
4286 return nn;
4289 /* Fully general three-operand expander, controlled by a predicate,
4290  * with the extra float_status parameter.
4291  */
4292 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
4293 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4294 void *status, uint32_t desc) \
4296 intptr_t i = simd_oprsz(desc); \
4297 uint64_t *g = vg; \
4298 do { \
4299 uint64_t pg = g[(i - 1) >> 6]; \
4300 do { \
4301 i -= sizeof(TYPE); \
4302 if (likely((pg >> (i & 63)) & 1)) { \
4303 TYPE nn = *(TYPE *)(vn + H(i)); \
4304 TYPE mm = *(TYPE *)(vm + H(i)); \
4305 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4307 } while (i & 63); \
4308 } while (i != 0); \
4311 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4312 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4313 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
4315 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4316 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4317 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
4319 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4320 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4321 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
4323 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4324 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4325 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
4327 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4328 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4329 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
4331 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4332 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4333 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
4335 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4336 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4337 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
4339 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4340 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4341 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4343 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4345 return float16_abs(float16_sub(a, b, s));
4348 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4350 return float32_abs(float32_sub(a, b, s));
4353 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4355 return float64_abs(float64_sub(a, b, s));
4358 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4359 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4360 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4362 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4364 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4365 return float64_scalbn(a, b_int, s);
4368 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4369 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4370 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
4372 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4373 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4374 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
4376 #undef DO_ZPZZ_FP
4378 /* Three-operand expander, with one scalar operand, controlled by
4379 * a predicate, with the extra float_status parameter.
4381 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4382 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
4383 void *status, uint32_t desc) \
4385 intptr_t i = simd_oprsz(desc); \
4386 uint64_t *g = vg; \
4387 TYPE mm = scalar; \
4388 do { \
4389 uint64_t pg = g[(i - 1) >> 6]; \
4390 do { \
4391 i -= sizeof(TYPE); \
4392 if (likely((pg >> (i & 63)) & 1)) { \
4393 TYPE nn = *(TYPE *)(vn + H(i)); \
4394 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4396 } while (i & 63); \
4397 } while (i != 0); \
4400 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4401 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4402 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
4404 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4405 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4406 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
4408 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4409 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4410 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4412 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4414 return float16_sub(b, a, s);
4417 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4419 return float32_sub(b, a, s);
4422 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4424 return float64_sub(b, a, s);
4427 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4428 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4429 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
4431 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4432 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4433 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
4435 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4436 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4437 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
4439 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4440 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4441 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
4443 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4444 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4445 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4447 /* Fully general two-operand expander, controlled by a predicate,
4448  * with the extra float_status parameter.
4449  */
4450 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
4451 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
4453 intptr_t i = simd_oprsz(desc); \
4454 uint64_t *g = vg; \
4455 do { \
4456 uint64_t pg = g[(i - 1) >> 6]; \
4457 do { \
4458 i -= sizeof(TYPE); \
4459 if (likely((pg >> (i & 63)) & 1)) { \
4460 TYPE nn = *(TYPE *)(vn + H(i)); \
4461 *(TYPE *)(vd + H(i)) = OP(nn, status); \
4463 } while (i & 63); \
4464 } while (i != 0); \
4467 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
4468 * FZ16. When converting from fp16, this affects flushing input denormals;
4469 * when converting to fp16, this affects flushing output denormals.
4471 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4473 bool save = get_flush_inputs_to_zero(fpst);
4474 float32 ret;
4476 set_flush_inputs_to_zero(false, fpst);
4477 ret = float16_to_float32(f, true, fpst);
4478 set_flush_inputs_to_zero(save, fpst);
4479 return ret;
4482 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4484 bool save = get_flush_inputs_to_zero(fpst);
4485 float64 ret;
4487 set_flush_inputs_to_zero(false, fpst);
4488 ret = float16_to_float64(f, true, fpst);
4489 set_flush_inputs_to_zero(save, fpst);
4490 return ret;
4493 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4495 bool save = get_flush_to_zero(fpst);
4496 float16 ret;
4498 set_flush_to_zero(false, fpst);
4499 ret = float32_to_float16(f, true, fpst);
4500 set_flush_to_zero(save, fpst);
4501 return ret;
4504 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4506 bool save = get_flush_to_zero(fpst);
4507 float16 ret;
4509 set_flush_to_zero(false, fpst);
4510 ret = float64_to_float16(f, true, fpst);
4511 set_flush_to_zero(save, fpst);
4512 return ret;
4515 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4517 if (float16_is_any_nan(f)) {
4518 float_raise(float_flag_invalid, s);
4519 return 0;
4521 return float16_to_int16_round_to_zero(f, s);
4524 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4526 if (float16_is_any_nan(f)) {
4527 float_raise(float_flag_invalid, s);
4528 return 0;
4530 return float16_to_int64_round_to_zero(f, s);
4533 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4535 if (float32_is_any_nan(f)) {
4536 float_raise(float_flag_invalid, s);
4537 return 0;
4539 return float32_to_int64_round_to_zero(f, s);
4542 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4544 if (float64_is_any_nan(f)) {
4545 float_raise(float_flag_invalid, s);
4546 return 0;
4548 return float64_to_int64_round_to_zero(f, s);
4551 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4553 if (float16_is_any_nan(f)) {
4554 float_raise(float_flag_invalid, s);
4555 return 0;
4557 return float16_to_uint16_round_to_zero(f, s);
4560 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4562 if (float16_is_any_nan(f)) {
4563 float_raise(float_flag_invalid, s);
4564 return 0;
4566 return float16_to_uint64_round_to_zero(f, s);
4569 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4571 if (float32_is_any_nan(f)) {
4572 float_raise(float_flag_invalid, s);
4573 return 0;
4575 return float32_to_uint64_round_to_zero(f, s);
4578 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4580 if (float64_is_any_nan(f)) {
4581 float_raise(float_flag_invalid, s);
4582 return 0;
4584 return float64_to_uint64_round_to_zero(f, s);
4587 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4588 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4589 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16)
4590 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4591 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4592 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4593 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4595 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4596 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4597 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4598 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4599 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4600 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4601 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4603 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4604 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4605 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4606 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4607 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4608 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4609 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4611 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4612 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4613 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4615 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4616 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4617 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4619 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4620 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4621 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4623 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4624 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4625 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
4627 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4628 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4629 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4630 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
4631 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
4632 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
4633 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
4635 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4636 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4637 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4638 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
4639 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
4640 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
4641 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4643 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
4645 /* Extract frac to the top of the uint32_t. */
4646 uint32_t frac = (uint32_t)a << (16 + 6);
4647 int16_t exp = extract32(a, 10, 5);
4649 if (unlikely(exp == 0)) {
4650 if (frac != 0) {
4651 if (!get_flush_inputs_to_zero(s)) {
4652 /* denormal: bias - fractional_zeros */
4653 return -15 - clz32(frac);
4655 /* flush to zero */
4656 float_raise(float_flag_input_denormal, s);
4658 } else if (unlikely(exp == 0x1f)) {
4659 if (frac == 0) {
4660 return INT16_MAX; /* infinity */
4662 } else {
4663 /* normal: exp - bias */
4664 return exp - 15;
4666 /* nan or zero */
4667 float_raise(float_flag_invalid, s);
4668 return INT16_MIN;
4671 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
4673 /* Extract frac to the top of the uint32_t. */
4674 uint32_t frac = a << 9;
4675 int32_t exp = extract32(a, 23, 8);
4677 if (unlikely(exp == 0)) {
4678 if (frac != 0) {
4679 if (!get_flush_inputs_to_zero(s)) {
4680 /* denormal: bias - fractional_zeros */
4681 return -127 - clz32(frac);
4683 /* flush to zero */
4684 float_raise(float_flag_input_denormal, s);
4686 } else if (unlikely(exp == 0xff)) {
4687 if (frac == 0) {
4688 return INT32_MAX; /* infinity */
4690 } else {
4691 /* normal: exp - bias */
4692 return exp - 127;
4694 /* nan or zero */
4695 float_raise(float_flag_invalid, s);
4696 return INT32_MIN;
4699 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
4701 /* Extract frac to the top of the uint64_t. */
4702 uint64_t frac = a << 12;
4703 int64_t exp = extract64(a, 52, 11);
4705 if (unlikely(exp == 0)) {
4706 if (frac != 0) {
4707 if (!get_flush_inputs_to_zero(s)) {
4708 /* denormal: bias - fractional_zeros */
4709 return -1023 - clz64(frac);
4711 /* flush to zero */
4712 float_raise(float_flag_input_denormal, s);
4714 } else if (unlikely(exp == 0x7ff)) {
4715 if (frac == 0) {
4716 return INT64_MAX; /* infinity */
4718 } else {
4719 /* normal: exp - bias */
4720 return exp - 1023;
4722 /* nan or zero */
4723 float_raise(float_flag_invalid, s);
4724 return INT64_MIN;
4727 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
4728 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
4729 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
4731 #undef DO_ZPZ_FP
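/*
 * FLOGB examples for the single-precision helper above:
 *   8.0f (0x41000000) -> 3;  0.75f (0x3f400000) -> -1;
 *   the smallest denormal (0x00000001) -> -149 when denormal inputs
 *   are honoured;  +Inf -> INT32_MAX;  zero or NaN -> INT32_MIN with
 *   the Invalid flag raised.
 */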
4733 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4734 float_status *status, uint32_t desc,
4735 uint16_t neg1, uint16_t neg3)
4737 intptr_t i = simd_oprsz(desc);
4738 uint64_t *g = vg;
4740 do {
4741 uint64_t pg = g[(i - 1) >> 6];
4742 do {
4743 i -= 2;
4744 if (likely((pg >> (i & 63)) & 1)) {
4745 float16 e1, e2, e3, r;
4747 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4748 e2 = *(uint16_t *)(vm + H1_2(i));
4749 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4750 r = float16_muladd(e1, e2, e3, 0, status);
4751 *(uint16_t *)(vd + H1_2(i)) = r;
4753 } while (i & 63);
4754 } while (i != 0);
4757 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4758 void *vg, void *status, uint32_t desc)
4760 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
4763 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4764 void *vg, void *status, uint32_t desc)
4766 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
4769 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4770 void *vg, void *status, uint32_t desc)
4772 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
4775 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4776 void *vg, void *status, uint32_t desc)
4778     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
4779 }
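/*
 * The neg1/neg3 constants simply flip the sign bit of Zn and Za
 * respectively before the fused multiply-add, giving:
 *   FMLA:   Za + Zn * Zm      (0,      0)
 *   FMLS:   Za - Zn * Zm      (0x8000, 0)
 *   FNMLA: -Za - Zn * Zm      (0x8000, 0x8000)
 *   FNMLS: -Za + Zn * Zm      (0,      0x8000)
 * The _s and _d variants below do the same with 0x80000000 / INT64_MIN.
 */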
4781 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4782 float_status *status, uint32_t desc,
4783 uint32_t neg1, uint32_t neg3)
4785 intptr_t i = simd_oprsz(desc);
4786 uint64_t *g = vg;
4788 do {
4789 uint64_t pg = g[(i - 1) >> 6];
4790 do {
4791 i -= 4;
4792 if (likely((pg >> (i & 63)) & 1)) {
4793 float32 e1, e2, e3, r;
4795 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4796 e2 = *(uint32_t *)(vm + H1_4(i));
4797 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4798 r = float32_muladd(e1, e2, e3, 0, status);
4799 *(uint32_t *)(vd + H1_4(i)) = r;
4801 } while (i & 63);
4802 } while (i != 0);
4805 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4806 void *vg, void *status, uint32_t desc)
4808 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
4811 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4812 void *vg, void *status, uint32_t desc)
4814 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
4817 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4818 void *vg, void *status, uint32_t desc)
4820 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
4823 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4824 void *vg, void *status, uint32_t desc)
4826 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
4829 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4830 float_status *status, uint32_t desc,
4831 uint64_t neg1, uint64_t neg3)
4833 intptr_t i = simd_oprsz(desc);
4834 uint64_t *g = vg;
4836 do {
4837 uint64_t pg = g[(i - 1) >> 6];
4838 do {
4839 i -= 8;
4840 if (likely((pg >> (i & 63)) & 1)) {
4841 float64 e1, e2, e3, r;
4843 e1 = *(uint64_t *)(vn + i) ^ neg1;
4844 e2 = *(uint64_t *)(vm + i);
4845 e3 = *(uint64_t *)(va + i) ^ neg3;
4846 r = float64_muladd(e1, e2, e3, 0, status);
4847 *(uint64_t *)(vd + i) = r;
4849 } while (i & 63);
4850 } while (i != 0);
4853 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4854 void *vg, void *status, uint32_t desc)
4856 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
4859 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4860 void *vg, void *status, uint32_t desc)
4862 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
4865 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4866 void *vg, void *status, uint32_t desc)
4868 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
4871 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4872 void *vg, void *status, uint32_t desc)
4874 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
4877 /* Two operand floating-point comparison controlled by a predicate.
4878 * Unlike the integer version, we are not allowed to optimistically
4879  * compare operands, since the comparison may have side effects with
4880  * respect to the FPSR.
4881  */
4882 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
4883 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4884 void *status, uint32_t desc) \
4886 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4887 uint64_t *d = vd, *g = vg; \
4888 do { \
4889 uint64_t out = 0, pg = g[j]; \
4890 do { \
4891 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4892 if (likely((pg >> (i & 63)) & 1)) { \
4893 TYPE nn = *(TYPE *)(vn + H(i)); \
4894 TYPE mm = *(TYPE *)(vm + H(i)); \
4895 out |= OP(TYPE, nn, mm, status); \
4897 } while (i & 63); \
4898 d[j--] = out; \
4899 } while (i > 0); \
4902 #define DO_FPCMP_PPZZ_H(NAME, OP) \
4903 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
4904 #define DO_FPCMP_PPZZ_S(NAME, OP) \
4905 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
4906 #define DO_FPCMP_PPZZ_D(NAME, OP) \
4907 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
4909 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
4910 DO_FPCMP_PPZZ_H(NAME, OP) \
4911 DO_FPCMP_PPZZ_S(NAME, OP) \
4912 DO_FPCMP_PPZZ_D(NAME, OP)
4914 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
4915 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
4916 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
4917 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
4918 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
4919 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
4920 #define DO_FCMUO(TYPE, X, Y, ST) \
4921 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
4922 #define DO_FACGE(TYPE, X, Y, ST) \
4923 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
4924 #define DO_FACGT(TYPE, X, Y, ST) \
4925 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
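/*
 * The ordered predicates swap their operands so that an unordered
 * compare result (float_relation_unordered == 2) never satisfies
 * "<= 0" or "< 0"; e.g. DO_FCMGE(X, Y) is really Y <= X.  FCMEQ, FCMNE
 * and FCMUO use the quiet compare and so do not raise Invalid for quiet
 * NaNs, while the remaining predicates use the signalling compare.
 * FACGE/FACGT compare absolute values in the same way.
 */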
4927 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
4928 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
4929 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
4930 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
4931 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
4932 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
4933 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
4935 #undef DO_FPCMP_PPZZ_ALL
4936 #undef DO_FPCMP_PPZZ_D
4937 #undef DO_FPCMP_PPZZ_S
4938 #undef DO_FPCMP_PPZZ_H
4939 #undef DO_FPCMP_PPZZ
4941 /* One operand floating-point comparison against zero, controlled
4942 * by a predicate.
4944 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
4945 void HELPER(NAME)(void *vd, void *vn, void *vg, \
4946 void *status, uint32_t desc) \
4948 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4949 uint64_t *d = vd, *g = vg; \
4950 do { \
4951 uint64_t out = 0, pg = g[j]; \
4952 do { \
4953 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4954 if ((pg >> (i & 63)) & 1) { \
4955 TYPE nn = *(TYPE *)(vn + H(i)); \
4956 out |= OP(TYPE, nn, 0, status); \
4958 } while (i & 63); \
4959 d[j--] = out; \
4960 } while (i > 0); \
4963 #define DO_FPCMP_PPZ0_H(NAME, OP) \
4964 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
4965 #define DO_FPCMP_PPZ0_S(NAME, OP) \
4966 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
4967 #define DO_FPCMP_PPZ0_D(NAME, OP) \
4968 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
4970 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
4971 DO_FPCMP_PPZ0_H(NAME, OP) \
4972 DO_FPCMP_PPZ0_S(NAME, OP) \
4973 DO_FPCMP_PPZ0_D(NAME, OP)
4975 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
4976 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
4977 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
4978 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
4979 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
4980 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4982 /* FP Trig Multiply-Add. */
4984 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4986 static const float16 coeff[16] = {
4987 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4988 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4990 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
4991 intptr_t x = simd_data(desc);
4992 float16 *d = vd, *n = vn, *m = vm;
4993 for (i = 0; i < opr_sz; i++) {
4994 float16 mm = m[i];
4995 intptr_t xx = x;
4996 if (float16_is_neg(mm)) {
4997 mm = float16_abs(mm);
4998 xx += 8;
5000 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
5004 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5006 static const float32 coeff[16] = {
5007 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5008 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5009 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5010 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5012 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5013 intptr_t x = simd_data(desc);
5014 float32 *d = vd, *n = vn, *m = vm;
5015 for (i = 0; i < opr_sz; i++) {
5016 float32 mm = m[i];
5017 intptr_t xx = x;
5018 if (float32_is_neg(mm)) {
5019 mm = float32_abs(mm);
5020 xx += 8;
5022 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
5026 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5028 static const float64 coeff[16] = {
5029 0x3ff0000000000000ull, 0xbfc5555555555543ull,
5030 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5031 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5032 0x3de5d8408868552full, 0x0000000000000000ull,
5033 0x3ff0000000000000ull, 0xbfe0000000000000ull,
5034 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5035 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5036 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5038 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5039 intptr_t x = simd_data(desc);
5040 float64 *d = vd, *n = vn, *m = vm;
5041 for (i = 0; i < opr_sz; i++) {
5042 float64 mm = m[i];
5043 intptr_t xx = x;
5044 if (float64_is_neg(mm)) {
5045 mm = float64_abs(mm);
5046 xx += 8;
5048         d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
5049     }
5050 }
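/*
 * The coeff[] tables hold the FTMAD polynomial coefficients: entries
 * 0-7 are the sine-series terms (1, -1/3!, 1/5!, ...) and entries 8-15
 * the cosine-series terms (1, -1/2!, 1/4!, ...).  A negative Zm element
 * selects the cosine half via xx += 8, with the sign removed from the
 * multiplicand.
 */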
5052 /*
5053  * FP Complex Add
5054  */
5056 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5057 void *vs, uint32_t desc)
5059 intptr_t j, i = simd_oprsz(desc);
5060 uint64_t *g = vg;
5061 float16 neg_imag = float16_set_sign(0, simd_data(desc));
5062 float16 neg_real = float16_chs(neg_imag);
5064 do {
5065 uint64_t pg = g[(i - 1) >> 6];
5066 do {
5067 float16 e0, e1, e2, e3;
5069 /* I holds the real index; J holds the imag index. */
5070 j = i - sizeof(float16);
5071 i -= 2 * sizeof(float16);
5073 e0 = *(float16 *)(vn + H1_2(i));
5074 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
5075 e2 = *(float16 *)(vn + H1_2(j));
5076 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
5078 if (likely((pg >> (i & 63)) & 1)) {
5079 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
5081 if (likely((pg >> (j & 63)) & 1)) {
5082 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
5084 } while (i & 63);
5085 } while (i != 0);
5088 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5089 void *vs, uint32_t desc)
5091 intptr_t j, i = simd_oprsz(desc);
5092 uint64_t *g = vg;
5093 float32 neg_imag = float32_set_sign(0, simd_data(desc));
5094 float32 neg_real = float32_chs(neg_imag);
5096 do {
5097 uint64_t pg = g[(i - 1) >> 6];
5098 do {
5099 float32 e0, e1, e2, e3;
5101 /* I holds the real index; J holds the imag index. */
5102 j = i - sizeof(float32);
5103 i -= 2 * sizeof(float32);
5105 e0 = *(float32 *)(vn + H1_2(i));
5106 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
5107 e2 = *(float32 *)(vn + H1_2(j));
5108 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
5110 if (likely((pg >> (i & 63)) & 1)) {
5111 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
5113 if (likely((pg >> (j & 63)) & 1)) {
5114 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
5116 } while (i & 63);
5117 } while (i != 0);
5120 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5121 void *vs, uint32_t desc)
5123 intptr_t j, i = simd_oprsz(desc);
5124 uint64_t *g = vg;
5125 float64 neg_imag = float64_set_sign(0, simd_data(desc));
5126 float64 neg_real = float64_chs(neg_imag);
5128 do {
5129 uint64_t pg = g[(i - 1) >> 6];
5130 do {
5131 float64 e0, e1, e2, e3;
5133 /* I holds the real index; J holds the imag index. */
5134 j = i - sizeof(float64);
5135 i -= 2 * sizeof(float64);
5137 e0 = *(float64 *)(vn + H1_2(i));
5138 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
5139 e2 = *(float64 *)(vn + H1_2(j));
5140 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
5142 if (likely((pg >> (i & 63)) & 1)) {
5143 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
5145 if (likely((pg >> (j & 63)) & 1)) {
5146 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
5148 } while (i & 63);
5149     } while (i != 0);
5150 }
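/*
 * FCADD rotates Zm by +/-90 degrees before the predicated addition:
 * with simd_data = 0, d_real = n_real - m_imag and d_imag = n_imag +
 * m_real (i.e. Zm * +i); with simd_data = 1 the signs of the m terms
 * swap (i.e. Zm * -i).
 */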
5152 /*
5153  * FP Complex Multiply
5154  */
5156 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5157 void *vg, void *status, uint32_t desc)
5159 intptr_t j, i = simd_oprsz(desc);
5160 unsigned rot = simd_data(desc);
5161 bool flip = rot & 1;
5162 float16 neg_imag, neg_real;
5163 uint64_t *g = vg;
5165 neg_imag = float16_set_sign(0, (rot & 2) != 0);
5166 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
5168 do {
5169 uint64_t pg = g[(i - 1) >> 6];
5170 do {
5171 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5173 /* I holds the real index; J holds the imag index. */
5174 j = i - sizeof(float16);
5175 i -= 2 * sizeof(float16);
5177 nr = *(float16 *)(vn + H1_2(i));
5178 ni = *(float16 *)(vn + H1_2(j));
5179 mr = *(float16 *)(vm + H1_2(i));
5180 mi = *(float16 *)(vm + H1_2(j));
5182 e2 = (flip ? ni : nr);
5183 e1 = (flip ? mi : mr) ^ neg_real;
5184 e4 = e2;
5185 e3 = (flip ? mr : mi) ^ neg_imag;
5187 if (likely((pg >> (i & 63)) & 1)) {
5188 d = *(float16 *)(va + H1_2(i));
5189 d = float16_muladd(e2, e1, d, 0, status);
5190 *(float16 *)(vd + H1_2(i)) = d;
5192 if (likely((pg >> (j & 63)) & 1)) {
5193 d = *(float16 *)(va + H1_2(j));
5194 d = float16_muladd(e4, e3, d, 0, status);
5195 *(float16 *)(vd + H1_2(j)) = d;
5197 } while (i & 63);
5198 } while (i != 0);
5201 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5202 void *vg, void *status, uint32_t desc)
5204 intptr_t j, i = simd_oprsz(desc);
5205 unsigned rot = simd_data(desc);
5206 bool flip = rot & 1;
5207 float32 neg_imag, neg_real;
5208 uint64_t *g = vg;
5210 neg_imag = float32_set_sign(0, (rot & 2) != 0);
5211 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
5213 do {
5214 uint64_t pg = g[(i - 1) >> 6];
5215 do {
5216 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5218 /* I holds the real index; J holds the imag index. */
5219 j = i - sizeof(float32);
5220 i -= 2 * sizeof(float32);
5222 nr = *(float32 *)(vn + H1_2(i));
5223 ni = *(float32 *)(vn + H1_2(j));
5224 mr = *(float32 *)(vm + H1_2(i));
5225 mi = *(float32 *)(vm + H1_2(j));
5227 e2 = (flip ? ni : nr);
5228 e1 = (flip ? mi : mr) ^ neg_real;
5229 e4 = e2;
5230 e3 = (flip ? mr : mi) ^ neg_imag;
5232 if (likely((pg >> (i & 63)) & 1)) {
5233 d = *(float32 *)(va + H1_2(i));
5234 d = float32_muladd(e2, e1, d, 0, status);
5235 *(float32 *)(vd + H1_2(i)) = d;
5237 if (likely((pg >> (j & 63)) & 1)) {
5238 d = *(float32 *)(va + H1_2(j));
5239 d = float32_muladd(e4, e3, d, 0, status);
5240 *(float32 *)(vd + H1_2(j)) = d;
5242 } while (i & 63);
5243 } while (i != 0);
5246 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5247 void *vg, void *status, uint32_t desc)
5249 intptr_t j, i = simd_oprsz(desc);
5250 unsigned rot = simd_data(desc);
5251 bool flip = rot & 1;
5252 float64 neg_imag, neg_real;
5253 uint64_t *g = vg;
5255 neg_imag = float64_set_sign(0, (rot & 2) != 0);
5256 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
5258 do {
5259 uint64_t pg = g[(i - 1) >> 6];
5260 do {
5261 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5263 /* I holds the real index; J holds the imag index. */
5264 j = i - sizeof(float64);
5265 i -= 2 * sizeof(float64);
5267 nr = *(float64 *)(vn + H1_2(i));
5268 ni = *(float64 *)(vn + H1_2(j));
5269 mr = *(float64 *)(vm + H1_2(i));
5270 mi = *(float64 *)(vm + H1_2(j));
5272 e2 = (flip ? ni : nr);
5273 e1 = (flip ? mi : mr) ^ neg_real;
5274 e4 = e2;
5275 e3 = (flip ? mr : mi) ^ neg_imag;
5277 if (likely((pg >> (i & 63)) & 1)) {
5278 d = *(float64 *)(va + H1_2(i));
5279 d = float64_muladd(e2, e1, d, 0, status);
5280 *(float64 *)(vd + H1_2(i)) = d;
5282 if (likely((pg >> (j & 63)) & 1)) {
5283 d = *(float64 *)(va + H1_2(j));
5284 d = float64_muladd(e4, e3, d, 0, status);
5285 *(float64 *)(vd + H1_2(j)) = d;
5287 } while (i & 63);
5288 } while (i != 0);
5292 * Load contiguous data, protected by a governing predicate.
5296 * Skip through a sequence of inactive elements in the guarding predicate @vg,
5297  * beginning at @reg_off bounded by @reg_max.  Return the offset of the first
5298  * active element >= @reg_off, or @reg_max if there are no active elements.
5299  */
5300 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5301 intptr_t reg_max, int esz)
5303 uint64_t pg_mask = pred_esz_masks[esz];
5304 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5306 /* In normal usage, the first element is active. */
5307 if (likely(pg & 1)) {
5308 return reg_off;
5311 if (pg == 0) {
5312 reg_off &= -64;
5313 do {
5314 reg_off += 64;
5315 if (unlikely(reg_off >= reg_max)) {
5316 /* The entire predicate was false. */
5317 return reg_max;
5319 pg = vg[reg_off >> 6] & pg_mask;
5320 } while (pg == 0);
5322 reg_off += ctz64(pg);
5324 /* We should never see an out of range predicate bit set. */
5325 tcg_debug_assert(reg_off < reg_max);
5326 return reg_off;
5330 * Resolve the guest virtual address to info->host and info->flags.
5331 * If @nofault, return false if the page is invalid, otherwise
5332 * exit via page fault exception.
5335 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
5336 target_ulong addr, int mem_off, MMUAccessType access_type,
5337 int mmu_idx, uintptr_t retaddr)
5339 int flags;
5341 addr += mem_off;
5344 * User-only currently always issues with TBI. See the comment
5345 * above useronly_clean_ptr. Usually we clean this top byte away
5346 * during translation, but we can't do that for e.g. vector + imm
5347 * addressing modes.
5349 * We currently always enable TBI for user-only, and do not provide
5350 * a way to turn it off. So clean the pointer unconditionally here,
5351 * rather than look it up here, or pass it down from above.
5353 addr = useronly_clean_ptr(addr);
5355 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
5356 &info->host, retaddr);
5357 info->flags = flags;
5359 if (flags & TLB_INVALID_MASK) {
5360 g_assert(nofault);
5361 return false;
5364 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5365 info->host -= mem_off;
5367 #ifdef CONFIG_USER_ONLY
5368 memset(&info->attrs, 0, sizeof(info->attrs));
5369 #else
5371 * Find the iotlbentry for addr and return the transaction attributes.
5372 * This *must* be present in the TLB because we just found the mapping.
5375 uintptr_t index = tlb_index(env, mmu_idx, addr);
5377 # ifdef CONFIG_DEBUG_TCG
5378 CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
5379 target_ulong comparator = (access_type == MMU_DATA_LOAD
5380 ? entry->addr_read
5381 : tlb_addr_write(entry));
5382 g_assert(tlb_hit(comparator, addr));
5383 # endif
5385 CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
5386 info->attrs = iotlbentry->attrs;
5388 #endif
5390 return true;
5394 * Find first active element on each page, and a loose bound for the
5395 * final element on each page. Identify any single element that spans
5396 * the page boundary. Return true if there are any active elements.
5398 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
5399 intptr_t reg_max, int esz, int msize)
5401 const int esize = 1 << esz;
5402 const uint64_t pg_mask = pred_esz_masks[esz];
5403 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5404 intptr_t mem_off_last, mem_off_split;
5405 intptr_t page_split, elt_split;
5406 intptr_t i;
5408 /* Set all of the element indices to -1, and the TLB data to 0. */
5409 memset(info, -1, offsetof(SVEContLdSt, page));
5410 memset(info->page, 0, sizeof(info->page));
5412 /* Gross scan over the entire predicate to find bounds. */
5413 i = 0;
5414 do {
5415 uint64_t pg = vg[i] & pg_mask;
5416 if (pg) {
5417 reg_off_last = i * 64 + 63 - clz64(pg);
5418 if (reg_off_first < 0) {
5419 reg_off_first = i * 64 + ctz64(pg);
5422 } while (++i * 64 < reg_max);
5424 if (unlikely(reg_off_first < 0)) {
5425 /* No active elements, no pages touched. */
5426 return false;
5428 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5430 info->reg_off_first[0] = reg_off_first;
5431 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5432 mem_off_last = (reg_off_last >> esz) * msize;
5434 page_split = -(addr | TARGET_PAGE_MASK);
5435 if (likely(mem_off_last + msize <= page_split)) {
5436 /* The entire operation fits within a single page. */
5437 info->reg_off_last[0] = reg_off_last;
5438 return true;
5441 info->page_split = page_split;
5442 elt_split = page_split / msize;
5443 reg_off_split = elt_split << esz;
5444 mem_off_split = elt_split * msize;
5447 * This is the last full element on the first page, but it is not
5448 * necessarily active. If there is no full element, i.e. the first
5449 * active element is the one that's split, this value remains -1.
5450 * It is useful as an iteration bound.
5452 if (elt_split != 0) {
5453 info->reg_off_last[0] = reg_off_split - esize;
5456 /* Determine if an unaligned element spans the pages. */
5457 if (page_split % msize != 0) {
5458 /* It is helpful to know if the split element is active. */
5459 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5460 info->reg_off_split = reg_off_split;
5461 info->mem_off_split = mem_off_split;
5463 if (reg_off_split == reg_off_last) {
5464 /* The page crossing element is last. */
5465 return true;
5468 reg_off_split += esize;
5469 mem_off_split += msize;
5473 * We do want the first active element on the second page, because
5474 * this may affect the address reported in an exception.
5476 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5477 tcg_debug_assert(reg_off_split <= reg_off_last);
5478 info->reg_off_first[1] = reg_off_split;
5479 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5480 info->reg_off_last[1] = reg_off_last;
5481 return true;
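/*
 * A worked example with assumed values, for illustration only: take a
 * 32-byte vector (reg_max == 32), esz == MO_32, msize == 4, every element
 * active, and an @addr with only 10 bytes left on the first page
 * (page_split == 10).  Then elt_split == 2, so reg_off_last[0] == 4 (the
 * last whole element on the first page), the element at reg/mem offset 8
 * straddles the boundary (reg_off_split == mem_off_split == 8), and the
 * second page begins with reg_off_first[1] == mem_off_first[1] == 12 and
 * reg_off_last[1] == 28.
 */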
5485 * Resolve the guest virtual addresses to info->page[].
5486 * Control the generation of page faults with @fault. Return false if
5487 * there is no work to do, which can only happen with @fault == FAULT_NO.
5489 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5490 CPUARMState *env, target_ulong addr,
5491 MMUAccessType access_type, uintptr_t retaddr)
5493 int mmu_idx = cpu_mmu_index(env, false);
5494 int mem_off = info->mem_off_first[0];
5495 bool nofault = fault == FAULT_NO;
5496 bool have_work = true;
5498 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5499 access_type, mmu_idx, retaddr)) {
5500 /* No work to be done. */
5501 return false;
5504 if (likely(info->page_split < 0)) {
5505 /* The entire operation was on the one page. */
5506 return true;
5510 * If the second page is invalid, then we want the fault address to be
5511 * the first byte on that page which is accessed.
5513 if (info->mem_off_split >= 0) {
5515 * There is an element split across the pages. The fault address
5516 * should be the first byte of the second page.
5518 mem_off = info->page_split;
5520 * If the split element is also the first active element
5521 * of the vector, then: For first-fault we should continue
5522 * to generate faults for the second page. For no-fault,
5523 * we have work only if the second page is valid.
5525 if (info->mem_off_first[0] < info->mem_off_split) {
5526 nofault = FAULT_FIRST;
5527 have_work = false;
5529 } else {
5531 * There is no element split across the pages. The fault address
5532 * should be the first active element on the second page.
5534 mem_off = info->mem_off_first[1];
5536 * There must have been one active element on the first page,
5537 * so we're out of first-fault territory.
5539 nofault = fault != FAULT_ALL;
5542 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5543 access_type, mmu_idx, retaddr);
5544 return have_work;
5547 #ifndef CONFIG_USER_ONLY
5548 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5549 uint64_t *vg, target_ulong addr,
5550 int esize, int msize, int wp_access,
5551 uintptr_t retaddr)
5553 intptr_t mem_off, reg_off, reg_last;
5554 int flags0 = info->page[0].flags;
5555 int flags1 = info->page[1].flags;
5557 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5558 return;
5561 /* Indicate that watchpoints are handled. */
5562 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5563 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5565 if (flags0 & TLB_WATCHPOINT) {
5566 mem_off = info->mem_off_first[0];
5567 reg_off = info->reg_off_first[0];
5568 reg_last = info->reg_off_last[0];
5570 while (reg_off <= reg_last) {
5571 uint64_t pg = vg[reg_off >> 6];
5572 do {
5573 if ((pg >> (reg_off & 63)) & 1) {
5574 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5575 msize, info->page[0].attrs,
5576 wp_access, retaddr);
5578 reg_off += esize;
5579 mem_off += msize;
5580 } while (reg_off <= reg_last && (reg_off & 63));
5584 mem_off = info->mem_off_split;
5585 if (mem_off >= 0) {
5586 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5587 info->page[0].attrs, wp_access, retaddr);
5590 mem_off = info->mem_off_first[1];
5591 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5592 reg_off = info->reg_off_first[1];
5593 reg_last = info->reg_off_last[1];
5595 do {
5596 uint64_t pg = vg[reg_off >> 6];
5597 do {
5598 if ((pg >> (reg_off & 63)) & 1) {
5599 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5600 msize, info->page[1].attrs,
5601 wp_access, retaddr);
5603 reg_off += esize;
5604 mem_off += msize;
5605 } while (reg_off & 63);
5606 } while (reg_off <= reg_last);
5609 #endif
5611 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5612 uint64_t *vg, target_ulong addr, int esize,
5613 int msize, uint32_t mtedesc, uintptr_t ra)
5615 intptr_t mem_off, reg_off, reg_last;
5617 /* Process the page only if MemAttr == Tagged. */
5618 if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
5619 mem_off = info->mem_off_first[0];
5620 reg_off = info->reg_off_first[0];
5621 reg_last = info->reg_off_split;
5622 if (reg_last < 0) {
5623 reg_last = info->reg_off_last[0];
5626 do {
5627 uint64_t pg = vg[reg_off >> 6];
5628 do {
5629 if ((pg >> (reg_off & 63)) & 1) {
5630 mte_check(env, mtedesc, addr + mem_off, ra);
5632 reg_off += esize;
5633 mem_off += msize;
5634 } while (reg_off <= reg_last && (reg_off & 63));
5635 } while (reg_off <= reg_last);
5638 mem_off = info->mem_off_first[1];
5639 if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
5640 reg_off = info->reg_off_first[1];
5641 reg_last = info->reg_off_last[1];
5643 do {
5644 uint64_t pg = vg[reg_off >> 6];
5645 do {
5646 if ((pg >> (reg_off & 63)) & 1) {
5647 mte_check(env, mtedesc, addr + mem_off, ra);
5649 reg_off += esize;
5650 mem_off += msize;
5651 } while (reg_off & 63);
5652 } while (reg_off <= reg_last);
5657 * Common helper for all contiguous 1,2,3,4-register predicated loads.
5659 static inline QEMU_ALWAYS_INLINE
5660 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5661 uint32_t desc, const uintptr_t retaddr,
5662 const int esz, const int msz, const int N, uint32_t mtedesc,
5663 sve_ldst1_host_fn *host_fn,
5664 sve_ldst1_tlb_fn *tlb_fn)
5666 const unsigned rd = simd_data(desc);
5667 const intptr_t reg_max = simd_oprsz(desc);
5668 intptr_t reg_off, reg_last, mem_off;
5669 SVEContLdSt info;
5670 void *host;
5671 int flags, i;
5673 /* Find the active elements. */
5674 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5675 /* The entire predicate was false; no load occurs. */
5676 for (i = 0; i < N; ++i) {
5677 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5679 return;
5682 /* Probe the page(s). Exit with exception for any invalid page. */
5683 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5685 /* Handle watchpoints for all active elements. */
5686 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5687 BP_MEM_READ, retaddr);
5690 * Handle mte checks for all active elements.
5691 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5693 if (mtedesc) {
5694 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5695 mtedesc, retaddr);
5698 flags = info.page[0].flags | info.page[1].flags;
5699 if (unlikely(flags != 0)) {
5700 #ifdef CONFIG_USER_ONLY
5701 g_assert_not_reached();
5702 #else
5704 * At least one page includes MMIO.
5705 * Any bus operation can fail with cpu_transaction_failed,
5706 * which for ARM will raise SyncExternal. Perform the load
5707 * into scratch memory to preserve register state until the end.
5709 ARMVectorReg scratch[4] = { };
5711 mem_off = info.mem_off_first[0];
5712 reg_off = info.reg_off_first[0];
5713 reg_last = info.reg_off_last[1];
5714 if (reg_last < 0) {
5715 reg_last = info.reg_off_split;
5716 if (reg_last < 0) {
5717 reg_last = info.reg_off_last[0];
5721 do {
5722 uint64_t pg = vg[reg_off >> 6];
5723 do {
5724 if ((pg >> (reg_off & 63)) & 1) {
5725 for (i = 0; i < N; ++i) {
5726 tlb_fn(env, &scratch[i], reg_off,
5727 addr + mem_off + (i << msz), retaddr);
5730 reg_off += 1 << esz;
5731 mem_off += N << msz;
5732 } while (reg_off & 63);
5733 } while (reg_off <= reg_last);
5735 for (i = 0; i < N; ++i) {
5736 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5738 return;
5739 #endif
5742 /* The entire operation is in RAM, on valid pages. */
5744 for (i = 0; i < N; ++i) {
5745 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5748 mem_off = info.mem_off_first[0];
5749 reg_off = info.reg_off_first[0];
5750 reg_last = info.reg_off_last[0];
5751 host = info.page[0].host;
5753 while (reg_off <= reg_last) {
5754 uint64_t pg = vg[reg_off >> 6];
5755 do {
5756 if ((pg >> (reg_off & 63)) & 1) {
5757 for (i = 0; i < N; ++i) {
5758 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5759 host + mem_off + (i << msz));
5762 reg_off += 1 << esz;
5763 mem_off += N << msz;
5764 } while (reg_off <= reg_last && (reg_off & 63));
5768 * Use the slow path to manage the cross-page misalignment.
5769 * But we know this is RAM and cannot trap.
5771 mem_off = info.mem_off_split;
5772 if (unlikely(mem_off >= 0)) {
5773 reg_off = info.reg_off_split;
5774 for (i = 0; i < N; ++i) {
5775 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5776 addr + mem_off + (i << msz), retaddr);
5780 mem_off = info.mem_off_first[1];
5781 if (unlikely(mem_off >= 0)) {
5782 reg_off = info.reg_off_first[1];
5783 reg_last = info.reg_off_last[1];
5784 host = info.page[1].host;
5786 do {
5787 uint64_t pg = vg[reg_off >> 6];
5788 do {
5789 if ((pg >> (reg_off & 63)) & 1) {
5790 for (i = 0; i < N; ++i) {
5791 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5792 host + mem_off + (i << msz));
5795 reg_off += 1 << esz;
5796 mem_off += N << msz;
5797 } while (reg_off & 63);
5798 } while (reg_off <= reg_last);
5802 static inline QEMU_ALWAYS_INLINE
5803 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5804 uint32_t desc, const uintptr_t ra,
5805 const int esz, const int msz, const int N,
5806 sve_ldst1_host_fn *host_fn,
5807 sve_ldst1_tlb_fn *tlb_fn)
5809 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5810 int bit55 = extract64(addr, 55, 1);
5812 /* Remove mtedesc from the normal sve descriptor. */
5813 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5815 /* Perform gross MTE suppression early. */
5816 if (!tbi_check(desc, bit55) ||
5817 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5818 mtedesc = 0;
5821 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
5824 #define DO_LD1_1(NAME, ESZ) \
5825 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
5826 target_ulong addr, uint32_t desc) \
5828 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
5829 sve_##NAME##_host, sve_##NAME##_tlb); \
5831 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
5832 target_ulong addr, uint32_t desc) \
5834 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
5835 sve_##NAME##_host, sve_##NAME##_tlb); \
5838 #define DO_LD1_2(NAME, ESZ, MSZ) \
5839 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
5840 target_ulong addr, uint32_t desc) \
5842 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
5843 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
5845 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
5846 target_ulong addr, uint32_t desc) \
5848 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
5849 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
5851 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
5852 target_ulong addr, uint32_t desc) \
5854 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
5855 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
5857 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
5858 target_ulong addr, uint32_t desc) \
5860 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
5861 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
5864 DO_LD1_1(ld1bb, MO_8)
5865 DO_LD1_1(ld1bhu, MO_16)
5866 DO_LD1_1(ld1bhs, MO_16)
5867 DO_LD1_1(ld1bsu, MO_32)
5868 DO_LD1_1(ld1bss, MO_32)
5869 DO_LD1_1(ld1bdu, MO_64)
5870 DO_LD1_1(ld1bds, MO_64)
5872 DO_LD1_2(ld1hh, MO_16, MO_16)
5873 DO_LD1_2(ld1hsu, MO_32, MO_16)
5874 DO_LD1_2(ld1hss, MO_32, MO_16)
5875 DO_LD1_2(ld1hdu, MO_64, MO_16)
5876 DO_LD1_2(ld1hds, MO_64, MO_16)
5878 DO_LD1_2(ld1ss, MO_32, MO_32)
5879 DO_LD1_2(ld1sdu, MO_64, MO_32)
5880 DO_LD1_2(ld1sds, MO_64, MO_32)
5882 DO_LD1_2(ld1dd, MO_64, MO_64)
5884 #undef DO_LD1_1
5885 #undef DO_LD1_2
5887 #define DO_LDN_1(N) \
5888 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
5889 target_ulong addr, uint32_t desc) \
5891 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
5892 sve_ld1bb_host, sve_ld1bb_tlb); \
5894 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
5895 target_ulong addr, uint32_t desc) \
5897 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
5898 sve_ld1bb_host, sve_ld1bb_tlb); \
5901 #define DO_LDN_2(N, SUFF, ESZ) \
5902 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
5903 target_ulong addr, uint32_t desc) \
5905 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
5906 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
5908 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
5909 target_ulong addr, uint32_t desc) \
5911 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
5912 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
5914 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
5915 target_ulong addr, uint32_t desc) \
5917 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
5918 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
5920 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
5921 target_ulong addr, uint32_t desc) \
5923 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
5924 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
5927 DO_LDN_1(2)
5928 DO_LDN_1(3)
5929 DO_LDN_1(4)
5931 DO_LDN_2(2, hh, MO_16)
5932 DO_LDN_2(3, hh, MO_16)
5933 DO_LDN_2(4, hh, MO_16)
5935 DO_LDN_2(2, ss, MO_32)
5936 DO_LDN_2(3, ss, MO_32)
5937 DO_LDN_2(4, ss, MO_32)
5939 DO_LDN_2(2, dd, MO_64)
5940 DO_LDN_2(3, dd, MO_64)
5941 DO_LDN_2(4, dd, MO_64)
5943 #undef DO_LDN_1
5944 #undef DO_LDN_2
5947 * Load contiguous data, first-fault and no-fault.
5949 * For user-only, one could argue that we should hold the mmap_lock during
5950 * the operation so that there is no race between page_check_range and the
5951 * load operation. However, unmapping pages out from under a running thread
5952 * is extraordinarily unlikely. This theoretical race condition also affects
5953 * linux-user/ in its get_user/put_user macros.
5955 * TODO: Construct some helpers, written in assembly, that interact with
5956 * host_signal_handler to produce memory ops which can properly report errors
5957 * without racing.
5960 /* Fault on byte I. All bits in FFR from I are cleared. The vector
5961 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
5962 * option, which leaves subsequent data unchanged.
5964 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
5966 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
5968 if (i & 63) {
5969 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
5970 i = ROUND_UP(i, 64);
5972 for (; i < oprsz; i += 64) {
5973 ffr[i / 64] = 0;
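/*
 * For example (illustrative values): record_fault(env, 20, 64) for a
 * 64-byte vector keeps FFR bits [0,19] via MAKE_64BIT_MASK(0, 20), clears
 * bits [20,63], and, since ROUND_UP(20, 64) == 64 == oprsz, touches no
 * further predicate words.
 */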
5978 * Common helper for all contiguous no-fault and first-fault loads.
5980 static inline QEMU_ALWAYS_INLINE
5981 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
5982 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
5983 const int esz, const int msz, const SVEContFault fault,
5984 sve_ldst1_host_fn *host_fn,
5985 sve_ldst1_tlb_fn *tlb_fn)
5987 const unsigned rd = simd_data(desc);
5988 void *vd = &env->vfp.zregs[rd];
5989 const intptr_t reg_max = simd_oprsz(desc);
5990 intptr_t reg_off, mem_off, reg_last;
5991 SVEContLdSt info;
5992 int flags;
5993 void *host;
5995 /* Find the active elements. */
5996 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
5997 /* The entire predicate was false; no load occurs. */
5998 memset(vd, 0, reg_max);
5999 return;
6001 reg_off = info.reg_off_first[0];
6003 /* Probe the page(s). */
6004 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
6005 /* Fault on first element. */
6006 tcg_debug_assert(fault == FAULT_NO);
6007 memset(vd, 0, reg_max);
6008 goto do_fault;
6011 mem_off = info.mem_off_first[0];
6012 flags = info.page[0].flags;
6015 * Disable MTE checking if the Tagged bit is not set. Since TBI must
6016 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6018 if (!arm_tlb_mte_tagged(&info.page[0].attrs)) {
6019 mtedesc = 0;
6022 if (fault == FAULT_FIRST) {
6023 /* Trapping mte check for the first-fault element. */
6024 if (mtedesc) {
6025 mte_check(env, mtedesc, addr + mem_off, retaddr);
6029 * Special handling of the first active element,
6030 * if it crosses a page boundary or is MMIO.
6032 bool is_split = mem_off == info.mem_off_split;
6033 if (unlikely(flags != 0) || unlikely(is_split)) {
6035 * Use the slow path for cross-page handling.
6036 * Might trap for MMIO or watchpoints.
6038 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6040 /* After any fault, zero the other elements. */
6041 swap_memzero(vd, reg_off);
6042 reg_off += 1 << esz;
6043 mem_off += 1 << msz;
6044 swap_memzero(vd + reg_off, reg_max - reg_off);
6046 if (is_split) {
6047 goto second_page;
6049 } else {
6050 memset(vd, 0, reg_max);
6052 } else {
6053 memset(vd, 0, reg_max);
6054 if (unlikely(mem_off == info.mem_off_split)) {
6055 /* The first active element crosses a page boundary. */
6056 flags |= info.page[1].flags;
6057 if (unlikely(flags & TLB_MMIO)) {
6058 /* Some page is MMIO, see below. */
6059 goto do_fault;
6061 if (unlikely(flags & TLB_WATCHPOINT) &&
6062 (cpu_watchpoint_address_matches
6063 (env_cpu(env), addr + mem_off, 1 << msz)
6064 & BP_MEM_READ)) {
6065 /* Watchpoint hit, see below. */
6066 goto do_fault;
6068 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6069 goto do_fault;
6072 * Use the slow path for cross-page handling.
6073 * This is RAM, without a watchpoint, and will not trap.
6075 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6076 goto second_page;
6081 * From this point on, all memory operations are MemSingleNF.
6083 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6084 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6086 * Unfortunately we do not have access to the memory attributes from the
6087 * PTE to tell Device memory from Normal memory. So we make a mostly
6088 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6089 * This gives the right answer for the common cases of "Normal memory,
6090 * backed by host RAM" and "Device memory, backed by MMIO".
6091 * The architecture allows us to suppress an NF load and return
6092 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6093 * case of "Normal memory, backed by MMIO" is permitted. The case we
6094 * get wrong is "Device memory, backed by host RAM", for which we
6095 * should return (UNKNOWN, FAULT) but do not.
6097 * Similarly, CPU_BP breakpoints would raise exceptions, and so
6098 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
6099 * architectural breakpoints the same.
6101 if (unlikely(flags & TLB_MMIO)) {
6102 goto do_fault;
6105 reg_last = info.reg_off_last[0];
6106 host = info.page[0].host;
6108 do {
6109 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6110 do {
6111 if ((pg >> (reg_off & 63)) & 1) {
6112 if (unlikely(flags & TLB_WATCHPOINT) &&
6113 (cpu_watchpoint_address_matches
6114 (env_cpu(env), addr + mem_off, 1 << msz)
6115 & BP_MEM_READ)) {
6116 goto do_fault;
6118 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6119 goto do_fault;
6121 host_fn(vd, reg_off, host + mem_off);
6123 reg_off += 1 << esz;
6124 mem_off += 1 << msz;
6125 } while (reg_off <= reg_last && (reg_off & 63));
6126 } while (reg_off <= reg_last);
6129 * MemSingleNF is allowed to fail for any reason. We have special
6130 * code above to handle the first element crossing a page boundary.
6131 * As an implementation choice, decline to handle a cross-page element
6132 * in any other position.
6134 reg_off = info.reg_off_split;
6135 if (reg_off >= 0) {
6136 goto do_fault;
6139 second_page:
6140 reg_off = info.reg_off_first[1];
6141 if (likely(reg_off < 0)) {
6142 /* No active elements on the second page. All done. */
6143 return;
6147 * MemSingleNF is allowed to fail for any reason. As an implementation
6148 * choice, decline to handle elements on the second page. This should
6149 * be low frequency as the guest walks through memory -- the next
6150 * iteration of the guest's loop should be aligned on the page boundary,
6151 * and then all following iterations will stay aligned.
6154 do_fault:
6155 record_fault(env, reg_off, reg_max);
6158 static inline QEMU_ALWAYS_INLINE
6159 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6160 uint32_t desc, const uintptr_t retaddr,
6161 const int esz, const int msz, const SVEContFault fault,
6162 sve_ldst1_host_fn *host_fn,
6163 sve_ldst1_tlb_fn *tlb_fn)
6165 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6166 int bit55 = extract64(addr, 55, 1);
6168 /* Remove mtedesc from the normal sve descriptor. */
6169 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6171 /* Perform gross MTE suppression early. */
6172 if (!tbi_check(desc, bit55) ||
6173 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6174 mtedesc = 0;
6177 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6178 esz, msz, fault, host_fn, tlb_fn);
6181 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
6182 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
6183 target_ulong addr, uint32_t desc) \
6185 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6186 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6188 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
6189 target_ulong addr, uint32_t desc) \
6191 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6192 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6194 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
6195 target_ulong addr, uint32_t desc) \
6197 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6198 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6200 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
6201 target_ulong addr, uint32_t desc) \
6203 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6204 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6207 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
6208 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
6209 target_ulong addr, uint32_t desc) \
6211 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6212 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6214 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
6215 target_ulong addr, uint32_t desc) \
6217 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6218 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6220 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
6221 target_ulong addr, uint32_t desc) \
6223 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6224 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6226 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
6227 target_ulong addr, uint32_t desc) \
6229 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6230 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6232 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6233 target_ulong addr, uint32_t desc) \
6235 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6236 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6238 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6239 target_ulong addr, uint32_t desc) \
6241 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6242 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6244 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6245 target_ulong addr, uint32_t desc) \
6247 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6248 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6250 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6251 target_ulong addr, uint32_t desc) \
6253 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6254 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6257 DO_LDFF1_LDNF1_1(bb, MO_8)
6258 DO_LDFF1_LDNF1_1(bhu, MO_16)
6259 DO_LDFF1_LDNF1_1(bhs, MO_16)
6260 DO_LDFF1_LDNF1_1(bsu, MO_32)
6261 DO_LDFF1_LDNF1_1(bss, MO_32)
6262 DO_LDFF1_LDNF1_1(bdu, MO_64)
6263 DO_LDFF1_LDNF1_1(bds, MO_64)
6265 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
6266 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6267 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6268 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6269 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6271 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
6272 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6273 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6275 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
6277 #undef DO_LDFF1_LDNF1_1
6278 #undef DO_LDFF1_LDNF1_2
6281 * Common helper for all contiguous 1,2,3,4-register predicated stores.
6284 static inline QEMU_ALWAYS_INLINE
6285 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6286 uint32_t desc, const uintptr_t retaddr,
6287 const int esz, const int msz, const int N, uint32_t mtedesc,
6288 sve_ldst1_host_fn *host_fn,
6289 sve_ldst1_tlb_fn *tlb_fn)
6291 const unsigned rd = simd_data(desc);
6292 const intptr_t reg_max = simd_oprsz(desc);
6293 intptr_t reg_off, reg_last, mem_off;
6294 SVEContLdSt info;
6295 void *host;
6296 int i, flags;
6298 /* Find the active elements. */
6299 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6300 /* The entire predicate was false; no store occurs. */
6301 return;
6304 /* Probe the page(s). Exit with exception for any invalid page. */
6305 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6307 /* Handle watchpoints for all active elements. */
6308 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6309 BP_MEM_WRITE, retaddr);
6312 * Handle mte checks for all active elements.
6313 * Since TBI must be set for MTE, !mtedesc => !mte_active.
6315 if (mtedesc) {
6316 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6317 mtedesc, retaddr);
6320 flags = info.page[0].flags | info.page[1].flags;
6321 if (unlikely(flags != 0)) {
6322 #ifdef CONFIG_USER_ONLY
6323 g_assert_not_reached();
6324 #else
6326 * At least one page includes MMIO.
6327 * Any bus operation can fail with cpu_transaction_failed,
6328 * which for ARM will raise SyncExternal. We cannot avoid
6329 * this fault and will leave with the store incomplete.
6331 mem_off = info.mem_off_first[0];
6332 reg_off = info.reg_off_first[0];
6333 reg_last = info.reg_off_last[1];
6334 if (reg_last < 0) {
6335 reg_last = info.reg_off_split;
6336 if (reg_last < 0) {
6337 reg_last = info.reg_off_last[0];
6341 do {
6342 uint64_t pg = vg[reg_off >> 6];
6343 do {
6344 if ((pg >> (reg_off & 63)) & 1) {
6345 for (i = 0; i < N; ++i) {
6346 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6347 addr + mem_off + (i << msz), retaddr);
6350 reg_off += 1 << esz;
6351 mem_off += N << msz;
6352 } while (reg_off & 63);
6353 } while (reg_off <= reg_last);
6354 return;
6355 #endif
6358 mem_off = info.mem_off_first[0];
6359 reg_off = info.reg_off_first[0];
6360 reg_last = info.reg_off_last[0];
6361 host = info.page[0].host;
6363 while (reg_off <= reg_last) {
6364 uint64_t pg = vg[reg_off >> 6];
6365 do {
6366 if ((pg >> (reg_off & 63)) & 1) {
6367 for (i = 0; i < N; ++i) {
6368 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6369 host + mem_off + (i << msz));
6372 reg_off += 1 << esz;
6373 mem_off += N << msz;
6374 } while (reg_off <= reg_last && (reg_off & 63));
6378 * Use the slow path to manage the cross-page misalignment.
6379 * But we know this is RAM and cannot trap.
6381 mem_off = info.mem_off_split;
6382 if (unlikely(mem_off >= 0)) {
6383 reg_off = info.reg_off_split;
6384 for (i = 0; i < N; ++i) {
6385 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6386 addr + mem_off + (i << msz), retaddr);
6390 mem_off = info.mem_off_first[1];
6391 if (unlikely(mem_off >= 0)) {
6392 reg_off = info.reg_off_first[1];
6393 reg_last = info.reg_off_last[1];
6394 host = info.page[1].host;
6396 do {
6397 uint64_t pg = vg[reg_off >> 6];
6398 do {
6399 if ((pg >> (reg_off & 63)) & 1) {
6400 for (i = 0; i < N; ++i) {
6401 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6402 host + mem_off + (i << msz));
6405 reg_off += 1 << esz;
6406 mem_off += N << msz;
6407 } while (reg_off & 63);
6408 } while (reg_off <= reg_last);
6412 static inline QEMU_ALWAYS_INLINE
6413 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6414 uint32_t desc, const uintptr_t ra,
6415 const int esz, const int msz, const int N,
6416 sve_ldst1_host_fn *host_fn,
6417 sve_ldst1_tlb_fn *tlb_fn)
6419 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6420 int bit55 = extract64(addr, 55, 1);
6422 /* Remove mtedesc from the normal sve descriptor. */
6423 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6425 /* Perform gross MTE suppression early. */
6426 if (!tbi_check(desc, bit55) ||
6427 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6428 mtedesc = 0;
6431 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6434 #define DO_STN_1(N, NAME, ESZ) \
6435 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
6436 target_ulong addr, uint32_t desc) \
6438 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
6439 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6441 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
6442 target_ulong addr, uint32_t desc) \
6444 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
6445 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6448 #define DO_STN_2(N, NAME, ESZ, MSZ) \
6449 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
6450 target_ulong addr, uint32_t desc) \
6452 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6453 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6455 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
6456 target_ulong addr, uint32_t desc) \
6458 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6459 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6461 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6462 target_ulong addr, uint32_t desc) \
6464 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6465 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6467 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6468 target_ulong addr, uint32_t desc) \
6470 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6471 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6474 DO_STN_1(1, bb, MO_8)
6475 DO_STN_1(1, bh, MO_16)
6476 DO_STN_1(1, bs, MO_32)
6477 DO_STN_1(1, bd, MO_64)
6478 DO_STN_1(2, bb, MO_8)
6479 DO_STN_1(3, bb, MO_8)
6480 DO_STN_1(4, bb, MO_8)
6482 DO_STN_2(1, hh, MO_16, MO_16)
6483 DO_STN_2(1, hs, MO_32, MO_16)
6484 DO_STN_2(1, hd, MO_64, MO_16)
6485 DO_STN_2(2, hh, MO_16, MO_16)
6486 DO_STN_2(3, hh, MO_16, MO_16)
6487 DO_STN_2(4, hh, MO_16, MO_16)
6489 DO_STN_2(1, ss, MO_32, MO_32)
6490 DO_STN_2(1, sd, MO_64, MO_32)
6491 DO_STN_2(2, ss, MO_32, MO_32)
6492 DO_STN_2(3, ss, MO_32, MO_32)
6493 DO_STN_2(4, ss, MO_32, MO_32)
6495 DO_STN_2(1, dd, MO_64, MO_64)
6496 DO_STN_2(2, dd, MO_64, MO_64)
6497 DO_STN_2(3, dd, MO_64, MO_64)
6498 DO_STN_2(4, dd, MO_64, MO_64)
6500 #undef DO_STN_1
6501 #undef DO_STN_2
6504 * Loads with a vector index.
6508 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6510 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6512 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6514 return *(uint32_t *)(reg + H1_4(reg_ofs));
6517 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6519 return *(int32_t *)(reg + H1_4(reg_ofs));
6522 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6524 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6527 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6529 return (int32_t)*(uint64_t *)(reg + reg_ofs);
6532 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6534 return *(uint64_t *)(reg + reg_ofs);
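/*
 * These extractors feed the address computation in the gather/scatter
 * helpers below, which form each element address as
 *     base + (off_fn(vm, reg_off) << scale)
 * As an illustration (assuming a 64-bit target_ulong): with off_zss_s and
 * a stored 32-bit offset of 0xfffffffc, the value is sign-extended to -4,
 * so with scale == 2 the effective address becomes base - 16.
 */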
6537 static inline QEMU_ALWAYS_INLINE
6538 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6539 target_ulong base, uint32_t desc, uintptr_t retaddr,
6540 uint32_t mtedesc, int esize, int msize,
6541 zreg_off_fn *off_fn,
6542 sve_ldst1_host_fn *host_fn,
6543 sve_ldst1_tlb_fn *tlb_fn)
6545 const int mmu_idx = cpu_mmu_index(env, false);
6546 const intptr_t reg_max = simd_oprsz(desc);
6547 const int scale = simd_data(desc);
6548 ARMVectorReg scratch;
6549 intptr_t reg_off;
6550 SVEHostPage info, info2;
6552 memset(&scratch, 0, reg_max);
6553 reg_off = 0;
6554 do {
6555 uint64_t pg = vg[reg_off >> 6];
6556 do {
6557 if (likely(pg & 1)) {
6558 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6559 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6561 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6562 mmu_idx, retaddr);
6564 if (likely(in_page >= msize)) {
6565 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6566 cpu_check_watchpoint(env_cpu(env), addr, msize,
6567 info.attrs, BP_MEM_READ, retaddr);
6569 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
6570 mte_check(env, mtedesc, addr, retaddr);
6572 if (unlikely(info.flags & TLB_MMIO)) {
6573 tlb_fn(env, &scratch, reg_off, addr, retaddr);
6574 } else {
6575 host_fn(&scratch, reg_off, info.host);
6577 } else {
6578 /* Element crosses the page boundary. */
6579 sve_probe_page(&info2, false, env, addr + in_page, 0,
6580 MMU_DATA_LOAD, mmu_idx, retaddr);
6581 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6582 cpu_check_watchpoint(env_cpu(env), addr,
6583 msize, info.attrs,
6584 BP_MEM_READ, retaddr);
6586 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
6587 mte_check(env, mtedesc, addr, retaddr);
6589 tlb_fn(env, &scratch, reg_off, addr, retaddr);
6592 reg_off += esize;
6593 pg >>= esize;
6594 } while (reg_off & 63);
6595 } while (reg_off < reg_max);
6597 /* Wait until all exceptions have been raised to write back. */
6598 memcpy(vd, &scratch, reg_max);
6601 static inline QEMU_ALWAYS_INLINE
6602 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6603 target_ulong base, uint32_t desc, uintptr_t retaddr,
6604 int esize, int msize, zreg_off_fn *off_fn,
6605 sve_ldst1_host_fn *host_fn,
6606 sve_ldst1_tlb_fn *tlb_fn)
6608 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6609 /* Remove mtedesc from the normal sve descriptor. */
6610 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6613 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6614 * offset base entirely over the address space hole to change the
6615 * pointer tag, or change the bit55 selector. So we could here
6616 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6618 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6619 esize, msize, off_fn, host_fn, tlb_fn);
6622 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6623 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6624 void *vm, target_ulong base, uint32_t desc) \
6626 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
6627 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6629 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6630 void *vm, target_ulong base, uint32_t desc) \
6632 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
6633 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6636 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6637 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6638 void *vm, target_ulong base, uint32_t desc) \
6640 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
6641 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6643 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6644 void *vm, target_ulong base, uint32_t desc) \
6646 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
6647 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6650 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6651 DO_LD1_ZPZ_S(bsu, zss, MO_8)
6652 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6653 DO_LD1_ZPZ_D(bdu, zss, MO_8)
6654 DO_LD1_ZPZ_D(bdu, zd, MO_8)
6656 DO_LD1_ZPZ_S(bss, zsu, MO_8)
6657 DO_LD1_ZPZ_S(bss, zss, MO_8)
6658 DO_LD1_ZPZ_D(bds, zsu, MO_8)
6659 DO_LD1_ZPZ_D(bds, zss, MO_8)
6660 DO_LD1_ZPZ_D(bds, zd, MO_8)
6662 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6663 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6664 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6665 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6666 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6668 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6669 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6670 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6671 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6672 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6674 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6675 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6676 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6677 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6678 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6680 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6681 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6682 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6683 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6684 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6686 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6687 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6688 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6689 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6690 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6692 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6693 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6694 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6695 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6696 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6698 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6699 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6700 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6702 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6703 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6704 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6706 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6707 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6708 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6710 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6711 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6712 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6714 #undef DO_LD1_ZPZ_S
6715 #undef DO_LD1_ZPZ_D
6717 /* First fault loads with a vector index. */
6720 * Common helpers for all gather first-faulting loads.
6723 static inline QEMU_ALWAYS_INLINE
6724 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6725 target_ulong base, uint32_t desc, uintptr_t retaddr,
6726 uint32_t mtedesc, const int esz, const int msz,
6727 zreg_off_fn *off_fn,
6728 sve_ldst1_host_fn *host_fn,
6729 sve_ldst1_tlb_fn *tlb_fn)
6731 const int mmu_idx = cpu_mmu_index(env, false);
6732 const intptr_t reg_max = simd_oprsz(desc);
6733 const int scale = simd_data(desc);
6734 const int esize = 1 << esz;
6735 const int msize = 1 << msz;
6736 intptr_t reg_off;
6737 SVEHostPage info;
6738 target_ulong addr, in_page;
6740 /* Skip to the first true predicate. */
6741 reg_off = find_next_active(vg, 0, reg_max, esz);
6742 if (unlikely(reg_off >= reg_max)) {
6743 /* The entire predicate was false; no load occurs. */
6744 memset(vd, 0, reg_max);
6745 return;
6749 * Probe the first element, allowing faults.
6751 addr = base + (off_fn(vm, reg_off) << scale);
6752 if (mtedesc) {
6753 mte_check(env, mtedesc, addr, retaddr);
6755 tlb_fn(env, vd, reg_off, addr, retaddr);
6757 /* After any fault, zero the other elements. */
6758 swap_memzero(vd, reg_off);
6759 reg_off += esize;
6760 swap_memzero(vd + reg_off, reg_max - reg_off);
6763 * Probe the remaining elements, not allowing faults.
6765 while (reg_off < reg_max) {
6766 uint64_t pg = vg[reg_off >> 6];
6767 do {
6768 if (likely((pg >> (reg_off & 63)) & 1)) {
6769 addr = base + (off_fn(vm, reg_off) << scale);
6770 in_page = -(addr | TARGET_PAGE_MASK);
6772 if (unlikely(in_page < msize)) {
6773 /* Stop if the element crosses a page boundary. */
6774 goto fault;
6777 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6778 mmu_idx, retaddr);
6779 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6780 goto fault;
6782 if (unlikely(info.flags & TLB_WATCHPOINT) &&
6783 (cpu_watchpoint_address_matches
6784 (env_cpu(env), addr, msize) & BP_MEM_READ)) {
6785 goto fault;
6787 if (mtedesc &&
6788 arm_tlb_mte_tagged(&info.attrs) &&
6789 !mte_probe(env, mtedesc, addr)) {
6790 goto fault;
6793 host_fn(vd, reg_off, info.host);
6795 reg_off += esize;
6796 } while (reg_off & 63);
6798 return;
6800 fault:
6801 record_fault(env, reg_off, reg_max);
6804 static inline QEMU_ALWAYS_INLINE
6805 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6806 target_ulong base, uint32_t desc, uintptr_t retaddr,
6807 const int esz, const int msz,
6808 zreg_off_fn *off_fn,
6809 sve_ldst1_host_fn *host_fn,
6810 sve_ldst1_tlb_fn *tlb_fn)
6812 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6813 /* Remove mtedesc from the normal sve descriptor. */
6814 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6817 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6818 * offset base entirely over the address space hole to change the
6819 * pointer tag, or change the bit55 selector. So we could here
6820 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6822 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6823 esz, msz, off_fn, host_fn, tlb_fn);
6826 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
6827 void HELPER(sve_ldff##MEM##_##OFS) \
6828 (CPUARMState *env, void *vd, void *vg, \
6829 void *vm, target_ulong base, uint32_t desc) \
6831 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
6832 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6834 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6835 (CPUARMState *env, void *vd, void *vg, \
6836 void *vm, target_ulong base, uint32_t desc) \
6838 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
6839 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6842 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
6843 void HELPER(sve_ldff##MEM##_##OFS) \
6844 (CPUARMState *env, void *vd, void *vg, \
6845 void *vm, target_ulong base, uint32_t desc) \
6847 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
6848 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6850 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6851 (CPUARMState *env, void *vd, void *vg, \
6852 void *vm, target_ulong base, uint32_t desc) \
6854 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
6855 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6858 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
6859 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
6860 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
6861 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
6862 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
6864 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
6865 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
6866 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
6867 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
6868 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
6870 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
6871 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
6872 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
6873 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
6874 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
6876 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
6877 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
6878 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
6879 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
6880 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
6882 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
6883 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
6884 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
6885 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
6886 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
6888 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
6889 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
6890 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
6891 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
6892 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
6894 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
6895 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
6896 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
6897 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
6898 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
6900 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
6901 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
6902 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
6903 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
6904 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
6906 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
6907 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
6908 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
6910 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
6911 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
6912 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
6914 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
6915 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
6916 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
6918 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
6919 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
6920 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
6922 /* Stores with a vector index. */
6924 static inline QEMU_ALWAYS_INLINE
6925 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6926 target_ulong base, uint32_t desc, uintptr_t retaddr,
6927 uint32_t mtedesc, int esize, int msize,
6928 zreg_off_fn *off_fn,
6929 sve_ldst1_host_fn *host_fn,
6930 sve_ldst1_tlb_fn *tlb_fn)
6932 const int mmu_idx = cpu_mmu_index(env, false);
6933 const intptr_t reg_max = simd_oprsz(desc);
6934 const int scale = simd_data(desc);
6935 void *host[ARM_MAX_VQ * 4];
6936 intptr_t reg_off, i;
6937 SVEHostPage info, info2;
6940 * Probe all of the elements for host addresses and flags.
6942 i = reg_off = 0;
6943 do {
6944 uint64_t pg = vg[reg_off >> 6];
6945 do {
6946 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6947 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6949 host[i] = NULL;
6950 if (likely((pg >> (reg_off & 63)) & 1)) {
6951 if (likely(in_page >= msize)) {
6952 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
6953 mmu_idx, retaddr);
6954 if (!(info.flags & TLB_MMIO)) {
6955 host[i] = info.host;
6957 } else {
6959 * Element crosses the page boundary.
6960 * Probe both pages, but do not record the host address,
6961 * so that we use the slow path.
6963 sve_probe_page(&info, false, env, addr, 0,
6964 MMU_DATA_STORE, mmu_idx, retaddr);
6965 sve_probe_page(&info2, false, env, addr + in_page, 0,
6966 MMU_DATA_STORE, mmu_idx, retaddr);
6967 info.flags |= info2.flags;
6970 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6971 cpu_check_watchpoint(env_cpu(env), addr, msize,
6972 info.attrs, BP_MEM_WRITE, retaddr);
6975 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
6976 mte_check(env, mtedesc, addr, retaddr);
6979 i += 1;
6980 reg_off += esize;
6981 } while (reg_off & 63);
6982 } while (reg_off < reg_max);
6985 * Now that we have recognized all exceptions except SyncExternal
6986 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
6988 * Note for the common case of an element in RAM, not crossing a page
6989 * boundary, we have stored the host address in host[]. This doubles
6990 * as a first-level check against the predicate, since only enabled
6991 * elements have non-null host addresses.
6993 i = reg_off = 0;
6994 do {
6995 void *h = host[i];
6996 if (likely(h != NULL)) {
6997 host_fn(vd, reg_off, h);
6998 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
6999 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7000 tlb_fn(env, vd, reg_off, addr, retaddr);
7002 i += 1;
7003 reg_off += esize;
7004 } while (reg_off < reg_max);
7007 static inline QEMU_ALWAYS_INLINE
7008 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7009 target_ulong base, uint32_t desc, uintptr_t retaddr,
7010 int esize, int msize, zreg_off_fn *off_fn,
7011 sve_ldst1_host_fn *host_fn,
7012 sve_ldst1_tlb_fn *tlb_fn)
7014 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7015 /* Remove mtedesc from the normal sve descriptor. */
7016 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7019 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7020 * offset base entirely over the address space hole to change the
7021 * pointer tag, or change the bit55 selector. So we could here
7022 * examine TBI + TCMA like we do for sve_ldN_r_mte().
7024 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7025 esize, msize, off_fn, host_fn, tlb_fn);
7028 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
7029 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7030 void *vm, target_ulong base, uint32_t desc) \
7032 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
7033 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7035 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7036 void *vm, target_ulong base, uint32_t desc) \
7038 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
7039 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7042 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
7043 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7044 void *vm, target_ulong base, uint32_t desc) \
7046 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
7047 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7049 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7050 void *vm, target_ulong base, uint32_t desc) \
7052 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
7053 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7056 DO_ST1_ZPZ_S(bs, zsu, MO_8)
7057 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7058 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7059 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7060 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7062 DO_ST1_ZPZ_S(bs, zss, MO_8)
7063 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7064 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7065 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7066 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7068 DO_ST1_ZPZ_D(bd, zsu, MO_8)
7069 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7070 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7071 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7072 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7073 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7074 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7076 DO_ST1_ZPZ_D(bd, zss, MO_8)
7077 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7078 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7079 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7080 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7081 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7082 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7084 DO_ST1_ZPZ_D(bd, zd, MO_8)
7085 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7086 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7087 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7088 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7089 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7090 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7092 #undef DO_ST1_ZPZ_S
7093 #undef DO_ST1_ZPZ_D
7095 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7097 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7098 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7100 for (i = 0; i < opr_sz; ++i) {
7101 d[i] = n[i] ^ m[i] ^ k[i];
7105 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7107 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7108 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7110 for (i = 0; i < opr_sz; ++i) {
7111 d[i] = n[i] ^ (m[i] & ~k[i]);
7115 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7117 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7118 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7120 for (i = 0; i < opr_sz; ++i) {
7121 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7125 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7127 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7128 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7130 for (i = 0; i < opr_sz; ++i) {
7131 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7135 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7137 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7138 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7140 for (i = 0; i < opr_sz; ++i) {
7141 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
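/*
 * A quick reference for the bitwise-select family above, derived from the
 * expressions in the helpers (per bit, k selects n when set and m when
 * clear):
 *   BSL   (for comparison, handled elsewhere): (n & k) | (m & ~k)
 *   BSL1N: first input inverted,   (~n & k) | (m & ~k)
 *   BSL2N: second input inverted,  (n & k) | (~m & ~k)
 *   NBSL:  result inverted,        ~((n & k) | (m & ~k))
 */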
7146 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
7147 * See hasless(v,1) from
7148 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
7150 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
7152 int bits = 8 << esz;
7153 uint64_t ones = dup_const(esz, 1);
7154 uint64_t signs = ones << (bits - 1);
7155 uint64_t cmp0, cmp1;
7157 cmp1 = dup_const(esz, n);
7158 cmp0 = cmp1 ^ m0;
7159 cmp1 = cmp1 ^ m1;
7160 cmp0 = (cmp0 - ones) & ~cmp0;
7161 cmp1 = (cmp1 - ones) & ~cmp1;
7162 return (cmp0 | cmp1) & signs;
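/*
 * Illustrative example (arbitrary values): with esz == MO_8 and n == 0x42,
 * cmp0 == dup_const(MO_8, 0x42) ^ m0 has a zero byte exactly where m0
 * contains 0x42.  The haszero-style expression (x - ones) & ~x & signs is
 * nonzero iff x contains a zero byte, so e.g. m0 == 0x0000004200000000
 * gives cmp0 == 0x4242420042424242 and a nonzero result, while an m0 with
 * no 0x42 byte gives zero.
 */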
7165 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7166 uint32_t desc, int esz, bool nmatch)
7168 uint16_t esz_mask = pred_esz_masks[esz];
7169 intptr_t opr_sz = simd_oprsz(desc);
7170 uint32_t flags = PREDTEST_INIT;
7171 intptr_t i, j, k;
7173 for (i = 0; i < opr_sz; i += 16) {
7174 uint64_t m0 = *(uint64_t *)(vm + i);
7175 uint64_t m1 = *(uint64_t *)(vm + i + 8);
7176 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7177 uint16_t out = 0;
7179 for (j = 0; j < 16; j += 8) {
7180 uint64_t n = *(uint64_t *)(vn + i + j);
7182 for (k = 0; k < 8; k += 1 << esz) {
7183 if (pg & (1 << (j + k))) {
7184 bool o = do_match2(n >> (k * 8), m0, m1, esz);
7185 out |= (o ^ nmatch) << (j + k);
7189 *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7190 flags = iter_predtest_fwd(out, pg, flags);
7192 return flags;

#define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                             \
    return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
}

DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)

DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)

#undef DO_PPZZ_MATCH

void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz; i += 4) {
        uint64_t count = 0;
        uint8_t pred;

        pred = pg[H1(i >> 3)] >> (i & 7);
        if (pred & 1) {
            uint32_t nn = n[H4(i >> 2)];

            for (j = 0; j <= i; j += 4) {
                pred = pg[H1(j >> 3)] >> (j & 7);
                if ((pred & 1) && nn == m[H4(j >> 2)]) {
                    ++count;
                }
            }
        }
        d[H4(i >> 2)] = count;
    }
}
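
/*
 * Illustration of the helper above (example values only, all elements
 * active, four 32-bit elements shown): with n = {1, 2, 1, 1} and
 * m = {1, 1, 2, 1},
 *   d[0] = matches of n[0] = 1 in m[0..0] = {1}        -> 1
 *   d[1] = matches of n[1] = 2 in m[0..1] = {1,1}      -> 0
 *   d[2] = matches of n[2] = 1 in m[0..2] = {1,1,2}    -> 2
 *   d[3] = matches of n[3] = 1 in m[0..3] = {1,1,2,1}  -> 3
 * Only active elements of m are counted, and inactive destination
 * elements are zeroed.
 */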

void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t count = 0;
        if (pg[H1(i)] & 1) {
            uint64_t nn = n[i];
            for (j = 0; j <= i; ++j) {
                if ((pg[H1(j)] & 1) && nn == m[j]) {
                    ++count;
                }
            }
        }
        d[i] = count;
    }
}

/*
 * Returns the number of bytes in m0 and m1 that match n.
 * Unlike do_match2 we don't just need true/false, we need an exact count.
 * This requires two extra logical operations.
 */
static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
{
    const uint64_t mask = dup_const(MO_8, 0x7f);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(MO_8, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;

    /*
     * 1: clear msb of each byte to avoid carry to next byte (& mask)
     * 2: carry in to msb if byte != 0 (+ mask)
     * 3: set msb if cmp has msb set (| cmp)
     * 4: set ~msb to ignore them (| mask)
     * We now have 0xff for byte != 0 or 0x7f for byte == 0.
     * 5: invert, resulting in 0x80 if and only if byte == 0.
     */
    cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
    cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);

    /*
     * Combine the two compares in a way that the bits do
     * not overlap, and so preserves the count of set bits.
     * If the host has an efficient instruction for ctpop,
     * then ctpop(x) + ctpop(y) has the same number of
     * operations as ctpop(x | (y >> 1)).  If the host does
     * not have an efficient ctpop, then we only want to
     * use it once.
     */
    return ctpop64(cmp0 | (cmp1 >> 1));
}
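
/*
 * Single-byte illustration of steps 1-5 above (example values only):
 *   byte == 0x42 (no match): ((0x42 & 0x7f) + 0x7f) = 0xc1;
 *                            | 0x42 = 0xc3; | 0x7f = 0xff; ~ -> 0x00
 *   byte == 0x00 (match):    ((0x00 & 0x7f) + 0x7f) = 0x7f;
 *                            | 0x00 = 0x7f; | 0x7f = 0x7f; ~ -> 0x80
 * Each matching byte thus contributes 0x80 in cmp0 but 0x40 in
 * (cmp1 >> 1), so the two sets of bits never overlap and a single
 * ctpop64 counts both.
 */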

void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t n0 = *(uint64_t *)(vn + i);
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t n1 = *(uint64_t *)(vn + i + 8);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint64_t out0 = 0;
        uint64_t out1 = 0;

        for (j = 0; j < 64; j += 8) {
            uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
            uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
            out0 |= cnt0 << j;
            out1 |= cnt1 << j;
        }

        *(uint64_t *)(vd + i) = out0;
        *(uint64_t *)(vd + i + 8) = out1;
    }
}
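
/*
 * Note on the helper above: each 16-byte segment is independent.  For
 * every byte of vn in a segment, the corresponding output byte is the
 * number of bytes of vm in the same segment (m0 and m1 together) equal
 * to it.  The count is at most 16, so 'cnt << j' packs it into its byte
 * of out0/out1 without overflowing into the neighbouring byte.
 */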

void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 8 - shr;
    uint64_t mask = dup_const(MO_8, 0xff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 16 - shr;
    uint64_t mask = dup_const(MO_16, 0xffff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    int shr = simd_data(desc);
    uint32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ror32(n[i] ^ m[i], shr);
    }
}
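
/*
 * Worked example for the packed rotates above (esz == MO_8, shr == 4,
 * example lane value 0xb4): mask = dup_const(MO_8, 0xff >> 4) = 0x0f0f...,
 * so (0xb4 >> 4) & 0x0f = 0x0b and (0xb4 << 4) & 0xf0 = 0x40, giving
 * 0x4b, i.e. 0xb4 rotated right by 4 within its own byte.  The masks
 * discard bits that the plain 64-bit shifts would leak in from the
 * neighbouring lanes; the 32-bit form can simply use ror32 per element.
 */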

void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float32 *n = vn + s * sizeof(float32) * 4;
        float32 *m = vm + s * sizeof(float32) * 4;
        float32 *a = va + s * sizeof(float32) * 4;
        float32 *d = vd + s * sizeof(float32) * 4;
        float32 n00 = n[H4(0)], n01 = n[H4(1)];
        float32 n10 = n[H4(2)], n11 = n[H4(3)];
        float32 m00 = m[H4(0)], m01 = m[H4(1)];
        float32 m10 = m[H4(2)], m11 = m[H4(3)];
        float32 p0, p1;

        /* i = 0, j = 0 */
        p0 = float32_mul(n00, m00, status);
        p1 = float32_mul(n01, m01, status);
        d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float32_mul(n00, m10, status);
        p1 = float32_mul(n01, m11, status);
        d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float32_mul(n10, m00, status);
        p1 = float32_mul(n11, m01, status);
        d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float32_mul(n10, m10, status);
        p1 = float32_mul(n11, m11, status);
        d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
    }
}
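
/*
 * Layout note for fmmla_s above and fmmla_d below: each 128-bit (resp.
 * 256-bit) segment holds a 2x2 matrix in row-major order, element [r][c]
 * at index 2*r + c.  Per segment the helpers compute
 *     d[r][c] = a[r][c] + (n[r][0] * m[c][0] + n[r][1] * m[c][1])
 * i.e. the addend plus the product of N with the transpose of M, with
 * the two products summed before the single addition into the addend.
 */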

void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float64 *n = vn + s * sizeof(float64) * 4;
        float64 *m = vm + s * sizeof(float64) * 4;
        float64 *a = va + s * sizeof(float64) * 4;
        float64 *d = vd + s * sizeof(float64) * 4;
        float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
        float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
        float64 p0, p1;

        /* i = 0, j = 0 */
        p0 = float64_mul(n00, m00, status);
        p1 = float64_mul(n01, m01, status);
        d[0] = float64_add(a[0], float64_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float64_mul(n00, m10, status);
        p1 = float64_mul(n01, m11, status);
        d[1] = float64_add(a[1], float64_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float64_mul(n10, m00, status);
        p1 = float64_mul(n11, m01, status);
        d[2] = float64_add(a[2], float64_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float64_mul(n10, m10, status);
        p1 = float64_mul(n11, m11, status);
        d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
    }
}

#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{                                                                             \
    intptr_t i = simd_oprsz(desc);                                            \
    uint64_t *g = vg;                                                         \
    do {                                                                      \
        uint64_t pg = g[(i - 1) >> 6];                                        \
        do {                                                                  \
            i -= sizeof(TYPEW);                                               \
            if (likely((pg >> (i & 63)) & 1)) {                               \
                TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
                *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
            }                                                                 \
        } while (i & 63);                                                     \
    } while (i != 0);                                                         \
}

DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
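
/*
 * Note on DO_FCVTNT above: the vector is walked from the top down, one
 * 64-bit predicate word at a time.  For each active wide element at byte
 * offset i, the narrowed result is stored into the upper half of that
 * element's slot (offset i + sizeof(TYPEN)); the lower halves of vd and
 * any inactive elements are left unchanged.  DO_FCVTLT below is the
 * mirror image: it reads the upper (odd) narrow halves and writes full
 * wide elements.
 */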

#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{                                                                             \
    intptr_t i = simd_oprsz(desc);                                            \
    uint64_t *g = vg;                                                         \
    do {                                                                      \
        uint64_t pg = g[(i - 1) >> 6];                                        \
        do {                                                                  \
            i -= sizeof(TYPEW);                                               \
            if (likely((pg >> (i & 63)) & 1)) {                               \
                TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
                *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
            }                                                                 \
        } while (i & 63);                                                     \
    } while (i != 0);                                                         \
}

DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)

#undef DO_FCVTLT
#undef DO_FCVTNT