hw/arm/virt: Make accels in GIC finalize logic explicit
[qemu.git] / target / arm / sve_helper.c
blob521fc9b9697670694d26633a825f84e06a8552f9
1 /*
2 * ARM SVE Operations
4 * Copyright (c) 2018 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/helper-proto.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg.h"
28 #include "vec_internal.h"
29 #include "sve_ldst_internal.h"
32 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
34 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
35 * and bit 0 set if C is set. Compare the definitions of these variables
36 * within CPUARMState.
39 /* For no G bits set, NZCV = C. */
40 #define PREDTEST_INIT 1
42 /* This is an iterative function, called for each Pd and Pg word
43 * moving forward.
45 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
47 if (likely(g)) {
48 /* Compute N from first D & G.
49 Use bit 2 to signal first G bit seen. */
50 if (!(flags & 4)) {
51 flags |= ((d & (g & -g)) != 0) << 31;
52 flags |= 4;
55 /* Accumulate Z from each D & G. */
56 flags |= ((d & g) != 0) << 1;
58 /* Compute C from last !(D & G). Replace previous. */
59 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
61 return flags;
64 /* This is an iterative function, called for each Pd and Pg word
65 * moving backward.
67 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
69 if (likely(g)) {
70 /* Compute C from first (i.e last) !(D & G).
71 Use bit 2 to signal first G bit seen. */
72 if (!(flags & 4)) {
73 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
74 flags |= (d & pow2floor(g)) == 0;
77 /* Accumulate Z from each D & G. */
78 flags |= ((d & g) != 0) << 1;
80 /* Compute N from last (i.e first) D & G. Replace previous. */
81 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
83 return flags;
86 /* The same for a single word predicate. */
87 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
89 return iter_predtest_fwd(d, g, PREDTEST_INIT);
92 /* The same for a multi-word predicate. */
93 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
95 uint32_t flags = PREDTEST_INIT;
96 uint64_t *d = vd, *g = vg;
97 uintptr_t i = 0;
99 do {
100 flags = iter_predtest_fwd(d[i], g[i], flags);
101 } while (++i < words);
103 return flags;
106 /* Similarly for single word elements. */
107 static inline uint64_t expand_pred_s(uint8_t byte)
109 static const uint64_t word[] = {
110 [0x01] = 0x00000000ffffffffull,
111 [0x10] = 0xffffffff00000000ull,
112 [0x11] = 0xffffffffffffffffull,
114 return word[byte & 0x11];
117 #define LOGICAL_PPPP(NAME, FUNC) \
118 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
120 uintptr_t opr_sz = simd_oprsz(desc); \
121 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
122 uintptr_t i; \
123 for (i = 0; i < opr_sz / 8; ++i) { \
124 d[i] = FUNC(n[i], m[i], g[i]); \
128 #define DO_AND(N, M, G) (((N) & (M)) & (G))
129 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
130 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
131 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
132 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
133 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
134 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
135 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
137 LOGICAL_PPPP(sve_and_pppp, DO_AND)
138 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
139 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
140 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
141 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
142 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
143 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
144 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
146 #undef DO_AND
147 #undef DO_BIC
148 #undef DO_EOR
149 #undef DO_ORR
150 #undef DO_ORN
151 #undef DO_NOR
152 #undef DO_NAND
153 #undef DO_SEL
154 #undef LOGICAL_PPPP
156 /* Fully general three-operand expander, controlled by a predicate.
157 * This is complicated by the host-endian storage of the register file.
159 /* ??? I don't expect the compiler could ever vectorize this itself.
160 * With some tables we can convert bit masks to byte masks, and with
161 * extra care wrt byte/word ordering we could use gcc generic vectors
162 * and do 16 bytes at a time.
164 #define DO_ZPZZ(NAME, TYPE, H, OP) \
165 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
167 intptr_t i, opr_sz = simd_oprsz(desc); \
168 for (i = 0; i < opr_sz; ) { \
169 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
170 do { \
171 if (pg & 1) { \
172 TYPE nn = *(TYPE *)(vn + H(i)); \
173 TYPE mm = *(TYPE *)(vm + H(i)); \
174 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
176 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
177 } while (i & 15); \
181 /* Similarly, specialized for 64-bit operands. */
182 #define DO_ZPZZ_D(NAME, TYPE, OP) \
183 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
185 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
186 TYPE *d = vd, *n = vn, *m = vm; \
187 uint8_t *pg = vg; \
188 for (i = 0; i < opr_sz; i += 1) { \
189 if (pg[H1(i)] & 1) { \
190 TYPE nn = n[i], mm = m[i]; \
191 d[i] = OP(nn, mm); \
196 #define DO_AND(N, M) (N & M)
197 #define DO_EOR(N, M) (N ^ M)
198 #define DO_ORR(N, M) (N | M)
199 #define DO_BIC(N, M) (N & ~M)
200 #define DO_ADD(N, M) (N + M)
201 #define DO_SUB(N, M) (N - M)
202 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
203 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
204 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
205 #define DO_MUL(N, M) (N * M)
209 * We must avoid the C undefined behaviour cases: division by
210 * zero and signed division of INT_MIN by -1. Both of these
211 * have architecturally defined required results for Arm.
212 * We special case all signed divisions by -1 to avoid having
213 * to deduce the minimum integer for the type involved.
215 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
216 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
218 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
219 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
220 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
221 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
223 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
224 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
225 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
226 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
228 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
229 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
230 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
231 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
233 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
234 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
235 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
236 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
238 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
239 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
240 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
241 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
243 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
244 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
245 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
246 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
248 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
249 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
250 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
251 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
253 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
254 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
255 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
256 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
258 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
259 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
260 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
261 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
263 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
264 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
265 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
266 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
268 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
269 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
270 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
271 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
273 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
274 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
275 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
276 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
278 /* Because the computation type is at least twice as large as required,
279 these work for both signed and unsigned source types. */
280 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
282 return (n * m) >> 8;
285 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
287 return (n * m) >> 16;
290 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
292 return (n * m) >> 32;
295 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
297 uint64_t lo, hi;
298 muls64(&lo, &hi, n, m);
299 return hi;
302 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
304 uint64_t lo, hi;
305 mulu64(&lo, &hi, n, m);
306 return hi;
309 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
310 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
311 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
312 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
314 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
315 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
316 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
317 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
319 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
320 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
321 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
322 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
324 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
325 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
327 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
328 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
330 /* Note that all bits of the shift are significant
331 and not modulo the element size. */
332 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
333 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
334 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
336 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
337 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
338 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
340 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
341 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
342 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
344 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
345 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
346 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
348 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
349 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
350 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
352 static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
354 int8_t n1 = n, n2 = n >> 8;
355 return m + n1 + n2;
358 static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
360 int16_t n1 = n, n2 = n >> 16;
361 return m + n1 + n2;
364 static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
366 int32_t n1 = n, n2 = n >> 32;
367 return m + n1 + n2;
370 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
371 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
372 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
374 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
376 uint8_t n1 = n, n2 = n >> 8;
377 return m + n1 + n2;
380 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
382 uint16_t n1 = n, n2 = n >> 16;
383 return m + n1 + n2;
386 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
388 uint32_t n1 = n, n2 = n >> 32;
389 return m + n1 + n2;
392 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
393 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
394 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
396 #define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
397 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
398 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
399 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)
401 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
402 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
403 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
404 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
406 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
407 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
408 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
409 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)
411 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
412 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
413 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
414 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
417 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
418 * We pass in a pointer to a dummy saturation field to trigger
419 * the saturating arithmetic but discard the information about
420 * whether it has occurred.
422 #define do_sqshl_b(n, m) \
423 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
424 #define do_sqshl_h(n, m) \
425 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
426 #define do_sqshl_s(n, m) \
427 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
428 #define do_sqshl_d(n, m) \
429 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
431 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
432 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
433 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
434 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
436 #define do_uqshl_b(n, m) \
437 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
438 #define do_uqshl_h(n, m) \
439 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
440 #define do_uqshl_s(n, m) \
441 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
442 #define do_uqshl_d(n, m) \
443 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
445 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
446 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
447 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
448 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
450 #define do_sqrshl_b(n, m) \
451 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
452 #define do_sqrshl_h(n, m) \
453 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
454 #define do_sqrshl_s(n, m) \
455 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
456 #define do_sqrshl_d(n, m) \
457 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
459 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
460 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
461 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
462 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
464 #undef do_sqrshl_d
466 #define do_uqrshl_b(n, m) \
467 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
468 #define do_uqrshl_h(n, m) \
469 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
470 #define do_uqrshl_s(n, m) \
471 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
472 #define do_uqrshl_d(n, m) \
473 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
475 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
476 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
477 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
478 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
480 #undef do_uqrshl_d
482 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
483 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))
485 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
486 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
487 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
488 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
490 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
491 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
492 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
493 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
495 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
496 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))
498 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
499 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
500 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
501 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
503 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
504 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
505 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
506 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
508 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
509 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))
511 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
512 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
513 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
514 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
516 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
517 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
518 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
519 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
521 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
523 return val >= max ? max : val <= min ? min : val;
526 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
527 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
528 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
530 static inline int64_t do_sqadd_d(int64_t n, int64_t m)
532 int64_t r = n + m;
533 if (((r ^ n) & ~(n ^ m)) < 0) {
534 /* Signed overflow. */
535 return r < 0 ? INT64_MAX : INT64_MIN;
537 return r;
540 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
541 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
542 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
543 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
545 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
546 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
547 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
549 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
551 uint64_t r = n + m;
552 return r < n ? UINT64_MAX : r;
555 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
556 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
557 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
558 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
560 #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
561 #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
562 #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
564 static inline int64_t do_sqsub_d(int64_t n, int64_t m)
566 int64_t r = n - m;
567 if (((r ^ n) & (n ^ m)) < 0) {
568 /* Signed overflow. */
569 return r < 0 ? INT64_MAX : INT64_MIN;
571 return r;
574 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
575 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
576 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
577 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
579 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
580 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
581 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
583 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
585 return n > m ? n - m : 0;
588 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
589 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
590 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
591 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
593 #define DO_SUQADD_B(n, m) \
594 do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
595 #define DO_SUQADD_H(n, m) \
596 do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
597 #define DO_SUQADD_S(n, m) \
598 do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
600 static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
602 uint64_t r = n + m;
604 if (n < 0) {
605 /* Note that m - abs(n) cannot underflow. */
606 if (r > INT64_MAX) {
607 /* Result is either very large positive or negative. */
608 if (m > -n) {
609 /* m > abs(n), so r is a very large positive. */
610 return INT64_MAX;
612 /* Result is negative. */
614 } else {
615 /* Both inputs are positive: check for overflow. */
616 if (r < m || r > INT64_MAX) {
617 return INT64_MAX;
620 return r;
623 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
624 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
625 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
626 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
628 #define DO_USQADD_B(n, m) \
629 do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
630 #define DO_USQADD_H(n, m) \
631 do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
632 #define DO_USQADD_S(n, m) \
633 do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
635 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
637 uint64_t r = n + m;
639 if (m < 0) {
640 return n < -m ? 0 : r;
642 return r < n ? UINT64_MAX : r;
645 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
646 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
647 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
648 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
650 #undef DO_ZPZZ
651 #undef DO_ZPZZ_D
654 * Three operand expander, operating on element pairs.
655 * If the slot I is even, the elements from from VN {I, I+1}.
656 * If the slot I is odd, the elements from from VM {I-1, I}.
657 * Load all of the input elements in each pair before overwriting output.
659 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
660 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
662 intptr_t i, opr_sz = simd_oprsz(desc); \
663 for (i = 0; i < opr_sz; ) { \
664 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
665 do { \
666 TYPE n0 = *(TYPE *)(vn + H(i)); \
667 TYPE m0 = *(TYPE *)(vm + H(i)); \
668 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
669 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
670 if (pg & 1) { \
671 *(TYPE *)(vd + H(i)) = OP(n0, n1); \
673 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
674 if (pg & 1) { \
675 *(TYPE *)(vd + H(i)) = OP(m0, m1); \
677 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
678 } while (i & 15); \
682 /* Similarly, specialized for 64-bit operands. */
683 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
684 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
686 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
687 TYPE *d = vd, *n = vn, *m = vm; \
688 uint8_t *pg = vg; \
689 for (i = 0; i < opr_sz; i += 2) { \
690 TYPE n0 = n[i], n1 = n[i + 1]; \
691 TYPE m0 = m[i], m1 = m[i + 1]; \
692 if (pg[H1(i)] & 1) { \
693 d[i] = OP(n0, n1); \
695 if (pg[H1(i + 1)] & 1) { \
696 d[i + 1] = OP(m0, m1); \
701 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
702 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
703 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
704 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
706 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
707 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
708 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
709 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
711 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
712 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
713 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
714 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
716 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
717 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
718 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
719 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
721 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
722 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
723 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
724 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
726 #undef DO_ZPZZ_PAIR
727 #undef DO_ZPZZ_PAIR_D
729 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
730 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
731 void *status, uint32_t desc) \
733 intptr_t i, opr_sz = simd_oprsz(desc); \
734 for (i = 0; i < opr_sz; ) { \
735 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
736 do { \
737 TYPE n0 = *(TYPE *)(vn + H(i)); \
738 TYPE m0 = *(TYPE *)(vm + H(i)); \
739 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
740 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
741 if (pg & 1) { \
742 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
744 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
745 if (pg & 1) { \
746 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
748 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
749 } while (i & 15); \
753 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
754 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
755 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
757 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
758 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
759 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
761 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
762 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
763 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
765 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
766 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
767 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
769 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
770 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
771 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
773 #undef DO_ZPZZ_PAIR_FP
775 /* Three-operand expander, controlled by a predicate, in which the
776 * third operand is "wide". That is, for D = N op M, the same 64-bit
777 * value of M is used with all of the narrower values of N.
779 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
780 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
782 intptr_t i, opr_sz = simd_oprsz(desc); \
783 for (i = 0; i < opr_sz; ) { \
784 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
785 TYPEW mm = *(TYPEW *)(vm + i); \
786 do { \
787 if (pg & 1) { \
788 TYPE nn = *(TYPE *)(vn + H(i)); \
789 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
791 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
792 } while (i & 7); \
796 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
797 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
798 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
800 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
801 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
802 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
804 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
805 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
806 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
808 #undef DO_ZPZW
810 /* Fully general two-operand expander, controlled by a predicate.
812 #define DO_ZPZ(NAME, TYPE, H, OP) \
813 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
815 intptr_t i, opr_sz = simd_oprsz(desc); \
816 for (i = 0; i < opr_sz; ) { \
817 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
818 do { \
819 if (pg & 1) { \
820 TYPE nn = *(TYPE *)(vn + H(i)); \
821 *(TYPE *)(vd + H(i)) = OP(nn); \
823 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
824 } while (i & 15); \
828 /* Similarly, specialized for 64-bit operands. */
829 #define DO_ZPZ_D(NAME, TYPE, OP) \
830 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
832 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
833 TYPE *d = vd, *n = vn; \
834 uint8_t *pg = vg; \
835 for (i = 0; i < opr_sz; i += 1) { \
836 if (pg[H1(i)] & 1) { \
837 TYPE nn = n[i]; \
838 d[i] = OP(nn); \
843 #define DO_CLS_B(N) (clrsb32(N) - 24)
844 #define DO_CLS_H(N) (clrsb32(N) - 16)
846 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
847 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
848 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
849 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
851 #define DO_CLZ_B(N) (clz32(N) - 24)
852 #define DO_CLZ_H(N) (clz32(N) - 16)
854 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
855 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
856 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
857 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
859 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
860 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
861 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
862 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
864 #define DO_CNOT(N) (N == 0)
866 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
867 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
868 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
869 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
871 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
873 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
874 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
875 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
877 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
879 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
880 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
881 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
883 #define DO_NOT(N) (~N)
885 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
886 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
887 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
888 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
890 #define DO_SXTB(N) ((int8_t)N)
891 #define DO_SXTH(N) ((int16_t)N)
892 #define DO_SXTS(N) ((int32_t)N)
893 #define DO_UXTB(N) ((uint8_t)N)
894 #define DO_UXTH(N) ((uint16_t)N)
895 #define DO_UXTS(N) ((uint32_t)N)
897 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
898 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
899 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
900 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
901 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
902 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
904 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
905 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
906 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
907 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
908 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
909 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
911 #define DO_ABS(N) (N < 0 ? -N : N)
913 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
914 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
915 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
916 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
918 #define DO_NEG(N) (-N)
920 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
921 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
922 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
923 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
925 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
926 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
927 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
929 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
930 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
932 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
934 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
936 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
937 uint64_t *d = vd, *n = vn;
938 uint8_t *pg = vg;
940 for (i = 0; i < opr_sz; i += 2) {
941 if (pg[H1(i)] & 1) {
942 uint64_t n0 = n[i + 0];
943 uint64_t n1 = n[i + 1];
944 d[i + 0] = n1;
945 d[i + 1] = n0;
950 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
951 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
952 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
953 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
955 #define DO_SQABS(X) \
956 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
957 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
959 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
960 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
961 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
962 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
964 #define DO_SQNEG(X) \
965 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
966 x_ == min_ ? -min_ - 1 : -x_; })
968 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
969 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
970 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
971 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
973 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
974 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
976 /* Three-operand expander, unpredicated, in which the third operand is "wide".
978 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
979 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
981 intptr_t i, opr_sz = simd_oprsz(desc); \
982 for (i = 0; i < opr_sz; ) { \
983 TYPEW mm = *(TYPEW *)(vm + i); \
984 do { \
985 TYPE nn = *(TYPE *)(vn + H(i)); \
986 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
987 i += sizeof(TYPE); \
988 } while (i & 7); \
992 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
993 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
994 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
996 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
997 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
998 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1000 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1001 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1002 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1004 #undef DO_ZZW
1006 #undef DO_CLS_B
1007 #undef DO_CLS_H
1008 #undef DO_CLZ_B
1009 #undef DO_CLZ_H
1010 #undef DO_CNOT
1011 #undef DO_FABS
1012 #undef DO_FNEG
1013 #undef DO_ABS
1014 #undef DO_NEG
1015 #undef DO_ZPZ
1016 #undef DO_ZPZ_D
1019 * Three-operand expander, unpredicated, in which the two inputs are
1020 * selected from the top or bottom half of the wide column.
1022 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1023 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1025 intptr_t i, opr_sz = simd_oprsz(desc); \
1026 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1027 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1028 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1029 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1030 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1031 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1035 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1036 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1037 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1039 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1040 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1041 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1043 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1044 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1045 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1047 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1048 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1049 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1051 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1052 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1053 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1055 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1056 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1057 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1059 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1060 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1061 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1063 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1064 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1065 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1067 /* Note that the multiply cannot overflow, but the doubling can. */
1068 static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1070 int16_t val = n * m;
1071 return DO_SQADD_H(val, val);
1074 static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1076 int32_t val = n * m;
1077 return DO_SQADD_S(val, val);
1080 static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1082 int64_t val = n * m;
1083 return do_sqadd_d(val, val);
1086 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1087 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1088 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1090 #undef DO_ZZZ_TB
1092 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1093 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1095 intptr_t i, opr_sz = simd_oprsz(desc); \
1096 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1097 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1098 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
1099 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1100 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1104 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1105 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1106 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1108 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1109 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1110 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1112 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1113 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1114 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1116 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1117 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1118 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1120 #undef DO_ZZZ_WTB
1122 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
1123 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1125 intptr_t i, opr_sz = simd_oprsz(desc); \
1126 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1127 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1128 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1129 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
1130 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
1131 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
1135 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1136 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1137 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1138 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
1140 #undef DO_ZZZ_NTB
1142 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1143 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1145 intptr_t i, opr_sz = simd_oprsz(desc); \
1146 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
1147 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1148 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1149 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
1150 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1151 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
1155 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1156 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1157 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1159 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1160 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1161 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1163 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1164 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1165 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1167 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1168 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1169 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1171 #define DO_NMUL(N, M) -(N * M)
1173 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1174 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1175 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
1177 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1178 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1179 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
1181 #undef DO_ZZZW_ACC
1183 #define DO_XTNB(NAME, TYPE, OP) \
1184 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1186 intptr_t i, opr_sz = simd_oprsz(desc); \
1187 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1188 TYPE nn = *(TYPE *)(vn + i); \
1189 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
1190 *(TYPE *)(vd + i) = nn; \
1194 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
1195 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1197 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
1198 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1199 TYPE nn = *(TYPE *)(vn + i); \
1200 *(TYPEN *)(vd + i + odd) = OP(nn); \
1204 #define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
1205 #define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
1206 #define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)
1208 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1209 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1210 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1212 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1213 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1214 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1216 #define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
1217 #define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
1218 #define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)
1220 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1221 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1222 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1224 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1225 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1226 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1228 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1229 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1230 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1232 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1233 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1234 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1236 #undef DO_XTNB
1237 #undef DO_XTNT
1239 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1241 intptr_t i, opr_sz = simd_oprsz(desc);
1242 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1243 uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1244 uint32_t *a = va, *n = vn;
1245 uint64_t *d = vd, *m = vm;
1247 for (i = 0; i < opr_sz / 8; ++i) {
1248 uint32_t e1 = a[2 * i + H4(0)];
1249 uint32_t e2 = n[2 * i + sel] ^ inv;
1250 uint64_t c = extract64(m[i], 32, 1);
1251 /* Compute and store the entire 33-bit result at once. */
1252 d[i] = c + e1 + e2;
1256 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1258 intptr_t i, opr_sz = simd_oprsz(desc);
1259 int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1260 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1261 uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1263 for (i = 0; i < opr_sz / 8; i += 2) {
1264 Int128 e1 = int128_make64(a[i]);
1265 Int128 e2 = int128_make64(n[i + sel] ^ inv);
1266 Int128 c = int128_make64(m[i + 1] & 1);
1267 Int128 r = int128_add(int128_add(e1, e2), c);
1268 d[i + 0] = int128_getlo(r);
1269 d[i + 1] = int128_gethi(r);
1273 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1274 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1276 intptr_t i, opr_sz = simd_oprsz(desc); \
1277 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1278 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1279 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1280 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1281 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1282 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1283 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
1287 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1288 do_sqdmull_h, DO_SQADD_H)
1289 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1290 do_sqdmull_s, DO_SQADD_S)
1291 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1292 do_sqdmull_d, do_sqadd_d)
1294 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1295 do_sqdmull_h, DO_SQSUB_H)
1296 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1297 do_sqdmull_s, DO_SQSUB_S)
1298 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1299 do_sqdmull_d, do_sqsub_d)
1301 #undef DO_SQDMLAL
1303 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1304 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1306 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1307 int rot = simd_data(desc); \
1308 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1309 bool sub_r = rot == 1 || rot == 2; \
1310 bool sub_i = rot >= 2; \
1311 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1312 for (i = 0; i < opr_sz; i += 2) { \
1313 TYPE elt1_a = n[H(i + sel_a)]; \
1314 TYPE elt2_a = m[H(i + sel_a)]; \
1315 TYPE elt2_b = m[H(i + sel_b)]; \
1316 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \
1317 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \
1321 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
1323 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1324 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1325 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1326 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1328 #define DO_SQRDMLAH_B(N, M, A, S) \
1329 do_sqrdmlah_b(N, M, A, S, true)
1330 #define DO_SQRDMLAH_H(N, M, A, S) \
1331 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1332 #define DO_SQRDMLAH_S(N, M, A, S) \
1333 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1334 #define DO_SQRDMLAH_D(N, M, A, S) \
1335 do_sqrdmlah_d(N, M, A, S, true)
1337 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1338 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1339 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1340 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1342 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1343 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1345 intptr_t i, j, oprsz = simd_oprsz(desc); \
1346 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \
1347 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \
1348 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1349 bool sub_r = rot == 1 || rot == 2; \
1350 bool sub_i = rot >= 2; \
1351 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1352 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \
1353 TYPE elt2_a = m[H(i + idx + sel_a)]; \
1354 TYPE elt2_b = m[H(i + idx + sel_b)]; \
1355 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \
1356 TYPE elt1_a = n[H(i + j + sel_a)]; \
1357 d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \
1358 d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \
1363 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1364 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1366 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1367 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1369 #undef DO_CMLA
1370 #undef DO_CMLA_FUNC
1371 #undef DO_CMLA_IDX_FUNC
1372 #undef DO_SQRDMLAH_B
1373 #undef DO_SQRDMLAH_H
1374 #undef DO_SQRDMLAH_S
1375 #undef DO_SQRDMLAH_D
1377 /* Note N and M are 4 elements bundled into one unit. */
1378 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1379 int sel_a, int sel_b, int sub_i)
1381 for (int i = 0; i <= 1; i++) {
1382 int32_t elt1_r = (int8_t)(n >> (16 * i));
1383 int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1384 int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1385 int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1387 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1389 return a;
1392 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1393 int sel_a, int sel_b, int sub_i)
1395 for (int i = 0; i <= 1; i++) {
1396 int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1397 int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1398 int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1399 int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1401 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1403 return a;
1406 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1407 void *va, uint32_t desc)
1409 int opr_sz = simd_oprsz(desc);
1410 int rot = simd_data(desc);
1411 int sel_a = rot & 1;
1412 int sel_b = sel_a ^ 1;
1413 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1414 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1416 for (int e = 0; e < opr_sz / 4; e++) {
1417 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1421 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1422 void *va, uint32_t desc)
1424 int opr_sz = simd_oprsz(desc);
1425 int rot = simd_data(desc);
1426 int sel_a = rot & 1;
1427 int sel_b = sel_a ^ 1;
1428 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1429 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1431 for (int e = 0; e < opr_sz / 8; e++) {
1432 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1436 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1437 void *va, uint32_t desc)
1439 int opr_sz = simd_oprsz(desc);
1440 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1441 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1442 int sel_a = rot & 1;
1443 int sel_b = sel_a ^ 1;
1444 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1445 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1447 for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1448 uint32_t seg_m = m[seg + idx];
1449 for (int e = 0; e < 4; e++) {
1450 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1451 sel_a, sel_b, sub_i);
1456 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1457 void *va, uint32_t desc)
1459 int seg, opr_sz = simd_oprsz(desc);
1460 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1461 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1462 int sel_a = rot & 1;
1463 int sel_b = sel_a ^ 1;
1464 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1465 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1467 for (seg = 0; seg < opr_sz / 8; seg += 2) {
1468 uint64_t seg_m = m[seg + idx];
1469 for (int e = 0; e < 2; e++) {
1470 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1471 sel_a, sel_b, sub_i);
1476 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1477 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1479 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
1480 intptr_t i, j, idx = simd_data(desc); \
1481 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \
1482 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1483 TYPE mm = m[i]; \
1484 for (j = 0; j < segment; j++) { \
1485 d[i + j] = OP(n[i + j], mm, a[i + j]); \
1490 #define DO_SQRDMLAH_H(N, M, A) \
1491 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1492 #define DO_SQRDMLAH_S(N, M, A) \
1493 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1494 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1496 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1497 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1498 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1500 #define DO_SQRDMLSH_H(N, M, A) \
1501 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1502 #define DO_SQRDMLSH_S(N, M, A) \
1503 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1504 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1506 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1507 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1508 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1510 #undef DO_ZZXZ
1512 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1513 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1515 intptr_t i, j, oprsz = simd_oprsz(desc); \
1516 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1517 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1518 for (i = 0; i < oprsz; i += 16) { \
1519 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1520 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1521 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1522 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \
1523 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \
1528 #define DO_MLA(N, M, A) (A + N * M)
1530 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1531 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1532 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1533 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1535 #define DO_MLS(N, M, A) (A - N * M)
1537 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1538 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1539 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1540 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1542 #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M))
1543 #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M))
1545 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1546 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1548 #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M))
1549 #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M))
1551 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1552 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1554 #undef DO_MLA
1555 #undef DO_MLS
1556 #undef DO_ZZXW
1558 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1559 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1561 intptr_t i, j, oprsz = simd_oprsz(desc); \
1562 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1563 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1564 for (i = 0; i < oprsz; i += 16) { \
1565 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1566 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1567 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1568 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \
1573 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1574 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1576 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1577 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1579 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1580 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1582 #undef DO_ZZX
1584 #define DO_BITPERM(NAME, TYPE, OP) \
1585 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1587 intptr_t i, opr_sz = simd_oprsz(desc); \
1588 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1589 TYPE nn = *(TYPE *)(vn + i); \
1590 TYPE mm = *(TYPE *)(vm + i); \
1591 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
1595 static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1597 uint64_t res = 0;
1598 int db, rb = 0;
1600 for (db = 0; db < n; ++db) {
1601 if ((mask >> db) & 1) {
1602 res |= ((data >> db) & 1) << rb;
1603 ++rb;
1606 return res;
1609 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1610 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1611 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1612 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1614 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1616 uint64_t res = 0;
1617 int rb, db = 0;
1619 for (rb = 0; rb < n; ++rb) {
1620 if ((mask >> rb) & 1) {
1621 res |= ((data >> db) & 1) << rb;
1622 ++db;
1625 return res;
1628 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1629 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1630 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1631 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1633 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1635 uint64_t resm = 0, resu = 0;
1636 int db, rbm = 0, rbu = 0;
1638 for (db = 0; db < n; ++db) {
1639 uint64_t val = (data >> db) & 1;
1640 if ((mask >> db) & 1) {
1641 resm |= val << rbm++;
1642 } else {
1643 resu |= val << rbu++;
1647 return resm | (resu << rbm);
1650 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1651 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1652 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1653 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
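/* Worked example for the byte forms, with mask == 0x33 (bits 0,1,4,5):
 * bitextract(0xf0, 0x33, 8) == 0x0c, bitdeposit(0x0c, 0x33, 8) == 0x30,
 * and bitgroup(0xf0, 0x33, 8) == 0xcc (bits selected by the mask are
 * packed at the bottom, the remaining bits directly above them).
 */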
1655 #undef DO_BITPERM
1657 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \
1658 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1660 intptr_t i, opr_sz = simd_oprsz(desc); \
1661 int sub_r = simd_data(desc); \
1662 if (sub_r) { \
1663 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1664 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1665 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1666 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1667 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1668 acc_r = ADD_OP(acc_r, el2_i); \
1669 acc_i = SUB_OP(acc_i, el2_r); \
1670 *(TYPE *)(vd + H(i)) = acc_r; \
1671 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1673 } else { \
1674 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1675 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1676 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1677 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1678 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1679 acc_r = SUB_OP(acc_r, el2_i); \
1680 acc_i = ADD_OP(acc_i, el2_r); \
1681 *(TYPE *)(vd + H(i)) = acc_r; \
1682 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1687 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1688 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1689 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1690 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1692 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1693 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1694 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1695 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1697 #undef DO_CADD
1699 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1700 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1702 intptr_t i, opr_sz = simd_oprsz(desc); \
1703 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
1704 int shift = simd_data(desc) >> 1; \
1705 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1706 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
1707 *(TYPEW *)(vd + HW(i)) = nn << shift; \
1711 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1712 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1713 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1715 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1716 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1717 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
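/* In DO_ZZI_SHLL above, the descriptor packs the bottom/top selector in
 * bit 0 and the shift count in the remaining bits: e.g. simd_data(desc)
 * == 5 gives sel == 1 (take the odd-numbered narrow elements) and
 * shift == 2.
 */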
1719 #undef DO_ZZI_SHLL
1721 /* Two-operand reduction expander, controlled by a predicate.
1722 * The difference between TYPERED and TYPERET has to do with
1723 * sign-extension. E.g. for SMAX, TYPERED must be signed,
1724 * but TYPERET must be unsigned so that e.g. a 32-bit value
1725 * is not sign-extended to the ABI uint64_t return type.
1726 */
1727 /* ??? If we were to vectorize this by hand the reduction ordering
1728 * would change. For integer operands, this is perfectly fine.
1729 */
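/* For example, in sve_smaxv_s below TYPERED is int32_t, so the MAX is
 * evaluated as a signed comparison, while TYPERET is uint32_t, so a
 * result of -1 reaches the uint64_t ABI return value as
 * 0x00000000ffffffffull rather than 0xffffffffffffffffull.
 */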
1730 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1731 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1733 intptr_t i, opr_sz = simd_oprsz(desc); \
1734 TYPERED ret = INIT; \
1735 for (i = 0; i < opr_sz; ) { \
1736 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1737 do { \
1738 if (pg & 1) { \
1739 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
1740 ret = OP(ret, nn); \
1742 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
1743 } while (i & 15); \
1745 return (TYPERET)ret; \
1748 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
1749 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1751 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1752 TYPEE *n = vn; \
1753 uint8_t *pg = vg; \
1754 TYPER ret = INIT; \
1755 for (i = 0; i < opr_sz; i += 1) { \
1756 if (pg[H1(i)] & 1) { \
1757 TYPEE nn = n[i]; \
1758 ret = OP(ret, nn); \
1761 return ret; \
1764 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1765 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1766 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1767 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1769 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1770 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1771 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1772 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1774 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1775 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1776 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1777 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1779 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1780 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1781 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1783 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1784 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1785 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1786 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1788 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1789 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1790 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1791 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1793 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1794 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1795 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1796 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1798 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1799 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1800 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1801 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1803 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1804 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1805 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1806 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1808 #undef DO_VPZ
1809 #undef DO_VPZ_D
1811 /* Two vector operand, one scalar operand, unpredicated. */
1812 #define DO_ZZI(NAME, TYPE, OP) \
1813 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
1815 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1816 TYPE s = s64, *d = vd, *n = vn; \
1817 for (i = 0; i < opr_sz; ++i) { \
1818 d[i] = OP(n[i], s); \
1822 #define DO_SUBR(X, Y) (Y - X)
1824 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1825 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1826 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1827 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1829 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1830 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1831 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1832 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1834 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1835 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1836 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1837 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1839 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1840 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1841 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1842 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1844 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1845 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1846 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1847 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1849 #undef DO_ZZI
1851 #undef DO_AND
1852 #undef DO_ORR
1853 #undef DO_EOR
1854 #undef DO_BIC
1855 #undef DO_ADD
1856 #undef DO_SUB
1857 #undef DO_MAX
1858 #undef DO_MIN
1859 #undef DO_ABD
1860 #undef DO_MUL
1861 #undef DO_DIV
1862 #undef DO_ASR
1863 #undef DO_LSR
1864 #undef DO_LSL
1865 #undef DO_SUBR
1867 /* Similar to the ARM LastActiveElement pseudocode function, except the
1868 result is multiplied by the element size. This includes the not found
1869 indication; e.g. not found for esz=3 is -8. */
1870 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1872 uint64_t mask = pred_esz_masks[esz];
1873 intptr_t i = words;
1875 do {
1876 uint64_t this_g = g[--i] & mask;
1877 if (this_g) {
1878 return i * 64 + (63 - clz64(this_g));
1880 } while (i > 0);
1881 return (intptr_t)-1 << esz;
1884 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1886 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1887 uint32_t flags = PREDTEST_INIT;
1888 uint64_t *d = vd, *g = vg;
1889 intptr_t i = 0;
1891 do {
1892 uint64_t this_d = d[i];
1893 uint64_t this_g = g[i];
1895 if (this_g) {
1896 if (!(flags & 4)) {
1897 /* Set in D the first bit of G. */
1898 this_d |= this_g & -this_g;
1899 d[i] = this_d;
1901 flags = iter_predtest_fwd(this_d, this_g, flags);
1903 } while (++i < words);
1905 return flags;
1908 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1910 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1911 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1912 uint32_t flags = PREDTEST_INIT;
1913 uint64_t *d = vd, *g = vg, esz_mask;
1914 intptr_t i, next;
1916 next = last_active_element(vd, words, esz) + (1 << esz);
1917 esz_mask = pred_esz_masks[esz];
1919 /* Similar to the pseudocode for pnext, but scaled by ESZ
1920 so that we find the correct bit. */
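/* For example, with esz == 2 each element occupies 4 predicate bits,
 * so if the last active element began at bit 8, next == 12 and the
 * loop below scans forward from there.
 */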
1921 if (next < words * 64) {
1922 uint64_t mask = -1;
1924 if (next & 63) {
1925 mask = ~((1ull << (next & 63)) - 1);
1926 next &= -64;
1928 do {
1929 uint64_t this_g = g[next / 64] & esz_mask & mask;
1930 if (this_g != 0) {
1931 next = (next & -64) + ctz64(this_g);
1932 break;
1934 next += 64;
1935 mask = -1;
1936 } while (next < words * 64);
1939 i = 0;
1940 do {
1941 uint64_t this_d = 0;
1942 if (i == next / 64) {
1943 this_d = 1ull << (next & 63);
1945 d[i] = this_d;
1946 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1947 } while (++i < words);
1949 return flags;
1952 /*
1953 * Copy Zn into Zd, and store zero into inactive elements.
1954 * If inv, store zeros into the active elements.
1955 */
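/* For example, with inv == 0 and pg[0] == 0x01, expand_pred_b(0x01) is
 * 0x00000000000000ffull, so only the low byte of n[0] is kept; with
 * inv == 1 the mask is complemented and exactly that byte is zeroed.
 */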
1956 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1958 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1959 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1960 uint64_t *d = vd, *n = vn;
1961 uint8_t *pg = vg;
1963 for (i = 0; i < opr_sz; i += 1) {
1964 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1968 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1970 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1971 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1972 uint64_t *d = vd, *n = vn;
1973 uint8_t *pg = vg;
1975 for (i = 0; i < opr_sz; i += 1) {
1976 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
1980 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1982 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1983 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1984 uint64_t *d = vd, *n = vn;
1985 uint8_t *pg = vg;
1987 for (i = 0; i < opr_sz; i += 1) {
1988 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
1992 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1994 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1995 uint64_t *d = vd, *n = vn;
1996 uint8_t *pg = vg;
1997 uint8_t inv = simd_data(desc);
1999 for (i = 0; i < opr_sz; i += 1) {
2000 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2004 /* Three-operand expander, immediate operand, controlled by a predicate.
2006 #define DO_ZPZI(NAME, TYPE, H, OP) \
2007 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2009 intptr_t i, opr_sz = simd_oprsz(desc); \
2010 TYPE imm = simd_data(desc); \
2011 for (i = 0; i < opr_sz; ) { \
2012 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2013 do { \
2014 if (pg & 1) { \
2015 TYPE nn = *(TYPE *)(vn + H(i)); \
2016 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
2018 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2019 } while (i & 15); \
2023 /* Similarly, specialized for 64-bit operands. */
2024 #define DO_ZPZI_D(NAME, TYPE, OP) \
2025 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2027 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2028 TYPE *d = vd, *n = vn; \
2029 TYPE imm = simd_data(desc); \
2030 uint8_t *pg = vg; \
2031 for (i = 0; i < opr_sz; i += 1) { \
2032 if (pg[H1(i)] & 1) { \
2033 TYPE nn = n[i]; \
2034 d[i] = OP(nn, imm); \
2039 #define DO_SHR(N, M) (N >> M)
2040 #define DO_SHL(N, M) (N << M)
2042 /* Arithmetic shift right for division. This rounds negative numbers
2043 toward zero as per signed division. Therefore before shifting,
2044 when N is negative, add 2**M-1. */
2045 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
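/* Worked example: for N == -7, M == 2 a plain shift gives -7 >> 2 == -2
 * (rounding toward minus infinity), while the biased form computes
 * (-7 + 3) >> 2 == -1, matching C signed division -7 / 4 == -1.
 */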
2047 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
2049 if (likely(sh < 64)) {
2050 return (x >> sh) + ((x >> (sh - 1)) & 1);
2051 } else if (sh == 64) {
2052 return x >> 63;
2053 } else {
2054 return 0;
2055 }
2056 }
2058 static inline int64_t do_srshr(int64_t x, unsigned sh)
2059 {
2060 if (likely(sh < 64)) {
2061 return (x >> sh) + ((x >> (sh - 1)) & 1);
2062 } else {
2063 /* Rounding the sign bit always produces 0. */
2064 return 0;
2065 }
2066 }
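/* Worked example: do_urshr(23, 3) == (23 >> 3) + ((23 >> 2) & 1)
 * == 2 + 1 == 3, i.e. 23 / 8 rounded to the nearest integer;
 * do_srshr applies the same rounding to signed values, e.g.
 * do_srshr(-23, 3) == -3 + 0 == -3.
 */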
2068 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2069 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2070 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2071 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2073 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2074 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2075 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2076 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2078 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2079 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2080 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2081 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2083 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2084 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2085 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2086 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2088 /* SVE2 bitwise shift by immediate */
2089 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2090 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2091 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2092 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2094 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2095 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2096 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2097 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2099 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2100 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2101 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2102 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2104 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2105 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2106 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2107 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2109 #define do_suqrshl_b(n, m) \
2110 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2111 #define do_suqrshl_h(n, m) \
2112 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2113 #define do_suqrshl_s(n, m) \
2114 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2115 #define do_suqrshl_d(n, m) \
2116 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2118 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2119 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2120 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2121 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2123 #undef DO_ASRD
2124 #undef DO_ZPZI
2125 #undef DO_ZPZI_D
2127 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2128 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2130 intptr_t i, opr_sz = simd_oprsz(desc); \
2131 int shift = simd_data(desc); \
2132 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2133 TYPEW nn = *(TYPEW *)(vn + i); \
2134 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \
2138 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
2139 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2141 intptr_t i, opr_sz = simd_oprsz(desc); \
2142 int shift = simd_data(desc); \
2143 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2144 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2145 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \
2149 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2150 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2151 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2153 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2154 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2155 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
2157 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2158 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2159 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2161 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2162 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2163 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2165 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2166 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2167 #define DO_SQSHRUN_D(x, sh) \
2168 do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2170 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2171 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2172 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2174 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2175 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2176 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2178 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2179 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2180 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2182 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2183 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2184 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2186 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2187 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2188 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2190 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2191 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2192 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2194 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2195 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2196 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2198 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2199 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2200 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2202 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2203 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2204 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2206 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2207 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2208 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2210 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2211 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2212 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2214 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2215 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2216 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2218 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2219 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2220 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2222 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2223 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2224 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2226 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2227 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2228 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2230 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2231 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2232 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2234 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2235 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2236 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2238 #undef DO_SHRNB
2239 #undef DO_SHRNT
2241 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \
2242 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2244 intptr_t i, opr_sz = simd_oprsz(desc); \
2245 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2246 TYPEW nn = *(TYPEW *)(vn + i); \
2247 TYPEW mm = *(TYPEW *)(vm + i); \
2248 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \
2252 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \
2253 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2255 intptr_t i, opr_sz = simd_oprsz(desc); \
2256 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2257 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2258 TYPEW mm = *(TYPEW *)(vm + HW(i)); \
2259 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \
2263 #define DO_ADDHN(N, M, SH) ((N + M) >> SH)
2264 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2265 #define DO_SUBHN(N, M, SH) ((N - M) >> SH)
2266 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
2268 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2269 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2270 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2272 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2273 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2274 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2276 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2277 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2278 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2280 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2281 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2282 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2284 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2285 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2286 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2288 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2289 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2290 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2292 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2293 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2294 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2296 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2297 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2298 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2300 #undef DO_RSUBHN
2301 #undef DO_SUBHN
2302 #undef DO_RADDHN
2303 #undef DO_ADDHN
2305 #undef DO_BINOPNB
2307 /* Fully general four-operand expander, controlled by a predicate.
2309 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
2310 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2311 void *vg, uint32_t desc) \
2313 intptr_t i, opr_sz = simd_oprsz(desc); \
2314 for (i = 0; i < opr_sz; ) { \
2315 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2316 do { \
2317 if (pg & 1) { \
2318 TYPE nn = *(TYPE *)(vn + H(i)); \
2319 TYPE mm = *(TYPE *)(vm + H(i)); \
2320 TYPE aa = *(TYPE *)(va + H(i)); \
2321 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
2323 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2324 } while (i & 15); \
2328 /* Similarly, specialized for 64-bit operands. */
2329 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
2330 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2331 void *vg, uint32_t desc) \
2333 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2334 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
2335 uint8_t *pg = vg; \
2336 for (i = 0; i < opr_sz; i += 1) { \
2337 if (pg[H1(i)] & 1) { \
2338 TYPE aa = a[i], nn = n[i], mm = m[i]; \
2339 d[i] = OP(aa, nn, mm); \
2344 #define DO_MLA(A, N, M) (A + N * M)
2345 #define DO_MLS(A, N, M) (A - N * M)
2347 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2348 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2350 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2351 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2353 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2354 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2356 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2357 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2359 #undef DO_MLA
2360 #undef DO_MLS
2361 #undef DO_ZPZZZ
2362 #undef DO_ZPZZZ_D
2364 void HELPER(sve_index_b)(void *vd, uint32_t start,
2365 uint32_t incr, uint32_t desc)
2367 intptr_t i, opr_sz = simd_oprsz(desc);
2368 uint8_t *d = vd;
2369 for (i = 0; i < opr_sz; i += 1) {
2370 d[H1(i)] = start + i * incr;
2374 void HELPER(sve_index_h)(void *vd, uint32_t start,
2375 uint32_t incr, uint32_t desc)
2377 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2378 uint16_t *d = vd;
2379 for (i = 0; i < opr_sz; i += 1) {
2380 d[H2(i)] = start + i * incr;
2384 void HELPER(sve_index_s)(void *vd, uint32_t start,
2385 uint32_t incr, uint32_t desc)
2387 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2388 uint32_t *d = vd;
2389 for (i = 0; i < opr_sz; i += 1) {
2390 d[H4(i)] = start + i * incr;
2394 void HELPER(sve_index_d)(void *vd, uint64_t start,
2395 uint64_t incr, uint32_t desc)
2397 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2398 uint64_t *d = vd;
2399 for (i = 0; i < opr_sz; i += 1) {
2400 d[i] = start + i * incr;
2404 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2406 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2407 uint32_t sh = simd_data(desc);
2408 uint32_t *d = vd, *n = vn, *m = vm;
2409 for (i = 0; i < opr_sz; i += 1) {
2410 d[i] = n[i] + (m[i] << sh);
2414 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2416 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2417 uint64_t sh = simd_data(desc);
2418 uint64_t *d = vd, *n = vn, *m = vm;
2419 for (i = 0; i < opr_sz; i += 1) {
2420 d[i] = n[i] + (m[i] << sh);
2424 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2426 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2427 uint64_t sh = simd_data(desc);
2428 uint64_t *d = vd, *n = vn, *m = vm;
2429 for (i = 0; i < opr_sz; i += 1) {
2430 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2434 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2436 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2437 uint64_t sh = simd_data(desc);
2438 uint64_t *d = vd, *n = vn, *m = vm;
2439 for (i = 0; i < opr_sz; i += 1) {
2440 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2444 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2446 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2447 static const uint16_t coeff[] = {
2448 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2449 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2450 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2451 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2453 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2454 uint16_t *d = vd, *n = vn;
2456 for (i = 0; i < opr_sz; i++) {
2457 uint16_t nn = n[i];
2458 intptr_t idx = extract32(nn, 0, 5);
2459 uint16_t exp = extract32(nn, 5, 5);
2460 d[i] = coeff[idx] | (exp << 10);
2464 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2466 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2467 static const uint32_t coeff[] = {
2468 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2469 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2470 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2471 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2472 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2473 0x1ef532, 0x20b051, 0x227043, 0x243516,
2474 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2475 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2476 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2477 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2478 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2479 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2480 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2481 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2482 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2483 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2485 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2486 uint32_t *d = vd, *n = vn;
2488 for (i = 0; i < opr_sz; i++) {
2489 uint32_t nn = n[i];
2490 intptr_t idx = extract32(nn, 0, 6);
2491 uint32_t exp = extract32(nn, 6, 8);
2492 d[i] = coeff[idx] | (exp << 23);
2496 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2498 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2499 static const uint64_t coeff[] = {
2500 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2501 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2502 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2503 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2504 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2505 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2506 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2507 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2508 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2509 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2510 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2511 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2512 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2513 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2514 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2515 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2516 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2517 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2518 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2519 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2520 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2521 0xFA7C1819E90D8ull,
2523 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2524 uint64_t *d = vd, *n = vn;
2526 for (i = 0; i < opr_sz; i++) {
2527 uint64_t nn = n[i];
2528 intptr_t idx = extract32(nn, 0, 6);
2529 uint64_t exp = extract32(nn, 6, 11);
2530 d[i] = coeff[idx] | (exp << 52);
2534 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2536 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2537 uint16_t *d = vd, *n = vn, *m = vm;
2538 for (i = 0; i < opr_sz; i += 1) {
2539 uint16_t nn = n[i];
2540 uint16_t mm = m[i];
2541 if (mm & 1) {
2542 nn = float16_one;
2544 d[i] = nn ^ (mm & 2) << 14;
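/* (mm & 2) << 14 is 0x8000 when bit 1 of mm is set, so the statement
 * above flips the float16 sign bit for those elements. */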
2548 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2550 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2551 uint32_t *d = vd, *n = vn, *m = vm;
2552 for (i = 0; i < opr_sz; i += 1) {
2553 uint32_t nn = n[i];
2554 uint32_t mm = m[i];
2555 if (mm & 1) {
2556 nn = float32_one;
2558 d[i] = nn ^ (mm & 2) << 30;
2562 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2564 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2565 uint64_t *d = vd, *n = vn, *m = vm;
2566 for (i = 0; i < opr_sz; i += 1) {
2567 uint64_t nn = n[i];
2568 uint64_t mm = m[i];
2569 if (mm & 1) {
2570 nn = float64_one;
2572 d[i] = nn ^ (mm & 2) << 62;
2577 * Signed saturating addition with scalar operand.
2580 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2582 intptr_t i, oprsz = simd_oprsz(desc);
2584 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2585 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2589 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2591 intptr_t i, oprsz = simd_oprsz(desc);
2593 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2594 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2598 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2600 intptr_t i, oprsz = simd_oprsz(desc);
2602 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2603 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2607 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2609 intptr_t i, oprsz = simd_oprsz(desc);
2611 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2612 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2617 * Unsigned saturating addition with scalar operand.
2620 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2622 intptr_t i, oprsz = simd_oprsz(desc);
2624 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2625 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2629 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2631 intptr_t i, oprsz = simd_oprsz(desc);
2633 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2634 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2638 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2640 intptr_t i, oprsz = simd_oprsz(desc);
2642 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2643 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2647 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2649 intptr_t i, oprsz = simd_oprsz(desc);
2651 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2652 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2656 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2658 intptr_t i, oprsz = simd_oprsz(desc);
2660 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2661 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2665 /* Two operand predicated copy immediate with merge. All valid immediates
2666 * can fit within 17 signed bits in the simd_data field.
2667 */
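/* For example, dup_const(MO_8, 0x3e) replicates the immediate to
 * 0x3e3e3e3e3e3e3e3eull, so every active byte lane below receives the
 * same value while inactive lanes keep their bytes from Zn.
 */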
2668 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2669 uint64_t mm, uint32_t desc)
2671 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2672 uint64_t *d = vd, *n = vn;
2673 uint8_t *pg = vg;
2675 mm = dup_const(MO_8, mm);
2676 for (i = 0; i < opr_sz; i += 1) {
2677 uint64_t nn = n[i];
2678 uint64_t pp = expand_pred_b(pg[H1(i)]);
2679 d[i] = (mm & pp) | (nn & ~pp);
2683 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2684 uint64_t mm, uint32_t desc)
2686 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2687 uint64_t *d = vd, *n = vn;
2688 uint8_t *pg = vg;
2690 mm = dup_const(MO_16, mm);
2691 for (i = 0; i < opr_sz; i += 1) {
2692 uint64_t nn = n[i];
2693 uint64_t pp = expand_pred_h(pg[H1(i)]);
2694 d[i] = (mm & pp) | (nn & ~pp);
2698 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2699 uint64_t mm, uint32_t desc)
2701 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2702 uint64_t *d = vd, *n = vn;
2703 uint8_t *pg = vg;
2705 mm = dup_const(MO_32, mm);
2706 for (i = 0; i < opr_sz; i += 1) {
2707 uint64_t nn = n[i];
2708 uint64_t pp = expand_pred_s(pg[H1(i)]);
2709 d[i] = (mm & pp) | (nn & ~pp);
2713 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2714 uint64_t mm, uint32_t desc)
2716 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2717 uint64_t *d = vd, *n = vn;
2718 uint8_t *pg = vg;
2720 for (i = 0; i < opr_sz; i += 1) {
2721 uint64_t nn = n[i];
2722 d[i] = (pg[H1(i)] & 1 ? mm : nn);
2726 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2728 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2729 uint64_t *d = vd;
2730 uint8_t *pg = vg;
2732 val = dup_const(MO_8, val);
2733 for (i = 0; i < opr_sz; i += 1) {
2734 d[i] = val & expand_pred_b(pg[H1(i)]);
2738 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2740 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2741 uint64_t *d = vd;
2742 uint8_t *pg = vg;
2744 val = dup_const(MO_16, val);
2745 for (i = 0; i < opr_sz; i += 1) {
2746 d[i] = val & expand_pred_h(pg[H1(i)]);
2750 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2752 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2753 uint64_t *d = vd;
2754 uint8_t *pg = vg;
2756 val = dup_const(MO_32, val);
2757 for (i = 0; i < opr_sz; i += 1) {
2758 d[i] = val & expand_pred_s(pg[H1(i)]);
2762 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2764 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2765 uint64_t *d = vd;
2766 uint8_t *pg = vg;
2768 for (i = 0; i < opr_sz; i += 1) {
2769 d[i] = (pg[H1(i)] & 1 ? val : 0);
2773 /* Big-endian hosts need to frob the byte indices. If the copy
2774 * happens to be 8-byte aligned, then no frobbing is necessary.
2775 */
2776 static void swap_memmove(void *vd, void *vs, size_t n)
2778 uintptr_t d = (uintptr_t)vd;
2779 uintptr_t s = (uintptr_t)vs;
2780 uintptr_t o = (d | s | n) & 7;
2781 size_t i;
2783 #if !HOST_BIG_ENDIAN
2784 o = 0;
2785 #endif
2786 switch (o) {
2787 case 0:
2788 memmove(vd, vs, n);
2789 break;
2791 case 4:
2792 if (d < s || d >= s + n) {
2793 for (i = 0; i < n; i += 4) {
2794 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2796 } else {
2797 for (i = n; i > 0; ) {
2798 i -= 4;
2799 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2802 break;
2804 case 2:
2805 case 6:
2806 if (d < s || d >= s + n) {
2807 for (i = 0; i < n; i += 2) {
2808 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2810 } else {
2811 for (i = n; i > 0; ) {
2812 i -= 2;
2813 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2816 break;
2818 default:
2819 if (d < s || d >= s + n) {
2820 for (i = 0; i < n; i++) {
2821 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2823 } else {
2824 for (i = n; i > 0; ) {
2825 i -= 1;
2826 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2829 break;
2833 /* Similarly for memset of 0. */
2834 static void swap_memzero(void *vd, size_t n)
2836 uintptr_t d = (uintptr_t)vd;
2837 uintptr_t o = (d | n) & 7;
2838 size_t i;
2840 /* Usually, the first bit of a predicate is set, so N is 0. */
2841 if (likely(n == 0)) {
2842 return;
2845 #if !HOST_BIG_ENDIAN
2846 o = 0;
2847 #endif
2848 switch (o) {
2849 case 0:
2850 memset(vd, 0, n);
2851 break;
2853 case 4:
2854 for (i = 0; i < n; i += 4) {
2855 *(uint32_t *)H1_4(d + i) = 0;
2857 break;
2859 case 2:
2860 case 6:
2861 for (i = 0; i < n; i += 2) {
2862 *(uint16_t *)H1_2(d + i) = 0;
2864 break;
2866 default:
2867 for (i = 0; i < n; i++) {
2868 *(uint8_t *)H1(d + i) = 0;
2870 break;
2874 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2876 intptr_t opr_sz = simd_oprsz(desc);
2877 size_t n_ofs = simd_data(desc);
2878 size_t n_siz = opr_sz - n_ofs;
2880 if (vd != vm) {
2881 swap_memmove(vd, vn + n_ofs, n_siz);
2882 swap_memmove(vd + n_siz, vm, n_ofs);
2883 } else if (vd != vn) {
2884 swap_memmove(vd + n_siz, vd, n_ofs);
2885 swap_memmove(vd, vn + n_ofs, n_siz);
2886 } else {
2887 /* vd == vn == vm. Need temp space. */
2888 ARMVectorReg tmp;
2889 swap_memmove(&tmp, vm, n_ofs);
2890 swap_memmove(vd, vd + n_ofs, n_siz);
2891 memcpy(vd + n_siz, &tmp, n_ofs);
2892 }
2893 }
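/* Worked example: with opr_sz == 32 and n_ofs == 5 the result is bytes
 * 5..31 of Zn followed by bytes 0..4 of Zm, i.e. the two vectors
 * concatenated and shifted down by the immediate byte offset.
 */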
2895 #define DO_INSR(NAME, TYPE, H) \
2896 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2898 intptr_t opr_sz = simd_oprsz(desc); \
2899 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
2900 *(TYPE *)(vd + H(0)) = val; \
2903 DO_INSR(sve_insr_b, uint8_t, H1)
2904 DO_INSR(sve_insr_h, uint16_t, H1_2)
2905 DO_INSR(sve_insr_s, uint32_t, H1_4)
2906 DO_INSR(sve_insr_d, uint64_t, H1_8)
2908 #undef DO_INSR
2910 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2912 intptr_t i, j, opr_sz = simd_oprsz(desc);
2913 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2914 uint64_t f = *(uint64_t *)(vn + i);
2915 uint64_t b = *(uint64_t *)(vn + j);
2916 *(uint64_t *)(vd + i) = bswap64(b);
2917 *(uint64_t *)(vd + j) = bswap64(f);
2921 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2923 intptr_t i, j, opr_sz = simd_oprsz(desc);
2924 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2925 uint64_t f = *(uint64_t *)(vn + i);
2926 uint64_t b = *(uint64_t *)(vn + j);
2927 *(uint64_t *)(vd + i) = hswap64(b);
2928 *(uint64_t *)(vd + j) = hswap64(f);
2932 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2934 intptr_t i, j, opr_sz = simd_oprsz(desc);
2935 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2936 uint64_t f = *(uint64_t *)(vn + i);
2937 uint64_t b = *(uint64_t *)(vn + j);
2938 *(uint64_t *)(vd + i) = rol64(b, 32);
2939 *(uint64_t *)(vd + j) = rol64(f, 32);
2943 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2945 intptr_t i, j, opr_sz = simd_oprsz(desc);
2946 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2947 uint64_t f = *(uint64_t *)(vn + i);
2948 uint64_t b = *(uint64_t *)(vn + j);
2949 *(uint64_t *)(vd + i) = b;
2950 *(uint64_t *)(vd + j) = f;
2954 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2956 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2957 bool is_tbx, tb_impl_fn *fn)
2959 ARMVectorReg scratch;
2960 uintptr_t oprsz = simd_oprsz(desc);
2962 if (unlikely(vd == vn)) {
2963 vn = memcpy(&scratch, vn, oprsz);
2966 fn(vd, vn, NULL, vm, oprsz, is_tbx);
2969 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
2970 uint32_t desc, bool is_tbx, tb_impl_fn *fn)
2972 ARMVectorReg scratch;
2973 uintptr_t oprsz = simd_oprsz(desc);
2975 if (unlikely(vd == vn0)) {
2976 vn0 = memcpy(&scratch, vn0, oprsz);
2977 if (vd == vn1) {
2978 vn1 = vn0;
2980 } else if (unlikely(vd == vn1)) {
2981 vn1 = memcpy(&scratch, vn1, oprsz);
2984 fn(vd, vn0, vn1, vm, oprsz, is_tbx);
2987 #define DO_TB(SUFF, TYPE, H) \
2988 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \
2989 void *vm, uintptr_t oprsz, bool is_tbx) \
2991 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \
2992 uintptr_t i, nelem = oprsz / sizeof(TYPE); \
2993 for (i = 0; i < nelem; ++i) { \
2994 TYPE index = indexes[H1(i)], val = 0; \
2995 if (index < nelem) { \
2996 val = tbl0[H(index)]; \
2997 } else { \
2998 index -= nelem; \
2999 if (tbl1 && index < nelem) { \
3000 val = tbl1[H(index)]; \
3001 } else if (is_tbx) { \
3002 continue; \
3005 d[H(i)] = val; \
3008 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3010 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \
3012 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \
3013 void *vm, uint32_t desc) \
3015 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \
3017 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3019 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \
3022 DO_TB(b, uint8_t, H1)
3023 DO_TB(h, uint16_t, H2)
3024 DO_TB(s, uint32_t, H4)
3025 DO_TB(d, uint64_t, H8)
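/* Worked example for the byte forms with a 16-byte vector (nelem == 16):
 * an index of 20 selects tbl1[4] for the two-register TBL, yields 0 for
 * the single-register TBL (tbl1 == NULL), and leaves d[i] unchanged for
 * TBX.
 */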
3027 #undef DO_TB
3029 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3030 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
3032 intptr_t i, opr_sz = simd_oprsz(desc); \
3033 TYPED *d = vd; \
3034 TYPES *n = vn; \
3035 ARMVectorReg tmp; \
3036 if (unlikely(vn - vd < opr_sz)) { \
3037 n = memcpy(&tmp, n, opr_sz / 2); \
3039 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
3040 d[HD(i)] = n[HS(i)]; \
3044 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3045 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3046 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3048 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3049 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3050 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3052 #undef DO_UNPK
3054 /* Mask of bits included in the even numbered predicates of width esz.
3055 * We also use this for expand_bits/compress_bits, and so extend the
3056 * same pattern out to 16-bit units.
3058 static const uint64_t even_bit_esz_masks[5] = {
3059 0x5555555555555555ull,
3060 0x3333333333333333ull,
3061 0x0f0f0f0f0f0f0f0full,
3062 0x00ff00ff00ff00ffull,
3063 0x0000ffff0000ffffull,
3066 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3067 * For N==0, this corresponds to the operation that in qemu/bitops.h
3068 * we call half_shuffle64; this algorithm is from Hacker's Delight,
3069 * section 7-2 Shuffling Bits.
3071 static uint64_t expand_bits(uint64_t x, int n)
3073 int i;
3075 x &= 0xffffffffu;
3076 for (i = 4; i >= n; i--) {
3077 int sh = 1 << i;
3078 x = ((x << sh) | x) & even_bit_esz_masks[i];
3080 return x;
3081 }
3083 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3084 * For N==0, this corresponds to the operation that in qemu/bitops.h
3085 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3086 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3088 static uint64_t compress_bits(uint64_t x, int n)
3090 int i;
3092 for (i = n; i <= 4; i++) {
3093 int sh = 1 << i;
3094 x &= even_bit_esz_masks[i];
3095 x = (x >> sh) | x;
3097 return x & 0xffffffffu;
3098 }
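/* Worked example: expand_bits(0x0f, 0) == 0x55 (each input bit becomes
 * the low bit of a 2-bit unit), and compress_bits(0x55, 0) == 0x0f
 * undoes it; with n == 1, expand_bits(0x0f, 1) == 0x33.
 */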
3100 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3102 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3103 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3104 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3105 int esize = 1 << esz;
3106 uint64_t *d = vd;
3107 intptr_t i;
3109 if (oprsz <= 8) {
3110 uint64_t nn = *(uint64_t *)vn;
3111 uint64_t mm = *(uint64_t *)vm;
3112 int half = 4 * oprsz;
3114 nn = extract64(nn, high * half, half);
3115 mm = extract64(mm, high * half, half);
3116 nn = expand_bits(nn, esz);
3117 mm = expand_bits(mm, esz);
3118 d[0] = nn | (mm << esize);
3119 } else {
3120 ARMPredicateReg tmp;
3122 /* We produce output faster than we consume input.
3123 Therefore we must be mindful of possible overlap. */
3124 if (vd == vn) {
3125 vn = memcpy(&tmp, vn, oprsz);
3126 if (vd == vm) {
3127 vm = vn;
3129 } else if (vd == vm) {
3130 vm = memcpy(&tmp, vm, oprsz);
3132 if (high) {
3133 high = oprsz >> 1;
3136 if ((oprsz & 7) == 0) {
3137 uint32_t *n = vn, *m = vm;
3138 high >>= 2;
3140 for (i = 0; i < oprsz / 8; i++) {
3141 uint64_t nn = n[H4(high + i)];
3142 uint64_t mm = m[H4(high + i)];
3144 nn = expand_bits(nn, esz);
3145 mm = expand_bits(mm, esz);
3146 d[i] = nn | (mm << esize);
3148 } else {
3149 uint8_t *n = vn, *m = vm;
3150 uint16_t *d16 = vd;
3152 for (i = 0; i < oprsz / 2; i++) {
3153 uint16_t nn = n[H1(high + i)];
3154 uint16_t mm = m[H1(high + i)];
3156 nn = expand_bits(nn, esz);
3157 mm = expand_bits(mm, esz);
3158 d16[H2(i)] = nn | (mm << esize);
3164 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3166 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3167 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3168 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3169 uint64_t *d = vd, *n = vn, *m = vm;
3170 uint64_t l, h;
3171 intptr_t i;
3173 if (oprsz <= 8) {
3174 l = compress_bits(n[0] >> odd, esz);
3175 h = compress_bits(m[0] >> odd, esz);
3176 d[0] = l | (h << (4 * oprsz));
3177 } else {
3178 ARMPredicateReg tmp_m;
3179 intptr_t oprsz_16 = oprsz / 16;
3181 if ((vm - vd) < (uintptr_t)oprsz) {
3182 m = memcpy(&tmp_m, vm, oprsz);
3185 for (i = 0; i < oprsz_16; i++) {
3186 l = n[2 * i + 0];
3187 h = n[2 * i + 1];
3188 l = compress_bits(l >> odd, esz);
3189 h = compress_bits(h >> odd, esz);
3190 d[i] = l | (h << 32);
3194 * For VL which is not a multiple of 512, the results from M do not
3195 * align nicely with the uint64_t for D. Put the aligned results
3196 * from M into TMP_M and then copy it into place afterward.
3198 if (oprsz & 15) {
3199 int final_shift = (oprsz & 15) * 2;
3201 l = n[2 * i + 0];
3202 h = n[2 * i + 1];
3203 l = compress_bits(l >> odd, esz);
3204 h = compress_bits(h >> odd, esz);
3205 d[i] = l | (h << final_shift);
3207 for (i = 0; i < oprsz_16; i++) {
3208 l = m[2 * i + 0];
3209 h = m[2 * i + 1];
3210 l = compress_bits(l >> odd, esz);
3211 h = compress_bits(h >> odd, esz);
3212 tmp_m.p[i] = l | (h << 32);
3214 l = m[2 * i + 0];
3215 h = m[2 * i + 1];
3216 l = compress_bits(l >> odd, esz);
3217 h = compress_bits(h >> odd, esz);
3218 tmp_m.p[i] = l | (h << final_shift);
3220 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3221 } else {
3222 for (i = 0; i < oprsz_16; i++) {
3223 l = m[2 * i + 0];
3224 h = m[2 * i + 1];
3225 l = compress_bits(l >> odd, esz);
3226 h = compress_bits(h >> odd, esz);
3227 d[oprsz_16 + i] = l | (h << 32);
3233 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3235 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3236 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3237 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3238 uint64_t *d = vd, *n = vn, *m = vm;
3239 uint64_t mask;
3240 int shr, shl;
3241 intptr_t i;
3243 shl = 1 << esz;
3244 shr = 0;
3245 mask = even_bit_esz_masks[esz];
3246 if (odd) {
3247 mask <<= shl;
3248 shr = shl;
3249 shl = 0;
3252 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3253 uint64_t nn = (n[i] & mask) >> shr;
3254 uint64_t mm = (m[i] & mask) << shl;
3255 d[i] = nn + mm;
3259 /* Reverse units of 2**N bits. */
3260 static uint64_t reverse_bits_64(uint64_t x, int n)
3262 int i, sh;
3264 x = bswap64(x);
3265 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3266 uint64_t mask = even_bit_esz_masks[i];
3267 x = ((x & mask) << sh) | ((x >> sh) & mask);
3269 return x;
3272 static uint8_t reverse_bits_8(uint8_t x, int n)
3274 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3275 int i, sh;
3277 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3278 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3280 return x;
3283 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3285 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3286 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3287 intptr_t i, oprsz_2 = oprsz / 2;
3289 if (oprsz <= 8) {
3290 uint64_t l = *(uint64_t *)vn;
3291 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3292 *(uint64_t *)vd = l;
3293 } else if ((oprsz & 15) == 0) {
3294 for (i = 0; i < oprsz_2; i += 8) {
3295 intptr_t ih = oprsz - 8 - i;
3296 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3297 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3298 *(uint64_t *)(vd + i) = h;
3299 *(uint64_t *)(vd + ih) = l;
3301 } else {
3302 for (i = 0; i < oprsz_2; i += 1) {
3303 intptr_t il = H1(i);
3304 intptr_t ih = H1(oprsz - 1 - i);
3305 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3306 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3307 *(uint8_t *)(vd + il) = h;
3308 *(uint8_t *)(vd + ih) = l;
3313 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3315 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3316 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3317 uint64_t *d = vd;
3318 intptr_t i;
3320 if (oprsz <= 8) {
3321 uint64_t nn = *(uint64_t *)vn;
3322 int half = 4 * oprsz;
3324 nn = extract64(nn, high * half, half);
3325 nn = expand_bits(nn, 0);
3326 d[0] = nn;
3327 } else {
3328 ARMPredicateReg tmp_n;
3330 /* We produce output faster than we consume input.
3331 Therefore we must be mindful of possible overlap. */
3332 if ((vn - vd) < (uintptr_t)oprsz) {
3333 vn = memcpy(&tmp_n, vn, oprsz);
3335 if (high) {
3336 high = oprsz >> 1;
3339 if ((oprsz & 7) == 0) {
3340 uint32_t *n = vn;
3341 high >>= 2;
3343 for (i = 0; i < oprsz / 8; i++) {
3344 uint64_t nn = n[H4(high + i)];
3345 d[i] = expand_bits(nn, 0);
3347 } else {
3348 uint16_t *d16 = vd;
3349 uint8_t *n = vn;
3351 for (i = 0; i < oprsz / 2; i++) {
3352 uint16_t nn = n[H1(high + i)];
3353 d16[H2(i)] = expand_bits(nn, 0);
3359 #define DO_ZIP(NAME, TYPE, H) \
3360 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3362 intptr_t oprsz = simd_oprsz(desc); \
3363 intptr_t odd_ofs = simd_data(desc); \
3364 intptr_t i, oprsz_2 = oprsz / 2; \
3365 ARMVectorReg tmp_n, tmp_m; \
3366 /* We produce output faster than we consume input. \
3367 Therefore we must be mindful of possible overlap. */ \
3368 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
3369 vn = memcpy(&tmp_n, vn, oprsz); \
3371 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3372 vm = memcpy(&tmp_m, vm, oprsz); \
3374 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3375 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3376 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \
3377 *(TYPE *)(vm + odd_ofs + H(i)); \
3379 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3380 memset(vd + oprsz - 16, 0, 16); \
3384 DO_ZIP(sve_zip_b, uint8_t, H1)
3385 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3386 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3387 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3388 DO_ZIP(sve2_zip_q, Int128, )
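/* For example, sve_zip_b with odd_ofs == 0 interleaves the low halves:
 * d = { n[0], m[0], n[1], m[1], ... }; the ZIP2 forms pass the byte
 * offset of the high half in odd_ofs.
 */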
3390 #define DO_UZP(NAME, TYPE, H) \
3391 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3393 intptr_t oprsz = simd_oprsz(desc); \
3394 intptr_t odd_ofs = simd_data(desc); \
3395 intptr_t i, p; \
3396 ARMVectorReg tmp_m; \
3397 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3398 vm = memcpy(&tmp_m, vm, oprsz); \
3400 i = 0, p = odd_ofs; \
3401 do { \
3402 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \
3403 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
3404 } while (p < oprsz); \
3405 p -= oprsz; \
3406 do { \
3407 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \
3408 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
3409 } while (p < oprsz); \
3410 tcg_debug_assert(i == oprsz); \
3413 DO_UZP(sve_uzp_b, uint8_t, H1)
3414 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3415 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3416 DO_UZP(sve_uzp_d, uint64_t, H1_8)
3417 DO_UZP(sve2_uzp_q, Int128, )
3419 #define DO_TRN(NAME, TYPE, H) \
3420 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3422 intptr_t oprsz = simd_oprsz(desc); \
3423 intptr_t odd_ofs = simd_data(desc); \
3424 intptr_t i; \
3425 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
3426 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
3427 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
3428 *(TYPE *)(vd + H(i + 0)) = ae; \
3429 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
3431 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3432 memset(vd + oprsz - 16, 0, 16); \
3436 DO_TRN(sve_trn_b, uint8_t, H1)
3437 DO_TRN(sve_trn_h, uint16_t, H1_2)
3438 DO_TRN(sve_trn_s, uint32_t, H1_4)
3439 DO_TRN(sve_trn_d, uint64_t, H1_8)
3440 DO_TRN(sve2_trn_q, Int128, )
3442 #undef DO_ZIP
3443 #undef DO_UZP
3444 #undef DO_TRN
3446 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3448 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3449 uint32_t *d = vd, *n = vn;
3450 uint8_t *pg = vg;
3452 for (i = j = 0; i < opr_sz; i++) {
3453 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3454 d[H4(j)] = n[H4(i)];
3455 j++;
3458 for (; j < opr_sz; j++) {
3459 d[H4(j)] = 0;
3463 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3465 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3466 uint64_t *d = vd, *n = vn;
3467 uint8_t *pg = vg;
3469 for (i = j = 0; i < opr_sz; i++) {
3470 if (pg[H1(i)] & 1) {
3471 d[j] = n[i];
3472 j++;
3475 for (; j < opr_sz; j++) {
3476 d[j] = 0;
3480 /* Similar to the ARM LastActiveElement pseudocode function, except the
3481 * result is multiplied by the element size. This includes the not found
3482 * indication; e.g. not found for esz=3 is -8.
3484 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3486 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3487 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3489 return last_active_element(vg, words, esz);
3492 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3494 intptr_t opr_sz = simd_oprsz(desc) / 8;
3495 int esz = simd_data(desc);
3496 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3497 intptr_t i, first_i, last_i;
3498 ARMVectorReg tmp;
3500 first_i = last_i = 0;
3501 first_g = last_g = 0;
3503 /* Find the extent of the active elements within VG. */
3504 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3505 pg = *(uint64_t *)(vg + i) & mask;
3506 if (pg) {
3507 if (last_g == 0) {
3508 last_g = pg;
3509 last_i = i;
3511 first_g = pg;
3512 first_i = i;
3516 len = 0;
3517 if (first_g != 0) {
3518 first_i = first_i * 8 + ctz64(first_g);
3519 last_i = last_i * 8 + 63 - clz64(last_g);
3520 len = last_i - first_i + (1 << esz);
3521 if (vd == vm) {
3522 vm = memcpy(&tmp, vm, opr_sz * 8);
3524 swap_memmove(vd, vn + first_i, len);
3526 swap_memmove(vd + len, vm, opr_sz * 8 - len);
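/* The sve_sel_zpzz_{b,h,s} helpers below use expand_pred_{b,h,s} to turn
 * one predicate byte into a 64-bit lane mask.  Illustrative example:
 * expand_pred_b(0x05) == 0x0000000000ff00ff, so bytes 0 and 2 of that
 * 64-bit chunk are taken from N and the rest from M.
 */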
3529 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3530 void *vg, uint32_t desc)
3532 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3533 uint64_t *d = vd, *n = vn, *m = vm;
3534 uint8_t *pg = vg;
3536 for (i = 0; i < opr_sz; i += 1) {
3537 uint64_t nn = n[i], mm = m[i];
3538 uint64_t pp = expand_pred_b(pg[H1(i)]);
3539 d[i] = (nn & pp) | (mm & ~pp);
3543 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3544 void *vg, uint32_t desc)
3546 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3547 uint64_t *d = vd, *n = vn, *m = vm;
3548 uint8_t *pg = vg;
3550 for (i = 0; i < opr_sz; i += 1) {
3551 uint64_t nn = n[i], mm = m[i];
3552 uint64_t pp = expand_pred_h(pg[H1(i)]);
3553 d[i] = (nn & pp) | (mm & ~pp);
3557 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3558 void *vg, uint32_t desc)
3560 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3561 uint64_t *d = vd, *n = vn, *m = vm;
3562 uint8_t *pg = vg;
3564 for (i = 0; i < opr_sz; i += 1) {
3565 uint64_t nn = n[i], mm = m[i];
3566 uint64_t pp = expand_pred_s(pg[H1(i)]);
3567 d[i] = (nn & pp) | (mm & ~pp);
3571 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3572 void *vg, uint32_t desc)
3574 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3575 uint64_t *d = vd, *n = vn, *m = vm;
3576 uint8_t *pg = vg;
3578 for (i = 0; i < opr_sz; i += 1) {
3579 uint64_t nn = n[i], mm = m[i];
3580 d[i] = (pg[H1(i)] & 1 ? nn : mm);
3584 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3585 void *vg, uint32_t desc)
3587 intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3588 Int128 *d = vd, *n = vn, *m = vm;
3589 uint16_t *pg = vg;
3591 for (i = 0; i < opr_sz; i += 1) {
3592 d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3596 /* Two-operand comparison controlled by a predicate.
3597 * ??? It is very tempting to expand this inline
3598 * with x86 instructions, e.g.
3600 * vpcmpeqw zm, zn, %ymm0
3601 * vpmovmskb %ymm0, %eax
3602 * and $0x5555, %eax
3603 * and pg, %eax
3605 * or even aarch64, e.g.
3607 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3608 * cmeq v0.8h, zn, zm
3609 * and v0.8h, v0.8h, mask
3610 * addv h0, v0.8h
3611 * and v0.8b, pg
3613 * However, coming up with an abstraction that allows vector inputs and
3614 * a scalar output, and also handles the byte-ordering of sub-uint64_t
3615 * scalar outputs, is tricky.
3617 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
3618 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3620 intptr_t opr_sz = simd_oprsz(desc); \
3621 uint32_t flags = PREDTEST_INIT; \
3622 intptr_t i = opr_sz; \
3623 do { \
3624 uint64_t out = 0, pg; \
3625 do { \
3626 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3627 TYPE nn = *(TYPE *)(vn + H(i)); \
3628 TYPE mm = *(TYPE *)(vm + H(i)); \
3629 out |= nn OP mm; \
3630 } while (i & 63); \
3631 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3632 out &= pg; \
3633 *(uint64_t *)(vd + (i >> 3)) = out; \
3634 flags = iter_predtest_bwd(out, pg, flags); \
3635 } while (i > 0); \
3636 return flags; \
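/* The MASK arguments below keep only the predicate bit belonging to the
 * least significant byte of each element: every bit for byte elements,
 * every 2nd bit for halfwords, every 4th for words, and every 8th for
 * doublewords.
 */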
3639 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3640 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3641 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3642 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3643 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3644 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3645 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3646 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3648 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
3649 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3650 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3651 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3653 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
3654 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3655 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3656 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3658 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
3659 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3660 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3661 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3663 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
3664 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3665 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3666 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3668 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
3669 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3670 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3671 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3673 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
3674 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3675 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3676 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3678 #undef DO_CMP_PPZZ_B
3679 #undef DO_CMP_PPZZ_H
3680 #undef DO_CMP_PPZZ_S
3681 #undef DO_CMP_PPZZ_D
3682 #undef DO_CMP_PPZZ
3684 /* Similar, but the second source is "wide". */
3685 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
3686 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3688 intptr_t opr_sz = simd_oprsz(desc); \
3689 uint32_t flags = PREDTEST_INIT; \
3690 intptr_t i = opr_sz; \
3691 do { \
3692 uint64_t out = 0, pg; \
3693 do { \
3694 TYPEW mm = *(TYPEW *)(vm + i - 8); \
3695 do { \
3696 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3697 TYPE nn = *(TYPE *)(vn + H(i)); \
3698 out |= nn OP mm; \
3699 } while (i & 7); \
3700 } while (i & 63); \
3701 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3702 out &= pg; \
3703 *(uint64_t *)(vd + (i >> 3)) = out; \
3704 flags = iter_predtest_bwd(out, pg, flags); \
3705 } while (i > 0); \
3706 return flags; \
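/* Note that each 64-bit element of VM is reused for every narrow element
 * of VN within the same 8-byte chunk, which is why the innermost loop
 * above runs until (i & 7) == 0.
 */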
3709 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3710 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
3711 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3712 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3713 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3714 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3716 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
3717 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3718 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3720 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
3721 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3722 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3724 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
3725 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
3726 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
3728 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
3729 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
3730 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
3732 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
3733 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3734 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3736 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
3737 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3738 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3740 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
3741 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
3742 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
3744 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
3745 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
3746 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
3748 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
3749 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3750 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3752 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
3753 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3754 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3756 #undef DO_CMP_PPZW_B
3757 #undef DO_CMP_PPZW_H
3758 #undef DO_CMP_PPZW_S
3759 #undef DO_CMP_PPZW
3761 /* Similar, but the second source is immediate. */
3762 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
3763 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
3765 intptr_t opr_sz = simd_oprsz(desc); \
3766 uint32_t flags = PREDTEST_INIT; \
3767 TYPE mm = simd_data(desc); \
3768 intptr_t i = opr_sz; \
3769 do { \
3770 uint64_t out = 0, pg; \
3771 do { \
3772 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3773 TYPE nn = *(TYPE *)(vn + H(i)); \
3774 out |= nn OP mm; \
3775 } while (i & 63); \
3776 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3777 out &= pg; \
3778 *(uint64_t *)(vd + (i >> 3)) = out; \
3779 flags = iter_predtest_bwd(out, pg, flags); \
3780 } while (i > 0); \
3781 return flags; \
3784 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3785 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3786 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3787 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3788 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3789 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3790 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3791 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3793 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
3794 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3795 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3796 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3798 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
3799 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3800 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3801 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3803 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
3804 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3805 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3806 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3808 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
3809 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3810 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3811 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3813 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
3814 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3815 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3816 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3818 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
3819 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3820 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3821 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3823 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
3824 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3825 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3826 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3828 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
3829 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3830 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3831 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3833 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
3834 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3835 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3836 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3838 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
3839 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3840 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3841 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3843 #undef DO_CMP_PPZI_B
3844 #undef DO_CMP_PPZI_H
3845 #undef DO_CMP_PPZI_S
3846 #undef DO_CMP_PPZI_D
3847 #undef DO_CMP_PPZI
3849 /* Similar to the ARM LastActive pseudocode function. */
3850 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3852 intptr_t i;
3854 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3855 uint64_t pg = *(uint64_t *)(vg + i);
3856 if (pg) {
3857 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3860 return 0;
3863 /* Compute a mask into RETB that is true for all G, up to and including
3864 * (if after) or excluding (if !after) the first G & N.
3865 * Return true if a break was found.
3867 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3868 bool brk, bool after)
3870 uint64_t b;
3872 if (brk) {
3873 b = 0;
3874 } else if ((g & n) == 0) {
3875 /* For all G, no N are set; break not found. */
3876 b = g;
3877 } else {
3878 /* Break somewhere in N. Locate it. */
3879 b = g & n; /* guard true, pred true */
3880 b = b & -b; /* first such */
3881 if (after) {
3882 b = b | (b - 1); /* break after same */
3883 } else {
3884 b = b - 1; /* break before same */
3886 brk = true;
3889 *retb = b;
3890 return brk;
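/* Worked example (illustrative): g = 0xff, n = 0x10, brk = false.
 * Then g & n = 0x10, so b = 0x10; with after = true, b |= b - 1 gives
 * 0x1f, i.e. the break element itself stays active, while after = false
 * gives b = 0x0f and the break element is cleared.  In both cases brk
 * becomes true for the remaining words.
 */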
3893 /* Compute a zeroing BRK. */
3894 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3895 intptr_t oprsz, bool after)
3897 bool brk = false;
3898 intptr_t i;
3900 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3901 uint64_t this_b, this_g = g[i];
3903 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3904 d[i] = this_b & this_g;
3908 /* Likewise, but also compute flags. */
3909 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3910 intptr_t oprsz, bool after)
3912 uint32_t flags = PREDTEST_INIT;
3913 bool brk = false;
3914 intptr_t i;
3916 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3917 uint64_t this_b, this_d, this_g = g[i];
3919 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3920 d[i] = this_d = this_b & this_g;
3921 flags = iter_predtest_fwd(this_d, this_g, flags);
3923 return flags;
3926 /* Compute a merging BRK. */
3927 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3928 intptr_t oprsz, bool after)
3930 bool brk = false;
3931 intptr_t i;
3933 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3934 uint64_t this_b, this_g = g[i];
3936 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3937 d[i] = (this_b & this_g) | (d[i] & ~this_g);
3941 /* Likewise, but also compute flags. */
3942 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3943 intptr_t oprsz, bool after)
3945 uint32_t flags = PREDTEST_INIT;
3946 bool brk = false;
3947 intptr_t i;
3949 for (i = 0; i < oprsz / 8; ++i) {
3950 uint64_t this_b, this_d = d[i], this_g = g[i];
3952 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3953 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3954 flags = iter_predtest_fwd(this_d, this_g, flags);
3956 return flags;
3959 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3961 /* It is quicker to zero the whole predicate than to loop over OPRSZ.
3962 * The compiler should turn this into 4 64-bit integer stores.
3964 memset(d, 0, sizeof(ARMPredicateReg));
3965 return PREDTEST_INIT;
3968 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3969 uint32_t pred_desc)
3971 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3972 if (last_active_pred(vn, vg, oprsz)) {
3973 compute_brk_z(vd, vm, vg, oprsz, true);
3974 } else {
3975 do_zero(vd, oprsz);
3979 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
3980 uint32_t pred_desc)
3982 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3983 if (last_active_pred(vn, vg, oprsz)) {
3984 return compute_brks_z(vd, vm, vg, oprsz, true);
3985 } else {
3986 return do_zero(vd, oprsz);
3990 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
3991 uint32_t pred_desc)
3993 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3994 if (last_active_pred(vn, vg, oprsz)) {
3995 compute_brk_z(vd, vm, vg, oprsz, false);
3996 } else {
3997 do_zero(vd, oprsz);
4001 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4002 uint32_t pred_desc)
4004 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4005 if (last_active_pred(vn, vg, oprsz)) {
4006 return compute_brks_z(vd, vm, vg, oprsz, false);
4007 } else {
4008 return do_zero(vd, oprsz);
4012 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4014 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4015 compute_brk_z(vd, vn, vg, oprsz, true);
4018 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4020 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4021 return compute_brks_z(vd, vn, vg, oprsz, true);
4024 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4026 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4027 compute_brk_z(vd, vn, vg, oprsz, false);
4030 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4032 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4033 return compute_brks_z(vd, vn, vg, oprsz, false);
4036 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4038 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4039 compute_brk_m(vd, vn, vg, oprsz, true);
4042 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4044 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4045 return compute_brks_m(vd, vn, vg, oprsz, true);
4048 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4050 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4051 compute_brk_m(vd, vn, vg, oprsz, false);
4054 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4056 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4057 return compute_brks_m(vd, vn, vg, oprsz, false);
4060 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4062 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4063 if (!last_active_pred(vn, vg, oprsz)) {
4064 do_zero(vd, oprsz);
4068 /* As if PredTest(Ones(PL), D, esz). */
4069 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4070 uint64_t esz_mask)
4072 uint32_t flags = PREDTEST_INIT;
4073 intptr_t i;
4075 for (i = 0; i < oprsz / 8; i++) {
4076 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4078 if (oprsz & 7) {
4079 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4080 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4082 return flags;
4085 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4087 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4088 if (last_active_pred(vn, vg, oprsz)) {
4089 return predtest_ones(vd, oprsz, -1);
4090 } else {
4091 return do_zero(vd, oprsz);
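/* CNTP: count the predicate bits that are set in both N and G,
 * restricted to the bits significant for the given element size.
 */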
4095 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4097 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4098 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4099 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4100 intptr_t i;
4102 for (i = 0; i < words; ++i) {
4103 uint64_t t = n[i] & g[i] & mask;
4104 sum += ctpop64(t);
4106 return sum;
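/* The WHILE helpers below set the low COUNT bits of the destination
 * predicate (COUNT is in predicate-bit, i.e. vector-byte, units) and
 * clear the rest.  Illustrative example for sve_whilel: count = 12 with
 * the word-element mask 0x1111111111111111 yields p[0] = 0x111, i.e.
 * the three lowest word elements active.
 */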
4109 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4111 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4112 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4113 uint64_t esz_mask = pred_esz_masks[esz];
4114 ARMPredicateReg *d = vd;
4115 uint32_t flags;
4116 intptr_t i;
4118 /* Begin with a zero predicate register. */
4119 flags = do_zero(d, oprsz);
4120 if (count == 0) {
4121 return flags;
4124 /* Set all of the requested bits. */
4125 for (i = 0; i < count / 64; ++i) {
4126 d->p[i] = esz_mask;
4128 if (count & 63) {
4129 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4132 return predtest_ones(d, oprsz, esz_mask);
4135 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4137 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4138 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4139 uint64_t esz_mask = pred_esz_masks[esz];
4140 ARMPredicateReg *d = vd;
4141 intptr_t i, invcount, oprbits;
4142 uint64_t bits;
4144 if (count == 0) {
4145 return do_zero(d, oprsz);
4148 oprbits = oprsz * 8;
4149 tcg_debug_assert(count <= oprbits);
4151 bits = esz_mask;
4152 if (oprbits & 63) {
4153 bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4156 invcount = oprbits - count;
4157 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4158 d->p[i] = bits;
4159 bits = esz_mask;
4162 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4164 while (--i >= 0) {
4165 d->p[i] = 0;
4168 return predtest_ones(d, oprsz, esz_mask);
4171 /* Recursive reduction using a binary function;
4172 * cf. the ARM ARM function ReducePredicated.
4174 * While it would be possible to write this without the DATA temporary,
4175 * it is much simpler to process the predicate register this way.
4176 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4177 * little to gain with a more complex non-recursive form.
4179 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
4180 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4182 if (n == 1) { \
4183 return *data; \
4184 } else { \
4185 uintptr_t half = n / 2; \
4186 TYPE lo = NAME##_reduce(data, status, half); \
4187 TYPE hi = NAME##_reduce(data + half, status, half); \
4188 return TYPE##_##FUNC(lo, hi, status); \
4191 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
4193 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
4194 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
4195 for (i = 0; i < oprsz; ) { \
4196 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4197 do { \
4198 TYPE nn = *(TYPE *)(vn + H(i)); \
4199 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
4200 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
4201 } while (i & 15); \
4203 for (; i < maxsz; i += sizeof(TYPE)) { \
4204 *(TYPE *)((void *)data + i) = IDENT; \
4206 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
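/* In each instantiation below, IDENT is the identity of the operation.
 * Inactive lanes and lanes beyond the operation size are filled with
 * IDENT, so the balanced tree reduction over maxsz / sizeof(TYPE) lanes
 * gives the same result as reducing only the active lanes.
 */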
4209 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
4210 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
4211 DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)
4213 /* Identity is floatN_default_nan, without the function call. */
4214 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
4215 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
4216 DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)
4218 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
4219 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
4220 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)
4222 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
4223 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
4224 DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)
4226 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
4227 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
4228 DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))
4230 #undef DO_REDUCE
4232 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4233 void *status, uint32_t desc)
4235 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4236 float16 result = nn;
4238 do {
4239 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4240 do {
4241 if (pg & 1) {
4242 float16 mm = *(float16 *)(vm + H1_2(i));
4243 result = float16_add(result, mm, status);
4245 i += sizeof(float16), pg >>= sizeof(float16);
4246 } while (i & 15);
4247 } while (i < opr_sz);
4249 return result;
4252 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4253 void *status, uint32_t desc)
4255 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4256 float32 result = nn;
4258 do {
4259 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4260 do {
4261 if (pg & 1) {
4262 float32 mm = *(float32 *)(vm + H1_2(i));
4263 result = float32_add(result, mm, status);
4265 i += sizeof(float32), pg >>= sizeof(float32);
4266 } while (i & 15);
4267 } while (i < opr_sz);
4269 return result;
4272 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4273 void *status, uint32_t desc)
4275 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4276 uint64_t *m = vm;
4277 uint8_t *pg = vg;
4279 for (i = 0; i < opr_sz; i++) {
4280 if (pg[H1(i)] & 1) {
4281 nn = float64_add(nn, m[i], status);
4285 return nn;
4288 /* Fully general three-operand expander, controlled by a predicate,
4289 * with the extra float_status parameter.
4291 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
4292 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4293 void *status, uint32_t desc) \
4295 intptr_t i = simd_oprsz(desc); \
4296 uint64_t *g = vg; \
4297 do { \
4298 uint64_t pg = g[(i - 1) >> 6]; \
4299 do { \
4300 i -= sizeof(TYPE); \
4301 if (likely((pg >> (i & 63)) & 1)) { \
4302 TYPE nn = *(TYPE *)(vn + H(i)); \
4303 TYPE mm = *(TYPE *)(vm + H(i)); \
4304 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4306 } while (i & 63); \
4307 } while (i != 0); \
4310 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4311 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4312 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
4314 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4315 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4316 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
4318 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4319 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4320 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
4322 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4323 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4324 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
4326 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4327 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4328 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
4330 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4331 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4332 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
4334 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4335 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4336 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
4338 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4339 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4340 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4342 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4344 return float16_abs(float16_sub(a, b, s));
4347 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4349 return float32_abs(float32_sub(a, b, s));
4352 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4354 return float64_abs(float64_sub(a, b, s));
4357 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4358 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4359 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4361 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4363 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4364 return float64_scalbn(a, b_int, s);
4367 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4368 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4369 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
4371 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4372 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4373 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
4375 #undef DO_ZPZZ_FP
4377 /* Three-operand expander, with one scalar operand, controlled by
4378 * a predicate, with the extra float_status parameter.
4380 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4381 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
4382 void *status, uint32_t desc) \
4384 intptr_t i = simd_oprsz(desc); \
4385 uint64_t *g = vg; \
4386 TYPE mm = scalar; \
4387 do { \
4388 uint64_t pg = g[(i - 1) >> 6]; \
4389 do { \
4390 i -= sizeof(TYPE); \
4391 if (likely((pg >> (i & 63)) & 1)) { \
4392 TYPE nn = *(TYPE *)(vn + H(i)); \
4393 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4395 } while (i & 63); \
4396 } while (i != 0); \
4399 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4400 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4401 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
4403 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4404 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4405 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
4407 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4408 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4409 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4411 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4413 return float16_sub(b, a, s);
4416 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4418 return float32_sub(b, a, s);
4421 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4423 return float64_sub(b, a, s);
4426 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4427 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4428 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
4430 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4431 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4432 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
4434 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4435 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4436 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
4438 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4439 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4440 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
4442 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4443 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4444 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4446 /* Fully general two-operand expander, controlled by a predicate,
4447 * with the extra float_status parameter.
4449 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
4450 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
4452 intptr_t i = simd_oprsz(desc); \
4453 uint64_t *g = vg; \
4454 do { \
4455 uint64_t pg = g[(i - 1) >> 6]; \
4456 do { \
4457 i -= sizeof(TYPE); \
4458 if (likely((pg >> (i & 63)) & 1)) { \
4459 TYPE nn = *(TYPE *)(vn + H(i)); \
4460 *(TYPE *)(vd + H(i)) = OP(nn, status); \
4462 } while (i & 63); \
4463 } while (i != 0); \
4466 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
4467 * FZ16. When converting from fp16, this affects flushing input denormals;
4468 * when converting to fp16, this affects flushing output denormals.
4470 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4472 bool save = get_flush_inputs_to_zero(fpst);
4473 float32 ret;
4475 set_flush_inputs_to_zero(false, fpst);
4476 ret = float16_to_float32(f, true, fpst);
4477 set_flush_inputs_to_zero(save, fpst);
4478 return ret;
4481 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4483 bool save = get_flush_inputs_to_zero(fpst);
4484 float64 ret;
4486 set_flush_inputs_to_zero(false, fpst);
4487 ret = float16_to_float64(f, true, fpst);
4488 set_flush_inputs_to_zero(save, fpst);
4489 return ret;
4492 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4494 bool save = get_flush_to_zero(fpst);
4495 float16 ret;
4497 set_flush_to_zero(false, fpst);
4498 ret = float32_to_float16(f, true, fpst);
4499 set_flush_to_zero(save, fpst);
4500 return ret;
4503 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4505 bool save = get_flush_to_zero(fpst);
4506 float16 ret;
4508 set_flush_to_zero(false, fpst);
4509 ret = float64_to_float16(f, true, fpst);
4510 set_flush_to_zero(save, fpst);
4511 return ret;
4514 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4516 if (float16_is_any_nan(f)) {
4517 float_raise(float_flag_invalid, s);
4518 return 0;
4520 return float16_to_int16_round_to_zero(f, s);
4523 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4525 if (float16_is_any_nan(f)) {
4526 float_raise(float_flag_invalid, s);
4527 return 0;
4529 return float16_to_int64_round_to_zero(f, s);
4532 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4534 if (float32_is_any_nan(f)) {
4535 float_raise(float_flag_invalid, s);
4536 return 0;
4538 return float32_to_int64_round_to_zero(f, s);
4541 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4543 if (float64_is_any_nan(f)) {
4544 float_raise(float_flag_invalid, s);
4545 return 0;
4547 return float64_to_int64_round_to_zero(f, s);
4550 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4552 if (float16_is_any_nan(f)) {
4553 float_raise(float_flag_invalid, s);
4554 return 0;
4556 return float16_to_uint16_round_to_zero(f, s);
4559 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4561 if (float16_is_any_nan(f)) {
4562 float_raise(float_flag_invalid, s);
4563 return 0;
4565 return float16_to_uint64_round_to_zero(f, s);
4568 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4570 if (float32_is_any_nan(f)) {
4571 float_raise(float_flag_invalid, s);
4572 return 0;
4574 return float32_to_uint64_round_to_zero(f, s);
4577 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4579 if (float64_is_any_nan(f)) {
4580 float_raise(float_flag_invalid, s);
4581 return 0;
4583 return float64_to_uint64_round_to_zero(f, s);
4586 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4587 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4588 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16)
4589 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4590 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4591 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4592 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4594 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4595 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4596 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4597 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4598 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4599 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4600 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4602 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4603 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4604 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4605 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4606 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4607 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4608 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4610 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4611 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4612 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4614 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4615 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4616 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4618 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4619 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4620 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4622 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4623 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4624 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
4626 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4627 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4628 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4629 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
4630 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
4631 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
4632 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
4634 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4635 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4636 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4637 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
4638 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
4639 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
4640 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4642 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
4644 /* Extract frac to the top of the uint32_t. */
4645 uint32_t frac = (uint32_t)a << (16 + 6);
4646 int16_t exp = extract32(a, 10, 5);
4648 if (unlikely(exp == 0)) {
4649 if (frac != 0) {
4650 if (!get_flush_inputs_to_zero(s)) {
4651 /* denormal: bias - fractional_zeros */
4652 return -15 - clz32(frac);
4654 /* flush to zero */
4655 float_raise(float_flag_input_denormal, s);
4657 } else if (unlikely(exp == 0x1f)) {
4658 if (frac == 0) {
4659 return INT16_MAX; /* infinity */
4661 } else {
4662 /* normal: exp - bias */
4663 return exp - 15;
4665 /* nan or zero */
4666 float_raise(float_flag_invalid, s);
4667 return INT16_MIN;
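/* Illustrative values for the fp16 case above: 0x3c00 (1.0) -> 0,
 * 0x3800 (0.5) -> -1, 0x0001 (the smallest denormal, 2^-24) -> -24,
 * while zero and NaN raise Invalid and return INT16_MIN.
 */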
4670 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
4672 /* Extract frac to the top of the uint32_t. */
4673 uint32_t frac = a << 9;
4674 int32_t exp = extract32(a, 23, 8);
4676 if (unlikely(exp == 0)) {
4677 if (frac != 0) {
4678 if (!get_flush_inputs_to_zero(s)) {
4679 /* denormal: bias - fractional_zeros */
4680 return -127 - clz32(frac);
4682 /* flush to zero */
4683 float_raise(float_flag_input_denormal, s);
4685 } else if (unlikely(exp == 0xff)) {
4686 if (frac == 0) {
4687 return INT32_MAX; /* infinity */
4689 } else {
4690 /* normal: exp - bias */
4691 return exp - 127;
4693 /* nan or zero */
4694 float_raise(float_flag_invalid, s);
4695 return INT32_MIN;
4698 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
4700 /* Extract frac to the top of the uint64_t. */
4701 uint64_t frac = a << 12;
4702 int64_t exp = extract64(a, 52, 11);
4704 if (unlikely(exp == 0)) {
4705 if (frac != 0) {
4706 if (!get_flush_inputs_to_zero(s)) {
4707 /* denormal: bias - fractional_zeros */
4708 return -1023 - clz64(frac);
4710 /* flush to zero */
4711 float_raise(float_flag_input_denormal, s);
4713 } else if (unlikely(exp == 0x7ff)) {
4714 if (frac == 0) {
4715 return INT64_MAX; /* infinity */
4717 } else {
4718 /* normal: exp - bias */
4719 return exp - 1023;
4721 /* nan or zero */
4722 float_raise(float_flag_invalid, s);
4723 return INT64_MIN;
4726 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
4727 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
4728 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
4730 #undef DO_ZPZ_FP
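/* The fused multiply-add helpers below share one loop per element size.
 * NEG1 and NEG3 are XORed into operands 1 and 3, i.e. they flip the
 * sign bit, so FMLA (no negation), FMLS (negate op1), FNMLA (negate
 * both) and FNMLS (negate op3) all reuse the same code.
 */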
4732 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4733 float_status *status, uint32_t desc,
4734 uint16_t neg1, uint16_t neg3)
4736 intptr_t i = simd_oprsz(desc);
4737 uint64_t *g = vg;
4739 do {
4740 uint64_t pg = g[(i - 1) >> 6];
4741 do {
4742 i -= 2;
4743 if (likely((pg >> (i & 63)) & 1)) {
4744 float16 e1, e2, e3, r;
4746 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4747 e2 = *(uint16_t *)(vm + H1_2(i));
4748 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4749 r = float16_muladd(e1, e2, e3, 0, status);
4750 *(uint16_t *)(vd + H1_2(i)) = r;
4752 } while (i & 63);
4753 } while (i != 0);
4756 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4757 void *vg, void *status, uint32_t desc)
4759 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
4762 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4763 void *vg, void *status, uint32_t desc)
4765 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
4768 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4769 void *vg, void *status, uint32_t desc)
4771 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
4774 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4775 void *vg, void *status, uint32_t desc)
4777 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
4780 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4781 float_status *status, uint32_t desc,
4782 uint32_t neg1, uint32_t neg3)
4784 intptr_t i = simd_oprsz(desc);
4785 uint64_t *g = vg;
4787 do {
4788 uint64_t pg = g[(i - 1) >> 6];
4789 do {
4790 i -= 4;
4791 if (likely((pg >> (i & 63)) & 1)) {
4792 float32 e1, e2, e3, r;
4794 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4795 e2 = *(uint32_t *)(vm + H1_4(i));
4796 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4797 r = float32_muladd(e1, e2, e3, 0, status);
4798 *(uint32_t *)(vd + H1_4(i)) = r;
4800 } while (i & 63);
4801 } while (i != 0);
4804 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4805 void *vg, void *status, uint32_t desc)
4807 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
4810 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4811 void *vg, void *status, uint32_t desc)
4813 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
4816 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4817 void *vg, void *status, uint32_t desc)
4819 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
4822 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4823 void *vg, void *status, uint32_t desc)
4825 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
4828 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4829 float_status *status, uint32_t desc,
4830 uint64_t neg1, uint64_t neg3)
4832 intptr_t i = simd_oprsz(desc);
4833 uint64_t *g = vg;
4835 do {
4836 uint64_t pg = g[(i - 1) >> 6];
4837 do {
4838 i -= 8;
4839 if (likely((pg >> (i & 63)) & 1)) {
4840 float64 e1, e2, e3, r;
4842 e1 = *(uint64_t *)(vn + i) ^ neg1;
4843 e2 = *(uint64_t *)(vm + i);
4844 e3 = *(uint64_t *)(va + i) ^ neg3;
4845 r = float64_muladd(e1, e2, e3, 0, status);
4846 *(uint64_t *)(vd + i) = r;
4848 } while (i & 63);
4849 } while (i != 0);
4852 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4853 void *vg, void *status, uint32_t desc)
4855 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
4858 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4859 void *vg, void *status, uint32_t desc)
4861 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
4864 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4865 void *vg, void *status, uint32_t desc)
4867 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
4870 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4871 void *vg, void *status, uint32_t desc)
4873 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
4876 /* Two-operand floating-point comparison controlled by a predicate.
4877 * Unlike the integer version, we are not allowed to optimistically
4878 * compare operands, since the comparison may have side effects with
4879 * respect to the FPSR.
4881 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
4882 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4883 void *status, uint32_t desc) \
4885 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4886 uint64_t *d = vd, *g = vg; \
4887 do { \
4888 uint64_t out = 0, pg = g[j]; \
4889 do { \
4890 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4891 if (likely((pg >> (i & 63)) & 1)) { \
4892 TYPE nn = *(TYPE *)(vn + H(i)); \
4893 TYPE mm = *(TYPE *)(vm + H(i)); \
4894 out |= OP(TYPE, nn, mm, status); \
4896 } while (i & 63); \
4897 d[j--] = out; \
4898 } while (i > 0); \
4901 #define DO_FPCMP_PPZZ_H(NAME, OP) \
4902 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
4903 #define DO_FPCMP_PPZZ_S(NAME, OP) \
4904 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
4905 #define DO_FPCMP_PPZZ_D(NAME, OP) \
4906 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
4908 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
4909 DO_FPCMP_PPZZ_H(NAME, OP) \
4910 DO_FPCMP_PPZZ_S(NAME, OP) \
4911 DO_FPCMP_PPZZ_D(NAME, OP)
4913 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
4914 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
4915 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
4916 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
4917 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
4918 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
4919 #define DO_FCMUO(TYPE, X, Y, ST) \
4920 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
4921 #define DO_FACGE(TYPE, X, Y, ST) \
4922 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
4923 #define DO_FACGT(TYPE, X, Y, ST) \
4924 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
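/* GE, GT, ACGE and ACGT swap X and Y so that >= and > can be written as
 * <= 0 and < 0 on the compare result.  The ordered predicates use the
 * signalling TYPE##_compare, so a NaN operand makes the element compare
 * false; EQ, NE and UO use the quiet variant.
 */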
4926 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
4927 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
4928 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
4929 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
4930 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
4931 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
4932 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
4934 #undef DO_FPCMP_PPZZ_ALL
4935 #undef DO_FPCMP_PPZZ_D
4936 #undef DO_FPCMP_PPZZ_S
4937 #undef DO_FPCMP_PPZZ_H
4938 #undef DO_FPCMP_PPZZ
4940 /* One-operand floating-point comparison against zero, controlled
4941 * by a predicate.
4943 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
4944 void HELPER(NAME)(void *vd, void *vn, void *vg, \
4945 void *status, uint32_t desc) \
4947 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4948 uint64_t *d = vd, *g = vg; \
4949 do { \
4950 uint64_t out = 0, pg = g[j]; \
4951 do { \
4952 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4953 if ((pg >> (i & 63)) & 1) { \
4954 TYPE nn = *(TYPE *)(vn + H(i)); \
4955 out |= OP(TYPE, nn, 0, status); \
4957 } while (i & 63); \
4958 d[j--] = out; \
4959 } while (i > 0); \
4962 #define DO_FPCMP_PPZ0_H(NAME, OP) \
4963 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
4964 #define DO_FPCMP_PPZ0_S(NAME, OP) \
4965 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
4966 #define DO_FPCMP_PPZ0_D(NAME, OP) \
4967 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
4969 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
4970 DO_FPCMP_PPZ0_H(NAME, OP) \
4971 DO_FPCMP_PPZ0_S(NAME, OP) \
4972 DO_FPCMP_PPZ0_D(NAME, OP)
4974 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
4975 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
4976 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
4977 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
4978 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
4979 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4981 /* FP Trig Multiply-Add. */
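/* The coeff[] tables below appear to hold, per element size, the
 * polynomial coefficients used by FTMAD: entries 0-7 correspond to the
 * sine series and entries 8-15 to the cosine series, the latter being
 * selected (xx += 8) when the multiplicand is negative.
 */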
4983 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4985 static const float16 coeff[16] = {
4986 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4987 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4989 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
4990 intptr_t x = simd_data(desc);
4991 float16 *d = vd, *n = vn, *m = vm;
4992 for (i = 0; i < opr_sz; i++) {
4993 float16 mm = m[i];
4994 intptr_t xx = x;
4995 if (float16_is_neg(mm)) {
4996 mm = float16_abs(mm);
4997 xx += 8;
4999 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
5003 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5005 static const float32 coeff[16] = {
5006 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5007 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5008 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5009 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5011 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5012 intptr_t x = simd_data(desc);
5013 float32 *d = vd, *n = vn, *m = vm;
5014 for (i = 0; i < opr_sz; i++) {
5015 float32 mm = m[i];
5016 intptr_t xx = x;
5017 if (float32_is_neg(mm)) {
5018 mm = float32_abs(mm);
5019 xx += 8;
5021 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
5025 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5027 static const float64 coeff[16] = {
5028 0x3ff0000000000000ull, 0xbfc5555555555543ull,
5029 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5030 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5031 0x3de5d8408868552full, 0x0000000000000000ull,
5032 0x3ff0000000000000ull, 0xbfe0000000000000ull,
5033 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5034 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5035 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5037 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5038 intptr_t x = simd_data(desc);
5039 float64 *d = vd, *n = vn, *m = vm;
5040 for (i = 0; i < opr_sz; i++) {
5041 float64 mm = m[i];
5042 intptr_t xx = x;
5043 if (float64_is_neg(mm)) {
5044 mm = float64_abs(mm);
5045 xx += 8;
5047 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
5052 * FP Complex Add
5055 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5056 void *vs, uint32_t desc)
5058 intptr_t j, i = simd_oprsz(desc);
5059 uint64_t *g = vg;
5060 float16 neg_imag = float16_set_sign(0, simd_data(desc));
5061 float16 neg_real = float16_chs(neg_imag);
5063 do {
5064 uint64_t pg = g[(i - 1) >> 6];
5065 do {
5066 float16 e0, e1, e2, e3;
5068 /* I holds the real index; J holds the imag index. */
5069 j = i - sizeof(float16);
5070 i -= 2 * sizeof(float16);
5072 e0 = *(float16 *)(vn + H1_2(i));
5073 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
5074 e2 = *(float16 *)(vn + H1_2(j));
5075 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
5077 if (likely((pg >> (i & 63)) & 1)) {
5078 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
5080 if (likely((pg >> (j & 63)) & 1)) {
5081 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
5083 } while (i & 63);
5084 } while (i != 0);
5087 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5088 void *vs, uint32_t desc)
5090 intptr_t j, i = simd_oprsz(desc);
5091 uint64_t *g = vg;
5092 float32 neg_imag = float32_set_sign(0, simd_data(desc));
5093 float32 neg_real = float32_chs(neg_imag);
5095 do {
5096 uint64_t pg = g[(i - 1) >> 6];
5097 do {
5098 float32 e0, e1, e2, e3;
5100 /* I holds the real index; J holds the imag index. */
5101 j = i - sizeof(float32);
5102 i -= 2 * sizeof(float32);
5104 e0 = *(float32 *)(vn + H1_2(i));
5105 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
5106 e2 = *(float32 *)(vn + H1_2(j));
5107 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
5109 if (likely((pg >> (i & 63)) & 1)) {
5110 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
5112 if (likely((pg >> (j & 63)) & 1)) {
5113 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
5115 } while (i & 63);
5116 } while (i != 0);
5119 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5120 void *vs, uint32_t desc)
5122 intptr_t j, i = simd_oprsz(desc);
5123 uint64_t *g = vg;
5124 float64 neg_imag = float64_set_sign(0, simd_data(desc));
5125 float64 neg_real = float64_chs(neg_imag);
5127 do {
5128 uint64_t pg = g[(i - 1) >> 6];
5129 do {
5130 float64 e0, e1, e2, e3;
5132 /* I holds the real index; J holds the imag index. */
5133 j = i - sizeof(float64);
5134 i -= 2 * sizeof(float64);
5136 e0 = *(float64 *)(vn + H1_2(i));
5137 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
5138 e2 = *(float64 *)(vn + H1_2(j));
5139 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
5141 if (likely((pg >> (i & 63)) & 1)) {
5142 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
5144 if (likely((pg >> (j & 63)) & 1)) {
5145 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
5147 } while (i & 63);
5148 } while (i != 0);
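/* In the three helpers above, simd_data(desc) selects the rotation:
 * 0 gives d.real = n.real - m.imag, d.imag = n.imag + m.real and
 * 1 gives d.real = n.real + m.imag, d.imag = n.imag - m.real,
 * matching the FCADD #90 and #270 forms respectively.
 */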
5152 * FP Complex Multiply
5155 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5156 void *vg, void *status, uint32_t desc)
5158 intptr_t j, i = simd_oprsz(desc);
5159 unsigned rot = simd_data(desc);
5160 bool flip = rot & 1;
5161 float16 neg_imag, neg_real;
5162 uint64_t *g = vg;
5164 neg_imag = float16_set_sign(0, (rot & 2) != 0);
5165 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
5167 do {
5168 uint64_t pg = g[(i - 1) >> 6];
5169 do {
5170 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5172 /* I holds the real index; J holds the imag index. */
5173 j = i - sizeof(float16);
5174 i -= 2 * sizeof(float16);
5176 nr = *(float16 *)(vn + H1_2(i));
5177 ni = *(float16 *)(vn + H1_2(j));
5178 mr = *(float16 *)(vm + H1_2(i));
5179 mi = *(float16 *)(vm + H1_2(j));
5181 e2 = (flip ? ni : nr);
5182 e1 = (flip ? mi : mr) ^ neg_real;
5183 e4 = e2;
5184 e3 = (flip ? mr : mi) ^ neg_imag;
5186 if (likely((pg >> (i & 63)) & 1)) {
5187 d = *(float16 *)(va + H1_2(i));
5188 d = float16_muladd(e2, e1, d, 0, status);
5189 *(float16 *)(vd + H1_2(i)) = d;
5191 if (likely((pg >> (j & 63)) & 1)) {
5192 d = *(float16 *)(va + H1_2(j));
5193 d = float16_muladd(e4, e3, d, 0, status);
5194 *(float16 *)(vd + H1_2(j)) = d;
5196 } while (i & 63);
5197 } while (i != 0);
5200 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5201 void *vg, void *status, uint32_t desc)
5203 intptr_t j, i = simd_oprsz(desc);
5204 unsigned rot = simd_data(desc);
5205 bool flip = rot & 1;
5206 float32 neg_imag, neg_real;
5207 uint64_t *g = vg;
5209 neg_imag = float32_set_sign(0, (rot & 2) != 0);
5210 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
5212 do {
5213 uint64_t pg = g[(i - 1) >> 6];
5214 do {
5215 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5217 /* I holds the real index; J holds the imag index. */
5218 j = i - sizeof(float32);
5219 i -= 2 * sizeof(float32);
5221 nr = *(float32 *)(vn + H1_2(i));
5222 ni = *(float32 *)(vn + H1_2(j));
5223 mr = *(float32 *)(vm + H1_2(i));
5224 mi = *(float32 *)(vm + H1_2(j));
5226 e2 = (flip ? ni : nr);
5227 e1 = (flip ? mi : mr) ^ neg_real;
5228 e4 = e2;
5229 e3 = (flip ? mr : mi) ^ neg_imag;
5231 if (likely((pg >> (i & 63)) & 1)) {
5232 d = *(float32 *)(va + H1_2(i));
5233 d = float32_muladd(e2, e1, d, 0, status);
5234 *(float32 *)(vd + H1_2(i)) = d;
5236 if (likely((pg >> (j & 63)) & 1)) {
5237 d = *(float32 *)(va + H1_2(j));
5238 d = float32_muladd(e4, e3, d, 0, status);
5239 *(float32 *)(vd + H1_2(j)) = d;
5241 } while (i & 63);
5242 } while (i != 0);
5245 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5246 void *vg, void *status, uint32_t desc)
5248 intptr_t j, i = simd_oprsz(desc);
5249 unsigned rot = simd_data(desc);
5250 bool flip = rot & 1;
5251 float64 neg_imag, neg_real;
5252 uint64_t *g = vg;
5254 neg_imag = float64_set_sign(0, (rot & 2) != 0);
5255 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
5257 do {
5258 uint64_t pg = g[(i - 1) >> 6];
5259 do {
5260 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5262 /* I holds the real index; J holds the imag index. */
5263 j = i - sizeof(float64);
5264 i -= 2 * sizeof(float64);
5266 nr = *(float64 *)(vn + H1_2(i));
5267 ni = *(float64 *)(vn + H1_2(j));
5268 mr = *(float64 *)(vm + H1_2(i));
5269 mi = *(float64 *)(vm + H1_2(j));
5271 e2 = (flip ? ni : nr);
5272 e1 = (flip ? mi : mr) ^ neg_real;
5273 e4 = e2;
5274 e3 = (flip ? mr : mi) ^ neg_imag;
5276 if (likely((pg >> (i & 63)) & 1)) {
5277 d = *(float64 *)(va + H1_2(i));
5278 d = float64_muladd(e2, e1, d, 0, status);
5279 *(float64 *)(vd + H1_2(i)) = d;
5281 if (likely((pg >> (j & 63)) & 1)) {
5282 d = *(float64 *)(va + H1_2(j));
5283 d = float64_muladd(e4, e3, d, 0, status);
5284 *(float64 *)(vd + H1_2(j)) = d;
5286 } while (i & 63);
5287 } while (i != 0);
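/* In the sve_fcmla_zpzzz_* helpers above, simd_data(desc) holds the
 * two-bit rotation.  Bit 0 (flip) chooses which part of N feeds both
 * products, while neg_real and neg_imag flip the sign of the matching
 * part of M; together these implement the 0/90/180/270 degree forms
 * of FCMLA.
 */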
5291 * Load contiguous data, protected by a governing predicate.
5295 * Skip through a sequence of inactive elements in the guarding predicate @vg,
5296 * beginning at @reg_off bounded by @reg_max. Return the offset of the active
5297 * element >= @reg_off, or @reg_max if there were no active elements at all.
5299 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5300 intptr_t reg_max, int esz)
5302 uint64_t pg_mask = pred_esz_masks[esz];
5303 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5305 /* In normal usage, the first element is active. */
5306 if (likely(pg & 1)) {
5307 return reg_off;
5310 if (pg == 0) {
5311 reg_off &= -64;
5312 do {
5313 reg_off += 64;
5314 if (unlikely(reg_off >= reg_max)) {
5315 /* The entire predicate was false. */
5316 return reg_max;
5318 pg = vg[reg_off >> 6] & pg_mask;
5319 } while (pg == 0);
5321 reg_off += ctz64(pg);
5323 /* We should never see an out of range predicate bit set. */
5324 tcg_debug_assert(reg_off < reg_max);
5325 return reg_off;
5329 * Resolve the guest virtual address to info->host and info->flags.
5330 * If @nofault, return false if the page is invalid, otherwise
5331 * exit via page fault exception.
5334 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
5335 target_ulong addr, int mem_off, MMUAccessType access_type,
5336 int mmu_idx, uintptr_t retaddr)
5338 int flags;
5340 addr += mem_off;
5343 * User-only currently always issues with TBI. See the comment
5344 * above useronly_clean_ptr. Usually we clean this top byte away
5345 * during translation, but we can't do that for e.g. vector + imm
5346 * addressing modes.
5348 * We currently always enable TBI for user-only, and do not provide
5349 * a way to turn it off. So clean the pointer unconditionally here,
5350 * rather than looking the TBI setting up here or passing it down from above.
5352 addr = useronly_clean_ptr(addr);
5354 #ifdef CONFIG_USER_ONLY
5355 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
5356 &info->host, retaddr);
5357 #else
5358 CPUTLBEntryFull *full;
5359 flags = probe_access_full(env, addr, access_type, mmu_idx, nofault,
5360 &info->host, &full, retaddr);
5361 #endif
5362 info->flags = flags;
5364 if (flags & TLB_INVALID_MASK) {
5365 g_assert(nofault);
5366 return false;
5369 #ifdef CONFIG_USER_ONLY
5370 memset(&info->attrs, 0, sizeof(info->attrs));
5371 /* Require both ANON and MTE; see allocation_tag_mem(). */
5372 info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
5373 #else
5374 info->attrs = full->attrs;
5375 info->tagged = full->pte_attrs == 0xf0;
5376 #endif
5378 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5379 info->host -= mem_off;
5380 return true;
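/*
 * Illustrative sketch (editor's addition): a typical no-fault probe of a
 * single page with the helper above.  demo_probe_nofault() is a
 * hypothetical wrapper; with nofault == true the helper returns false for
 * an invalid page instead of raising a page fault, so the caller can fall
 * back to a slow path.
 */
static inline bool demo_probe_nofault(CPUARMState *env, target_ulong addr,
                                      uintptr_t ra, SVEHostPage *info)
{
    return sve_probe_page(info, true, env, addr, 0, MMU_DATA_LOAD,
                          cpu_mmu_index(env, false), ra);
}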
5384 * Find first active element on each page, and a loose bound for the
5385 * final element on each page. Identify any single element that spans
5386 * the page boundary. Return true if there are any active elements.
5388 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
5389 intptr_t reg_max, int esz, int msize)
5391 const int esize = 1 << esz;
5392 const uint64_t pg_mask = pred_esz_masks[esz];
5393 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5394 intptr_t mem_off_last, mem_off_split;
5395 intptr_t page_split, elt_split;
5396 intptr_t i;
5398 /* Set all of the element indices to -1, and the TLB data to 0. */
5399 memset(info, -1, offsetof(SVEContLdSt, page));
5400 memset(info->page, 0, sizeof(info->page));
5402 /* Gross scan over the entire predicate to find bounds. */
5403 i = 0;
5404 do {
5405 uint64_t pg = vg[i] & pg_mask;
5406 if (pg) {
5407 reg_off_last = i * 64 + 63 - clz64(pg);
5408 if (reg_off_first < 0) {
5409 reg_off_first = i * 64 + ctz64(pg);
5412 } while (++i * 64 < reg_max);
5414 if (unlikely(reg_off_first < 0)) {
5415 /* No active elements, no pages touched. */
5416 return false;
5418 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5420 info->reg_off_first[0] = reg_off_first;
5421 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5422 mem_off_last = (reg_off_last >> esz) * msize;
5424 page_split = -(addr | TARGET_PAGE_MASK);
5425 if (likely(mem_off_last + msize <= page_split)) {
5426 /* The entire operation fits within a single page. */
5427 info->reg_off_last[0] = reg_off_last;
5428 return true;
5431 info->page_split = page_split;
5432 elt_split = page_split / msize;
5433 reg_off_split = elt_split << esz;
5434 mem_off_split = elt_split * msize;
5437 * This is the last full element on the first page, but it is not
5438 * necessarily active. If there is no full element, i.e. the first
5439 * active element is the one that's split, this value remains -1.
5440 * It is useful as an iteration bound.
5442 if (elt_split != 0) {
5443 info->reg_off_last[0] = reg_off_split - esize;
5446 /* Determine if an unaligned element spans the pages. */
5447 if (page_split % msize != 0) {
5448 /* It is helpful to know if the split element is active. */
5449 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5450 info->reg_off_split = reg_off_split;
5451 info->mem_off_split = mem_off_split;
5453 if (reg_off_split == reg_off_last) {
5454 /* The page crossing element is last. */
5455 return true;
5458 reg_off_split += esize;
5459 mem_off_split += msize;
5463 * We do want the first active element on the second page, because
5464 * this may affect the address reported in an exception.
5466 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5467 tcg_debug_assert(reg_off_split <= reg_off_last);
5468 info->reg_off_first[1] = reg_off_split;
5469 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5470 info->reg_off_last[1] = reg_off_last;
5471 return true;
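/*
 * Illustrative sketch (editor's addition): the "bytes remaining in this
 * page" computation used for page_split above.  Assuming the usual 4KiB
 * TARGET_PAGE_SIZE, an addr whose low bits are 0xff0 yields 0x10 -- only
 * 16 bytes are left before the page boundary.  demo_bytes_to_page_end()
 * is a hypothetical name.
 */
static inline intptr_t demo_bytes_to_page_end(target_ulong addr)
{
    return -(addr | TARGET_PAGE_MASK);
}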
5475 * Resolve the guest virtual addresses to info->page[].
5476 * Control the generation of page faults with @fault. Return false if
5477 * there is no work to do, which can only happen with @fault == FAULT_NO.
5479 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5480 CPUARMState *env, target_ulong addr,
5481 MMUAccessType access_type, uintptr_t retaddr)
5483 int mmu_idx = cpu_mmu_index(env, false);
5484 int mem_off = info->mem_off_first[0];
5485 bool nofault = fault == FAULT_NO;
5486 bool have_work = true;
5488 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5489 access_type, mmu_idx, retaddr)) {
5490 /* No work to be done. */
5491 return false;
5494 if (likely(info->page_split < 0)) {
5495 /* The entire operation was on the one page. */
5496 return true;
5500 * If the second page is invalid, then we want the fault address to be
5501 * the first byte on that page which is accessed.
5503 if (info->mem_off_split >= 0) {
5505 * There is an element split across the pages. The fault address
5506 * should be the first byte of the second page.
5508 mem_off = info->page_split;
5510 * If the split element is also the first active element
5511 * of the vector, then: For first-fault we should continue
5512 * to generate faults for the second page. For no-fault,
5513 * we have work only if the second page is valid.
5515 if (info->mem_off_first[0] < info->mem_off_split) {
5516 nofault = FAULT_FIRST;
5517 have_work = false;
5519 } else {
5521 * There is no element split across the pages. The fault address
5522 * should be the first active element on the second page.
5524 mem_off = info->mem_off_first[1];
5526 * There must have been one active element on the first page,
5527 * so we're out of first-fault territory.
5529 nofault = fault != FAULT_ALL;
5532 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5533 access_type, mmu_idx, retaddr);
5534 return have_work;
5537 #ifndef CONFIG_USER_ONLY
5538 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5539 uint64_t *vg, target_ulong addr,
5540 int esize, int msize, int wp_access,
5541 uintptr_t retaddr)
5543 intptr_t mem_off, reg_off, reg_last;
5544 int flags0 = info->page[0].flags;
5545 int flags1 = info->page[1].flags;
5547 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5548 return;
5551 /* Indicate that watchpoints are handled. */
5552 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5553 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5555 if (flags0 & TLB_WATCHPOINT) {
5556 mem_off = info->mem_off_first[0];
5557 reg_off = info->reg_off_first[0];
5558 reg_last = info->reg_off_last[0];
5560 while (reg_off <= reg_last) {
5561 uint64_t pg = vg[reg_off >> 6];
5562 do {
5563 if ((pg >> (reg_off & 63)) & 1) {
5564 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5565 msize, info->page[0].attrs,
5566 wp_access, retaddr);
5568 reg_off += esize;
5569 mem_off += msize;
5570 } while (reg_off <= reg_last && (reg_off & 63));
5574 mem_off = info->mem_off_split;
5575 if (mem_off >= 0) {
5576 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5577 info->page[0].attrs, wp_access, retaddr);
5580 mem_off = info->mem_off_first[1];
5581 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5582 reg_off = info->reg_off_first[1];
5583 reg_last = info->reg_off_last[1];
5585 do {
5586 uint64_t pg = vg[reg_off >> 6];
5587 do {
5588 if ((pg >> (reg_off & 63)) & 1) {
5589 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5590 msize, info->page[1].attrs,
5591 wp_access, retaddr);
5593 reg_off += esize;
5594 mem_off += msize;
5595 } while (reg_off & 63);
5596 } while (reg_off <= reg_last);
5599 #endif
5601 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5602 uint64_t *vg, target_ulong addr, int esize,
5603 int msize, uint32_t mtedesc, uintptr_t ra)
5605 intptr_t mem_off, reg_off, reg_last;
5607 /* Process the page only if MemAttr == Tagged. */
5608 if (info->page[0].tagged) {
5609 mem_off = info->mem_off_first[0];
5610 reg_off = info->reg_off_first[0];
5611 reg_last = info->reg_off_split;
5612 if (reg_last < 0) {
5613 reg_last = info->reg_off_last[0];
5616 do {
5617 uint64_t pg = vg[reg_off >> 6];
5618 do {
5619 if ((pg >> (reg_off & 63)) & 1) {
5620 mte_check(env, mtedesc, addr, ra);
5622 reg_off += esize;
5623 mem_off += msize;
5624 } while (reg_off <= reg_last && (reg_off & 63));
5625 } while (reg_off <= reg_last);
5628 mem_off = info->mem_off_first[1];
5629 if (mem_off >= 0 && info->page[1].tagged) {
5630 reg_off = info->reg_off_first[1];
5631 reg_last = info->reg_off_last[1];
5633 do {
5634 uint64_t pg = vg[reg_off >> 6];
5635 do {
5636 if ((pg >> (reg_off & 63)) & 1) {
5637 mte_check(env, mtedesc, addr, ra);
5639 reg_off += esize;
5640 mem_off += msize;
5641 } while (reg_off & 63);
5642 } while (reg_off <= reg_last);
5647 * Common helper for all contiguous 1,2,3,4-register predicated loads.
5649 static inline QEMU_ALWAYS_INLINE
5650 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5651 uint32_t desc, const uintptr_t retaddr,
5652 const int esz, const int msz, const int N, uint32_t mtedesc,
5653 sve_ldst1_host_fn *host_fn,
5654 sve_ldst1_tlb_fn *tlb_fn)
5656 const unsigned rd = simd_data(desc);
5657 const intptr_t reg_max = simd_oprsz(desc);
5658 intptr_t reg_off, reg_last, mem_off;
5659 SVEContLdSt info;
5660 void *host;
5661 int flags, i;
5663 /* Find the active elements. */
5664 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5665 /* The entire predicate was false; no load occurs. */
5666 for (i = 0; i < N; ++i) {
5667 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5669 return;
5672 /* Probe the page(s). Exit with exception for any invalid page. */
5673 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5675 /* Handle watchpoints for all active elements. */
5676 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5677 BP_MEM_READ, retaddr);
5680 * Handle mte checks for all active elements.
5681 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5683 if (mtedesc) {
5684 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5685 mtedesc, retaddr);
5688 flags = info.page[0].flags | info.page[1].flags;
5689 if (unlikely(flags != 0)) {
5690 #ifdef CONFIG_USER_ONLY
5691 g_assert_not_reached();
5692 #else
5694 * At least one page includes MMIO.
5695 * Any bus operation can fail with cpu_transaction_failed,
5696 * which for ARM will raise SyncExternal. Perform the load
5697 * into scratch memory to preserve register state until the end.
5699 ARMVectorReg scratch[4] = { };
5701 mem_off = info.mem_off_first[0];
5702 reg_off = info.reg_off_first[0];
5703 reg_last = info.reg_off_last[1];
5704 if (reg_last < 0) {
5705 reg_last = info.reg_off_split;
5706 if (reg_last < 0) {
5707 reg_last = info.reg_off_last[0];
5711 do {
5712 uint64_t pg = vg[reg_off >> 6];
5713 do {
5714 if ((pg >> (reg_off & 63)) & 1) {
5715 for (i = 0; i < N; ++i) {
5716 tlb_fn(env, &scratch[i], reg_off,
5717 addr + mem_off + (i << msz), retaddr);
5720 reg_off += 1 << esz;
5721 mem_off += N << msz;
5722 } while (reg_off & 63);
5723 } while (reg_off <= reg_last);
5725 for (i = 0; i < N; ++i) {
5726 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5728 return;
5729 #endif
5732 /* The entire operation is in RAM, on valid pages. */
5734 for (i = 0; i < N; ++i) {
5735 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5738 mem_off = info.mem_off_first[0];
5739 reg_off = info.reg_off_first[0];
5740 reg_last = info.reg_off_last[0];
5741 host = info.page[0].host;
5743 while (reg_off <= reg_last) {
5744 uint64_t pg = vg[reg_off >> 6];
5745 do {
5746 if ((pg >> (reg_off & 63)) & 1) {
5747 for (i = 0; i < N; ++i) {
5748 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5749 host + mem_off + (i << msz));
5752 reg_off += 1 << esz;
5753 mem_off += N << msz;
5754 } while (reg_off <= reg_last && (reg_off & 63));
5758 * Use the slow path to manage the cross-page misalignment.
5759 * But we know this is RAM and cannot trap.
5761 mem_off = info.mem_off_split;
5762 if (unlikely(mem_off >= 0)) {
5763 reg_off = info.reg_off_split;
5764 for (i = 0; i < N; ++i) {
5765 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5766 addr + mem_off + (i << msz), retaddr);
5770 mem_off = info.mem_off_first[1];
5771 if (unlikely(mem_off >= 0)) {
5772 reg_off = info.reg_off_first[1];
5773 reg_last = info.reg_off_last[1];
5774 host = info.page[1].host;
5776 do {
5777 uint64_t pg = vg[reg_off >> 6];
5778 do {
5779 if ((pg >> (reg_off & 63)) & 1) {
5780 for (i = 0; i < N; ++i) {
5781 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5782 host + mem_off + (i << msz));
5785 reg_off += 1 << esz;
5786 mem_off += N << msz;
5787 } while (reg_off & 63);
5788 } while (reg_off <= reg_last);
5792 static inline QEMU_ALWAYS_INLINE
5793 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5794 uint32_t desc, const uintptr_t ra,
5795 const int esz, const int msz, const int N,
5796 sve_ldst1_host_fn *host_fn,
5797 sve_ldst1_tlb_fn *tlb_fn)
5799 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5800 int bit55 = extract64(addr, 55, 1);
5802 /* Remove mtedesc from the normal sve descriptor. */
5803 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5805 /* Perform gross MTE suppression early. */
5806 if (!tbi_check(desc, bit55) ||
5807 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5808 mtedesc = 0;
5811 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
5814 #define DO_LD1_1(NAME, ESZ) \
5815 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
5816 target_ulong addr, uint32_t desc) \
5818 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
5819 sve_##NAME##_host, sve_##NAME##_tlb); \
5821 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
5822 target_ulong addr, uint32_t desc) \
5824 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
5825 sve_##NAME##_host, sve_##NAME##_tlb); \
5828 #define DO_LD1_2(NAME, ESZ, MSZ) \
5829 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
5830 target_ulong addr, uint32_t desc) \
5832 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
5833 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
5835 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
5836 target_ulong addr, uint32_t desc) \
5838 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
5839 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
5841 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
5842 target_ulong addr, uint32_t desc) \
5844 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
5845 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
5847 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
5848 target_ulong addr, uint32_t desc) \
5850 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
5851 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
5854 DO_LD1_1(ld1bb, MO_8)
5855 DO_LD1_1(ld1bhu, MO_16)
5856 DO_LD1_1(ld1bhs, MO_16)
5857 DO_LD1_1(ld1bsu, MO_32)
5858 DO_LD1_1(ld1bss, MO_32)
5859 DO_LD1_1(ld1bdu, MO_64)
5860 DO_LD1_1(ld1bds, MO_64)
5862 DO_LD1_2(ld1hh, MO_16, MO_16)
5863 DO_LD1_2(ld1hsu, MO_32, MO_16)
5864 DO_LD1_2(ld1hss, MO_32, MO_16)
5865 DO_LD1_2(ld1hdu, MO_64, MO_16)
5866 DO_LD1_2(ld1hds, MO_64, MO_16)
5868 DO_LD1_2(ld1ss, MO_32, MO_32)
5869 DO_LD1_2(ld1sdu, MO_64, MO_32)
5870 DO_LD1_2(ld1sds, MO_64, MO_32)
5872 DO_LD1_2(ld1dd, MO_64, MO_64)
5874 #undef DO_LD1_1
5875 #undef DO_LD1_2
5877 #define DO_LDN_1(N) \
5878 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
5879 target_ulong addr, uint32_t desc) \
5881 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
5882 sve_ld1bb_host, sve_ld1bb_tlb); \
5884 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
5885 target_ulong addr, uint32_t desc) \
5887 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
5888 sve_ld1bb_host, sve_ld1bb_tlb); \
5891 #define DO_LDN_2(N, SUFF, ESZ) \
5892 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
5893 target_ulong addr, uint32_t desc) \
5895 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
5896 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
5898 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
5899 target_ulong addr, uint32_t desc) \
5901 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
5902 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
5904 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
5905 target_ulong addr, uint32_t desc) \
5907 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
5908 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
5910 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
5911 target_ulong addr, uint32_t desc) \
5913 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
5914 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
5917 DO_LDN_1(2)
5918 DO_LDN_1(3)
5919 DO_LDN_1(4)
5921 DO_LDN_2(2, hh, MO_16)
5922 DO_LDN_2(3, hh, MO_16)
5923 DO_LDN_2(4, hh, MO_16)
5925 DO_LDN_2(2, ss, MO_32)
5926 DO_LDN_2(3, ss, MO_32)
5927 DO_LDN_2(4, ss, MO_32)
5929 DO_LDN_2(2, dd, MO_64)
5930 DO_LDN_2(3, dd, MO_64)
5931 DO_LDN_2(4, dd, MO_64)
5933 #undef DO_LDN_1
5934 #undef DO_LDN_2
5937 * Load contiguous data, first-fault and no-fault.
5939 * For user-only, one could argue that we should hold the mmap_lock during
5940 * the operation so that there is no race between page_check_range and the
5941 * load operation. However, unmapping pages out from under a running thread
5942 * is extraordinarily unlikely. This theoretical race condition also affects
5943 * linux-user/ in its get_user/put_user macros.
5945 * TODO: Construct some helpers, written in assembly, that interact with
5946 * host_signal_handler to produce memory ops which can properly report errors
5947 * without racing.
5950 /* Fault on byte I. All bits in FFR from I are cleared. The vector
5951 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
5952 * option, which leaves subsequent data unchanged.
5954 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
5956 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
5958 if (i & 63) {
5959 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
5960 i = ROUND_UP(i, 64);
5962 for (; i < oprsz; i += 64) {
5963 ffr[i / 64] = 0;
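/*
 * Illustrative sketch (editor's addition): which bits of the FFR word
 * containing element byte offset @i survive record_fault().  Bits below
 * i within that word are kept; bit i and everything above are cleared,
 * and all later FFR words are zeroed.  demo_ffr_keep_mask() is a
 * hypothetical name.
 */
static inline uint64_t demo_ffr_keep_mask(uintptr_t i)
{
    return (i & 63) ? MAKE_64BIT_MASK(0, i & 63) : 0;
}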
5968 * Common helper for all contiguous no-fault and first-fault loads.
5970 static inline QEMU_ALWAYS_INLINE
5971 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
5972 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
5973 const int esz, const int msz, const SVEContFault fault,
5974 sve_ldst1_host_fn *host_fn,
5975 sve_ldst1_tlb_fn *tlb_fn)
5977 const unsigned rd = simd_data(desc);
5978 void *vd = &env->vfp.zregs[rd];
5979 const intptr_t reg_max = simd_oprsz(desc);
5980 intptr_t reg_off, mem_off, reg_last;
5981 SVEContLdSt info;
5982 int flags;
5983 void *host;
5985 /* Find the active elements. */
5986 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
5987 /* The entire predicate was false; no load occurs. */
5988 memset(vd, 0, reg_max);
5989 return;
5991 reg_off = info.reg_off_first[0];
5993 /* Probe the page(s). */
5994 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
5995 /* Fault on first element. */
5996 tcg_debug_assert(fault == FAULT_NO);
5997 memset(vd, 0, reg_max);
5998 goto do_fault;
6001 mem_off = info.mem_off_first[0];
6002 flags = info.page[0].flags;
6005 * Disable MTE checking if the Tagged bit is not set. Since TBI must
6006 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6008 if (!info.page[0].tagged) {
6009 mtedesc = 0;
6012 if (fault == FAULT_FIRST) {
6013 /* Trapping mte check for the first-fault element. */
6014 if (mtedesc) {
6015 mte_check(env, mtedesc, addr + mem_off, retaddr);
6019 * Special handling of the first active element,
6020 * if it crosses a page boundary or is MMIO.
6022 bool is_split = mem_off == info.mem_off_split;
6023 if (unlikely(flags != 0) || unlikely(is_split)) {
6025 * Use the slow path for cross-page handling.
6026 * Might trap for MMIO or watchpoints.
6028 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6030 /* After any fault, zero the other elements. */
6031 swap_memzero(vd, reg_off);
6032 reg_off += 1 << esz;
6033 mem_off += 1 << msz;
6034 swap_memzero(vd + reg_off, reg_max - reg_off);
6036 if (is_split) {
6037 goto second_page;
6039 } else {
6040 memset(vd, 0, reg_max);
6042 } else {
6043 memset(vd, 0, reg_max);
6044 if (unlikely(mem_off == info.mem_off_split)) {
6045 /* The first active element crosses a page boundary. */
6046 flags |= info.page[1].flags;
6047 if (unlikely(flags & TLB_MMIO)) {
6048 /* Some page is MMIO, see below. */
6049 goto do_fault;
6051 if (unlikely(flags & TLB_WATCHPOINT) &&
6052 (cpu_watchpoint_address_matches
6053 (env_cpu(env), addr + mem_off, 1 << msz)
6054 & BP_MEM_READ)) {
6055 /* Watchpoint hit, see below. */
6056 goto do_fault;
6058 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6059 goto do_fault;
6062 * Use the slow path for cross-page handling.
6063 * This is RAM, without a watchpoint, and will not trap.
6065 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6066 goto second_page;
6071 * From this point on, all memory operations are MemSingleNF.
6073 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6074 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6076 * Unfortunately we do not have access to the memory attributes from the
6077 * PTE to tell Device memory from Normal memory. So we make a mostly
6078 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6079 * This gives the right answer for the common cases of "Normal memory,
6080 * backed by host RAM" and "Device memory, backed by MMIO".
6081 * The architecture allows us to suppress an NF load and return
6082 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6083 * case of "Normal memory, backed by MMIO" is permitted. The case we
6084 * get wrong is "Device memory, backed by host RAM", for which we
6085 * should return (UNKNOWN, FAULT) but do not.
6087 * Similarly, CPU_BP breakpoints would raise exceptions, and so
6088 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
6089 * architectural breakpoints the same.
6091 if (unlikely(flags & TLB_MMIO)) {
6092 goto do_fault;
6095 reg_last = info.reg_off_last[0];
6096 host = info.page[0].host;
6098 do {
6099 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6100 do {
6101 if ((pg >> (reg_off & 63)) & 1) {
6102 if (unlikely(flags & TLB_WATCHPOINT) &&
6103 (cpu_watchpoint_address_matches
6104 (env_cpu(env), addr + mem_off, 1 << msz)
6105 & BP_MEM_READ)) {
6106 goto do_fault;
6108 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6109 goto do_fault;
6111 host_fn(vd, reg_off, host + mem_off);
6113 reg_off += 1 << esz;
6114 mem_off += 1 << msz;
6115 } while (reg_off <= reg_last && (reg_off & 63));
6116 } while (reg_off <= reg_last);
6119 * MemSingleNF is allowed to fail for any reason. We have special
6120 * code above to handle the first element crossing a page boundary.
6121 * As an implementation choice, decline to handle a cross-page element
6122 * in any other position.
6124 reg_off = info.reg_off_split;
6125 if (reg_off >= 0) {
6126 goto do_fault;
6129 second_page:
6130 reg_off = info.reg_off_first[1];
6131 if (likely(reg_off < 0)) {
6132 /* No active elements on the second page. All done. */
6133 return;
6137 * MemSingleNF is allowed to fail for any reason. As an implementation
6138 * choice, decline to handle elements on the second page. This should
6139 * be low frequency as the guest walks through memory -- the next
6140 * iteration of the guest's loop should be aligned on the page boundary,
6141 * and then all following iterations will stay aligned.
6144 do_fault:
6145 record_fault(env, reg_off, reg_max);
6148 static inline QEMU_ALWAYS_INLINE
6149 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6150 uint32_t desc, const uintptr_t retaddr,
6151 const int esz, const int msz, const SVEContFault fault,
6152 sve_ldst1_host_fn *host_fn,
6153 sve_ldst1_tlb_fn *tlb_fn)
6155 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6156 int bit55 = extract64(addr, 55, 1);
6158 /* Remove mtedesc from the normal sve descriptor. */
6159 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6161 /* Perform gross MTE suppression early. */
6162 if (!tbi_check(desc, bit55) ||
6163 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6164 mtedesc = 0;
6167 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6168 esz, msz, fault, host_fn, tlb_fn);
6171 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
6172 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
6173 target_ulong addr, uint32_t desc) \
6175 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6176 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6178 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
6179 target_ulong addr, uint32_t desc) \
6181 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6182 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6184 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
6185 target_ulong addr, uint32_t desc) \
6187 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6188 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6190 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
6191 target_ulong addr, uint32_t desc) \
6193 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6194 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6197 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
6198 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
6199 target_ulong addr, uint32_t desc) \
6201 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6202 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6204 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
6205 target_ulong addr, uint32_t desc) \
6207 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6208 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6210 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
6211 target_ulong addr, uint32_t desc) \
6213 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6214 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6216 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
6217 target_ulong addr, uint32_t desc) \
6219 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6220 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6222 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6223 target_ulong addr, uint32_t desc) \
6225 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6226 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6228 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6229 target_ulong addr, uint32_t desc) \
6231 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6232 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6234 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6235 target_ulong addr, uint32_t desc) \
6237 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6238 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6240 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6241 target_ulong addr, uint32_t desc) \
6243 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6244 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6247 DO_LDFF1_LDNF1_1(bb, MO_8)
6248 DO_LDFF1_LDNF1_1(bhu, MO_16)
6249 DO_LDFF1_LDNF1_1(bhs, MO_16)
6250 DO_LDFF1_LDNF1_1(bsu, MO_32)
6251 DO_LDFF1_LDNF1_1(bss, MO_32)
6252 DO_LDFF1_LDNF1_1(bdu, MO_64)
6253 DO_LDFF1_LDNF1_1(bds, MO_64)
6255 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
6256 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6257 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6258 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6259 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6261 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
6262 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6263 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6265 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
6267 #undef DO_LDFF1_LDNF1_1
6268 #undef DO_LDFF1_LDNF1_2
6271 * Common helper for all contiguous 1,2,3,4-register predicated stores.
6274 static inline QEMU_ALWAYS_INLINE
6275 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6276 uint32_t desc, const uintptr_t retaddr,
6277 const int esz, const int msz, const int N, uint32_t mtedesc,
6278 sve_ldst1_host_fn *host_fn,
6279 sve_ldst1_tlb_fn *tlb_fn)
6281 const unsigned rd = simd_data(desc);
6282 const intptr_t reg_max = simd_oprsz(desc);
6283 intptr_t reg_off, reg_last, mem_off;
6284 SVEContLdSt info;
6285 void *host;
6286 int i, flags;
6288 /* Find the active elements. */
6289 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6290 /* The entire predicate was false; no store occurs. */
6291 return;
6294 /* Probe the page(s). Exit with exception for any invalid page. */
6295 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6297 /* Handle watchpoints for all active elements. */
6298 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6299 BP_MEM_WRITE, retaddr);
6302 * Handle mte checks for all active elements.
6303 * Since TBI must be set for MTE, !mtedesc => !mte_active.
6305 if (mtedesc) {
6306 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6307 mtedesc, retaddr);
6310 flags = info.page[0].flags | info.page[1].flags;
6311 if (unlikely(flags != 0)) {
6312 #ifdef CONFIG_USER_ONLY
6313 g_assert_not_reached();
6314 #else
6316 * At least one page includes MMIO.
6317 * Any bus operation can fail with cpu_transaction_failed,
6318 * which for ARM will raise SyncExternal. We cannot avoid
6319 * this fault and will leave with the store incomplete.
6321 mem_off = info.mem_off_first[0];
6322 reg_off = info.reg_off_first[0];
6323 reg_last = info.reg_off_last[1];
6324 if (reg_last < 0) {
6325 reg_last = info.reg_off_split;
6326 if (reg_last < 0) {
6327 reg_last = info.reg_off_last[0];
6331 do {
6332 uint64_t pg = vg[reg_off >> 6];
6333 do {
6334 if ((pg >> (reg_off & 63)) & 1) {
6335 for (i = 0; i < N; ++i) {
6336 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6337 addr + mem_off + (i << msz), retaddr);
6340 reg_off += 1 << esz;
6341 mem_off += N << msz;
6342 } while (reg_off & 63);
6343 } while (reg_off <= reg_last);
6344 return;
6345 #endif
6348 mem_off = info.mem_off_first[0];
6349 reg_off = info.reg_off_first[0];
6350 reg_last = info.reg_off_last[0];
6351 host = info.page[0].host;
6353 while (reg_off <= reg_last) {
6354 uint64_t pg = vg[reg_off >> 6];
6355 do {
6356 if ((pg >> (reg_off & 63)) & 1) {
6357 for (i = 0; i < N; ++i) {
6358 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6359 host + mem_off + (i << msz));
6362 reg_off += 1 << esz;
6363 mem_off += N << msz;
6364 } while (reg_off <= reg_last && (reg_off & 63));
6368 * Use the slow path to manage the cross-page misalignment.
6369 * But we know this is RAM and cannot trap.
6371 mem_off = info.mem_off_split;
6372 if (unlikely(mem_off >= 0)) {
6373 reg_off = info.reg_off_split;
6374 for (i = 0; i < N; ++i) {
6375 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6376 addr + mem_off + (i << msz), retaddr);
6380 mem_off = info.mem_off_first[1];
6381 if (unlikely(mem_off >= 0)) {
6382 reg_off = info.reg_off_first[1];
6383 reg_last = info.reg_off_last[1];
6384 host = info.page[1].host;
6386 do {
6387 uint64_t pg = vg[reg_off >> 6];
6388 do {
6389 if ((pg >> (reg_off & 63)) & 1) {
6390 for (i = 0; i < N; ++i) {
6391 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6392 host + mem_off + (i << msz));
6395 reg_off += 1 << esz;
6396 mem_off += N << msz;
6397 } while (reg_off & 63);
6398 } while (reg_off <= reg_last);
6402 static inline QEMU_ALWAYS_INLINE
6403 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6404 uint32_t desc, const uintptr_t ra,
6405 const int esz, const int msz, const int N,
6406 sve_ldst1_host_fn *host_fn,
6407 sve_ldst1_tlb_fn *tlb_fn)
6409 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6410 int bit55 = extract64(addr, 55, 1);
6412 /* Remove mtedesc from the normal sve descriptor. */
6413 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6415 /* Perform gross MTE suppression early. */
6416 if (!tbi_check(desc, bit55) ||
6417 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6418 mtedesc = 0;
6421 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6424 #define DO_STN_1(N, NAME, ESZ) \
6425 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
6426 target_ulong addr, uint32_t desc) \
6428 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
6429 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6431 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
6432 target_ulong addr, uint32_t desc) \
6434 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
6435 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6438 #define DO_STN_2(N, NAME, ESZ, MSZ) \
6439 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
6440 target_ulong addr, uint32_t desc) \
6442 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6443 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6445 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
6446 target_ulong addr, uint32_t desc) \
6448 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6449 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6451 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6452 target_ulong addr, uint32_t desc) \
6454 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6455 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6457 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6458 target_ulong addr, uint32_t desc) \
6460 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6461 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6464 DO_STN_1(1, bb, MO_8)
6465 DO_STN_1(1, bh, MO_16)
6466 DO_STN_1(1, bs, MO_32)
6467 DO_STN_1(1, bd, MO_64)
6468 DO_STN_1(2, bb, MO_8)
6469 DO_STN_1(3, bb, MO_8)
6470 DO_STN_1(4, bb, MO_8)
6472 DO_STN_2(1, hh, MO_16, MO_16)
6473 DO_STN_2(1, hs, MO_32, MO_16)
6474 DO_STN_2(1, hd, MO_64, MO_16)
6475 DO_STN_2(2, hh, MO_16, MO_16)
6476 DO_STN_2(3, hh, MO_16, MO_16)
6477 DO_STN_2(4, hh, MO_16, MO_16)
6479 DO_STN_2(1, ss, MO_32, MO_32)
6480 DO_STN_2(1, sd, MO_64, MO_32)
6481 DO_STN_2(2, ss, MO_32, MO_32)
6482 DO_STN_2(3, ss, MO_32, MO_32)
6483 DO_STN_2(4, ss, MO_32, MO_32)
6485 DO_STN_2(1, dd, MO_64, MO_64)
6486 DO_STN_2(2, dd, MO_64, MO_64)
6487 DO_STN_2(3, dd, MO_64, MO_64)
6488 DO_STN_2(4, dd, MO_64, MO_64)
6490 #undef DO_STN_1
6491 #undef DO_STN_2
6494 * Loads with a vector index.
6498 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6500 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6502 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6504 return *(uint32_t *)(reg + H1_4(reg_ofs));
6507 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6509 return *(int32_t *)(reg + H1_4(reg_ofs));
6512 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6514 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6517 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6519 return (int32_t)*(uint64_t *)(reg + reg_ofs);
6522 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6524 return *(uint64_t *)(reg + reg_ofs);
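/*
 * Illustrative sketch (editor's addition): how one gather/scatter element
 * address is formed from the scalar base, the offset vector element and
 * the scale encoded in the descriptor, matching the
 * "base + (off_fn(vm, reg_off) << scale)" expression in the loops below.
 * demo_gather_addr() is a hypothetical name.
 */
static inline target_ulong demo_gather_addr(target_ulong base, void *vm,
                                            intptr_t reg_off, int scale,
                                            zreg_off_fn *off_fn)
{
    return base + (off_fn(vm, reg_off) << scale);
}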
6527 static inline QEMU_ALWAYS_INLINE
6528 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6529 target_ulong base, uint32_t desc, uintptr_t retaddr,
6530 uint32_t mtedesc, int esize, int msize,
6531 zreg_off_fn *off_fn,
6532 sve_ldst1_host_fn *host_fn,
6533 sve_ldst1_tlb_fn *tlb_fn)
6535 const int mmu_idx = cpu_mmu_index(env, false);
6536 const intptr_t reg_max = simd_oprsz(desc);
6537 const int scale = simd_data(desc);
6538 ARMVectorReg scratch;
6539 intptr_t reg_off;
6540 SVEHostPage info, info2;
6542 memset(&scratch, 0, reg_max);
6543 reg_off = 0;
6544 do {
6545 uint64_t pg = vg[reg_off >> 6];
6546 do {
6547 if (likely(pg & 1)) {
6548 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6549 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6551 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6552 mmu_idx, retaddr);
6554 if (likely(in_page >= msize)) {
6555 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6556 cpu_check_watchpoint(env_cpu(env), addr, msize,
6557 info.attrs, BP_MEM_READ, retaddr);
6559 if (mtedesc && info.tagged) {
6560 mte_check(env, mtedesc, addr, retaddr);
6562 if (unlikely(info.flags & TLB_MMIO)) {
6563 tlb_fn(env, &scratch, reg_off, addr, retaddr);
6564 } else {
6565 host_fn(&scratch, reg_off, info.host);
6567 } else {
6568 /* Element crosses the page boundary. */
6569 sve_probe_page(&info2, false, env, addr + in_page, 0,
6570 MMU_DATA_LOAD, mmu_idx, retaddr);
6571 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6572 cpu_check_watchpoint(env_cpu(env), addr,
6573 msize, info.attrs,
6574 BP_MEM_READ, retaddr);
6576 if (mtedesc && info.tagged) {
6577 mte_check(env, mtedesc, addr, retaddr);
6579 tlb_fn(env, &scratch, reg_off, addr, retaddr);
6582 reg_off += esize;
6583 pg >>= esize;
6584 } while (reg_off & 63);
6585 } while (reg_off < reg_max);
6587 /* Wait until all exceptions have been raised to write back. */
6588 memcpy(vd, &scratch, reg_max);
6591 static inline QEMU_ALWAYS_INLINE
6592 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6593 target_ulong base, uint32_t desc, uintptr_t retaddr,
6594 int esize, int msize, zreg_off_fn *off_fn,
6595 sve_ldst1_host_fn *host_fn,
6596 sve_ldst1_tlb_fn *tlb_fn)
6598 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6599 /* Remove mtedesc from the normal sve descriptor. */
6600 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6603 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6604 * offset base entirely over the address space hole to change the
6605 * pointer tag, or change the bit55 selector. So we could here
6606 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6608 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6609 esize, msize, off_fn, host_fn, tlb_fn);
6612 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6613 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6614 void *vm, target_ulong base, uint32_t desc) \
6616 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
6617 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6619 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6620 void *vm, target_ulong base, uint32_t desc) \
6622 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
6623 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6626 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6627 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6628 void *vm, target_ulong base, uint32_t desc) \
6630 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
6631 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6633 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6634 void *vm, target_ulong base, uint32_t desc) \
6636 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
6637 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6640 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6641 DO_LD1_ZPZ_S(bsu, zss, MO_8)
6642 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6643 DO_LD1_ZPZ_D(bdu, zss, MO_8)
6644 DO_LD1_ZPZ_D(bdu, zd, MO_8)
6646 DO_LD1_ZPZ_S(bss, zsu, MO_8)
6647 DO_LD1_ZPZ_S(bss, zss, MO_8)
6648 DO_LD1_ZPZ_D(bds, zsu, MO_8)
6649 DO_LD1_ZPZ_D(bds, zss, MO_8)
6650 DO_LD1_ZPZ_D(bds, zd, MO_8)
6652 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6653 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6654 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6655 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6656 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6658 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6659 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6660 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6661 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6662 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6664 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6665 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6666 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6667 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6668 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6670 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6671 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6672 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6673 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6674 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6676 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6677 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6678 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6679 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6680 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6682 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6683 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6684 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6685 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6686 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6688 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6689 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6690 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6692 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6693 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6694 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6696 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6697 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6698 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6700 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6701 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6702 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6704 #undef DO_LD1_ZPZ_S
6705 #undef DO_LD1_ZPZ_D
6707 /* First-fault loads with a vector index. */
6710 * Common helpers for all gather first-faulting loads.
6713 static inline QEMU_ALWAYS_INLINE
6714 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6715 target_ulong base, uint32_t desc, uintptr_t retaddr,
6716 uint32_t mtedesc, const int esz, const int msz,
6717 zreg_off_fn *off_fn,
6718 sve_ldst1_host_fn *host_fn,
6719 sve_ldst1_tlb_fn *tlb_fn)
6721 const int mmu_idx = cpu_mmu_index(env, false);
6722 const intptr_t reg_max = simd_oprsz(desc);
6723 const int scale = simd_data(desc);
6724 const int esize = 1 << esz;
6725 const int msize = 1 << msz;
6726 intptr_t reg_off;
6727 SVEHostPage info;
6728 target_ulong addr, in_page;
6730 /* Skip to the first true predicate. */
6731 reg_off = find_next_active(vg, 0, reg_max, esz);
6732 if (unlikely(reg_off >= reg_max)) {
6733 /* The entire predicate was false; no load occurs. */
6734 memset(vd, 0, reg_max);
6735 return;
6739 * Probe the first element, allowing faults.
6741 addr = base + (off_fn(vm, reg_off) << scale);
6742 if (mtedesc) {
6743 mte_check(env, mtedesc, addr, retaddr);
6745 tlb_fn(env, vd, reg_off, addr, retaddr);
6747 /* After any fault, zero the other elements. */
6748 swap_memzero(vd, reg_off);
6749 reg_off += esize;
6750 swap_memzero(vd + reg_off, reg_max - reg_off);
6753 * Probe the remaining elements, not allowing faults.
6755 while (reg_off < reg_max) {
6756 uint64_t pg = vg[reg_off >> 6];
6757 do {
6758 if (likely((pg >> (reg_off & 63)) & 1)) {
6759 addr = base + (off_fn(vm, reg_off) << scale);
6760 in_page = -(addr | TARGET_PAGE_MASK);
6762 if (unlikely(in_page < msize)) {
6763 /* Stop if the element crosses a page boundary. */
6764 goto fault;
6767 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6768 mmu_idx, retaddr);
6769 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6770 goto fault;
6772 if (unlikely(info.flags & TLB_WATCHPOINT) &&
6773 (cpu_watchpoint_address_matches
6774 (env_cpu(env), addr, msize) & BP_MEM_READ)) {
6775 goto fault;
6777 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
6778 goto fault;
6781 host_fn(vd, reg_off, info.host);
6783 reg_off += esize;
6784 } while (reg_off & 63);
6786 return;
6788 fault:
6789 record_fault(env, reg_off, reg_max);
6792 static inline QEMU_ALWAYS_INLINE
6793 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6794 target_ulong base, uint32_t desc, uintptr_t retaddr,
6795 const int esz, const int msz,
6796 zreg_off_fn *off_fn,
6797 sve_ldst1_host_fn *host_fn,
6798 sve_ldst1_tlb_fn *tlb_fn)
6800 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6801 /* Remove mtedesc from the normal sve descriptor. */
6802 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6805 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6806 * offset base entirely over the address space hole to change the
6807 * pointer tag, or change the bit55 selector. So we could here
6808 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6810 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6811 esz, msz, off_fn, host_fn, tlb_fn);
6814 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
6815 void HELPER(sve_ldff##MEM##_##OFS) \
6816 (CPUARMState *env, void *vd, void *vg, \
6817 void *vm, target_ulong base, uint32_t desc) \
6819 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
6820 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6822 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6823 (CPUARMState *env, void *vd, void *vg, \
6824 void *vm, target_ulong base, uint32_t desc) \
6826 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
6827 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6830 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
6831 void HELPER(sve_ldff##MEM##_##OFS) \
6832 (CPUARMState *env, void *vd, void *vg, \
6833 void *vm, target_ulong base, uint32_t desc) \
6835 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
6836 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6838 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6839 (CPUARMState *env, void *vd, void *vg, \
6840 void *vm, target_ulong base, uint32_t desc) \
6842 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
6843 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6846 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
6847 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
6848 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
6849 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
6850 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
6852 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
6853 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
6854 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
6855 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
6856 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
6858 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
6859 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
6860 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
6861 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
6862 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
6864 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
6865 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
6866 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
6867 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
6868 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
6870 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
6871 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
6872 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
6873 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
6874 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
6876 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
6877 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
6878 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
6879 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
6880 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
6882 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
6883 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
6884 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
6885 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
6886 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
6888 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
6889 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
6890 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
6891 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
6892 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
6894 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
6895 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
6896 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
6898 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
6899 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
6900 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
6902 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
6903 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
6904 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
6906 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
6907 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
6908 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
6910 /* Stores with a vector index. */
6912 static inline QEMU_ALWAYS_INLINE
6913 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6914 target_ulong base, uint32_t desc, uintptr_t retaddr,
6915 uint32_t mtedesc, int esize, int msize,
6916 zreg_off_fn *off_fn,
6917 sve_ldst1_host_fn *host_fn,
6918 sve_ldst1_tlb_fn *tlb_fn)
6920 const int mmu_idx = cpu_mmu_index(env, false);
6921 const intptr_t reg_max = simd_oprsz(desc);
6922 const int scale = simd_data(desc);
6923 void *host[ARM_MAX_VQ * 4];
6924 intptr_t reg_off, i;
6925 SVEHostPage info, info2;
6928 * Probe all of the elements for host addresses and flags.
6930 i = reg_off = 0;
6931 do {
6932 uint64_t pg = vg[reg_off >> 6];
6933 do {
6934 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6935 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6937 host[i] = NULL;
6938 if (likely((pg >> (reg_off & 63)) & 1)) {
6939 if (likely(in_page >= msize)) {
6940 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
6941 mmu_idx, retaddr);
6942 if (!(info.flags & TLB_MMIO)) {
6943 host[i] = info.host;
6945 } else {
6947 * Element crosses the page boundary.
6948 * Probe both pages, but do not record the host address,
6949 * so that we use the slow path.
6951 sve_probe_page(&info, false, env, addr, 0,
6952 MMU_DATA_STORE, mmu_idx, retaddr);
6953 sve_probe_page(&info2, false, env, addr + in_page, 0,
6954 MMU_DATA_STORE, mmu_idx, retaddr);
6955 info.flags |= info2.flags;
6958 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6959 cpu_check_watchpoint(env_cpu(env), addr, msize,
6960 info.attrs, BP_MEM_WRITE, retaddr);
6963 if (mtedesc && info.tagged) {
6964 mte_check(env, mtedesc, addr, retaddr);
6967 i += 1;
6968 reg_off += esize;
6969 } while (reg_off & 63);
6970 } while (reg_off < reg_max);
6973 * Now that we have recognized all exceptions except SyncExternal
6974 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
6976 * Note for the common case of an element in RAM, not crossing a page
6977 * boundary, we have stored the host address in host[]. This doubles
6978 * as a first-level check against the predicate, since only enabled
6979 * elements have non-null host addresses.
6981 i = reg_off = 0;
6982 do {
6983 void *h = host[i];
6984 if (likely(h != NULL)) {
6985 host_fn(vd, reg_off, h);
6986 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
6987 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6988 tlb_fn(env, vd, reg_off, addr, retaddr);
6990 i += 1;
6991 reg_off += esize;
6992 } while (reg_off < reg_max);
6995 static inline QEMU_ALWAYS_INLINE
6996 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6997 target_ulong base, uint32_t desc, uintptr_t retaddr,
6998 int esize, int msize, zreg_off_fn *off_fn,
6999 sve_ldst1_host_fn *host_fn,
7000 sve_ldst1_tlb_fn *tlb_fn)
7002 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7003 /* Remove mtedesc from the normal sve descriptor. */
7004 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7007 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7008 * offset base entirely over the address space hole to change the
7009 * pointer tag, or change the bit55 selector. So we could here
7010 * examine TBI + TCMA like we do for sve_ldN_r_mte().
7012 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7013 esize, msize, off_fn, host_fn, tlb_fn);
7016 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
7017 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7018 void *vm, target_ulong base, uint32_t desc) \
7020 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
7021 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7023 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7024 void *vm, target_ulong base, uint32_t desc) \
7026 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
7027 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7030 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
7031 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7032 void *vm, target_ulong base, uint32_t desc) \
7034 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
7035 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7037 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7038 void *vm, target_ulong base, uint32_t desc) \
7040 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
7041 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7044 DO_ST1_ZPZ_S(bs, zsu, MO_8)
7045 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7046 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7047 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7048 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7050 DO_ST1_ZPZ_S(bs, zss, MO_8)
7051 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7052 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7053 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7054 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7056 DO_ST1_ZPZ_D(bd, zsu, MO_8)
7057 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7058 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7059 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7060 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7061 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7062 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7064 DO_ST1_ZPZ_D(bd, zss, MO_8)
7065 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7066 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7067 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7068 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7069 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7070 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7072 DO_ST1_ZPZ_D(bd, zd, MO_8)
7073 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7074 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7075 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7076 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7077 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7078 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7080 #undef DO_ST1_ZPZ_S
7081 #undef DO_ST1_ZPZ_D
7083 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7085 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7086 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7088 for (i = 0; i < opr_sz; ++i) {
7089 d[i] = n[i] ^ m[i] ^ k[i];
7093 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7095 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7096 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7098 for (i = 0; i < opr_sz; ++i) {
7099 d[i] = n[i] ^ (m[i] & ~k[i]);
7103 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7105 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7106 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7108 for (i = 0; i < opr_sz; ++i) {
7109 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7113 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7115 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7116 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7118 for (i = 0; i < opr_sz; ++i) {
7119 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7123 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7125 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7126 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7128 for (i = 0; i < opr_sz; ++i) {
7129 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
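/*
 * Illustrative sketch (editor's addition): the plain bitwise select that
 * the BSL-family helpers above build on -- per bit, take n where k is set,
 * else m.  sve2_nbsl is the complement of this; sve2_bsl1n and sve2_bsl2n
 * invert n or m respectively before selecting.  demo_bitsel() is a
 * hypothetical name.
 */
static inline uint64_t demo_bitsel(uint64_t n, uint64_t m, uint64_t k)
{
    return (n & k) | (m & ~k);
}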
7134 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
7135 * See hasless(v,1) from
7136 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
7138 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
7140 int bits = 8 << esz;
7141 uint64_t ones = dup_const(esz, 1);
7142 uint64_t signs = ones << (bits - 1);
7143 uint64_t cmp0, cmp1;
7145 cmp1 = dup_const(esz, n);
7146 cmp0 = cmp1 ^ m0;
7147 cmp1 = cmp1 ^ m1;
7148 cmp0 = (cmp0 - ones) & ~cmp0;
7149 cmp1 = (cmp1 - ones) & ~cmp1;
7150 return (cmp0 | cmp1) & signs;
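/*
 * Illustrative sketch (editor's addition): the zero-byte detection trick
 * do_match2() relies on, specialized to esz == MO_8.  A byte of m equals
 * the target iff the corresponding byte of (m ^ dup(target)) is zero, and
 * (v - 0x01..01) & ~v & 0x80..80 is nonzero iff v contains a zero byte.
 * demo_has_zero_byte() is a hypothetical name.
 */
static inline bool demo_has_zero_byte(uint64_t v)
{
    return ((v - 0x0101010101010101ull) & ~v & 0x8080808080808080ull) != 0;
}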
7153 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7154 uint32_t desc, int esz, bool nmatch)
7156 uint16_t esz_mask = pred_esz_masks[esz];
7157 intptr_t opr_sz = simd_oprsz(desc);
7158 uint32_t flags = PREDTEST_INIT;
7159 intptr_t i, j, k;
7161 for (i = 0; i < opr_sz; i += 16) {
7162 uint64_t m0 = *(uint64_t *)(vm + i);
7163 uint64_t m1 = *(uint64_t *)(vm + i + 8);
7164 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7165 uint16_t out = 0;
7167 for (j = 0; j < 16; j += 8) {
7168 uint64_t n = *(uint64_t *)(vn + i + j);
7170 for (k = 0; k < 8; k += 1 << esz) {
7171 if (pg & (1 << (j + k))) {
7172 bool o = do_match2(n >> (k * 8), m0, m1, esz);
7173 out |= (o ^ nmatch) << (j + k);
7177 *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7178 flags = iter_predtest_fwd(out, pg, flags);
7180 return flags;
7183 #define DO_PPZZ_MATCH(NAME, ESZ, INV) \
7184 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
7186 return do_match(vd, vn, vm, vg, desc, ESZ, INV); \
7189 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
7190 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
7192 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
7193 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
7195 #undef DO_PPZZ_MATCH
void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz; i += 4) {
        uint64_t count = 0;
        uint8_t pred;

        pred = pg[H1(i >> 3)] >> (i & 7);
        if (pred & 1) {
            uint32_t nn = n[H4(i >> 2)];

            for (j = 0; j <= i; j += 4) {
                pred = pg[H1(j >> 3)] >> (j & 7);
                if ((pred & 1) && nn == m[H4(j >> 2)]) {
                    ++count;
                }
            }
        }
        d[H4(i >> 2)] = count;
    }
}
void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t count = 0;
        if (pg[H1(i)] & 1) {
            uint64_t nn = n[i];
            for (j = 0; j <= i; ++j) {
                if ((pg[H1(j)] & 1) && nn == m[j]) {
                    ++count;
                }
            }
        }
        d[i] = count;
    }
}
/*
 * Returns the number of bytes in m0 and m1 that match n.
 * Unlike do_match2 we don't just need true/false, we need an exact count.
 * This requires two extra logical operations.
 */
static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
{
    const uint64_t mask = dup_const(MO_8, 0x7f);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(MO_8, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;

    /*
     * 1: clear msb of each byte to avoid carry to next byte (& mask)
     * 2: carry in to msb if byte != 0 (+ mask)
     * 3: set msb if cmp has msb set (| cmp)
     * 4: set ~msb to ignore them (| mask)
     * We now have 0xff for byte != 0 or 0x7f for byte == 0.
     * 5: invert, resulting in 0x80 if and only if byte == 0.
     */
    cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
    cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);

    /*
     * Combine the two compares in a way that the bits do
     * not overlap, and so preserves the count of set bits.
     * If the host has an efficient instruction for ctpop,
     * then ctpop(x) + ctpop(y) has the same number of
     * operations as ctpop(x | (y >> 1)).  If the host does
     * not have an efficient ctpop, then we only want to
     * use it once.
     */
    return ctpop64(cmp0 | (cmp1 >> 1));
}
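/*
 * Illustrative sketch only (hypothetical name, never called): the scalar
 * reference that do_histseg_cnt() computes branch-free, i.e. how many of
 * the 16 bytes held in m0 and m1 are equal to n.
 */
static inline uint64_t example_histseg_cnt_ref(uint8_t n, uint64_t m0, uint64_t m1)
{
    uint64_t count = 0;
    int i;

    for (i = 0; i < 64; i += 8) {
        count += ((uint8_t)(m0 >> i) == n);
        count += ((uint8_t)(m1 >> i) == n);
    }
    return count;
}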
void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t n0 = *(uint64_t *)(vn + i);
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t n1 = *(uint64_t *)(vn + i + 8);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint64_t out0 = 0;
        uint64_t out1 = 0;

        for (j = 0; j < 64; j += 8) {
            uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
            uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
            out0 |= cnt0 << j;
            out1 |= cnt1 << j;
        }

        *(uint64_t *)(vd + i) = out0;
        *(uint64_t *)(vd + i + 8) = out1;
    }
}
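/*
 * XAR: exclusive OR and rotate right by immediate.  For byte and halfword
 * elements the per-element rotation is synthesised from two shifts and a
 * mask over each 64-bit word; the word-sized form uses ror32 directly on
 * each element.
 */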
void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 8 - shr;
    uint64_t mask = dup_const(MO_8, 0xff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 16 - shr;
    uint64_t mask = dup_const(MO_16, 0xffff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    int shr = simd_data(desc);
    uint32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ror32(n[i] ^ m[i], shr);
    }
}
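/*
 * FMMLA: each 128-bit segment of the operands holds a 2x2 matrix in
 * row-major element order.  The result accumulates the product of N with
 * the transpose of M onto the addend:
 *     d[i][j] = a[i][j] + n[i][0] * m[j][0] + n[i][1] * m[j][1]
 */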
void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float32 *n = vn + s * sizeof(float32) * 4;
        float32 *m = vm + s * sizeof(float32) * 4;
        float32 *a = va + s * sizeof(float32) * 4;
        float32 *d = vd + s * sizeof(float32) * 4;
        float32 n00 = n[H4(0)], n01 = n[H4(1)];
        float32 n10 = n[H4(2)], n11 = n[H4(3)];
        float32 m00 = m[H4(0)], m01 = m[H4(1)];
        float32 m10 = m[H4(2)], m11 = m[H4(3)];
        float32 p0, p1;

        /* i = 0, j = 0 */
        p0 = float32_mul(n00, m00, status);
        p1 = float32_mul(n01, m01, status);
        d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float32_mul(n00, m10, status);
        p1 = float32_mul(n01, m11, status);
        d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float32_mul(n10, m00, status);
        p1 = float32_mul(n11, m01, status);
        d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float32_mul(n10, m10, status);
        p1 = float32_mul(n11, m11, status);
        d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
    }
}
void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float64 *n = vn + s * sizeof(float64) * 4;
        float64 *m = vm + s * sizeof(float64) * 4;
        float64 *a = va + s * sizeof(float64) * 4;
        float64 *d = vd + s * sizeof(float64) * 4;
        float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
        float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
        float64 p0, p1;

        /* i = 0, j = 0 */
        p0 = float64_mul(n00, m00, status);
        p1 = float64_mul(n01, m01, status);
        d[0] = float64_add(a[0], float64_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float64_mul(n00, m10, status);
        p1 = float64_mul(n01, m11, status);
        d[1] = float64_add(a[1], float64_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float64_mul(n10, m00, status);
        p1 = float64_mul(n11, m01, status);
        d[2] = float64_add(a[2], float64_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float64_mul(n10, m10, status);
        p1 = float64_mul(n11, m11, status);
        d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
    }
}
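/*
 * FCVTNT/FCVTLT: the narrowing converts store their result into the upper
 * (odd-numbered) narrow element of each wide slot, leaving the lower half
 * of the destination untouched; the widening converts read from that same
 * upper element.  The loops walk backward so that each 64-bit predicate
 * word is loaded once per 64 bytes of vector.
 */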
#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{                                                                             \
    intptr_t i = simd_oprsz(desc);                                            \
    uint64_t *g = vg;                                                         \
    do {                                                                      \
        uint64_t pg = g[(i - 1) >> 6];                                        \
        do {                                                                  \
            i -= sizeof(TYPEW);                                               \
            if (likely((pg >> (i & 63)) & 1)) {                               \
                TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
                *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
            }                                                                 \
        } while (i & 63);                                                     \
    } while (i != 0);                                                         \
}

DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)

#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{                                                                             \
    intptr_t i = simd_oprsz(desc);                                            \
    uint64_t *g = vg;                                                         \
    do {                                                                      \
        uint64_t pg = g[(i - 1) >> 6];                                        \
        do {                                                                  \
            i -= sizeof(TYPEW);                                               \
            if (likely((pg >> (i & 63)) & 1)) {                               \
                TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
                *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
            }                                                                 \
        } while (i & 63);                                                     \
    } while (i != 0);                                                         \
}

DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)

#undef DO_FCVTLT
#undef DO_FCVTNT