target/arm: Move expand_pred_h to vec_internal.h
[qemu.git] / target / arm / sve_helper.c
blob 1654c0bbf9eaae022f82a2c924246db2aed3a63a
1 /*
2 * ARM SVE Operations
4 * Copyright (c) 2018 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/helper-proto.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg.h"
28 #include "vec_internal.h"
29 #include "sve_ldst_internal.h"
32 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
34 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
35 * and bit 0 set if C is set. Compare the definitions of these variables
36 * within CPUARMState.
39 /* For no G bits set, NZCV = C. */
40 #define PREDTEST_INIT 1
42 /* This is an iterative function, called for each Pd and Pg word
43 * moving forward.
45 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
47 if (likely(g)) {
48 /* Compute N from first D & G.
49 Use bit 2 to signal first G bit seen. */
50 if (!(flags & 4)) {
51 flags |= ((d & (g & -g)) != 0) << 31;
52 flags |= 4;
55 /* Accumulate Z from each D & G. */
56 flags |= ((d & g) != 0) << 1;
58 /* Compute C from last !(D & G). Replace previous. */
59 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
61 return flags;
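/*
 * Note that (g & -g) isolates the lowest set bit of g, i.e. the first
 * active element, and pow2floor(g) isolates the highest set bit, i.e.
 * the last active element.  For example, a single word with d = 0x01
 * and g = 0x01 (one active element, and it is true) yields N (bit 31)
 * set, the "Z is clear" bit (bit 1) set, and C (bit 0) clear; bit 2 is
 * only the internal "first G bit seen" marker.
 */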
64 /* This is an iterative function, called for each Pd and Pg word
65 * moving backward.
67 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
69 if (likely(g)) {
70 /* Compute C from first (i.e. last) !(D & G).
71 Use bit 2 to signal first G bit seen. */
72 if (!(flags & 4)) {
73 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
74 flags |= (d & pow2floor(g)) == 0;
77 /* Accumulate Z from each D & G. */
78 flags |= ((d & g) != 0) << 1;
80 /* Compute N from last (i.e. first) D & G. Replace previous. */
81 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
83 return flags;
86 /* The same for a single word predicate. */
87 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
89 return iter_predtest_fwd(d, g, PREDTEST_INIT);
92 /* The same for a multi-word predicate. */
93 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
95 uint32_t flags = PREDTEST_INIT;
96 uint64_t *d = vd, *g = vg;
97 uintptr_t i = 0;
99 do {
100 flags = iter_predtest_fwd(d[i], g[i], flags);
101 } while (++i < words);
103 return flags;
106 /* Similarly for single word elements. */
107 static inline uint64_t expand_pred_s(uint8_t byte)
109 static const uint64_t word[] = {
110 [0x01] = 0x00000000ffffffffull,
111 [0x10] = 0xffffffff00000000ull,
112 [0x11] = 0xffffffffffffffffull,
114 return word[byte & 0x11];
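/*
 * Only bits 0 and 4 of the predicate byte govern the two 32-bit elements
 * covered by these 8 bytes (predicates carry one bit per data byte),
 * hence the "& 0x11" mask before indexing the table.
 */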
117 #define LOGICAL_PPPP(NAME, FUNC) \
118 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
120 uintptr_t opr_sz = simd_oprsz(desc); \
121 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
122 uintptr_t i; \
123 for (i = 0; i < opr_sz / 8; ++i) { \
124 d[i] = FUNC(n[i], m[i], g[i]); \
128 #define DO_AND(N, M, G) (((N) & (M)) & (G))
129 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
130 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
131 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
132 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
133 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
134 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
135 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
137 LOGICAL_PPPP(sve_and_pppp, DO_AND)
138 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
139 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
140 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
141 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
142 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
143 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
144 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
146 #undef DO_AND
147 #undef DO_BIC
148 #undef DO_EOR
149 #undef DO_ORR
150 #undef DO_ORN
151 #undef DO_NOR
152 #undef DO_NAND
153 #undef DO_SEL
154 #undef LOGICAL_PPPP
156 /* Fully general three-operand expander, controlled by a predicate.
157 * This is complicated by the host-endian storage of the register file.
159 /* ??? I don't expect the compiler could ever vectorize this itself.
160 * With some tables we can convert bit masks to byte masks, and with
161 * extra care wrt byte/word ordering we could use gcc generic vectors
162 * and do 16 bytes at a time.
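/*
 * In the expander below, each outer step reads 16 governing predicate
 * bits, covering a 16-byte chunk of the vector.  The inner do/while
 * advances i by the element size and shifts pg right by sizeof(TYPE),
 * which is the predicate bit for the next element (one bit per byte),
 * and exits when i reaches the next 16-byte boundary (i & 15 == 0).
 */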
164 #define DO_ZPZZ(NAME, TYPE, H, OP) \
165 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
167 intptr_t i, opr_sz = simd_oprsz(desc); \
168 for (i = 0; i < opr_sz; ) { \
169 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
170 do { \
171 if (pg & 1) { \
172 TYPE nn = *(TYPE *)(vn + H(i)); \
173 TYPE mm = *(TYPE *)(vm + H(i)); \
174 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
176 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
177 } while (i & 15); \
181 /* Similarly, specialized for 64-bit operands. */
182 #define DO_ZPZZ_D(NAME, TYPE, OP) \
183 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
185 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
186 TYPE *d = vd, *n = vn, *m = vm; \
187 uint8_t *pg = vg; \
188 for (i = 0; i < opr_sz; i += 1) { \
189 if (pg[H1(i)] & 1) { \
190 TYPE nn = n[i], mm = m[i]; \
191 d[i] = OP(nn, mm); \
196 #define DO_AND(N, M) (N & M)
197 #define DO_EOR(N, M) (N ^ M)
198 #define DO_ORR(N, M) (N | M)
199 #define DO_BIC(N, M) (N & ~M)
200 #define DO_ADD(N, M) (N + M)
201 #define DO_SUB(N, M) (N - M)
202 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
203 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
204 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
205 #define DO_MUL(N, M) (N * M)
209 * We must avoid the C undefined behaviour cases: division by
210 * zero and signed division of INT_MIN by -1. Both of these
211 * have architecturally defined required results for Arm.
212 * We special case all signed divisions by -1 to avoid having
213 * to deduce the minimum integer for the type involved.
215 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
216 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
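/*
 * For example, DO_SDIV(n, 0) and DO_UDIV(n, 0) evaluate to 0, and the
 * M == -1 case returns -N, which for the most negative input wraps back
 * to that same value -- in each case the architecturally required result.
 */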
218 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
219 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
220 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
221 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
223 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
224 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
225 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
226 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
228 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
229 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
230 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
231 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
233 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
234 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
235 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
236 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
238 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
239 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
240 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
241 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
243 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
244 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
245 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
246 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
248 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
249 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
250 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
251 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
253 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
254 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
255 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
256 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
258 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
259 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
260 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
261 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
263 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
264 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
265 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
266 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
268 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
269 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
270 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
271 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
273 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
274 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
275 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
276 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
278 /* Because the computation type is at least twice as large as required,
279 these work for both signed and unsigned source types. */
280 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
282 return (n * m) >> 8;
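/*
 * For example, 0xff * 0xff as unsigned inputs gives the 32-bit product
 * 0xfe01, so the returned high byte is 0xfe; the same bits as signed
 * inputs (-1 * -1 == 1) give a high byte of 0x00.  Both are correct
 * because the 8-bit sources were widened to int32_t before multiplying.
 */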
285 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
287 return (n * m) >> 16;
290 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
292 return (n * m) >> 32;
295 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
297 uint64_t lo, hi;
298 muls64(&lo, &hi, n, m);
299 return hi;
302 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
304 uint64_t lo, hi;
305 mulu64(&lo, &hi, n, m);
306 return hi;
309 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
310 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
311 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
312 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
314 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
315 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
316 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
317 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
319 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
320 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
321 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
322 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
324 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
325 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
327 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
328 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
330 /* Note that all bits of the shift are significant
331 and not modulo the element size. */
332 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
333 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
334 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
336 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
337 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
338 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
340 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
341 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
342 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
344 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
345 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
346 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
348 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
349 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
350 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
352 static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
354 int8_t n1 = n, n2 = n >> 8;
355 return m + n1 + n2;
358 static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
360 int16_t n1 = n, n2 = n >> 16;
361 return m + n1 + n2;
364 static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
366 int32_t n1 = n, n2 = n >> 32;
367 return m + n1 + n2;
370 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
371 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
372 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
374 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
376 uint8_t n1 = n, n2 = n >> 8;
377 return m + n1 + n2;
380 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
382 uint16_t n1 = n, n2 = n >> 16;
383 return m + n1 + n2;
386 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
388 uint32_t n1 = n, n2 = n >> 32;
389 return m + n1 + n2;
392 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
393 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
394 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
396 #define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
397 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
398 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
399 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)
401 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
402 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
403 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
404 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
406 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
407 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
408 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
409 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)
411 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
412 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
413 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
414 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
417 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
418 * We pass in a pointer to a dummy saturation field to trigger
419 * the saturating arithmetic but discard the information about
420 * whether it has occurred.
422 #define do_sqshl_b(n, m) \
423 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
424 #define do_sqshl_h(n, m) \
425 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
426 #define do_sqshl_s(n, m) \
427 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
428 #define do_sqshl_d(n, m) \
429 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
431 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
432 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
433 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
434 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
436 #define do_uqshl_b(n, m) \
437 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
438 #define do_uqshl_h(n, m) \
439 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
440 #define do_uqshl_s(n, m) \
441 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
442 #define do_uqshl_d(n, m) \
443 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
445 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
446 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
447 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
448 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
450 #define do_sqrshl_b(n, m) \
451 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
452 #define do_sqrshl_h(n, m) \
453 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
454 #define do_sqrshl_s(n, m) \
455 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
456 #define do_sqrshl_d(n, m) \
457 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
459 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
460 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
461 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
462 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
464 #undef do_sqrshl_d
466 #define do_uqrshl_b(n, m) \
467 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
468 #define do_uqrshl_h(n, m) \
469 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
470 #define do_uqrshl_s(n, m) \
471 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
472 #define do_uqrshl_d(n, m) \
473 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
475 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
476 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
477 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
478 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
480 #undef do_uqrshl_d
482 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
483 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))
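/*
 * For example, DO_HADD_D(3, 5) = 1 + 2 + (3 & 5 & 1) = 4 = (3 + 5) >> 1;
 * the (n & m & 1) term restores the carry dropped by shifting each
 * operand separately, avoiding a 128-bit intermediate for the D size.
 */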
485 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
486 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
487 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
488 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
490 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
491 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
492 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
493 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
495 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
496 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))
498 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
499 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
500 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
501 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
503 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
504 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
505 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
506 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
508 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
509 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))
511 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
512 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
513 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
514 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
516 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
517 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
518 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
519 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
521 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
523 return val >= max ? max : val <= min ? min : val;
526 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
527 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
528 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
530 static inline int64_t do_sqadd_d(int64_t n, int64_t m)
532 int64_t r = n + m;
533 if (((r ^ n) & ~(n ^ m)) < 0) {
534 /* Signed overflow. */
535 return r < 0 ? INT64_MAX : INT64_MIN;
537 return r;
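/*
 * The overflow test in do_sqadd_d is the usual sign-bit trick:
 * ~(n ^ m) is negative when the operands have the same sign, (r ^ n)
 * is negative when the result's sign differs from n, and signed
 * addition overflows exactly when both hold.  do_sqsub_d below uses
 * (n ^ m) instead, since subtraction can only overflow when the
 * operands' signs differ.
 */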
540 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
541 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
542 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
543 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
545 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
546 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
547 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
549 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
551 uint64_t r = n + m;
552 return r < n ? UINT64_MAX : r;
555 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
556 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
557 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
558 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
560 #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
561 #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
562 #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
564 static inline int64_t do_sqsub_d(int64_t n, int64_t m)
566 int64_t r = n - m;
567 if (((r ^ n) & (n ^ m)) < 0) {
568 /* Signed overflow. */
569 return r < 0 ? INT64_MAX : INT64_MIN;
571 return r;
574 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
575 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
576 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
577 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
579 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
580 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
581 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
583 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
585 return n > m ? n - m : 0;
588 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
589 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
590 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
591 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
593 #define DO_SUQADD_B(n, m) \
594 do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
595 #define DO_SUQADD_H(n, m) \
596 do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
597 #define DO_SUQADD_S(n, m) \
598 do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
600 static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
602 uint64_t r = n + m;
604 if (n < 0) {
605 /* Note that m - abs(n) cannot underflow. */
606 if (r > INT64_MAX) {
607 /* Result is either very large positive or negative. */
608 if (m > -n) {
609 /* m > abs(n), so r is a very large positive. */
610 return INT64_MAX;
612 /* Result is negative. */
614 } else {
615 /* Both inputs are positive: check for overflow. */
616 if (r < m || r > INT64_MAX) {
617 return INT64_MAX;
620 return r;
623 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
624 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
625 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
626 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
628 #define DO_USQADD_B(n, m) \
629 do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
630 #define DO_USQADD_H(n, m) \
631 do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
632 #define DO_USQADD_S(n, m) \
633 do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
635 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
637 uint64_t r = n + m;
639 if (m < 0) {
640 return n < -m ? 0 : r;
642 return r < n ? UINT64_MAX : r;
645 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
646 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
647 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
648 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
650 #undef DO_ZPZZ
651 #undef DO_ZPZZ_D
654 * Three operand expander, operating on element pairs.
655 * If the slot I is even, the elements are from VN {I, I+1}.
656 * If the slot I is odd, the elements are from VM {I-1, I}.
657 * Load all of the input elements in each pair before overwriting output.
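 * For example, for byte elements with an all-true predicate, ADDP gives
 * d[0] = n[0] + n[1], d[1] = m[0] + m[1], d[2] = n[2] + n[3],
 * d[3] = m[2] + m[3], and so on.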
659 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
660 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
662 intptr_t i, opr_sz = simd_oprsz(desc); \
663 for (i = 0; i < opr_sz; ) { \
664 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
665 do { \
666 TYPE n0 = *(TYPE *)(vn + H(i)); \
667 TYPE m0 = *(TYPE *)(vm + H(i)); \
668 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
669 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
670 if (pg & 1) { \
671 *(TYPE *)(vd + H(i)) = OP(n0, n1); \
673 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
674 if (pg & 1) { \
675 *(TYPE *)(vd + H(i)) = OP(m0, m1); \
677 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
678 } while (i & 15); \
682 /* Similarly, specialized for 64-bit operands. */
683 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
684 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
686 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
687 TYPE *d = vd, *n = vn, *m = vm; \
688 uint8_t *pg = vg; \
689 for (i = 0; i < opr_sz; i += 2) { \
690 TYPE n0 = n[i], n1 = n[i + 1]; \
691 TYPE m0 = m[i], m1 = m[i + 1]; \
692 if (pg[H1(i)] & 1) { \
693 d[i] = OP(n0, n1); \
695 if (pg[H1(i + 1)] & 1) { \
696 d[i + 1] = OP(m0, m1); \
701 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
702 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
703 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
704 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
706 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
707 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
708 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
709 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
711 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
712 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
713 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
714 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
716 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
717 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
718 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
719 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
721 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
722 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
723 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
724 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
726 #undef DO_ZPZZ_PAIR
727 #undef DO_ZPZZ_PAIR_D
729 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
730 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
731 void *status, uint32_t desc) \
733 intptr_t i, opr_sz = simd_oprsz(desc); \
734 for (i = 0; i < opr_sz; ) { \
735 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
736 do { \
737 TYPE n0 = *(TYPE *)(vn + H(i)); \
738 TYPE m0 = *(TYPE *)(vm + H(i)); \
739 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
740 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
741 if (pg & 1) { \
742 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
744 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
745 if (pg & 1) { \
746 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
748 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
749 } while (i & 15); \
753 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
754 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
755 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
757 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
758 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
759 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
761 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
762 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
763 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
765 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
766 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
767 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
769 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
770 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
771 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
773 #undef DO_ZPZZ_PAIR_FP
775 /* Three-operand expander, controlled by a predicate, in which the
776 * third operand is "wide". That is, for D = N op M, the same 64-bit
777 * value of M is used with all of the narrower values of N.
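 * Each outer step therefore reads one predicate byte (8 bits, one per
 * data byte) and one 64-bit M element, and the inner do/while walks the
 * narrow N/D elements within that 8-byte column (i & 7).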
779 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
780 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
782 intptr_t i, opr_sz = simd_oprsz(desc); \
783 for (i = 0; i < opr_sz; ) { \
784 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
785 TYPEW mm = *(TYPEW *)(vm + i); \
786 do { \
787 if (pg & 1) { \
788 TYPE nn = *(TYPE *)(vn + H(i)); \
789 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
791 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
792 } while (i & 7); \
796 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
797 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
798 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
800 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
801 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
802 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
804 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
805 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
806 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
808 #undef DO_ZPZW
810 /* Fully general two-operand expander, controlled by a predicate.
812 #define DO_ZPZ(NAME, TYPE, H, OP) \
813 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
815 intptr_t i, opr_sz = simd_oprsz(desc); \
816 for (i = 0; i < opr_sz; ) { \
817 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
818 do { \
819 if (pg & 1) { \
820 TYPE nn = *(TYPE *)(vn + H(i)); \
821 *(TYPE *)(vd + H(i)) = OP(nn); \
823 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
824 } while (i & 15); \
828 /* Similarly, specialized for 64-bit operands. */
829 #define DO_ZPZ_D(NAME, TYPE, OP) \
830 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
832 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
833 TYPE *d = vd, *n = vn; \
834 uint8_t *pg = vg; \
835 for (i = 0; i < opr_sz; i += 1) { \
836 if (pg[H1(i)] & 1) { \
837 TYPE nn = n[i]; \
838 d[i] = OP(nn); \
843 #define DO_CLS_B(N) (clrsb32(N) - 24)
844 #define DO_CLS_H(N) (clrsb32(N) - 16)
846 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
847 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
848 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
849 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
851 #define DO_CLZ_B(N) (clz32(N) - 24)
852 #define DO_CLZ_H(N) (clz32(N) - 16)
854 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
855 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
856 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
857 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
859 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
860 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
861 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
862 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
864 #define DO_CNOT(N) (N == 0)
866 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
867 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
868 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
869 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
871 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
873 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
874 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
875 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
877 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
879 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
880 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
881 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
883 #define DO_NOT(N) (~N)
885 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
886 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
887 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
888 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
890 #define DO_SXTB(N) ((int8_t)N)
891 #define DO_SXTH(N) ((int16_t)N)
892 #define DO_SXTS(N) ((int32_t)N)
893 #define DO_UXTB(N) ((uint8_t)N)
894 #define DO_UXTH(N) ((uint16_t)N)
895 #define DO_UXTS(N) ((uint32_t)N)
897 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
898 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
899 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
900 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
901 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
902 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
904 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
905 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
906 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
907 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
908 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
909 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
911 #define DO_ABS(N) (N < 0 ? -N : N)
913 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
914 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
915 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
916 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
918 #define DO_NEG(N) (-N)
920 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
921 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
922 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
923 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
925 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
926 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
927 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
929 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
930 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
932 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
934 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
935 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
936 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
937 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
939 #define DO_SQABS(X) \
940 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
941 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
943 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
944 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
945 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
946 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
948 #define DO_SQNEG(X) \
949 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
950 x_ == min_ ? -min_ - 1 : -x_; })
952 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
953 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
954 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
955 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
957 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
958 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
960 /* Three-operand expander, unpredicated, in which the third operand is "wide".
962 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
963 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
965 intptr_t i, opr_sz = simd_oprsz(desc); \
966 for (i = 0; i < opr_sz; ) { \
967 TYPEW mm = *(TYPEW *)(vm + i); \
968 do { \
969 TYPE nn = *(TYPE *)(vn + H(i)); \
970 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
971 i += sizeof(TYPE); \
972 } while (i & 7); \
976 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
977 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
978 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
980 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
981 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
982 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
984 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
985 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
986 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
988 #undef DO_ZZW
990 #undef DO_CLS_B
991 #undef DO_CLS_H
992 #undef DO_CLZ_B
993 #undef DO_CLZ_H
994 #undef DO_CNOT
995 #undef DO_FABS
996 #undef DO_FNEG
997 #undef DO_ABS
998 #undef DO_NEG
999 #undef DO_ZPZ
1000 #undef DO_ZPZ_D
1003 * Three-operand expander, unpredicated, in which the two inputs are
1004 * selected from the top or bottom half of the wide column.
1006 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1007 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1009 intptr_t i, opr_sz = simd_oprsz(desc); \
1010 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1011 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1012 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1013 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1014 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1015 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1019 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1020 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1021 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1023 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1024 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1025 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1027 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1028 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1029 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1031 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1032 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1033 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1035 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1036 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1037 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1039 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1040 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1041 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1043 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1044 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1045 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1047 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1048 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1049 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1051 /* Note that the multiply cannot overflow, but the doubling can. */
1052 static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1054 int16_t val = n * m;
1055 return DO_SQADD_H(val, val);
1058 static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1060 int32_t val = n * m;
1061 return DO_SQADD_S(val, val);
1064 static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1066 int64_t val = n * m;
1067 return do_sqadd_d(val, val);
1070 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1071 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1072 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1074 #undef DO_ZZZ_TB
1076 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1077 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1079 intptr_t i, opr_sz = simd_oprsz(desc); \
1080 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1081 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1082 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
1083 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1084 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1088 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1089 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1090 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1092 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1093 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1094 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1096 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1097 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1098 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1100 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1101 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1102 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1104 #undef DO_ZZZ_WTB
1106 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
1107 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1109 intptr_t i, opr_sz = simd_oprsz(desc); \
1110 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1111 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1112 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1113 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
1114 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
1115 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
1119 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1120 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1121 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1122 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
1124 #undef DO_ZZZ_NTB
1126 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1127 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1129 intptr_t i, opr_sz = simd_oprsz(desc); \
1130 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
1131 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1132 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1133 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
1134 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1135 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
1139 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1140 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1141 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1143 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1144 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1145 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1147 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1148 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1149 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1151 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1152 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1153 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1155 #define DO_NMUL(N, M) -(N * M)
1157 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1158 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1159 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
1161 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1162 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1163 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
1165 #undef DO_ZZZW_ACC
1167 #define DO_XTNB(NAME, TYPE, OP) \
1168 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1170 intptr_t i, opr_sz = simd_oprsz(desc); \
1171 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1172 TYPE nn = *(TYPE *)(vn + i); \
1173 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
1174 *(TYPE *)(vd + i) = nn; \
1178 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
1179 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1181 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
1182 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1183 TYPE nn = *(TYPE *)(vn + i); \
1184 *(TYPEN *)(vd + i + odd) = OP(nn); \
1188 #define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
1189 #define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
1190 #define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)
1192 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1193 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1194 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1196 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1197 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1198 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1200 #define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
1201 #define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
1202 #define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)
1204 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1205 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1206 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1208 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1209 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1210 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1212 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1213 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1214 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1216 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1217 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1218 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1220 #undef DO_XTNB
1221 #undef DO_XTNT
1223 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1225 intptr_t i, opr_sz = simd_oprsz(desc);
1226 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1227 uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1228 uint32_t *a = va, *n = vn;
1229 uint64_t *d = vd, *m = vm;
1231 for (i = 0; i < opr_sz / 8; ++i) {
1232 uint32_t e1 = a[2 * i + H4(0)];
1233 uint32_t e2 = n[2 * i + sel] ^ inv;
1234 uint64_t c = extract64(m[i], 32, 1);
1235 /* Compute and store the entire 33-bit result at once. */
1236 d[i] = c + e1 + e2;
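/*
 * The low 32 bits of the 64-bit store are the sum (the even destination
 * element) and bit 32 is the carry out, i.e. bit 0 of the odd destination
 * element -- the same position this helper reads the incoming carry from,
 * so a carry chain can pass the previous result vector in as the next m.
 */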
1240 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1242 intptr_t i, opr_sz = simd_oprsz(desc);
1243 int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1244 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1245 uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1247 for (i = 0; i < opr_sz / 8; i += 2) {
1248 Int128 e1 = int128_make64(a[i]);
1249 Int128 e2 = int128_make64(n[i + sel] ^ inv);
1250 Int128 c = int128_make64(m[i + 1] & 1);
1251 Int128 r = int128_add(int128_add(e1, e2), c);
1252 d[i + 0] = int128_getlo(r);
1253 d[i + 1] = int128_gethi(r);
1257 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1258 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1260 intptr_t i, opr_sz = simd_oprsz(desc); \
1261 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1262 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1263 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1264 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1265 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1266 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1267 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
1271 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1272 do_sqdmull_h, DO_SQADD_H)
1273 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1274 do_sqdmull_s, DO_SQADD_S)
1275 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1276 do_sqdmull_d, do_sqadd_d)
1278 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1279 do_sqdmull_h, DO_SQSUB_H)
1280 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1281 do_sqdmull_s, DO_SQSUB_S)
1282 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1283 do_sqdmull_d, do_sqsub_d)
1285 #undef DO_SQDMLAL
1287 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1288 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1290 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1291 int rot = simd_data(desc); \
1292 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1293 bool sub_r = rot == 1 || rot == 2; \
1294 bool sub_i = rot >= 2; \
1295 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1296 for (i = 0; i < opr_sz; i += 2) { \
1297 TYPE elt1_a = n[H(i + sel_a)]; \
1298 TYPE elt2_a = m[H(i + sel_a)]; \
1299 TYPE elt2_b = m[H(i + sel_b)]; \
1300 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \
1301 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \
1305 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
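/*
 * With OP = DO_CMLA, the expander computes per (real, imag) pair:
 *   rot 0:  d.re = a.re + n.re * m.re;  d.im = a.im + n.re * m.im
 *   rot 1:  d.re = a.re - n.im * m.im;  d.im = a.im + n.im * m.re
 *   rot 2:  d.re = a.re - n.re * m.re;  d.im = a.im - n.re * m.im
 *   rot 3:  d.re = a.re + n.im * m.im;  d.im = a.im - n.im * m.re
 * so rotations 0 and 90 (or 180 and 270) together accumulate a full
 * complex multiply (or its negation).
 */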
1307 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1308 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1309 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1310 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1312 #define DO_SQRDMLAH_B(N, M, A, S) \
1313 do_sqrdmlah_b(N, M, A, S, true)
1314 #define DO_SQRDMLAH_H(N, M, A, S) \
1315 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1316 #define DO_SQRDMLAH_S(N, M, A, S) \
1317 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1318 #define DO_SQRDMLAH_D(N, M, A, S) \
1319 do_sqrdmlah_d(N, M, A, S, true)
1321 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1322 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1323 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1324 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
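/*
 * The indexed forms below operate on 128-bit segments: the index selects
 * one (real, imag) pair within each segment of vm, and the outer loop
 * strides 16 bytes so that the chosen pair is reused for every pair of
 * elements in that segment.
 */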
1326 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1327 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1329 intptr_t i, j, oprsz = simd_oprsz(desc); \
1330 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \
1331 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \
1332 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1333 bool sub_r = rot == 1 || rot == 2; \
1334 bool sub_i = rot >= 2; \
1335 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1336 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \
1337 TYPE elt2_a = m[H(i + idx + sel_a)]; \
1338 TYPE elt2_b = m[H(i + idx + sel_b)]; \
1339 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \
1340 TYPE elt1_a = n[H(i + j + sel_a)]; \
1341 d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \
1342 d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \
1347 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1348 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1350 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1351 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1353 #undef DO_CMLA
1354 #undef DO_CMLA_FUNC
1355 #undef DO_CMLA_IDX_FUNC
1356 #undef DO_SQRDMLAH_B
1357 #undef DO_SQRDMLAH_H
1358 #undef DO_SQRDMLAH_S
1359 #undef DO_SQRDMLAH_D
1361 /* Note N and M are 4 elements bundled into one unit. */
1362 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1363 int sel_a, int sel_b, int sub_i)
1365 for (int i = 0; i <= 1; i++) {
1366 int32_t elt1_r = (int8_t)(n >> (16 * i));
1367 int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1368 int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1369 int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1371 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1373 return a;
1376 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1377 int sel_a, int sel_b, int sub_i)
1379 for (int i = 0; i <= 1; i++) {
1380 int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1381 int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1382 int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1383 int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1385 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1387 return a;
1390 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1391 void *va, uint32_t desc)
1393 int opr_sz = simd_oprsz(desc);
1394 int rot = simd_data(desc);
1395 int sel_a = rot & 1;
1396 int sel_b = sel_a ^ 1;
1397 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1398 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1400 for (int e = 0; e < opr_sz / 4; e++) {
1401 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1405 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1406 void *va, uint32_t desc)
1408 int opr_sz = simd_oprsz(desc);
1409 int rot = simd_data(desc);
1410 int sel_a = rot & 1;
1411 int sel_b = sel_a ^ 1;
1412 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1413 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1415 for (int e = 0; e < opr_sz / 8; e++) {
1416 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1420 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1421 void *va, uint32_t desc)
1423 int opr_sz = simd_oprsz(desc);
1424 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1425 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1426 int sel_a = rot & 1;
1427 int sel_b = sel_a ^ 1;
1428 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1429 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1431 for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1432 uint32_t seg_m = m[seg + idx];
1433 for (int e = 0; e < 4; e++) {
1434 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1435 sel_a, sel_b, sub_i);
1440 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1441 void *va, uint32_t desc)
1443 int seg, opr_sz = simd_oprsz(desc);
1444 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1445 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1446 int sel_a = rot & 1;
1447 int sel_b = sel_a ^ 1;
1448 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1449 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1451 for (seg = 0; seg < opr_sz / 8; seg += 2) {
1452 uint64_t seg_m = m[seg + idx];
1453 for (int e = 0; e < 2; e++) {
1454 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1455 sel_a, sel_b, sub_i);
1460 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1461 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1463 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
1464 intptr_t i, j, idx = simd_data(desc); \
1465 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \
1466 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1467 TYPE mm = m[i]; \
1468 for (j = 0; j < segment; j++) { \
1469 d[i + j] = OP(n[i + j], mm, a[i + j]); \
1474 #define DO_SQRDMLAH_H(N, M, A) \
1475 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1476 #define DO_SQRDMLAH_S(N, M, A) \
1477 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1478 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1480 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1481 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1482 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1484 #define DO_SQRDMLSH_H(N, M, A) \
1485 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1486 #define DO_SQRDMLSH_S(N, M, A) \
1487 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1488 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1490 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1491 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1492 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1494 #undef DO_ZZXZ
1496 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1497 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1499 intptr_t i, j, oprsz = simd_oprsz(desc); \
1500 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1501 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1502 for (i = 0; i < oprsz; i += 16) { \
1503 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1504 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1505 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1506 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \
1507 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \
1512 #define DO_MLA(N, M, A) (A + N * M)
1514 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1515 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1516 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1517 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1519 #define DO_MLS(N, M, A) (A - N * M)
1521 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1522 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1523 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1524 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1526 #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M))
1527 #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M))
1529 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1530 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1532 #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M))
1533 #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M))
1535 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1536 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1538 #undef DO_MLA
1539 #undef DO_MLS
1540 #undef DO_ZZXW
1542 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1543 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1545 intptr_t i, j, oprsz = simd_oprsz(desc); \
1546 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1547 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1548 for (i = 0; i < oprsz; i += 16) { \
1549 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1550 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1551 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1552 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \
1557 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1558 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1560 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1561 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1563 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1564 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1566 #undef DO_ZZX
1568 #define DO_BITPERM(NAME, TYPE, OP) \
1569 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1571 intptr_t i, opr_sz = simd_oprsz(desc); \
1572 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1573 TYPE nn = *(TYPE *)(vn + i); \
1574 TYPE mm = *(TYPE *)(vm + i); \
1575 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
1579 static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1581 uint64_t res = 0;
1582 int db, rb = 0;
1584 for (db = 0; db < n; ++db) {
1585 if ((mask >> db) & 1) {
1586 res |= ((data >> db) & 1) << rb;
1587 ++rb;
1590 return res;
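/*
 * For example, bitextract(0xb4, 0xcc, 8) gathers data bits 2, 3, 6 and 7
 * (the positions set in the mask) into the low bits, giving 0x09.
 */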
1593 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1594 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1595 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1596 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1598 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1600 uint64_t res = 0;
1601 int rb, db = 0;
1603 for (rb = 0; rb < n; ++rb) {
1604 if ((mask >> rb) & 1) {
1605 res |= ((data >> db) & 1) << rb;
1606 ++db;
1609 return res;
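/*
 * The complementary scatter: bitdeposit(0x09, 0xcc, 8) places the low
 * data bits at mask positions 2, 3, 6 and 7, giving 0x84.
 */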
1612 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1613 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1614 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1615 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1617 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1619 uint64_t resm = 0, resu = 0;
1620 int db, rbm = 0, rbu = 0;
1622 for (db = 0; db < n; ++db) {
1623 uint64_t val = (data >> db) & 1;
1624 if ((mask >> db) & 1) {
1625 resm |= val << rbm++;
1626 } else {
1627 resu |= val << rbu++;
1631 return resm | (resu << rbm);
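/* Example (for illustration): with data = 0b1101 and mask = 0b0101,
 * the bits selected by the mask (positions 0 and 2, both 1) are grouped
 * into the low result bits, and the remaining bits (positions 1 and 3,
 * values 0 and 1) follow above them: res = 0b11 | (0b10 << 2) = 0b1011.
 */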
1634 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1635 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1636 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1637 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1639 #undef DO_BITPERM
1641 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \
1642 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1644 intptr_t i, opr_sz = simd_oprsz(desc); \
1645 int sub_r = simd_data(desc); \
1646 if (sub_r) { \
1647 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1648 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1649 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1650 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1651 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1652 acc_r = ADD_OP(acc_r, el2_i); \
1653 acc_i = SUB_OP(acc_i, el2_r); \
1654 *(TYPE *)(vd + H(i)) = acc_r; \
1655 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1657 } else { \
1658 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1659 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1660 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1661 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1662 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1663 acc_r = SUB_OP(acc_r, el2_i); \
1664 acc_i = ADD_OP(acc_i, el2_r); \
1665 *(TYPE *)(vd + H(i)) = acc_r; \
1666 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1671 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1672 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1673 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1674 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
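/* Note (reading the macro above): the sub_r branch computes
 * (n_r + m_i, n_i - m_r) and the other branch (n_r - m_i, n_i + m_r),
 * so the second operand is effectively rotated by 270 or 90 degrees
 * in the complex plane before the addition, with the rotation selected
 * by simd_data.
 */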
1676 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1677 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1678 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1679 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1681 #undef DO_CADD
1683 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1684 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1686 intptr_t i, opr_sz = simd_oprsz(desc); \
1687 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
1688 int shift = simd_data(desc) >> 1; \
1689 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1690 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
1691 *(TYPEW *)(vd + HW(i)) = nn << shift; \
1695 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1696 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1697 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1699 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1700 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1701 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1703 #undef DO_ZZI_SHLL
1705 /* Two-operand reduction expander, controlled by a predicate.
1706 * The difference between TYPERED and TYPERET concerns sign-extension:
1707 * for SMAX, TYPERED must be signed, but TYPERET must be unsigned so
1708 * that e.g. a 32-bit value is not sign-extended to the ABI uint64_t
1709 * return type.
1711 /* ??? If we were to vectorize this by hand, the reduction ordering
1712 * would change. For integer operands, this is perfectly fine.
1714 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1715 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1717 intptr_t i, opr_sz = simd_oprsz(desc); \
1718 TYPERED ret = INIT; \
1719 for (i = 0; i < opr_sz; ) { \
1720 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1721 do { \
1722 if (pg & 1) { \
1723 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
1724 ret = OP(ret, nn); \
1726 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
1727 } while (i & 15); \
1729 return (TYPERET)ret; \
1732 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
1733 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1735 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1736 TYPEE *n = vn; \
1737 uint8_t *pg = vg; \
1738 TYPER ret = INIT; \
1739 for (i = 0; i < opr_sz; i += 1) { \
1740 if (pg[H1(i)] & 1) { \
1741 TYPEE nn = n[i]; \
1742 ret = OP(ret, nn); \
1745 return ret; \
1748 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1749 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1750 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1751 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1753 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1754 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1755 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1756 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1758 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1759 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1760 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1761 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1763 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1764 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1765 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1767 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1768 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1769 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1770 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1772 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1773 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1774 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1775 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1777 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1778 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1779 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1780 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1782 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1783 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1784 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1785 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1787 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1788 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1789 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1790 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1792 #undef DO_VPZ
1793 #undef DO_VPZ_D
1795 /* Two vector operand, one scalar operand, unpredicated. */
1796 #define DO_ZZI(NAME, TYPE, OP) \
1797 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
1799 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1800 TYPE s = s64, *d = vd, *n = vn; \
1801 for (i = 0; i < opr_sz; ++i) { \
1802 d[i] = OP(n[i], s); \
1806 #define DO_SUBR(X, Y) (Y - X)
1808 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1809 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1810 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1811 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1813 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1814 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1815 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1816 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1818 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1819 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1820 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1821 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1823 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1824 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1825 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1826 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1828 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1829 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1830 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1831 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1833 #undef DO_ZZI
1835 #undef DO_AND
1836 #undef DO_ORR
1837 #undef DO_EOR
1838 #undef DO_BIC
1839 #undef DO_ADD
1840 #undef DO_SUB
1841 #undef DO_MAX
1842 #undef DO_MIN
1843 #undef DO_ABD
1844 #undef DO_MUL
1845 #undef DO_DIV
1846 #undef DO_ASR
1847 #undef DO_LSR
1848 #undef DO_LSL
1849 #undef DO_SUBR
1851 /* Similar to the ARM LastActiveElement pseudocode function, except the
1852 result is multiplied by the element size. This includes the not found
1853 indication; e.g. not found for esz=3 is -8. */
1854 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1856 uint64_t mask = pred_esz_masks[esz];
1857 intptr_t i = words;
1859 do {
1860 uint64_t this_g = g[--i] & mask;
1861 if (this_g) {
1862 return i * 64 + (63 - clz64(this_g));
1864 } while (i > 0);
1865 return (intptr_t)-1 << esz;
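/* Example (for illustration): with words = 1, esz = 2 and g[0] = 0x10,
 * the masked predicate has bit 4 set, so the result is
 * 63 - clz64(0x10) = 4, i.e. the byte offset of the last active
 * 4-byte element.  With no bits set, the result is -(1 << esz).
 */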
1868 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1870 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1871 uint32_t flags = PREDTEST_INIT;
1872 uint64_t *d = vd, *g = vg;
1873 intptr_t i = 0;
1875 do {
1876 uint64_t this_d = d[i];
1877 uint64_t this_g = g[i];
1879 if (this_g) {
1880 if (!(flags & 4)) {
1881 /* Set in D the first bit of G. */
1882 this_d |= this_g & -this_g;
1883 d[i] = this_d;
1885 flags = iter_predtest_fwd(this_d, this_g, flags);
1887 } while (++i < words);
1889 return flags;
1892 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1894 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1895 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1896 uint32_t flags = PREDTEST_INIT;
1897 uint64_t *d = vd, *g = vg, esz_mask;
1898 intptr_t i, next;
1900 next = last_active_element(vd, words, esz) + (1 << esz);
1901 esz_mask = pred_esz_masks[esz];
1903 /* Similar to the pseudocode for pnext, but scaled by ESZ
1904 so that we find the correct bit. */
1905 if (next < words * 64) {
1906 uint64_t mask = -1;
1908 if (next & 63) {
1909 mask = ~((1ull << (next & 63)) - 1);
1910 next &= -64;
1912 do {
1913 uint64_t this_g = g[next / 64] & esz_mask & mask;
1914 if (this_g != 0) {
1915 next = (next & -64) + ctz64(this_g);
1916 break;
1918 next += 64;
1919 mask = -1;
1920 } while (next < words * 64);
1923 i = 0;
1924 do {
1925 uint64_t this_d = 0;
1926 if (i == next / 64) {
1927 this_d = 1ull << (next & 63);
1929 d[i] = this_d;
1930 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1931 } while (++i < words);
1933 return flags;
1937 * Copy Zn into Zd, and store zero into inactive elements.
1938 * If inv, store zeros into the active elements.
1940 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1942 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1943 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1944 uint64_t *d = vd, *n = vn;
1945 uint8_t *pg = vg;
1947 for (i = 0; i < opr_sz; i += 1) {
1948 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
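/* Note: INV above is 0 or all-ones depending on bit 0 of simd_data, so
 * the same helper either zeroes the inactive elements (inv = 0) or the
 * active ones (inv = -1); the _h, _s and _d variants below follow the
 * same pattern with wider predicate expansion.
 */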
1952 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1954 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1955 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1956 uint64_t *d = vd, *n = vn;
1957 uint8_t *pg = vg;
1959 for (i = 0; i < opr_sz; i += 1) {
1960 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
1964 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1966 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1967 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1968 uint64_t *d = vd, *n = vn;
1969 uint8_t *pg = vg;
1971 for (i = 0; i < opr_sz; i += 1) {
1972 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
1976 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1978 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1979 uint64_t *d = vd, *n = vn;
1980 uint8_t *pg = vg;
1981 uint8_t inv = simd_data(desc);
1983 for (i = 0; i < opr_sz; i += 1) {
1984 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
1988 /* Three-operand expander, immediate operand, controlled by a predicate.
1990 #define DO_ZPZI(NAME, TYPE, H, OP) \
1991 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1993 intptr_t i, opr_sz = simd_oprsz(desc); \
1994 TYPE imm = simd_data(desc); \
1995 for (i = 0; i < opr_sz; ) { \
1996 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1997 do { \
1998 if (pg & 1) { \
1999 TYPE nn = *(TYPE *)(vn + H(i)); \
2000 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
2002 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2003 } while (i & 15); \
2007 /* Similarly, specialized for 64-bit operands. */
2008 #define DO_ZPZI_D(NAME, TYPE, OP) \
2009 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2011 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2012 TYPE *d = vd, *n = vn; \
2013 TYPE imm = simd_data(desc); \
2014 uint8_t *pg = vg; \
2015 for (i = 0; i < opr_sz; i += 1) { \
2016 if (pg[H1(i)] & 1) { \
2017 TYPE nn = n[i]; \
2018 d[i] = OP(nn, imm); \
2023 #define DO_SHR(N, M) (N >> M)
2024 #define DO_SHL(N, M) (N << M)
2026 /* Arithmetic shift right for division. This rounds negative numbers
2027 toward zero as per signed division. Therefore before shifting,
2028 when N is negative, add 2**M-1. */
2029 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
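/* Example (for illustration): DO_ASRD(-5, 2) adds (1 << 2) - 1 = 3 before
 * shifting, giving -2 >> 2 = -1, which matches -5 / 4 truncated toward
 * zero; a plain arithmetic shift would have produced -2.
 */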
2031 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
2033 if (likely(sh < 64)) {
2034 return (x >> sh) + ((x >> (sh - 1)) & 1);
2035 } else if (sh == 64) {
2036 return x >> 63;
2037 } else {
2038 return 0;
2042 static inline int64_t do_srshr(int64_t x, unsigned sh)
2044 if (likely(sh < 64)) {
2045 return (x >> sh) + ((x >> (sh - 1)) & 1);
2046 } else {
2047 /* Rounding the sign bit always produces 0. */
2048 return 0;
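/* Example (for illustration): do_urshr(7, 2) = (7 >> 2) + ((7 >> 1) & 1)
 * = 1 + 1 = 2, i.e. 7/4 rounded to nearest; the added term is the last
 * bit shifted out, so ties round upward.  do_srshr applies the same
 * rounding to signed values, e.g. do_srshr(-7, 2) = -2.
 */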
2052 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2053 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2054 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2055 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2057 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2058 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2059 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2060 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2062 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2063 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2064 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2065 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2067 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2068 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2069 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2070 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2072 /* SVE2 bitwise shift by immediate */
2073 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2074 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2075 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2076 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2078 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2079 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2080 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2081 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2083 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2084 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2085 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2086 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2088 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2089 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2090 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2091 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2093 #define do_suqrshl_b(n, m) \
2094 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2095 #define do_suqrshl_h(n, m) \
2096 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2097 #define do_suqrshl_s(n, m) \
2098 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2099 #define do_suqrshl_d(n, m) \
2100 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2102 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2103 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2104 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2105 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2107 #undef DO_ASRD
2108 #undef DO_ZPZI
2109 #undef DO_ZPZI_D
2111 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2112 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2114 intptr_t i, opr_sz = simd_oprsz(desc); \
2115 int shift = simd_data(desc); \
2116 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2117 TYPEW nn = *(TYPEW *)(vn + i); \
2118 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \
2122 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
2123 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2125 intptr_t i, opr_sz = simd_oprsz(desc); \
2126 int shift = simd_data(desc); \
2127 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2128 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2129 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \
2133 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2134 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2135 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2137 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2138 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2139 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
2141 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2142 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2143 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2145 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2146 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2147 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2149 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2150 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2151 #define DO_SQSHRUN_D(x, sh) \
2152 do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2154 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2155 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2156 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2158 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2159 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2160 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2162 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2163 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2164 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2166 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2167 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2168 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2170 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2171 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2172 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2174 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2175 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2176 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2178 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2179 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2180 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2182 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2183 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2184 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2186 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2187 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2188 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2190 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2191 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2192 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2194 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2195 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2196 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2198 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2199 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2200 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2202 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2203 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2204 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2206 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2207 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2208 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2210 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2211 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2212 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2214 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2215 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2216 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2218 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2219 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2220 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2222 #undef DO_SHRNB
2223 #undef DO_SHRNT
2225 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \
2226 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2228 intptr_t i, opr_sz = simd_oprsz(desc); \
2229 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2230 TYPEW nn = *(TYPEW *)(vn + i); \
2231 TYPEW mm = *(TYPEW *)(vm + i); \
2232 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \
2236 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \
2237 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2239 intptr_t i, opr_sz = simd_oprsz(desc); \
2240 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2241 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2242 TYPEW mm = *(TYPEW *)(vm + HW(i)); \
2243 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \
2247 #define DO_ADDHN(N, M, SH) ((N + M) >> SH)
2248 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2249 #define DO_SUBHN(N, M, SH) ((N - M) >> SH)
2250 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
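/* Example (for illustration): for 16-bit inputs N = 0x12b4, M = 0x0001
 * and SH = 8, DO_ADDHN yields 0x12b5 >> 8 = 0x12, while DO_RADDHN adds
 * the rounding constant 0x80 first and yields 0x1335 >> 8 = 0x13.
 */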
2252 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2253 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2254 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2256 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2257 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2258 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2260 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2261 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2262 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2264 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2265 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2266 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2268 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2269 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2270 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2272 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2273 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2274 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2276 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2277 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2278 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2280 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2281 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2282 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2284 #undef DO_RSUBHN
2285 #undef DO_SUBHN
2286 #undef DO_RADDHN
2287 #undef DO_ADDHN
2289 #undef DO_BINOPNB
2291 /* Fully general four-operand expander, controlled by a predicate.
2293 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
2294 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2295 void *vg, uint32_t desc) \
2297 intptr_t i, opr_sz = simd_oprsz(desc); \
2298 for (i = 0; i < opr_sz; ) { \
2299 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2300 do { \
2301 if (pg & 1) { \
2302 TYPE nn = *(TYPE *)(vn + H(i)); \
2303 TYPE mm = *(TYPE *)(vm + H(i)); \
2304 TYPE aa = *(TYPE *)(va + H(i)); \
2305 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
2307 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2308 } while (i & 15); \
2312 /* Similarly, specialized for 64-bit operands. */
2313 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
2314 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2315 void *vg, uint32_t desc) \
2317 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2318 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
2319 uint8_t *pg = vg; \
2320 for (i = 0; i < opr_sz; i += 1) { \
2321 if (pg[H1(i)] & 1) { \
2322 TYPE aa = a[i], nn = n[i], mm = m[i]; \
2323 d[i] = OP(aa, nn, mm); \
2328 #define DO_MLA(A, N, M) (A + N * M)
2329 #define DO_MLS(A, N, M) (A - N * M)
2331 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2332 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2334 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2335 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2337 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2338 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2340 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2341 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2343 #undef DO_MLA
2344 #undef DO_MLS
2345 #undef DO_ZPZZZ
2346 #undef DO_ZPZZZ_D
2348 void HELPER(sve_index_b)(void *vd, uint32_t start,
2349 uint32_t incr, uint32_t desc)
2351 intptr_t i, opr_sz = simd_oprsz(desc);
2352 uint8_t *d = vd;
2353 for (i = 0; i < opr_sz; i += 1) {
2354 d[H1(i)] = start + i * incr;
2358 void HELPER(sve_index_h)(void *vd, uint32_t start,
2359 uint32_t incr, uint32_t desc)
2361 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2362 uint16_t *d = vd;
2363 for (i = 0; i < opr_sz; i += 1) {
2364 d[H2(i)] = start + i * incr;
2368 void HELPER(sve_index_s)(void *vd, uint32_t start,
2369 uint32_t incr, uint32_t desc)
2371 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2372 uint32_t *d = vd;
2373 for (i = 0; i < opr_sz; i += 1) {
2374 d[H4(i)] = start + i * incr;
2378 void HELPER(sve_index_d)(void *vd, uint64_t start,
2379 uint64_t incr, uint32_t desc)
2381 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2382 uint64_t *d = vd;
2383 for (i = 0; i < opr_sz; i += 1) {
2384 d[i] = start + i * incr;
2388 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2390 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2391 uint32_t sh = simd_data(desc);
2392 uint32_t *d = vd, *n = vn, *m = vm;
2393 for (i = 0; i < opr_sz; i += 1) {
2394 d[i] = n[i] + (m[i] << sh);
2398 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2400 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2401 uint64_t sh = simd_data(desc);
2402 uint64_t *d = vd, *n = vn, *m = vm;
2403 for (i = 0; i < opr_sz; i += 1) {
2404 d[i] = n[i] + (m[i] << sh);
2408 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2410 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2411 uint64_t sh = simd_data(desc);
2412 uint64_t *d = vd, *n = vn, *m = vm;
2413 for (i = 0; i < opr_sz; i += 1) {
2414 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2418 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2420 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2421 uint64_t sh = simd_data(desc);
2422 uint64_t *d = vd, *n = vn, *m = vm;
2423 for (i = 0; i < opr_sz; i += 1) {
2424 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2428 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2430 /* These constants are copied directly from the ARM pseudocode. */
2431 static const uint16_t coeff[] = {
2432 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2433 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2434 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2435 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2437 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2438 uint16_t *d = vd, *n = vn;
2440 for (i = 0; i < opr_sz; i++) {
2441 uint16_t nn = n[i];
2442 intptr_t idx = extract32(nn, 0, 5);
2443 uint16_t exp = extract32(nn, 5, 5);
2444 d[i] = coeff[idx] | (exp << 10);
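/* Note: in the float16 format the fraction occupies bits [9:0] and the
 * exponent bits [14:10]; the table above appears to hold the fraction
 * bits of 2^(idx/32), so the constructed value is that fractional step
 * with the exponent taken directly from the input element.
 */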
2448 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2450 /* These constants are copied directly from the ARM pseudocode. */
2451 static const uint32_t coeff[] = {
2452 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2453 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2454 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2455 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2456 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2457 0x1ef532, 0x20b051, 0x227043, 0x243516,
2458 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2459 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2460 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2461 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2462 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2463 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2464 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2465 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2466 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2467 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2469 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2470 uint32_t *d = vd, *n = vn;
2472 for (i = 0; i < opr_sz; i++) {
2473 uint32_t nn = n[i];
2474 intptr_t idx = extract32(nn, 0, 6);
2475 uint32_t exp = extract32(nn, 6, 8);
2476 d[i] = coeff[idx] | (exp << 23);
2480 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2482 /* These constants are copied directly from the ARM pseudocode. */
2483 static const uint64_t coeff[] = {
2484 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2485 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2486 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2487 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2488 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2489 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2490 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2491 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2492 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2493 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2494 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2495 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2496 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2497 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2498 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2499 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2500 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2501 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2502 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2503 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2504 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2505 0xFA7C1819E90D8ull,
2507 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2508 uint64_t *d = vd, *n = vn;
2510 for (i = 0; i < opr_sz; i++) {
2511 uint64_t nn = n[i];
2512 intptr_t idx = extract32(nn, 0, 6);
2513 uint64_t exp = extract32(nn, 6, 11);
2514 d[i] = coeff[idx] | (exp << 52);
2518 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2520 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2521 uint16_t *d = vd, *n = vn, *m = vm;
2522 for (i = 0; i < opr_sz; i += 1) {
2523 uint16_t nn = n[i];
2524 uint16_t mm = m[i];
2525 if (mm & 1) {
2526 nn = float16_one;
2528 d[i] = nn ^ (mm & 2) << 14;
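/* Note: (mm & 2) << 14 targets bit 15, the float16 sign bit, so bit 1
 * of the control element negates the selected value; the _s and _d
 * variants below do the same with bits 31 and 63.
 */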
2532 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2534 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2535 uint32_t *d = vd, *n = vn, *m = vm;
2536 for (i = 0; i < opr_sz; i += 1) {
2537 uint32_t nn = n[i];
2538 uint32_t mm = m[i];
2539 if (mm & 1) {
2540 nn = float32_one;
2542 d[i] = nn ^ (mm & 2) << 30;
2546 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2548 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2549 uint64_t *d = vd, *n = vn, *m = vm;
2550 for (i = 0; i < opr_sz; i += 1) {
2551 uint64_t nn = n[i];
2552 uint64_t mm = m[i];
2553 if (mm & 1) {
2554 nn = float64_one;
2556 d[i] = nn ^ (mm & 2) << 62;
2561 * Signed saturating addition with scalar operand.
2564 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2566 intptr_t i, oprsz = simd_oprsz(desc);
2568 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2569 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2573 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2575 intptr_t i, oprsz = simd_oprsz(desc);
2577 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2578 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2582 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2584 intptr_t i, oprsz = simd_oprsz(desc);
2586 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2587 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2591 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2593 intptr_t i, oprsz = simd_oprsz(desc);
2595 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2596 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2601 * Unsigned saturating addition with scalar operand.
2604 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2606 intptr_t i, oprsz = simd_oprsz(desc);
2608 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2609 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2613 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2615 intptr_t i, oprsz = simd_oprsz(desc);
2617 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2618 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2622 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2624 intptr_t i, oprsz = simd_oprsz(desc);
2626 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2627 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2631 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2633 intptr_t i, oprsz = simd_oprsz(desc);
2635 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2636 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2640 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2642 intptr_t i, oprsz = simd_oprsz(desc);
2644 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2645 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2649 /* Two-operand predicated copy immediate with merge. All valid immediates
2650 * can fit within 17 signed bits in the simd_data field.
2652 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2653 uint64_t mm, uint32_t desc)
2655 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2656 uint64_t *d = vd, *n = vn;
2657 uint8_t *pg = vg;
2659 mm = dup_const(MO_8, mm);
2660 for (i = 0; i < opr_sz; i += 1) {
2661 uint64_t nn = n[i];
2662 uint64_t pp = expand_pred_b(pg[H1(i)]);
2663 d[i] = (mm & pp) | (nn & ~pp);
2667 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2668 uint64_t mm, uint32_t desc)
2670 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2671 uint64_t *d = vd, *n = vn;
2672 uint8_t *pg = vg;
2674 mm = dup_const(MO_16, mm);
2675 for (i = 0; i < opr_sz; i += 1) {
2676 uint64_t nn = n[i];
2677 uint64_t pp = expand_pred_h(pg[H1(i)]);
2678 d[i] = (mm & pp) | (nn & ~pp);
2682 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2683 uint64_t mm, uint32_t desc)
2685 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2686 uint64_t *d = vd, *n = vn;
2687 uint8_t *pg = vg;
2689 mm = dup_const(MO_32, mm);
2690 for (i = 0; i < opr_sz; i += 1) {
2691 uint64_t nn = n[i];
2692 uint64_t pp = expand_pred_s(pg[H1(i)]);
2693 d[i] = (mm & pp) | (nn & ~pp);
2697 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2698 uint64_t mm, uint32_t desc)
2700 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2701 uint64_t *d = vd, *n = vn;
2702 uint8_t *pg = vg;
2704 for (i = 0; i < opr_sz; i += 1) {
2705 uint64_t nn = n[i];
2706 d[i] = (pg[H1(i)] & 1 ? mm : nn);
2710 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2712 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2713 uint64_t *d = vd;
2714 uint8_t *pg = vg;
2716 val = dup_const(MO_8, val);
2717 for (i = 0; i < opr_sz; i += 1) {
2718 d[i] = val & expand_pred_b(pg[H1(i)]);
2722 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2724 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2725 uint64_t *d = vd;
2726 uint8_t *pg = vg;
2728 val = dup_const(MO_16, val);
2729 for (i = 0; i < opr_sz; i += 1) {
2730 d[i] = val & expand_pred_h(pg[H1(i)]);
2734 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2736 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2737 uint64_t *d = vd;
2738 uint8_t *pg = vg;
2740 val = dup_const(MO_32, val);
2741 for (i = 0; i < opr_sz; i += 1) {
2742 d[i] = val & expand_pred_s(pg[H1(i)]);
2746 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2748 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2749 uint64_t *d = vd;
2750 uint8_t *pg = vg;
2752 for (i = 0; i < opr_sz; i += 1) {
2753 d[i] = (pg[H1(i)] & 1 ? val : 0);
2757 /* Big-endian hosts need to frob the byte indices. If the copy
2758 * happens to be 8-byte aligned, then no frobbing is necessary.
2760 static void swap_memmove(void *vd, void *vs, size_t n)
2762 uintptr_t d = (uintptr_t)vd;
2763 uintptr_t s = (uintptr_t)vs;
2764 uintptr_t o = (d | s | n) & 7;
2765 size_t i;
2767 #if !HOST_BIG_ENDIAN
2768 o = 0;
2769 #endif
2770 switch (o) {
2771 case 0:
2772 memmove(vd, vs, n);
2773 break;
2775 case 4:
2776 if (d < s || d >= s + n) {
2777 for (i = 0; i < n; i += 4) {
2778 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2780 } else {
2781 for (i = n; i > 0; ) {
2782 i -= 4;
2783 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2786 break;
2788 case 2:
2789 case 6:
2790 if (d < s || d >= s + n) {
2791 for (i = 0; i < n; i += 2) {
2792 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2794 } else {
2795 for (i = n; i > 0; ) {
2796 i -= 2;
2797 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2800 break;
2802 default:
2803 if (d < s || d >= s + n) {
2804 for (i = 0; i < n; i++) {
2805 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2807 } else {
2808 for (i = n; i > 0; ) {
2809 i -= 1;
2810 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2813 break;
2817 /* Similarly for memset of 0. */
2818 static void swap_memzero(void *vd, size_t n)
2820 uintptr_t d = (uintptr_t)vd;
2821 uintptr_t o = (d | n) & 7;
2822 size_t i;
2824 /* Usually, the first bit of a predicate is set, so N is 0. */
2825 if (likely(n == 0)) {
2826 return;
2829 #if !HOST_BIG_ENDIAN
2830 o = 0;
2831 #endif
2832 switch (o) {
2833 case 0:
2834 memset(vd, 0, n);
2835 break;
2837 case 4:
2838 for (i = 0; i < n; i += 4) {
2839 *(uint32_t *)H1_4(d + i) = 0;
2841 break;
2843 case 2:
2844 case 6:
2845 for (i = 0; i < n; i += 2) {
2846 *(uint16_t *)H1_2(d + i) = 0;
2848 break;
2850 default:
2851 for (i = 0; i < n; i++) {
2852 *(uint8_t *)H1(d + i) = 0;
2854 break;
2858 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2860 intptr_t opr_sz = simd_oprsz(desc);
2861 size_t n_ofs = simd_data(desc);
2862 size_t n_siz = opr_sz - n_ofs;
2864 if (vd != vm) {
2865 swap_memmove(vd, vn + n_ofs, n_siz);
2866 swap_memmove(vd + n_siz, vm, n_ofs);
2867 } else if (vd != vn) {
2868 swap_memmove(vd + n_siz, vd, n_ofs);
2869 swap_memmove(vd, vn + n_ofs, n_siz);
2870 } else {
2871 /* vd == vn == vm. Need temp space. */
2872 ARMVectorReg tmp;
2873 swap_memmove(&tmp, vm, n_ofs);
2874 swap_memmove(vd, vd + n_ofs, n_siz);
2875 memcpy(vd + n_siz, &tmp, n_ofs);
2879 #define DO_INSR(NAME, TYPE, H) \
2880 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2882 intptr_t opr_sz = simd_oprsz(desc); \
2883 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
2884 *(TYPE *)(vd + H(0)) = val; \
2887 DO_INSR(sve_insr_b, uint8_t, H1)
2888 DO_INSR(sve_insr_h, uint16_t, H1_2)
2889 DO_INSR(sve_insr_s, uint32_t, H1_4)
2890 DO_INSR(sve_insr_d, uint64_t, H1_8)
2892 #undef DO_INSR
2894 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2896 intptr_t i, j, opr_sz = simd_oprsz(desc);
2897 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2898 uint64_t f = *(uint64_t *)(vn + i);
2899 uint64_t b = *(uint64_t *)(vn + j);
2900 *(uint64_t *)(vd + i) = bswap64(b);
2901 *(uint64_t *)(vd + j) = bswap64(f);
2905 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2907 intptr_t i, j, opr_sz = simd_oprsz(desc);
2908 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2909 uint64_t f = *(uint64_t *)(vn + i);
2910 uint64_t b = *(uint64_t *)(vn + j);
2911 *(uint64_t *)(vd + i) = hswap64(b);
2912 *(uint64_t *)(vd + j) = hswap64(f);
2916 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2918 intptr_t i, j, opr_sz = simd_oprsz(desc);
2919 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2920 uint64_t f = *(uint64_t *)(vn + i);
2921 uint64_t b = *(uint64_t *)(vn + j);
2922 *(uint64_t *)(vd + i) = rol64(b, 32);
2923 *(uint64_t *)(vd + j) = rol64(f, 32);
2927 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2929 intptr_t i, j, opr_sz = simd_oprsz(desc);
2930 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2931 uint64_t f = *(uint64_t *)(vn + i);
2932 uint64_t b = *(uint64_t *)(vn + j);
2933 *(uint64_t *)(vd + i) = b;
2934 *(uint64_t *)(vd + j) = f;
2938 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2940 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2941 bool is_tbx, tb_impl_fn *fn)
2943 ARMVectorReg scratch;
2944 uintptr_t oprsz = simd_oprsz(desc);
2946 if (unlikely(vd == vn)) {
2947 vn = memcpy(&scratch, vn, oprsz);
2950 fn(vd, vn, NULL, vm, oprsz, is_tbx);
2953 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
2954 uint32_t desc, bool is_tbx, tb_impl_fn *fn)
2956 ARMVectorReg scratch;
2957 uintptr_t oprsz = simd_oprsz(desc);
2959 if (unlikely(vd == vn0)) {
2960 vn0 = memcpy(&scratch, vn0, oprsz);
2961 if (vd == vn1) {
2962 vn1 = vn0;
2964 } else if (unlikely(vd == vn1)) {
2965 vn1 = memcpy(&scratch, vn1, oprsz);
2968 fn(vd, vn0, vn1, vm, oprsz, is_tbx);
2971 #define DO_TB(SUFF, TYPE, H) \
2972 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \
2973 void *vm, uintptr_t oprsz, bool is_tbx) \
2975 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \
2976 uintptr_t i, nelem = oprsz / sizeof(TYPE); \
2977 for (i = 0; i < nelem; ++i) { \
2978 TYPE index = indexes[H1(i)], val = 0; \
2979 if (index < nelem) { \
2980 val = tbl0[H(index)]; \
2981 } else { \
2982 index -= nelem; \
2983 if (tbl1 && index < nelem) { \
2984 val = tbl1[H(index)]; \
2985 } else if (is_tbx) { \
2986 continue; \
2989 d[H(i)] = val; \
2992 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
2994 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \
2996 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \
2997 void *vm, uint32_t desc) \
2999 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \
3001 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3003 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \
3006 DO_TB(b, uint8_t, H1)
3007 DO_TB(h, uint16_t, H2)
3008 DO_TB(s, uint32_t, H4)
3009 DO_TB(d, uint64_t, H8)
3011 #undef DO_TB
3013 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3014 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
3016 intptr_t i, opr_sz = simd_oprsz(desc); \
3017 TYPED *d = vd; \
3018 TYPES *n = vn; \
3019 ARMVectorReg tmp; \
3020 if (unlikely(vn - vd < opr_sz)) { \
3021 n = memcpy(&tmp, n, opr_sz / 2); \
3023 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
3024 d[HD(i)] = n[HS(i)]; \
3028 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3029 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3030 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3032 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3033 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3034 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3036 #undef DO_UNPK
3038 /* Mask of bits included in the even numbered predicates of width esz.
3039 * We also use this for expand_bits/compress_bits, and so extend the
3040 * same pattern out to 16-bit units.
3042 static const uint64_t even_bit_esz_masks[5] = {
3043 0x5555555555555555ull,
3044 0x3333333333333333ull,
3045 0x0f0f0f0f0f0f0f0full,
3046 0x00ff00ff00ff00ffull,
3047 0x0000ffff0000ffffull,
3050 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3051 * For N==0, this corresponds to the operation that in qemu/bitops.h
3052 * we call half_shuffle64; this algorithm is from Hacker's Delight,
3053 * section 7-2 Shuffling Bits.
3055 static uint64_t expand_bits(uint64_t x, int n)
3057 int i;
3059 x &= 0xffffffffu;
3060 for (i = 4; i >= n; i--) {
3061 int sh = 1 << i;
3062 x = ((x << sh) | x) & even_bit_esz_masks[i];
3064 return x;
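/* Example (for illustration): expand_bits(0b1011, 0) spreads each input
 * bit into a 2-bit unit, giving 0b01000101; expand_bits(x, 1) would
 * instead widen 2-bit units into 4-bit units.
 */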
3067 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3068 * For N==0, this corresponds to the operation that in qemu/bitops.h
3069 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3070 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3072 static uint64_t compress_bits(uint64_t x, int n)
3074 int i;
3076 for (i = n; i <= 4; i++) {
3077 int sh = 1 << i;
3078 x &= even_bit_esz_masks[i];
3079 x = (x >> sh) | x;
3081 return x & 0xffffffffu;
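/* Example (for illustration): compress_bits(0x45, 0) drops the odd bits
 * of 0b01000101 and packs the even ones back together, returning 0b1011,
 * i.e. the inverse of the expand_bits example above.
 */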
3084 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3086 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3087 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3088 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3089 int esize = 1 << esz;
3090 uint64_t *d = vd;
3091 intptr_t i;
3093 if (oprsz <= 8) {
3094 uint64_t nn = *(uint64_t *)vn;
3095 uint64_t mm = *(uint64_t *)vm;
3096 int half = 4 * oprsz;
3098 nn = extract64(nn, high * half, half);
3099 mm = extract64(mm, high * half, half);
3100 nn = expand_bits(nn, esz);
3101 mm = expand_bits(mm, esz);
3102 d[0] = nn | (mm << esize);
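/* Note: in this oprsz <= 8 case the whole predicate fits in one
 * uint64_t; HALF is half the predicate in bits (oprsz bytes cover
 * 8 * oprsz bits), and expand_bits interleaves the two selected
 * halves at the element granularity given by ESZ.
 */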
3103 } else {
3104 ARMPredicateReg tmp;
3106 /* We produce output faster than we consume input.
3107 Therefore we must be mindful of possible overlap. */
3108 if (vd == vn) {
3109 vn = memcpy(&tmp, vn, oprsz);
3110 if (vd == vm) {
3111 vm = vn;
3113 } else if (vd == vm) {
3114 vm = memcpy(&tmp, vm, oprsz);
3116 if (high) {
3117 high = oprsz >> 1;
3120 if ((oprsz & 7) == 0) {
3121 uint32_t *n = vn, *m = vm;
3122 high >>= 2;
3124 for (i = 0; i < oprsz / 8; i++) {
3125 uint64_t nn = n[H4(high + i)];
3126 uint64_t mm = m[H4(high + i)];
3128 nn = expand_bits(nn, esz);
3129 mm = expand_bits(mm, esz);
3130 d[i] = nn | (mm << esize);
3132 } else {
3133 uint8_t *n = vn, *m = vm;
3134 uint16_t *d16 = vd;
3136 for (i = 0; i < oprsz / 2; i++) {
3137 uint16_t nn = n[H1(high + i)];
3138 uint16_t mm = m[H1(high + i)];
3140 nn = expand_bits(nn, esz);
3141 mm = expand_bits(mm, esz);
3142 d16[H2(i)] = nn | (mm << esize);
3148 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3150 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3151 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3152 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3153 uint64_t *d = vd, *n = vn, *m = vm;
3154 uint64_t l, h;
3155 intptr_t i;
3157 if (oprsz <= 8) {
3158 l = compress_bits(n[0] >> odd, esz);
3159 h = compress_bits(m[0] >> odd, esz);
3160 d[0] = l | (h << (4 * oprsz));
3161 } else {
3162 ARMPredicateReg tmp_m;
3163 intptr_t oprsz_16 = oprsz / 16;
3165 if ((vm - vd) < (uintptr_t)oprsz) {
3166 m = memcpy(&tmp_m, vm, oprsz);
3169 for (i = 0; i < oprsz_16; i++) {
3170 l = n[2 * i + 0];
3171 h = n[2 * i + 1];
3172 l = compress_bits(l >> odd, esz);
3173 h = compress_bits(h >> odd, esz);
3174 d[i] = l | (h << 32);
3178 * For a VL that is not a multiple of 512, the results from M do not
3179 * align nicely with the uint64_t for D. Put the aligned results
3180 * from M into TMP_M and then copy it into place afterward.
3182 if (oprsz & 15) {
3183 int final_shift = (oprsz & 15) * 2;
3185 l = n[2 * i + 0];
3186 h = n[2 * i + 1];
3187 l = compress_bits(l >> odd, esz);
3188 h = compress_bits(h >> odd, esz);
3189 d[i] = l | (h << final_shift);
3191 for (i = 0; i < oprsz_16; i++) {
3192 l = m[2 * i + 0];
3193 h = m[2 * i + 1];
3194 l = compress_bits(l >> odd, esz);
3195 h = compress_bits(h >> odd, esz);
3196 tmp_m.p[i] = l | (h << 32);
3198 l = m[2 * i + 0];
3199 h = m[2 * i + 1];
3200 l = compress_bits(l >> odd, esz);
3201 h = compress_bits(h >> odd, esz);
3202 tmp_m.p[i] = l | (h << final_shift);
3204 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3205 } else {
3206 for (i = 0; i < oprsz_16; i++) {
3207 l = m[2 * i + 0];
3208 h = m[2 * i + 1];
3209 l = compress_bits(l >> odd, esz);
3210 h = compress_bits(h >> odd, esz);
3211 d[oprsz_16 + i] = l | (h << 32);
3217 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3219 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3220 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3221 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3222 uint64_t *d = vd, *n = vn, *m = vm;
3223 uint64_t mask;
3224 int shr, shl;
3225 intptr_t i;
3227 shl = 1 << esz;
3228 shr = 0;
3229 mask = even_bit_esz_masks[esz];
3230 if (odd) {
3231 mask <<= shl;
3232 shr = shl;
3233 shl = 0;
3236 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3237 uint64_t nn = (n[i] & mask) >> shr;
3238 uint64_t mm = (m[i] & mask) << shl;
3239 d[i] = nn + mm;
3243 /* Reverse units of 2**N bits. */
3244 static uint64_t reverse_bits_64(uint64_t x, int n)
3246 int i, sh;
3248 x = bswap64(x);
3249 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3250 uint64_t mask = even_bit_esz_masks[i];
3251 x = ((x & mask) << sh) | ((x >> sh) & mask);
3253 return x;
3256 static uint8_t reverse_bits_8(uint8_t x, int n)
3258 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3259 int i, sh;
3261 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3262 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3264 return x;
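/* Example (for illustration): reverse_bits_8(0b00011011, 1) reverses the
 * four 2-bit units, giving 0b11100100; with n == 0 a full bit reversal
 * is performed, and reverse_bits_64 with n == 3 reduces to bswap64.
 */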
3267 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3269 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3270 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3271 intptr_t i, oprsz_2 = oprsz / 2;
3273 if (oprsz <= 8) {
3274 uint64_t l = *(uint64_t *)vn;
3275 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3276 *(uint64_t *)vd = l;
3277 } else if ((oprsz & 15) == 0) {
3278 for (i = 0; i < oprsz_2; i += 8) {
3279 intptr_t ih = oprsz - 8 - i;
3280 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3281 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3282 *(uint64_t *)(vd + i) = h;
3283 *(uint64_t *)(vd + ih) = l;
3285 } else {
3286 for (i = 0; i < oprsz_2; i += 1) {
3287 intptr_t il = H1(i);
3288 intptr_t ih = H1(oprsz - 1 - i);
3289 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3290 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3291 *(uint8_t *)(vd + il) = h;
3292 *(uint8_t *)(vd + ih) = l;
3297 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3299 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3300 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3301 uint64_t *d = vd;
3302 intptr_t i;
3304 if (oprsz <= 8) {
3305 uint64_t nn = *(uint64_t *)vn;
3306 int half = 4 * oprsz;
3308 nn = extract64(nn, high * half, half);
3309 nn = expand_bits(nn, 0);
3310 d[0] = nn;
3311 } else {
3312 ARMPredicateReg tmp_n;
3314 /* We produce output faster than we consume input.
3315 Therefore we must be mindful of possible overlap. */
3316 if ((vn - vd) < (uintptr_t)oprsz) {
3317 vn = memcpy(&tmp_n, vn, oprsz);
3319 if (high) {
3320 high = oprsz >> 1;
3323 if ((oprsz & 7) == 0) {
3324 uint32_t *n = vn;
3325 high >>= 2;
3327 for (i = 0; i < oprsz / 8; i++) {
3328 uint64_t nn = n[H4(high + i)];
3329 d[i] = expand_bits(nn, 0);
3331 } else {
3332 uint16_t *d16 = vd;
3333 uint8_t *n = vn;
3335 for (i = 0; i < oprsz / 2; i++) {
3336 uint16_t nn = n[H1(high + i)];
3337 d16[H2(i)] = expand_bits(nn, 0);
3343 #define DO_ZIP(NAME, TYPE, H) \
3344 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3346 intptr_t oprsz = simd_oprsz(desc); \
3347 intptr_t odd_ofs = simd_data(desc); \
3348 intptr_t i, oprsz_2 = oprsz / 2; \
3349 ARMVectorReg tmp_n, tmp_m; \
3350 /* We produce output faster than we consume input. \
3351 Therefore we must be mindful of possible overlap. */ \
3352 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
3353 vn = memcpy(&tmp_n, vn, oprsz_2); \
3355 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3356 vm = memcpy(&tmp_m, vm, oprsz_2); \
3358 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3359 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3360 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \
3361 *(TYPE *)(vm + odd_ofs + H(i)); \
3363 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3364 memset(vd + oprsz - 16, 0, 16); \
3368 DO_ZIP(sve_zip_b, uint8_t, H1)
3369 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3370 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3371 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3372 DO_ZIP(sve2_zip_q, Int128, )
3374 #define DO_UZP(NAME, TYPE, H) \
3375 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3377 intptr_t oprsz = simd_oprsz(desc); \
3378 intptr_t odd_ofs = simd_data(desc); \
3379 intptr_t i, p; \
3380 ARMVectorReg tmp_m; \
3381 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3382 vm = memcpy(&tmp_m, vm, oprsz); \
3384 i = 0, p = odd_ofs; \
3385 do { \
3386 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \
3387 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
3388 } while (p < oprsz); \
3389 p -= oprsz; \
3390 do { \
3391 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \
3392 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
3393 } while (p < oprsz); \
3394 tcg_debug_assert(i == oprsz); \
3397 DO_UZP(sve_uzp_b, uint8_t, H1)
3398 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3399 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3400 DO_UZP(sve_uzp_d, uint64_t, H1_8)
3401 DO_UZP(sve2_uzp_q, Int128, )
3403 #define DO_TRN(NAME, TYPE, H) \
3404 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3406 intptr_t oprsz = simd_oprsz(desc); \
3407 intptr_t odd_ofs = simd_data(desc); \
3408 intptr_t i; \
3409 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
3410 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
3411 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
3412 *(TYPE *)(vd + H(i + 0)) = ae; \
3413 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
3415 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3416 memset(vd + oprsz - 16, 0, 16); \
3420 DO_TRN(sve_trn_b, uint8_t, H1)
3421 DO_TRN(sve_trn_h, uint16_t, H1_2)
3422 DO_TRN(sve_trn_s, uint32_t, H1_4)
3423 DO_TRN(sve_trn_d, uint64_t, H1_8)
3424 DO_TRN(sve2_trn_q, Int128, )
3426 #undef DO_ZIP
3427 #undef DO_UZP
3428 #undef DO_TRN
3430 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3432 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3433 uint32_t *d = vd, *n = vn;
3434 uint8_t *pg = vg;
3436 for (i = j = 0; i < opr_sz; i++) {
3437 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3438 d[H4(j)] = n[H4(i)];
3439 j++;
3442 for (; j < opr_sz; j++) {
3443 d[H4(j)] = 0;
3447 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3449 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3450 uint64_t *d = vd, *n = vn;
3451 uint8_t *pg = vg;
3453 for (i = j = 0; i < opr_sz; i++) {
3454 if (pg[H1(i)] & 1) {
3455 d[j] = n[i];
3456 j++;
3459 for (; j < opr_sz; j++) {
3460 d[j] = 0;
3464 /* Similar to the ARM LastActiveElement pseudocode function, except the
3465 * result is multiplied by the element size. This includes the not found
3466 * indication; e.g. not found for esz=3 is -8.
3468 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3470 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3471 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3473 return last_active_element(vg, words, esz);
3476 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3478 intptr_t opr_sz = simd_oprsz(desc) / 8;
3479 int esz = simd_data(desc);
3480 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3481 intptr_t i, first_i, last_i;
3482 ARMVectorReg tmp;
3484 first_i = last_i = 0;
3485 first_g = last_g = 0;
3487 /* Find the extent of the active elements within VG. */
3488 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3489 pg = *(uint64_t *)(vg + i) & mask;
3490 if (pg) {
3491 if (last_g == 0) {
3492 last_g = pg;
3493 last_i = i;
3495 first_g = pg;
3496 first_i = i;
3500 len = 0;
3501 if (first_g != 0) {
3502 first_i = first_i * 8 + ctz64(first_g);
3503 last_i = last_i * 8 + 63 - clz64(last_g);
3504 len = last_i - first_i + (1 << esz);
3505 if (vd == vm) {
3506 vm = memcpy(&tmp, vm, opr_sz * 8);
3508 swap_memmove(vd, vn + first_i, len);
3510 swap_memmove(vd + len, vm, opr_sz * 8 - len);
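/* A concrete illustration with 32-bit elements (esz == 2): if the first
 * active predicate bit is at byte offset 4 and the last at byte offset 12,
 * then first_i = 4, last_i = 12 and len = 12 - 4 + 4 = 12, so bytes 4..15
 * of Zn are moved to bytes 0..11 of Zd and the remainder of Zd is filled
 * from the start of Zm.  */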
3513 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3514 void *vg, uint32_t desc)
3516 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3517 uint64_t *d = vd, *n = vn, *m = vm;
3518 uint8_t *pg = vg;
3520 for (i = 0; i < opr_sz; i += 1) {
3521 uint64_t nn = n[i], mm = m[i];
3522 uint64_t pp = expand_pred_b(pg[H1(i)]);
3523 d[i] = (nn & pp) | (mm & ~pp);
3527 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3528 void *vg, uint32_t desc)
3530 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3531 uint64_t *d = vd, *n = vn, *m = vm;
3532 uint8_t *pg = vg;
3534 for (i = 0; i < opr_sz; i += 1) {
3535 uint64_t nn = n[i], mm = m[i];
3536 uint64_t pp = expand_pred_h(pg[H1(i)]);
3537 d[i] = (nn & pp) | (mm & ~pp);
3541 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3542 void *vg, uint32_t desc)
3544 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3545 uint64_t *d = vd, *n = vn, *m = vm;
3546 uint8_t *pg = vg;
3548 for (i = 0; i < opr_sz; i += 1) {
3549 uint64_t nn = n[i], mm = m[i];
3550 uint64_t pp = expand_pred_s(pg[H1(i)]);
3551 d[i] = (nn & pp) | (mm & ~pp);
3555 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3556 void *vg, uint32_t desc)
3558 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3559 uint64_t *d = vd, *n = vn, *m = vm;
3560 uint8_t *pg = vg;
3562 for (i = 0; i < opr_sz; i += 1) {
3563 uint64_t nn = n[i], mm = m[i];
3564 d[i] = (pg[H1(i)] & 1 ? nn : mm);
3568 /* Two operand comparison controlled by a predicate.
3569 * ??? It is very tempting to expand this inline
3570 * with x86 instructions, e.g.
3572 * vcmpeqw zm, zn, %ymm0
3573 * vpmovmskb %ymm0, %eax
3574 * and $0x5555, %eax
3575 * and pg, %eax
3577 * or even aarch64, e.g.
3579 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3580 * cmeq v0.8h, zn, zm
3581 * and v0.8h, v0.8h, mask
3582 * addv h0, v0.8h
3583 * and v0.8b, pg
3585 * However, coming up with an abstraction that allows vector inputs and
3586 * a scalar output, and also handles the byte-ordering of sub-uint64_t
3587 * scalar outputs, is tricky.
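/* Concretely, the expansion below packs one result bit per element: within
 * each 64-byte chunk, the element at vector byte offset B ends up with its
 * comparison result at bit B of OUT, so MASK keeps exactly the predicate bit
 * of each element's least significant byte -- 0x5555... for halfwords,
 * 0x1111... for words, 0x0101... for doublewords -- and the remaining
 * predicate bits are left zero.  */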
3589 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
3590 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3592 intptr_t opr_sz = simd_oprsz(desc); \
3593 uint32_t flags = PREDTEST_INIT; \
3594 intptr_t i = opr_sz; \
3595 do { \
3596 uint64_t out = 0, pg; \
3597 do { \
3598 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3599 TYPE nn = *(TYPE *)(vn + H(i)); \
3600 TYPE mm = *(TYPE *)(vm + H(i)); \
3601 out |= nn OP mm; \
3602 } while (i & 63); \
3603 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3604 out &= pg; \
3605 *(uint64_t *)(vd + (i >> 3)) = out; \
3606 flags = iter_predtest_bwd(out, pg, flags); \
3607 } while (i > 0); \
3608 return flags; \
3611 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3612 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3613 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3614 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3615 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3616 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3617 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3618 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3620 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
3621 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3622 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3623 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3625 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
3626 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3627 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3628 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3630 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
3631 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3632 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3633 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3635 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
3636 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3637 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3638 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3640 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
3641 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3642 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3643 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3645 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
3646 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3647 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3648 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3650 #undef DO_CMP_PPZZ_B
3651 #undef DO_CMP_PPZZ_H
3652 #undef DO_CMP_PPZZ_S
3653 #undef DO_CMP_PPZZ_D
3654 #undef DO_CMP_PPZZ
3656 /* Similar, but the second source is "wide". */
3657 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
3658 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3660 intptr_t opr_sz = simd_oprsz(desc); \
3661 uint32_t flags = PREDTEST_INIT; \
3662 intptr_t i = opr_sz; \
3663 do { \
3664 uint64_t out = 0, pg; \
3665 do { \
3666 TYPEW mm = *(TYPEW *)(vm + i - 8); \
3667 do { \
3668 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3669 TYPE nn = *(TYPE *)(vn + H(i)); \
3670 out |= nn OP mm; \
3671 } while (i & 7); \
3672 } while (i & 63); \
3673 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3674 out &= pg; \
3675 *(uint64_t *)(vd + (i >> 3)) = out; \
3676 flags = iter_predtest_bwd(out, pg, flags); \
3677 } while (i > 0); \
3678 return flags; \
3681 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3682 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
3683 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3684 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3685 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3686 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3688 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
3689 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3690 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3692 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
3693 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3694 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3696 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
3697 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
3698 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
3700 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
3701 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
3702 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
3704 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
3705 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3706 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3708 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
3709 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3710 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3712 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
3713 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
3714 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
3716 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
3717 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
3718 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
3720 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
3721 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3722 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3724 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
3725 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3726 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3728 #undef DO_CMP_PPZW_B
3729 #undef DO_CMP_PPZW_H
3730 #undef DO_CMP_PPZW_S
3731 #undef DO_CMP_PPZW
3733 /* Similar, but the second source is an immediate. */
3734 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
3735 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
3737 intptr_t opr_sz = simd_oprsz(desc); \
3738 uint32_t flags = PREDTEST_INIT; \
3739 TYPE mm = simd_data(desc); \
3740 intptr_t i = opr_sz; \
3741 do { \
3742 uint64_t out = 0, pg; \
3743 do { \
3744 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3745 TYPE nn = *(TYPE *)(vn + H(i)); \
3746 out |= nn OP mm; \
3747 } while (i & 63); \
3748 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3749 out &= pg; \
3750 *(uint64_t *)(vd + (i >> 3)) = out; \
3751 flags = iter_predtest_bwd(out, pg, flags); \
3752 } while (i > 0); \
3753 return flags; \
3756 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3757 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3758 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3759 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3760 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3761 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3762 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3763 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3765 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
3766 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3767 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3768 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3770 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
3771 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3772 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3773 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3775 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
3776 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3777 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3778 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3780 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
3781 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3782 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3783 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3785 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
3786 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3787 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3788 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3790 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
3791 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3792 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3793 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3795 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
3796 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3797 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3798 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3800 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
3801 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3802 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3803 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3805 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
3806 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3807 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3808 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3810 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
3811 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3812 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3813 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3815 #undef DO_CMP_PPZI_B
3816 #undef DO_CMP_PPZI_H
3817 #undef DO_CMP_PPZI_S
3818 #undef DO_CMP_PPZI_D
3819 #undef DO_CMP_PPZI
3821 /* Similar to the ARM LastActive pseudocode function. */
3822 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3824 intptr_t i;
3826 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3827 uint64_t pg = *(uint64_t *)(vg + i);
3828 if (pg) {
3829 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3832 return 0;
3835 /* Compute a mask into RETB that is true for all G, up to and including
3836 * (if after) or excluding (if !after) the first G & N.
3837 * Return true if a break was found.
3839 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3840 bool brk, bool after)
3842 uint64_t b;
3844 if (brk) {
3845 b = 0;
3846 } else if ((g & n) == 0) {
3847 /* For all G, no N are set; break not found. */
3848 b = g;
3849 } else {
3850 /* Break somewhere in N. Locate it. */
3851 b = g & n; /* guard true, pred true */
3852 b = b & -b; /* first such */
3853 if (after) {
3854 b = b | (b - 1); /* break after same */
3855 } else {
3856 b = b - 1; /* break before same */
3858 brk = true;
3861 *retb = b;
3862 return brk;
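/* For example, with BRK false, g == 0xff and n == 0x10 (the first guarded
 * true bit of N at bit 4): b = g & n = 0x10 isolates that bit, giving
 * 0x1f when AFTER (keep up to and including the break) and 0x0f when
 * !AFTER (keep strictly before it).  With (g & n) == 0 the whole guard is
 * propagated unchanged and no break is reported.  */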
3865 /* Compute a zeroing BRK. */
3866 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3867 intptr_t oprsz, bool after)
3869 bool brk = false;
3870 intptr_t i;
3872 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3873 uint64_t this_b, this_g = g[i];
3875 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3876 d[i] = this_b & this_g;
3880 /* Likewise, but also compute flags. */
3881 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3882 intptr_t oprsz, bool after)
3884 uint32_t flags = PREDTEST_INIT;
3885 bool brk = false;
3886 intptr_t i;
3888 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3889 uint64_t this_b, this_d, this_g = g[i];
3891 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3892 d[i] = this_d = this_b & this_g;
3893 flags = iter_predtest_fwd(this_d, this_g, flags);
3895 return flags;
3898 /* Compute a merging BRK. */
3899 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3900 intptr_t oprsz, bool after)
3902 bool brk = false;
3903 intptr_t i;
3905 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3906 uint64_t this_b, this_g = g[i];
3908 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3909 d[i] = (this_b & this_g) | (d[i] & ~this_g);
3913 /* Likewise, but also compute flags. */
3914 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3915 intptr_t oprsz, bool after)
3917 uint32_t flags = PREDTEST_INIT;
3918 bool brk = false;
3919 intptr_t i;
3921 for (i = 0; i < oprsz / 8; ++i) {
3922 uint64_t this_b, this_d = d[i], this_g = g[i];
3924 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3925 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3926 flags = iter_predtest_fwd(this_d, this_g, flags);
3928 return flags;
3931 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3933 /* It is quicker to zero the whole predicate than loop on OPRSZ.
3934 * The compiler should turn this into 4 64-bit integer stores.
3936 memset(d, 0, sizeof(ARMPredicateReg));
3937 return PREDTEST_INIT;
3940 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3941 uint32_t pred_desc)
3943 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3944 if (last_active_pred(vn, vg, oprsz)) {
3945 compute_brk_z(vd, vm, vg, oprsz, true);
3946 } else {
3947 do_zero(vd, oprsz);
3951 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
3952 uint32_t pred_desc)
3954 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3955 if (last_active_pred(vn, vg, oprsz)) {
3956 return compute_brks_z(vd, vm, vg, oprsz, true);
3957 } else {
3958 return do_zero(vd, oprsz);
3962 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
3963 uint32_t pred_desc)
3965 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3966 if (last_active_pred(vn, vg, oprsz)) {
3967 compute_brk_z(vd, vm, vg, oprsz, false);
3968 } else {
3969 do_zero(vd, oprsz);
3973 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
3974 uint32_t pred_desc)
3976 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3977 if (last_active_pred(vn, vg, oprsz)) {
3978 return compute_brks_z(vd, vm, vg, oprsz, false);
3979 } else {
3980 return do_zero(vd, oprsz);
3984 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3986 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3987 compute_brk_z(vd, vn, vg, oprsz, true);
3990 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3992 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3993 return compute_brks_z(vd, vn, vg, oprsz, true);
3996 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3998 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3999 compute_brk_z(vd, vn, vg, oprsz, false);
4002 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4004 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4005 return compute_brks_z(vd, vn, vg, oprsz, false);
4008 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4010 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4011 compute_brk_m(vd, vn, vg, oprsz, true);
4014 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4016 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4017 return compute_brks_m(vd, vn, vg, oprsz, true);
4020 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4022 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4023 compute_brk_m(vd, vn, vg, oprsz, false);
4026 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4028 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4029 return compute_brks_m(vd, vn, vg, oprsz, false);
4032 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4034 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4035 if (!last_active_pred(vn, vg, oprsz)) {
4036 do_zero(vd, oprsz);
4040 /* As if PredTest(Ones(PL), D, esz). */
4041 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4042 uint64_t esz_mask)
4044 uint32_t flags = PREDTEST_INIT;
4045 intptr_t i;
4047 for (i = 0; i < oprsz / 8; i++) {
4048 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4050 if (oprsz & 7) {
4051 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4052 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4054 return flags;
4057 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4059 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4060 if (last_active_pred(vn, vg, oprsz)) {
4061 return predtest_ones(vd, oprsz, -1);
4062 } else {
4063 return do_zero(vd, oprsz);
4067 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4069 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4070 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4071 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4072 intptr_t i;
4074 for (i = 0; i < words; ++i) {
4075 uint64_t t = n[i] & g[i] & mask;
4076 sum += ctpop64(t);
4078 return sum;
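/* E.g. for esz == 2 the mask is 0x1111111111111111, so each word element
 * whose governing bit is set in both Pn and Pg contributes exactly one set
 * bit to the ctpop64 above, and SUM is the number of active elements.  */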
4081 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4083 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4084 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4085 uint64_t esz_mask = pred_esz_masks[esz];
4086 ARMPredicateReg *d = vd;
4087 uint32_t flags;
4088 intptr_t i;
4090 /* Begin with a zero predicate register. */
4091 flags = do_zero(d, oprsz);
4092 if (count == 0) {
4093 return flags;
4096 /* Set all of the requested bits. */
4097 for (i = 0; i < count / 64; ++i) {
4098 d->p[i] = esz_mask;
4100 if (count & 63) {
4101 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4104 return predtest_ones(d, oprsz, esz_mask);
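/* COUNT is measured in predicate bits (one per vector byte): e.g. for
 * 32-bit elements (esz == 2), three active elements give count == 12, so
 * d->p[0] == MAKE_64BIT_MASK(0, 12) & esz_mask == 0x111.  */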
4107 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4109 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4110 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4111 uint64_t esz_mask = pred_esz_masks[esz];
4112 ARMPredicateReg *d = vd;
4113 intptr_t i, invcount, oprbits;
4114 uint64_t bits;
4116 if (count == 0) {
4117 return do_zero(d, oprsz);
4120 oprbits = oprsz * 8;
4121 tcg_debug_assert(count <= oprbits);
4123 bits = esz_mask;
4124 if (oprbits & 63) {
4125 bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4128 invcount = oprbits - count;
4129 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4130 d->p[i] = bits;
4131 bits = esz_mask;
4134 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4136 while (--i >= 0) {
4137 d->p[i] = 0;
4140 return predtest_ones(d, oprsz, esz_mask);
4143 /* Recursive pairwise reduction using the per-element function FUNC;
4144 * cf. the ARM ARM function ReducePredicated.
4146 * While it would be possible to write this without the DATA temporary,
4147 * it is much simpler to process the predicate register this way.
4148 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4149 * little to gain with a more complex non-recursive form.
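/* For example, with four elements the recursion computes
 * FUNC(FUNC(d0, d1), FUNC(d2, d3)).  Inactive lanes and the tail from
 * OPRSZ up to MAXSZ are filled with IDENT, so they cannot affect the
 * result, and MAXSZ can be a power-of-two padding of the vector length so
 * that the halving always divides evenly.  */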
4151 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
4152 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4154 if (n == 1) { \
4155 return *data; \
4156 } else { \
4157 uintptr_t half = n / 2; \
4158 TYPE lo = NAME##_reduce(data, status, half); \
4159 TYPE hi = NAME##_reduce(data + half, status, half); \
4160 return TYPE##_##FUNC(lo, hi, status); \
4163 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
4165 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
4166 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
4167 for (i = 0; i < oprsz; ) { \
4168 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4169 do { \
4170 TYPE nn = *(TYPE *)(vn + H(i)); \
4171 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
4172 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
4173 } while (i & 15); \
4175 for (; i < maxsz; i += sizeof(TYPE)) { \
4176 *(TYPE *)((void *)data + i) = IDENT; \
4178 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
4181 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
4182 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
4183 DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)
4185 /* Identity is floatN_default_nan, without the function call. */
4186 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
4187 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
4188 DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)
4190 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
4191 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
4192 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)
4194 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
4195 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
4196 DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)
4198 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
4199 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
4200 DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))
4202 #undef DO_REDUCE
4204 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4205 void *status, uint32_t desc)
4207 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4208 float16 result = nn;
4210 do {
4211 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4212 do {
4213 if (pg & 1) {
4214 float16 mm = *(float16 *)(vm + H1_2(i));
4215 result = float16_add(result, mm, status);
4217 i += sizeof(float16), pg >>= sizeof(float16);
4218 } while (i & 15);
4219 } while (i < opr_sz);
4221 return result;
4224 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4225 void *status, uint32_t desc)
4227 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4228 float32 result = nn;
4230 do {
4231 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4232 do {
4233 if (pg & 1) {
4234 float32 mm = *(float32 *)(vm + H1_2(i));
4235 result = float32_add(result, mm, status);
4237 i += sizeof(float32), pg >>= sizeof(float32);
4238 } while (i & 15);
4239 } while (i < opr_sz);
4241 return result;
4244 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4245 void *status, uint32_t desc)
4247 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4248 uint64_t *m = vm;
4249 uint8_t *pg = vg;
4251 for (i = 0; i < opr_sz; i++) {
4252 if (pg[H1(i)] & 1) {
4253 nn = float64_add(nn, m[i], status);
4257 return nn;
4260 /* Fully general three-operand expander, controlled by a predicate,
4261 * with the extra float_status parameter.
4263 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
4264 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4265 void *status, uint32_t desc) \
4267 intptr_t i = simd_oprsz(desc); \
4268 uint64_t *g = vg; \
4269 do { \
4270 uint64_t pg = g[(i - 1) >> 6]; \
4271 do { \
4272 i -= sizeof(TYPE); \
4273 if (likely((pg >> (i & 63)) & 1)) { \
4274 TYPE nn = *(TYPE *)(vn + H(i)); \
4275 TYPE mm = *(TYPE *)(vm + H(i)); \
4276 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4278 } while (i & 63); \
4279 } while (i != 0); \
4282 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4283 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4284 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
4286 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4287 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4288 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
4290 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4291 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4292 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
4294 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4295 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4296 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
4298 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4299 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4300 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
4302 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4303 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4304 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
4306 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4307 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4308 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
4310 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4311 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4312 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4314 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4316 return float16_abs(float16_sub(a, b, s));
4319 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4321 return float32_abs(float32_sub(a, b, s));
4324 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4326 return float64_abs(float64_sub(a, b, s));
4329 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4330 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4331 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4333 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4335 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4336 return float64_scalbn(a, b_int, s);
4339 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4340 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4341 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
4343 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4344 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4345 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
4347 #undef DO_ZPZZ_FP
4349 /* Three-operand expander, with one scalar operand, controlled by
4350 * a predicate, with the extra float_status parameter.
4352 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4353 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
4354 void *status, uint32_t desc) \
4356 intptr_t i = simd_oprsz(desc); \
4357 uint64_t *g = vg; \
4358 TYPE mm = scalar; \
4359 do { \
4360 uint64_t pg = g[(i - 1) >> 6]; \
4361 do { \
4362 i -= sizeof(TYPE); \
4363 if (likely((pg >> (i & 63)) & 1)) { \
4364 TYPE nn = *(TYPE *)(vn + H(i)); \
4365 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4367 } while (i & 63); \
4368 } while (i != 0); \
4371 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4372 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4373 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
4375 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4376 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4377 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
4379 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4380 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4381 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4383 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4385 return float16_sub(b, a, s);
4388 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4390 return float32_sub(b, a, s);
4393 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4395 return float64_sub(b, a, s);
4398 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4399 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4400 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
4402 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4403 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4404 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
4406 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4407 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4408 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
4410 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4411 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4412 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
4414 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4415 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4416 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4418 /* Fully general two-operand expander, controlled by a predicate,
4419 * with the extra float_status parameter.
4421 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
4422 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
4424 intptr_t i = simd_oprsz(desc); \
4425 uint64_t *g = vg; \
4426 do { \
4427 uint64_t pg = g[(i - 1) >> 6]; \
4428 do { \
4429 i -= sizeof(TYPE); \
4430 if (likely((pg >> (i & 63)) & 1)) { \
4431 TYPE nn = *(TYPE *)(vn + H(i)); \
4432 *(TYPE *)(vd + H(i)) = OP(nn, status); \
4434 } while (i & 63); \
4435 } while (i != 0); \
4438 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
4439 * FZ16. When converting from fp16, this affects flushing input denormals;
4440 * when converting to fp16, this affects flushing output denormals.
4442 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4444 bool save = get_flush_inputs_to_zero(fpst);
4445 float32 ret;
4447 set_flush_inputs_to_zero(false, fpst);
4448 ret = float16_to_float32(f, true, fpst);
4449 set_flush_inputs_to_zero(save, fpst);
4450 return ret;
4453 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4455 bool save = get_flush_inputs_to_zero(fpst);
4456 float64 ret;
4458 set_flush_inputs_to_zero(false, fpst);
4459 ret = float16_to_float64(f, true, fpst);
4460 set_flush_inputs_to_zero(save, fpst);
4461 return ret;
4464 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4466 bool save = get_flush_to_zero(fpst);
4467 float16 ret;
4469 set_flush_to_zero(false, fpst);
4470 ret = float32_to_float16(f, true, fpst);
4471 set_flush_to_zero(save, fpst);
4472 return ret;
4475 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4477 bool save = get_flush_to_zero(fpst);
4478 float16 ret;
4480 set_flush_to_zero(false, fpst);
4481 ret = float64_to_float16(f, true, fpst);
4482 set_flush_to_zero(save, fpst);
4483 return ret;
4486 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4488 if (float16_is_any_nan(f)) {
4489 float_raise(float_flag_invalid, s);
4490 return 0;
4492 return float16_to_int16_round_to_zero(f, s);
4495 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4497 if (float16_is_any_nan(f)) {
4498 float_raise(float_flag_invalid, s);
4499 return 0;
4501 return float16_to_int64_round_to_zero(f, s);
4504 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4506 if (float32_is_any_nan(f)) {
4507 float_raise(float_flag_invalid, s);
4508 return 0;
4510 return float32_to_int64_round_to_zero(f, s);
4513 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4515 if (float64_is_any_nan(f)) {
4516 float_raise(float_flag_invalid, s);
4517 return 0;
4519 return float64_to_int64_round_to_zero(f, s);
4522 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4524 if (float16_is_any_nan(f)) {
4525 float_raise(float_flag_invalid, s);
4526 return 0;
4528 return float16_to_uint16_round_to_zero(f, s);
4531 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4533 if (float16_is_any_nan(f)) {
4534 float_raise(float_flag_invalid, s);
4535 return 0;
4537 return float16_to_uint64_round_to_zero(f, s);
4540 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4542 if (float32_is_any_nan(f)) {
4543 float_raise(float_flag_invalid, s);
4544 return 0;
4546 return float32_to_uint64_round_to_zero(f, s);
4549 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4551 if (float64_is_any_nan(f)) {
4552 float_raise(float_flag_invalid, s);
4553 return 0;
4555 return float64_to_uint64_round_to_zero(f, s);
4558 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4559 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4560 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16)
4561 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4562 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4563 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4564 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4566 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4567 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4568 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4569 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4570 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4571 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4572 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4574 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4575 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4576 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4577 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4578 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4579 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4580 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4582 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4583 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4584 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4586 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4587 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4588 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4590 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4591 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4592 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4594 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4595 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4596 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
4598 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4599 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4600 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4601 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
4602 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
4603 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
4604 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
4606 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4607 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4608 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4609 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
4610 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
4611 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
4612 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4614 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
4616 /* Extract frac to the top of the uint32_t. */
4617 uint32_t frac = (uint32_t)a << (16 + 6);
4618 int16_t exp = extract32(a, 10, 5);
4620 if (unlikely(exp == 0)) {
4621 if (frac != 0) {
4622 if (!get_flush_inputs_to_zero(s)) {
4623 /* denormal: bias - fractional_zeros */
4624 return -15 - clz32(frac);
4626 /* flush to zero */
4627 float_raise(float_flag_input_denormal, s);
4629 } else if (unlikely(exp == 0x1f)) {
4630 if (frac == 0) {
4631 return INT16_MAX; /* infinity */
4633 } else {
4634 /* normal: exp - bias */
4635 return exp - 15;
4637 /* nan or zero */
4638 float_raise(float_flag_invalid, s);
4639 return INT16_MIN;
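/* Two spot checks of the above: float16 4.0 (0x4400) has exp == 17, giving
 * 17 - 15 == 2; the smallest subnormal 0x0001 has frac == 1 << 22, so
 * clz32(frac) == 9 and the result is -15 - 9 == -24, i.e. 2**-24.  */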
4642 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
4644 /* Extract frac to the top of the uint32_t. */
4645 uint32_t frac = a << 9;
4646 int32_t exp = extract32(a, 23, 8);
4648 if (unlikely(exp == 0)) {
4649 if (frac != 0) {
4650 if (!get_flush_inputs_to_zero(s)) {
4651 /* denormal: bias - fractional_zeros */
4652 return -127 - clz32(frac);
4654 /* flush to zero */
4655 float_raise(float_flag_input_denormal, s);
4657 } else if (unlikely(exp == 0xff)) {
4658 if (frac == 0) {
4659 return INT32_MAX; /* infinity */
4661 } else {
4662 /* normal: exp - bias */
4663 return exp - 127;
4665 /* nan or zero */
4666 float_raise(float_flag_invalid, s);
4667 return INT32_MIN;
4670 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
4672 /* Extract frac to the top of the uint64_t. */
4673 uint64_t frac = a << 12;
4674 int64_t exp = extract64(a, 52, 11);
4676 if (unlikely(exp == 0)) {
4677 if (frac != 0) {
4678 if (!get_flush_inputs_to_zero(s)) {
4679 /* denormal: bias - fractional_zeros */
4680 return -1023 - clz64(frac);
4682 /* flush to zero */
4683 float_raise(float_flag_input_denormal, s);
4685 } else if (unlikely(exp == 0x7ff)) {
4686 if (frac == 0) {
4687 return INT64_MAX; /* infinity */
4689 } else {
4690 /* normal: exp - bias */
4691 return exp - 1023;
4693 /* nan or zero */
4694 float_raise(float_flag_invalid, s);
4695 return INT64_MIN;
4698 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
4699 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
4700 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
4702 #undef DO_ZPZ_FP
4704 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4705 float_status *status, uint32_t desc,
4706 uint16_t neg1, uint16_t neg3)
4708 intptr_t i = simd_oprsz(desc);
4709 uint64_t *g = vg;
4711 do {
4712 uint64_t pg = g[(i - 1) >> 6];
4713 do {
4714 i -= 2;
4715 if (likely((pg >> (i & 63)) & 1)) {
4716 float16 e1, e2, e3, r;
4718 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4719 e2 = *(uint16_t *)(vm + H1_2(i));
4720 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4721 r = float16_muladd(e1, e2, e3, 0, status);
4722 *(uint16_t *)(vd + H1_2(i)) = r;
4724 } while (i & 63);
4725 } while (i != 0);
4728 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4729 void *vg, void *status, uint32_t desc)
4731 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
4734 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4735 void *vg, void *status, uint32_t desc)
4737 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
4740 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4741 void *vg, void *status, uint32_t desc)
4743 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
4746 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4747 void *vg, void *status, uint32_t desc)
4749 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
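/* NEG1 and NEG3 are xor masks for the fp16 sign bit: neg1 flips the sign of
 * the Zn element and neg3 the sign of the Za addend.  The four combinations
 * above therefore compute a + n * m, a - n * m, -(a + n * m) and n * m - a,
 * i.e. FMLA, FMLS, FNMLA and FNMLS.  */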
4752 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4753 float_status *status, uint32_t desc,
4754 uint32_t neg1, uint32_t neg3)
4756 intptr_t i = simd_oprsz(desc);
4757 uint64_t *g = vg;
4759 do {
4760 uint64_t pg = g[(i - 1) >> 6];
4761 do {
4762 i -= 4;
4763 if (likely((pg >> (i & 63)) & 1)) {
4764 float32 e1, e2, e3, r;
4766 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4767 e2 = *(uint32_t *)(vm + H1_4(i));
4768 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4769 r = float32_muladd(e1, e2, e3, 0, status);
4770 *(uint32_t *)(vd + H1_4(i)) = r;
4772 } while (i & 63);
4773 } while (i != 0);
4776 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4777 void *vg, void *status, uint32_t desc)
4779 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
4782 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4783 void *vg, void *status, uint32_t desc)
4785 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
4788 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4789 void *vg, void *status, uint32_t desc)
4791 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
4794 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4795 void *vg, void *status, uint32_t desc)
4797 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
4800 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4801 float_status *status, uint32_t desc,
4802 uint64_t neg1, uint64_t neg3)
4804 intptr_t i = simd_oprsz(desc);
4805 uint64_t *g = vg;
4807 do {
4808 uint64_t pg = g[(i - 1) >> 6];
4809 do {
4810 i -= 8;
4811 if (likely((pg >> (i & 63)) & 1)) {
4812 float64 e1, e2, e3, r;
4814 e1 = *(uint64_t *)(vn + i) ^ neg1;
4815 e2 = *(uint64_t *)(vm + i);
4816 e3 = *(uint64_t *)(va + i) ^ neg3;
4817 r = float64_muladd(e1, e2, e3, 0, status);
4818 *(uint64_t *)(vd + i) = r;
4820 } while (i & 63);
4821 } while (i != 0);
4824 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4825 void *vg, void *status, uint32_t desc)
4827 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
4830 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4831 void *vg, void *status, uint32_t desc)
4833 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
4836 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4837 void *vg, void *status, uint32_t desc)
4839 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
4842 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4843 void *vg, void *status, uint32_t desc)
4845 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
4848 /* Two operand floating-point comparison controlled by a predicate.
4849 * Unlike the integer version, we are not allowed to optimistically
4850 * compare operands, since the comparison may have side effects with
4851 * respect to the FPSR.
4853 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
4854 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4855 void *status, uint32_t desc) \
4857 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4858 uint64_t *d = vd, *g = vg; \
4859 do { \
4860 uint64_t out = 0, pg = g[j]; \
4861 do { \
4862 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4863 if (likely((pg >> (i & 63)) & 1)) { \
4864 TYPE nn = *(TYPE *)(vn + H(i)); \
4865 TYPE mm = *(TYPE *)(vm + H(i)); \
4866 out |= OP(TYPE, nn, mm, status); \
4868 } while (i & 63); \
4869 d[j--] = out; \
4870 } while (i > 0); \
4873 #define DO_FPCMP_PPZZ_H(NAME, OP) \
4874 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
4875 #define DO_FPCMP_PPZZ_S(NAME, OP) \
4876 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
4877 #define DO_FPCMP_PPZZ_D(NAME, OP) \
4878 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
4880 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
4881 DO_FPCMP_PPZZ_H(NAME, OP) \
4882 DO_FPCMP_PPZZ_S(NAME, OP) \
4883 DO_FPCMP_PPZZ_D(NAME, OP)
4885 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
4886 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
4887 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
4888 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
4889 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
4890 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
4891 #define DO_FCMUO(TYPE, X, Y, ST) \
4892 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
4893 #define DO_FACGE(TYPE, X, Y, ST) \
4894 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
4895 #define DO_FACGT(TYPE, X, Y, ST) \
4896 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
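/* Note that FCMGE/FCMGT/FCMLE/FCMLT and FACGE/FACGT use the signalling
 * compare, which raises Invalid for any NaN operand, while FCMEQ/FCMNE/
 * FCMUO use the quiet compare.  E.g. DO_FCMGE tests compare(Y, X) <= 0,
 * i.e. X >= Y, and DO_FACGE compares the operands' absolute values.  */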
4898 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
4899 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
4900 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
4901 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
4902 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
4903 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
4904 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
4906 #undef DO_FPCMP_PPZZ_ALL
4907 #undef DO_FPCMP_PPZZ_D
4908 #undef DO_FPCMP_PPZZ_S
4909 #undef DO_FPCMP_PPZZ_H
4910 #undef DO_FPCMP_PPZZ
4912 /* One operand floating-point comparison against zero, controlled
4913 * by a predicate.
4915 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
4916 void HELPER(NAME)(void *vd, void *vn, void *vg, \
4917 void *status, uint32_t desc) \
4919 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4920 uint64_t *d = vd, *g = vg; \
4921 do { \
4922 uint64_t out = 0, pg = g[j]; \
4923 do { \
4924 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4925 if ((pg >> (i & 63)) & 1) { \
4926 TYPE nn = *(TYPE *)(vn + H(i)); \
4927 out |= OP(TYPE, nn, 0, status); \
4929 } while (i & 63); \
4930 d[j--] = out; \
4931 } while (i > 0); \
4934 #define DO_FPCMP_PPZ0_H(NAME, OP) \
4935 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
4936 #define DO_FPCMP_PPZ0_S(NAME, OP) \
4937 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
4938 #define DO_FPCMP_PPZ0_D(NAME, OP) \
4939 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
4941 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
4942 DO_FPCMP_PPZ0_H(NAME, OP) \
4943 DO_FPCMP_PPZ0_S(NAME, OP) \
4944 DO_FPCMP_PPZ0_D(NAME, OP)
4946 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
4947 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
4948 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
4949 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
4950 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
4951 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4953 /* FP Trig Multiply-Add. */
4955 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4957 static const float16 coeff[16] = {
4958 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4959 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4961 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
4962 intptr_t x = simd_data(desc);
4963 float16 *d = vd, *n = vn, *m = vm;
4964 for (i = 0; i < opr_sz; i++) {
4965 float16 mm = m[i];
4966 intptr_t xx = x;
4967 if (float16_is_neg(mm)) {
4968 mm = float16_abs(mm);
4969 xx += 8;
4971 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
4975 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4977 static const float32 coeff[16] = {
4978 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
4979 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
4980 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
4981 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
4983 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
4984 intptr_t x = simd_data(desc);
4985 float32 *d = vd, *n = vn, *m = vm;
4986 for (i = 0; i < opr_sz; i++) {
4987 float32 mm = m[i];
4988 intptr_t xx = x;
4989 if (float32_is_neg(mm)) {
4990 mm = float32_abs(mm);
4991 xx += 8;
4993 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
4997 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4999 static const float64 coeff[16] = {
5000 0x3ff0000000000000ull, 0xbfc5555555555543ull,
5001 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5002 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5003 0x3de5d8408868552full, 0x0000000000000000ull,
5004 0x3ff0000000000000ull, 0xbfe0000000000000ull,
5005 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5006 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5007 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5009 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5010 intptr_t x = simd_data(desc);
5011 float64 *d = vd, *n = vn, *m = vm;
5012 for (i = 0; i < opr_sz; i++) {
5013 float64 mm = m[i];
5014 intptr_t xx = x;
5015 if (float64_is_neg(mm)) {
5016 mm = float64_abs(mm);
5017 xx += 8;
5019 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
5024 * FP Complex Add
5027 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5028 void *vs, uint32_t desc)
5030 intptr_t j, i = simd_oprsz(desc);
5031 uint64_t *g = vg;
5032 float16 neg_imag = float16_set_sign(0, simd_data(desc));
5033 float16 neg_real = float16_chs(neg_imag);
5035 do {
5036 uint64_t pg = g[(i - 1) >> 6];
5037 do {
5038 float16 e0, e1, e2, e3;
5040 /* I holds the real index; J holds the imag index. */
5041 j = i - sizeof(float16);
5042 i -= 2 * sizeof(float16);
5044 e0 = *(float16 *)(vn + H1_2(i));
5045 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
5046 e2 = *(float16 *)(vn + H1_2(j));
5047 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
5049 if (likely((pg >> (i & 63)) & 1)) {
5050 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
5052 if (likely((pg >> (j & 63)) & 1)) {
5053 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
5055 } while (i & 63);
5056 } while (i != 0);
5059 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5060 void *vs, uint32_t desc)
5062 intptr_t j, i = simd_oprsz(desc);
5063 uint64_t *g = vg;
5064 float32 neg_imag = float32_set_sign(0, simd_data(desc));
5065 float32 neg_real = float32_chs(neg_imag);
5067 do {
5068 uint64_t pg = g[(i - 1) >> 6];
5069 do {
5070 float32 e0, e1, e2, e3;
5072 /* I holds the real index; J holds the imag index. */
5073 j = i - sizeof(float32);
5074 i -= 2 * sizeof(float32);
5076 e0 = *(float32 *)(vn + H1_2(i));
5077 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
5078 e2 = *(float32 *)(vn + H1_2(j));
5079 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
5081 if (likely((pg >> (i & 63)) & 1)) {
5082 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
5084 if (likely((pg >> (j & 63)) & 1)) {
5085 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
5087 } while (i & 63);
5088 } while (i != 0);
5091 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5092 void *vs, uint32_t desc)
5094 intptr_t j, i = simd_oprsz(desc);
5095 uint64_t *g = vg;
5096 float64 neg_imag = float64_set_sign(0, simd_data(desc));
5097 float64 neg_real = float64_chs(neg_imag);
5099 do {
5100 uint64_t pg = g[(i - 1) >> 6];
5101 do {
5102 float64 e0, e1, e2, e3;
5104 /* I holds the real index; J holds the imag index. */
5105 j = i - sizeof(float64);
5106 i -= 2 * sizeof(float64);
5108 e0 = *(float64 *)(vn + H1_2(i));
5109 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
5110 e2 = *(float64 *)(vn + H1_2(j));
5111 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
5113 if (likely((pg >> (i & 63)) & 1)) {
5114 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
5116 if (likely((pg >> (j & 63)) & 1)) {
5117 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
5119 } while (i & 63);
5120 } while (i != 0);
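/* In the three helpers above, xoring with a minus-zero pattern flips the
 * sign of the other operand's real or imaginary part.  With
 * simd_data(desc) == 0 each pair of adds computes d_re = n_re - m_im and
 * d_im = n_im + m_re (Zm rotated by 90 degrees); with == 1 it computes
 * d_re = n_re + m_im and d_im = n_im - m_re (270 degrees), as the
 * neg_real/neg_imag sign choices imply.  */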
5124 * FP Complex Multiply
5127 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5128 void *vg, void *status, uint32_t desc)
5130 intptr_t j, i = simd_oprsz(desc);
5131 unsigned rot = simd_data(desc);
5132 bool flip = rot & 1;
5133 float16 neg_imag, neg_real;
5134 uint64_t *g = vg;
5136 neg_imag = float16_set_sign(0, (rot & 2) != 0);
5137 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
5139 do {
5140 uint64_t pg = g[(i - 1) >> 6];
5141 do {
5142 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5144 /* I holds the real index; J holds the imag index. */
5145 j = i - sizeof(float16);
5146 i -= 2 * sizeof(float16);
5148 nr = *(float16 *)(vn + H1_2(i));
5149 ni = *(float16 *)(vn + H1_2(j));
5150 mr = *(float16 *)(vm + H1_2(i));
5151 mi = *(float16 *)(vm + H1_2(j));
5153 e2 = (flip ? ni : nr);
5154 e1 = (flip ? mi : mr) ^ neg_real;
5155 e4 = e2;
5156 e3 = (flip ? mr : mi) ^ neg_imag;
5158 if (likely((pg >> (i & 63)) & 1)) {
5159 d = *(float16 *)(va + H1_2(i));
5160 d = float16_muladd(e2, e1, d, 0, status);
5161 *(float16 *)(vd + H1_2(i)) = d;
5163 if (likely((pg >> (j & 63)) & 1)) {
5164 d = *(float16 *)(va + H1_2(j));
5165 d = float16_muladd(e4, e3, d, 0, status);
5166 *(float16 *)(vd + H1_2(j)) = d;
5168 } while (i & 63);
5169 } while (i != 0);
5172 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5173 void *vg, void *status, uint32_t desc)
5175 intptr_t j, i = simd_oprsz(desc);
5176 unsigned rot = simd_data(desc);
5177 bool flip = rot & 1;
5178 float32 neg_imag, neg_real;
5179 uint64_t *g = vg;
5181 neg_imag = float32_set_sign(0, (rot & 2) != 0);
5182 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
5184 do {
5185 uint64_t pg = g[(i - 1) >> 6];
5186 do {
5187 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5189 /* I holds the real index; J holds the imag index. */
5190 j = i - sizeof(float32);
5191 i -= 2 * sizeof(float32);
5193 nr = *(float32 *)(vn + H1_2(i));
5194 ni = *(float32 *)(vn + H1_2(j));
5195 mr = *(float32 *)(vm + H1_2(i));
5196 mi = *(float32 *)(vm + H1_2(j));
5198 e2 = (flip ? ni : nr);
5199 e1 = (flip ? mi : mr) ^ neg_real;
5200 e4 = e2;
5201 e3 = (flip ? mr : mi) ^ neg_imag;
5203 if (likely((pg >> (i & 63)) & 1)) {
5204 d = *(float32 *)(va + H1_2(i));
5205 d = float32_muladd(e2, e1, d, 0, status);
5206 *(float32 *)(vd + H1_2(i)) = d;
5208 if (likely((pg >> (j & 63)) & 1)) {
5209 d = *(float32 *)(va + H1_2(j));
5210 d = float32_muladd(e4, e3, d, 0, status);
5211 *(float32 *)(vd + H1_2(j)) = d;
5213 } while (i & 63);
5214 } while (i != 0);
5217 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5218 void *vg, void *status, uint32_t desc)
5220 intptr_t j, i = simd_oprsz(desc);
5221 unsigned rot = simd_data(desc);
5222 bool flip = rot & 1;
5223 float64 neg_imag, neg_real;
5224 uint64_t *g = vg;
5226 neg_imag = float64_set_sign(0, (rot & 2) != 0);
5227 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
5229 do {
5230 uint64_t pg = g[(i - 1) >> 6];
5231 do {
5232 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5234 /* I holds the real index; J holds the imag index. */
5235 j = i - sizeof(float64);
5236 i -= 2 * sizeof(float64);
5238 nr = *(float64 *)(vn + H1_2(i));
5239 ni = *(float64 *)(vn + H1_2(j));
5240 mr = *(float64 *)(vm + H1_2(i));
5241 mi = *(float64 *)(vm + H1_2(j));
5243 e2 = (flip ? ni : nr);
5244 e1 = (flip ? mi : mr) ^ neg_real;
5245 e4 = e2;
5246 e3 = (flip ? mr : mi) ^ neg_imag;
5248 if (likely((pg >> (i & 63)) & 1)) {
5249 d = *(float64 *)(va + H1_2(i));
5250 d = float64_muladd(e2, e1, d, 0, status);
5251 *(float64 *)(vd + H1_2(i)) = d;
5253 if (likely((pg >> (j & 63)) & 1)) {
5254 d = *(float64 *)(va + H1_2(j));
5255 d = float64_muladd(e4, e3, d, 0, status);
5256 *(float64 *)(vd + H1_2(j)) = d;
5258 } while (i & 63);
5259 } while (i != 0);
5263 * Load contiguous data, protected by a governing predicate.
5267 * Skip through a sequence of inactive elements in the guarding predicate @vg,
5268 * beginning at @reg_off, bounded by @reg_max. Return the offset of the first
5269 * active element >= @reg_off, or @reg_max if there are no active elements at all.
5271 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5272 intptr_t reg_max, int esz)
5274 uint64_t pg_mask = pred_esz_masks[esz];
5275 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5277 /* In normal usage, the first element is active. */
5278 if (likely(pg & 1)) {
5279 return reg_off;
5282 if (pg == 0) {
5283 reg_off &= -64;
5284 do {
5285 reg_off += 64;
5286 if (unlikely(reg_off >= reg_max)) {
5287 /* The entire predicate was false. */
5288 return reg_max;
5290 pg = vg[reg_off >> 6] & pg_mask;
5291 } while (pg == 0);
5293 reg_off += ctz64(pg);
5295 /* We should never see an out of range predicate bit set. */
5296 tcg_debug_assert(reg_off < reg_max);
5297 return reg_off;
5301 * Resolve the guest virtual address to info->host and info->flags.
5302 * If @nofault, return false if the page is invalid, otherwise
5303 * exit via a page fault exception.
5306 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
5307 target_ulong addr, int mem_off, MMUAccessType access_type,
5308 int mmu_idx, uintptr_t retaddr)
5310 int flags;
5312 addr += mem_off;
5315 * User-only currently always issues accesses with TBI enabled. See the comment
5316 * above useronly_clean_ptr. Usually we clean this top byte away
5317 * during translation, but we can't do that for e.g. vector + imm
5318 * addressing modes.
5320 * We currently always enable TBI for user-only, and do not provide
5321 * a way to turn it off. So clean the pointer unconditionally here,
5322 * rather than look it up here, or pass it down from above.
5324 addr = useronly_clean_ptr(addr);
5326 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
5327 &info->host, retaddr);
5328 info->flags = flags;
5330 if (flags & TLB_INVALID_MASK) {
5331 g_assert(nofault);
5332 return false;
5335 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5336 info->host -= mem_off;
5338 #ifdef CONFIG_USER_ONLY
5339 memset(&info->attrs, 0, sizeof(info->attrs));
5340 #else
5342 * Find the iotlbentry for addr and return the transaction attributes.
5343 * This *must* be present in the TLB because we just found the mapping.
5346 uintptr_t index = tlb_index(env, mmu_idx, addr);
5348 # ifdef CONFIG_DEBUG_TCG
5349 CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
5350 target_ulong comparator = (access_type == MMU_DATA_LOAD
5351 ? entry->addr_read
5352 : tlb_addr_write(entry));
5353 g_assert(tlb_hit(comparator, addr));
5354 # endif
5356 CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
5357 info->attrs = iotlbentry->attrs;
5359 #endif
5361 return true;
5365 * Find first active element on each page, and a loose bound for the
5366 * final element on each page. Identify any single element that spans
5367 * the page boundary. Return true if there are any active elements.
5369 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
5370 intptr_t reg_max, int esz, int msize)
5372 const int esize = 1 << esz;
5373 const uint64_t pg_mask = pred_esz_masks[esz];
5374 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5375 intptr_t mem_off_last, mem_off_split;
5376 intptr_t page_split, elt_split;
5377 intptr_t i;
5379 /* Set all of the element indices to -1, and the TLB data to 0. */
5380 memset(info, -1, offsetof(SVEContLdSt, page));
5381 memset(info->page, 0, sizeof(info->page));
5383 /* Gross scan over the entire predicate to find bounds. */
5384 i = 0;
5385 do {
5386 uint64_t pg = vg[i] & pg_mask;
5387 if (pg) {
5388 reg_off_last = i * 64 + 63 - clz64(pg);
5389 if (reg_off_first < 0) {
5390 reg_off_first = i * 64 + ctz64(pg);
5393 } while (++i * 64 < reg_max);
5395 if (unlikely(reg_off_first < 0)) {
5396 /* No active elements, no pages touched. */
5397 return false;
5399 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5401 info->reg_off_first[0] = reg_off_first;
5402 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5403 mem_off_last = (reg_off_last >> esz) * msize;
5405 page_split = -(addr | TARGET_PAGE_MASK);
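/*
 * -(addr | TARGET_PAGE_MASK) is the number of bytes from addr to the end
 * of its page: e.g. with 4KiB pages and addr ending in 0xff0, page_split
 * is 0x10, so only a 16-byte span remains on the first page.
 */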
5406 if (likely(mem_off_last + msize <= page_split)) {
5407 /* The entire operation fits within a single page. */
5408 info->reg_off_last[0] = reg_off_last;
5409 return true;
5412 info->page_split = page_split;
5413 elt_split = page_split / msize;
5414 reg_off_split = elt_split << esz;
5415 mem_off_split = elt_split * msize;
5418 * This is the last full element on the first page, but it is not
5419 * necessarily active. If there is no full element, i.e. the first
5420 * active element is the one that's split, this value remains -1.
5421 * It is useful as an iteration bound.
5423 if (elt_split != 0) {
5424 info->reg_off_last[0] = reg_off_split - esize;
5427 /* Determine if an unaligned element spans the pages. */
5428 if (page_split % msize != 0) {
5429 /* It is helpful to know if the split element is active. */
5430 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5431 info->reg_off_split = reg_off_split;
5432 info->mem_off_split = mem_off_split;
5434 if (reg_off_split == reg_off_last) {
5435 /* The page crossing element is last. */
5436 return true;
5439 reg_off_split += esize;
5440 mem_off_split += msize;
5444 * We do want the first active element on the second page, because
5445 * this may affect the address reported in an exception.
5447 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5448 tcg_debug_assert(reg_off_split <= reg_off_last);
5449 info->reg_off_first[1] = reg_off_split;
5450 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5451 info->reg_off_last[1] = reg_off_last;
5452 return true;
5456 * Resolve the guest virtual addresses to info->page[].
5457 * Control the generation of page faults with @fault. Return false if
5458 * there is no work to do, which can only happen with @fault == FAULT_NO.
5460 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5461 CPUARMState *env, target_ulong addr,
5462 MMUAccessType access_type, uintptr_t retaddr)
5464 int mmu_idx = cpu_mmu_index(env, false);
5465 int mem_off = info->mem_off_first[0];
5466 bool nofault = fault == FAULT_NO;
5467 bool have_work = true;
5469 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5470 access_type, mmu_idx, retaddr)) {
5471 /* No work to be done. */
5472 return false;
5475 if (likely(info->page_split < 0)) {
5476 /* The entire operation was on the one page. */
5477 return true;
5481 * If the second page is invalid, then we want the fault address to be
5482 * the first byte on that page which is accessed.
5484 if (info->mem_off_split >= 0) {
5486 * There is an element split across the pages. The fault address
5487 * should be the first byte of the second page.
5489 mem_off = info->page_split;
5491 * If the split element is also the first active element
5492 * of the vector, then: For first-fault we should continue
5493 * to generate faults for the second page. For no-fault,
5494 * we have work only if the second page is valid.
5496 if (info->mem_off_first[0] < info->mem_off_split) {
5497 nofault = FAULT_FIRST;
5498 have_work = false;
5500 } else {
5502 * There is no element split across the pages. The fault address
5503 * should be the first active element on the second page.
5505 mem_off = info->mem_off_first[1];
5507 * There must have been one active element on the first page,
5508 * so we're out of first-fault territory.
5510 nofault = fault != FAULT_ALL;
5513 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5514 access_type, mmu_idx, retaddr);
5515 return have_work;
5518 #ifndef CONFIG_USER_ONLY
5519 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5520 uint64_t *vg, target_ulong addr,
5521 int esize, int msize, int wp_access,
5522 uintptr_t retaddr)
5524 intptr_t mem_off, reg_off, reg_last;
5525 int flags0 = info->page[0].flags;
5526 int flags1 = info->page[1].flags;
5528 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5529 return;
5532 /* Indicate that watchpoints are handled. */
5533 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5534 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5536 if (flags0 & TLB_WATCHPOINT) {
5537 mem_off = info->mem_off_first[0];
5538 reg_off = info->reg_off_first[0];
5539 reg_last = info->reg_off_last[0];
5541 while (reg_off <= reg_last) {
5542 uint64_t pg = vg[reg_off >> 6];
5543 do {
5544 if ((pg >> (reg_off & 63)) & 1) {
5545 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5546 msize, info->page[0].attrs,
5547 wp_access, retaddr);
5549 reg_off += esize;
5550 mem_off += msize;
5551 } while (reg_off <= reg_last && (reg_off & 63));
5555 mem_off = info->mem_off_split;
5556 if (mem_off >= 0) {
5557 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5558 info->page[0].attrs, wp_access, retaddr);
5561 mem_off = info->mem_off_first[1];
5562 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5563 reg_off = info->reg_off_first[1];
5564 reg_last = info->reg_off_last[1];
5566 do {
5567 uint64_t pg = vg[reg_off >> 6];
5568 do {
5569 if ((pg >> (reg_off & 63)) & 1) {
5570 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5571 msize, info->page[1].attrs,
5572 wp_access, retaddr);
5574 reg_off += esize;
5575 mem_off += msize;
5576 } while (reg_off & 63);
5577 } while (reg_off <= reg_last);
5580 #endif
5582 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5583 uint64_t *vg, target_ulong addr, int esize,
5584 int msize, uint32_t mtedesc, uintptr_t ra)
5586 intptr_t mem_off, reg_off, reg_last;
5588 /* Process the page only if MemAttr == Tagged. */
5589 if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
5590 mem_off = info->mem_off_first[0];
5591 reg_off = info->reg_off_first[0];
5592 reg_last = info->reg_off_split;
5593 if (reg_last < 0) {
5594 reg_last = info->reg_off_last[0];
5597 do {
5598 uint64_t pg = vg[reg_off >> 6];
5599 do {
5600 if ((pg >> (reg_off & 63)) & 1) {
5601 mte_check(env, mtedesc, addr, ra);
5603 reg_off += esize;
5604 mem_off += msize;
5605 } while (reg_off <= reg_last && (reg_off & 63));
5606 } while (reg_off <= reg_last);
5609 mem_off = info->mem_off_first[1];
5610 if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
5611 reg_off = info->reg_off_first[1];
5612 reg_last = info->reg_off_last[1];
5614 do {
5615 uint64_t pg = vg[reg_off >> 6];
5616 do {
5617 if ((pg >> (reg_off & 63)) & 1) {
5618 mte_check(env, mtedesc, addr, ra);
5620 reg_off += esize;
5621 mem_off += msize;
5622 } while (reg_off & 63);
5623 } while (reg_off <= reg_last);
5628 * Common helper for all contiguous 1,2,3,4-register predicated loads.
5630 static inline QEMU_ALWAYS_INLINE
5631 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5632 uint32_t desc, const uintptr_t retaddr,
5633 const int esz, const int msz, const int N, uint32_t mtedesc,
5634 sve_ldst1_host_fn *host_fn,
5635 sve_ldst1_tlb_fn *tlb_fn)
5637 const unsigned rd = simd_data(desc);
5638 const intptr_t reg_max = simd_oprsz(desc);
5639 intptr_t reg_off, reg_last, mem_off;
5640 SVEContLdSt info;
5641 void *host;
5642 int flags, i;
5644 /* Find the active elements. */
5645 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5646 /* The entire predicate was false; no load occurs. */
5647 for (i = 0; i < N; ++i) {
5648 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5650 return;
5653 /* Probe the page(s). Exit with exception for any invalid page. */
5654 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5656 /* Handle watchpoints for all active elements. */
5657 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5658 BP_MEM_READ, retaddr);
5661 * Handle mte checks for all active elements.
5662 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5664 if (mtedesc) {
5665 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5666 mtedesc, retaddr);
5669 flags = info.page[0].flags | info.page[1].flags;
5670 if (unlikely(flags != 0)) {
5671 #ifdef CONFIG_USER_ONLY
5672 g_assert_not_reached();
5673 #else
5675 * At least one page includes MMIO.
5676 * Any bus operation can fail with cpu_transaction_failed,
5677 * which for ARM will raise SyncExternal. Perform the load
5678 * into scratch memory to preserve register state until the end.
5680 ARMVectorReg scratch[4] = { };
5682 mem_off = info.mem_off_first[0];
5683 reg_off = info.reg_off_first[0];
5684 reg_last = info.reg_off_last[1];
5685 if (reg_last < 0) {
5686 reg_last = info.reg_off_split;
5687 if (reg_last < 0) {
5688 reg_last = info.reg_off_last[0];
5692 do {
5693 uint64_t pg = vg[reg_off >> 6];
5694 do {
5695 if ((pg >> (reg_off & 63)) & 1) {
5696 for (i = 0; i < N; ++i) {
5697 tlb_fn(env, &scratch[i], reg_off,
5698 addr + mem_off + (i << msz), retaddr);
5701 reg_off += 1 << esz;
5702 mem_off += N << msz;
5703 } while (reg_off & 63);
5704 } while (reg_off <= reg_last);
5706 for (i = 0; i < N; ++i) {
5707 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5709 return;
5710 #endif
5713 /* The entire operation is in RAM, on valid pages. */
5715 for (i = 0; i < N; ++i) {
5716 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5719 mem_off = info.mem_off_first[0];
5720 reg_off = info.reg_off_first[0];
5721 reg_last = info.reg_off_last[0];
5722 host = info.page[0].host;
5724 while (reg_off <= reg_last) {
5725 uint64_t pg = vg[reg_off >> 6];
5726 do {
5727 if ((pg >> (reg_off & 63)) & 1) {
5728 for (i = 0; i < N; ++i) {
5729 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5730 host + mem_off + (i << msz));
5733 reg_off += 1 << esz;
5734 mem_off += N << msz;
5735 } while (reg_off <= reg_last && (reg_off & 63));
5739 * Use the slow path to manage the cross-page misalignment.
5740 * But we know this is RAM and cannot trap.
5742 mem_off = info.mem_off_split;
5743 if (unlikely(mem_off >= 0)) {
5744 reg_off = info.reg_off_split;
5745 for (i = 0; i < N; ++i) {
5746 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5747 addr + mem_off + (i << msz), retaddr);
5751 mem_off = info.mem_off_first[1];
5752 if (unlikely(mem_off >= 0)) {
5753 reg_off = info.reg_off_first[1];
5754 reg_last = info.reg_off_last[1];
5755 host = info.page[1].host;
5757 do {
5758 uint64_t pg = vg[reg_off >> 6];
5759 do {
5760 if ((pg >> (reg_off & 63)) & 1) {
5761 for (i = 0; i < N; ++i) {
5762 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5763 host + mem_off + (i << msz));
5766 reg_off += 1 << esz;
5767 mem_off += N << msz;
5768 } while (reg_off & 63);
5769 } while (reg_off <= reg_last);
5773 static inline QEMU_ALWAYS_INLINE
5774 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5775 uint32_t desc, const uintptr_t ra,
5776 const int esz, const int msz, const int N,
5777 sve_ldst1_host_fn *host_fn,
5778 sve_ldst1_tlb_fn *tlb_fn)
5780 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5781 int bit55 = extract64(addr, 55, 1);
5783 /* Remove mtedesc from the normal sve descriptor. */
5784 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5786 /* Perform gross MTE suppression early. */
5787 if (!tbi_check(desc, bit55) ||
5788 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5789 mtedesc = 0;
5792 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
5795 #define DO_LD1_1(NAME, ESZ) \
5796 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
5797 target_ulong addr, uint32_t desc) \
5799 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
5800 sve_##NAME##_host, sve_##NAME##_tlb); \
5802 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
5803 target_ulong addr, uint32_t desc) \
5805 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
5806 sve_##NAME##_host, sve_##NAME##_tlb); \
5809 #define DO_LD1_2(NAME, ESZ, MSZ) \
5810 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
5811 target_ulong addr, uint32_t desc) \
5813 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
5814 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
5816 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
5817 target_ulong addr, uint32_t desc) \
5819 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
5820 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
5822 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
5823 target_ulong addr, uint32_t desc) \
5825 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
5826 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
5828 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
5829 target_ulong addr, uint32_t desc) \
5831 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
5832 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
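/*
 * The helper names instantiated below encode memory size, element size and
 * extension: e.g. ld1bhu loads bytes and zero-extends them into halfword
 * elements, ld1bss loads bytes and sign-extends into word elements, ld1hdu
 * loads halfwords zero-extended into doubleword elements, and so on; ESZ
 * is the in-register element size and MSZ the size of each memory access.
 */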
5835 DO_LD1_1(ld1bb, MO_8)
5836 DO_LD1_1(ld1bhu, MO_16)
5837 DO_LD1_1(ld1bhs, MO_16)
5838 DO_LD1_1(ld1bsu, MO_32)
5839 DO_LD1_1(ld1bss, MO_32)
5840 DO_LD1_1(ld1bdu, MO_64)
5841 DO_LD1_1(ld1bds, MO_64)
5843 DO_LD1_2(ld1hh, MO_16, MO_16)
5844 DO_LD1_2(ld1hsu, MO_32, MO_16)
5845 DO_LD1_2(ld1hss, MO_32, MO_16)
5846 DO_LD1_2(ld1hdu, MO_64, MO_16)
5847 DO_LD1_2(ld1hds, MO_64, MO_16)
5849 DO_LD1_2(ld1ss, MO_32, MO_32)
5850 DO_LD1_2(ld1sdu, MO_64, MO_32)
5851 DO_LD1_2(ld1sds, MO_64, MO_32)
5853 DO_LD1_2(ld1dd, MO_64, MO_64)
5855 #undef DO_LD1_1
5856 #undef DO_LD1_2
5858 #define DO_LDN_1(N) \
5859 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
5860 target_ulong addr, uint32_t desc) \
5862 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
5863 sve_ld1bb_host, sve_ld1bb_tlb); \
5865 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
5866 target_ulong addr, uint32_t desc) \
5868 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
5869 sve_ld1bb_host, sve_ld1bb_tlb); \
5872 #define DO_LDN_2(N, SUFF, ESZ) \
5873 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
5874 target_ulong addr, uint32_t desc) \
5876 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
5877 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
5879 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
5880 target_ulong addr, uint32_t desc) \
5882 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
5883 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
5885 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
5886 target_ulong addr, uint32_t desc) \
5888 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
5889 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
5891 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
5892 target_ulong addr, uint32_t desc) \
5894 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
5895 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
5898 DO_LDN_1(2)
5899 DO_LDN_1(3)
5900 DO_LDN_1(4)
5902 DO_LDN_2(2, hh, MO_16)
5903 DO_LDN_2(3, hh, MO_16)
5904 DO_LDN_2(4, hh, MO_16)
5906 DO_LDN_2(2, ss, MO_32)
5907 DO_LDN_2(3, ss, MO_32)
5908 DO_LDN_2(4, ss, MO_32)
5910 DO_LDN_2(2, dd, MO_64)
5911 DO_LDN_2(3, dd, MO_64)
5912 DO_LDN_2(4, dd, MO_64)
5914 #undef DO_LDN_1
5915 #undef DO_LDN_2
5918 * Load contiguous data, first-fault and no-fault.
5920 * For user-only, one could argue that we should hold the mmap_lock during
5921 * the operation so that there is no race between page_check_range and the
5922 * load operation. However, unmapping pages out from under a running thread
5923 * is extraordinarily unlikely. This theoretical race condition also affects
5924 * linux-user/ in its get_user/put_user macros.
5926 * TODO: Construct some helpers, written in assembly, that interact with
5927 * host_signal_handler to produce memory ops which can properly report errors
5928 * without racing.
5931 /* Fault on byte I. All bits in FFR from I are cleared. The vector
5932 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
5933 * option, which leaves subsequent data unchanged.
5935 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
5937 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
5939 if (i & 63) {
5940 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
5941 i = ROUND_UP(i, 64);
5943 for (; i < oprsz; i += 64) {
5944 ffr[i / 64] = 0;
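/*
 * For example, a fault at byte offset i == 72 with oprsz == 256 keeps only
 * bits 0..7 of ffr[1] (72 & 63 == 8), then clears ffr[2] and ffr[3]
 * outright; predicate bits below offset 72 are left untouched.
 */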
5949 * Common helper for all contiguous no-fault and first-fault loads.
5951 static inline QEMU_ALWAYS_INLINE
5952 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
5953 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
5954 const int esz, const int msz, const SVEContFault fault,
5955 sve_ldst1_host_fn *host_fn,
5956 sve_ldst1_tlb_fn *tlb_fn)
5958 const unsigned rd = simd_data(desc);
5959 void *vd = &env->vfp.zregs[rd];
5960 const intptr_t reg_max = simd_oprsz(desc);
5961 intptr_t reg_off, mem_off, reg_last;
5962 SVEContLdSt info;
5963 int flags;
5964 void *host;
5966 /* Find the active elements. */
5967 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
5968 /* The entire predicate was false; no load occurs. */
5969 memset(vd, 0, reg_max);
5970 return;
5972 reg_off = info.reg_off_first[0];
5974 /* Probe the page(s). */
5975 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
5976 /* Fault on first element. */
5977 tcg_debug_assert(fault == FAULT_NO);
5978 memset(vd, 0, reg_max);
5979 goto do_fault;
5982 mem_off = info.mem_off_first[0];
5983 flags = info.page[0].flags;
5986 * Disable MTE checking if the Tagged bit is not set. Since TBI must
5987 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
5989 if (!arm_tlb_mte_tagged(&info.page[0].attrs)) {
5990 mtedesc = 0;
5993 if (fault == FAULT_FIRST) {
5994 /* Trapping mte check for the first-fault element. */
5995 if (mtedesc) {
5996 mte_check(env, mtedesc, addr + mem_off, retaddr);
6000 * Special handling of the first active element,
6001 * if it crosses a page boundary or is MMIO.
6003 bool is_split = mem_off == info.mem_off_split;
6004 if (unlikely(flags != 0) || unlikely(is_split)) {
6006 * Use the slow path for cross-page handling.
6007 * Might trap for MMIO or watchpoints.
6009 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6011 /* After any fault, zero the other elements. */
6012 swap_memzero(vd, reg_off);
6013 reg_off += 1 << esz;
6014 mem_off += 1 << msz;
6015 swap_memzero(vd + reg_off, reg_max - reg_off);
6017 if (is_split) {
6018 goto second_page;
6020 } else {
6021 memset(vd, 0, reg_max);
6023 } else {
6024 memset(vd, 0, reg_max);
6025 if (unlikely(mem_off == info.mem_off_split)) {
6026 /* The first active element crosses a page boundary. */
6027 flags |= info.page[1].flags;
6028 if (unlikely(flags & TLB_MMIO)) {
6029 /* Some page is MMIO, see below. */
6030 goto do_fault;
6032 if (unlikely(flags & TLB_WATCHPOINT) &&
6033 (cpu_watchpoint_address_matches
6034 (env_cpu(env), addr + mem_off, 1 << msz)
6035 & BP_MEM_READ)) {
6036 /* Watchpoint hit, see below. */
6037 goto do_fault;
6039 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6040 goto do_fault;
6043 * Use the slow path for cross-page handling.
6044 * This is RAM, without a watchpoint, and will not trap.
6046 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6047 goto second_page;
6052 * From this point on, all memory operations are MemSingleNF.
6054 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6055 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6057 * Unfortunately we do not have access to the memory attributes from the
6058 * PTE to tell Device memory from Normal memory. So we make a mostly
6059 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6060 * This gives the right answer for the common cases of "Normal memory,
6061 * backed by host RAM" and "Device memory, backed by MMIO".
6062 * The architecture allows us to suppress an NF load and return
6063 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6064 * case of "Normal memory, backed by MMIO" is permitted. The case we
6065 * get wrong is "Device memory, backed by host RAM", for which we
6066 * should return (UNKNOWN, FAULT) but do not.
6068 * Similarly, CPU_BP breakpoints would raise exceptions, and so
6069 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
6070 * architectural breakpoints the same.
6072 if (unlikely(flags & TLB_MMIO)) {
6073 goto do_fault;
6076 reg_last = info.reg_off_last[0];
6077 host = info.page[0].host;
6079 do {
6080 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6081 do {
6082 if ((pg >> (reg_off & 63)) & 1) {
6083 if (unlikely(flags & TLB_WATCHPOINT) &&
6084 (cpu_watchpoint_address_matches
6085 (env_cpu(env), addr + mem_off, 1 << msz)
6086 & BP_MEM_READ)) {
6087 goto do_fault;
6089 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6090 goto do_fault;
6092 host_fn(vd, reg_off, host + mem_off);
6094 reg_off += 1 << esz;
6095 mem_off += 1 << msz;
6096 } while (reg_off <= reg_last && (reg_off & 63));
6097 } while (reg_off <= reg_last);
6100 * MemSingleNF is allowed to fail for any reason. We have special
6101 * code above to handle the first element crossing a page boundary.
6102 * As an implementation choice, decline to handle a cross-page element
6103 * in any other position.
6105 reg_off = info.reg_off_split;
6106 if (reg_off >= 0) {
6107 goto do_fault;
6110 second_page:
6111 reg_off = info.reg_off_first[1];
6112 if (likely(reg_off < 0)) {
6113 /* No active elements on the second page. All done. */
6114 return;
6118 * MemSingleNF is allowed to fail for any reason. As an implementation
6119 * choice, decline to handle elements on the second page. This should
6120 * be low frequency as the guest walks through memory -- the next
6121 * iteration of the guest's loop should be aligned on the page boundary,
6122 * and then all following iterations will stay aligned.
6125 do_fault:
6126 record_fault(env, reg_off, reg_max);
6129 static inline QEMU_ALWAYS_INLINE
6130 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6131 uint32_t desc, const uintptr_t retaddr,
6132 const int esz, const int msz, const SVEContFault fault,
6133 sve_ldst1_host_fn *host_fn,
6134 sve_ldst1_tlb_fn *tlb_fn)
6136 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6137 int bit55 = extract64(addr, 55, 1);
6139 /* Remove mtedesc from the normal sve descriptor. */
6140 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6142 /* Perform gross MTE suppression early. */
6143 if (!tbi_check(desc, bit55) ||
6144 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6145 mtedesc = 0;
6148 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6149 esz, msz, fault, host_fn, tlb_fn);
6152 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
6153 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
6154 target_ulong addr, uint32_t desc) \
6156 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6157 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6159 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
6160 target_ulong addr, uint32_t desc) \
6162 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6163 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6165 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
6166 target_ulong addr, uint32_t desc) \
6168 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6169 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6171 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
6172 target_ulong addr, uint32_t desc) \
6174 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6175 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6178 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
6179 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
6180 target_ulong addr, uint32_t desc) \
6182 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6183 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6185 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
6186 target_ulong addr, uint32_t desc) \
6188 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6189 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6191 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
6192 target_ulong addr, uint32_t desc) \
6194 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6195 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6197 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
6198 target_ulong addr, uint32_t desc) \
6200 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6201 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6203 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6204 target_ulong addr, uint32_t desc) \
6206 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6207 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6209 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6210 target_ulong addr, uint32_t desc) \
6212 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6213 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6215 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6216 target_ulong addr, uint32_t desc) \
6218 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6219 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6221 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6222 target_ulong addr, uint32_t desc) \
6224 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6225 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6228 DO_LDFF1_LDNF1_1(bb, MO_8)
6229 DO_LDFF1_LDNF1_1(bhu, MO_16)
6230 DO_LDFF1_LDNF1_1(bhs, MO_16)
6231 DO_LDFF1_LDNF1_1(bsu, MO_32)
6232 DO_LDFF1_LDNF1_1(bss, MO_32)
6233 DO_LDFF1_LDNF1_1(bdu, MO_64)
6234 DO_LDFF1_LDNF1_1(bds, MO_64)
6236 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
6237 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6238 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6239 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6240 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6242 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
6243 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6244 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6246 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
6248 #undef DO_LDFF1_LDNF1_1
6249 #undef DO_LDFF1_LDNF1_2
6252 * Common helper for all contiguous 1,2,3,4-register predicated stores.
6255 static inline QEMU_ALWAYS_INLINE
6256 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6257 uint32_t desc, const uintptr_t retaddr,
6258 const int esz, const int msz, const int N, uint32_t mtedesc,
6259 sve_ldst1_host_fn *host_fn,
6260 sve_ldst1_tlb_fn *tlb_fn)
6262 const unsigned rd = simd_data(desc);
6263 const intptr_t reg_max = simd_oprsz(desc);
6264 intptr_t reg_off, reg_last, mem_off;
6265 SVEContLdSt info;
6266 void *host;
6267 int i, flags;
6269 /* Find the active elements. */
6270 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6271 /* The entire predicate was false; no store occurs. */
6272 return;
6275 /* Probe the page(s). Exit with exception for any invalid page. */
6276 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6278 /* Handle watchpoints for all active elements. */
6279 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6280 BP_MEM_WRITE, retaddr);
6283 * Handle mte checks for all active elements.
6284 * Since TBI must be set for MTE, !mtedesc => !mte_active.
6286 if (mtedesc) {
6287 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6288 mtedesc, retaddr);
6291 flags = info.page[0].flags | info.page[1].flags;
6292 if (unlikely(flags != 0)) {
6293 #ifdef CONFIG_USER_ONLY
6294 g_assert_not_reached();
6295 #else
6297 * At least one page includes MMIO.
6298 * Any bus operation can fail with cpu_transaction_failed,
6299 * which for ARM will raise SyncExternal. We cannot avoid
6300 * this fault and will leave with the store incomplete.
6302 mem_off = info.mem_off_first[0];
6303 reg_off = info.reg_off_first[0];
6304 reg_last = info.reg_off_last[1];
6305 if (reg_last < 0) {
6306 reg_last = info.reg_off_split;
6307 if (reg_last < 0) {
6308 reg_last = info.reg_off_last[0];
6312 do {
6313 uint64_t pg = vg[reg_off >> 6];
6314 do {
6315 if ((pg >> (reg_off & 63)) & 1) {
6316 for (i = 0; i < N; ++i) {
6317 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6318 addr + mem_off + (i << msz), retaddr);
6321 reg_off += 1 << esz;
6322 mem_off += N << msz;
6323 } while (reg_off & 63);
6324 } while (reg_off <= reg_last);
6325 return;
6326 #endif
6329 mem_off = info.mem_off_first[0];
6330 reg_off = info.reg_off_first[0];
6331 reg_last = info.reg_off_last[0];
6332 host = info.page[0].host;
6334 while (reg_off <= reg_last) {
6335 uint64_t pg = vg[reg_off >> 6];
6336 do {
6337 if ((pg >> (reg_off & 63)) & 1) {
6338 for (i = 0; i < N; ++i) {
6339 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6340 host + mem_off + (i << msz));
6343 reg_off += 1 << esz;
6344 mem_off += N << msz;
6345 } while (reg_off <= reg_last && (reg_off & 63));
6349 * Use the slow path to manage the cross-page misalignment.
6350 * But we know this is RAM and cannot trap.
6352 mem_off = info.mem_off_split;
6353 if (unlikely(mem_off >= 0)) {
6354 reg_off = info.reg_off_split;
6355 for (i = 0; i < N; ++i) {
6356 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6357 addr + mem_off + (i << msz), retaddr);
6361 mem_off = info.mem_off_first[1];
6362 if (unlikely(mem_off >= 0)) {
6363 reg_off = info.reg_off_first[1];
6364 reg_last = info.reg_off_last[1];
6365 host = info.page[1].host;
6367 do {
6368 uint64_t pg = vg[reg_off >> 6];
6369 do {
6370 if ((pg >> (reg_off & 63)) & 1) {
6371 for (i = 0; i < N; ++i) {
6372 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6373 host + mem_off + (i << msz));
6376 reg_off += 1 << esz;
6377 mem_off += N << msz;
6378 } while (reg_off & 63);
6379 } while (reg_off <= reg_last);
6383 static inline QEMU_ALWAYS_INLINE
6384 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6385 uint32_t desc, const uintptr_t ra,
6386 const int esz, const int msz, const int N,
6387 sve_ldst1_host_fn *host_fn,
6388 sve_ldst1_tlb_fn *tlb_fn)
6390 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6391 int bit55 = extract64(addr, 55, 1);
6393 /* Remove mtedesc from the normal sve descriptor. */
6394 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6396 /* Perform gross MTE suppression early. */
6397 if (!tbi_check(desc, bit55) ||
6398 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6399 mtedesc = 0;
6402 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6405 #define DO_STN_1(N, NAME, ESZ) \
6406 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
6407 target_ulong addr, uint32_t desc) \
6409 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
6410 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6412 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
6413 target_ulong addr, uint32_t desc) \
6415 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
6416 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6419 #define DO_STN_2(N, NAME, ESZ, MSZ) \
6420 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
6421 target_ulong addr, uint32_t desc) \
6423 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6424 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6426 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
6427 target_ulong addr, uint32_t desc) \
6429 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6430 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6432 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6433 target_ulong addr, uint32_t desc) \
6435 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6436 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6438 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6439 target_ulong addr, uint32_t desc) \
6441 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6442 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6445 DO_STN_1(1, bb, MO_8)
6446 DO_STN_1(1, bh, MO_16)
6447 DO_STN_1(1, bs, MO_32)
6448 DO_STN_1(1, bd, MO_64)
6449 DO_STN_1(2, bb, MO_8)
6450 DO_STN_1(3, bb, MO_8)
6451 DO_STN_1(4, bb, MO_8)
6453 DO_STN_2(1, hh, MO_16, MO_16)
6454 DO_STN_2(1, hs, MO_32, MO_16)
6455 DO_STN_2(1, hd, MO_64, MO_16)
6456 DO_STN_2(2, hh, MO_16, MO_16)
6457 DO_STN_2(3, hh, MO_16, MO_16)
6458 DO_STN_2(4, hh, MO_16, MO_16)
6460 DO_STN_2(1, ss, MO_32, MO_32)
6461 DO_STN_2(1, sd, MO_64, MO_32)
6462 DO_STN_2(2, ss, MO_32, MO_32)
6463 DO_STN_2(3, ss, MO_32, MO_32)
6464 DO_STN_2(4, ss, MO_32, MO_32)
6466 DO_STN_2(1, dd, MO_64, MO_64)
6467 DO_STN_2(2, dd, MO_64, MO_64)
6468 DO_STN_2(3, dd, MO_64, MO_64)
6469 DO_STN_2(4, dd, MO_64, MO_64)
6471 #undef DO_STN_1
6472 #undef DO_STN_2
6475 * Loads with a vector index.
6479 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6481 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6483 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6485 return *(uint32_t *)(reg + H1_4(reg_ofs));
6488 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6490 return *(int32_t *)(reg + H1_4(reg_ofs));
6493 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6495 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6498 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6500 return (int32_t)*(uint64_t *)(reg + reg_ofs);
6503 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6505 return *(uint64_t *)(reg + reg_ofs);
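/*
 * The off_* helpers decode the index vector: zsu_* treat each offset as an
 * unsigned (zero-extended) 32-bit value, zss_* as a signed 32-bit value,
 * and zd_d uses the full 64-bit element; the _s variants index a vector of
 * 32-bit elements (hence H1_4), the _d variants a vector of 64-bit elements.
 */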
6508 static inline QEMU_ALWAYS_INLINE
6509 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6510 target_ulong base, uint32_t desc, uintptr_t retaddr,
6511 uint32_t mtedesc, int esize, int msize,
6512 zreg_off_fn *off_fn,
6513 sve_ldst1_host_fn *host_fn,
6514 sve_ldst1_tlb_fn *tlb_fn)
6516 const int mmu_idx = cpu_mmu_index(env, false);
6517 const intptr_t reg_max = simd_oprsz(desc);
6518 const int scale = simd_data(desc);
6519 ARMVectorReg scratch;
6520 intptr_t reg_off;
6521 SVEHostPage info, info2;
6523 memset(&scratch, 0, reg_max);
6524 reg_off = 0;
6525 do {
6526 uint64_t pg = vg[reg_off >> 6];
6527 do {
6528 if (likely(pg & 1)) {
6529 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6530 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6532 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6533 mmu_idx, retaddr);
6535 if (likely(in_page >= msize)) {
6536 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6537 cpu_check_watchpoint(env_cpu(env), addr, msize,
6538 info.attrs, BP_MEM_READ, retaddr);
6540 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
6541 mte_check(env, mtedesc, addr, retaddr);
6543 if (unlikely(info.flags & TLB_MMIO)) {
6544 tlb_fn(env, &scratch, reg_off, addr, retaddr);
6545 } else {
6546 host_fn(&scratch, reg_off, info.host);
6548 } else {
6549 /* Element crosses the page boundary. */
6550 sve_probe_page(&info2, false, env, addr + in_page, 0,
6551 MMU_DATA_LOAD, mmu_idx, retaddr);
6552 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6553 cpu_check_watchpoint(env_cpu(env), addr,
6554 msize, info.attrs,
6555 BP_MEM_READ, retaddr);
6557 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
6558 mte_check(env, mtedesc, addr, retaddr);
6560 tlb_fn(env, &scratch, reg_off, addr, retaddr);
6563 reg_off += esize;
6564 pg >>= esize;
6565 } while (reg_off & 63);
6566 } while (reg_off < reg_max);
6568 /* Wait until all exceptions have been raised to write back. */
6569 memcpy(vd, &scratch, reg_max);
6572 static inline QEMU_ALWAYS_INLINE
6573 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6574 target_ulong base, uint32_t desc, uintptr_t retaddr,
6575 int esize, int msize, zreg_off_fn *off_fn,
6576 sve_ldst1_host_fn *host_fn,
6577 sve_ldst1_tlb_fn *tlb_fn)
6579 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6580 /* Remove mtedesc from the normal sve descriptor. */
6581 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6584 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6585 * offset base entirely over the address space hole to change the
6586 * pointer tag, or change the bit55 selector. So we could here
6587 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6589 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6590 esize, msize, off_fn, host_fn, tlb_fn);
6593 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6594 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6595 void *vm, target_ulong base, uint32_t desc) \
6597 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
6598 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6600 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6601 void *vm, target_ulong base, uint32_t desc) \
6603 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
6604 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6607 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6608 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6609 void *vm, target_ulong base, uint32_t desc) \
6611 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
6612 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6614 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6615 void *vm, target_ulong base, uint32_t desc) \
6617 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
6618 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6621 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6622 DO_LD1_ZPZ_S(bsu, zss, MO_8)
6623 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6624 DO_LD1_ZPZ_D(bdu, zss, MO_8)
6625 DO_LD1_ZPZ_D(bdu, zd, MO_8)
6627 DO_LD1_ZPZ_S(bss, zsu, MO_8)
6628 DO_LD1_ZPZ_S(bss, zss, MO_8)
6629 DO_LD1_ZPZ_D(bds, zsu, MO_8)
6630 DO_LD1_ZPZ_D(bds, zss, MO_8)
6631 DO_LD1_ZPZ_D(bds, zd, MO_8)
6633 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6634 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6635 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6636 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6637 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6639 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6640 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6641 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6642 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6643 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6645 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6646 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6647 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6648 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6649 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6651 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6652 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6653 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6654 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6655 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6657 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6658 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6659 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6660 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6661 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6663 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6664 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6665 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6666 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6667 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6669 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6670 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6671 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6673 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6674 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6675 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6677 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6678 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6679 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6681 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6682 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6683 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6685 #undef DO_LD1_ZPZ_S
6686 #undef DO_LD1_ZPZ_D
6688 /* First fault loads with a vector index. */
6691 * Common helpers for all gather first-faulting loads.
6694 static inline QEMU_ALWAYS_INLINE
6695 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6696 target_ulong base, uint32_t desc, uintptr_t retaddr,
6697 uint32_t mtedesc, const int esz, const int msz,
6698 zreg_off_fn *off_fn,
6699 sve_ldst1_host_fn *host_fn,
6700 sve_ldst1_tlb_fn *tlb_fn)
6702 const int mmu_idx = cpu_mmu_index(env, false);
6703 const intptr_t reg_max = simd_oprsz(desc);
6704 const int scale = simd_data(desc);
6705 const int esize = 1 << esz;
6706 const int msize = 1 << msz;
6707 intptr_t reg_off;
6708 SVEHostPage info;
6709 target_ulong addr, in_page;
6711 /* Skip to the first true predicate. */
6712 reg_off = find_next_active(vg, 0, reg_max, esz);
6713 if (unlikely(reg_off >= reg_max)) {
6714 /* The entire predicate was false; no load occurs. */
6715 memset(vd, 0, reg_max);
6716 return;
6720 * Probe the first element, allowing faults.
6722 addr = base + (off_fn(vm, reg_off) << scale);
6723 if (mtedesc) {
6724 mte_check(env, mtedesc, addr, retaddr);
6726 tlb_fn(env, vd, reg_off, addr, retaddr);
6728 /* After any fault, zero the other elements. */
6729 swap_memzero(vd, reg_off);
6730 reg_off += esize;
6731 swap_memzero(vd + reg_off, reg_max - reg_off);
6734 * Probe the remaining elements, not allowing faults.
6736 while (reg_off < reg_max) {
6737 uint64_t pg = vg[reg_off >> 6];
6738 do {
6739 if (likely((pg >> (reg_off & 63)) & 1)) {
6740 addr = base + (off_fn(vm, reg_off) << scale);
6741 in_page = -(addr | TARGET_PAGE_MASK);
6743 if (unlikely(in_page < msize)) {
6744 /* Stop if the element crosses a page boundary. */
6745 goto fault;
6748 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6749 mmu_idx, retaddr);
6750 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6751 goto fault;
6753 if (unlikely(info.flags & TLB_WATCHPOINT) &&
6754 (cpu_watchpoint_address_matches
6755 (env_cpu(env), addr, msize) & BP_MEM_READ)) {
6756 goto fault;
6758 if (mtedesc &&
6759 arm_tlb_mte_tagged(&info.attrs) &&
6760 !mte_probe(env, mtedesc, addr)) {
6761 goto fault;
6764 host_fn(vd, reg_off, info.host);
6766 reg_off += esize;
6767 } while (reg_off & 63);
6769 return;
6771 fault:
6772 record_fault(env, reg_off, reg_max);
6775 static inline QEMU_ALWAYS_INLINE
6776 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6777 target_ulong base, uint32_t desc, uintptr_t retaddr,
6778 const int esz, const int msz,
6779 zreg_off_fn *off_fn,
6780 sve_ldst1_host_fn *host_fn,
6781 sve_ldst1_tlb_fn *tlb_fn)
6783 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6784 /* Remove mtedesc from the normal sve descriptor. */
6785 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6788 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6789 * offset base entirely over the address space hole to change the
6790 * pointer tag, or change the bit55 selector. So we could here
6791 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6793 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6794 esz, msz, off_fn, host_fn, tlb_fn);
6797 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
6798 void HELPER(sve_ldff##MEM##_##OFS) \
6799 (CPUARMState *env, void *vd, void *vg, \
6800 void *vm, target_ulong base, uint32_t desc) \
6802 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
6803 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6805 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6806 (CPUARMState *env, void *vd, void *vg, \
6807 void *vm, target_ulong base, uint32_t desc) \
6809 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
6810 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6813 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
6814 void HELPER(sve_ldff##MEM##_##OFS) \
6815 (CPUARMState *env, void *vd, void *vg, \
6816 void *vm, target_ulong base, uint32_t desc) \
6818 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
6819 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6821 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6822 (CPUARMState *env, void *vd, void *vg, \
6823 void *vm, target_ulong base, uint32_t desc) \
6825 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
6826 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6829 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
6830 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
6831 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
6832 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
6833 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
6835 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
6836 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
6837 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
6838 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
6839 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
6841 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
6842 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
6843 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
6844 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
6845 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
6847 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
6848 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
6849 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
6850 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
6851 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
6853 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
6854 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
6855 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
6856 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
6857 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
6859 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
6860 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
6861 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
6862 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
6863 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
6865 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
6866 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
6867 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
6868 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
6869 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
6871 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
6872 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
6873 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
6874 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
6875 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
6877 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
6878 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
6879 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
6881 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
6882 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
6883 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
6885 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
6886 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
6887 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
6889 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
6890 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
6891 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
6893 /* Stores with a vector index. */
6895 static inline QEMU_ALWAYS_INLINE
6896 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6897 target_ulong base, uint32_t desc, uintptr_t retaddr,
6898 uint32_t mtedesc, int esize, int msize,
6899 zreg_off_fn *off_fn,
6900 sve_ldst1_host_fn *host_fn,
6901 sve_ldst1_tlb_fn *tlb_fn)
6903 const int mmu_idx = cpu_mmu_index(env, false);
6904 const intptr_t reg_max = simd_oprsz(desc);
6905 const int scale = simd_data(desc);
6906 void *host[ARM_MAX_VQ * 4];
6907 intptr_t reg_off, i;
6908 SVEHostPage info, info2;
6911 * Probe all of the elements for host addresses and flags.
6913 i = reg_off = 0;
6914 do {
6915 uint64_t pg = vg[reg_off >> 6];
6916 do {
6917 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6918 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6920 host[i] = NULL;
6921 if (likely((pg >> (reg_off & 63)) & 1)) {
6922 if (likely(in_page >= msize)) {
6923 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
6924 mmu_idx, retaddr);
6925 if (!(info.flags & TLB_MMIO)) {
6926 host[i] = info.host;
6928 } else {
6930 * Element crosses the page boundary.
6931 * Probe both pages, but do not record the host address,
6932 * so that we use the slow path.
6934 sve_probe_page(&info, false, env, addr, 0,
6935 MMU_DATA_STORE, mmu_idx, retaddr);
6936 sve_probe_page(&info2, false, env, addr + in_page, 0,
6937 MMU_DATA_STORE, mmu_idx, retaddr);
6938 info.flags |= info2.flags;
6941 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6942 cpu_check_watchpoint(env_cpu(env), addr, msize,
6943 info.attrs, BP_MEM_WRITE, retaddr);
6946 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
6947 mte_check(env, mtedesc, addr, retaddr);
6950 i += 1;
6951 reg_off += esize;
6952 } while (reg_off & 63);
6953 } while (reg_off < reg_max);
6956 * Now that we have recognized all exceptions except SyncExternal
6957 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
6959 * Note for the common case of an element in RAM, not crossing a page
6960 * boundary, we have stored the host address in host[]. This doubles
6961 * as a first-level check against the predicate, since only enabled
6962 * elements have non-null host addresses.
6964 i = reg_off = 0;
6965 do {
6966 void *h = host[i];
6967 if (likely(h != NULL)) {
6968 host_fn(vd, reg_off, h);
6969 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
6970 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6971 tlb_fn(env, vd, reg_off, addr, retaddr);
6973 i += 1;
6974 reg_off += esize;
6975 } while (reg_off < reg_max);
6978 static inline QEMU_ALWAYS_INLINE
6979 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6980 target_ulong base, uint32_t desc, uintptr_t retaddr,
6981 int esize, int msize, zreg_off_fn *off_fn,
6982 sve_ldst1_host_fn *host_fn,
6983 sve_ldst1_tlb_fn *tlb_fn)
6985 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6986 /* Remove mtedesc from the normal sve descriptor. */
6987 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6990 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6991 * offset base entirely over the address space hole to change the
6992 * pointer tag, or change the bit55 selector. So we could here
6993 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6995 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6996 esize, msize, off_fn, host_fn, tlb_fn);
6999 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
7000 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7001 void *vm, target_ulong base, uint32_t desc) \
7003 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
7004 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7006 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7007 void *vm, target_ulong base, uint32_t desc) \
7009 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
7010 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7013 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
7014 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7015 void *vm, target_ulong base, uint32_t desc) \
7017 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
7018 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7020 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7021 void *vm, target_ulong base, uint32_t desc) \
7023 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
7024 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7027 DO_ST1_ZPZ_S(bs, zsu, MO_8)
7028 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7029 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7030 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7031 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7033 DO_ST1_ZPZ_S(bs, zss, MO_8)
7034 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7035 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7036 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7037 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7039 DO_ST1_ZPZ_D(bd, zsu, MO_8)
7040 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7041 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7042 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7043 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7044 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7045 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7047 DO_ST1_ZPZ_D(bd, zss, MO_8)
7048 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7049 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7050 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7051 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7052 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7053 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7055 DO_ST1_ZPZ_D(bd, zd, MO_8)
7056 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7057 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7058 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7059 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7060 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7061 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7063 #undef DO_ST1_ZPZ_S
7064 #undef DO_ST1_ZPZ_D
7066 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7068 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7069 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7071 for (i = 0; i < opr_sz; ++i) {
7072 d[i] = n[i] ^ m[i] ^ k[i];
7076 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7078 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7079 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7081 for (i = 0; i < opr_sz; ++i) {
7082 d[i] = n[i] ^ (m[i] & ~k[i]);
7086 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7088 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7089 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7091 for (i = 0; i < opr_sz; ++i) {
7092 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7096 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7098 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7099 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7101 for (i = 0; i < opr_sz; ++i) {
7102 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7106 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7108 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7109 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7111 for (i = 0; i < opr_sz; ++i) {
7112 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
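/*
 * eor3/bcax and the bsl1n/bsl2n/nbsl helpers above implement the SVE2
 * three-operand bitwise ops: the selects take bits from the first source
 * where k is set and from the second where it is clear, with the 1n/2n
 * variants inverting one input and nbsl inverting the result.
 */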
7117 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
7118 * See hasless(v,1) from
7119 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
7121 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
7123 int bits = 8 << esz;
7124 uint64_t ones = dup_const(esz, 1);
7125 uint64_t signs = ones << (bits - 1);
7126 uint64_t cmp0, cmp1;
7128 cmp1 = dup_const(esz, n);
7129 cmp0 = cmp1 ^ m0;
7130 cmp1 = cmp1 ^ m1;
7131 cmp0 = (cmp0 - ones) & ~cmp0;
7132 cmp1 = (cmp1 - ones) & ~cmp1;
7133 return (cmp0 | cmp1) & signs;
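/*
 * For example, with esz == MO_8 and n == 0x42, any 0x42 byte in m0 turns
 * the corresponding byte of cmp0 into 0x00; subtracting the per-byte 0x01
 * then borrows to 0xff, and since ~cmp0 also has that sign bit set the
 * final mask against signs is nonzero.  Bytes that merely have their top
 * bit set cannot produce false positives because ~cmp0 clears them.
 */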
7136 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7137 uint32_t desc, int esz, bool nmatch)
7139 uint16_t esz_mask = pred_esz_masks[esz];
7140 intptr_t opr_sz = simd_oprsz(desc);
7141 uint32_t flags = PREDTEST_INIT;
7142 intptr_t i, j, k;
7144 for (i = 0; i < opr_sz; i += 16) {
7145 uint64_t m0 = *(uint64_t *)(vm + i);
7146 uint64_t m1 = *(uint64_t *)(vm + i + 8);
7147 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7148 uint16_t out = 0;
7150 for (j = 0; j < 16; j += 8) {
7151 uint64_t n = *(uint64_t *)(vn + i + j);
7153 for (k = 0; k < 8; k += 1 << esz) {
7154 if (pg & (1 << (j + k))) {
7155 bool o = do_match2(n >> (k * 8), m0, m1, esz);
7156 out |= (o ^ nmatch) << (j + k);
7160 *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7161 flags = iter_predtest_fwd(out, pg, flags);
7163 return flags;
7166 #define DO_PPZZ_MATCH(NAME, ESZ, INV) \
7167 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
7169 return do_match(vd, vn, vm, vg, desc, ESZ, INV); \
7172 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
7173 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
7175 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
7176 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
7178 #undef DO_PPZZ_MATCH
void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz; i += 4) {
        uint64_t count = 0;
        uint8_t pred;

        pred = pg[H1(i >> 3)] >> (i & 7);
        if (pred & 1) {
            uint32_t nn = n[H4(i >> 2)];

            for (j = 0; j <= i; j += 4) {
                pred = pg[H1(j >> 3)] >> (j & 7);
                if ((pred & 1) && nn == m[H4(j >> 2)]) {
                    ++count;
                }
            }
        }
        d[H4(i >> 2)] = count;
    }
}

void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t count = 0;
        if (pg[H1(i)] & 1) {
            uint64_t nn = n[i];
            for (j = 0; j <= i; ++j) {
                if ((pg[H1(j)] & 1) && nn == m[j]) {
                    ++count;
                }
            }
        }
        d[i] = count;
    }
}

/*
 * Returns the number of bytes in m0 and m1 that match n.
 * Unlike do_match2 we don't just need true/false, we need an exact count.
 * This requires two extra logical operations.
 */
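/*
 * Illustrative example (not in the original source): with n == 0 and
 * m0 == m1 == 0, every byte matches, cmp0 and cmp1 below both become
 * 0x8080808080808080, and ctpop64(cmp0 | (cmp1 >> 1)) returns 16.  Each
 * compare only ever sets bit 7 of a byte, so shifting cmp1 right by one
 * keeps the two bit sets disjoint for the single population count.
 */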
static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
{
    const uint64_t mask = dup_const(MO_8, 0x7f);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(MO_8, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;

    /*
     * 1: clear msb of each byte to avoid carry to next byte (& mask)
     * 2: carry in to msb if byte != 0 (+ mask)
     * 3: set msb if cmp has msb set (| cmp)
     * 4: set ~msb to ignore them (| mask)
     * We now have 0xff for byte != 0 or 0x7f for byte == 0.
     * 5: invert, resulting in 0x80 if and only if byte == 0.
     */
    cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
    cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);

    /*
     * Combine the two compares in a way that the bits do
     * not overlap, and so preserves the count of set bits.
     * If the host has an efficient instruction for ctpop,
     * then ctpop(x) + ctpop(y) has the same number of
     * operations as ctpop(x | (y >> 1)).  If the host does
     * not have an efficient ctpop, then we only want to
     * use it once.
     */
    return ctpop64(cmp0 | (cmp1 >> 1));
}

void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t n0 = *(uint64_t *)(vn + i);
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t n1 = *(uint64_t *)(vn + i + 8);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint64_t out0 = 0;
        uint64_t out1 = 0;

        for (j = 0; j < 64; j += 8) {
            uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
            uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
            out0 |= cnt0 << j;
            out1 |= cnt1 << j;
        }

        *(uint64_t *)(vd + i) = out0;
        *(uint64_t *)(vd + i + 8) = out1;
    }
}

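/*
 * XAR: exclusive-or the two inputs, then rotate each element right by the
 * immediate.  For 8-bit and 16-bit elements the rotate is performed on
 * whole 64-bit words, with a mask keeping bits shifted out of one element
 * from landing in its neighbour.
 */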
void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 8 - shr;
    uint64_t mask = dup_const(MO_8, 0xff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 16 - shr;
    uint64_t mask = dup_const(MO_16, 0xffff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    int shr = simd_data(desc);
    uint32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ror32(n[i] ^ m[i], shr);
    }
}

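/*
 * FMMLA: each 128-bit (float32) or 256-bit (float64) segment holds a
 * 2x2 matrix in row-major order, and the result is D = A + N * M^T for
 * that segment, i.e. d[i][j] = a[i][j] + sum_k n[i][k] * m[j][k].
 */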
void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float32 *n = vn + s * sizeof(float32) * 4;
        float32 *m = vm + s * sizeof(float32) * 4;
        float32 *a = va + s * sizeof(float32) * 4;
        float32 *d = vd + s * sizeof(float32) * 4;
        float32 n00 = n[H4(0)], n01 = n[H4(1)];
        float32 n10 = n[H4(2)], n11 = n[H4(3)];
        float32 m00 = m[H4(0)], m01 = m[H4(1)];
        float32 m10 = m[H4(2)], m11 = m[H4(3)];
        float32 p0, p1;

        /* i = 0, j = 0 */
        p0 = float32_mul(n00, m00, status);
        p1 = float32_mul(n01, m01, status);
        d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float32_mul(n00, m10, status);
        p1 = float32_mul(n01, m11, status);
        d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float32_mul(n10, m00, status);
        p1 = float32_mul(n11, m01, status);
        d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float32_mul(n10, m10, status);
        p1 = float32_mul(n11, m11, status);
        d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
    }
}

void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float64 *n = vn + s * sizeof(float64) * 4;
        float64 *m = vm + s * sizeof(float64) * 4;
        float64 *a = va + s * sizeof(float64) * 4;
        float64 *d = vd + s * sizeof(float64) * 4;
        float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
        float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
        float64 p0, p1;

        /* i = 0, j = 0 */
        p0 = float64_mul(n00, m00, status);
        p1 = float64_mul(n01, m01, status);
        d[0] = float64_add(a[0], float64_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float64_mul(n00, m10, status);
        p1 = float64_mul(n01, m11, status);
        d[1] = float64_add(a[1], float64_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float64_mul(n10, m00, status);
        p1 = float64_mul(n11, m01, status);
        d[2] = float64_add(a[2], float64_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float64_mul(n10, m10, status);
        p1 = float64_mul(n11, m11, status);
        d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
    }
}

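/*
 * Narrowing conversions: each active wide element is converted and the
 * result stored only into the high (odd) narrow half of the destination
 * slot, leaving the low halves and all inactive elements untouched.
 */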
#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
{                                                                             \
    intptr_t i = simd_oprsz(desc);                                            \
    uint64_t *g = vg;                                                         \
    do {                                                                      \
        uint64_t pg = g[(i - 1) >> 6];                                        \
        do {                                                                  \
            i -= sizeof(TYPEW);                                               \
            if (likely((pg >> (i & 63)) & 1)) {                               \
                TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
                *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
            }                                                                 \
        } while (i & 63);                                                     \
    } while (i != 0);                                                         \
}

DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)

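/*
 * Widening conversions: each active element is read from the high (odd)
 * narrow half of the wide slot and the widened result replaces the whole
 * wide destination element; inactive elements are left unchanged.
 */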
#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
{                                                                             \
    intptr_t i = simd_oprsz(desc);                                            \
    uint64_t *g = vg;                                                         \
    do {                                                                      \
        uint64_t pg = g[(i - 1) >> 6];                                        \
        do {                                                                  \
            i -= sizeof(TYPEW);                                               \
            if (likely((pg >> (i & 63)) & 1)) {                               \
                TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
                *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
            }                                                                 \
        } while (i & 63);                                                     \
    } while (i != 0);                                                         \
}

DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)

#undef DO_FCVTLT
#undef DO_FCVTNT