4 * Copyright (c) 2018 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/cpu_ldst.h"
25 #include "exec/helper-proto.h"
26 #include "tcg/tcg-gvec-desc.h"
27 #include "fpu/softfloat.h"
29 #include "vec_internal.h"
/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set.  Compare the definitions of these variables
 * within CPUARMState.
 */

/* For no G bits set, NZCV = C.  */
#define PREDTEST_INIT  1

/* This is an iterative function, called for each Pd and Pg word
 * moving forward.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from last !(D & G).  Replace previous.  */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}
/* This is an iterative function, called for each Pd and Pg word
 * moving backward.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from first (i.e last) !(D & G).
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from last (i.e first) D & G.  Replace previous.  */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}
86 /* The same for a single word predicate. */
87 uint32_t HELPER(sve_predtest1
)(uint64_t d
, uint64_t g
)
89 return iter_predtest_fwd(d
, g
, PREDTEST_INIT
);
92 /* The same for a multi-word predicate. */
93 uint32_t HELPER(sve_predtest
)(void *vd
, void *vg
, uint32_t words
)
95 uint32_t flags
= PREDTEST_INIT
;
96 uint64_t *d
= vd
, *g
= vg
;
100 flags
= iter_predtest_fwd(d
[i
], g
[i
], flags
);
101 } while (++i
< words
);
107 * Expand active predicate bits to bytes, for byte elements.
108 * (The data table itself is in vec_helper.c as MVE also needs it.)
110 static inline uint64_t expand_pred_b(uint8_t byte
)
112 return expand_pred_b_data
[byte
];
/* Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
static inline uint64_t expand_pred_h(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
        [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
        [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
        [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
        [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
        [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
        [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
        [0x55] = 0xffffffffffffffff,
    };
    /* Only the low bit of each 2-bit predicate field is significant.  */
    return word[byte & 0x55];
}
/* Similarly for single word elements.  */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x00000000ffffffffull,
        [0x10] = 0xffffffff00000000ull,
        [0x11] = 0xffffffffffffffffull,
    };
    /* Only the low bit of each 4-bit predicate field is significant.  */
    return word[byte & 0x11];
}
/* Swap 16-bit words within a 32-bit word.  */
static inline uint32_t hswap32(uint32_t h)
{
    /* Rotate by 16 bits; the body was lost in the mangled source.  */
    return (h << 16) | (h >> 16);
}
/* Swap 16-bit words within a 64-bit word.  */
static inline uint64_t hswap64(uint64_t h)
{
    uint64_t m = 0x0000ffff0000ffffull;
    /*
     * First exchange the two 32-bit halves (this rotate-by-32 line was
     * dropped in the mangled source; without it only the halfwords within
     * each 32-bit word are swapped, contradicting the comment above),
     * then swap the halfwords within each half.
     */
    h = (h << 32) | (h >> 32);
    return ((h & m) << 16) | ((h >> 16) & m);
}
/* Swap 32-bit words within a 64-bit word.  */
static inline uint64_t wswap64(uint64_t h)
{
    /* Rotate by 32 bits; the body was lost in the mangled source.  */
    return (h << 32) | (h >> 32);
}
175 #define LOGICAL_PPPP(NAME, FUNC) \
176 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
178 uintptr_t opr_sz = simd_oprsz(desc); \
179 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
181 for (i = 0; i < opr_sz / 8; ++i) { \
182 d[i] = FUNC(n[i], m[i], g[i]); \
186 #define DO_AND(N, M, G) (((N) & (M)) & (G))
187 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
188 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
189 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
190 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
191 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
192 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
193 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
195 LOGICAL_PPPP(sve_and_pppp
, DO_AND
)
196 LOGICAL_PPPP(sve_bic_pppp
, DO_BIC
)
197 LOGICAL_PPPP(sve_eor_pppp
, DO_EOR
)
198 LOGICAL_PPPP(sve_sel_pppp
, DO_SEL
)
199 LOGICAL_PPPP(sve_orr_pppp
, DO_ORR
)
200 LOGICAL_PPPP(sve_orn_pppp
, DO_ORN
)
201 LOGICAL_PPPP(sve_nor_pppp
, DO_NOR
)
202 LOGICAL_PPPP(sve_nand_pppp
, DO_NAND
)
/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 */
/* ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    intptr_t i, opr_sz = simd_oprsz(desc);                               \
    for (i = 0; i < opr_sz; ) {                                          \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                  \
        do {                                                             \
            if (pg & 1) {                                                \
                TYPE nn = *(TYPE *)(vn + H(i));                          \
                TYPE mm = *(TYPE *)(vm + H(i));                          \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                       \
            }                                                            \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                      \
        } while (i & 15);                                                \
    }                                                                    \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_D(NAME, TYPE, OP)                                        \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                           \
    TYPE *d = vd, *n = vn, *m = vm;                                      \
    uint8_t *pg = vg;                                                    \
    for (i = 0; i < opr_sz; i += 1) {                                    \
        if (pg[H1(i)] & 1) {                                             \
            TYPE nn = n[i], mm = m[i];                                   \
            d[i] = OP(nn, mm);                                           \
        }                                                                \
    }                                                                    \
}
/* Element-wise operations used with DO_ZPZZ / DO_ZPZZ_D below.  */
#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)

/*
 * We must avoid the C undefined behaviour cases: division by
 * zero and signed division of INT_MIN by -1. Both of these
 * have architecturally defined required results for Arm.
 * We special case all signed divisions by -1 to avoid having
 * to deduce the minimum integer for the type involved.
 */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
276 DO_ZPZZ(sve_and_zpzz_b
, uint8_t, H1
, DO_AND
)
277 DO_ZPZZ(sve_and_zpzz_h
, uint16_t, H1_2
, DO_AND
)
278 DO_ZPZZ(sve_and_zpzz_s
, uint32_t, H1_4
, DO_AND
)
279 DO_ZPZZ_D(sve_and_zpzz_d
, uint64_t, DO_AND
)
281 DO_ZPZZ(sve_orr_zpzz_b
, uint8_t, H1
, DO_ORR
)
282 DO_ZPZZ(sve_orr_zpzz_h
, uint16_t, H1_2
, DO_ORR
)
283 DO_ZPZZ(sve_orr_zpzz_s
, uint32_t, H1_4
, DO_ORR
)
284 DO_ZPZZ_D(sve_orr_zpzz_d
, uint64_t, DO_ORR
)
286 DO_ZPZZ(sve_eor_zpzz_b
, uint8_t, H1
, DO_EOR
)
287 DO_ZPZZ(sve_eor_zpzz_h
, uint16_t, H1_2
, DO_EOR
)
288 DO_ZPZZ(sve_eor_zpzz_s
, uint32_t, H1_4
, DO_EOR
)
289 DO_ZPZZ_D(sve_eor_zpzz_d
, uint64_t, DO_EOR
)
291 DO_ZPZZ(sve_bic_zpzz_b
, uint8_t, H1
, DO_BIC
)
292 DO_ZPZZ(sve_bic_zpzz_h
, uint16_t, H1_2
, DO_BIC
)
293 DO_ZPZZ(sve_bic_zpzz_s
, uint32_t, H1_4
, DO_BIC
)
294 DO_ZPZZ_D(sve_bic_zpzz_d
, uint64_t, DO_BIC
)
296 DO_ZPZZ(sve_add_zpzz_b
, uint8_t, H1
, DO_ADD
)
297 DO_ZPZZ(sve_add_zpzz_h
, uint16_t, H1_2
, DO_ADD
)
298 DO_ZPZZ(sve_add_zpzz_s
, uint32_t, H1_4
, DO_ADD
)
299 DO_ZPZZ_D(sve_add_zpzz_d
, uint64_t, DO_ADD
)
301 DO_ZPZZ(sve_sub_zpzz_b
, uint8_t, H1
, DO_SUB
)
302 DO_ZPZZ(sve_sub_zpzz_h
, uint16_t, H1_2
, DO_SUB
)
303 DO_ZPZZ(sve_sub_zpzz_s
, uint32_t, H1_4
, DO_SUB
)
304 DO_ZPZZ_D(sve_sub_zpzz_d
, uint64_t, DO_SUB
)
306 DO_ZPZZ(sve_smax_zpzz_b
, int8_t, H1
, DO_MAX
)
307 DO_ZPZZ(sve_smax_zpzz_h
, int16_t, H1_2
, DO_MAX
)
308 DO_ZPZZ(sve_smax_zpzz_s
, int32_t, H1_4
, DO_MAX
)
309 DO_ZPZZ_D(sve_smax_zpzz_d
, int64_t, DO_MAX
)
311 DO_ZPZZ(sve_umax_zpzz_b
, uint8_t, H1
, DO_MAX
)
312 DO_ZPZZ(sve_umax_zpzz_h
, uint16_t, H1_2
, DO_MAX
)
313 DO_ZPZZ(sve_umax_zpzz_s
, uint32_t, H1_4
, DO_MAX
)
314 DO_ZPZZ_D(sve_umax_zpzz_d
, uint64_t, DO_MAX
)
316 DO_ZPZZ(sve_smin_zpzz_b
, int8_t, H1
, DO_MIN
)
317 DO_ZPZZ(sve_smin_zpzz_h
, int16_t, H1_2
, DO_MIN
)
318 DO_ZPZZ(sve_smin_zpzz_s
, int32_t, H1_4
, DO_MIN
)
319 DO_ZPZZ_D(sve_smin_zpzz_d
, int64_t, DO_MIN
)
321 DO_ZPZZ(sve_umin_zpzz_b
, uint8_t, H1
, DO_MIN
)
322 DO_ZPZZ(sve_umin_zpzz_h
, uint16_t, H1_2
, DO_MIN
)
323 DO_ZPZZ(sve_umin_zpzz_s
, uint32_t, H1_4
, DO_MIN
)
324 DO_ZPZZ_D(sve_umin_zpzz_d
, uint64_t, DO_MIN
)
326 DO_ZPZZ(sve_sabd_zpzz_b
, int8_t, H1
, DO_ABD
)
327 DO_ZPZZ(sve_sabd_zpzz_h
, int16_t, H1_2
, DO_ABD
)
328 DO_ZPZZ(sve_sabd_zpzz_s
, int32_t, H1_4
, DO_ABD
)
329 DO_ZPZZ_D(sve_sabd_zpzz_d
, int64_t, DO_ABD
)
331 DO_ZPZZ(sve_uabd_zpzz_b
, uint8_t, H1
, DO_ABD
)
332 DO_ZPZZ(sve_uabd_zpzz_h
, uint16_t, H1_2
, DO_ABD
)
333 DO_ZPZZ(sve_uabd_zpzz_s
, uint32_t, H1_4
, DO_ABD
)
334 DO_ZPZZ_D(sve_uabd_zpzz_d
, uint64_t, DO_ABD
)
/* Because the computation type is at least twice as large as required,
   these work for both signed and unsigned source types.  */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    return (n * m) >> 8;
}

static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    return (n * m) >> 16;
}

static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    return (n * m) >> 32;
}
/* 64x64->high-64 multiply, via the QEMU 128-bit multiply helpers.  */
static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}
367 DO_ZPZZ(sve_mul_zpzz_b
, uint8_t, H1
, DO_MUL
)
368 DO_ZPZZ(sve_mul_zpzz_h
, uint16_t, H1_2
, DO_MUL
)
369 DO_ZPZZ(sve_mul_zpzz_s
, uint32_t, H1_4
, DO_MUL
)
370 DO_ZPZZ_D(sve_mul_zpzz_d
, uint64_t, DO_MUL
)
372 DO_ZPZZ(sve_smulh_zpzz_b
, int8_t, H1
, do_mulh_b
)
373 DO_ZPZZ(sve_smulh_zpzz_h
, int16_t, H1_2
, do_mulh_h
)
374 DO_ZPZZ(sve_smulh_zpzz_s
, int32_t, H1_4
, do_mulh_s
)
375 DO_ZPZZ_D(sve_smulh_zpzz_d
, uint64_t, do_smulh_d
)
377 DO_ZPZZ(sve_umulh_zpzz_b
, uint8_t, H1
, do_mulh_b
)
378 DO_ZPZZ(sve_umulh_zpzz_h
, uint16_t, H1_2
, do_mulh_h
)
379 DO_ZPZZ(sve_umulh_zpzz_s
, uint32_t, H1_4
, do_mulh_s
)
380 DO_ZPZZ_D(sve_umulh_zpzz_d
, uint64_t, do_umulh_d
)
382 DO_ZPZZ(sve_sdiv_zpzz_s
, int32_t, H1_4
, DO_SDIV
)
383 DO_ZPZZ_D(sve_sdiv_zpzz_d
, int64_t, DO_SDIV
)
385 DO_ZPZZ(sve_udiv_zpzz_s
, uint32_t, H1_4
, DO_UDIV
)
386 DO_ZPZZ_D(sve_udiv_zpzz_d
, uint64_t, DO_UDIV
)
388 /* Note that all bits of the shift are significant
389 and not modulo the element size. */
390 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
391 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
392 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
394 DO_ZPZZ(sve_asr_zpzz_b
, int8_t, H1
, DO_ASR
)
395 DO_ZPZZ(sve_lsr_zpzz_b
, uint8_t, H1_2
, DO_LSR
)
396 DO_ZPZZ(sve_lsl_zpzz_b
, uint8_t, H1_4
, DO_LSL
)
398 DO_ZPZZ(sve_asr_zpzz_h
, int16_t, H1
, DO_ASR
)
399 DO_ZPZZ(sve_lsr_zpzz_h
, uint16_t, H1_2
, DO_LSR
)
400 DO_ZPZZ(sve_lsl_zpzz_h
, uint16_t, H1_4
, DO_LSL
)
402 DO_ZPZZ(sve_asr_zpzz_s
, int32_t, H1
, DO_ASR
)
403 DO_ZPZZ(sve_lsr_zpzz_s
, uint32_t, H1_2
, DO_LSR
)
404 DO_ZPZZ(sve_lsl_zpzz_s
, uint32_t, H1_4
, DO_LSL
)
406 DO_ZPZZ_D(sve_asr_zpzz_d
, int64_t, DO_ASR
)
407 DO_ZPZZ_D(sve_lsr_zpzz_d
, uint64_t, DO_LSR
)
408 DO_ZPZZ_D(sve_lsl_zpzz_d
, uint64_t, DO_LSL
)
/* Signed add-and-accumulate-long-pairwise: sum the two signed halves
 * of N and accumulate into M.
 */
static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
{
    int8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
{
    int16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
{
    int32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}
428 DO_ZPZZ(sve2_sadalp_zpzz_h
, int16_t, H1_2
, do_sadalp_h
)
429 DO_ZPZZ(sve2_sadalp_zpzz_s
, int32_t, H1_4
, do_sadalp_s
)
430 DO_ZPZZ_D(sve2_sadalp_zpzz_d
, int64_t, do_sadalp_d
)
/* Unsigned add-and-accumulate-long-pairwise.  */
static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
{
    uint8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
{
    uint16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
{
    uint32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}
450 DO_ZPZZ(sve2_uadalp_zpzz_h
, uint16_t, H1_2
, do_uadalp_h
)
451 DO_ZPZZ(sve2_uadalp_zpzz_s
, uint32_t, H1_4
, do_uadalp_s
)
452 DO_ZPZZ_D(sve2_uadalp_zpzz_d
, uint64_t, do_uadalp_d
)
454 #define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
455 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
456 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
457 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)
459 DO_ZPZZ(sve2_srshl_zpzz_b
, int8_t, H1
, do_srshl_b
)
460 DO_ZPZZ(sve2_srshl_zpzz_h
, int16_t, H1_2
, do_srshl_h
)
461 DO_ZPZZ(sve2_srshl_zpzz_s
, int32_t, H1_4
, do_srshl_s
)
462 DO_ZPZZ_D(sve2_srshl_zpzz_d
, int64_t, do_srshl_d
)
464 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
465 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
466 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
467 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)
469 DO_ZPZZ(sve2_urshl_zpzz_b
, uint8_t, H1
, do_urshl_b
)
470 DO_ZPZZ(sve2_urshl_zpzz_h
, uint16_t, H1_2
, do_urshl_h
)
471 DO_ZPZZ(sve2_urshl_zpzz_s
, uint32_t, H1_4
, do_urshl_s
)
472 DO_ZPZZ_D(sve2_urshl_zpzz_d
, uint64_t, do_urshl_d
)
475 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
476 * We pass in a pointer to a dummy saturation field to trigger
477 * the saturating arithmetic but discard the information about
478 * whether it has occurred.
480 #define do_sqshl_b(n, m) \
481 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
482 #define do_sqshl_h(n, m) \
483 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
484 #define do_sqshl_s(n, m) \
485 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
486 #define do_sqshl_d(n, m) \
487 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
489 DO_ZPZZ(sve2_sqshl_zpzz_b
, int8_t, H1_2
, do_sqshl_b
)
490 DO_ZPZZ(sve2_sqshl_zpzz_h
, int16_t, H1_2
, do_sqshl_h
)
491 DO_ZPZZ(sve2_sqshl_zpzz_s
, int32_t, H1_4
, do_sqshl_s
)
492 DO_ZPZZ_D(sve2_sqshl_zpzz_d
, int64_t, do_sqshl_d
)
494 #define do_uqshl_b(n, m) \
495 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
496 #define do_uqshl_h(n, m) \
497 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
498 #define do_uqshl_s(n, m) \
499 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
500 #define do_uqshl_d(n, m) \
501 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
503 DO_ZPZZ(sve2_uqshl_zpzz_b
, uint8_t, H1_2
, do_uqshl_b
)
504 DO_ZPZZ(sve2_uqshl_zpzz_h
, uint16_t, H1_2
, do_uqshl_h
)
505 DO_ZPZZ(sve2_uqshl_zpzz_s
, uint32_t, H1_4
, do_uqshl_s
)
506 DO_ZPZZ_D(sve2_uqshl_zpzz_d
, uint64_t, do_uqshl_d
)
508 #define do_sqrshl_b(n, m) \
509 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
510 #define do_sqrshl_h(n, m) \
511 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
512 #define do_sqrshl_s(n, m) \
513 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
514 #define do_sqrshl_d(n, m) \
515 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
517 DO_ZPZZ(sve2_sqrshl_zpzz_b
, int8_t, H1_2
, do_sqrshl_b
)
518 DO_ZPZZ(sve2_sqrshl_zpzz_h
, int16_t, H1_2
, do_sqrshl_h
)
519 DO_ZPZZ(sve2_sqrshl_zpzz_s
, int32_t, H1_4
, do_sqrshl_s
)
520 DO_ZPZZ_D(sve2_sqrshl_zpzz_d
, int64_t, do_sqrshl_d
)
524 #define do_uqrshl_b(n, m) \
525 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
526 #define do_uqrshl_h(n, m) \
527 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
528 #define do_uqrshl_s(n, m) \
529 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
530 #define do_uqrshl_d(n, m) \
531 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
533 DO_ZPZZ(sve2_uqrshl_zpzz_b
, uint8_t, H1_2
, do_uqrshl_b
)
534 DO_ZPZZ(sve2_uqrshl_zpzz_h
, uint16_t, H1_2
, do_uqrshl_h
)
535 DO_ZPZZ(sve2_uqrshl_zpzz_s
, uint32_t, H1_4
, do_uqrshl_s
)
536 DO_ZPZZ_D(sve2_uqrshl_zpzz_d
, uint64_t, do_uqrshl_d
)
540 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
541 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))
543 DO_ZPZZ(sve2_shadd_zpzz_b
, int8_t, H1
, DO_HADD_BHS
)
544 DO_ZPZZ(sve2_shadd_zpzz_h
, int16_t, H1_2
, DO_HADD_BHS
)
545 DO_ZPZZ(sve2_shadd_zpzz_s
, int32_t, H1_4
, DO_HADD_BHS
)
546 DO_ZPZZ_D(sve2_shadd_zpzz_d
, int64_t, DO_HADD_D
)
548 DO_ZPZZ(sve2_uhadd_zpzz_b
, uint8_t, H1
, DO_HADD_BHS
)
549 DO_ZPZZ(sve2_uhadd_zpzz_h
, uint16_t, H1_2
, DO_HADD_BHS
)
550 DO_ZPZZ(sve2_uhadd_zpzz_s
, uint32_t, H1_4
, DO_HADD_BHS
)
551 DO_ZPZZ_D(sve2_uhadd_zpzz_d
, uint64_t, DO_HADD_D
)
553 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
554 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))
556 DO_ZPZZ(sve2_srhadd_zpzz_b
, int8_t, H1
, DO_RHADD_BHS
)
557 DO_ZPZZ(sve2_srhadd_zpzz_h
, int16_t, H1_2
, DO_RHADD_BHS
)
558 DO_ZPZZ(sve2_srhadd_zpzz_s
, int32_t, H1_4
, DO_RHADD_BHS
)
559 DO_ZPZZ_D(sve2_srhadd_zpzz_d
, int64_t, DO_RHADD_D
)
561 DO_ZPZZ(sve2_urhadd_zpzz_b
, uint8_t, H1
, DO_RHADD_BHS
)
562 DO_ZPZZ(sve2_urhadd_zpzz_h
, uint16_t, H1_2
, DO_RHADD_BHS
)
563 DO_ZPZZ(sve2_urhadd_zpzz_s
, uint32_t, H1_4
, DO_RHADD_BHS
)
564 DO_ZPZZ_D(sve2_urhadd_zpzz_d
, uint64_t, DO_RHADD_D
)
566 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
567 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))
569 DO_ZPZZ(sve2_shsub_zpzz_b
, int8_t, H1
, DO_HSUB_BHS
)
570 DO_ZPZZ(sve2_shsub_zpzz_h
, int16_t, H1_2
, DO_HSUB_BHS
)
571 DO_ZPZZ(sve2_shsub_zpzz_s
, int32_t, H1_4
, DO_HSUB_BHS
)
572 DO_ZPZZ_D(sve2_shsub_zpzz_d
, int64_t, DO_HSUB_D
)
574 DO_ZPZZ(sve2_uhsub_zpzz_b
, uint8_t, H1
, DO_HSUB_BHS
)
575 DO_ZPZZ(sve2_uhsub_zpzz_h
, uint16_t, H1_2
, DO_HSUB_BHS
)
576 DO_ZPZZ(sve2_uhsub_zpzz_s
, uint32_t, H1_4
, DO_HSUB_BHS
)
577 DO_ZPZZ_D(sve2_uhsub_zpzz_d
, uint64_t, DO_HSUB_D
)
/* Saturate a 64-bit intermediate into [min, max]; the 64-bit
 * computation type cannot itself overflow for b/h/s elements.
 */
static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
{
    return val >= max ? max : val <= min ? min : val;
}
#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)

/* 64-bit saturating add; relies on -fwrapv for the wrapping sum,
 * as QEMU is built with it.
 */
static inline int64_t do_sqadd_d(int64_t n, int64_t m)
{
    int64_t r = n + m;
    /* Overflow iff operands share a sign and the result does not. */
    if (((r ^ n) & ~(n ^ m)) < 0) {
        /* Signed overflow. */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}
598 DO_ZPZZ(sve2_sqadd_zpzz_b
, int8_t, H1
, DO_SQADD_B
)
599 DO_ZPZZ(sve2_sqadd_zpzz_h
, int16_t, H1_2
, DO_SQADD_H
)
600 DO_ZPZZ(sve2_sqadd_zpzz_s
, int32_t, H1_4
, DO_SQADD_S
)
601 DO_ZPZZ_D(sve2_sqadd_zpzz_d
, int64_t, do_sqadd_d
)
#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)

/* 64-bit unsigned saturating add: unsigned wrap detects overflow.  */
static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
{
    uint64_t r = n + m;
    return r < n ? UINT64_MAX : r;
}
613 DO_ZPZZ(sve2_uqadd_zpzz_b
, uint8_t, H1
, DO_UQADD_B
)
614 DO_ZPZZ(sve2_uqadd_zpzz_h
, uint16_t, H1_2
, DO_UQADD_H
)
615 DO_ZPZZ(sve2_uqadd_zpzz_s
, uint32_t, H1_4
, DO_UQADD_S
)
616 DO_ZPZZ_D(sve2_uqadd_zpzz_d
, uint64_t, do_uqadd_d
)
#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)

/* 64-bit saturating subtract; relies on -fwrapv like do_sqadd_d.  */
static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    int64_t r = n - m;
    /* Overflow iff operands differ in sign and result differs from n. */
    if (((r ^ n) & (n ^ m)) < 0) {
        /* Signed overflow. */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}
632 DO_ZPZZ(sve2_sqsub_zpzz_b
, int8_t, H1
, DO_SQSUB_B
)
633 DO_ZPZZ(sve2_sqsub_zpzz_h
, int16_t, H1_2
, DO_SQSUB_H
)
634 DO_ZPZZ(sve2_sqsub_zpzz_s
, int32_t, H1_4
, DO_SQSUB_S
)
635 DO_ZPZZ_D(sve2_sqsub_zpzz_d
, int64_t, do_sqsub_d
)
#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)

/* 64-bit unsigned saturating subtract clamps at zero.  */
static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
    return n > m ? n - m : 0;
}
646 DO_ZPZZ(sve2_uqsub_zpzz_b
, uint8_t, H1
, DO_UQSUB_B
)
647 DO_ZPZZ(sve2_uqsub_zpzz_h
, uint16_t, H1_2
, DO_UQSUB_H
)
648 DO_ZPZZ(sve2_uqsub_zpzz_s
, uint32_t, H1_4
, DO_UQSUB_S
)
649 DO_ZPZZ_D(sve2_uqsub_zpzz_d
, uint64_t, do_uqsub_d
)
#define DO_SUQADD_B(n, m) \
    do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SUQADD_H(n, m) \
    do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SUQADD_S(n, m) \
    do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)

/* Signed value plus unsigned value, saturated to signed 64-bit.  */
static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /* Note that m - abs(n) cannot underflow. */
        if (r > INT64_MAX) {
            /* Result is either very large positive or negative. */
            if (m > -n) {
                /* m > abs(n), so r is a very large positive. */
                return INT64_MAX;
            }
            /* Result is negative. */
        }
    } else {
        /* Both inputs are positive: check for overflow. */
        if (r < m || r > INT64_MAX) {
            return INT64_MAX;
        }
    }
    return r;
}
681 DO_ZPZZ(sve2_suqadd_zpzz_b
, uint8_t, H1
, DO_SUQADD_B
)
682 DO_ZPZZ(sve2_suqadd_zpzz_h
, uint16_t, H1_2
, DO_SUQADD_H
)
683 DO_ZPZZ(sve2_suqadd_zpzz_s
, uint32_t, H1_4
, DO_SUQADD_S
)
684 DO_ZPZZ_D(sve2_suqadd_zpzz_d
, uint64_t, do_suqadd_d
)
#define DO_USQADD_B(n, m) \
    do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
#define DO_USQADD_H(n, m) \
    do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
#define DO_USQADD_S(n, m) \
    do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)

/* Unsigned value plus signed value, saturated to unsigned 64-bit.  */
static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t r = n + m;

    if (m < 0) {
        /* Adding a negative: clamp at zero on underflow. */
        return n < -m ? 0 : r;
    }
    /* Adding a positive: clamp at UINT64_MAX on overflow. */
    return r < n ? UINT64_MAX : r;
}
703 DO_ZPZZ(sve2_usqadd_zpzz_b
, uint8_t, H1
, DO_USQADD_B
)
704 DO_ZPZZ(sve2_usqadd_zpzz_h
, uint16_t, H1_2
, DO_USQADD_H
)
705 DO_ZPZZ(sve2_usqadd_zpzz_s
, uint32_t, H1_4
, DO_USQADD_S
)
706 DO_ZPZZ_D(sve2_usqadd_zpzz_d
, uint64_t, do_usqadd_d
)
/*
 * Three operand expander, operating on element pairs.
 * If the slot I is even, the elements from VN {I, I+1}.
 * If the slot I is odd, the elements from VM {I-1, I}.
 * Load all of the input elements in each pair before overwriting output.
 */
#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP)                                  \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    intptr_t i, opr_sz = simd_oprsz(desc);                               \
    for (i = 0; i < opr_sz; ) {                                          \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                  \
        do {                                                             \
            TYPE n0 = *(TYPE *)(vn + H(i));                              \
            TYPE m0 = *(TYPE *)(vm + H(i));                              \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));               \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));               \
            if (pg & 1) {                                                \
                *(TYPE *)(vd + H(i)) = OP(n0, n1);                       \
            }                                                            \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                      \
            if (pg & 1) {                                                \
                *(TYPE *)(vd + H(i)) = OP(m0, m1);                       \
            }                                                            \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                      \
        } while (i & 15);                                                \
    }                                                                    \
}
/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP)                                   \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                           \
    TYPE *d = vd, *n = vn, *m = vm;                                      \
    uint8_t *pg = vg;                                                    \
    for (i = 0; i < opr_sz; i += 2) {                                    \
        TYPE n0 = n[i], n1 = n[i + 1];                                   \
        TYPE m0 = m[i], m1 = m[i + 1];                                   \
        if (pg[H1(i)] & 1) {                                             \
            d[i] = OP(n0, n1);                                           \
        }                                                                \
        if (pg[H1(i + 1)] & 1) {                                         \
            d[i + 1] = OP(m0, m1);                                       \
        }                                                                \
    }                                                                    \
}
759 DO_ZPZZ_PAIR(sve2_addp_zpzz_b
, uint8_t, H1
, DO_ADD
)
760 DO_ZPZZ_PAIR(sve2_addp_zpzz_h
, uint16_t, H1_2
, DO_ADD
)
761 DO_ZPZZ_PAIR(sve2_addp_zpzz_s
, uint32_t, H1_4
, DO_ADD
)
762 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d
, uint64_t, DO_ADD
)
764 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b
, uint8_t, H1
, DO_MAX
)
765 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h
, uint16_t, H1_2
, DO_MAX
)
766 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s
, uint32_t, H1_4
, DO_MAX
)
767 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d
, uint64_t, DO_MAX
)
769 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b
, uint8_t, H1
, DO_MIN
)
770 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h
, uint16_t, H1_2
, DO_MIN
)
771 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s
, uint32_t, H1_4
, DO_MIN
)
772 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d
, uint64_t, DO_MIN
)
774 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b
, int8_t, H1
, DO_MAX
)
775 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h
, int16_t, H1_2
, DO_MAX
)
776 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s
, int32_t, H1_4
, DO_MAX
)
777 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d
, int64_t, DO_MAX
)
779 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b
, int8_t, H1
, DO_MIN
)
780 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h
, int16_t, H1_2
, DO_MIN
)
781 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s
, int32_t, H1_4
, DO_MIN
)
782 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d
, int64_t, DO_MIN
)
785 #undef DO_ZPZZ_PAIR_D
787 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
788 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
789 void *status, uint32_t desc) \
791 intptr_t i, opr_sz = simd_oprsz(desc); \
792 for (i = 0; i < opr_sz; ) { \
793 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
795 TYPE n0 = *(TYPE *)(vn + H(i)); \
796 TYPE m0 = *(TYPE *)(vm + H(i)); \
797 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
798 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
800 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
802 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
804 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
806 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
811 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h
, float16
, H1_2
, float16_add
)
812 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s
, float32
, H1_4
, float32_add
)
813 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d
, float64
, H1_8
, float64_add
)
815 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h
, float16
, H1_2
, float16_maxnum
)
816 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s
, float32
, H1_4
, float32_maxnum
)
817 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d
, float64
, H1_8
, float64_maxnum
)
819 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h
, float16
, H1_2
, float16_minnum
)
820 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s
, float32
, H1_4
, float32_minnum
)
821 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d
, float64
, H1_8
, float64_minnum
)
823 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h
, float16
, H1_2
, float16_max
)
824 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s
, float32
, H1_4
, float32_max
)
825 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d
, float64
, H1_8
, float64_max
)
827 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h
, float16
, H1_2
, float16_min
)
828 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s
, float32
, H1_4
, float32_min
)
829 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d
, float64
, H1_8
, float64_min
)
831 #undef DO_ZPZZ_PAIR_FP
833 /* Three-operand expander, controlled by a predicate, in which the
834 * third operand is "wide". That is, for D = N op M, the same 64-bit
835 * value of M is used with all of the narrower values of N.
837 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
838 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
840 intptr_t i, opr_sz = simd_oprsz(desc); \
841 for (i = 0; i < opr_sz; ) { \
842 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
843 TYPEW mm = *(TYPEW *)(vm + i); \
846 TYPE nn = *(TYPE *)(vn + H(i)); \
847 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
849 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
854 DO_ZPZW(sve_asr_zpzw_b
, int8_t, uint64_t, H1
, DO_ASR
)
855 DO_ZPZW(sve_lsr_zpzw_b
, uint8_t, uint64_t, H1
, DO_LSR
)
856 DO_ZPZW(sve_lsl_zpzw_b
, uint8_t, uint64_t, H1
, DO_LSL
)
858 DO_ZPZW(sve_asr_zpzw_h
, int16_t, uint64_t, H1_2
, DO_ASR
)
859 DO_ZPZW(sve_lsr_zpzw_h
, uint16_t, uint64_t, H1_2
, DO_LSR
)
860 DO_ZPZW(sve_lsl_zpzw_h
, uint16_t, uint64_t, H1_2
, DO_LSL
)
862 DO_ZPZW(sve_asr_zpzw_s
, int32_t, uint64_t, H1_4
, DO_ASR
)
863 DO_ZPZW(sve_lsr_zpzw_s
, uint32_t, uint64_t, H1_4
, DO_LSR
)
864 DO_ZPZW(sve_lsl_zpzw_s
, uint32_t, uint64_t, H1_4
, DO_LSL
)
/* Fully general two-operand expander, controlled by a predicate.
 */
#define DO_ZPZ(NAME, TYPE, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn);                  \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn;                                      \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            d[i] = OP(n[i]);                                    \
        }                                                       \
    }                                                           \
}
901 #define DO_CLS_B(N) (clrsb32(N) - 24)
902 #define DO_CLS_H(N) (clrsb32(N) - 16)
904 DO_ZPZ(sve_cls_b
, int8_t, H1
, DO_CLS_B
)
905 DO_ZPZ(sve_cls_h
, int16_t, H1_2
, DO_CLS_H
)
906 DO_ZPZ(sve_cls_s
, int32_t, H1_4
, clrsb32
)
907 DO_ZPZ_D(sve_cls_d
, int64_t, clrsb64
)
909 #define DO_CLZ_B(N) (clz32(N) - 24)
910 #define DO_CLZ_H(N) (clz32(N) - 16)
912 DO_ZPZ(sve_clz_b
, uint8_t, H1
, DO_CLZ_B
)
913 DO_ZPZ(sve_clz_h
, uint16_t, H1_2
, DO_CLZ_H
)
914 DO_ZPZ(sve_clz_s
, uint32_t, H1_4
, clz32
)
915 DO_ZPZ_D(sve_clz_d
, uint64_t, clz64
)
917 DO_ZPZ(sve_cnt_zpz_b
, uint8_t, H1
, ctpop8
)
918 DO_ZPZ(sve_cnt_zpz_h
, uint16_t, H1_2
, ctpop16
)
919 DO_ZPZ(sve_cnt_zpz_s
, uint32_t, H1_4
, ctpop32
)
920 DO_ZPZ_D(sve_cnt_zpz_d
, uint64_t, ctpop64
)
922 #define DO_CNOT(N) (N == 0)
924 DO_ZPZ(sve_cnot_b
, uint8_t, H1
, DO_CNOT
)
925 DO_ZPZ(sve_cnot_h
, uint16_t, H1_2
, DO_CNOT
)
926 DO_ZPZ(sve_cnot_s
, uint32_t, H1_4
, DO_CNOT
)
927 DO_ZPZ_D(sve_cnot_d
, uint64_t, DO_CNOT
)
929 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
931 DO_ZPZ(sve_fabs_h
, uint16_t, H1_2
, DO_FABS
)
932 DO_ZPZ(sve_fabs_s
, uint32_t, H1_4
, DO_FABS
)
933 DO_ZPZ_D(sve_fabs_d
, uint64_t, DO_FABS
)
935 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
937 DO_ZPZ(sve_fneg_h
, uint16_t, H1_2
, DO_FNEG
)
938 DO_ZPZ(sve_fneg_s
, uint32_t, H1_4
, DO_FNEG
)
939 DO_ZPZ_D(sve_fneg_d
, uint64_t, DO_FNEG
)
941 #define DO_NOT(N) (~N)
943 DO_ZPZ(sve_not_zpz_b
, uint8_t, H1
, DO_NOT
)
944 DO_ZPZ(sve_not_zpz_h
, uint16_t, H1_2
, DO_NOT
)
945 DO_ZPZ(sve_not_zpz_s
, uint32_t, H1_4
, DO_NOT
)
946 DO_ZPZ_D(sve_not_zpz_d
, uint64_t, DO_NOT
)
948 #define DO_SXTB(N) ((int8_t)N)
949 #define DO_SXTH(N) ((int16_t)N)
950 #define DO_SXTS(N) ((int32_t)N)
951 #define DO_UXTB(N) ((uint8_t)N)
952 #define DO_UXTH(N) ((uint16_t)N)
953 #define DO_UXTS(N) ((uint32_t)N)
955 DO_ZPZ(sve_sxtb_h
, uint16_t, H1_2
, DO_SXTB
)
956 DO_ZPZ(sve_sxtb_s
, uint32_t, H1_4
, DO_SXTB
)
957 DO_ZPZ(sve_sxth_s
, uint32_t, H1_4
, DO_SXTH
)
958 DO_ZPZ_D(sve_sxtb_d
, uint64_t, DO_SXTB
)
959 DO_ZPZ_D(sve_sxth_d
, uint64_t, DO_SXTH
)
960 DO_ZPZ_D(sve_sxtw_d
, uint64_t, DO_SXTS
)
962 DO_ZPZ(sve_uxtb_h
, uint16_t, H1_2
, DO_UXTB
)
963 DO_ZPZ(sve_uxtb_s
, uint32_t, H1_4
, DO_UXTB
)
964 DO_ZPZ(sve_uxth_s
, uint32_t, H1_4
, DO_UXTH
)
965 DO_ZPZ_D(sve_uxtb_d
, uint64_t, DO_UXTB
)
966 DO_ZPZ_D(sve_uxth_d
, uint64_t, DO_UXTH
)
967 DO_ZPZ_D(sve_uxtw_d
, uint64_t, DO_UXTS
)
969 #define DO_ABS(N) (N < 0 ? -N : N)
971 DO_ZPZ(sve_abs_b
, int8_t, H1
, DO_ABS
)
972 DO_ZPZ(sve_abs_h
, int16_t, H1_2
, DO_ABS
)
973 DO_ZPZ(sve_abs_s
, int32_t, H1_4
, DO_ABS
)
974 DO_ZPZ_D(sve_abs_d
, int64_t, DO_ABS
)
976 #define DO_NEG(N) (-N)
978 DO_ZPZ(sve_neg_b
, uint8_t, H1
, DO_NEG
)
979 DO_ZPZ(sve_neg_h
, uint16_t, H1_2
, DO_NEG
)
980 DO_ZPZ(sve_neg_s
, uint32_t, H1_4
, DO_NEG
)
981 DO_ZPZ_D(sve_neg_d
, uint64_t, DO_NEG
)
983 DO_ZPZ(sve_revb_h
, uint16_t, H1_2
, bswap16
)
984 DO_ZPZ(sve_revb_s
, uint32_t, H1_4
, bswap32
)
985 DO_ZPZ_D(sve_revb_d
, uint64_t, bswap64
)
987 DO_ZPZ(sve_revh_s
, uint32_t, H1_4
, hswap32
)
988 DO_ZPZ_D(sve_revh_d
, uint64_t, hswap64
)
990 DO_ZPZ_D(sve_revw_d
, uint64_t, wswap64
)
992 DO_ZPZ(sve_rbit_b
, uint8_t, H1
, revbit8
)
993 DO_ZPZ(sve_rbit_h
, uint16_t, H1_2
, revbit16
)
994 DO_ZPZ(sve_rbit_s
, uint32_t, H1_4
, revbit32
)
995 DO_ZPZ_D(sve_rbit_d
, uint64_t, revbit64
)
997 #define DO_SQABS(X) \
998 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
999 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
1001 DO_ZPZ(sve2_sqabs_b
, int8_t, H1
, DO_SQABS
)
1002 DO_ZPZ(sve2_sqabs_h
, int16_t, H1_2
, DO_SQABS
)
1003 DO_ZPZ(sve2_sqabs_s
, int32_t, H1_4
, DO_SQABS
)
1004 DO_ZPZ_D(sve2_sqabs_d
, int64_t, DO_SQABS
)
1006 #define DO_SQNEG(X) \
1007 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
1008 x_ == min_ ? -min_ - 1 : -x_; })
1010 DO_ZPZ(sve2_sqneg_b
, uint8_t, H1
, DO_SQNEG
)
1011 DO_ZPZ(sve2_sqneg_h
, uint16_t, H1_2
, DO_SQNEG
)
1012 DO_ZPZ(sve2_sqneg_s
, uint32_t, H1_4
, DO_SQNEG
)
1013 DO_ZPZ_D(sve2_sqneg_d
, uint64_t, DO_SQNEG
)
1015 DO_ZPZ(sve2_urecpe_s
, uint32_t, H1_4
, helper_recpe_u32
)
1016 DO_ZPZ(sve2_ursqrte_s
, uint32_t, H1_4
, helper_rsqrte_u32
)
1018 /* Three-operand expander, unpredicated, in which the third operand is "wide".
1020 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
1021 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1023 intptr_t i, opr_sz = simd_oprsz(desc); \
1024 for (i = 0; i < opr_sz; ) { \
1025 TYPEW mm = *(TYPEW *)(vm + i); \
1027 TYPE nn = *(TYPE *)(vn + H(i)); \
1028 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
1029 i += sizeof(TYPE); \
1034 DO_ZZW(sve_asr_zzw_b
, int8_t, uint64_t, H1
, DO_ASR
)
1035 DO_ZZW(sve_lsr_zzw_b
, uint8_t, uint64_t, H1
, DO_LSR
)
1036 DO_ZZW(sve_lsl_zzw_b
, uint8_t, uint64_t, H1
, DO_LSL
)
1038 DO_ZZW(sve_asr_zzw_h
, int16_t, uint64_t, H1_2
, DO_ASR
)
1039 DO_ZZW(sve_lsr_zzw_h
, uint16_t, uint64_t, H1_2
, DO_LSR
)
1040 DO_ZZW(sve_lsl_zzw_h
, uint16_t, uint64_t, H1_2
, DO_LSL
)
1042 DO_ZZW(sve_asr_zzw_s
, int32_t, uint64_t, H1_4
, DO_ASR
)
1043 DO_ZZW(sve_lsr_zzw_s
, uint32_t, uint64_t, H1_4
, DO_LSR
)
1044 DO_ZZW(sve_lsl_zzw_s
, uint32_t, uint64_t, H1_4
, DO_LSL
)
1061 * Three-operand expander, unpredicated, in which the two inputs are
1062 * selected from the top or bottom half of the wide column.
1064 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1065 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1067 intptr_t i, opr_sz = simd_oprsz(desc); \
1068 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1069 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1070 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1071 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1072 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1073 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1077 DO_ZZZ_TB(sve2_saddl_h
, int16_t, int8_t, H1_2
, H1
, DO_ADD
)
1078 DO_ZZZ_TB(sve2_saddl_s
, int32_t, int16_t, H1_4
, H1_2
, DO_ADD
)
1079 DO_ZZZ_TB(sve2_saddl_d
, int64_t, int32_t, H1_8
, H1_4
, DO_ADD
)
1081 DO_ZZZ_TB(sve2_ssubl_h
, int16_t, int8_t, H1_2
, H1
, DO_SUB
)
1082 DO_ZZZ_TB(sve2_ssubl_s
, int32_t, int16_t, H1_4
, H1_2
, DO_SUB
)
1083 DO_ZZZ_TB(sve2_ssubl_d
, int64_t, int32_t, H1_8
, H1_4
, DO_SUB
)
1085 DO_ZZZ_TB(sve2_sabdl_h
, int16_t, int8_t, H1_2
, H1
, DO_ABD
)
1086 DO_ZZZ_TB(sve2_sabdl_s
, int32_t, int16_t, H1_4
, H1_2
, DO_ABD
)
1087 DO_ZZZ_TB(sve2_sabdl_d
, int64_t, int32_t, H1_8
, H1_4
, DO_ABD
)
1089 DO_ZZZ_TB(sve2_uaddl_h
, uint16_t, uint8_t, H1_2
, H1
, DO_ADD
)
1090 DO_ZZZ_TB(sve2_uaddl_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_ADD
)
1091 DO_ZZZ_TB(sve2_uaddl_d
, uint64_t, uint32_t, H1_8
, H1_4
, DO_ADD
)
1093 DO_ZZZ_TB(sve2_usubl_h
, uint16_t, uint8_t, H1_2
, H1
, DO_SUB
)
1094 DO_ZZZ_TB(sve2_usubl_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_SUB
)
1095 DO_ZZZ_TB(sve2_usubl_d
, uint64_t, uint32_t, H1_8
, H1_4
, DO_SUB
)
1097 DO_ZZZ_TB(sve2_uabdl_h
, uint16_t, uint8_t, H1_2
, H1
, DO_ABD
)
1098 DO_ZZZ_TB(sve2_uabdl_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_ABD
)
1099 DO_ZZZ_TB(sve2_uabdl_d
, uint64_t, uint32_t, H1_8
, H1_4
, DO_ABD
)
1101 DO_ZZZ_TB(sve2_smull_zzz_h
, int16_t, int8_t, H1_2
, H1
, DO_MUL
)
1102 DO_ZZZ_TB(sve2_smull_zzz_s
, int32_t, int16_t, H1_4
, H1_2
, DO_MUL
)
1103 DO_ZZZ_TB(sve2_smull_zzz_d
, int64_t, int32_t, H1_8
, H1_4
, DO_MUL
)
1105 DO_ZZZ_TB(sve2_umull_zzz_h
, uint16_t, uint8_t, H1_2
, H1
, DO_MUL
)
1106 DO_ZZZ_TB(sve2_umull_zzz_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_MUL
)
1107 DO_ZZZ_TB(sve2_umull_zzz_d
, uint64_t, uint32_t, H1_8
, H1_4
, DO_MUL
)
1109 /* Note that the multiply cannot overflow, but the doubling can. */
1110 static inline int16_t do_sqdmull_h(int16_t n
, int16_t m
)
1112 int16_t val
= n
* m
;
1113 return DO_SQADD_H(val
, val
);
1116 static inline int32_t do_sqdmull_s(int32_t n
, int32_t m
)
1118 int32_t val
= n
* m
;
1119 return DO_SQADD_S(val
, val
);
1122 static inline int64_t do_sqdmull_d(int64_t n
, int64_t m
)
1124 int64_t val
= n
* m
;
1125 return do_sqadd_d(val
, val
);
1128 DO_ZZZ_TB(sve2_sqdmull_zzz_h
, int16_t, int8_t, H1_2
, H1
, do_sqdmull_h
)
1129 DO_ZZZ_TB(sve2_sqdmull_zzz_s
, int32_t, int16_t, H1_4
, H1_2
, do_sqdmull_s
)
1130 DO_ZZZ_TB(sve2_sqdmull_zzz_d
, int64_t, int32_t, H1_8
, H1_4
, do_sqdmull_d
)
1134 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1135 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1137 intptr_t i, opr_sz = simd_oprsz(desc); \
1138 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1139 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1140 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
1141 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1142 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1146 DO_ZZZ_WTB(sve2_saddw_h
, int16_t, int8_t, H1_2
, H1
, DO_ADD
)
1147 DO_ZZZ_WTB(sve2_saddw_s
, int32_t, int16_t, H1_4
, H1_2
, DO_ADD
)
1148 DO_ZZZ_WTB(sve2_saddw_d
, int64_t, int32_t, H1_8
, H1_4
, DO_ADD
)
1150 DO_ZZZ_WTB(sve2_ssubw_h
, int16_t, int8_t, H1_2
, H1
, DO_SUB
)
1151 DO_ZZZ_WTB(sve2_ssubw_s
, int32_t, int16_t, H1_4
, H1_2
, DO_SUB
)
1152 DO_ZZZ_WTB(sve2_ssubw_d
, int64_t, int32_t, H1_8
, H1_4
, DO_SUB
)
1154 DO_ZZZ_WTB(sve2_uaddw_h
, uint16_t, uint8_t, H1_2
, H1
, DO_ADD
)
1155 DO_ZZZ_WTB(sve2_uaddw_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_ADD
)
1156 DO_ZZZ_WTB(sve2_uaddw_d
, uint64_t, uint32_t, H1_8
, H1_4
, DO_ADD
)
1158 DO_ZZZ_WTB(sve2_usubw_h
, uint16_t, uint8_t, H1_2
, H1
, DO_SUB
)
1159 DO_ZZZ_WTB(sve2_usubw_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_SUB
)
1160 DO_ZZZ_WTB(sve2_usubw_d
, uint64_t, uint32_t, H1_8
, H1_4
, DO_SUB
)
1164 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
1165 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1167 intptr_t i, opr_sz = simd_oprsz(desc); \
1168 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1169 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1170 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1171 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
1172 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
1173 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
1177 DO_ZZZ_NTB(sve2_eoril_b
, uint8_t, H1
, DO_EOR
)
1178 DO_ZZZ_NTB(sve2_eoril_h
, uint16_t, H1_2
, DO_EOR
)
1179 DO_ZZZ_NTB(sve2_eoril_s
, uint32_t, H1_4
, DO_EOR
)
1180 DO_ZZZ_NTB(sve2_eoril_d
, uint64_t, H1_8
, DO_EOR
)
1184 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1185 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1187 intptr_t i, opr_sz = simd_oprsz(desc); \
1188 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
1189 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1190 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1191 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
1192 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1193 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
1197 DO_ZZZW_ACC(sve2_sabal_h
, int16_t, int8_t, H1_2
, H1
, DO_ABD
)
1198 DO_ZZZW_ACC(sve2_sabal_s
, int32_t, int16_t, H1_4
, H1_2
, DO_ABD
)
1199 DO_ZZZW_ACC(sve2_sabal_d
, int64_t, int32_t, H1_8
, H1_4
, DO_ABD
)
1201 DO_ZZZW_ACC(sve2_uabal_h
, uint16_t, uint8_t, H1_2
, H1
, DO_ABD
)
1202 DO_ZZZW_ACC(sve2_uabal_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_ABD
)
1203 DO_ZZZW_ACC(sve2_uabal_d
, uint64_t, uint32_t, H1_8
, H1_4
, DO_ABD
)
1205 DO_ZZZW_ACC(sve2_smlal_zzzw_h
, int16_t, int8_t, H1_2
, H1
, DO_MUL
)
1206 DO_ZZZW_ACC(sve2_smlal_zzzw_s
, int32_t, int16_t, H1_4
, H1_2
, DO_MUL
)
1207 DO_ZZZW_ACC(sve2_smlal_zzzw_d
, int64_t, int32_t, H1_8
, H1_4
, DO_MUL
)
1209 DO_ZZZW_ACC(sve2_umlal_zzzw_h
, uint16_t, uint8_t, H1_2
, H1
, DO_MUL
)
1210 DO_ZZZW_ACC(sve2_umlal_zzzw_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_MUL
)
1211 DO_ZZZW_ACC(sve2_umlal_zzzw_d
, uint64_t, uint32_t, H1_8
, H1_4
, DO_MUL
)
1213 #define DO_NMUL(N, M) -(N * M)
1215 DO_ZZZW_ACC(sve2_smlsl_zzzw_h
, int16_t, int8_t, H1_2
, H1
, DO_NMUL
)
1216 DO_ZZZW_ACC(sve2_smlsl_zzzw_s
, int32_t, int16_t, H1_4
, H1_2
, DO_NMUL
)
1217 DO_ZZZW_ACC(sve2_smlsl_zzzw_d
, int64_t, int32_t, H1_8
, H1_4
, DO_NMUL
)
1219 DO_ZZZW_ACC(sve2_umlsl_zzzw_h
, uint16_t, uint8_t, H1_2
, H1
, DO_NMUL
)
1220 DO_ZZZW_ACC(sve2_umlsl_zzzw_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_NMUL
)
1221 DO_ZZZW_ACC(sve2_umlsl_zzzw_d
, uint64_t, uint32_t, H1_8
, H1_4
, DO_NMUL
)
1225 #define DO_XTNB(NAME, TYPE, OP) \
1226 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1228 intptr_t i, opr_sz = simd_oprsz(desc); \
1229 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1230 TYPE nn = *(TYPE *)(vn + i); \
1231 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
1232 *(TYPE *)(vd + i) = nn; \
1236 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
1237 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1239 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
1240 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1241 TYPE nn = *(TYPE *)(vn + i); \
1242 *(TYPEN *)(vd + i + odd) = OP(nn); \
1246 #define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
1247 #define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
1248 #define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)
1250 DO_XTNB(sve2_sqxtnb_h
, int16_t, DO_SQXTN_H
)
1251 DO_XTNB(sve2_sqxtnb_s
, int32_t, DO_SQXTN_S
)
1252 DO_XTNB(sve2_sqxtnb_d
, int64_t, DO_SQXTN_D
)
1254 DO_XTNT(sve2_sqxtnt_h
, int16_t, int8_t, H1
, DO_SQXTN_H
)
1255 DO_XTNT(sve2_sqxtnt_s
, int32_t, int16_t, H1_2
, DO_SQXTN_S
)
1256 DO_XTNT(sve2_sqxtnt_d
, int64_t, int32_t, H1_4
, DO_SQXTN_D
)
1258 #define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
1259 #define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
1260 #define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)
1262 DO_XTNB(sve2_uqxtnb_h
, uint16_t, DO_UQXTN_H
)
1263 DO_XTNB(sve2_uqxtnb_s
, uint32_t, DO_UQXTN_S
)
1264 DO_XTNB(sve2_uqxtnb_d
, uint64_t, DO_UQXTN_D
)
1266 DO_XTNT(sve2_uqxtnt_h
, uint16_t, uint8_t, H1
, DO_UQXTN_H
)
1267 DO_XTNT(sve2_uqxtnt_s
, uint32_t, uint16_t, H1_2
, DO_UQXTN_S
)
1268 DO_XTNT(sve2_uqxtnt_d
, uint64_t, uint32_t, H1_4
, DO_UQXTN_D
)
1270 DO_XTNB(sve2_sqxtunb_h
, int16_t, DO_UQXTN_H
)
1271 DO_XTNB(sve2_sqxtunb_s
, int32_t, DO_UQXTN_S
)
1272 DO_XTNB(sve2_sqxtunb_d
, int64_t, DO_UQXTN_D
)
1274 DO_XTNT(sve2_sqxtunt_h
, int16_t, int8_t, H1
, DO_UQXTN_H
)
1275 DO_XTNT(sve2_sqxtunt_s
, int32_t, int16_t, H1_2
, DO_UQXTN_S
)
1276 DO_XTNT(sve2_sqxtunt_d
, int64_t, int32_t, H1_4
, DO_UQXTN_D
)
1281 void HELPER(sve2_adcl_s
)(void *vd
, void *vn
, void *vm
, void *va
, uint32_t desc
)
1283 intptr_t i
, opr_sz
= simd_oprsz(desc
);
1284 int sel
= H4(extract32(desc
, SIMD_DATA_SHIFT
, 1));
1285 uint32_t inv
= -extract32(desc
, SIMD_DATA_SHIFT
+ 1, 1);
1286 uint32_t *a
= va
, *n
= vn
;
1287 uint64_t *d
= vd
, *m
= vm
;
1289 for (i
= 0; i
< opr_sz
/ 8; ++i
) {
1290 uint32_t e1
= a
[2 * i
+ H4(0)];
1291 uint32_t e2
= n
[2 * i
+ sel
] ^ inv
;
1292 uint64_t c
= extract64(m
[i
], 32, 1);
1293 /* Compute and store the entire 33-bit result at once. */
1298 void HELPER(sve2_adcl_d
)(void *vd
, void *vn
, void *vm
, void *va
, uint32_t desc
)
1300 intptr_t i
, opr_sz
= simd_oprsz(desc
);
1301 int sel
= extract32(desc
, SIMD_DATA_SHIFT
, 1);
1302 uint64_t inv
= -(uint64_t)extract32(desc
, SIMD_DATA_SHIFT
+ 1, 1);
1303 uint64_t *d
= vd
, *a
= va
, *n
= vn
, *m
= vm
;
1305 for (i
= 0; i
< opr_sz
/ 8; i
+= 2) {
1306 Int128 e1
= int128_make64(a
[i
]);
1307 Int128 e2
= int128_make64(n
[i
+ sel
] ^ inv
);
1308 Int128 c
= int128_make64(m
[i
+ 1] & 1);
1309 Int128 r
= int128_add(int128_add(e1
, e2
), c
);
1310 d
[i
+ 0] = int128_getlo(r
);
1311 d
[i
+ 1] = int128_gethi(r
);
1315 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1316 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1318 intptr_t i, opr_sz = simd_oprsz(desc); \
1319 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1320 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1321 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1322 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1323 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1324 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1325 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
1329 DO_SQDMLAL(sve2_sqdmlal_zzzw_h
, int16_t, int8_t, H1_2
, H1
,
1330 do_sqdmull_h
, DO_SQADD_H
)
1331 DO_SQDMLAL(sve2_sqdmlal_zzzw_s
, int32_t, int16_t, H1_4
, H1_2
,
1332 do_sqdmull_s
, DO_SQADD_S
)
1333 DO_SQDMLAL(sve2_sqdmlal_zzzw_d
, int64_t, int32_t, H1_8
, H1_4
,
1334 do_sqdmull_d
, do_sqadd_d
)
1336 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h
, int16_t, int8_t, H1_2
, H1
,
1337 do_sqdmull_h
, DO_SQSUB_H
)
1338 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s
, int32_t, int16_t, H1_4
, H1_2
,
1339 do_sqdmull_s
, DO_SQSUB_S
)
1340 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d
, int64_t, int32_t, H1_8
, H1_4
,
1341 do_sqdmull_d
, do_sqsub_d
)
1345 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1346 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1348 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1349 int rot = simd_data(desc); \
1350 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1351 bool sub_r = rot == 1 || rot == 2; \
1352 bool sub_i = rot >= 2; \
1353 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1354 for (i = 0; i < opr_sz; i += 2) { \
1355 TYPE elt1_a = n[H(i + sel_a)]; \
1356 TYPE elt2_a = m[H(i + sel_a)]; \
1357 TYPE elt2_b = m[H(i + sel_b)]; \
1358 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \
1359 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \
1363 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
1365 DO_CMLA_FUNC(sve2_cmla_zzzz_b
, uint8_t, H1
, DO_CMLA
)
1366 DO_CMLA_FUNC(sve2_cmla_zzzz_h
, uint16_t, H2
, DO_CMLA
)
1367 DO_CMLA_FUNC(sve2_cmla_zzzz_s
, uint32_t, H4
, DO_CMLA
)
1368 DO_CMLA_FUNC(sve2_cmla_zzzz_d
, uint64_t, H8
, DO_CMLA
)
1370 #define DO_SQRDMLAH_B(N, M, A, S) \
1371 do_sqrdmlah_b(N, M, A, S, true)
1372 #define DO_SQRDMLAH_H(N, M, A, S) \
1373 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1374 #define DO_SQRDMLAH_S(N, M, A, S) \
1375 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1376 #define DO_SQRDMLAH_D(N, M, A, S) \
1377 do_sqrdmlah_d(N, M, A, S, true)
1379 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b
, int8_t, H1
, DO_SQRDMLAH_B
)
1380 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h
, int16_t, H2
, DO_SQRDMLAH_H
)
1381 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s
, int32_t, H4
, DO_SQRDMLAH_S
)
1382 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d
, int64_t, H8
, DO_SQRDMLAH_D
)
1384 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1385 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1387 intptr_t i, j, oprsz = simd_oprsz(desc); \
1388 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \
1389 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \
1390 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1391 bool sub_r = rot == 1 || rot == 2; \
1392 bool sub_i = rot >= 2; \
1393 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1394 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \
1395 TYPE elt2_a = m[H(i + idx + sel_a)]; \
1396 TYPE elt2_b = m[H(i + idx + sel_b)]; \
1397 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \
1398 TYPE elt1_a = n[H(i + j + sel_a)]; \
1399 d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \
1400 d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \
1405 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h
, int16_t, H2
, DO_CMLA
)
1406 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s
, int32_t, H4
, DO_CMLA
)
1408 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h
, int16_t, H2
, DO_SQRDMLAH_H
)
1409 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s
, int32_t, H4
, DO_SQRDMLAH_S
)
1413 #undef DO_CMLA_IDX_FUNC
1414 #undef DO_SQRDMLAH_B
1415 #undef DO_SQRDMLAH_H
1416 #undef DO_SQRDMLAH_S
1417 #undef DO_SQRDMLAH_D
/* Note N and M are 4 elements bundled into one unit.
 * Complex dot product over two (real, imag) int8 pairs packed in N,
 * with the M pair selected by SEL_A/SEL_B and the imaginary term
 * negated when SUB_I is -1.  Returns the updated accumulator.
 */
static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
                         int sel_a, int sel_b, int sub_i)
{
    for (int i = 0; i <= 1; i++) {
        int32_t elt1_r = (int8_t)(n >> (16 * i));
        int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
        int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
        int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));

        a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
    }
    return a;
}
/* As do_cdot_s, but over two (real, imag) int16 pairs packed in N/M. */
static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
                         int sel_a, int sel_b, int sub_i)
{
    for (int i = 0; i <= 1; i++) {
        int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
        int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
        int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
        int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));

        a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
    }
    return a;
}
1448 void HELPER(sve2_cdot_zzzz_s
)(void *vd
, void *vn
, void *vm
,
1449 void *va
, uint32_t desc
)
1451 int opr_sz
= simd_oprsz(desc
);
1452 int rot
= simd_data(desc
);
1453 int sel_a
= rot
& 1;
1454 int sel_b
= sel_a
^ 1;
1455 int sub_i
= (rot
== 0 || rot
== 3 ? -1 : 1);
1456 uint32_t *d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
1458 for (int e
= 0; e
< opr_sz
/ 4; e
++) {
1459 d
[e
] = do_cdot_s(n
[e
], m
[e
], a
[e
], sel_a
, sel_b
, sub_i
);
1463 void HELPER(sve2_cdot_zzzz_d
)(void *vd
, void *vn
, void *vm
,
1464 void *va
, uint32_t desc
)
1466 int opr_sz
= simd_oprsz(desc
);
1467 int rot
= simd_data(desc
);
1468 int sel_a
= rot
& 1;
1469 int sel_b
= sel_a
^ 1;
1470 int sub_i
= (rot
== 0 || rot
== 3 ? -1 : 1);
1471 uint64_t *d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
1473 for (int e
= 0; e
< opr_sz
/ 8; e
++) {
1474 d
[e
] = do_cdot_d(n
[e
], m
[e
], a
[e
], sel_a
, sel_b
, sub_i
);
1478 void HELPER(sve2_cdot_idx_s
)(void *vd
, void *vn
, void *vm
,
1479 void *va
, uint32_t desc
)
1481 int opr_sz
= simd_oprsz(desc
);
1482 int rot
= extract32(desc
, SIMD_DATA_SHIFT
, 2);
1483 int idx
= H4(extract32(desc
, SIMD_DATA_SHIFT
+ 2, 2));
1484 int sel_a
= rot
& 1;
1485 int sel_b
= sel_a
^ 1;
1486 int sub_i
= (rot
== 0 || rot
== 3 ? -1 : 1);
1487 uint32_t *d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
1489 for (int seg
= 0; seg
< opr_sz
/ 4; seg
+= 4) {
1490 uint32_t seg_m
= m
[seg
+ idx
];
1491 for (int e
= 0; e
< 4; e
++) {
1492 d
[seg
+ e
] = do_cdot_s(n
[seg
+ e
], seg_m
, a
[seg
+ e
],
1493 sel_a
, sel_b
, sub_i
);
1498 void HELPER(sve2_cdot_idx_d
)(void *vd
, void *vn
, void *vm
,
1499 void *va
, uint32_t desc
)
1501 int seg
, opr_sz
= simd_oprsz(desc
);
1502 int rot
= extract32(desc
, SIMD_DATA_SHIFT
, 2);
1503 int idx
= extract32(desc
, SIMD_DATA_SHIFT
+ 2, 2);
1504 int sel_a
= rot
& 1;
1505 int sel_b
= sel_a
^ 1;
1506 int sub_i
= (rot
== 0 || rot
== 3 ? -1 : 1);
1507 uint64_t *d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
1509 for (seg
= 0; seg
< opr_sz
/ 8; seg
+= 2) {
1510 uint64_t seg_m
= m
[seg
+ idx
];
1511 for (int e
= 0; e
< 2; e
++) {
1512 d
[seg
+ e
] = do_cdot_d(n
[seg
+ e
], seg_m
, a
[seg
+ e
],
1513 sel_a
, sel_b
, sub_i
);
1518 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1519 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1521 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
1522 intptr_t i, j, idx = simd_data(desc); \
1523 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \
1524 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1526 for (j = 0; j < segment; j++) { \
1527 d[i + j] = OP(n[i + j], mm, a[i + j]); \
1532 #define DO_SQRDMLAH_H(N, M, A) \
1533 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1534 #define DO_SQRDMLAH_S(N, M, A) \
1535 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1536 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1538 DO_ZZXZ(sve2_sqrdmlah_idx_h
, int16_t, H2
, DO_SQRDMLAH_H
)
1539 DO_ZZXZ(sve2_sqrdmlah_idx_s
, int32_t, H4
, DO_SQRDMLAH_S
)
1540 DO_ZZXZ(sve2_sqrdmlah_idx_d
, int64_t, H8
, DO_SQRDMLAH_D
)
1542 #define DO_SQRDMLSH_H(N, M, A) \
1543 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1544 #define DO_SQRDMLSH_S(N, M, A) \
1545 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1546 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1548 DO_ZZXZ(sve2_sqrdmlsh_idx_h
, int16_t, H2
, DO_SQRDMLSH_H
)
1549 DO_ZZXZ(sve2_sqrdmlsh_idx_s
, int32_t, H4
, DO_SQRDMLSH_S
)
1550 DO_ZZXZ(sve2_sqrdmlsh_idx_d
, int64_t, H8
, DO_SQRDMLSH_D
)
1554 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1555 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1557 intptr_t i, j, oprsz = simd_oprsz(desc); \
1558 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1559 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1560 for (i = 0; i < oprsz; i += 16) { \
1561 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1562 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1563 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1564 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \
1565 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \
1570 #define DO_MLA(N, M, A) (A + N * M)
1572 DO_ZZXW(sve2_smlal_idx_s
, int32_t, int16_t, H1_4
, H1_2
, DO_MLA
)
1573 DO_ZZXW(sve2_smlal_idx_d
, int64_t, int32_t, H1_8
, H1_4
, DO_MLA
)
1574 DO_ZZXW(sve2_umlal_idx_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_MLA
)
1575 DO_ZZXW(sve2_umlal_idx_d
, uint64_t, uint32_t, H1_8
, H1_4
, DO_MLA
)
1577 #define DO_MLS(N, M, A) (A - N * M)
1579 DO_ZZXW(sve2_smlsl_idx_s
, int32_t, int16_t, H1_4
, H1_2
, DO_MLS
)
1580 DO_ZZXW(sve2_smlsl_idx_d
, int64_t, int32_t, H1_8
, H1_4
, DO_MLS
)
1581 DO_ZZXW(sve2_umlsl_idx_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_MLS
)
1582 DO_ZZXW(sve2_umlsl_idx_d
, uint64_t, uint32_t, H1_8
, H1_4
, DO_MLS
)
1584 #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M))
1585 #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M))
1587 DO_ZZXW(sve2_sqdmlal_idx_s
, int32_t, int16_t, H1_4
, H1_2
, DO_SQDMLAL_S
)
1588 DO_ZZXW(sve2_sqdmlal_idx_d
, int64_t, int32_t, H1_8
, H1_4
, DO_SQDMLAL_D
)
1590 #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M))
1591 #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M))
1593 DO_ZZXW(sve2_sqdmlsl_idx_s
, int32_t, int16_t, H1_4
, H1_2
, DO_SQDMLSL_S
)
1594 DO_ZZXW(sve2_sqdmlsl_idx_d
, int64_t, int32_t, H1_8
, H1_4
, DO_SQDMLSL_D
)
1600 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1601 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1603 intptr_t i, j, oprsz = simd_oprsz(desc); \
1604 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1605 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1606 for (i = 0; i < oprsz; i += 16) { \
1607 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1608 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1609 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1610 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \
1615 DO_ZZX(sve2_sqdmull_idx_s
, int32_t, int16_t, H1_4
, H1_2
, do_sqdmull_s
)
1616 DO_ZZX(sve2_sqdmull_idx_d
, int64_t, int32_t, H1_8
, H1_4
, do_sqdmull_d
)
1618 DO_ZZX(sve2_smull_idx_s
, int32_t, int16_t, H1_4
, H1_2
, DO_MUL
)
1619 DO_ZZX(sve2_smull_idx_d
, int64_t, int32_t, H1_8
, H1_4
, DO_MUL
)
1621 DO_ZZX(sve2_umull_idx_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_MUL
)
1622 DO_ZZX(sve2_umull_idx_d
, uint64_t, uint32_t, H1_8
, H1_4
, DO_MUL
)
/* Bit-permute expander: apply OP(element, mask, element-width-in-bits). */
#define DO_BITPERM(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                \
        TYPE nn = *(TYPE *)(vn + i);                            \
        TYPE mm = *(TYPE *)(vm + i);                            \
        *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8);       \
    }                                                           \
}
/* BEXT: gather the DATA bits selected by MASK into a contiguous run at
 * the bottom of the result.  N is the element width in bits. */
static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
{
    uint64_t res = 0;
    int db, rb = 0;

    for (db = 0; db < n; ++db) {
        if ((mask >> db) & 1) {
            res |= ((data >> db) & 1) << rb;
            ++rb;
        }
    }
    return res;
}
1651 DO_BITPERM(sve2_bext_b
, uint8_t, bitextract
)
1652 DO_BITPERM(sve2_bext_h
, uint16_t, bitextract
)
1653 DO_BITPERM(sve2_bext_s
, uint32_t, bitextract
)
1654 DO_BITPERM(sve2_bext_d
, uint64_t, bitextract
)
/* BDEP: scatter the low-order bits of DATA into the bit positions
 * selected by MASK.  N is the element width in bits. */
static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
{
    uint64_t res = 0;
    int rb, db = 0;

    for (rb = 0; rb < n; ++rb) {
        if ((mask >> rb) & 1) {
            res |= ((data >> db) & 1) << rb;
            ++db;
        }
    }
    return res;
}
1670 DO_BITPERM(sve2_bdep_b
, uint8_t, bitdeposit
)
1671 DO_BITPERM(sve2_bdep_h
, uint16_t, bitdeposit
)
1672 DO_BITPERM(sve2_bdep_s
, uint32_t, bitdeposit
)
1673 DO_BITPERM(sve2_bdep_d
, uint64_t, bitdeposit
)
/* BGRP: gather the DATA bits selected by MASK to the bottom of the
 * result, and the unselected bits immediately above them. */
static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
{
    uint64_t resm = 0, resu = 0;
    int db, rbm = 0, rbu = 0;

    for (db = 0; db < n; ++db) {
        uint64_t val = (data >> db) & 1;
        if ((mask >> db) & 1) {
            resm |= val << rbm++;
        } else {
            resu |= val << rbu++;
        }
    }

    return resm | (resu << rbm);
}
1692 DO_BITPERM(sve2_bgrp_b
, uint8_t, bitgroup
)
1693 DO_BITPERM(sve2_bgrp_h
, uint16_t, bitgroup
)
1694 DO_BITPERM(sve2_bgrp_s
, uint32_t, bitgroup
)
1695 DO_BITPERM(sve2_bgrp_d
, uint64_t, bitgroup
)
1699 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \
1700 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1702 intptr_t i, opr_sz = simd_oprsz(desc); \
1703 int sub_r = simd_data(desc); \
1705 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1706 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1707 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1708 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1709 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1710 acc_r = ADD_OP(acc_r, el2_i); \
1711 acc_i = SUB_OP(acc_i, el2_r); \
1712 *(TYPE *)(vd + H(i)) = acc_r; \
1713 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1716 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1717 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1718 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1719 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1720 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1721 acc_r = SUB_OP(acc_r, el2_i); \
1722 acc_i = ADD_OP(acc_i, el2_r); \
1723 *(TYPE *)(vd + H(i)) = acc_r; \
1724 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1729 DO_CADD(sve2_cadd_b
, int8_t, H1
, DO_ADD
, DO_SUB
)
1730 DO_CADD(sve2_cadd_h
, int16_t, H1_2
, DO_ADD
, DO_SUB
)
1731 DO_CADD(sve2_cadd_s
, int32_t, H1_4
, DO_ADD
, DO_SUB
)
1732 DO_CADD(sve2_cadd_d
, int64_t, H1_8
, DO_ADD
, DO_SUB
)
1734 DO_CADD(sve2_sqcadd_b
, int8_t, H1
, DO_SQADD_B
, DO_SQSUB_B
)
1735 DO_CADD(sve2_sqcadd_h
, int16_t, H1_2
, DO_SQADD_H
, DO_SQSUB_H
)
1736 DO_CADD(sve2_sqcadd_s
, int32_t, H1_4
, DO_SQADD_S
, DO_SQSUB_S
)
1737 DO_CADD(sve2_sqcadd_d
, int64_t, H1_8
, do_sqadd_d
, do_sqsub_d
)
1741 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1742 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1744 intptr_t i, opr_sz = simd_oprsz(desc); \
1745 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
1746 int shift = simd_data(desc) >> 1; \
1747 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1748 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
1749 *(TYPEW *)(vd + HW(i)) = nn << shift; \
1753 DO_ZZI_SHLL(sve2_sshll_h
, int16_t, int8_t, H1_2
, H1
)
1754 DO_ZZI_SHLL(sve2_sshll_s
, int32_t, int16_t, H1_4
, H1_2
)
1755 DO_ZZI_SHLL(sve2_sshll_d
, int64_t, int32_t, H1_8
, H1_4
)
1757 DO_ZZI_SHLL(sve2_ushll_h
, uint16_t, uint8_t, H1_2
, H1
)
1758 DO_ZZI_SHLL(sve2_ushll_s
, uint32_t, uint16_t, H1_4
, H1_2
)
1759 DO_ZZI_SHLL(sve2_ushll_d
, uint64_t, uint32_t, H1_8
, H1_4
)
1763 /* Two-operand reduction expander, controlled by a predicate.
1764 * The difference between TYPERED and TYPERET has to do with
1765 * sign-extension. E.g. for SMAX, TYPERED must be signed,
1766 * but TYPERET must be unsigned so that e.g. a 32-bit value
1767 * is not sign-extended to the ABI uint64_t return type.
1769 /* ??? If we were to vectorize this by hand the reduction ordering
1770 * would change. For integer operands, this is perfectly fine.
1772 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1773 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1775 intptr_t i, opr_sz = simd_oprsz(desc); \
1776 TYPERED ret = INIT; \
1777 for (i = 0; i < opr_sz; ) { \
1778 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1781 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
1782 ret = OP(ret, nn); \
1784 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
1787 return (TYPERET)ret; \
1790 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
1791 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1793 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1797 for (i = 0; i < opr_sz; i += 1) { \
1798 if (pg[H1(i)] & 1) { \
1800 ret = OP(ret, nn); \
1806 DO_VPZ(sve_orv_b
, uint8_t, uint8_t, uint8_t, H1
, 0, DO_ORR
)
1807 DO_VPZ(sve_orv_h
, uint16_t, uint16_t, uint16_t, H1_2
, 0, DO_ORR
)
1808 DO_VPZ(sve_orv_s
, uint32_t, uint32_t, uint32_t, H1_4
, 0, DO_ORR
)
1809 DO_VPZ_D(sve_orv_d
, uint64_t, uint64_t, 0, DO_ORR
)
1811 DO_VPZ(sve_eorv_b
, uint8_t, uint8_t, uint8_t, H1
, 0, DO_EOR
)
1812 DO_VPZ(sve_eorv_h
, uint16_t, uint16_t, uint16_t, H1_2
, 0, DO_EOR
)
1813 DO_VPZ(sve_eorv_s
, uint32_t, uint32_t, uint32_t, H1_4
, 0, DO_EOR
)
1814 DO_VPZ_D(sve_eorv_d
, uint64_t, uint64_t, 0, DO_EOR
)
1816 DO_VPZ(sve_andv_b
, uint8_t, uint8_t, uint8_t, H1
, -1, DO_AND
)
1817 DO_VPZ(sve_andv_h
, uint16_t, uint16_t, uint16_t, H1_2
, -1, DO_AND
)
1818 DO_VPZ(sve_andv_s
, uint32_t, uint32_t, uint32_t, H1_4
, -1, DO_AND
)
1819 DO_VPZ_D(sve_andv_d
, uint64_t, uint64_t, -1, DO_AND
)
1821 DO_VPZ(sve_saddv_b
, int8_t, uint64_t, uint64_t, H1
, 0, DO_ADD
)
1822 DO_VPZ(sve_saddv_h
, int16_t, uint64_t, uint64_t, H1_2
, 0, DO_ADD
)
1823 DO_VPZ(sve_saddv_s
, int32_t, uint64_t, uint64_t, H1_4
, 0, DO_ADD
)
1825 DO_VPZ(sve_uaddv_b
, uint8_t, uint64_t, uint64_t, H1
, 0, DO_ADD
)
1826 DO_VPZ(sve_uaddv_h
, uint16_t, uint64_t, uint64_t, H1_2
, 0, DO_ADD
)
1827 DO_VPZ(sve_uaddv_s
, uint32_t, uint64_t, uint64_t, H1_4
, 0, DO_ADD
)
1828 DO_VPZ_D(sve_uaddv_d
, uint64_t, uint64_t, 0, DO_ADD
)
1830 DO_VPZ(sve_smaxv_b
, int8_t, int8_t, uint8_t, H1
, INT8_MIN
, DO_MAX
)
1831 DO_VPZ(sve_smaxv_h
, int16_t, int16_t, uint16_t, H1_2
, INT16_MIN
, DO_MAX
)
1832 DO_VPZ(sve_smaxv_s
, int32_t, int32_t, uint32_t, H1_4
, INT32_MIN
, DO_MAX
)
1833 DO_VPZ_D(sve_smaxv_d
, int64_t, int64_t, INT64_MIN
, DO_MAX
)
1835 DO_VPZ(sve_umaxv_b
, uint8_t, uint8_t, uint8_t, H1
, 0, DO_MAX
)
1836 DO_VPZ(sve_umaxv_h
, uint16_t, uint16_t, uint16_t, H1_2
, 0, DO_MAX
)
1837 DO_VPZ(sve_umaxv_s
, uint32_t, uint32_t, uint32_t, H1_4
, 0, DO_MAX
)
1838 DO_VPZ_D(sve_umaxv_d
, uint64_t, uint64_t, 0, DO_MAX
)
1840 DO_VPZ(sve_sminv_b
, int8_t, int8_t, uint8_t, H1
, INT8_MAX
, DO_MIN
)
1841 DO_VPZ(sve_sminv_h
, int16_t, int16_t, uint16_t, H1_2
, INT16_MAX
, DO_MIN
)
1842 DO_VPZ(sve_sminv_s
, int32_t, int32_t, uint32_t, H1_4
, INT32_MAX
, DO_MIN
)
1843 DO_VPZ_D(sve_sminv_d
, int64_t, int64_t, INT64_MAX
, DO_MIN
)
1845 DO_VPZ(sve_uminv_b
, uint8_t, uint8_t, uint8_t, H1
, -1, DO_MIN
)
1846 DO_VPZ(sve_uminv_h
, uint16_t, uint16_t, uint16_t, H1_2
, -1, DO_MIN
)
1847 DO_VPZ(sve_uminv_s
, uint32_t, uint32_t, uint32_t, H1_4
, -1, DO_MIN
)
1848 DO_VPZ_D(sve_uminv_d
, uint64_t, uint64_t, -1, DO_MIN
)
1853 /* Two vector operand, one scalar operand, unpredicated. */
1854 #define DO_ZZI(NAME, TYPE, OP) \
1855 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
1857 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1858 TYPE s = s64, *d = vd, *n = vn; \
1859 for (i = 0; i < opr_sz; ++i) { \
1860 d[i] = OP(n[i], s); \
1864 #define DO_SUBR(X, Y) (Y - X)
1866 DO_ZZI(sve_subri_b
, uint8_t, DO_SUBR
)
1867 DO_ZZI(sve_subri_h
, uint16_t, DO_SUBR
)
1868 DO_ZZI(sve_subri_s
, uint32_t, DO_SUBR
)
1869 DO_ZZI(sve_subri_d
, uint64_t, DO_SUBR
)
1871 DO_ZZI(sve_smaxi_b
, int8_t, DO_MAX
)
1872 DO_ZZI(sve_smaxi_h
, int16_t, DO_MAX
)
1873 DO_ZZI(sve_smaxi_s
, int32_t, DO_MAX
)
1874 DO_ZZI(sve_smaxi_d
, int64_t, DO_MAX
)
1876 DO_ZZI(sve_smini_b
, int8_t, DO_MIN
)
1877 DO_ZZI(sve_smini_h
, int16_t, DO_MIN
)
1878 DO_ZZI(sve_smini_s
, int32_t, DO_MIN
)
1879 DO_ZZI(sve_smini_d
, int64_t, DO_MIN
)
1881 DO_ZZI(sve_umaxi_b
, uint8_t, DO_MAX
)
1882 DO_ZZI(sve_umaxi_h
, uint16_t, DO_MAX
)
1883 DO_ZZI(sve_umaxi_s
, uint32_t, DO_MAX
)
1884 DO_ZZI(sve_umaxi_d
, uint64_t, DO_MAX
)
1886 DO_ZZI(sve_umini_b
, uint8_t, DO_MIN
)
1887 DO_ZZI(sve_umini_h
, uint16_t, DO_MIN
)
1888 DO_ZZI(sve_umini_s
, uint32_t, DO_MIN
)
1889 DO_ZZI(sve_umini_d
, uint64_t, DO_MIN
)
1909 /* Similar to the ARM LastActiveElement pseudocode function, except the
1910 result is multiplied by the element size. This includes the not found
1911 indication; e.g. not found for esz=3 is -8. */
1912 static intptr_t last_active_element(uint64_t *g
, intptr_t words
, intptr_t esz
)
1914 uint64_t mask
= pred_esz_masks
[esz
];
1918 uint64_t this_g
= g
[--i
] & mask
;
1920 return i
* 64 + (63 - clz64(this_g
));
1923 return (intptr_t)-1 << esz
;
1926 uint32_t HELPER(sve_pfirst
)(void *vd
, void *vg
, uint32_t pred_desc
)
1928 intptr_t words
= DIV_ROUND_UP(FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
), 8);
1929 uint32_t flags
= PREDTEST_INIT
;
1930 uint64_t *d
= vd
, *g
= vg
;
1934 uint64_t this_d
= d
[i
];
1935 uint64_t this_g
= g
[i
];
1939 /* Set in D the first bit of G. */
1940 this_d
|= this_g
& -this_g
;
1943 flags
= iter_predtest_fwd(this_d
, this_g
, flags
);
1945 } while (++i
< words
);
1950 uint32_t HELPER(sve_pnext
)(void *vd
, void *vg
, uint32_t pred_desc
)
1952 intptr_t words
= DIV_ROUND_UP(FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
), 8);
1953 intptr_t esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
1954 uint32_t flags
= PREDTEST_INIT
;
1955 uint64_t *d
= vd
, *g
= vg
, esz_mask
;
1958 next
= last_active_element(vd
, words
, esz
) + (1 << esz
);
1959 esz_mask
= pred_esz_masks
[esz
];
1961 /* Similar to the pseudocode for pnext, but scaled by ESZ
1962 so that we find the correct bit. */
1963 if (next
< words
* 64) {
1967 mask
= ~((1ull << (next
& 63)) - 1);
1971 uint64_t this_g
= g
[next
/ 64] & esz_mask
& mask
;
1973 next
= (next
& -64) + ctz64(this_g
);
1978 } while (next
< words
* 64);
1983 uint64_t this_d
= 0;
1984 if (i
== next
/ 64) {
1985 this_d
= 1ull << (next
& 63);
1988 flags
= iter_predtest_fwd(this_d
, g
[i
] & esz_mask
, flags
);
1989 } while (++i
< words
);
1995 * Copy Zn into Zd, and store zero into inactive elements.
1996 * If inv, store zeros into the active elements.
1998 void HELPER(sve_movz_b
)(void *vd
, void *vn
, void *vg
, uint32_t desc
)
2000 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2001 uint64_t inv
= -(uint64_t)(simd_data(desc
) & 1);
2002 uint64_t *d
= vd
, *n
= vn
;
2005 for (i
= 0; i
< opr_sz
; i
+= 1) {
2006 d
[i
] = n
[i
] & (expand_pred_b(pg
[H1(i
)]) ^ inv
);
2010 void HELPER(sve_movz_h
)(void *vd
, void *vn
, void *vg
, uint32_t desc
)
2012 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2013 uint64_t inv
= -(uint64_t)(simd_data(desc
) & 1);
2014 uint64_t *d
= vd
, *n
= vn
;
2017 for (i
= 0; i
< opr_sz
; i
+= 1) {
2018 d
[i
] = n
[i
] & (expand_pred_h(pg
[H1(i
)]) ^ inv
);
2022 void HELPER(sve_movz_s
)(void *vd
, void *vn
, void *vg
, uint32_t desc
)
2024 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2025 uint64_t inv
= -(uint64_t)(simd_data(desc
) & 1);
2026 uint64_t *d
= vd
, *n
= vn
;
2029 for (i
= 0; i
< opr_sz
; i
+= 1) {
2030 d
[i
] = n
[i
] & (expand_pred_s(pg
[H1(i
)]) ^ inv
);
2034 void HELPER(sve_movz_d
)(void *vd
, void *vn
, void *vg
, uint32_t desc
)
2036 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2037 uint64_t *d
= vd
, *n
= vn
;
2039 uint8_t inv
= simd_data(desc
);
2041 for (i
= 0; i
< opr_sz
; i
+= 1) {
2042 d
[i
] = n
[i
] & -(uint64_t)((pg
[H1(i
)] ^ inv
) & 1);
/* Three-operand expander, immediate operand, controlled by a predicate.
 */
#define DO_ZPZI(NAME, TYPE, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE imm = simd_data(desc);                                 \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZI_D(NAME, TYPE, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn;                                      \
    TYPE imm = simd_data(desc);                                 \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i];                                     \
            d[i] = OP(nn, imm);                                 \
        }                                                       \
    }                                                           \
}

#define DO_SHR(N, M)  (N >> M)
#define DO_SHL(N, M)  (N << M)

/* Arithmetic shift right for division.  This rounds negative numbers
   toward zero as per signed division.  Therefore before shifting,
   when N is negative, add 2**M-1.  */
#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
/* Unsigned rounding shift right: shift and add the last bit shifted out. */
static inline uint64_t do_urshr(uint64_t x, unsigned sh)
{
    if (likely(sh < 64)) {
        return (x >> sh) + ((x >> (sh - 1)) & 1);
    } else if (sh == 64) {
        /* Only the rounding carry from bit 63 survives. */
        return x >> 63;
    } else {
        return 0;
    }
}
/* Signed rounding shift right: shift and add the last bit shifted out. */
static inline int64_t do_srshr(int64_t x, unsigned sh)
{
    if (likely(sh < 64)) {
        return (x >> sh) + ((x >> (sh - 1)) & 1);
    } else {
        /* Rounding the sign bit always produces 0. */
        return 0;
    }
}
2110 DO_ZPZI(sve_asr_zpzi_b
, int8_t, H1
, DO_SHR
)
2111 DO_ZPZI(sve_asr_zpzi_h
, int16_t, H1_2
, DO_SHR
)
2112 DO_ZPZI(sve_asr_zpzi_s
, int32_t, H1_4
, DO_SHR
)
2113 DO_ZPZI_D(sve_asr_zpzi_d
, int64_t, DO_SHR
)
2115 DO_ZPZI(sve_lsr_zpzi_b
, uint8_t, H1
, DO_SHR
)
2116 DO_ZPZI(sve_lsr_zpzi_h
, uint16_t, H1_2
, DO_SHR
)
2117 DO_ZPZI(sve_lsr_zpzi_s
, uint32_t, H1_4
, DO_SHR
)
2118 DO_ZPZI_D(sve_lsr_zpzi_d
, uint64_t, DO_SHR
)
2120 DO_ZPZI(sve_lsl_zpzi_b
, uint8_t, H1
, DO_SHL
)
2121 DO_ZPZI(sve_lsl_zpzi_h
, uint16_t, H1_2
, DO_SHL
)
2122 DO_ZPZI(sve_lsl_zpzi_s
, uint32_t, H1_4
, DO_SHL
)
2123 DO_ZPZI_D(sve_lsl_zpzi_d
, uint64_t, DO_SHL
)
2125 DO_ZPZI(sve_asrd_b
, int8_t, H1
, DO_ASRD
)
2126 DO_ZPZI(sve_asrd_h
, int16_t, H1_2
, DO_ASRD
)
2127 DO_ZPZI(sve_asrd_s
, int32_t, H1_4
, DO_ASRD
)
2128 DO_ZPZI_D(sve_asrd_d
, int64_t, DO_ASRD
)
2130 /* SVE2 bitwise shift by immediate */
2131 DO_ZPZI(sve2_sqshl_zpzi_b
, int8_t, H1
, do_sqshl_b
)
2132 DO_ZPZI(sve2_sqshl_zpzi_h
, int16_t, H1_2
, do_sqshl_h
)
2133 DO_ZPZI(sve2_sqshl_zpzi_s
, int32_t, H1_4
, do_sqshl_s
)
2134 DO_ZPZI_D(sve2_sqshl_zpzi_d
, int64_t, do_sqshl_d
)
2136 DO_ZPZI(sve2_uqshl_zpzi_b
, uint8_t, H1
, do_uqshl_b
)
2137 DO_ZPZI(sve2_uqshl_zpzi_h
, uint16_t, H1_2
, do_uqshl_h
)
2138 DO_ZPZI(sve2_uqshl_zpzi_s
, uint32_t, H1_4
, do_uqshl_s
)
2139 DO_ZPZI_D(sve2_uqshl_zpzi_d
, uint64_t, do_uqshl_d
)
2141 DO_ZPZI(sve2_srshr_b
, int8_t, H1
, do_srshr
)
2142 DO_ZPZI(sve2_srshr_h
, int16_t, H1_2
, do_srshr
)
2143 DO_ZPZI(sve2_srshr_s
, int32_t, H1_4
, do_srshr
)
2144 DO_ZPZI_D(sve2_srshr_d
, int64_t, do_srshr
)
2146 DO_ZPZI(sve2_urshr_b
, uint8_t, H1
, do_urshr
)
2147 DO_ZPZI(sve2_urshr_h
, uint16_t, H1_2
, do_urshr
)
2148 DO_ZPZI(sve2_urshr_s
, uint32_t, H1_4
, do_urshr
)
2149 DO_ZPZI_D(sve2_urshr_d
, uint64_t, do_urshr
)
2151 #define do_suqrshl_b(n, m) \
2152 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2153 #define do_suqrshl_h(n, m) \
2154 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2155 #define do_suqrshl_s(n, m) \
2156 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2157 #define do_suqrshl_d(n, m) \
2158 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2160 DO_ZPZI(sve2_sqshlu_b
, int8_t, H1
, do_suqrshl_b
)
2161 DO_ZPZI(sve2_sqshlu_h
, int16_t, H1_2
, do_suqrshl_h
)
2162 DO_ZPZI(sve2_sqshlu_s
, int32_t, H1_4
, do_suqrshl_s
)
2163 DO_ZPZI_D(sve2_sqshlu_d
, int64_t, do_suqrshl_d
)
/* Narrowing shift to the bottom half of each wide element; the top half
 * of the destination element is zeroed by the (TYPEN) truncation.
 */
#define DO_SHRNB(NAME, TYPEW, TYPEN, OP)                     \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
{                                                            \
    intptr_t i, opr_sz = simd_oprsz(desc);                   \
    int shift = simd_data(desc);                             \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {            \
        TYPEW nn = *(TYPEW *)(vn + i);                       \
        *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift);           \
    }                                                        \
}

/* Narrowing shift to the top half of each wide element; the bottom half
 * of the destination is left unchanged.
 */
#define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
{                                                                 \
    intptr_t i, opr_sz = simd_oprsz(desc);                        \
    int shift = simd_data(desc);                                  \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                        \
        *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift);   \
    }                                                             \
}
2191 DO_SHRNB(sve2_shrnb_h
, uint16_t, uint8_t, DO_SHR
)
2192 DO_SHRNB(sve2_shrnb_s
, uint32_t, uint16_t, DO_SHR
)
2193 DO_SHRNB(sve2_shrnb_d
, uint64_t, uint32_t, DO_SHR
)
2195 DO_SHRNT(sve2_shrnt_h
, uint16_t, uint8_t, H1_2
, H1
, DO_SHR
)
2196 DO_SHRNT(sve2_shrnt_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_SHR
)
2197 DO_SHRNT(sve2_shrnt_d
, uint64_t, uint32_t, H1_8
, H1_4
, DO_SHR
)
2199 DO_SHRNB(sve2_rshrnb_h
, uint16_t, uint8_t, do_urshr
)
2200 DO_SHRNB(sve2_rshrnb_s
, uint32_t, uint16_t, do_urshr
)
2201 DO_SHRNB(sve2_rshrnb_d
, uint64_t, uint32_t, do_urshr
)
2203 DO_SHRNT(sve2_rshrnt_h
, uint16_t, uint8_t, H1_2
, H1
, do_urshr
)
2204 DO_SHRNT(sve2_rshrnt_s
, uint32_t, uint16_t, H1_4
, H1_2
, do_urshr
)
2205 DO_SHRNT(sve2_rshrnt_d
, uint64_t, uint32_t, H1_8
, H1_4
, do_urshr
)
2207 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2208 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2209 #define DO_SQSHRUN_D(x, sh) \
2210 do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2212 DO_SHRNB(sve2_sqshrunb_h
, int16_t, uint8_t, DO_SQSHRUN_H
)
2213 DO_SHRNB(sve2_sqshrunb_s
, int32_t, uint16_t, DO_SQSHRUN_S
)
2214 DO_SHRNB(sve2_sqshrunb_d
, int64_t, uint32_t, DO_SQSHRUN_D
)
2216 DO_SHRNT(sve2_sqshrunt_h
, int16_t, uint8_t, H1_2
, H1
, DO_SQSHRUN_H
)
2217 DO_SHRNT(sve2_sqshrunt_s
, int32_t, uint16_t, H1_4
, H1_2
, DO_SQSHRUN_S
)
2218 DO_SHRNT(sve2_sqshrunt_d
, int64_t, uint32_t, H1_8
, H1_4
, DO_SQSHRUN_D
)
2220 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2221 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2222 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2224 DO_SHRNB(sve2_sqrshrunb_h
, int16_t, uint8_t, DO_SQRSHRUN_H
)
2225 DO_SHRNB(sve2_sqrshrunb_s
, int32_t, uint16_t, DO_SQRSHRUN_S
)
2226 DO_SHRNB(sve2_sqrshrunb_d
, int64_t, uint32_t, DO_SQRSHRUN_D
)
2228 DO_SHRNT(sve2_sqrshrunt_h
, int16_t, uint8_t, H1_2
, H1
, DO_SQRSHRUN_H
)
2229 DO_SHRNT(sve2_sqrshrunt_s
, int32_t, uint16_t, H1_4
, H1_2
, DO_SQRSHRUN_S
)
2230 DO_SHRNT(sve2_sqrshrunt_d
, int64_t, uint32_t, H1_8
, H1_4
, DO_SQRSHRUN_D
)
2232 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2233 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2234 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2236 DO_SHRNB(sve2_sqshrnb_h
, int16_t, uint8_t, DO_SQSHRN_H
)
2237 DO_SHRNB(sve2_sqshrnb_s
, int32_t, uint16_t, DO_SQSHRN_S
)
2238 DO_SHRNB(sve2_sqshrnb_d
, int64_t, uint32_t, DO_SQSHRN_D
)
2240 DO_SHRNT(sve2_sqshrnt_h
, int16_t, uint8_t, H1_2
, H1
, DO_SQSHRN_H
)
2241 DO_SHRNT(sve2_sqshrnt_s
, int32_t, uint16_t, H1_4
, H1_2
, DO_SQSHRN_S
)
2242 DO_SHRNT(sve2_sqshrnt_d
, int64_t, uint32_t, H1_8
, H1_4
, DO_SQSHRN_D
)
2244 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2245 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2246 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2248 DO_SHRNB(sve2_sqrshrnb_h
, int16_t, uint8_t, DO_SQRSHRN_H
)
2249 DO_SHRNB(sve2_sqrshrnb_s
, int32_t, uint16_t, DO_SQRSHRN_S
)
2250 DO_SHRNB(sve2_sqrshrnb_d
, int64_t, uint32_t, DO_SQRSHRN_D
)
2252 DO_SHRNT(sve2_sqrshrnt_h
, int16_t, uint8_t, H1_2
, H1
, DO_SQRSHRN_H
)
2253 DO_SHRNT(sve2_sqrshrnt_s
, int32_t, uint16_t, H1_4
, H1_2
, DO_SQRSHRN_S
)
2254 DO_SHRNT(sve2_sqrshrnt_d
, int64_t, uint32_t, H1_8
, H1_4
, DO_SQRSHRN_D
)
2256 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2257 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2258 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2260 DO_SHRNB(sve2_uqshrnb_h
, uint16_t, uint8_t, DO_UQSHRN_H
)
2261 DO_SHRNB(sve2_uqshrnb_s
, uint32_t, uint16_t, DO_UQSHRN_S
)
2262 DO_SHRNB(sve2_uqshrnb_d
, uint64_t, uint32_t, DO_UQSHRN_D
)
2264 DO_SHRNT(sve2_uqshrnt_h
, uint16_t, uint8_t, H1_2
, H1
, DO_UQSHRN_H
)
2265 DO_SHRNT(sve2_uqshrnt_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_UQSHRN_S
)
2266 DO_SHRNT(sve2_uqshrnt_d
, uint64_t, uint32_t, H1_8
, H1_4
, DO_UQSHRN_D
)
2268 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2269 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2270 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2272 DO_SHRNB(sve2_uqrshrnb_h
, uint16_t, uint8_t, DO_UQRSHRN_H
)
2273 DO_SHRNB(sve2_uqrshrnb_s
, uint32_t, uint16_t, DO_UQRSHRN_S
)
2274 DO_SHRNB(sve2_uqrshrnb_d
, uint64_t, uint32_t, DO_UQRSHRN_D
)
2276 DO_SHRNT(sve2_uqrshrnt_h
, uint16_t, uint8_t, H1_2
, H1
, DO_UQRSHRN_H
)
2277 DO_SHRNT(sve2_uqrshrnt_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_UQRSHRN_S
)
2278 DO_SHRNT(sve2_uqrshrnt_d
, uint64_t, uint32_t, H1_8
, H1_4
, DO_UQRSHRN_D
)
/* Binary narrowing high-half expanders: combine two wide elements, shift
 * down by SHIFT, and write the narrow result to the bottom (NB) or top
 * (NT) half of each destination element.
 */
#define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)                    \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
{                                                                    \
    intptr_t i, opr_sz = simd_oprsz(desc);                           \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                    \
        TYPEW nn = *(TYPEW *)(vn + i);                               \
        TYPEW mm = *(TYPEW *)(vm + i);                               \
        *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT);               \
    }                                                                \
}

#define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)            \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
{                                                                    \
    intptr_t i, opr_sz = simd_oprsz(desc);                           \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                    \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                           \
        TYPEW mm = *(TYPEW *)(vm + HW(i));                           \
        *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT);  \
    }                                                                \
}

#define DO_ADDHN(N, M, SH)  ((N + M) >> SH)
#define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
#define DO_SUBHN(N, M, SH)  ((N - M) >> SH)
#define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
2310 DO_BINOPNB(sve2_addhnb_h
, uint16_t, uint8_t, 8, DO_ADDHN
)
2311 DO_BINOPNB(sve2_addhnb_s
, uint32_t, uint16_t, 16, DO_ADDHN
)
2312 DO_BINOPNB(sve2_addhnb_d
, uint64_t, uint32_t, 32, DO_ADDHN
)
2314 DO_BINOPNT(sve2_addhnt_h
, uint16_t, uint8_t, 8, H1_2
, H1
, DO_ADDHN
)
2315 DO_BINOPNT(sve2_addhnt_s
, uint32_t, uint16_t, 16, H1_4
, H1_2
, DO_ADDHN
)
2316 DO_BINOPNT(sve2_addhnt_d
, uint64_t, uint32_t, 32, H1_8
, H1_4
, DO_ADDHN
)
2318 DO_BINOPNB(sve2_raddhnb_h
, uint16_t, uint8_t, 8, DO_RADDHN
)
2319 DO_BINOPNB(sve2_raddhnb_s
, uint32_t, uint16_t, 16, DO_RADDHN
)
2320 DO_BINOPNB(sve2_raddhnb_d
, uint64_t, uint32_t, 32, DO_RADDHN
)
2322 DO_BINOPNT(sve2_raddhnt_h
, uint16_t, uint8_t, 8, H1_2
, H1
, DO_RADDHN
)
2323 DO_BINOPNT(sve2_raddhnt_s
, uint32_t, uint16_t, 16, H1_4
, H1_2
, DO_RADDHN
)
2324 DO_BINOPNT(sve2_raddhnt_d
, uint64_t, uint32_t, 32, H1_8
, H1_4
, DO_RADDHN
)
2326 DO_BINOPNB(sve2_subhnb_h
, uint16_t, uint8_t, 8, DO_SUBHN
)
2327 DO_BINOPNB(sve2_subhnb_s
, uint32_t, uint16_t, 16, DO_SUBHN
)
2328 DO_BINOPNB(sve2_subhnb_d
, uint64_t, uint32_t, 32, DO_SUBHN
)
2330 DO_BINOPNT(sve2_subhnt_h
, uint16_t, uint8_t, 8, H1_2
, H1
, DO_SUBHN
)
2331 DO_BINOPNT(sve2_subhnt_s
, uint32_t, uint16_t, 16, H1_4
, H1_2
, DO_SUBHN
)
2332 DO_BINOPNT(sve2_subhnt_d
, uint64_t, uint32_t, 32, H1_8
, H1_4
, DO_SUBHN
)
2334 DO_BINOPNB(sve2_rsubhnb_h
, uint16_t, uint8_t, 8, DO_RSUBHN
)
2335 DO_BINOPNB(sve2_rsubhnb_s
, uint32_t, uint16_t, 16, DO_RSUBHN
)
2336 DO_BINOPNB(sve2_rsubhnb_d
, uint64_t, uint32_t, 32, DO_RSUBHN
)
2338 DO_BINOPNT(sve2_rsubhnt_h
, uint16_t, uint8_t, 8, H1_2
, H1
, DO_RSUBHN
)
2339 DO_BINOPNT(sve2_rsubhnt_s
, uint32_t, uint16_t, 16, H1_4
, H1_2
, DO_RSUBHN
)
2340 DO_BINOPNT(sve2_rsubhnt_d
, uint64_t, uint32_t, 32, H1_8
, H1_4
, DO_RSUBHN
)
2349 /* Fully general four-operand expander, controlled by a predicate.
2351 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
2352 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2353 void *vg, uint32_t desc) \
2355 intptr_t i, opr_sz = simd_oprsz(desc); \
2356 for (i = 0; i < opr_sz; ) { \
2357 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2360 TYPE nn = *(TYPE *)(vn + H(i)); \
2361 TYPE mm = *(TYPE *)(vm + H(i)); \
2362 TYPE aa = *(TYPE *)(va + H(i)); \
2363 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
2365 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2370 /* Similarly, specialized for 64-bit operands. */
2371 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
2372 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2373 void *vg, uint32_t desc) \
2375 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2376 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
2378 for (i = 0; i < opr_sz; i += 1) { \
2379 if (pg[H1(i)] & 1) { \
2380 TYPE aa = a[i], nn = n[i], mm = m[i]; \
2381 d[i] = OP(aa, nn, mm); \
2386 #define DO_MLA(A, N, M) (A + N * M)
2387 #define DO_MLS(A, N, M) (A - N * M)
2389 DO_ZPZZZ(sve_mla_b
, uint8_t, H1
, DO_MLA
)
2390 DO_ZPZZZ(sve_mls_b
, uint8_t, H1
, DO_MLS
)
2392 DO_ZPZZZ(sve_mla_h
, uint16_t, H1_2
, DO_MLA
)
2393 DO_ZPZZZ(sve_mls_h
, uint16_t, H1_2
, DO_MLS
)
2395 DO_ZPZZZ(sve_mla_s
, uint32_t, H1_4
, DO_MLA
)
2396 DO_ZPZZZ(sve_mls_s
, uint32_t, H1_4
, DO_MLS
)
2398 DO_ZPZZZ_D(sve_mla_d
, uint64_t, DO_MLA
)
2399 DO_ZPZZZ_D(sve_mls_d
, uint64_t, DO_MLS
)
2406 void HELPER(sve_index_b
)(void *vd
, uint32_t start
,
2407 uint32_t incr
, uint32_t desc
)
2409 intptr_t i
, opr_sz
= simd_oprsz(desc
);
2411 for (i
= 0; i
< opr_sz
; i
+= 1) {
2412 d
[H1(i
)] = start
+ i
* incr
;
2416 void HELPER(sve_index_h
)(void *vd
, uint32_t start
,
2417 uint32_t incr
, uint32_t desc
)
2419 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 2;
2421 for (i
= 0; i
< opr_sz
; i
+= 1) {
2422 d
[H2(i
)] = start
+ i
* incr
;
2426 void HELPER(sve_index_s
)(void *vd
, uint32_t start
,
2427 uint32_t incr
, uint32_t desc
)
2429 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 4;
2431 for (i
= 0; i
< opr_sz
; i
+= 1) {
2432 d
[H4(i
)] = start
+ i
* incr
;
2436 void HELPER(sve_index_d
)(void *vd
, uint64_t start
,
2437 uint64_t incr
, uint32_t desc
)
2439 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2441 for (i
= 0; i
< opr_sz
; i
+= 1) {
2442 d
[i
] = start
+ i
* incr
;
2446 void HELPER(sve_adr_p32
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2448 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 4;
2449 uint32_t sh
= simd_data(desc
);
2450 uint32_t *d
= vd
, *n
= vn
, *m
= vm
;
2451 for (i
= 0; i
< opr_sz
; i
+= 1) {
2452 d
[i
] = n
[i
] + (m
[i
] << sh
);
2456 void HELPER(sve_adr_p64
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2458 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2459 uint64_t sh
= simd_data(desc
);
2460 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
2461 for (i
= 0; i
< opr_sz
; i
+= 1) {
2462 d
[i
] = n
[i
] + (m
[i
] << sh
);
2466 void HELPER(sve_adr_s32
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2468 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2469 uint64_t sh
= simd_data(desc
);
2470 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
2471 for (i
= 0; i
< opr_sz
; i
+= 1) {
2472 d
[i
] = n
[i
] + ((uint64_t)(int32_t)m
[i
] << sh
);
2476 void HELPER(sve_adr_u32
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2478 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2479 uint64_t sh
= simd_data(desc
);
2480 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
2481 for (i
= 0; i
< opr_sz
; i
+= 1) {
2482 d
[i
] = n
[i
] + ((uint64_t)(uint32_t)m
[i
] << sh
);
2486 void HELPER(sve_fexpa_h
)(void *vd
, void *vn
, uint32_t desc
)
2488 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2489 static const uint16_t coeff
[] = {
2490 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2491 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2492 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2493 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2495 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 2;
2496 uint16_t *d
= vd
, *n
= vn
;
2498 for (i
= 0; i
< opr_sz
; i
++) {
2500 intptr_t idx
= extract32(nn
, 0, 5);
2501 uint16_t exp
= extract32(nn
, 5, 5);
2502 d
[i
] = coeff
[idx
] | (exp
<< 10);
2506 void HELPER(sve_fexpa_s
)(void *vd
, void *vn
, uint32_t desc
)
2508 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2509 static const uint32_t coeff
[] = {
2510 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2511 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2512 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2513 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2514 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2515 0x1ef532, 0x20b051, 0x227043, 0x243516,
2516 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2517 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2518 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2519 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2520 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2521 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2522 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2523 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2524 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2525 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2527 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 4;
2528 uint32_t *d
= vd
, *n
= vn
;
2530 for (i
= 0; i
< opr_sz
; i
++) {
2532 intptr_t idx
= extract32(nn
, 0, 6);
2533 uint32_t exp
= extract32(nn
, 6, 8);
2534 d
[i
] = coeff
[idx
] | (exp
<< 23);
2538 void HELPER(sve_fexpa_d
)(void *vd
, void *vn
, uint32_t desc
)
2540 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2541 static const uint64_t coeff
[] = {
2542 0x0000000000000ull
, 0x02C9A3E778061ull
, 0x059B0D3158574ull
,
2543 0x0874518759BC8ull
, 0x0B5586CF9890Full
, 0x0E3EC32D3D1A2ull
,
2544 0x11301D0125B51ull
, 0x1429AAEA92DE0ull
, 0x172B83C7D517Bull
,
2545 0x1A35BEB6FCB75ull
, 0x1D4873168B9AAull
, 0x2063B88628CD6ull
,
2546 0x2387A6E756238ull
, 0x26B4565E27CDDull
, 0x29E9DF51FDEE1ull
,
2547 0x2D285A6E4030Bull
, 0x306FE0A31B715ull
, 0x33C08B26416FFull
,
2548 0x371A7373AA9CBull
, 0x3A7DB34E59FF7ull
, 0x3DEA64C123422ull
,
2549 0x4160A21F72E2Aull
, 0x44E086061892Dull
, 0x486A2B5C13CD0ull
,
2550 0x4BFDAD5362A27ull
, 0x4F9B2769D2CA7ull
, 0x5342B569D4F82ull
,
2551 0x56F4736B527DAull
, 0x5AB07DD485429ull
, 0x5E76F15AD2148ull
,
2552 0x6247EB03A5585ull
, 0x6623882552225ull
, 0x6A09E667F3BCDull
,
2553 0x6DFB23C651A2Full
, 0x71F75E8EC5F74ull
, 0x75FEB564267C9ull
,
2554 0x7A11473EB0187ull
, 0x7E2F336CF4E62ull
, 0x82589994CCE13ull
,
2555 0x868D99B4492EDull
, 0x8ACE5422AA0DBull
, 0x8F1AE99157736ull
,
2556 0x93737B0CDC5E5ull
, 0x97D829FDE4E50ull
, 0x9C49182A3F090ull
,
2557 0xA0C667B5DE565ull
, 0xA5503B23E255Dull
, 0xA9E6B5579FDBFull
,
2558 0xAE89F995AD3ADull
, 0xB33A2B84F15FBull
, 0xB7F76F2FB5E47ull
,
2559 0xBCC1E904BC1D2ull
, 0xC199BDD85529Cull
, 0xC67F12E57D14Bull
,
2560 0xCB720DCEF9069ull
, 0xD072D4A07897Cull
, 0xD5818DCFBA487ull
,
2561 0xDA9E603DB3285ull
, 0xDFC97337B9B5Full
, 0xE502EE78B3FF6ull
,
2562 0xEA4AFA2A490DAull
, 0xEFA1BEE615A27ull
, 0xF50765B6E4540ull
,
2565 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2566 uint64_t *d
= vd
, *n
= vn
;
2568 for (i
= 0; i
< opr_sz
; i
++) {
2570 intptr_t idx
= extract32(nn
, 0, 6);
2571 uint64_t exp
= extract32(nn
, 6, 11);
2572 d
[i
] = coeff
[idx
] | (exp
<< 52);
2576 void HELPER(sve_ftssel_h
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2578 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 2;
2579 uint16_t *d
= vd
, *n
= vn
, *m
= vm
;
2580 for (i
= 0; i
< opr_sz
; i
+= 1) {
2586 d
[i
] = nn
^ (mm
& 2) << 14;
2590 void HELPER(sve_ftssel_s
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2592 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 4;
2593 uint32_t *d
= vd
, *n
= vn
, *m
= vm
;
2594 for (i
= 0; i
< opr_sz
; i
+= 1) {
2600 d
[i
] = nn
^ (mm
& 2) << 30;
2604 void HELPER(sve_ftssel_d
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2606 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2607 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
2608 for (i
= 0; i
< opr_sz
; i
+= 1) {
2614 d
[i
] = nn
^ (mm
& 2) << 62;
2619 * Signed saturating addition with scalar operand.
2622 void HELPER(sve_sqaddi_b
)(void *d
, void *a
, int32_t b
, uint32_t desc
)
2624 intptr_t i
, oprsz
= simd_oprsz(desc
);
2626 for (i
= 0; i
< oprsz
; i
+= sizeof(int8_t)) {
2627 *(int8_t *)(d
+ i
) = DO_SQADD_B(b
, *(int8_t *)(a
+ i
));
2631 void HELPER(sve_sqaddi_h
)(void *d
, void *a
, int32_t b
, uint32_t desc
)
2633 intptr_t i
, oprsz
= simd_oprsz(desc
);
2635 for (i
= 0; i
< oprsz
; i
+= sizeof(int16_t)) {
2636 *(int16_t *)(d
+ i
) = DO_SQADD_H(b
, *(int16_t *)(a
+ i
));
2640 void HELPER(sve_sqaddi_s
)(void *d
, void *a
, int64_t b
, uint32_t desc
)
2642 intptr_t i
, oprsz
= simd_oprsz(desc
);
2644 for (i
= 0; i
< oprsz
; i
+= sizeof(int32_t)) {
2645 *(int32_t *)(d
+ i
) = DO_SQADD_S(b
, *(int32_t *)(a
+ i
));
2649 void HELPER(sve_sqaddi_d
)(void *d
, void *a
, int64_t b
, uint32_t desc
)
2651 intptr_t i
, oprsz
= simd_oprsz(desc
);
2653 for (i
= 0; i
< oprsz
; i
+= sizeof(int64_t)) {
2654 *(int64_t *)(d
+ i
) = do_sqadd_d(b
, *(int64_t *)(a
+ i
));
2659 * Unsigned saturating addition with scalar operand.
2662 void HELPER(sve_uqaddi_b
)(void *d
, void *a
, int32_t b
, uint32_t desc
)
2664 intptr_t i
, oprsz
= simd_oprsz(desc
);
2666 for (i
= 0; i
< oprsz
; i
+= sizeof(uint8_t)) {
2667 *(uint8_t *)(d
+ i
) = DO_UQADD_B(b
, *(uint8_t *)(a
+ i
));
2671 void HELPER(sve_uqaddi_h
)(void *d
, void *a
, int32_t b
, uint32_t desc
)
2673 intptr_t i
, oprsz
= simd_oprsz(desc
);
2675 for (i
= 0; i
< oprsz
; i
+= sizeof(uint16_t)) {
2676 *(uint16_t *)(d
+ i
) = DO_UQADD_H(b
, *(uint16_t *)(a
+ i
));
2680 void HELPER(sve_uqaddi_s
)(void *d
, void *a
, int64_t b
, uint32_t desc
)
2682 intptr_t i
, oprsz
= simd_oprsz(desc
);
2684 for (i
= 0; i
< oprsz
; i
+= sizeof(uint32_t)) {
2685 *(uint32_t *)(d
+ i
) = DO_UQADD_S(b
, *(uint32_t *)(a
+ i
));
2689 void HELPER(sve_uqaddi_d
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
2691 intptr_t i
, oprsz
= simd_oprsz(desc
);
2693 for (i
= 0; i
< oprsz
; i
+= sizeof(uint64_t)) {
2694 *(uint64_t *)(d
+ i
) = do_uqadd_d(b
, *(uint64_t *)(a
+ i
));
2698 void HELPER(sve_uqsubi_d
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
2700 intptr_t i
, oprsz
= simd_oprsz(desc
);
2702 for (i
= 0; i
< oprsz
; i
+= sizeof(uint64_t)) {
2703 *(uint64_t *)(d
+ i
) = do_uqsub_d(*(uint64_t *)(a
+ i
), b
);
2707 /* Two operand predicated copy immediate with merge. All valid immediates
2708 * can fit within 17 signed bits in the simd_data field.
2710 void HELPER(sve_cpy_m_b
)(void *vd
, void *vn
, void *vg
,
2711 uint64_t mm
, uint32_t desc
)
2713 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2714 uint64_t *d
= vd
, *n
= vn
;
2717 mm
= dup_const(MO_8
, mm
);
2718 for (i
= 0; i
< opr_sz
; i
+= 1) {
2720 uint64_t pp
= expand_pred_b(pg
[H1(i
)]);
2721 d
[i
] = (mm
& pp
) | (nn
& ~pp
);
2725 void HELPER(sve_cpy_m_h
)(void *vd
, void *vn
, void *vg
,
2726 uint64_t mm
, uint32_t desc
)
2728 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2729 uint64_t *d
= vd
, *n
= vn
;
2732 mm
= dup_const(MO_16
, mm
);
2733 for (i
= 0; i
< opr_sz
; i
+= 1) {
2735 uint64_t pp
= expand_pred_h(pg
[H1(i
)]);
2736 d
[i
] = (mm
& pp
) | (nn
& ~pp
);
2740 void HELPER(sve_cpy_m_s
)(void *vd
, void *vn
, void *vg
,
2741 uint64_t mm
, uint32_t desc
)
2743 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2744 uint64_t *d
= vd
, *n
= vn
;
2747 mm
= dup_const(MO_32
, mm
);
2748 for (i
= 0; i
< opr_sz
; i
+= 1) {
2750 uint64_t pp
= expand_pred_s(pg
[H1(i
)]);
2751 d
[i
] = (mm
& pp
) | (nn
& ~pp
);
2755 void HELPER(sve_cpy_m_d
)(void *vd
, void *vn
, void *vg
,
2756 uint64_t mm
, uint32_t desc
)
2758 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2759 uint64_t *d
= vd
, *n
= vn
;
2762 for (i
= 0; i
< opr_sz
; i
+= 1) {
2764 d
[i
] = (pg
[H1(i
)] & 1 ? mm
: nn
);
2768 void HELPER(sve_cpy_z_b
)(void *vd
, void *vg
, uint64_t val
, uint32_t desc
)
2770 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2774 val
= dup_const(MO_8
, val
);
2775 for (i
= 0; i
< opr_sz
; i
+= 1) {
2776 d
[i
] = val
& expand_pred_b(pg
[H1(i
)]);
2780 void HELPER(sve_cpy_z_h
)(void *vd
, void *vg
, uint64_t val
, uint32_t desc
)
2782 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2786 val
= dup_const(MO_16
, val
);
2787 for (i
= 0; i
< opr_sz
; i
+= 1) {
2788 d
[i
] = val
& expand_pred_h(pg
[H1(i
)]);
2792 void HELPER(sve_cpy_z_s
)(void *vd
, void *vg
, uint64_t val
, uint32_t desc
)
2794 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2798 val
= dup_const(MO_32
, val
);
2799 for (i
= 0; i
< opr_sz
; i
+= 1) {
2800 d
[i
] = val
& expand_pred_s(pg
[H1(i
)]);
2804 void HELPER(sve_cpy_z_d
)(void *vd
, void *vg
, uint64_t val
, uint32_t desc
)
2806 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2810 for (i
= 0; i
< opr_sz
; i
+= 1) {
2811 d
[i
] = (pg
[H1(i
)] & 1 ? val
: 0);
/* Big-endian hosts need to frob the byte indices.  If the copy
 * happens to be 8-byte aligned, then no frobbing necessary.
 */
static void swap_memmove(void *vd, void *vs, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t s = (uintptr_t)vs;
    uintptr_t o = (d | s | n) & 7;
    size_t i;

#ifndef HOST_WORDS_BIGENDIAN
    o = 0;
#endif
    switch (o) {
    case 0:
        memmove(vd, vs, n);
        break;

    case 4:
        /* Copy forward when non-overlapping, else backward. */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 4) {
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 4;
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        }
        break;

    case 2:
    case 6:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 2) {
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 2;
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        }
        break;

    default:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i++) {
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 1;
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        }
        break;
    }
}
/* Similarly for memset of 0.  */
static void swap_memzero(void *vd, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t o = (d | n) & 7;
    size_t i;

    /* Usually, the first bit of a predicate is set, so N is 0.  */
    if (likely(n == 0)) {
        return;
    }

#ifndef HOST_WORDS_BIGENDIAN
    o = 0;
#endif
    switch (o) {
    case 0:
        memset(vd, 0, n);
        break;

    case 4:
        for (i = 0; i < n; i += 4) {
            *(uint32_t *)H1_4(d + i) = 0;
        }
        break;

    case 2:
    case 6:
        for (i = 0; i < n; i += 2) {
            *(uint16_t *)H1_2(d + i) = 0;
        }
        break;

    default:
        for (i = 0; i < n; i++) {
            *(uint8_t *)H1(d + i) = 0;
        }
        break;
    }
}
2916 void HELPER(sve_ext
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2918 intptr_t opr_sz
= simd_oprsz(desc
);
2919 size_t n_ofs
= simd_data(desc
);
2920 size_t n_siz
= opr_sz
- n_ofs
;
2923 swap_memmove(vd
, vn
+ n_ofs
, n_siz
);
2924 swap_memmove(vd
+ n_siz
, vm
, n_ofs
);
2925 } else if (vd
!= vn
) {
2926 swap_memmove(vd
+ n_siz
, vd
, n_ofs
);
2927 swap_memmove(vd
, vn
+ n_ofs
, n_siz
);
2929 /* vd == vn == vm. Need temp space. */
2931 swap_memmove(&tmp
, vm
, n_ofs
);
2932 swap_memmove(vd
, vd
+ n_ofs
, n_siz
);
2933 memcpy(vd
+ n_siz
, &tmp
, n_ofs
);
2937 #define DO_INSR(NAME, TYPE, H) \
2938 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2940 intptr_t opr_sz = simd_oprsz(desc); \
2941 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
2942 *(TYPE *)(vd + H(0)) = val; \
2945 DO_INSR(sve_insr_b
, uint8_t, H1
)
2946 DO_INSR(sve_insr_h
, uint16_t, H1_2
)
2947 DO_INSR(sve_insr_s
, uint32_t, H1_4
)
2948 DO_INSR(sve_insr_d
, uint64_t, H1_8
)
2952 void HELPER(sve_rev_b
)(void *vd
, void *vn
, uint32_t desc
)
2954 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
);
2955 for (i
= 0, j
= opr_sz
- 8; i
< opr_sz
/ 2; i
+= 8, j
-= 8) {
2956 uint64_t f
= *(uint64_t *)(vn
+ i
);
2957 uint64_t b
= *(uint64_t *)(vn
+ j
);
2958 *(uint64_t *)(vd
+ i
) = bswap64(b
);
2959 *(uint64_t *)(vd
+ j
) = bswap64(f
);
2963 void HELPER(sve_rev_h
)(void *vd
, void *vn
, uint32_t desc
)
2965 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
);
2966 for (i
= 0, j
= opr_sz
- 8; i
< opr_sz
/ 2; i
+= 8, j
-= 8) {
2967 uint64_t f
= *(uint64_t *)(vn
+ i
);
2968 uint64_t b
= *(uint64_t *)(vn
+ j
);
2969 *(uint64_t *)(vd
+ i
) = hswap64(b
);
2970 *(uint64_t *)(vd
+ j
) = hswap64(f
);
2974 void HELPER(sve_rev_s
)(void *vd
, void *vn
, uint32_t desc
)
2976 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
);
2977 for (i
= 0, j
= opr_sz
- 8; i
< opr_sz
/ 2; i
+= 8, j
-= 8) {
2978 uint64_t f
= *(uint64_t *)(vn
+ i
);
2979 uint64_t b
= *(uint64_t *)(vn
+ j
);
2980 *(uint64_t *)(vd
+ i
) = rol64(b
, 32);
2981 *(uint64_t *)(vd
+ j
) = rol64(f
, 32);
2985 void HELPER(sve_rev_d
)(void *vd
, void *vn
, uint32_t desc
)
2987 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
);
2988 for (i
= 0, j
= opr_sz
- 8; i
< opr_sz
/ 2; i
+= 8, j
-= 8) {
2989 uint64_t f
= *(uint64_t *)(vn
+ i
);
2990 uint64_t b
= *(uint64_t *)(vn
+ j
);
2991 *(uint64_t *)(vd
+ i
) = b
;
2992 *(uint64_t *)(vd
+ j
) = f
;
2996 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2998 static inline void do_tbl1(void *vd
, void *vn
, void *vm
, uint32_t desc
,
2999 bool is_tbx
, tb_impl_fn
*fn
)
3001 ARMVectorReg scratch
;
3002 uintptr_t oprsz
= simd_oprsz(desc
);
3004 if (unlikely(vd
== vn
)) {
3005 vn
= memcpy(&scratch
, vn
, oprsz
);
3008 fn(vd
, vn
, NULL
, vm
, oprsz
, is_tbx
);
3011 static inline void do_tbl2(void *vd
, void *vn0
, void *vn1
, void *vm
,
3012 uint32_t desc
, bool is_tbx
, tb_impl_fn
*fn
)
3014 ARMVectorReg scratch
;
3015 uintptr_t oprsz
= simd_oprsz(desc
);
3017 if (unlikely(vd
== vn0
)) {
3018 vn0
= memcpy(&scratch
, vn0
, oprsz
);
3022 } else if (unlikely(vd
== vn1
)) {
3023 vn1
= memcpy(&scratch
, vn1
, oprsz
);
3026 fn(vd
, vn0
, vn1
, vm
, oprsz
, is_tbx
);
3029 #define DO_TB(SUFF, TYPE, H) \
3030 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \
3031 void *vm, uintptr_t oprsz, bool is_tbx) \
3033 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \
3034 uintptr_t i, nelem = oprsz / sizeof(TYPE); \
3035 for (i = 0; i < nelem; ++i) { \
3036 TYPE index = indexes[H1(i)], val = 0; \
3037 if (index < nelem) { \
3038 val = tbl0[H(index)]; \
3041 if (tbl1 && index < nelem) { \
3042 val = tbl1[H(index)]; \
3043 } else if (is_tbx) { \
3050 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3052 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \
3054 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \
3055 void *vm, uint32_t desc) \
3057 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \
3059 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3061 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \
3064 DO_TB(b
, uint8_t, H1
)
3065 DO_TB(h
, uint16_t, H2
)
3066 DO_TB(s
, uint32_t, H4
)
3067 DO_TB(d
, uint64_t, H8
)
3071 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3072 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
3074 intptr_t i, opr_sz = simd_oprsz(desc); \
3078 if (unlikely(vn - vd < opr_sz)) { \
3079 n = memcpy(&tmp, n, opr_sz / 2); \
3081 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
3082 d[HD(i)] = n[HS(i)]; \
3086 DO_UNPK(sve_sunpk_h
, int16_t, int8_t, H2
, H1
)
3087 DO_UNPK(sve_sunpk_s
, int32_t, int16_t, H4
, H2
)
3088 DO_UNPK(sve_sunpk_d
, int64_t, int32_t, H8
, H4
)
3090 DO_UNPK(sve_uunpk_h
, uint16_t, uint8_t, H2
, H1
)
3091 DO_UNPK(sve_uunpk_s
, uint32_t, uint16_t, H4
, H2
)
3092 DO_UNPK(sve_uunpk_d
, uint64_t, uint32_t, H8
, H4
)
/* Mask of bits included in the even numbered predicates of width esz.
 * We also use this for expand_bits/compress_bits, and so extend the
 * same pattern out to 16-bit units.
 */
static const uint64_t even_bit_esz_masks[5] = {
    0x5555555555555555ull,
    0x3333333333333333ull,
    0x0f0f0f0f0f0f0f0full,
    0x00ff00ff00ff00ffull,
    0x0000ffff0000ffffull,
};
3108 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3109 * For N==0, this corresponds to the operation that in qemu/bitops.h
3110 * we call half_shuffle64; this algorithm is from Hacker's Delight,
3111 * section 7-2 Shuffling Bits.
3113 static uint64_t expand_bits(uint64_t x
, int n
)
3118 for (i
= 4; i
>= n
; i
--) {
3120 x
= ((x
<< sh
) | x
) & even_bit_esz_masks
[i
];
3125 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3126 * For N==0, this corresponds to the operation that in qemu/bitops.h
3127 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3128 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3130 static uint64_t compress_bits(uint64_t x
, int n
)
3134 for (i
= n
; i
<= 4; i
++) {
3136 x
&= even_bit_esz_masks
[i
];
3139 return x
& 0xffffffffu
;
3142 void HELPER(sve_zip_p
)(void *vd
, void *vn
, void *vm
, uint32_t pred_desc
)
3144 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
3145 int esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
3146 intptr_t high
= FIELD_EX32(pred_desc
, PREDDESC
, DATA
);
3147 int esize
= 1 << esz
;
3152 uint64_t nn
= *(uint64_t *)vn
;
3153 uint64_t mm
= *(uint64_t *)vm
;
3154 int half
= 4 * oprsz
;
3156 nn
= extract64(nn
, high
* half
, half
);
3157 mm
= extract64(mm
, high
* half
, half
);
3158 nn
= expand_bits(nn
, esz
);
3159 mm
= expand_bits(mm
, esz
);
3160 d
[0] = nn
| (mm
<< esize
);
3162 ARMPredicateReg tmp
;
3164 /* We produce output faster than we consume input.
3165 Therefore we must be mindful of possible overlap. */
3167 vn
= memcpy(&tmp
, vn
, oprsz
);
3171 } else if (vd
== vm
) {
3172 vm
= memcpy(&tmp
, vm
, oprsz
);
3178 if ((oprsz
& 7) == 0) {
3179 uint32_t *n
= vn
, *m
= vm
;
3182 for (i
= 0; i
< oprsz
/ 8; i
++) {
3183 uint64_t nn
= n
[H4(high
+ i
)];
3184 uint64_t mm
= m
[H4(high
+ i
)];
3186 nn
= expand_bits(nn
, esz
);
3187 mm
= expand_bits(mm
, esz
);
3188 d
[i
] = nn
| (mm
<< esize
);
3191 uint8_t *n
= vn
, *m
= vm
;
3194 for (i
= 0; i
< oprsz
/ 2; i
++) {
3195 uint16_t nn
= n
[H1(high
+ i
)];
3196 uint16_t mm
= m
[H1(high
+ i
)];
3198 nn
= expand_bits(nn
, esz
);
3199 mm
= expand_bits(mm
, esz
);
3200 d16
[H2(i
)] = nn
| (mm
<< esize
);
3206 void HELPER(sve_uzp_p
)(void *vd
, void *vn
, void *vm
, uint32_t pred_desc
)
3208 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
3209 int esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
3210 int odd
= FIELD_EX32(pred_desc
, PREDDESC
, DATA
) << esz
;
3211 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
3216 l
= compress_bits(n
[0] >> odd
, esz
);
3217 h
= compress_bits(m
[0] >> odd
, esz
);
3218 d
[0] = l
| (h
<< (4 * oprsz
));
3220 ARMPredicateReg tmp_m
;
3221 intptr_t oprsz_16
= oprsz
/ 16;
3223 if ((vm
- vd
) < (uintptr_t)oprsz
) {
3224 m
= memcpy(&tmp_m
, vm
, oprsz
);
3227 for (i
= 0; i
< oprsz_16
; i
++) {
3230 l
= compress_bits(l
>> odd
, esz
);
3231 h
= compress_bits(h
>> odd
, esz
);
3232 d
[i
] = l
| (h
<< 32);
3236 * For VL which is not a multiple of 512, the results from M do not
3237 * align nicely with the uint64_t for D. Put the aligned results
3238 * from M into TMP_M and then copy it into place afterward.
3241 int final_shift
= (oprsz
& 15) * 2;
3245 l
= compress_bits(l
>> odd
, esz
);
3246 h
= compress_bits(h
>> odd
, esz
);
3247 d
[i
] = l
| (h
<< final_shift
);
3249 for (i
= 0; i
< oprsz_16
; i
++) {
3252 l
= compress_bits(l
>> odd
, esz
);
3253 h
= compress_bits(h
>> odd
, esz
);
3254 tmp_m
.p
[i
] = l
| (h
<< 32);
3258 l
= compress_bits(l
>> odd
, esz
);
3259 h
= compress_bits(h
>> odd
, esz
);
3260 tmp_m
.p
[i
] = l
| (h
<< final_shift
);
3262 swap_memmove(vd
+ oprsz
/ 2, &tmp_m
, oprsz
/ 2);
3264 for (i
= 0; i
< oprsz_16
; i
++) {
3267 l
= compress_bits(l
>> odd
, esz
);
3268 h
= compress_bits(h
>> odd
, esz
);
3269 d
[oprsz_16
+ i
] = l
| (h
<< 32);
3275 void HELPER(sve_trn_p
)(void *vd
, void *vn
, void *vm
, uint32_t pred_desc
)
3277 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
3278 int esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
3279 int odd
= FIELD_EX32(pred_desc
, PREDDESC
, DATA
);
3280 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
3287 mask
= even_bit_esz_masks
[esz
];
3294 for (i
= 0; i
< DIV_ROUND_UP(oprsz
, 8); i
++) {
3295 uint64_t nn
= (n
[i
] & mask
) >> shr
;
3296 uint64_t mm
= (m
[i
] & mask
) << shl
;
3301 /* Reverse units of 2**N bits. */
3302 static uint64_t reverse_bits_64(uint64_t x
, int n
)
3307 for (i
= 2, sh
= 4; i
>= n
; i
--, sh
>>= 1) {
3308 uint64_t mask
= even_bit_esz_masks
[i
];
3309 x
= ((x
& mask
) << sh
) | ((x
>> sh
) & mask
);
/* Reverse units of 2**N bits within a single byte. */
static uint8_t reverse_bits_8(uint8_t x, int n)
{
    static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
    int lvl, shift;

    /* Successively swap halves, pairs, then single bits down to 2**N. */
    for (lvl = 2, shift = 4; lvl >= n; lvl--, shift >>= 1) {
        x = ((x & mask[lvl]) << shift) | ((x >> shift) & mask[lvl]);
    }
    return x;
}
3325 void HELPER(sve_rev_p
)(void *vd
, void *vn
, uint32_t pred_desc
)
3327 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
3328 int esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
3329 intptr_t i
, oprsz_2
= oprsz
/ 2;
3332 uint64_t l
= *(uint64_t *)vn
;
3333 l
= reverse_bits_64(l
<< (64 - 8 * oprsz
), esz
);
3334 *(uint64_t *)vd
= l
;
3335 } else if ((oprsz
& 15) == 0) {
3336 for (i
= 0; i
< oprsz_2
; i
+= 8) {
3337 intptr_t ih
= oprsz
- 8 - i
;
3338 uint64_t l
= reverse_bits_64(*(uint64_t *)(vn
+ i
), esz
);
3339 uint64_t h
= reverse_bits_64(*(uint64_t *)(vn
+ ih
), esz
);
3340 *(uint64_t *)(vd
+ i
) = h
;
3341 *(uint64_t *)(vd
+ ih
) = l
;
3344 for (i
= 0; i
< oprsz_2
; i
+= 1) {
3345 intptr_t il
= H1(i
);
3346 intptr_t ih
= H1(oprsz
- 1 - i
);
3347 uint8_t l
= reverse_bits_8(*(uint8_t *)(vn
+ il
), esz
);
3348 uint8_t h
= reverse_bits_8(*(uint8_t *)(vn
+ ih
), esz
);
3349 *(uint8_t *)(vd
+ il
) = h
;
3350 *(uint8_t *)(vd
+ ih
) = l
;
3355 void HELPER(sve_punpk_p
)(void *vd
, void *vn
, uint32_t pred_desc
)
3357 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
3358 intptr_t high
= FIELD_EX32(pred_desc
, PREDDESC
, DATA
);
3363 uint64_t nn
= *(uint64_t *)vn
;
3364 int half
= 4 * oprsz
;
3366 nn
= extract64(nn
, high
* half
, half
);
3367 nn
= expand_bits(nn
, 0);
3370 ARMPredicateReg tmp_n
;
3372 /* We produce output faster than we consume input.
3373 Therefore we must be mindful of possible overlap. */
3374 if ((vn
- vd
) < (uintptr_t)oprsz
) {
3375 vn
= memcpy(&tmp_n
, vn
, oprsz
);
3381 if ((oprsz
& 7) == 0) {
3385 for (i
= 0; i
< oprsz
/ 8; i
++) {
3386 uint64_t nn
= n
[H4(high
+ i
)];
3387 d
[i
] = expand_bits(nn
, 0);
3393 for (i
= 0; i
< oprsz
/ 2; i
++) {
3394 uint16_t nn
= n
[H1(high
+ i
)];
3395 d16
[H2(i
)] = expand_bits(nn
, 0);
3401 #define DO_ZIP(NAME, TYPE, H) \
3402 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3404 intptr_t oprsz = simd_oprsz(desc); \
3405 intptr_t i, oprsz_2 = oprsz / 2; \
3406 ARMVectorReg tmp_n, tmp_m; \
3407 /* We produce output faster than we consume input. \
3408 Therefore we must be mindful of possible overlap. */ \
3409 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
3410 vn = memcpy(&tmp_n, vn, oprsz_2); \
3412 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3413 vm = memcpy(&tmp_m, vm, oprsz_2); \
3415 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3416 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
3417 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
3419 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3420 memset(vd + oprsz - 16, 0, 16); \
3424 DO_ZIP(sve_zip_b
, uint8_t, H1
)
3425 DO_ZIP(sve_zip_h
, uint16_t, H1_2
)
3426 DO_ZIP(sve_zip_s
, uint32_t, H1_4
)
3427 DO_ZIP(sve_zip_d
, uint64_t, H1_8
)
3428 DO_ZIP(sve2_zip_q
, Int128
, )
3430 #define DO_UZP(NAME, TYPE, H) \
3431 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3433 intptr_t oprsz = simd_oprsz(desc); \
3434 intptr_t odd_ofs = simd_data(desc); \
3436 ARMVectorReg tmp_m; \
3437 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3438 vm = memcpy(&tmp_m, vm, oprsz); \
3440 i = 0, p = odd_ofs; \
3442 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \
3443 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
3444 } while (p < oprsz); \
3447 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \
3448 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
3449 } while (p < oprsz); \
3450 tcg_debug_assert(i == oprsz); \
3453 DO_UZP(sve_uzp_b
, uint8_t, H1
)
3454 DO_UZP(sve_uzp_h
, uint16_t, H1_2
)
3455 DO_UZP(sve_uzp_s
, uint32_t, H1_4
)
3456 DO_UZP(sve_uzp_d
, uint64_t, H1_8
)
3457 DO_UZP(sve2_uzp_q
, Int128
, )
3459 #define DO_TRN(NAME, TYPE, H) \
3460 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3462 intptr_t oprsz = simd_oprsz(desc); \
3463 intptr_t odd_ofs = simd_data(desc); \
3465 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
3466 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
3467 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
3468 *(TYPE *)(vd + H(i + 0)) = ae; \
3469 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
3471 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3472 memset(vd + oprsz - 16, 0, 16); \
3476 DO_TRN(sve_trn_b
, uint8_t, H1
)
3477 DO_TRN(sve_trn_h
, uint16_t, H1_2
)
3478 DO_TRN(sve_trn_s
, uint32_t, H1_4
)
3479 DO_TRN(sve_trn_d
, uint64_t, H1_8
)
3480 DO_TRN(sve2_trn_q
, Int128
, )
3486 void HELPER(sve_compact_s
)(void *vd
, void *vn
, void *vg
, uint32_t desc
)
3488 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
) / 4;
3489 uint32_t *d
= vd
, *n
= vn
;
3492 for (i
= j
= 0; i
< opr_sz
; i
++) {
3493 if (pg
[H1(i
/ 2)] & (i
& 1 ? 0x10 : 0x01)) {
3494 d
[H4(j
)] = n
[H4(i
)];
3498 for (; j
< opr_sz
; j
++) {
3503 void HELPER(sve_compact_d
)(void *vd
, void *vn
, void *vg
, uint32_t desc
)
3505 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
) / 8;
3506 uint64_t *d
= vd
, *n
= vn
;
3509 for (i
= j
= 0; i
< opr_sz
; i
++) {
3510 if (pg
[H1(i
)] & 1) {
3515 for (; j
< opr_sz
; j
++) {
3520 /* Similar to the ARM LastActiveElement pseudocode function, except the
3521 * result is multiplied by the element size. This includes the not found
3522 * indication; e.g. not found for esz=3 is -8.
3524 int32_t HELPER(sve_last_active_element
)(void *vg
, uint32_t pred_desc
)
3526 intptr_t words
= DIV_ROUND_UP(FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
), 8);
3527 intptr_t esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
3529 return last_active_element(vg
, words
, esz
);
3532 void HELPER(sve_splice
)(void *vd
, void *vn
, void *vm
, void *vg
, uint32_t desc
)
3534 intptr_t opr_sz
= simd_oprsz(desc
) / 8;
3535 int esz
= simd_data(desc
);
3536 uint64_t pg
, first_g
, last_g
, len
, mask
= pred_esz_masks
[esz
];
3537 intptr_t i
, first_i
, last_i
;
3540 first_i
= last_i
= 0;
3541 first_g
= last_g
= 0;
3543 /* Find the extent of the active elements within VG. */
3544 for (i
= QEMU_ALIGN_UP(opr_sz
, 8) - 8; i
>= 0; i
-= 8) {
3545 pg
= *(uint64_t *)(vg
+ i
) & mask
;
3558 first_i
= first_i
* 8 + ctz64(first_g
);
3559 last_i
= last_i
* 8 + 63 - clz64(last_g
);
3560 len
= last_i
- first_i
+ (1 << esz
);
3562 vm
= memcpy(&tmp
, vm
, opr_sz
* 8);
3564 swap_memmove(vd
, vn
+ first_i
, len
);
3566 swap_memmove(vd
+ len
, vm
, opr_sz
* 8 - len
);
3569 void HELPER(sve_sel_zpzz_b
)(void *vd
, void *vn
, void *vm
,
3570 void *vg
, uint32_t desc
)
3572 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
3573 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
3576 for (i
= 0; i
< opr_sz
; i
+= 1) {
3577 uint64_t nn
= n
[i
], mm
= m
[i
];
3578 uint64_t pp
= expand_pred_b(pg
[H1(i
)]);
3579 d
[i
] = (nn
& pp
) | (mm
& ~pp
);
3583 void HELPER(sve_sel_zpzz_h
)(void *vd
, void *vn
, void *vm
,
3584 void *vg
, uint32_t desc
)
3586 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
3587 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
3590 for (i
= 0; i
< opr_sz
; i
+= 1) {
3591 uint64_t nn
= n
[i
], mm
= m
[i
];
3592 uint64_t pp
= expand_pred_h(pg
[H1(i
)]);
3593 d
[i
] = (nn
& pp
) | (mm
& ~pp
);
3597 void HELPER(sve_sel_zpzz_s
)(void *vd
, void *vn
, void *vm
,
3598 void *vg
, uint32_t desc
)
3600 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
3601 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
3604 for (i
= 0; i
< opr_sz
; i
+= 1) {
3605 uint64_t nn
= n
[i
], mm
= m
[i
];
3606 uint64_t pp
= expand_pred_s(pg
[H1(i
)]);
3607 d
[i
] = (nn
& pp
) | (mm
& ~pp
);
3611 void HELPER(sve_sel_zpzz_d
)(void *vd
, void *vn
, void *vm
,
3612 void *vg
, uint32_t desc
)
3614 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
3615 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
3618 for (i
= 0; i
< opr_sz
; i
+= 1) {
3619 uint64_t nn
= n
[i
], mm
= m
[i
];
3620 d
[i
] = (pg
[H1(i
)] & 1 ? nn
: mm
);
3624 /* Two operand comparison controlled by a predicate.
3625 * ??? It is very tempting to want to be able to expand this inline
3626 * with x86 instructions, e.g.
3628 * vcmpeqw zm, zn, %ymm0
3629 * vpmovmskb %ymm0, %eax
3633 * or even aarch64, e.g.
3635 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3636 * cmeq v0.8h, zn, zm
3637 * and v0.8h, v0.8h, mask
3641 * However, coming up with an abstraction that allows vector inputs and
3642 * a scalar output, and also handles the byte-ordering of sub-uint64_t
3643 * scalar outputs, is tricky.
3645 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
3646 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3648 intptr_t opr_sz = simd_oprsz(desc); \
3649 uint32_t flags = PREDTEST_INIT; \
3650 intptr_t i = opr_sz; \
3652 uint64_t out = 0, pg; \
3654 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3655 TYPE nn = *(TYPE *)(vn + H(i)); \
3656 TYPE mm = *(TYPE *)(vm + H(i)); \
3659 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3661 *(uint64_t *)(vd + (i >> 3)) = out; \
3662 flags = iter_predtest_bwd(out, pg, flags); \
3667 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3668 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3669 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3670 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3671 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3672 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3673 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3674 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3676 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b
, uint8_t, ==)
3677 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h
, uint16_t, ==)
3678 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s
, uint32_t, ==)
3679 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d
, uint64_t, ==)
3681 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b
, uint8_t, !=)
3682 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h
, uint16_t, !=)
3683 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s
, uint32_t, !=)
3684 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d
, uint64_t, !=)
3686 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b
, int8_t, >)
3687 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h
, int16_t, >)
3688 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s
, int32_t, >)
3689 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d
, int64_t, >)
3691 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b
, int8_t, >=)
3692 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h
, int16_t, >=)
3693 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s
, int32_t, >=)
3694 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d
, int64_t, >=)
3696 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b
, uint8_t, >)
3697 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h
, uint16_t, >)
3698 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s
, uint32_t, >)
3699 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d
, uint64_t, >)
3701 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b
, uint8_t, >=)
3702 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h
, uint16_t, >=)
3703 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s
, uint32_t, >=)
3704 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d
, uint64_t, >=)
3706 #undef DO_CMP_PPZZ_B
3707 #undef DO_CMP_PPZZ_H
3708 #undef DO_CMP_PPZZ_S
3709 #undef DO_CMP_PPZZ_D
3712 /* Similar, but the second source is "wide". */
3713 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
3714 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3716 intptr_t opr_sz = simd_oprsz(desc); \
3717 uint32_t flags = PREDTEST_INIT; \
3718 intptr_t i = opr_sz; \
3720 uint64_t out = 0, pg; \
3722 TYPEW mm = *(TYPEW *)(vm + i - 8); \
3724 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3725 TYPE nn = *(TYPE *)(vn + H(i)); \
3729 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3731 *(uint64_t *)(vd + (i >> 3)) = out; \
3732 flags = iter_predtest_bwd(out, pg, flags); \
3737 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3738 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
3739 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3740 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3741 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3742 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3744 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b
, int8_t, uint64_t, ==)
3745 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h
, int16_t, uint64_t, ==)
3746 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s
, int32_t, uint64_t, ==)
3748 DO_CMP_PPZW_B(sve_cmpne_ppzw_b
, int8_t, uint64_t, !=)
3749 DO_CMP_PPZW_H(sve_cmpne_ppzw_h
, int16_t, uint64_t, !=)
3750 DO_CMP_PPZW_S(sve_cmpne_ppzw_s
, int32_t, uint64_t, !=)
3752 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b
, int8_t, int64_t, >)
3753 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h
, int16_t, int64_t, >)
3754 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s
, int32_t, int64_t, >)
3756 DO_CMP_PPZW_B(sve_cmpge_ppzw_b
, int8_t, int64_t, >=)
3757 DO_CMP_PPZW_H(sve_cmpge_ppzw_h
, int16_t, int64_t, >=)
3758 DO_CMP_PPZW_S(sve_cmpge_ppzw_s
, int32_t, int64_t, >=)
3760 DO_CMP_PPZW_B(sve_cmphi_ppzw_b
, uint8_t, uint64_t, >)
3761 DO_CMP_PPZW_H(sve_cmphi_ppzw_h
, uint16_t, uint64_t, >)
3762 DO_CMP_PPZW_S(sve_cmphi_ppzw_s
, uint32_t, uint64_t, >)
3764 DO_CMP_PPZW_B(sve_cmphs_ppzw_b
, uint8_t, uint64_t, >=)
3765 DO_CMP_PPZW_H(sve_cmphs_ppzw_h
, uint16_t, uint64_t, >=)
3766 DO_CMP_PPZW_S(sve_cmphs_ppzw_s
, uint32_t, uint64_t, >=)
3768 DO_CMP_PPZW_B(sve_cmplt_ppzw_b
, int8_t, int64_t, <)
3769 DO_CMP_PPZW_H(sve_cmplt_ppzw_h
, int16_t, int64_t, <)
3770 DO_CMP_PPZW_S(sve_cmplt_ppzw_s
, int32_t, int64_t, <)
3772 DO_CMP_PPZW_B(sve_cmple_ppzw_b
, int8_t, int64_t, <=)
3773 DO_CMP_PPZW_H(sve_cmple_ppzw_h
, int16_t, int64_t, <=)
3774 DO_CMP_PPZW_S(sve_cmple_ppzw_s
, int32_t, int64_t, <=)
3776 DO_CMP_PPZW_B(sve_cmplo_ppzw_b
, uint8_t, uint64_t, <)
3777 DO_CMP_PPZW_H(sve_cmplo_ppzw_h
, uint16_t, uint64_t, <)
3778 DO_CMP_PPZW_S(sve_cmplo_ppzw_s
, uint32_t, uint64_t, <)
3780 DO_CMP_PPZW_B(sve_cmpls_ppzw_b
, uint8_t, uint64_t, <=)
3781 DO_CMP_PPZW_H(sve_cmpls_ppzw_h
, uint16_t, uint64_t, <=)
3782 DO_CMP_PPZW_S(sve_cmpls_ppzw_s
, uint32_t, uint64_t, <=)
3784 #undef DO_CMP_PPZW_B
3785 #undef DO_CMP_PPZW_H
3786 #undef DO_CMP_PPZW_S
3789 /* Similar, but the second source is immediate. */
3790 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
3791 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
3793 intptr_t opr_sz = simd_oprsz(desc); \
3794 uint32_t flags = PREDTEST_INIT; \
3795 TYPE mm = simd_data(desc); \
3796 intptr_t i = opr_sz; \
3798 uint64_t out = 0, pg; \
3800 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3801 TYPE nn = *(TYPE *)(vn + H(i)); \
3804 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3806 *(uint64_t *)(vd + (i >> 3)) = out; \
3807 flags = iter_predtest_bwd(out, pg, flags); \
3812 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3813 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3814 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3815 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3816 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3817 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3818 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3819 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3821 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b
, uint8_t, ==)
3822 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h
, uint16_t, ==)
3823 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s
, uint32_t, ==)
3824 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d
, uint64_t, ==)
3826 DO_CMP_PPZI_B(sve_cmpne_ppzi_b
, uint8_t, !=)
3827 DO_CMP_PPZI_H(sve_cmpne_ppzi_h
, uint16_t, !=)
3828 DO_CMP_PPZI_S(sve_cmpne_ppzi_s
, uint32_t, !=)
3829 DO_CMP_PPZI_D(sve_cmpne_ppzi_d
, uint64_t, !=)
3831 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b
, int8_t, >)
3832 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h
, int16_t, >)
3833 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s
, int32_t, >)
3834 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d
, int64_t, >)
3836 DO_CMP_PPZI_B(sve_cmpge_ppzi_b
, int8_t, >=)
3837 DO_CMP_PPZI_H(sve_cmpge_ppzi_h
, int16_t, >=)
3838 DO_CMP_PPZI_S(sve_cmpge_ppzi_s
, int32_t, >=)
3839 DO_CMP_PPZI_D(sve_cmpge_ppzi_d
, int64_t, >=)
3841 DO_CMP_PPZI_B(sve_cmphi_ppzi_b
, uint8_t, >)
3842 DO_CMP_PPZI_H(sve_cmphi_ppzi_h
, uint16_t, >)
3843 DO_CMP_PPZI_S(sve_cmphi_ppzi_s
, uint32_t, >)
3844 DO_CMP_PPZI_D(sve_cmphi_ppzi_d
, uint64_t, >)
3846 DO_CMP_PPZI_B(sve_cmphs_ppzi_b
, uint8_t, >=)
3847 DO_CMP_PPZI_H(sve_cmphs_ppzi_h
, uint16_t, >=)
3848 DO_CMP_PPZI_S(sve_cmphs_ppzi_s
, uint32_t, >=)
3849 DO_CMP_PPZI_D(sve_cmphs_ppzi_d
, uint64_t, >=)
3851 DO_CMP_PPZI_B(sve_cmplt_ppzi_b
, int8_t, <)
3852 DO_CMP_PPZI_H(sve_cmplt_ppzi_h
, int16_t, <)
3853 DO_CMP_PPZI_S(sve_cmplt_ppzi_s
, int32_t, <)
3854 DO_CMP_PPZI_D(sve_cmplt_ppzi_d
, int64_t, <)
3856 DO_CMP_PPZI_B(sve_cmple_ppzi_b
, int8_t, <=)
3857 DO_CMP_PPZI_H(sve_cmple_ppzi_h
, int16_t, <=)
3858 DO_CMP_PPZI_S(sve_cmple_ppzi_s
, int32_t, <=)
3859 DO_CMP_PPZI_D(sve_cmple_ppzi_d
, int64_t, <=)
3861 DO_CMP_PPZI_B(sve_cmplo_ppzi_b
, uint8_t, <)
3862 DO_CMP_PPZI_H(sve_cmplo_ppzi_h
, uint16_t, <)
3863 DO_CMP_PPZI_S(sve_cmplo_ppzi_s
, uint32_t, <)
3864 DO_CMP_PPZI_D(sve_cmplo_ppzi_d
, uint64_t, <)
3866 DO_CMP_PPZI_B(sve_cmpls_ppzi_b
, uint8_t, <=)
3867 DO_CMP_PPZI_H(sve_cmpls_ppzi_h
, uint16_t, <=)
3868 DO_CMP_PPZI_S(sve_cmpls_ppzi_s
, uint32_t, <=)
3869 DO_CMP_PPZI_D(sve_cmpls_ppzi_d
, uint64_t, <=)
3871 #undef DO_CMP_PPZI_B
3872 #undef DO_CMP_PPZI_H
3873 #undef DO_CMP_PPZI_S
3874 #undef DO_CMP_PPZI_D
3877 /* Similar to the ARM LastActive pseudocode function. */
3878 static bool last_active_pred(void *vd
, void *vg
, intptr_t oprsz
)
3882 for (i
= QEMU_ALIGN_UP(oprsz
, 8) - 8; i
>= 0; i
-= 8) {
3883 uint64_t pg
= *(uint64_t *)(vg
+ i
);
3885 return (pow2floor(pg
) & *(uint64_t *)(vd
+ i
)) != 0;
/* Compute a mask into RETB that is true for all G, up to and including
 * (if after) or excluding (if !after) the first G & N.
 * Return true if BRK found.
 */
static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
                        bool brk, bool after)
{
    uint64_t b;

    if (brk) {
        /* Break already found in an earlier word: everything is off. */
        b = 0;
    } else if ((g & n) == 0) {
        /* For all G, no N are set; break not found. */
        b = g;
    } else {
        /* Break somewhere in N.  Locate it. */
        b = g & n;            /* guard true, pred true */
        b = b & -b;           /* first such */
        if (after) {
            b = b | (b - 1);  /* break after same */
        } else {
            b = b - 1;        /* break before same */
        }
        brk = true;
    }

    *retb = b;
    return brk;
}
3921 /* Compute a zeroing BRK. */
3922 static void compute_brk_z(uint64_t *d
, uint64_t *n
, uint64_t *g
,
3923 intptr_t oprsz
, bool after
)
3928 for (i
= 0; i
< DIV_ROUND_UP(oprsz
, 8); ++i
) {
3929 uint64_t this_b
, this_g
= g
[i
];
3931 brk
= compute_brk(&this_b
, n
[i
], this_g
, brk
, after
);
3932 d
[i
] = this_b
& this_g
;
3936 /* Likewise, but also compute flags. */
3937 static uint32_t compute_brks_z(uint64_t *d
, uint64_t *n
, uint64_t *g
,
3938 intptr_t oprsz
, bool after
)
3940 uint32_t flags
= PREDTEST_INIT
;
3944 for (i
= 0; i
< DIV_ROUND_UP(oprsz
, 8); ++i
) {
3945 uint64_t this_b
, this_d
, this_g
= g
[i
];
3947 brk
= compute_brk(&this_b
, n
[i
], this_g
, brk
, after
);
3948 d
[i
] = this_d
= this_b
& this_g
;
3949 flags
= iter_predtest_fwd(this_d
, this_g
, flags
);
3954 /* Compute a merging BRK. */
3955 static void compute_brk_m(uint64_t *d
, uint64_t *n
, uint64_t *g
,
3956 intptr_t oprsz
, bool after
)
3961 for (i
= 0; i
< DIV_ROUND_UP(oprsz
, 8); ++i
) {
3962 uint64_t this_b
, this_g
= g
[i
];
3964 brk
= compute_brk(&this_b
, n
[i
], this_g
, brk
, after
);
3965 d
[i
] = (this_b
& this_g
) | (d
[i
] & ~this_g
);
3969 /* Likewise, but also compute flags. */
3970 static uint32_t compute_brks_m(uint64_t *d
, uint64_t *n
, uint64_t *g
,
3971 intptr_t oprsz
, bool after
)
3973 uint32_t flags
= PREDTEST_INIT
;
3977 for (i
= 0; i
< oprsz
/ 8; ++i
) {
3978 uint64_t this_b
, this_d
= d
[i
], this_g
= g
[i
];
3980 brk
= compute_brk(&this_b
, n
[i
], this_g
, brk
, after
);
3981 d
[i
] = this_d
= (this_b
& this_g
) | (this_d
& ~this_g
);
3982 flags
= iter_predtest_fwd(this_d
, this_g
, flags
);
3987 static uint32_t do_zero(ARMPredicateReg
*d
, intptr_t oprsz
)
3989 /* It is quicker to zero the whole predicate than loop on OPRSZ.
3990 * The compiler should turn this into 4 64-bit integer stores.
3992 memset(d
, 0, sizeof(ARMPredicateReg
));
3993 return PREDTEST_INIT
;
3996 void HELPER(sve_brkpa
)(void *vd
, void *vn
, void *vm
, void *vg
,
3999 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4000 if (last_active_pred(vn
, vg
, oprsz
)) {
4001 compute_brk_z(vd
, vm
, vg
, oprsz
, true);
4007 uint32_t HELPER(sve_brkpas
)(void *vd
, void *vn
, void *vm
, void *vg
,
4010 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4011 if (last_active_pred(vn
, vg
, oprsz
)) {
4012 return compute_brks_z(vd
, vm
, vg
, oprsz
, true);
4014 return do_zero(vd
, oprsz
);
4018 void HELPER(sve_brkpb
)(void *vd
, void *vn
, void *vm
, void *vg
,
4021 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4022 if (last_active_pred(vn
, vg
, oprsz
)) {
4023 compute_brk_z(vd
, vm
, vg
, oprsz
, false);
4029 uint32_t HELPER(sve_brkpbs
)(void *vd
, void *vn
, void *vm
, void *vg
,
4032 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4033 if (last_active_pred(vn
, vg
, oprsz
)) {
4034 return compute_brks_z(vd
, vm
, vg
, oprsz
, false);
4036 return do_zero(vd
, oprsz
);
4040 void HELPER(sve_brka_z
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
4042 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4043 compute_brk_z(vd
, vn
, vg
, oprsz
, true);
4046 uint32_t HELPER(sve_brkas_z
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
4048 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4049 return compute_brks_z(vd
, vn
, vg
, oprsz
, true);
4052 void HELPER(sve_brkb_z
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
4054 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4055 compute_brk_z(vd
, vn
, vg
, oprsz
, false);
4058 uint32_t HELPER(sve_brkbs_z
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
4060 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4061 return compute_brks_z(vd
, vn
, vg
, oprsz
, false);
4064 void HELPER(sve_brka_m
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
4066 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4067 compute_brk_m(vd
, vn
, vg
, oprsz
, true);
4070 uint32_t HELPER(sve_brkas_m
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
4072 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4073 return compute_brks_m(vd
, vn
, vg
, oprsz
, true);
4076 void HELPER(sve_brkb_m
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
4078 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4079 compute_brk_m(vd
, vn
, vg
, oprsz
, false);
4082 uint32_t HELPER(sve_brkbs_m
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
4084 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4085 return compute_brks_m(vd
, vn
, vg
, oprsz
, false);
4088 void HELPER(sve_brkn
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
4090 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4091 if (!last_active_pred(vn
, vg
, oprsz
)) {
4096 /* As if PredTest(Ones(PL), D, esz). */
4097 static uint32_t predtest_ones(ARMPredicateReg
*d
, intptr_t oprsz
,
4100 uint32_t flags
= PREDTEST_INIT
;
4103 for (i
= 0; i
< oprsz
/ 8; i
++) {
4104 flags
= iter_predtest_fwd(d
->p
[i
], esz_mask
, flags
);
4107 uint64_t mask
= ~(-1ULL << (8 * (oprsz
& 7)));
4108 flags
= iter_predtest_fwd(d
->p
[i
], esz_mask
& mask
, flags
);
4113 uint32_t HELPER(sve_brkns
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
4115 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4116 if (last_active_pred(vn
, vg
, oprsz
)) {
4117 return predtest_ones(vd
, oprsz
, -1);
4119 return do_zero(vd
, oprsz
);
4123 uint64_t HELPER(sve_cntp
)(void *vn
, void *vg
, uint32_t pred_desc
)
4125 intptr_t words
= DIV_ROUND_UP(FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
), 8);
4126 intptr_t esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
4127 uint64_t *n
= vn
, *g
= vg
, sum
= 0, mask
= pred_esz_masks
[esz
];
4130 for (i
= 0; i
< words
; ++i
) {
4131 uint64_t t
= n
[i
] & g
[i
] & mask
;
4137 uint32_t HELPER(sve_whilel
)(void *vd
, uint32_t count
, uint32_t pred_desc
)
4139 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4140 intptr_t esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
4141 uint64_t esz_mask
= pred_esz_masks
[esz
];
4142 ARMPredicateReg
*d
= vd
;
4146 /* Begin with a zero predicate register. */
4147 flags
= do_zero(d
, oprsz
);
4152 /* Set all of the requested bits. */
4153 for (i
= 0; i
< count
/ 64; ++i
) {
4157 d
->p
[i
] = MAKE_64BIT_MASK(0, count
& 63) & esz_mask
;
4160 return predtest_ones(d
, oprsz
, esz_mask
);
4163 uint32_t HELPER(sve_whileg
)(void *vd
, uint32_t count
, uint32_t pred_desc
)
4165 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4166 intptr_t esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
4167 uint64_t esz_mask
= pred_esz_masks
[esz
];
4168 ARMPredicateReg
*d
= vd
;
4169 intptr_t i
, invcount
, oprbits
;
4173 return do_zero(d
, oprsz
);
4176 oprbits
= oprsz
* 8;
4177 tcg_debug_assert(count
<= oprbits
);
4181 bits
&= MAKE_64BIT_MASK(0, oprbits
& 63);
4184 invcount
= oprbits
- count
;
4185 for (i
= (oprsz
- 1) / 8; i
> invcount
/ 64; --i
) {
4190 d
->p
[i
] = bits
& MAKE_64BIT_MASK(invcount
& 63, 64);
4196 return predtest_ones(d
, oprsz
, esz_mask
);
/* Recursive reduction on a function;
 * C.f. the ARM ARM function ReducePredicated.
 *
 * While it would be possible to write this without the DATA temporary,
 * it is much simpler to process the predicate register this way.
 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
 * little to gain with a more complex non-recursive form.
 */
#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                             \
static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n)  \
{                                                                         \
    if (n == 1) {                                                         \
        return *data;                                                     \
    } else {                                                              \
        uintptr_t half = n / 2;                                           \
        TYPE lo = NAME##_reduce(data, status, half);                      \
        TYPE hi = NAME##_reduce(data + half, status, half);               \
        return TYPE##_##FUNC(lo, hi, status);                             \
    }                                                                     \
}                                                                         \
uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)        \
{                                                                         \
    uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);       \
    TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                       \
    /* Gather active elements into DATA, inactive lanes = IDENT. */       \
    for (i = 0; i < oprsz; ) {                                            \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                   \
        do {                                                              \
            TYPE nn = *(TYPE *)(vn + H(i));                               \
            *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);          \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 15);                                                 \
    }                                                                     \
    /* Pad to the power-of-two MAXSZ with the identity. */                \
    for (; i < maxsz; i += sizeof(TYPE)) {                                \
        *(TYPE *)((void *)data + i) = IDENT;                              \
    }                                                                     \
    return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));                 \
}
4237 DO_REDUCE(sve_faddv_h
, float16
, H1_2
, add
, float16_zero
)
4238 DO_REDUCE(sve_faddv_s
, float32
, H1_4
, add
, float32_zero
)
4239 DO_REDUCE(sve_faddv_d
, float64
, H1_8
, add
, float64_zero
)
4241 /* Identity is floatN_default_nan, without the function call. */
4242 DO_REDUCE(sve_fminnmv_h
, float16
, H1_2
, minnum
, 0x7E00)
4243 DO_REDUCE(sve_fminnmv_s
, float32
, H1_4
, minnum
, 0x7FC00000)
4244 DO_REDUCE(sve_fminnmv_d
, float64
, H1_8
, minnum
, 0x7FF8000000000000ULL
)
4246 DO_REDUCE(sve_fmaxnmv_h
, float16
, H1_2
, maxnum
, 0x7E00)
4247 DO_REDUCE(sve_fmaxnmv_s
, float32
, H1_4
, maxnum
, 0x7FC00000)
4248 DO_REDUCE(sve_fmaxnmv_d
, float64
, H1_8
, maxnum
, 0x7FF8000000000000ULL
)
4250 DO_REDUCE(sve_fminv_h
, float16
, H1_2
, min
, float16_infinity
)
4251 DO_REDUCE(sve_fminv_s
, float32
, H1_4
, min
, float32_infinity
)
4252 DO_REDUCE(sve_fminv_d
, float64
, H1_8
, min
, float64_infinity
)
4254 DO_REDUCE(sve_fmaxv_h
, float16
, H1_2
, max
, float16_chs(float16_infinity
))
4255 DO_REDUCE(sve_fmaxv_s
, float32
, H1_4
, max
, float32_chs(float32_infinity
))
4256 DO_REDUCE(sve_fmaxv_d
, float64
, H1_8
, max
, float64_chs(float64_infinity
))
4260 uint64_t HELPER(sve_fadda_h
)(uint64_t nn
, void *vm
, void *vg
,
4261 void *status
, uint32_t desc
)
4263 intptr_t i
= 0, opr_sz
= simd_oprsz(desc
);
4264 float16 result
= nn
;
4267 uint16_t pg
= *(uint16_t *)(vg
+ H1_2(i
>> 3));
4270 float16 mm
= *(float16
*)(vm
+ H1_2(i
));
4271 result
= float16_add(result
, mm
, status
);
4273 i
+= sizeof(float16
), pg
>>= sizeof(float16
);
4275 } while (i
< opr_sz
);
4280 uint64_t HELPER(sve_fadda_s
)(uint64_t nn
, void *vm
, void *vg
,
4281 void *status
, uint32_t desc
)
4283 intptr_t i
= 0, opr_sz
= simd_oprsz(desc
);
4284 float32 result
= nn
;
4287 uint16_t pg
= *(uint16_t *)(vg
+ H1_2(i
>> 3));
4290 float32 mm
= *(float32
*)(vm
+ H1_2(i
));
4291 result
= float32_add(result
, mm
, status
);
4293 i
+= sizeof(float32
), pg
>>= sizeof(float32
);
4295 } while (i
< opr_sz
);
4300 uint64_t HELPER(sve_fadda_d
)(uint64_t nn
, void *vm
, void *vg
,
4301 void *status
, uint32_t desc
)
4303 intptr_t i
= 0, opr_sz
= simd_oprsz(desc
) / 8;
4307 for (i
= 0; i
< opr_sz
; i
++) {
4308 if (pg
[H1(i
)] & 1) {
4309 nn
= float64_add(nn
, m
[i
], status
);
/* Fully general three-operand expander, controlled by a predicate,
 * With the extra float_status parameter.
 */
#define DO_ZPZZ_FP(NAME, TYPE, H, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,         \
                  void *status, uint32_t desc)                    \
{                                                                 \
    intptr_t i = simd_oprsz(desc);                                \
    uint64_t *g = vg;                                             \
    do {                                                          \
        uint64_t pg = g[(i - 1) >> 6];                            \
        do {                                                      \
            i -= sizeof(TYPE);                                    \
            if (likely((pg >> (i & 63)) & 1)) {                   \
                TYPE nn = *(TYPE *)(vn + H(i));                   \
                TYPE mm = *(TYPE *)(vm + H(i));                   \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
            }                                                     \
        } while (i & 63);                                         \
    } while (i != 0);                                             \
}
4338 DO_ZPZZ_FP(sve_fadd_h
, uint16_t, H1_2
, float16_add
)
4339 DO_ZPZZ_FP(sve_fadd_s
, uint32_t, H1_4
, float32_add
)
4340 DO_ZPZZ_FP(sve_fadd_d
, uint64_t, H1_8
, float64_add
)
4342 DO_ZPZZ_FP(sve_fsub_h
, uint16_t, H1_2
, float16_sub
)
4343 DO_ZPZZ_FP(sve_fsub_s
, uint32_t, H1_4
, float32_sub
)
4344 DO_ZPZZ_FP(sve_fsub_d
, uint64_t, H1_8
, float64_sub
)
4346 DO_ZPZZ_FP(sve_fmul_h
, uint16_t, H1_2
, float16_mul
)
4347 DO_ZPZZ_FP(sve_fmul_s
, uint32_t, H1_4
, float32_mul
)
4348 DO_ZPZZ_FP(sve_fmul_d
, uint64_t, H1_8
, float64_mul
)
4350 DO_ZPZZ_FP(sve_fdiv_h
, uint16_t, H1_2
, float16_div
)
4351 DO_ZPZZ_FP(sve_fdiv_s
, uint32_t, H1_4
, float32_div
)
4352 DO_ZPZZ_FP(sve_fdiv_d
, uint64_t, H1_8
, float64_div
)
4354 DO_ZPZZ_FP(sve_fmin_h
, uint16_t, H1_2
, float16_min
)
4355 DO_ZPZZ_FP(sve_fmin_s
, uint32_t, H1_4
, float32_min
)
4356 DO_ZPZZ_FP(sve_fmin_d
, uint64_t, H1_8
, float64_min
)
4358 DO_ZPZZ_FP(sve_fmax_h
, uint16_t, H1_2
, float16_max
)
4359 DO_ZPZZ_FP(sve_fmax_s
, uint32_t, H1_4
, float32_max
)
4360 DO_ZPZZ_FP(sve_fmax_d
, uint64_t, H1_8
, float64_max
)
4362 DO_ZPZZ_FP(sve_fminnum_h
, uint16_t, H1_2
, float16_minnum
)
4363 DO_ZPZZ_FP(sve_fminnum_s
, uint32_t, H1_4
, float32_minnum
)
4364 DO_ZPZZ_FP(sve_fminnum_d
, uint64_t, H1_8
, float64_minnum
)
4366 DO_ZPZZ_FP(sve_fmaxnum_h
, uint16_t, H1_2
, float16_maxnum
)
4367 DO_ZPZZ_FP(sve_fmaxnum_s
, uint32_t, H1_4
, float32_maxnum
)
4368 DO_ZPZZ_FP(sve_fmaxnum_d
, uint64_t, H1_8
, float64_maxnum
)
4370 static inline float16
abd_h(float16 a
, float16 b
, float_status
*s
)
4372 return float16_abs(float16_sub(a
, b
, s
));
4375 static inline float32
abd_s(float32 a
, float32 b
, float_status
*s
)
4377 return float32_abs(float32_sub(a
, b
, s
));
4380 static inline float64
abd_d(float64 a
, float64 b
, float_status
*s
)
4382 return float64_abs(float64_sub(a
, b
, s
));
4385 DO_ZPZZ_FP(sve_fabd_h
, uint16_t, H1_2
, abd_h
)
4386 DO_ZPZZ_FP(sve_fabd_s
, uint32_t, H1_4
, abd_s
)
4387 DO_ZPZZ_FP(sve_fabd_d
, uint64_t, H1_8
, abd_d
)
4389 static inline float64
scalbn_d(float64 a
, int64_t b
, float_status
*s
)
4391 int b_int
= MIN(MAX(b
, INT_MIN
), INT_MAX
);
4392 return float64_scalbn(a
, b_int
, s
);
4395 DO_ZPZZ_FP(sve_fscalbn_h
, int16_t, H1_2
, float16_scalbn
)
4396 DO_ZPZZ_FP(sve_fscalbn_s
, int32_t, H1_4
, float32_scalbn
)
4397 DO_ZPZZ_FP(sve_fscalbn_d
, int64_t, H1_8
, scalbn_d
)
4399 DO_ZPZZ_FP(sve_fmulx_h
, uint16_t, H1_2
, helper_advsimd_mulxh
)
4400 DO_ZPZZ_FP(sve_fmulx_s
, uint32_t, H1_4
, helper_vfp_mulxs
)
4401 DO_ZPZZ_FP(sve_fmulx_d
, uint64_t, H1_8
, helper_vfp_mulxd
)
/* Three-operand expander, with one scalar operand, controlled by
 * a predicate, with the extra float_status parameter.
 */
#define DO_ZPZS_FP(NAME, TYPE, H, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
                  void *status, uint32_t desc)                    \
{                                                                 \
    intptr_t i = simd_oprsz(desc);                                \
    uint64_t *g = vg;                                             \
    TYPE mm = scalar;                                             \
    do {                                                          \
        uint64_t pg = g[(i - 1) >> 6];                            \
        do {                                                      \
            i -= sizeof(TYPE);                                    \
            if (likely((pg >> (i & 63)) & 1)) {                   \
                TYPE nn = *(TYPE *)(vn + H(i));                   \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
            }                                                     \
        } while (i & 63);                                         \
    } while (i != 0);                                             \
}
4427 DO_ZPZS_FP(sve_fadds_h
, float16
, H1_2
, float16_add
)
4428 DO_ZPZS_FP(sve_fadds_s
, float32
, H1_4
, float32_add
)
4429 DO_ZPZS_FP(sve_fadds_d
, float64
, H1_8
, float64_add
)
4431 DO_ZPZS_FP(sve_fsubs_h
, float16
, H1_2
, float16_sub
)
4432 DO_ZPZS_FP(sve_fsubs_s
, float32
, H1_4
, float32_sub
)
4433 DO_ZPZS_FP(sve_fsubs_d
, float64
, H1_8
, float64_sub
)
4435 DO_ZPZS_FP(sve_fmuls_h
, float16
, H1_2
, float16_mul
)
4436 DO_ZPZS_FP(sve_fmuls_s
, float32
, H1_4
, float32_mul
)
4437 DO_ZPZS_FP(sve_fmuls_d
, float64
, H1_8
, float64_mul
)
4439 static inline float16
subr_h(float16 a
, float16 b
, float_status
*s
)
4441 return float16_sub(b
, a
, s
);
4444 static inline float32
subr_s(float32 a
, float32 b
, float_status
*s
)
4446 return float32_sub(b
, a
, s
);
4449 static inline float64
subr_d(float64 a
, float64 b
, float_status
*s
)
4451 return float64_sub(b
, a
, s
);
4454 DO_ZPZS_FP(sve_fsubrs_h
, float16
, H1_2
, subr_h
)
4455 DO_ZPZS_FP(sve_fsubrs_s
, float32
, H1_4
, subr_s
)
4456 DO_ZPZS_FP(sve_fsubrs_d
, float64
, H1_8
, subr_d
)
4458 DO_ZPZS_FP(sve_fmaxnms_h
, float16
, H1_2
, float16_maxnum
)
4459 DO_ZPZS_FP(sve_fmaxnms_s
, float32
, H1_4
, float32_maxnum
)
4460 DO_ZPZS_FP(sve_fmaxnms_d
, float64
, H1_8
, float64_maxnum
)
4462 DO_ZPZS_FP(sve_fminnms_h
, float16
, H1_2
, float16_minnum
)
4463 DO_ZPZS_FP(sve_fminnms_s
, float32
, H1_4
, float32_minnum
)
4464 DO_ZPZS_FP(sve_fminnms_d
, float64
, H1_8
, float64_minnum
)
4466 DO_ZPZS_FP(sve_fmaxs_h
, float16
, H1_2
, float16_max
)
4467 DO_ZPZS_FP(sve_fmaxs_s
, float32
, H1_4
, float32_max
)
4468 DO_ZPZS_FP(sve_fmaxs_d
, float64
, H1_8
, float64_max
)
4470 DO_ZPZS_FP(sve_fmins_h
, float16
, H1_2
, float16_min
)
4471 DO_ZPZS_FP(sve_fmins_s
, float32
, H1_4
, float32_min
)
4472 DO_ZPZS_FP(sve_fmins_d
, float64
, H1_8
, float64_min
)
/* Fully general two-operand expander, controlled by a predicate,
 * With the extra float_status parameter.
 */
#define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{                                                                     \
    intptr_t i = simd_oprsz(desc);                                    \
    uint64_t *g = vg;                                                 \
    do {                                                              \
        uint64_t pg = g[(i - 1) >> 6];                                \
        do {                                                          \
            i -= sizeof(TYPE);                                        \
            if (likely((pg >> (i & 63)) & 1)) {                       \
                TYPE nn = *(TYPE *)(vn + H(i));                       \
                *(TYPE *)(vd + H(i)) = OP(nn, status);                \
            }                                                         \
        } while (i & 63);                                             \
    } while (i != 0);                                                 \
}
4494 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
4495 * FZ16. When converting from fp16, this affects flushing input denormals;
4496 * when converting to fp16, this affects flushing output denormals.
4498 static inline float32
sve_f16_to_f32(float16 f
, float_status
*fpst
)
4500 bool save
= get_flush_inputs_to_zero(fpst
);
4503 set_flush_inputs_to_zero(false, fpst
);
4504 ret
= float16_to_float32(f
, true, fpst
);
4505 set_flush_inputs_to_zero(save
, fpst
);
4509 static inline float64
sve_f16_to_f64(float16 f
, float_status
*fpst
)
4511 bool save
= get_flush_inputs_to_zero(fpst
);
4514 set_flush_inputs_to_zero(false, fpst
);
4515 ret
= float16_to_float64(f
, true, fpst
);
4516 set_flush_inputs_to_zero(save
, fpst
);
4520 static inline float16
sve_f32_to_f16(float32 f
, float_status
*fpst
)
4522 bool save
= get_flush_to_zero(fpst
);
4525 set_flush_to_zero(false, fpst
);
4526 ret
= float32_to_float16(f
, true, fpst
);
4527 set_flush_to_zero(save
, fpst
);
4531 static inline float16
sve_f64_to_f16(float64 f
, float_status
*fpst
)
4533 bool save
= get_flush_to_zero(fpst
);
4536 set_flush_to_zero(false, fpst
);
4537 ret
= float64_to_float16(f
, true, fpst
);
4538 set_flush_to_zero(save
, fpst
);
4542 static inline int16_t vfp_float16_to_int16_rtz(float16 f
, float_status
*s
)
4544 if (float16_is_any_nan(f
)) {
4545 float_raise(float_flag_invalid
, s
);
4548 return float16_to_int16_round_to_zero(f
, s
);
4551 static inline int64_t vfp_float16_to_int64_rtz(float16 f
, float_status
*s
)
4553 if (float16_is_any_nan(f
)) {
4554 float_raise(float_flag_invalid
, s
);
4557 return float16_to_int64_round_to_zero(f
, s
);
4560 static inline int64_t vfp_float32_to_int64_rtz(float32 f
, float_status
*s
)
4562 if (float32_is_any_nan(f
)) {
4563 float_raise(float_flag_invalid
, s
);
4566 return float32_to_int64_round_to_zero(f
, s
);
4569 static inline int64_t vfp_float64_to_int64_rtz(float64 f
, float_status
*s
)
4571 if (float64_is_any_nan(f
)) {
4572 float_raise(float_flag_invalid
, s
);
4575 return float64_to_int64_round_to_zero(f
, s
);
4578 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f
, float_status
*s
)
4580 if (float16_is_any_nan(f
)) {
4581 float_raise(float_flag_invalid
, s
);
4584 return float16_to_uint16_round_to_zero(f
, s
);
4587 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f
, float_status
*s
)
4589 if (float16_is_any_nan(f
)) {
4590 float_raise(float_flag_invalid
, s
);
4593 return float16_to_uint64_round_to_zero(f
, s
);
4596 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f
, float_status
*s
)
4598 if (float32_is_any_nan(f
)) {
4599 float_raise(float_flag_invalid
, s
);
4602 return float32_to_uint64_round_to_zero(f
, s
);
4605 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f
, float_status
*s
)
4607 if (float64_is_any_nan(f
)) {
4608 float_raise(float_flag_invalid
, s
);
4611 return float64_to_uint64_round_to_zero(f
, s
);
4614 DO_ZPZ_FP(sve_fcvt_sh
, uint32_t, H1_4
, sve_f32_to_f16
)
4615 DO_ZPZ_FP(sve_fcvt_hs
, uint32_t, H1_4
, sve_f16_to_f32
)
4616 DO_ZPZ_FP(sve_bfcvt
, uint32_t, H1_4
, float32_to_bfloat16
)
4617 DO_ZPZ_FP(sve_fcvt_dh
, uint64_t, H1_8
, sve_f64_to_f16
)
4618 DO_ZPZ_FP(sve_fcvt_hd
, uint64_t, H1_8
, sve_f16_to_f64
)
4619 DO_ZPZ_FP(sve_fcvt_ds
, uint64_t, H1_8
, float64_to_float32
)
4620 DO_ZPZ_FP(sve_fcvt_sd
, uint64_t, H1_8
, float32_to_float64
)
4622 DO_ZPZ_FP(sve_fcvtzs_hh
, uint16_t, H1_2
, vfp_float16_to_int16_rtz
)
4623 DO_ZPZ_FP(sve_fcvtzs_hs
, uint32_t, H1_4
, helper_vfp_tosizh
)
4624 DO_ZPZ_FP(sve_fcvtzs_ss
, uint32_t, H1_4
, helper_vfp_tosizs
)
4625 DO_ZPZ_FP(sve_fcvtzs_hd
, uint64_t, H1_8
, vfp_float16_to_int64_rtz
)
4626 DO_ZPZ_FP(sve_fcvtzs_sd
, uint64_t, H1_8
, vfp_float32_to_int64_rtz
)
4627 DO_ZPZ_FP(sve_fcvtzs_ds
, uint64_t, H1_8
, helper_vfp_tosizd
)
4628 DO_ZPZ_FP(sve_fcvtzs_dd
, uint64_t, H1_8
, vfp_float64_to_int64_rtz
)
4630 DO_ZPZ_FP(sve_fcvtzu_hh
, uint16_t, H1_2
, vfp_float16_to_uint16_rtz
)
4631 DO_ZPZ_FP(sve_fcvtzu_hs
, uint32_t, H1_4
, helper_vfp_touizh
)
4632 DO_ZPZ_FP(sve_fcvtzu_ss
, uint32_t, H1_4
, helper_vfp_touizs
)
4633 DO_ZPZ_FP(sve_fcvtzu_hd
, uint64_t, H1_8
, vfp_float16_to_uint64_rtz
)
4634 DO_ZPZ_FP(sve_fcvtzu_sd
, uint64_t, H1_8
, vfp_float32_to_uint64_rtz
)
4635 DO_ZPZ_FP(sve_fcvtzu_ds
, uint64_t, H1_8
, helper_vfp_touizd
)
4636 DO_ZPZ_FP(sve_fcvtzu_dd
, uint64_t, H1_8
, vfp_float64_to_uint64_rtz
)
4638 DO_ZPZ_FP(sve_frint_h
, uint16_t, H1_2
, helper_advsimd_rinth
)
4639 DO_ZPZ_FP(sve_frint_s
, uint32_t, H1_4
, helper_rints
)
4640 DO_ZPZ_FP(sve_frint_d
, uint64_t, H1_8
, helper_rintd
)
4642 DO_ZPZ_FP(sve_frintx_h
, uint16_t, H1_2
, float16_round_to_int
)
4643 DO_ZPZ_FP(sve_frintx_s
, uint32_t, H1_4
, float32_round_to_int
)
4644 DO_ZPZ_FP(sve_frintx_d
, uint64_t, H1_8
, float64_round_to_int
)
4646 DO_ZPZ_FP(sve_frecpx_h
, uint16_t, H1_2
, helper_frecpx_f16
)
4647 DO_ZPZ_FP(sve_frecpx_s
, uint32_t, H1_4
, helper_frecpx_f32
)
4648 DO_ZPZ_FP(sve_frecpx_d
, uint64_t, H1_8
, helper_frecpx_f64
)
4650 DO_ZPZ_FP(sve_fsqrt_h
, uint16_t, H1_2
, float16_sqrt
)
4651 DO_ZPZ_FP(sve_fsqrt_s
, uint32_t, H1_4
, float32_sqrt
)
4652 DO_ZPZ_FP(sve_fsqrt_d
, uint64_t, H1_8
, float64_sqrt
)
4654 DO_ZPZ_FP(sve_scvt_hh
, uint16_t, H1_2
, int16_to_float16
)
4655 DO_ZPZ_FP(sve_scvt_sh
, uint32_t, H1_4
, int32_to_float16
)
4656 DO_ZPZ_FP(sve_scvt_ss
, uint32_t, H1_4
, int32_to_float32
)
4657 DO_ZPZ_FP(sve_scvt_sd
, uint64_t, H1_8
, int32_to_float64
)
4658 DO_ZPZ_FP(sve_scvt_dh
, uint64_t, H1_8
, int64_to_float16
)
4659 DO_ZPZ_FP(sve_scvt_ds
, uint64_t, H1_8
, int64_to_float32
)
4660 DO_ZPZ_FP(sve_scvt_dd
, uint64_t, H1_8
, int64_to_float64
)
4662 DO_ZPZ_FP(sve_ucvt_hh
, uint16_t, H1_2
, uint16_to_float16
)
4663 DO_ZPZ_FP(sve_ucvt_sh
, uint32_t, H1_4
, uint32_to_float16
)
4664 DO_ZPZ_FP(sve_ucvt_ss
, uint32_t, H1_4
, uint32_to_float32
)
4665 DO_ZPZ_FP(sve_ucvt_sd
, uint64_t, H1_8
, uint32_to_float64
)
4666 DO_ZPZ_FP(sve_ucvt_dh
, uint64_t, H1_8
, uint64_to_float16
)
4667 DO_ZPZ_FP(sve_ucvt_ds
, uint64_t, H1_8
, uint64_to_float32
)
4668 DO_ZPZ_FP(sve_ucvt_dd
, uint64_t, H1_8
, uint64_to_float64
)
4670 static int16_t do_float16_logb_as_int(float16 a
, float_status
*s
)
4672 /* Extract frac to the top of the uint32_t. */
4673 uint32_t frac
= (uint32_t)a
<< (16 + 6);
4674 int16_t exp
= extract32(a
, 10, 5);
4676 if (unlikely(exp
== 0)) {
4678 if (!get_flush_inputs_to_zero(s
)) {
4679 /* denormal: bias - fractional_zeros */
4680 return -15 - clz32(frac
);
4683 float_raise(float_flag_input_denormal
, s
);
4685 } else if (unlikely(exp
== 0x1f)) {
4687 return INT16_MAX
; /* infinity */
4690 /* normal: exp - bias */
4694 float_raise(float_flag_invalid
, s
);
4698 static int32_t do_float32_logb_as_int(float32 a
, float_status
*s
)
4700 /* Extract frac to the top of the uint32_t. */
4701 uint32_t frac
= a
<< 9;
4702 int32_t exp
= extract32(a
, 23, 8);
4704 if (unlikely(exp
== 0)) {
4706 if (!get_flush_inputs_to_zero(s
)) {
4707 /* denormal: bias - fractional_zeros */
4708 return -127 - clz32(frac
);
4711 float_raise(float_flag_input_denormal
, s
);
4713 } else if (unlikely(exp
== 0xff)) {
4715 return INT32_MAX
; /* infinity */
4718 /* normal: exp - bias */
4722 float_raise(float_flag_invalid
, s
);
4726 static int64_t do_float64_logb_as_int(float64 a
, float_status
*s
)
4728 /* Extract frac to the top of the uint64_t. */
4729 uint64_t frac
= a
<< 12;
4730 int64_t exp
= extract64(a
, 52, 11);
4732 if (unlikely(exp
== 0)) {
4734 if (!get_flush_inputs_to_zero(s
)) {
4735 /* denormal: bias - fractional_zeros */
4736 return -1023 - clz64(frac
);
4739 float_raise(float_flag_input_denormal
, s
);
4741 } else if (unlikely(exp
== 0x7ff)) {
4743 return INT64_MAX
; /* infinity */
4746 /* normal: exp - bias */
4750 float_raise(float_flag_invalid
, s
);
4754 DO_ZPZ_FP(flogb_h
, float16
, H1_2
, do_float16_logb_as_int
)
4755 DO_ZPZ_FP(flogb_s
, float32
, H1_4
, do_float32_logb_as_int
)
4756 DO_ZPZ_FP(flogb_d
, float64
, H1_8
, do_float64_logb_as_int
)
4760 static void do_fmla_zpzzz_h(void *vd
, void *vn
, void *vm
, void *va
, void *vg
,
4761 float_status
*status
, uint32_t desc
,
4762 uint16_t neg1
, uint16_t neg3
)
4764 intptr_t i
= simd_oprsz(desc
);
4768 uint64_t pg
= g
[(i
- 1) >> 6];
4771 if (likely((pg
>> (i
& 63)) & 1)) {
4772 float16 e1
, e2
, e3
, r
;
4774 e1
= *(uint16_t *)(vn
+ H1_2(i
)) ^ neg1
;
4775 e2
= *(uint16_t *)(vm
+ H1_2(i
));
4776 e3
= *(uint16_t *)(va
+ H1_2(i
)) ^ neg3
;
4777 r
= float16_muladd(e1
, e2
, e3
, 0, status
);
4778 *(uint16_t *)(vd
+ H1_2(i
)) = r
;
4784 void HELPER(sve_fmla_zpzzz_h
)(void *vd
, void *vn
, void *vm
, void *va
,
4785 void *vg
, void *status
, uint32_t desc
)
4787 do_fmla_zpzzz_h(vd
, vn
, vm
, va
, vg
, status
, desc
, 0, 0);
4790 void HELPER(sve_fmls_zpzzz_h
)(void *vd
, void *vn
, void *vm
, void *va
,
4791 void *vg
, void *status
, uint32_t desc
)
4793 do_fmla_zpzzz_h(vd
, vn
, vm
, va
, vg
, status
, desc
, 0x8000, 0);
4796 void HELPER(sve_fnmla_zpzzz_h
)(void *vd
, void *vn
, void *vm
, void *va
,
4797 void *vg
, void *status
, uint32_t desc
)
4799 do_fmla_zpzzz_h(vd
, vn
, vm
, va
, vg
, status
, desc
, 0x8000, 0x8000);
4802 void HELPER(sve_fnmls_zpzzz_h
)(void *vd
, void *vn
, void *vm
, void *va
,
4803 void *vg
, void *status
, uint32_t desc
)
4805 do_fmla_zpzzz_h(vd
, vn
, vm
, va
, vg
, status
, desc
, 0, 0x8000);
4808 static void do_fmla_zpzzz_s(void *vd
, void *vn
, void *vm
, void *va
, void *vg
,
4809 float_status
*status
, uint32_t desc
,
4810 uint32_t neg1
, uint32_t neg3
)
4812 intptr_t i
= simd_oprsz(desc
);
4816 uint64_t pg
= g
[(i
- 1) >> 6];
4819 if (likely((pg
>> (i
& 63)) & 1)) {
4820 float32 e1
, e2
, e3
, r
;
4822 e1
= *(uint32_t *)(vn
+ H1_4(i
)) ^ neg1
;
4823 e2
= *(uint32_t *)(vm
+ H1_4(i
));
4824 e3
= *(uint32_t *)(va
+ H1_4(i
)) ^ neg3
;
4825 r
= float32_muladd(e1
, e2
, e3
, 0, status
);
4826 *(uint32_t *)(vd
+ H1_4(i
)) = r
;
4832 void HELPER(sve_fmla_zpzzz_s
)(void *vd
, void *vn
, void *vm
, void *va
,
4833 void *vg
, void *status
, uint32_t desc
)
4835 do_fmla_zpzzz_s(vd
, vn
, vm
, va
, vg
, status
, desc
, 0, 0);
4838 void HELPER(sve_fmls_zpzzz_s
)(void *vd
, void *vn
, void *vm
, void *va
,
4839 void *vg
, void *status
, uint32_t desc
)
4841 do_fmla_zpzzz_s(vd
, vn
, vm
, va
, vg
, status
, desc
, 0x80000000, 0);
4844 void HELPER(sve_fnmla_zpzzz_s
)(void *vd
, void *vn
, void *vm
, void *va
,
4845 void *vg
, void *status
, uint32_t desc
)
4847 do_fmla_zpzzz_s(vd
, vn
, vm
, va
, vg
, status
, desc
, 0x80000000, 0x80000000);
4850 void HELPER(sve_fnmls_zpzzz_s
)(void *vd
, void *vn
, void *vm
, void *va
,
4851 void *vg
, void *status
, uint32_t desc
)
4853 do_fmla_zpzzz_s(vd
, vn
, vm
, va
, vg
, status
, desc
, 0, 0x80000000);
4856 static void do_fmla_zpzzz_d(void *vd
, void *vn
, void *vm
, void *va
, void *vg
,
4857 float_status
*status
, uint32_t desc
,
4858 uint64_t neg1
, uint64_t neg3
)
4860 intptr_t i
= simd_oprsz(desc
);
4864 uint64_t pg
= g
[(i
- 1) >> 6];
4867 if (likely((pg
>> (i
& 63)) & 1)) {
4868 float64 e1
, e2
, e3
, r
;
4870 e1
= *(uint64_t *)(vn
+ i
) ^ neg1
;
4871 e2
= *(uint64_t *)(vm
+ i
);
4872 e3
= *(uint64_t *)(va
+ i
) ^ neg3
;
4873 r
= float64_muladd(e1
, e2
, e3
, 0, status
);
4874 *(uint64_t *)(vd
+ i
) = r
;
4880 void HELPER(sve_fmla_zpzzz_d
)(void *vd
, void *vn
, void *vm
, void *va
,
4881 void *vg
, void *status
, uint32_t desc
)
4883 do_fmla_zpzzz_d(vd
, vn
, vm
, va
, vg
, status
, desc
, 0, 0);
4886 void HELPER(sve_fmls_zpzzz_d
)(void *vd
, void *vn
, void *vm
, void *va
,
4887 void *vg
, void *status
, uint32_t desc
)
4889 do_fmla_zpzzz_d(vd
, vn
, vm
, va
, vg
, status
, desc
, INT64_MIN
, 0);
4892 void HELPER(sve_fnmla_zpzzz_d
)(void *vd
, void *vn
, void *vm
, void *va
,
4893 void *vg
, void *status
, uint32_t desc
)
4895 do_fmla_zpzzz_d(vd
, vn
, vm
, va
, vg
, status
, desc
, INT64_MIN
, INT64_MIN
);
4898 void HELPER(sve_fnmls_zpzzz_d
)(void *vd
, void *vn
, void *vm
, void *va
,
4899 void *vg
, void *status
, uint32_t desc
)
4901 do_fmla_zpzzz_d(vd
, vn
, vm
, va
, vg
, status
, desc
, 0, INT64_MIN
);
4904 /* Two operand floating-point comparison controlled by a predicate.
4905 * Unlike the integer version, we are not allowed to optimistically
4906 * compare operands, since the comparison may have side effects wrt
4909 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
4910 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4911 void *status, uint32_t desc) \
4913 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4914 uint64_t *d = vd, *g = vg; \
4916 uint64_t out = 0, pg = g[j]; \
4918 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4919 if (likely((pg >> (i & 63)) & 1)) { \
4920 TYPE nn = *(TYPE *)(vn + H(i)); \
4921 TYPE mm = *(TYPE *)(vm + H(i)); \
4922 out |= OP(TYPE, nn, mm, status); \
4929 #define DO_FPCMP_PPZZ_H(NAME, OP) \
4930 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
4931 #define DO_FPCMP_PPZZ_S(NAME, OP) \
4932 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
4933 #define DO_FPCMP_PPZZ_D(NAME, OP) \
4934 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
4936 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
4937 DO_FPCMP_PPZZ_H(NAME, OP) \
4938 DO_FPCMP_PPZZ_S(NAME, OP) \
4939 DO_FPCMP_PPZZ_D(NAME, OP)
4941 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
4942 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
4943 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
4944 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
4945 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
4946 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
4947 #define DO_FCMUO(TYPE, X, Y, ST) \
4948 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
4949 #define DO_FACGE(TYPE, X, Y, ST) \
4950 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
4951 #define DO_FACGT(TYPE, X, Y, ST) \
4952 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
4954 DO_FPCMP_PPZZ_ALL(sve_fcmge
, DO_FCMGE
)
4955 DO_FPCMP_PPZZ_ALL(sve_fcmgt
, DO_FCMGT
)
4956 DO_FPCMP_PPZZ_ALL(sve_fcmeq
, DO_FCMEQ
)
4957 DO_FPCMP_PPZZ_ALL(sve_fcmne
, DO_FCMNE
)
4958 DO_FPCMP_PPZZ_ALL(sve_fcmuo
, DO_FCMUO
)
4959 DO_FPCMP_PPZZ_ALL(sve_facge
, DO_FACGE
)
4960 DO_FPCMP_PPZZ_ALL(sve_facgt
, DO_FACGT
)
4962 #undef DO_FPCMP_PPZZ_ALL
4963 #undef DO_FPCMP_PPZZ_D
4964 #undef DO_FPCMP_PPZZ_S
4965 #undef DO_FPCMP_PPZZ_H
4966 #undef DO_FPCMP_PPZZ
4968 /* One operand floating-point comparison against zero, controlled
4971 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
4972 void HELPER(NAME)(void *vd, void *vn, void *vg, \
4973 void *status, uint32_t desc) \
4975 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4976 uint64_t *d = vd, *g = vg; \
4978 uint64_t out = 0, pg = g[j]; \
4980 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4981 if ((pg >> (i & 63)) & 1) { \
4982 TYPE nn = *(TYPE *)(vn + H(i)); \
4983 out |= OP(TYPE, nn, 0, status); \
4990 #define DO_FPCMP_PPZ0_H(NAME, OP) \
4991 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
4992 #define DO_FPCMP_PPZ0_S(NAME, OP) \
4993 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
4994 #define DO_FPCMP_PPZ0_D(NAME, OP) \
4995 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
4997 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
4998 DO_FPCMP_PPZ0_H(NAME, OP) \
4999 DO_FPCMP_PPZ0_S(NAME, OP) \
5000 DO_FPCMP_PPZ0_D(NAME, OP)
5002 DO_FPCMP_PPZ0_ALL(sve_fcmge0
, DO_FCMGE
)
5003 DO_FPCMP_PPZ0_ALL(sve_fcmgt0
, DO_FCMGT
)
5004 DO_FPCMP_PPZ0_ALL(sve_fcmle0
, DO_FCMLE
)
5005 DO_FPCMP_PPZ0_ALL(sve_fcmlt0
, DO_FCMLT
)
5006 DO_FPCMP_PPZ0_ALL(sve_fcmeq0
, DO_FCMEQ
)
5007 DO_FPCMP_PPZ0_ALL(sve_fcmne0
, DO_FCMNE
)
5009 /* FP Trig Multiply-Add. */
5011 void HELPER(sve_ftmad_h
)(void *vd
, void *vn
, void *vm
, void *vs
, uint32_t desc
)
5013 static const float16 coeff
[16] = {
5014 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5015 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5017 intptr_t i
, opr_sz
= simd_oprsz(desc
) / sizeof(float16
);
5018 intptr_t x
= simd_data(desc
);
5019 float16
*d
= vd
, *n
= vn
, *m
= vm
;
5020 for (i
= 0; i
< opr_sz
; i
++) {
5023 if (float16_is_neg(mm
)) {
5024 mm
= float16_abs(mm
);
5027 d
[i
] = float16_muladd(n
[i
], mm
, coeff
[xx
], 0, vs
);
5031 void HELPER(sve_ftmad_s
)(void *vd
, void *vn
, void *vm
, void *vs
, uint32_t desc
)
5033 static const float32 coeff
[16] = {
5034 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5035 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5036 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5037 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5039 intptr_t i
, opr_sz
= simd_oprsz(desc
) / sizeof(float32
);
5040 intptr_t x
= simd_data(desc
);
5041 float32
*d
= vd
, *n
= vn
, *m
= vm
;
5042 for (i
= 0; i
< opr_sz
; i
++) {
5045 if (float32_is_neg(mm
)) {
5046 mm
= float32_abs(mm
);
5049 d
[i
] = float32_muladd(n
[i
], mm
, coeff
[xx
], 0, vs
);
5053 void HELPER(sve_ftmad_d
)(void *vd
, void *vn
, void *vm
, void *vs
, uint32_t desc
)
5055 static const float64 coeff
[16] = {
5056 0x3ff0000000000000ull
, 0xbfc5555555555543ull
,
5057 0x3f8111111110f30cull
, 0xbf2a01a019b92fc6ull
,
5058 0x3ec71de351f3d22bull
, 0xbe5ae5e2b60f7b91ull
,
5059 0x3de5d8408868552full
, 0x0000000000000000ull
,
5060 0x3ff0000000000000ull
, 0xbfe0000000000000ull
,
5061 0x3fa5555555555536ull
, 0xbf56c16c16c13a0bull
,
5062 0x3efa01a019b1e8d8ull
, 0xbe927e4f7282f468ull
,
5063 0x3e21ee96d2641b13ull
, 0xbda8f76380fbb401ull
,
5065 intptr_t i
, opr_sz
= simd_oprsz(desc
) / sizeof(float64
);
5066 intptr_t x
= simd_data(desc
);
5067 float64
*d
= vd
, *n
= vn
, *m
= vm
;
5068 for (i
= 0; i
< opr_sz
; i
++) {
5071 if (float64_is_neg(mm
)) {
5072 mm
= float64_abs(mm
);
5075 d
[i
] = float64_muladd(n
[i
], mm
, coeff
[xx
], 0, vs
);
5083 void HELPER(sve_fcadd_h
)(void *vd
, void *vn
, void *vm
, void *vg
,
5084 void *vs
, uint32_t desc
)
5086 intptr_t j
, i
= simd_oprsz(desc
);
5088 float16 neg_imag
= float16_set_sign(0, simd_data(desc
));
5089 float16 neg_real
= float16_chs(neg_imag
);
5092 uint64_t pg
= g
[(i
- 1) >> 6];
5094 float16 e0
, e1
, e2
, e3
;
5096 /* I holds the real index; J holds the imag index. */
5097 j
= i
- sizeof(float16
);
5098 i
-= 2 * sizeof(float16
);
5100 e0
= *(float16
*)(vn
+ H1_2(i
));
5101 e1
= *(float16
*)(vm
+ H1_2(j
)) ^ neg_real
;
5102 e2
= *(float16
*)(vn
+ H1_2(j
));
5103 e3
= *(float16
*)(vm
+ H1_2(i
)) ^ neg_imag
;
5105 if (likely((pg
>> (i
& 63)) & 1)) {
5106 *(float16
*)(vd
+ H1_2(i
)) = float16_add(e0
, e1
, vs
);
5108 if (likely((pg
>> (j
& 63)) & 1)) {
5109 *(float16
*)(vd
+ H1_2(j
)) = float16_add(e2
, e3
, vs
);
5115 void HELPER(sve_fcadd_s
)(void *vd
, void *vn
, void *vm
, void *vg
,
5116 void *vs
, uint32_t desc
)
5118 intptr_t j
, i
= simd_oprsz(desc
);
5120 float32 neg_imag
= float32_set_sign(0, simd_data(desc
));
5121 float32 neg_real
= float32_chs(neg_imag
);
5124 uint64_t pg
= g
[(i
- 1) >> 6];
5126 float32 e0
, e1
, e2
, e3
;
5128 /* I holds the real index; J holds the imag index. */
5129 j
= i
- sizeof(float32
);
5130 i
-= 2 * sizeof(float32
);
5132 e0
= *(float32
*)(vn
+ H1_2(i
));
5133 e1
= *(float32
*)(vm
+ H1_2(j
)) ^ neg_real
;
5134 e2
= *(float32
*)(vn
+ H1_2(j
));
5135 e3
= *(float32
*)(vm
+ H1_2(i
)) ^ neg_imag
;
5137 if (likely((pg
>> (i
& 63)) & 1)) {
5138 *(float32
*)(vd
+ H1_2(i
)) = float32_add(e0
, e1
, vs
);
5140 if (likely((pg
>> (j
& 63)) & 1)) {
5141 *(float32
*)(vd
+ H1_2(j
)) = float32_add(e2
, e3
, vs
);
5147 void HELPER(sve_fcadd_d
)(void *vd
, void *vn
, void *vm
, void *vg
,
5148 void *vs
, uint32_t desc
)
5150 intptr_t j
, i
= simd_oprsz(desc
);
5152 float64 neg_imag
= float64_set_sign(0, simd_data(desc
));
5153 float64 neg_real
= float64_chs(neg_imag
);
5156 uint64_t pg
= g
[(i
- 1) >> 6];
5158 float64 e0
, e1
, e2
, e3
;
5160 /* I holds the real index; J holds the imag index. */
5161 j
= i
- sizeof(float64
);
5162 i
-= 2 * sizeof(float64
);
5164 e0
= *(float64
*)(vn
+ H1_2(i
));
5165 e1
= *(float64
*)(vm
+ H1_2(j
)) ^ neg_real
;
5166 e2
= *(float64
*)(vn
+ H1_2(j
));
5167 e3
= *(float64
*)(vm
+ H1_2(i
)) ^ neg_imag
;
5169 if (likely((pg
>> (i
& 63)) & 1)) {
5170 *(float64
*)(vd
+ H1_2(i
)) = float64_add(e0
, e1
, vs
);
5172 if (likely((pg
>> (j
& 63)) & 1)) {
5173 *(float64
*)(vd
+ H1_2(j
)) = float64_add(e2
, e3
, vs
);
5180 * FP Complex Multiply
5183 void HELPER(sve_fcmla_zpzzz_h
)(void *vd
, void *vn
, void *vm
, void *va
,
5184 void *vg
, void *status
, uint32_t desc
)
5186 intptr_t j
, i
= simd_oprsz(desc
);
5187 unsigned rot
= simd_data(desc
);
5188 bool flip
= rot
& 1;
5189 float16 neg_imag
, neg_real
;
5192 neg_imag
= float16_set_sign(0, (rot
& 2) != 0);
5193 neg_real
= float16_set_sign(0, rot
== 1 || rot
== 2);
5196 uint64_t pg
= g
[(i
- 1) >> 6];
5198 float16 e1
, e2
, e3
, e4
, nr
, ni
, mr
, mi
, d
;
5200 /* I holds the real index; J holds the imag index. */
5201 j
= i
- sizeof(float16
);
5202 i
-= 2 * sizeof(float16
);
5204 nr
= *(float16
*)(vn
+ H1_2(i
));
5205 ni
= *(float16
*)(vn
+ H1_2(j
));
5206 mr
= *(float16
*)(vm
+ H1_2(i
));
5207 mi
= *(float16
*)(vm
+ H1_2(j
));
5209 e2
= (flip
? ni
: nr
);
5210 e1
= (flip
? mi
: mr
) ^ neg_real
;
5212 e3
= (flip
? mr
: mi
) ^ neg_imag
;
5214 if (likely((pg
>> (i
& 63)) & 1)) {
5215 d
= *(float16
*)(va
+ H1_2(i
));
5216 d
= float16_muladd(e2
, e1
, d
, 0, status
);
5217 *(float16
*)(vd
+ H1_2(i
)) = d
;
5219 if (likely((pg
>> (j
& 63)) & 1)) {
5220 d
= *(float16
*)(va
+ H1_2(j
));
5221 d
= float16_muladd(e4
, e3
, d
, 0, status
);
5222 *(float16
*)(vd
+ H1_2(j
)) = d
;
5228 void HELPER(sve_fcmla_zpzzz_s
)(void *vd
, void *vn
, void *vm
, void *va
,
5229 void *vg
, void *status
, uint32_t desc
)
5231 intptr_t j
, i
= simd_oprsz(desc
);
5232 unsigned rot
= simd_data(desc
);
5233 bool flip
= rot
& 1;
5234 float32 neg_imag
, neg_real
;
5237 neg_imag
= float32_set_sign(0, (rot
& 2) != 0);
5238 neg_real
= float32_set_sign(0, rot
== 1 || rot
== 2);
5241 uint64_t pg
= g
[(i
- 1) >> 6];
5243 float32 e1
, e2
, e3
, e4
, nr
, ni
, mr
, mi
, d
;
5245 /* I holds the real index; J holds the imag index. */
5246 j
= i
- sizeof(float32
);
5247 i
-= 2 * sizeof(float32
);
5249 nr
= *(float32
*)(vn
+ H1_2(i
));
5250 ni
= *(float32
*)(vn
+ H1_2(j
));
5251 mr
= *(float32
*)(vm
+ H1_2(i
));
5252 mi
= *(float32
*)(vm
+ H1_2(j
));
5254 e2
= (flip
? ni
: nr
);
5255 e1
= (flip
? mi
: mr
) ^ neg_real
;
5257 e3
= (flip
? mr
: mi
) ^ neg_imag
;
5259 if (likely((pg
>> (i
& 63)) & 1)) {
5260 d
= *(float32
*)(va
+ H1_2(i
));
5261 d
= float32_muladd(e2
, e1
, d
, 0, status
);
5262 *(float32
*)(vd
+ H1_2(i
)) = d
;
5264 if (likely((pg
>> (j
& 63)) & 1)) {
5265 d
= *(float32
*)(va
+ H1_2(j
));
5266 d
= float32_muladd(e4
, e3
, d
, 0, status
);
5267 *(float32
*)(vd
+ H1_2(j
)) = d
;
5273 void HELPER(sve_fcmla_zpzzz_d
)(void *vd
, void *vn
, void *vm
, void *va
,
5274 void *vg
, void *status
, uint32_t desc
)
5276 intptr_t j
, i
= simd_oprsz(desc
);
5277 unsigned rot
= simd_data(desc
);
5278 bool flip
= rot
& 1;
5279 float64 neg_imag
, neg_real
;
5282 neg_imag
= float64_set_sign(0, (rot
& 2) != 0);
5283 neg_real
= float64_set_sign(0, rot
== 1 || rot
== 2);
5286 uint64_t pg
= g
[(i
- 1) >> 6];
5288 float64 e1
, e2
, e3
, e4
, nr
, ni
, mr
, mi
, d
;
5290 /* I holds the real index; J holds the imag index. */
5291 j
= i
- sizeof(float64
);
5292 i
-= 2 * sizeof(float64
);
5294 nr
= *(float64
*)(vn
+ H1_2(i
));
5295 ni
= *(float64
*)(vn
+ H1_2(j
));
5296 mr
= *(float64
*)(vm
+ H1_2(i
));
5297 mi
= *(float64
*)(vm
+ H1_2(j
));
5299 e2
= (flip
? ni
: nr
);
5300 e1
= (flip
? mi
: mr
) ^ neg_real
;
5302 e3
= (flip
? mr
: mi
) ^ neg_imag
;
5304 if (likely((pg
>> (i
& 63)) & 1)) {
5305 d
= *(float64
*)(va
+ H1_2(i
));
5306 d
= float64_muladd(e2
, e1
, d
, 0, status
);
5307 *(float64
*)(vd
+ H1_2(i
)) = d
;
5309 if (likely((pg
>> (j
& 63)) & 1)) {
5310 d
= *(float64
*)(va
+ H1_2(j
));
5311 d
= float64_muladd(e4
, e3
, d
, 0, status
);
5312 *(float64
*)(vd
+ H1_2(j
)) = d
;
5319 * Load contiguous data, protected by a governing predicate.
5323 * Load one element into @vd + @reg_off from @host.
5324 * The controlling predicate is known to be true.
5326 typedef void sve_ldst1_host_fn(void *vd
, intptr_t reg_off
, void *host
);
5329 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
5330 * The controlling predicate is known to be true.
5332 typedef void sve_ldst1_tlb_fn(CPUARMState
*env
, void *vd
, intptr_t reg_off
,
5333 target_ulong vaddr
, uintptr_t retaddr
);
5336 * Generate the above primitives.
5339 #define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
5340 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
5342 TYPEM val = HOST(host); \
5343 *(TYPEE *)(vd + H(reg_off)) = val; \
5346 #define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
5347 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
5348 { HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }
5350 #define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
5351 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
5352 target_ulong addr, uintptr_t ra) \
5354 *(TYPEE *)(vd + H(reg_off)) = \
5355 (TYPEM)TLB(env, useronly_clean_ptr(addr), ra); \
5358 #define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
5359 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
5360 target_ulong addr, uintptr_t ra) \
5362 TLB(env, useronly_clean_ptr(addr), \
5363 (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \
5366 #define DO_LD_PRIM_1(NAME, H, TE, TM) \
5367 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
5368 DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
5370 DO_LD_PRIM_1(ld1bb
, H1
, uint8_t, uint8_t)
5371 DO_LD_PRIM_1(ld1bhu
, H1_2
, uint16_t, uint8_t)
5372 DO_LD_PRIM_1(ld1bhs
, H1_2
, uint16_t, int8_t)
5373 DO_LD_PRIM_1(ld1bsu
, H1_4
, uint32_t, uint8_t)
5374 DO_LD_PRIM_1(ld1bss
, H1_4
, uint32_t, int8_t)
5375 DO_LD_PRIM_1(ld1bdu
, H1_8
, uint64_t, uint8_t)
5376 DO_LD_PRIM_1(ld1bds
, H1_8
, uint64_t, int8_t)
5378 #define DO_ST_PRIM_1(NAME, H, TE, TM) \
5379 DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \
5380 DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
5382 DO_ST_PRIM_1(bb
, H1
, uint8_t, uint8_t)
5383 DO_ST_PRIM_1(bh
, H1_2
, uint16_t, uint8_t)
5384 DO_ST_PRIM_1(bs
, H1_4
, uint32_t, uint8_t)
5385 DO_ST_PRIM_1(bd
, H1_8
, uint64_t, uint8_t)
5387 #define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
5388 DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \
5389 DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \
5390 DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
5391 DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
5393 #define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
5394 DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p) \
5395 DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p) \
5396 DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
5397 DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
5399 DO_LD_PRIM_2(hh
, H1_2
, uint16_t, uint16_t, lduw
)
5400 DO_LD_PRIM_2(hsu
, H1_4
, uint32_t, uint16_t, lduw
)
5401 DO_LD_PRIM_2(hss
, H1_4
, uint32_t, int16_t, lduw
)
5402 DO_LD_PRIM_2(hdu
, H1_8
, uint64_t, uint16_t, lduw
)
5403 DO_LD_PRIM_2(hds
, H1_8
, uint64_t, int16_t, lduw
)
5405 DO_ST_PRIM_2(hh
, H1_2
, uint16_t, uint16_t, stw
)
5406 DO_ST_PRIM_2(hs
, H1_4
, uint32_t, uint16_t, stw
)
5407 DO_ST_PRIM_2(hd
, H1_8
, uint64_t, uint16_t, stw
)
5409 DO_LD_PRIM_2(ss
, H1_4
, uint32_t, uint32_t, ldl
)
5410 DO_LD_PRIM_2(sdu
, H1_8
, uint64_t, uint32_t, ldl
)
5411 DO_LD_PRIM_2(sds
, H1_8
, uint64_t, int32_t, ldl
)
5413 DO_ST_PRIM_2(ss
, H1_4
, uint32_t, uint32_t, stl
)
5414 DO_ST_PRIM_2(sd
, H1_8
, uint64_t, uint32_t, stl
)
5416 DO_LD_PRIM_2(dd
, H1_8
, uint64_t, uint64_t, ldq
)
5417 DO_ST_PRIM_2(dd
, H1_8
, uint64_t, uint64_t, stq
)
5428 * Skip through a sequence of inactive elements in the guarding predicate @vg,
5429 * beginning at @reg_off bounded by @reg_max. Return the offset of the active
5430 * element >= @reg_off, or @reg_max if there were no active elements at all.
5432 static intptr_t find_next_active(uint64_t *vg
, intptr_t reg_off
,
5433 intptr_t reg_max
, int esz
)
5435 uint64_t pg_mask
= pred_esz_masks
[esz
];
5436 uint64_t pg
= (vg
[reg_off
>> 6] & pg_mask
) >> (reg_off
& 63);
5438 /* In normal usage, the first element is active. */
5439 if (likely(pg
& 1)) {
5447 if (unlikely(reg_off
>= reg_max
)) {
5448 /* The entire predicate was false. */
5451 pg
= vg
[reg_off
>> 6] & pg_mask
;
5454 reg_off
+= ctz64(pg
);
5456 /* We should never see an out of range predicate bit set. */
5457 tcg_debug_assert(reg_off
< reg_max
);
5462 * Resolve the guest virtual address to info->host and info->flags.
5463 * If @nofault, return false if the page is invalid, otherwise
5464 * exit via page fault exception.
5473 static bool sve_probe_page(SVEHostPage
*info
, bool nofault
,
5474 CPUARMState
*env
, target_ulong addr
,
5475 int mem_off
, MMUAccessType access_type
,
5476 int mmu_idx
, uintptr_t retaddr
)
5483 * User-only currently always issues with TBI. See the comment
5484 * above useronly_clean_ptr. Usually we clean this top byte away
5485 * during translation, but we can't do that for e.g. vector + imm
5488 * We currently always enable TBI for user-only, and do not provide
5489 * a way to turn it off. So clean the pointer unconditionally here,
5490 * rather than look it up here, or pass it down from above.
5492 addr
= useronly_clean_ptr(addr
);
5494 flags
= probe_access_flags(env
, addr
, access_type
, mmu_idx
, nofault
,
5495 &info
->host
, retaddr
);
5496 info
->flags
= flags
;
5498 if (flags
& TLB_INVALID_MASK
) {
5503 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5504 info
->host
-= mem_off
;
5506 #ifdef CONFIG_USER_ONLY
5507 memset(&info
->attrs
, 0, sizeof(info
->attrs
));
5510 * Find the iotlbentry for addr and return the transaction attributes.
5511 * This *must* be present in the TLB because we just found the mapping.
5514 uintptr_t index
= tlb_index(env
, mmu_idx
, addr
);
5516 # ifdef CONFIG_DEBUG_TCG
5517 CPUTLBEntry
*entry
= tlb_entry(env
, mmu_idx
, addr
);
5518 target_ulong comparator
= (access_type
== MMU_DATA_LOAD
5520 : tlb_addr_write(entry
));
5521 g_assert(tlb_hit(comparator
, addr
));
5524 CPUIOTLBEntry
*iotlbentry
= &env_tlb(env
)->d
[mmu_idx
].iotlb
[index
];
5525 info
->attrs
= iotlbentry
->attrs
;
5534 * Analyse contiguous data, protected by a governing predicate.
5545 * First and last element wholly contained within the two pages.
5546 * mem_off_first[0] and reg_off_first[0] are always set >= 0.
5547 * reg_off_last[0] may be < 0 if the first element crosses pages.
5548 * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
5549 * are set >= 0 only if there are complete elements on a second page.
5551 * The reg_off_* offsets are relative to the internal vector register.
5552 * The mem_off_first offset is relative to the memory address; the
5553 * two offsets are different when a load operation extends, a store
5554 * operation truncates, or for multi-register operations.
5556 int16_t mem_off_first
[2];
5557 int16_t reg_off_first
[2];
5558 int16_t reg_off_last
[2];
5561 * One element that is misaligned and spans both pages,
5562 * or -1 if there is no such active element.
5564 int16_t mem_off_split
;
5565 int16_t reg_off_split
;
5568 * The byte offset at which the entire operation crosses a page boundary.
5569 * Set >= 0 if and only if the entire operation spans two pages.
5573 /* TLB data for the two pages. */
5574 SVEHostPage page
[2];
5578 * Find first active element on each page, and a loose bound for the
5579 * final element on each page. Identify any single element that spans
5580 * the page boundary. Return true if there are any active elements.
5582 static bool sve_cont_ldst_elements(SVEContLdSt
*info
, target_ulong addr
,
5583 uint64_t *vg
, intptr_t reg_max
,
5586 const int esize
= 1 << esz
;
5587 const uint64_t pg_mask
= pred_esz_masks
[esz
];
5588 intptr_t reg_off_first
= -1, reg_off_last
= -1, reg_off_split
;
5589 intptr_t mem_off_last
, mem_off_split
;
5590 intptr_t page_split
, elt_split
;
5593 /* Set all of the element indices to -1, and the TLB data to 0. */
5594 memset(info
, -1, offsetof(SVEContLdSt
, page
));
5595 memset(info
->page
, 0, sizeof(info
->page
));
5597 /* Gross scan over the entire predicate to find bounds. */
5600 uint64_t pg
= vg
[i
] & pg_mask
;
5602 reg_off_last
= i
* 64 + 63 - clz64(pg
);
5603 if (reg_off_first
< 0) {
5604 reg_off_first
= i
* 64 + ctz64(pg
);
5607 } while (++i
* 64 < reg_max
);
5609 if (unlikely(reg_off_first
< 0)) {
5610 /* No active elements, no pages touched. */
5613 tcg_debug_assert(reg_off_last
>= 0 && reg_off_last
< reg_max
);
5615 info
->reg_off_first
[0] = reg_off_first
;
5616 info
->mem_off_first
[0] = (reg_off_first
>> esz
) * msize
;
5617 mem_off_last
= (reg_off_last
>> esz
) * msize
;
5619 page_split
= -(addr
| TARGET_PAGE_MASK
);
5620 if (likely(mem_off_last
+ msize
<= page_split
)) {
5621 /* The entire operation fits within a single page. */
5622 info
->reg_off_last
[0] = reg_off_last
;
5626 info
->page_split
= page_split
;
5627 elt_split
= page_split
/ msize
;
5628 reg_off_split
= elt_split
<< esz
;
5629 mem_off_split
= elt_split
* msize
;
5632 * This is the last full element on the first page, but it is not
5633 * necessarily active. If there is no full element, i.e. the first
5634 * active element is the one that's split, this value remains -1.
5635 * It is useful as iteration bounds.
5637 if (elt_split
!= 0) {
5638 info
->reg_off_last
[0] = reg_off_split
- esize
;
5641 /* Determine if an unaligned element spans the pages. */
5642 if (page_split
% msize
!= 0) {
5643 /* It is helpful to know if the split element is active. */
5644 if ((vg
[reg_off_split
>> 6] >> (reg_off_split
& 63)) & 1) {
5645 info
->reg_off_split
= reg_off_split
;
5646 info
->mem_off_split
= mem_off_split
;
5648 if (reg_off_split
== reg_off_last
) {
5649 /* The page crossing element is last. */
5653 reg_off_split
+= esize
;
5654 mem_off_split
+= msize
;
5658 * We do want the first active element on the second page, because
5659 * this may affect the address reported in an exception.
5661 reg_off_split
= find_next_active(vg
, reg_off_split
, reg_max
, esz
);
5662 tcg_debug_assert(reg_off_split
<= reg_off_last
);
5663 info
->reg_off_first
[1] = reg_off_split
;
5664 info
->mem_off_first
[1] = (reg_off_split
>> esz
) * msize
;
5665 info
->reg_off_last
[1] = reg_off_last
;
5670 * Resolve the guest virtual addresses to info->page[].
5671 * Control the generation of page faults with @fault. Return false if
5672 * there is no work to do, which can only happen with @fault == FAULT_NO.
5674 static bool sve_cont_ldst_pages(SVEContLdSt
*info
, SVEContFault fault
,
5675 CPUARMState
*env
, target_ulong addr
,
5676 MMUAccessType access_type
, uintptr_t retaddr
)
5678 int mmu_idx
= cpu_mmu_index(env
, false);
5679 int mem_off
= info
->mem_off_first
[0];
5680 bool nofault
= fault
== FAULT_NO
;
5681 bool have_work
= true;
5683 if (!sve_probe_page(&info
->page
[0], nofault
, env
, addr
, mem_off
,
5684 access_type
, mmu_idx
, retaddr
)) {
5685 /* No work to be done. */
5689 if (likely(info
->page_split
< 0)) {
5690 /* The entire operation was on the one page. */
5695 * If the second page is invalid, then we want the fault address to be
5696 * the first byte on that page which is accessed.
5698 if (info
->mem_off_split
>= 0) {
5700 * There is an element split across the pages. The fault address
5701 * should be the first byte of the second page.
5703 mem_off
= info
->page_split
;
5705 * If the split element is also the first active element
5706 * of the vector, then: For first-fault we should continue
5707 * to generate faults for the second page. For no-fault,
5708 * we have work only if the second page is valid.
5710 if (info
->mem_off_first
[0] < info
->mem_off_split
) {
5711 nofault
= FAULT_FIRST
;
5716 * There is no element split across the pages. The fault address
5717 * should be the first active element on the second page.
5719 mem_off
= info
->mem_off_first
[1];
5721 * There must have been one active element on the first page,
5722 * so we're out of first-fault territory.
5724 nofault
= fault
!= FAULT_ALL
;
5727 have_work
|= sve_probe_page(&info
->page
[1], nofault
, env
, addr
, mem_off
,
5728 access_type
, mmu_idx
, retaddr
);
5732 static void sve_cont_ldst_watchpoints(SVEContLdSt
*info
, CPUARMState
*env
,
5733 uint64_t *vg
, target_ulong addr
,
5734 int esize
, int msize
, int wp_access
,
5737 #ifndef CONFIG_USER_ONLY
5738 intptr_t mem_off
, reg_off
, reg_last
;
5739 int flags0
= info
->page
[0].flags
;
5740 int flags1
= info
->page
[1].flags
;
5742 if (likely(!((flags0
| flags1
) & TLB_WATCHPOINT
))) {
5746 /* Indicate that watchpoints are handled. */
5747 info
->page
[0].flags
= flags0
& ~TLB_WATCHPOINT
;
5748 info
->page
[1].flags
= flags1
& ~TLB_WATCHPOINT
;
5750 if (flags0
& TLB_WATCHPOINT
) {
5751 mem_off
= info
->mem_off_first
[0];
5752 reg_off
= info
->reg_off_first
[0];
5753 reg_last
= info
->reg_off_last
[0];
5755 while (reg_off
<= reg_last
) {
5756 uint64_t pg
= vg
[reg_off
>> 6];
5758 if ((pg
>> (reg_off
& 63)) & 1) {
5759 cpu_check_watchpoint(env_cpu(env
), addr
+ mem_off
,
5760 msize
, info
->page
[0].attrs
,
5761 wp_access
, retaddr
);
5765 } while (reg_off
<= reg_last
&& (reg_off
& 63));
5769 mem_off
= info
->mem_off_split
;
5771 cpu_check_watchpoint(env_cpu(env
), addr
+ mem_off
, msize
,
5772 info
->page
[0].attrs
, wp_access
, retaddr
);
5775 mem_off
= info
->mem_off_first
[1];
5776 if ((flags1
& TLB_WATCHPOINT
) && mem_off
>= 0) {
5777 reg_off
= info
->reg_off_first
[1];
5778 reg_last
= info
->reg_off_last
[1];
5781 uint64_t pg
= vg
[reg_off
>> 6];
5783 if ((pg
>> (reg_off
& 63)) & 1) {
5784 cpu_check_watchpoint(env_cpu(env
), addr
+ mem_off
,
5785 msize
, info
->page
[1].attrs
,
5786 wp_access
, retaddr
);
5790 } while (reg_off
& 63);
5791 } while (reg_off
<= reg_last
);
5796 static void sve_cont_ldst_mte_check(SVEContLdSt
*info
, CPUARMState
*env
,
5797 uint64_t *vg
, target_ulong addr
, int esize
,
5798 int msize
, uint32_t mtedesc
, uintptr_t ra
)
5800 intptr_t mem_off
, reg_off
, reg_last
;
5802 /* Process the page only if MemAttr == Tagged. */
5803 if (arm_tlb_mte_tagged(&info
->page
[0].attrs
)) {
5804 mem_off
= info
->mem_off_first
[0];
5805 reg_off
= info
->reg_off_first
[0];
5806 reg_last
= info
->reg_off_split
;
5808 reg_last
= info
->reg_off_last
[0];
5812 uint64_t pg
= vg
[reg_off
>> 6];
5814 if ((pg
>> (reg_off
& 63)) & 1) {
5815 mte_check(env
, mtedesc
, addr
, ra
);
5819 } while (reg_off
<= reg_last
&& (reg_off
& 63));
5820 } while (reg_off
<= reg_last
);
5823 mem_off
= info
->mem_off_first
[1];
5824 if (mem_off
>= 0 && arm_tlb_mte_tagged(&info
->page
[1].attrs
)) {
5825 reg_off
= info
->reg_off_first
[1];
5826 reg_last
= info
->reg_off_last
[1];
5829 uint64_t pg
= vg
[reg_off
>> 6];
5831 if ((pg
>> (reg_off
& 63)) & 1) {
5832 mte_check(env
, mtedesc
, addr
, ra
);
5836 } while (reg_off
& 63);
5837 } while (reg_off
<= reg_last
);
 * Common helper for all contiguous 1,2,3,4-register predicated loads.
5844 static inline QEMU_ALWAYS_INLINE
5845 void sve_ldN_r(CPUARMState
*env
, uint64_t *vg
, const target_ulong addr
,
5846 uint32_t desc
, const uintptr_t retaddr
,
5847 const int esz
, const int msz
, const int N
, uint32_t mtedesc
,
5848 sve_ldst1_host_fn
*host_fn
,
5849 sve_ldst1_tlb_fn
*tlb_fn
)
5851 const unsigned rd
= simd_data(desc
);
5852 const intptr_t reg_max
= simd_oprsz(desc
);
5853 intptr_t reg_off
, reg_last
, mem_off
;
5858 /* Find the active elements. */
5859 if (!sve_cont_ldst_elements(&info
, addr
, vg
, reg_max
, esz
, N
<< msz
)) {
5860 /* The entire predicate was false; no load occurs. */
5861 for (i
= 0; i
< N
; ++i
) {
5862 memset(&env
->vfp
.zregs
[(rd
+ i
) & 31], 0, reg_max
);
5867 /* Probe the page(s). Exit with exception for any invalid page. */
5868 sve_cont_ldst_pages(&info
, FAULT_ALL
, env
, addr
, MMU_DATA_LOAD
, retaddr
);
5870 /* Handle watchpoints for all active elements. */
5871 sve_cont_ldst_watchpoints(&info
, env
, vg
, addr
, 1 << esz
, N
<< msz
,
5872 BP_MEM_READ
, retaddr
);
5875 * Handle mte checks for all active elements.
5876 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5879 sve_cont_ldst_mte_check(&info
, env
, vg
, addr
, 1 << esz
, N
<< msz
,
5883 flags
= info
.page
[0].flags
| info
.page
[1].flags
;
5884 if (unlikely(flags
!= 0)) {
5885 #ifdef CONFIG_USER_ONLY
5886 g_assert_not_reached();
5889 * At least one page includes MMIO.
5890 * Any bus operation can fail with cpu_transaction_failed,
5891 * which for ARM will raise SyncExternal. Perform the load
5892 * into scratch memory to preserve register state until the end.
5894 ARMVectorReg scratch
[4] = { };
5896 mem_off
= info
.mem_off_first
[0];
5897 reg_off
= info
.reg_off_first
[0];
5898 reg_last
= info
.reg_off_last
[1];
5900 reg_last
= info
.reg_off_split
;
5902 reg_last
= info
.reg_off_last
[0];
5907 uint64_t pg
= vg
[reg_off
>> 6];
5909 if ((pg
>> (reg_off
& 63)) & 1) {
5910 for (i
= 0; i
< N
; ++i
) {
5911 tlb_fn(env
, &scratch
[i
], reg_off
,
5912 addr
+ mem_off
+ (i
<< msz
), retaddr
);
5915 reg_off
+= 1 << esz
;
5916 mem_off
+= N
<< msz
;
5917 } while (reg_off
& 63);
5918 } while (reg_off
<= reg_last
);
5920 for (i
= 0; i
< N
; ++i
) {
5921 memcpy(&env
->vfp
.zregs
[(rd
+ i
) & 31], &scratch
[i
], reg_max
);
5927 /* The entire operation is in RAM, on valid pages. */
5929 for (i
= 0; i
< N
; ++i
) {
5930 memset(&env
->vfp
.zregs
[(rd
+ i
) & 31], 0, reg_max
);
5933 mem_off
= info
.mem_off_first
[0];
5934 reg_off
= info
.reg_off_first
[0];
5935 reg_last
= info
.reg_off_last
[0];
5936 host
= info
.page
[0].host
;
5938 while (reg_off
<= reg_last
) {
5939 uint64_t pg
= vg
[reg_off
>> 6];
5941 if ((pg
>> (reg_off
& 63)) & 1) {
5942 for (i
= 0; i
< N
; ++i
) {
5943 host_fn(&env
->vfp
.zregs
[(rd
+ i
) & 31], reg_off
,
5944 host
+ mem_off
+ (i
<< msz
));
5947 reg_off
+= 1 << esz
;
5948 mem_off
+= N
<< msz
;
5949 } while (reg_off
<= reg_last
&& (reg_off
& 63));
5953 * Use the slow path to manage the cross-page misalignment.
5954 * But we know this is RAM and cannot trap.
5956 mem_off
= info
.mem_off_split
;
5957 if (unlikely(mem_off
>= 0)) {
5958 reg_off
= info
.reg_off_split
;
5959 for (i
= 0; i
< N
; ++i
) {
5960 tlb_fn(env
, &env
->vfp
.zregs
[(rd
+ i
) & 31], reg_off
,
5961 addr
+ mem_off
+ (i
<< msz
), retaddr
);
5965 mem_off
= info
.mem_off_first
[1];
5966 if (unlikely(mem_off
>= 0)) {
5967 reg_off
= info
.reg_off_first
[1];
5968 reg_last
= info
.reg_off_last
[1];
5969 host
= info
.page
[1].host
;
5972 uint64_t pg
= vg
[reg_off
>> 6];
5974 if ((pg
>> (reg_off
& 63)) & 1) {
5975 for (i
= 0; i
< N
; ++i
) {
5976 host_fn(&env
->vfp
.zregs
[(rd
+ i
) & 31], reg_off
,
5977 host
+ mem_off
+ (i
<< msz
));
5980 reg_off
+= 1 << esz
;
5981 mem_off
+= N
<< msz
;
5982 } while (reg_off
& 63);
5983 } while (reg_off
<= reg_last
);
5987 static inline QEMU_ALWAYS_INLINE
5988 void sve_ldN_r_mte(CPUARMState
*env
, uint64_t *vg
, target_ulong addr
,
5989 uint32_t desc
, const uintptr_t ra
,
5990 const int esz
, const int msz
, const int N
,
5991 sve_ldst1_host_fn
*host_fn
,
5992 sve_ldst1_tlb_fn
*tlb_fn
)
5994 uint32_t mtedesc
= desc
>> (SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
5995 int bit55
= extract64(addr
, 55, 1);
5997 /* Remove mtedesc from the normal sve descriptor. */
5998 desc
= extract32(desc
, 0, SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
6000 /* Perform gross MTE suppression early. */
6001 if (!tbi_check(desc
, bit55
) ||
6002 tcma_check(desc
, bit55
, allocation_tag_from_addr(addr
))) {
6006 sve_ldN_r(env
, vg
, addr
, desc
, ra
, esz
, msz
, N
, mtedesc
, host_fn
, tlb_fn
);
6009 #define DO_LD1_1(NAME, ESZ) \
6010 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
6011 target_ulong addr, uint32_t desc) \
6013 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
6014 sve_##NAME##_host, sve_##NAME##_tlb); \
6016 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
6017 target_ulong addr, uint32_t desc) \
6019 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
6020 sve_##NAME##_host, sve_##NAME##_tlb); \
6023 #define DO_LD1_2(NAME, ESZ, MSZ) \
6024 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
6025 target_ulong addr, uint32_t desc) \
6027 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
6028 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
6030 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
6031 target_ulong addr, uint32_t desc) \
6033 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
6034 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
6036 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6037 target_ulong addr, uint32_t desc) \
6039 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
6040 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
6042 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6043 target_ulong addr, uint32_t desc) \
6045 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
6046 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
6049 DO_LD1_1(ld1bb
, MO_8
)
6050 DO_LD1_1(ld1bhu
, MO_16
)
6051 DO_LD1_1(ld1bhs
, MO_16
)
6052 DO_LD1_1(ld1bsu
, MO_32
)
6053 DO_LD1_1(ld1bss
, MO_32
)
6054 DO_LD1_1(ld1bdu
, MO_64
)
6055 DO_LD1_1(ld1bds
, MO_64
)
6057 DO_LD1_2(ld1hh
, MO_16
, MO_16
)
6058 DO_LD1_2(ld1hsu
, MO_32
, MO_16
)
6059 DO_LD1_2(ld1hss
, MO_32
, MO_16
)
6060 DO_LD1_2(ld1hdu
, MO_64
, MO_16
)
6061 DO_LD1_2(ld1hds
, MO_64
, MO_16
)
6063 DO_LD1_2(ld1ss
, MO_32
, MO_32
)
6064 DO_LD1_2(ld1sdu
, MO_64
, MO_32
)
6065 DO_LD1_2(ld1sds
, MO_64
, MO_32
)
6067 DO_LD1_2(ld1dd
, MO_64
, MO_64
)
6072 #define DO_LDN_1(N) \
6073 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
6074 target_ulong addr, uint32_t desc) \
6076 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
6077 sve_ld1bb_host, sve_ld1bb_tlb); \
6079 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
6080 target_ulong addr, uint32_t desc) \
6082 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
6083 sve_ld1bb_host, sve_ld1bb_tlb); \
6086 #define DO_LDN_2(N, SUFF, ESZ) \
6087 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
6088 target_ulong addr, uint32_t desc) \
6090 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
6091 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
6093 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
6094 target_ulong addr, uint32_t desc) \
6096 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
6097 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
6099 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
6100 target_ulong addr, uint32_t desc) \
6102 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
6103 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
6105 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
6106 target_ulong addr, uint32_t desc) \
6108 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
6109 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
6116 DO_LDN_2(2, hh
, MO_16
)
6117 DO_LDN_2(3, hh
, MO_16
)
6118 DO_LDN_2(4, hh
, MO_16
)
6120 DO_LDN_2(2, ss
, MO_32
)
6121 DO_LDN_2(3, ss
, MO_32
)
6122 DO_LDN_2(4, ss
, MO_32
)
6124 DO_LDN_2(2, dd
, MO_64
)
6125 DO_LDN_2(3, dd
, MO_64
)
6126 DO_LDN_2(4, dd
, MO_64
)
6132 * Load contiguous data, first-fault and no-fault.
6134 * For user-only, one could argue that we should hold the mmap_lock during
6135 * the operation so that there is no race between page_check_range and the
6136 * load operation. However, unmapping pages out from under a running thread
6137 * is extraordinarily unlikely. This theoretical race condition also affects
6138 * linux-user/ in its get_user/put_user macros.
6140 * TODO: Construct some helpers, written in assembly, that interact with
6141 * handle_cpu_signal to produce memory ops which can properly report errors
6145 /* Fault on byte I. All bits in FFR from I are cleared. The vector
6146 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
6147 * option, which leaves subsequent data unchanged.
6149 static void record_fault(CPUARMState
*env
, uintptr_t i
, uintptr_t oprsz
)
6151 uint64_t *ffr
= env
->vfp
.pregs
[FFR_PRED_NUM
].p
;
6154 ffr
[i
/ 64] &= MAKE_64BIT_MASK(0, i
& 63);
6155 i
= ROUND_UP(i
, 64);
6157 for (; i
< oprsz
; i
+= 64) {
6163 * Common helper for all contiguous no-fault and first-fault loads.
6165 static inline QEMU_ALWAYS_INLINE
6166 void sve_ldnfff1_r(CPUARMState
*env
, void *vg
, const target_ulong addr
,
6167 uint32_t desc
, const uintptr_t retaddr
, uint32_t mtedesc
,
6168 const int esz
, const int msz
, const SVEContFault fault
,
6169 sve_ldst1_host_fn
*host_fn
,
6170 sve_ldst1_tlb_fn
*tlb_fn
)
6172 const unsigned rd
= simd_data(desc
);
6173 void *vd
= &env
->vfp
.zregs
[rd
];
6174 const intptr_t reg_max
= simd_oprsz(desc
);
6175 intptr_t reg_off
, mem_off
, reg_last
;
6180 /* Find the active elements. */
6181 if (!sve_cont_ldst_elements(&info
, addr
, vg
, reg_max
, esz
, 1 << msz
)) {
6182 /* The entire predicate was false; no load occurs. */
6183 memset(vd
, 0, reg_max
);
6186 reg_off
= info
.reg_off_first
[0];
6188 /* Probe the page(s). */
6189 if (!sve_cont_ldst_pages(&info
, fault
, env
, addr
, MMU_DATA_LOAD
, retaddr
)) {
6190 /* Fault on first element. */
6191 tcg_debug_assert(fault
== FAULT_NO
);
6192 memset(vd
, 0, reg_max
);
6196 mem_off
= info
.mem_off_first
[0];
6197 flags
= info
.page
[0].flags
;
6200 * Disable MTE checking if the Tagged bit is not set. Since TBI must
6201 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6203 if (arm_tlb_mte_tagged(&info
.page
[0].attrs
)) {
6207 if (fault
== FAULT_FIRST
) {
6208 /* Trapping mte check for the first-fault element. */
6210 mte_check(env
, mtedesc
, addr
+ mem_off
, retaddr
);
6214 * Special handling of the first active element,
6215 * if it crosses a page boundary or is MMIO.
6217 bool is_split
= mem_off
== info
.mem_off_split
;
6218 if (unlikely(flags
!= 0) || unlikely(is_split
)) {
6220 * Use the slow path for cross-page handling.
6221 * Might trap for MMIO or watchpoints.
6223 tlb_fn(env
, vd
, reg_off
, addr
+ mem_off
, retaddr
);
6225 /* After any fault, zero the other elements. */
6226 swap_memzero(vd
, reg_off
);
6227 reg_off
+= 1 << esz
;
6228 mem_off
+= 1 << msz
;
6229 swap_memzero(vd
+ reg_off
, reg_max
- reg_off
);
6235 memset(vd
, 0, reg_max
);
6238 memset(vd
, 0, reg_max
);
6239 if (unlikely(mem_off
== info
.mem_off_split
)) {
6240 /* The first active element crosses a page boundary. */
6241 flags
|= info
.page
[1].flags
;
6242 if (unlikely(flags
& TLB_MMIO
)) {
6243 /* Some page is MMIO, see below. */
6246 if (unlikely(flags
& TLB_WATCHPOINT
) &&
6247 (cpu_watchpoint_address_matches
6248 (env_cpu(env
), addr
+ mem_off
, 1 << msz
)
6250 /* Watchpoint hit, see below. */
6253 if (mtedesc
&& !mte_probe(env
, mtedesc
, addr
+ mem_off
)) {
6257 * Use the slow path for cross-page handling.
6258 * This is RAM, without a watchpoint, and will not trap.
6260 tlb_fn(env
, vd
, reg_off
, addr
+ mem_off
, retaddr
);
6266 * From this point on, all memory operations are MemSingleNF.
6268 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6269 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6271 * Unfortuately we do not have access to the memory attributes from the
6272 * PTE to tell Device memory from Normal memory. So we make a mostly
6273 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6274 * This gives the right answer for the common cases of "Normal memory,
6275 * backed by host RAM" and "Device memory, backed by MMIO".
6276 * The architecture allows us to suppress an NF load and return
6277 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6278 * case of "Normal memory, backed by MMIO" is permitted. The case we
6279 * get wrong is "Device memory, backed by host RAM", for which we
6280 * should return (UNKNOWN, FAULT) for but do not.
6282 * Similarly, CPU_BP breakpoints would raise exceptions, and so
6283 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
6284 * architectural breakpoints the same.
6286 if (unlikely(flags
& TLB_MMIO
)) {
6290 reg_last
= info
.reg_off_last
[0];
6291 host
= info
.page
[0].host
;
6294 uint64_t pg
= *(uint64_t *)(vg
+ (reg_off
>> 3));
6296 if ((pg
>> (reg_off
& 63)) & 1) {
6297 if (unlikely(flags
& TLB_WATCHPOINT
) &&
6298 (cpu_watchpoint_address_matches
6299 (env_cpu(env
), addr
+ mem_off
, 1 << msz
)
6303 if (mtedesc
&& !mte_probe(env
, mtedesc
, addr
+ mem_off
)) {
6306 host_fn(vd
, reg_off
, host
+ mem_off
);
6308 reg_off
+= 1 << esz
;
6309 mem_off
+= 1 << msz
;
6310 } while (reg_off
<= reg_last
&& (reg_off
& 63));
6311 } while (reg_off
<= reg_last
);
6314 * MemSingleNF is allowed to fail for any reason. We have special
6315 * code above to handle the first element crossing a page boundary.
6316 * As an implementation choice, decline to handle a cross-page element
6317 * in any other position.
6319 reg_off
= info
.reg_off_split
;
6325 reg_off
= info
.reg_off_first
[1];
6326 if (likely(reg_off
< 0)) {
6327 /* No active elements on the second page. All done. */
6332 * MemSingleNF is allowed to fail for any reason. As an implementation
6333 * choice, decline to handle elements on the second page. This should
6334 * be low frequency as the guest walks through memory -- the next
6335 * iteration of the guest's loop should be aligned on the page boundary,
6336 * and then all following iterations will stay aligned.
6340 record_fault(env
, reg_off
, reg_max
);
6343 static inline QEMU_ALWAYS_INLINE
6344 void sve_ldnfff1_r_mte(CPUARMState
*env
, void *vg
, target_ulong addr
,
6345 uint32_t desc
, const uintptr_t retaddr
,
6346 const int esz
, const int msz
, const SVEContFault fault
,
6347 sve_ldst1_host_fn
*host_fn
,
6348 sve_ldst1_tlb_fn
*tlb_fn
)
6350 uint32_t mtedesc
= desc
>> (SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
6351 int bit55
= extract64(addr
, 55, 1);
6353 /* Remove mtedesc from the normal sve descriptor. */
6354 desc
= extract32(desc
, 0, SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
6356 /* Perform gross MTE suppression early. */
6357 if (!tbi_check(desc
, bit55
) ||
6358 tcma_check(desc
, bit55
, allocation_tag_from_addr(addr
))) {
6362 sve_ldnfff1_r(env
, vg
, addr
, desc
, retaddr
, mtedesc
,
6363 esz
, msz
, fault
, host_fn
, tlb_fn
);
6366 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
6367 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
6368 target_ulong addr, uint32_t desc) \
6370 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6371 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6373 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
6374 target_ulong addr, uint32_t desc) \
6376 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6377 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6379 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
6380 target_ulong addr, uint32_t desc) \
6382 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6383 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6385 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
6386 target_ulong addr, uint32_t desc) \
6388 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6389 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6392 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
6393 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
6394 target_ulong addr, uint32_t desc) \
6396 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6397 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6399 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
6400 target_ulong addr, uint32_t desc) \
6402 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6403 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6405 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
6406 target_ulong addr, uint32_t desc) \
6408 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6409 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6411 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
6412 target_ulong addr, uint32_t desc) \
6414 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6415 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6417 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6418 target_ulong addr, uint32_t desc) \
6420 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6421 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6423 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6424 target_ulong addr, uint32_t desc) \
6426 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6427 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6429 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6430 target_ulong addr, uint32_t desc) \
6432 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6433 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6435 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6436 target_ulong addr, uint32_t desc) \
6438 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6439 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6442 DO_LDFF1_LDNF1_1(bb
, MO_8
)
6443 DO_LDFF1_LDNF1_1(bhu
, MO_16
)
6444 DO_LDFF1_LDNF1_1(bhs
, MO_16
)
6445 DO_LDFF1_LDNF1_1(bsu
, MO_32
)
6446 DO_LDFF1_LDNF1_1(bss
, MO_32
)
6447 DO_LDFF1_LDNF1_1(bdu
, MO_64
)
6448 DO_LDFF1_LDNF1_1(bds
, MO_64
)
6450 DO_LDFF1_LDNF1_2(hh
, MO_16
, MO_16
)
6451 DO_LDFF1_LDNF1_2(hsu
, MO_32
, MO_16
)
6452 DO_LDFF1_LDNF1_2(hss
, MO_32
, MO_16
)
6453 DO_LDFF1_LDNF1_2(hdu
, MO_64
, MO_16
)
6454 DO_LDFF1_LDNF1_2(hds
, MO_64
, MO_16
)
6456 DO_LDFF1_LDNF1_2(ss
, MO_32
, MO_32
)
6457 DO_LDFF1_LDNF1_2(sdu
, MO_64
, MO_32
)
6458 DO_LDFF1_LDNF1_2(sds
, MO_64
, MO_32
)
6460 DO_LDFF1_LDNF1_2(dd
, MO_64
, MO_64
)
6462 #undef DO_LDFF1_LDNF1_1
6463 #undef DO_LDFF1_LDNF1_2
6466 * Common helper for all contiguous 1,2,3,4-register predicated stores.
6469 static inline QEMU_ALWAYS_INLINE
6470 void sve_stN_r(CPUARMState
*env
, uint64_t *vg
, target_ulong addr
,
6471 uint32_t desc
, const uintptr_t retaddr
,
6472 const int esz
, const int msz
, const int N
, uint32_t mtedesc
,
6473 sve_ldst1_host_fn
*host_fn
,
6474 sve_ldst1_tlb_fn
*tlb_fn
)
6476 const unsigned rd
= simd_data(desc
);
6477 const intptr_t reg_max
= simd_oprsz(desc
);
6478 intptr_t reg_off
, reg_last
, mem_off
;
6483 /* Find the active elements. */
6484 if (!sve_cont_ldst_elements(&info
, addr
, vg
, reg_max
, esz
, N
<< msz
)) {
6485 /* The entire predicate was false; no store occurs. */
6489 /* Probe the page(s). Exit with exception for any invalid page. */
6490 sve_cont_ldst_pages(&info
, FAULT_ALL
, env
, addr
, MMU_DATA_STORE
, retaddr
);
6492 /* Handle watchpoints for all active elements. */
6493 sve_cont_ldst_watchpoints(&info
, env
, vg
, addr
, 1 << esz
, N
<< msz
,
6494 BP_MEM_WRITE
, retaddr
);
6497 * Handle mte checks for all active elements.
6498 * Since TBI must be set for MTE, !mtedesc => !mte_active.
6501 sve_cont_ldst_mte_check(&info
, env
, vg
, addr
, 1 << esz
, N
<< msz
,
6505 flags
= info
.page
[0].flags
| info
.page
[1].flags
;
6506 if (unlikely(flags
!= 0)) {
6507 #ifdef CONFIG_USER_ONLY
6508 g_assert_not_reached();
6511 * At least one page includes MMIO.
6512 * Any bus operation can fail with cpu_transaction_failed,
6513 * which for ARM will raise SyncExternal. We cannot avoid
6514 * this fault and will leave with the store incomplete.
6516 mem_off
= info
.mem_off_first
[0];
6517 reg_off
= info
.reg_off_first
[0];
6518 reg_last
= info
.reg_off_last
[1];
6520 reg_last
= info
.reg_off_split
;
6522 reg_last
= info
.reg_off_last
[0];
6527 uint64_t pg
= vg
[reg_off
>> 6];
6529 if ((pg
>> (reg_off
& 63)) & 1) {
6530 for (i
= 0; i
< N
; ++i
) {
6531 tlb_fn(env
, &env
->vfp
.zregs
[(rd
+ i
) & 31], reg_off
,
6532 addr
+ mem_off
+ (i
<< msz
), retaddr
);
6535 reg_off
+= 1 << esz
;
6536 mem_off
+= N
<< msz
;
6537 } while (reg_off
& 63);
6538 } while (reg_off
<= reg_last
);
6543 mem_off
= info
.mem_off_first
[0];
6544 reg_off
= info
.reg_off_first
[0];
6545 reg_last
= info
.reg_off_last
[0];
6546 host
= info
.page
[0].host
;
6548 while (reg_off
<= reg_last
) {
6549 uint64_t pg
= vg
[reg_off
>> 6];
6551 if ((pg
>> (reg_off
& 63)) & 1) {
6552 for (i
= 0; i
< N
; ++i
) {
6553 host_fn(&env
->vfp
.zregs
[(rd
+ i
) & 31], reg_off
,
6554 host
+ mem_off
+ (i
<< msz
));
6557 reg_off
+= 1 << esz
;
6558 mem_off
+= N
<< msz
;
6559 } while (reg_off
<= reg_last
&& (reg_off
& 63));
6563 * Use the slow path to manage the cross-page misalignment.
6564 * But we know this is RAM and cannot trap.
6566 mem_off
= info
.mem_off_split
;
6567 if (unlikely(mem_off
>= 0)) {
6568 reg_off
= info
.reg_off_split
;
6569 for (i
= 0; i
< N
; ++i
) {
6570 tlb_fn(env
, &env
->vfp
.zregs
[(rd
+ i
) & 31], reg_off
,
6571 addr
+ mem_off
+ (i
<< msz
), retaddr
);
6575 mem_off
= info
.mem_off_first
[1];
6576 if (unlikely(mem_off
>= 0)) {
6577 reg_off
= info
.reg_off_first
[1];
6578 reg_last
= info
.reg_off_last
[1];
6579 host
= info
.page
[1].host
;
6582 uint64_t pg
= vg
[reg_off
>> 6];
6584 if ((pg
>> (reg_off
& 63)) & 1) {
6585 for (i
= 0; i
< N
; ++i
) {
6586 host_fn(&env
->vfp
.zregs
[(rd
+ i
) & 31], reg_off
,
6587 host
+ mem_off
+ (i
<< msz
));
6590 reg_off
+= 1 << esz
;
6591 mem_off
+= N
<< msz
;
6592 } while (reg_off
& 63);
6593 } while (reg_off
<= reg_last
);
6597 static inline QEMU_ALWAYS_INLINE
6598 void sve_stN_r_mte(CPUARMState
*env
, uint64_t *vg
, target_ulong addr
,
6599 uint32_t desc
, const uintptr_t ra
,
6600 const int esz
, const int msz
, const int N
,
6601 sve_ldst1_host_fn
*host_fn
,
6602 sve_ldst1_tlb_fn
*tlb_fn
)
6604 uint32_t mtedesc
= desc
>> (SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
6605 int bit55
= extract64(addr
, 55, 1);
6607 /* Remove mtedesc from the normal sve descriptor. */
6608 desc
= extract32(desc
, 0, SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
6610 /* Perform gross MTE suppression early. */
6611 if (!tbi_check(desc
, bit55
) ||
6612 tcma_check(desc
, bit55
, allocation_tag_from_addr(addr
))) {
6616 sve_stN_r(env
, vg
, addr
, desc
, ra
, esz
, msz
, N
, mtedesc
, host_fn
, tlb_fn
);
6619 #define DO_STN_1(N, NAME, ESZ) \
6620 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
6621 target_ulong addr, uint32_t desc) \
6623 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
6624 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6626 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
6627 target_ulong addr, uint32_t desc) \
6629 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
6630 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6633 #define DO_STN_2(N, NAME, ESZ, MSZ) \
6634 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
6635 target_ulong addr, uint32_t desc) \
6637 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6638 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6640 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
6641 target_ulong addr, uint32_t desc) \
6643 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6644 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6646 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6647 target_ulong addr, uint32_t desc) \
6649 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6650 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6652 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6653 target_ulong addr, uint32_t desc) \
6655 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6656 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6659 DO_STN_1(1, bb
, MO_8
)
6660 DO_STN_1(1, bh
, MO_16
)
6661 DO_STN_1(1, bs
, MO_32
)
6662 DO_STN_1(1, bd
, MO_64
)
6663 DO_STN_1(2, bb
, MO_8
)
6664 DO_STN_1(3, bb
, MO_8
)
6665 DO_STN_1(4, bb
, MO_8
)
6667 DO_STN_2(1, hh
, MO_16
, MO_16
)
6668 DO_STN_2(1, hs
, MO_32
, MO_16
)
6669 DO_STN_2(1, hd
, MO_64
, MO_16
)
6670 DO_STN_2(2, hh
, MO_16
, MO_16
)
6671 DO_STN_2(3, hh
, MO_16
, MO_16
)
6672 DO_STN_2(4, hh
, MO_16
, MO_16
)
6674 DO_STN_2(1, ss
, MO_32
, MO_32
)
6675 DO_STN_2(1, sd
, MO_64
, MO_32
)
6676 DO_STN_2(2, ss
, MO_32
, MO_32
)
6677 DO_STN_2(3, ss
, MO_32
, MO_32
)
6678 DO_STN_2(4, ss
, MO_32
, MO_32
)
6680 DO_STN_2(1, dd
, MO_64
, MO_64
)
6681 DO_STN_2(2, dd
, MO_64
, MO_64
)
6682 DO_STN_2(3, dd
, MO_64
, MO_64
)
6683 DO_STN_2(4, dd
, MO_64
, MO_64
)
6689 * Loads with a vector index.
6693 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6695 typedef target_ulong
zreg_off_fn(void *reg
, intptr_t reg_ofs
);
6697 static target_ulong
off_zsu_s(void *reg
, intptr_t reg_ofs
)
6699 return *(uint32_t *)(reg
+ H1_4(reg_ofs
));
6702 static target_ulong
off_zss_s(void *reg
, intptr_t reg_ofs
)
6704 return *(int32_t *)(reg
+ H1_4(reg_ofs
));
6707 static target_ulong
off_zsu_d(void *reg
, intptr_t reg_ofs
)
6709 return (uint32_t)*(uint64_t *)(reg
+ reg_ofs
);
6712 static target_ulong
off_zss_d(void *reg
, intptr_t reg_ofs
)
6714 return (int32_t)*(uint64_t *)(reg
+ reg_ofs
);
6717 static target_ulong
off_zd_d(void *reg
, intptr_t reg_ofs
)
6719 return *(uint64_t *)(reg
+ reg_ofs
);
6722 static inline QEMU_ALWAYS_INLINE
6723 void sve_ld1_z(CPUARMState
*env
, void *vd
, uint64_t *vg
, void *vm
,
6724 target_ulong base
, uint32_t desc
, uintptr_t retaddr
,
6725 uint32_t mtedesc
, int esize
, int msize
,
6726 zreg_off_fn
*off_fn
,
6727 sve_ldst1_host_fn
*host_fn
,
6728 sve_ldst1_tlb_fn
*tlb_fn
)
6730 const int mmu_idx
= cpu_mmu_index(env
, false);
6731 const intptr_t reg_max
= simd_oprsz(desc
);
6732 const int scale
= simd_data(desc
);
6733 ARMVectorReg scratch
;
6735 SVEHostPage info
, info2
;
6737 memset(&scratch
, 0, reg_max
);
6740 uint64_t pg
= vg
[reg_off
>> 6];
6742 if (likely(pg
& 1)) {
6743 target_ulong addr
= base
+ (off_fn(vm
, reg_off
) << scale
);
6744 target_ulong in_page
= -(addr
| TARGET_PAGE_MASK
);
6746 sve_probe_page(&info
, false, env
, addr
, 0, MMU_DATA_LOAD
,
6749 if (likely(in_page
>= msize
)) {
6750 if (unlikely(info
.flags
& TLB_WATCHPOINT
)) {
6751 cpu_check_watchpoint(env_cpu(env
), addr
, msize
,
6752 info
.attrs
, BP_MEM_READ
, retaddr
);
6754 if (mtedesc
&& arm_tlb_mte_tagged(&info
.attrs
)) {
6755 mte_check(env
, mtedesc
, addr
, retaddr
);
6757 host_fn(&scratch
, reg_off
, info
.host
);
6759 /* Element crosses the page boundary. */
6760 sve_probe_page(&info2
, false, env
, addr
+ in_page
, 0,
6761 MMU_DATA_LOAD
, mmu_idx
, retaddr
);
6762 if (unlikely((info
.flags
| info2
.flags
) & TLB_WATCHPOINT
)) {
6763 cpu_check_watchpoint(env_cpu(env
), addr
,
6765 BP_MEM_READ
, retaddr
);
6767 if (mtedesc
&& arm_tlb_mte_tagged(&info
.attrs
)) {
6768 mte_check(env
, mtedesc
, addr
, retaddr
);
6770 tlb_fn(env
, &scratch
, reg_off
, addr
, retaddr
);
6775 } while (reg_off
& 63);
6776 } while (reg_off
< reg_max
);
6778 /* Wait until all exceptions have been raised to write back. */
6779 memcpy(vd
, &scratch
, reg_max
);
6782 static inline QEMU_ALWAYS_INLINE
6783 void sve_ld1_z_mte(CPUARMState
*env
, void *vd
, uint64_t *vg
, void *vm
,
6784 target_ulong base
, uint32_t desc
, uintptr_t retaddr
,
6785 int esize
, int msize
, zreg_off_fn
*off_fn
,
6786 sve_ldst1_host_fn
*host_fn
,
6787 sve_ldst1_tlb_fn
*tlb_fn
)
6789 uint32_t mtedesc
= desc
>> (SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
6790 /* Remove mtedesc from the normal sve descriptor. */
6791 desc
= extract32(desc
, 0, SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
6794 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6795 * offset base entirely over the address space hole to change the
6796 * pointer tag, or change the bit55 selector. So we could here
6797 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6799 sve_ld1_z(env
, vd
, vg
, vm
, base
, desc
, retaddr
, mtedesc
,
6800 esize
, msize
, off_fn
, host_fn
, tlb_fn
);
6803 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6804 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6805 void *vm, target_ulong base, uint32_t desc) \
6807 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
6808 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6810 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6811 void *vm, target_ulong base, uint32_t desc) \
6813 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
6814 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6817 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6818 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6819 void *vm, target_ulong base, uint32_t desc) \
6821 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
6822 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6824 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6825 void *vm, target_ulong base, uint32_t desc) \
6827 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
6828 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6831 DO_LD1_ZPZ_S(bsu
, zsu
, MO_8
)
6832 DO_LD1_ZPZ_S(bsu
, zss
, MO_8
)
6833 DO_LD1_ZPZ_D(bdu
, zsu
, MO_8
)
6834 DO_LD1_ZPZ_D(bdu
, zss
, MO_8
)
6835 DO_LD1_ZPZ_D(bdu
, zd
, MO_8
)
6837 DO_LD1_ZPZ_S(bss
, zsu
, MO_8
)
6838 DO_LD1_ZPZ_S(bss
, zss
, MO_8
)
6839 DO_LD1_ZPZ_D(bds
, zsu
, MO_8
)
6840 DO_LD1_ZPZ_D(bds
, zss
, MO_8
)
6841 DO_LD1_ZPZ_D(bds
, zd
, MO_8
)
6843 DO_LD1_ZPZ_S(hsu_le
, zsu
, MO_16
)
6844 DO_LD1_ZPZ_S(hsu_le
, zss
, MO_16
)
6845 DO_LD1_ZPZ_D(hdu_le
, zsu
, MO_16
)
6846 DO_LD1_ZPZ_D(hdu_le
, zss
, MO_16
)
6847 DO_LD1_ZPZ_D(hdu_le
, zd
, MO_16
)
6849 DO_LD1_ZPZ_S(hsu_be
, zsu
, MO_16
)
6850 DO_LD1_ZPZ_S(hsu_be
, zss
, MO_16
)
6851 DO_LD1_ZPZ_D(hdu_be
, zsu
, MO_16
)
6852 DO_LD1_ZPZ_D(hdu_be
, zss
, MO_16
)
6853 DO_LD1_ZPZ_D(hdu_be
, zd
, MO_16
)
6855 DO_LD1_ZPZ_S(hss_le
, zsu
, MO_16
)
6856 DO_LD1_ZPZ_S(hss_le
, zss
, MO_16
)
6857 DO_LD1_ZPZ_D(hds_le
, zsu
, MO_16
)
6858 DO_LD1_ZPZ_D(hds_le
, zss
, MO_16
)
6859 DO_LD1_ZPZ_D(hds_le
, zd
, MO_16
)
6861 DO_LD1_ZPZ_S(hss_be
, zsu
, MO_16
)
6862 DO_LD1_ZPZ_S(hss_be
, zss
, MO_16
)
6863 DO_LD1_ZPZ_D(hds_be
, zsu
, MO_16
)
6864 DO_LD1_ZPZ_D(hds_be
, zss
, MO_16
)
6865 DO_LD1_ZPZ_D(hds_be
, zd
, MO_16
)
6867 DO_LD1_ZPZ_S(ss_le
, zsu
, MO_32
)
6868 DO_LD1_ZPZ_S(ss_le
, zss
, MO_32
)
6869 DO_LD1_ZPZ_D(sdu_le
, zsu
, MO_32
)
6870 DO_LD1_ZPZ_D(sdu_le
, zss
, MO_32
)
6871 DO_LD1_ZPZ_D(sdu_le
, zd
, MO_32
)
6873 DO_LD1_ZPZ_S(ss_be
, zsu
, MO_32
)
6874 DO_LD1_ZPZ_S(ss_be
, zss
, MO_32
)
6875 DO_LD1_ZPZ_D(sdu_be
, zsu
, MO_32
)
6876 DO_LD1_ZPZ_D(sdu_be
, zss
, MO_32
)
6877 DO_LD1_ZPZ_D(sdu_be
, zd
, MO_32
)
6879 DO_LD1_ZPZ_D(sds_le
, zsu
, MO_32
)
6880 DO_LD1_ZPZ_D(sds_le
, zss
, MO_32
)
6881 DO_LD1_ZPZ_D(sds_le
, zd
, MO_32
)
6883 DO_LD1_ZPZ_D(sds_be
, zsu
, MO_32
)
6884 DO_LD1_ZPZ_D(sds_be
, zss
, MO_32
)
6885 DO_LD1_ZPZ_D(sds_be
, zd
, MO_32
)
6887 DO_LD1_ZPZ_D(dd_le
, zsu
, MO_64
)
6888 DO_LD1_ZPZ_D(dd_le
, zss
, MO_64
)
6889 DO_LD1_ZPZ_D(dd_le
, zd
, MO_64
)
6891 DO_LD1_ZPZ_D(dd_be
, zsu
, MO_64
)
6892 DO_LD1_ZPZ_D(dd_be
, zss
, MO_64
)
6893 DO_LD1_ZPZ_D(dd_be
, zd
, MO_64
)
6898 /* First fault loads with a vector index. */
6901 * Common helpers for all gather first-faulting loads.
6904 static inline QEMU_ALWAYS_INLINE
6905 void sve_ldff1_z(CPUARMState
*env
, void *vd
, uint64_t *vg
, void *vm
,
6906 target_ulong base
, uint32_t desc
, uintptr_t retaddr
,
6907 uint32_t mtedesc
, const int esz
, const int msz
,
6908 zreg_off_fn
*off_fn
,
6909 sve_ldst1_host_fn
*host_fn
,
6910 sve_ldst1_tlb_fn
*tlb_fn
)
6912 const int mmu_idx
= cpu_mmu_index(env
, false);
6913 const intptr_t reg_max
= simd_oprsz(desc
);
6914 const int scale
= simd_data(desc
);
6915 const int esize
= 1 << esz
;
6916 const int msize
= 1 << msz
;
6919 target_ulong addr
, in_page
;
6921 /* Skip to the first true predicate. */
6922 reg_off
= find_next_active(vg
, 0, reg_max
, esz
);
6923 if (unlikely(reg_off
>= reg_max
)) {
6924 /* The entire predicate was false; no load occurs. */
6925 memset(vd
, 0, reg_max
);
6930 * Probe the first element, allowing faults.
6932 addr
= base
+ (off_fn(vm
, reg_off
) << scale
);
6934 mte_check(env
, mtedesc
, addr
, retaddr
);
6936 tlb_fn(env
, vd
, reg_off
, addr
, retaddr
);
6938 /* After any fault, zero the other elements. */
6939 swap_memzero(vd
, reg_off
);
6941 swap_memzero(vd
+ reg_off
, reg_max
- reg_off
);
6944 * Probe the remaining elements, not allowing faults.
6946 while (reg_off
< reg_max
) {
6947 uint64_t pg
= vg
[reg_off
>> 6];
6949 if (likely((pg
>> (reg_off
& 63)) & 1)) {
6950 addr
= base
+ (off_fn(vm
, reg_off
) << scale
);
6951 in_page
= -(addr
| TARGET_PAGE_MASK
);
6953 if (unlikely(in_page
< msize
)) {
6954 /* Stop if the element crosses a page boundary. */
6958 sve_probe_page(&info
, true, env
, addr
, 0, MMU_DATA_LOAD
,
6960 if (unlikely(info
.flags
& (TLB_INVALID_MASK
| TLB_MMIO
))) {
6963 if (unlikely(info
.flags
& TLB_WATCHPOINT
) &&
6964 (cpu_watchpoint_address_matches
6965 (env_cpu(env
), addr
, msize
) & BP_MEM_READ
)) {
6969 arm_tlb_mte_tagged(&info
.attrs
) &&
6970 !mte_probe(env
, mtedesc
, addr
)) {
6974 host_fn(vd
, reg_off
, info
.host
);
6977 } while (reg_off
& 63);
6982 record_fault(env
, reg_off
, reg_max
);
6985 static inline QEMU_ALWAYS_INLINE
6986 void sve_ldff1_z_mte(CPUARMState
*env
, void *vd
, uint64_t *vg
, void *vm
,
6987 target_ulong base
, uint32_t desc
, uintptr_t retaddr
,
6988 const int esz
, const int msz
,
6989 zreg_off_fn
*off_fn
,
6990 sve_ldst1_host_fn
*host_fn
,
6991 sve_ldst1_tlb_fn
*tlb_fn
)
6993 uint32_t mtedesc
= desc
>> (SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
6994 /* Remove mtedesc from the normal sve descriptor. */
6995 desc
= extract32(desc
, 0, SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
6998 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6999 * offset base entirely over the address space hole to change the
7000 * pointer tag, or change the bit55 selector. So we could here
7001 * examine TBI + TCMA like we do for sve_ldN_r_mte().
7003 sve_ldff1_z(env
, vd
, vg
, vm
, base
, desc
, retaddr
, mtedesc
,
7004 esz
, msz
, off_fn
, host_fn
, tlb_fn
);
7007 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
7008 void HELPER(sve_ldff##MEM##_##OFS) \
7009 (CPUARMState *env, void *vd, void *vg, \
7010 void *vm, target_ulong base, uint32_t desc) \
7012 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
7013 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7015 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
7016 (CPUARMState *env, void *vd, void *vg, \
7017 void *vm, target_ulong base, uint32_t desc) \
7019 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
7020 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7023 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
7024 void HELPER(sve_ldff##MEM##_##OFS) \
7025 (CPUARMState *env, void *vd, void *vg, \
7026 void *vm, target_ulong base, uint32_t desc) \
7028 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
7029 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7031 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
7032 (CPUARMState *env, void *vd, void *vg, \
7033 void *vm, target_ulong base, uint32_t desc) \
7035 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
7036 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7039 DO_LDFF1_ZPZ_S(bsu
, zsu
, MO_8
)
7040 DO_LDFF1_ZPZ_S(bsu
, zss
, MO_8
)
7041 DO_LDFF1_ZPZ_D(bdu
, zsu
, MO_8
)
7042 DO_LDFF1_ZPZ_D(bdu
, zss
, MO_8
)
7043 DO_LDFF1_ZPZ_D(bdu
, zd
, MO_8
)
7045 DO_LDFF1_ZPZ_S(bss
, zsu
, MO_8
)
7046 DO_LDFF1_ZPZ_S(bss
, zss
, MO_8
)
7047 DO_LDFF1_ZPZ_D(bds
, zsu
, MO_8
)
7048 DO_LDFF1_ZPZ_D(bds
, zss
, MO_8
)
7049 DO_LDFF1_ZPZ_D(bds
, zd
, MO_8
)
7051 DO_LDFF1_ZPZ_S(hsu_le
, zsu
, MO_16
)
7052 DO_LDFF1_ZPZ_S(hsu_le
, zss
, MO_16
)
7053 DO_LDFF1_ZPZ_D(hdu_le
, zsu
, MO_16
)
7054 DO_LDFF1_ZPZ_D(hdu_le
, zss
, MO_16
)
7055 DO_LDFF1_ZPZ_D(hdu_le
, zd
, MO_16
)
7057 DO_LDFF1_ZPZ_S(hsu_be
, zsu
, MO_16
)
7058 DO_LDFF1_ZPZ_S(hsu_be
, zss
, MO_16
)
7059 DO_LDFF1_ZPZ_D(hdu_be
, zsu
, MO_16
)
7060 DO_LDFF1_ZPZ_D(hdu_be
, zss
, MO_16
)
7061 DO_LDFF1_ZPZ_D(hdu_be
, zd
, MO_16
)
7063 DO_LDFF1_ZPZ_S(hss_le
, zsu
, MO_16
)
7064 DO_LDFF1_ZPZ_S(hss_le
, zss
, MO_16
)
7065 DO_LDFF1_ZPZ_D(hds_le
, zsu
, MO_16
)
7066 DO_LDFF1_ZPZ_D(hds_le
, zss
, MO_16
)
7067 DO_LDFF1_ZPZ_D(hds_le
, zd
, MO_16
)
7069 DO_LDFF1_ZPZ_S(hss_be
, zsu
, MO_16
)
7070 DO_LDFF1_ZPZ_S(hss_be
, zss
, MO_16
)
7071 DO_LDFF1_ZPZ_D(hds_be
, zsu
, MO_16
)
7072 DO_LDFF1_ZPZ_D(hds_be
, zss
, MO_16
)
7073 DO_LDFF1_ZPZ_D(hds_be
, zd
, MO_16
)
7075 DO_LDFF1_ZPZ_S(ss_le
, zsu
, MO_32
)
7076 DO_LDFF1_ZPZ_S(ss_le
, zss
, MO_32
)
7077 DO_LDFF1_ZPZ_D(sdu_le
, zsu
, MO_32
)
7078 DO_LDFF1_ZPZ_D(sdu_le
, zss
, MO_32
)
7079 DO_LDFF1_ZPZ_D(sdu_le
, zd
, MO_32
)
7081 DO_LDFF1_ZPZ_S(ss_be
, zsu
, MO_32
)
7082 DO_LDFF1_ZPZ_S(ss_be
, zss
, MO_32
)
7083 DO_LDFF1_ZPZ_D(sdu_be
, zsu
, MO_32
)
7084 DO_LDFF1_ZPZ_D(sdu_be
, zss
, MO_32
)
7085 DO_LDFF1_ZPZ_D(sdu_be
, zd
, MO_32
)
7087 DO_LDFF1_ZPZ_D(sds_le
, zsu
, MO_32
)
7088 DO_LDFF1_ZPZ_D(sds_le
, zss
, MO_32
)
7089 DO_LDFF1_ZPZ_D(sds_le
, zd
, MO_32
)
7091 DO_LDFF1_ZPZ_D(sds_be
, zsu
, MO_32
)
7092 DO_LDFF1_ZPZ_D(sds_be
, zss
, MO_32
)
7093 DO_LDFF1_ZPZ_D(sds_be
, zd
, MO_32
)
7095 DO_LDFF1_ZPZ_D(dd_le
, zsu
, MO_64
)
7096 DO_LDFF1_ZPZ_D(dd_le
, zss
, MO_64
)
7097 DO_LDFF1_ZPZ_D(dd_le
, zd
, MO_64
)
7099 DO_LDFF1_ZPZ_D(dd_be
, zsu
, MO_64
)
7100 DO_LDFF1_ZPZ_D(dd_be
, zss
, MO_64
)
7101 DO_LDFF1_ZPZ_D(dd_be
, zd
, MO_64
)
7103 /* Stores with a vector index. */
7105 static inline QEMU_ALWAYS_INLINE
7106 void sve_st1_z(CPUARMState
*env
, void *vd
, uint64_t *vg
, void *vm
,
7107 target_ulong base
, uint32_t desc
, uintptr_t retaddr
,
7108 uint32_t mtedesc
, int esize
, int msize
,
7109 zreg_off_fn
*off_fn
,
7110 sve_ldst1_host_fn
*host_fn
,
7111 sve_ldst1_tlb_fn
*tlb_fn
)
7113 const int mmu_idx
= cpu_mmu_index(env
, false);
7114 const intptr_t reg_max
= simd_oprsz(desc
);
7115 const int scale
= simd_data(desc
);
7116 void *host
[ARM_MAX_VQ
* 4];
7117 intptr_t reg_off
, i
;
7118 SVEHostPage info
, info2
;
7121 * Probe all of the elements for host addresses and flags.
7125 uint64_t pg
= vg
[reg_off
>> 6];
7127 target_ulong addr
= base
+ (off_fn(vm
, reg_off
) << scale
);
7128 target_ulong in_page
= -(addr
| TARGET_PAGE_MASK
);
7131 if (likely((pg
>> (reg_off
& 63)) & 1)) {
7132 if (likely(in_page
>= msize
)) {
7133 sve_probe_page(&info
, false, env
, addr
, 0, MMU_DATA_STORE
,
7135 host
[i
] = info
.host
;
7138 * Element crosses the page boundary.
7139 * Probe both pages, but do not record the host address,
7140 * so that we use the slow path.
7142 sve_probe_page(&info
, false, env
, addr
, 0,
7143 MMU_DATA_STORE
, mmu_idx
, retaddr
);
7144 sve_probe_page(&info2
, false, env
, addr
+ in_page
, 0,
7145 MMU_DATA_STORE
, mmu_idx
, retaddr
);
7146 info
.flags
|= info2
.flags
;
7149 if (unlikely(info
.flags
& TLB_WATCHPOINT
)) {
7150 cpu_check_watchpoint(env_cpu(env
), addr
, msize
,
7151 info
.attrs
, BP_MEM_WRITE
, retaddr
);
7154 if (mtedesc
&& arm_tlb_mte_tagged(&info
.attrs
)) {
7155 mte_check(env
, mtedesc
, addr
, retaddr
);
7160 } while (reg_off
& 63);
7161 } while (reg_off
< reg_max
);
7164 * Now that we have recognized all exceptions except SyncExternal
7165 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
7167 * Note for the common case of an element in RAM, not crossing a page
7168 * boundary, we have stored the host address in host[]. This doubles
7169 * as a first-level check against the predicate, since only enabled
7170 * elements have non-null host addresses.
7175 if (likely(h
!= NULL
)) {
7176 host_fn(vd
, reg_off
, h
);
7177 } else if ((vg
[reg_off
>> 6] >> (reg_off
& 63)) & 1) {
7178 target_ulong addr
= base
+ (off_fn(vm
, reg_off
) << scale
);
7179 tlb_fn(env
, vd
, reg_off
, addr
, retaddr
);
7183 } while (reg_off
< reg_max
);
7186 static inline QEMU_ALWAYS_INLINE
7187 void sve_st1_z_mte(CPUARMState
*env
, void *vd
, uint64_t *vg
, void *vm
,
7188 target_ulong base
, uint32_t desc
, uintptr_t retaddr
,
7189 int esize
, int msize
, zreg_off_fn
*off_fn
,
7190 sve_ldst1_host_fn
*host_fn
,
7191 sve_ldst1_tlb_fn
*tlb_fn
)
7193 uint32_t mtedesc
= desc
>> (SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
7194 /* Remove mtedesc from the normal sve descriptor. */
7195 desc
= extract32(desc
, 0, SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
7198 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7199 * offset base entirely over the address space hole to change the
7200 * pointer tag, or change the bit55 selector. So we could here
7201 * examine TBI + TCMA like we do for sve_ldN_r_mte().
7203 sve_st1_z(env
, vd
, vg
, vm
, base
, desc
, retaddr
, mtedesc
,
7204 esize
, msize
, off_fn
, host_fn
, tlb_fn
);
7207 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
7208 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7209 void *vm, target_ulong base, uint32_t desc) \
7211 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
7212 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7214 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7215 void *vm, target_ulong base, uint32_t desc) \
7217 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
7218 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7221 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
7222 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7223 void *vm, target_ulong base, uint32_t desc) \
7225 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
7226 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7228 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7229 void *vm, target_ulong base, uint32_t desc) \
7231 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
7232 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7235 DO_ST1_ZPZ_S(bs
, zsu
, MO_8
)
7236 DO_ST1_ZPZ_S(hs_le
, zsu
, MO_16
)
7237 DO_ST1_ZPZ_S(hs_be
, zsu
, MO_16
)
7238 DO_ST1_ZPZ_S(ss_le
, zsu
, MO_32
)
7239 DO_ST1_ZPZ_S(ss_be
, zsu
, MO_32
)
7241 DO_ST1_ZPZ_S(bs
, zss
, MO_8
)
7242 DO_ST1_ZPZ_S(hs_le
, zss
, MO_16
)
7243 DO_ST1_ZPZ_S(hs_be
, zss
, MO_16
)
7244 DO_ST1_ZPZ_S(ss_le
, zss
, MO_32
)
7245 DO_ST1_ZPZ_S(ss_be
, zss
, MO_32
)
7247 DO_ST1_ZPZ_D(bd
, zsu
, MO_8
)
7248 DO_ST1_ZPZ_D(hd_le
, zsu
, MO_16
)
7249 DO_ST1_ZPZ_D(hd_be
, zsu
, MO_16
)
7250 DO_ST1_ZPZ_D(sd_le
, zsu
, MO_32
)
7251 DO_ST1_ZPZ_D(sd_be
, zsu
, MO_32
)
7252 DO_ST1_ZPZ_D(dd_le
, zsu
, MO_64
)
7253 DO_ST1_ZPZ_D(dd_be
, zsu
, MO_64
)
7255 DO_ST1_ZPZ_D(bd
, zss
, MO_8
)
7256 DO_ST1_ZPZ_D(hd_le
, zss
, MO_16
)
7257 DO_ST1_ZPZ_D(hd_be
, zss
, MO_16
)
7258 DO_ST1_ZPZ_D(sd_le
, zss
, MO_32
)
7259 DO_ST1_ZPZ_D(sd_be
, zss
, MO_32
)
7260 DO_ST1_ZPZ_D(dd_le
, zss
, MO_64
)
7261 DO_ST1_ZPZ_D(dd_be
, zss
, MO_64
)
7263 DO_ST1_ZPZ_D(bd
, zd
, MO_8
)
7264 DO_ST1_ZPZ_D(hd_le
, zd
, MO_16
)
7265 DO_ST1_ZPZ_D(hd_be
, zd
, MO_16
)
7266 DO_ST1_ZPZ_D(sd_le
, zd
, MO_32
)
7267 DO_ST1_ZPZ_D(sd_be
, zd
, MO_32
)
7268 DO_ST1_ZPZ_D(dd_le
, zd
, MO_64
)
7269 DO_ST1_ZPZ_D(dd_be
, zd
, MO_64
)
7274 void HELPER(sve2_eor3
)(void *vd
, void *vn
, void *vm
, void *vk
, uint32_t desc
)
7276 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
7277 uint64_t *d
= vd
, *n
= vn
, *m
= vm
, *k
= vk
;
7279 for (i
= 0; i
< opr_sz
; ++i
) {
7280 d
[i
] = n
[i
] ^ m
[i
] ^ k
[i
];
7284 void HELPER(sve2_bcax
)(void *vd
, void *vn
, void *vm
, void *vk
, uint32_t desc
)
7286 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
7287 uint64_t *d
= vd
, *n
= vn
, *m
= vm
, *k
= vk
;
7289 for (i
= 0; i
< opr_sz
; ++i
) {
7290 d
[i
] = n
[i
] ^ (m
[i
] & ~k
[i
]);
7294 void HELPER(sve2_bsl1n
)(void *vd
, void *vn
, void *vm
, void *vk
, uint32_t desc
)
7296 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
7297 uint64_t *d
= vd
, *n
= vn
, *m
= vm
, *k
= vk
;
7299 for (i
= 0; i
< opr_sz
; ++i
) {
7300 d
[i
] = (~n
[i
] & k
[i
]) | (m
[i
] & ~k
[i
]);
7304 void HELPER(sve2_bsl2n
)(void *vd
, void *vn
, void *vm
, void *vk
, uint32_t desc
)
7306 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
7307 uint64_t *d
= vd
, *n
= vn
, *m
= vm
, *k
= vk
;
7309 for (i
= 0; i
< opr_sz
; ++i
) {
7310 d
[i
] = (n
[i
] & k
[i
]) | (~m
[i
] & ~k
[i
]);
7314 void HELPER(sve2_nbsl
)(void *vd
, void *vn
, void *vm
, void *vk
, uint32_t desc
)
7316 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
7317 uint64_t *d
= vd
, *n
= vn
, *m
= vm
, *k
= vk
;
7319 for (i
= 0; i
< opr_sz
; ++i
) {
7320 d
[i
] = ~((n
[i
] & k
[i
]) | (m
[i
] & ~k
[i
]));
7325 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
7326 * See hasless(v,1) from
7327 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
7329 static inline bool do_match2(uint64_t n
, uint64_t m0
, uint64_t m1
, int esz
)
7331 int bits
= 8 << esz
;
7332 uint64_t ones
= dup_const(esz
, 1);
7333 uint64_t signs
= ones
<< (bits
- 1);
7334 uint64_t cmp0
, cmp1
;
7336 cmp1
= dup_const(esz
, n
);
7339 cmp0
= (cmp0
- ones
) & ~cmp0
;
7340 cmp1
= (cmp1
- ones
) & ~cmp1
;
7341 return (cmp0
| cmp1
) & signs
;
/*
 * Common helper for SVE2 MATCH/NMATCH: for each active element of VN,
 * set the destination predicate bit when the element is present
 * (or, for nmatch, absent) in the corresponding 16-byte segment of VM.
 * Returns NZCV flags per the PredTest pseudofunction.
 */
static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
                                uint32_t desc, int esz, bool nmatch)
{
    /* Mask selecting the predicate bits that are valid for this esz. */
    uint16_t esz_mask = pred_esz_masks[esz];
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t flags = PREDTEST_INIT;
    intptr_t i, j, k;

    /* Process the vector one 128-bit segment at a time. */
    for (i = 0; i < opr_sz; i += 16) {
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        /* Predicate bits covering this segment (one bit per byte). */
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
        uint16_t out = 0;

        for (j = 0; j < 16; j += 8) {
            uint64_t n = *(uint64_t *)(vn + i + j);

            for (k = 0; k < 8; k += 1 << esz) {
                if (pg & (1 << (j + k))) {
                    /* Low element of (n >> k*8) is the value to search. */
                    bool o = do_match2(n >> (k * 8), m0, m1, esz);
                    /* nmatch inverts the sense of the comparison. */
                    out |= (o ^ nmatch) << (j + k);
                }
            }
        }
        *(uint16_t *)(vd + H1_2(i >> 3)) = out;
        flags = iter_predtest_fwd(out, pg, flags);
    }
    return flags;
}
/* Expand the MATCH/NMATCH helpers for byte and halfword elements. */
#define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                             \
    return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
}

DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)

DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)

#undef DO_PPZZ_MATCH
/*
 * SVE2 HISTCNT (32-bit elements): for each active element i of VN,
 * store in VD the count of active elements of VM at index <= i that
 * equal it; inactive destination elements are written as 0.
 */
void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    /* The destination may alias an input; copy inputs aside if so. */
    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz; i += 4) {
        uint64_t count = 0;
        uint8_t pred;

        /* Predicate bit for element i (one bit per byte of the vector). */
        pred = pg[H1(i >> 3)] >> (i & 7);
        if (pred & 1) {
            uint32_t nn = n[H4(i >> 2)];

            /* Count matching active elements of M at or below index i. */
            for (j = 0; j <= i; j += 4) {
                pred = pg[H1(j >> 3)] >> (j & 7);
                if ((pred & 1) && nn == m[H4(j >> 2)]) {
                    ++count;
                }
            }
        }
        d[H4(i >> 2)] = count;
    }
}
/*
 * SVE2 HISTCNT (64-bit elements): as sve2_histcnt_s, but with one
 * predicate bit per 8-byte element.
 */
void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    /* The destination may alias an input; copy inputs aside if so. */
    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t count = 0;
        if (pg[H1(i)] & 1) {
            uint64_t nn = n[i];
            /* Count matching active elements of M at or below index i. */
            for (j = 0; j <= i; ++j) {
                if ((pg[H1(j)] & 1) && nn == m[j]) {
                    ++count;
                }
            }
        }
        d[i] = count;
    }
}
/*
 * Returns the number of bytes in m0 and m1 that match n.
 * Unlike do_match2 we don't just need true/false, we need an exact count.
 * This requires two extra logical operations.
 */
static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
{
    const uint64_t mask = dup_const(MO_8, 0x7f);
    uint64_t cmp0, cmp1;

    /* Replicate the search byte; after xor, a matching byte is zero. */
    cmp1 = dup_const(MO_8, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;

    /*
     * 1: clear msb of each byte to avoid carry to next byte (& mask)
     * 2: carry in to msb if byte != 0 (+ mask)
     * 3: set msb if cmp has msb set (| cmp)
     * 4: set ~msb to ignore them (| mask)
     * We now have 0xff for byte != 0 or 0x7f for byte == 0.
     * 5: invert, resulting in 0x80 if and only if byte == 0.
     */
    cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
    cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);

    /*
     * Combine the two compares in a way that the bits do
     * not overlap, and so preserves the count of set bits.
     * If the host has an efficient instruction for ctpop,
     * then ctpop(x) + ctpop(y) has the same number of
     * operations as ctpop(x | (y >> 1)).  If the host does
     * not have an efficient ctpop, then we only want to
     * use it once.
     */
    return ctpop64(cmp0 | (cmp1 >> 1));
}
7494 void HELPER(sve2_histseg
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
7497 intptr_t opr_sz
= simd_oprsz(desc
);
7499 for (i
= 0; i
< opr_sz
; i
+= 16) {
7500 uint64_t n0
= *(uint64_t *)(vn
+ i
);
7501 uint64_t m0
= *(uint64_t *)(vm
+ i
);
7502 uint64_t n1
= *(uint64_t *)(vn
+ i
+ 8);
7503 uint64_t m1
= *(uint64_t *)(vm
+ i
+ 8);
7507 for (j
= 0; j
< 64; j
+= 8) {
7508 uint64_t cnt0
= do_histseg_cnt(n0
>> j
, m0
, m1
);
7509 uint64_t cnt1
= do_histseg_cnt(n1
>> j
, m0
, m1
);
7514 *(uint64_t *)(vd
+ i
) = out0
;
7515 *(uint64_t *)(vd
+ i
+ 8) = out1
;
7519 void HELPER(sve2_xar_b
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
7521 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
7522 int shr
= simd_data(desc
);
7524 uint64_t mask
= dup_const(MO_8
, 0xff >> shr
);
7525 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
7527 for (i
= 0; i
< opr_sz
; ++i
) {
7528 uint64_t t
= n
[i
] ^ m
[i
];
7529 d
[i
] = ((t
>> shr
) & mask
) | ((t
<< shl
) & ~mask
);
7533 void HELPER(sve2_xar_h
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
7535 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
7536 int shr
= simd_data(desc
);
7538 uint64_t mask
= dup_const(MO_16
, 0xffff >> shr
);
7539 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
7541 for (i
= 0; i
< opr_sz
; ++i
) {
7542 uint64_t t
= n
[i
] ^ m
[i
];
7543 d
[i
] = ((t
>> shr
) & mask
) | ((t
<< shl
) & ~mask
);
7547 void HELPER(sve2_xar_s
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
7549 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 4;
7550 int shr
= simd_data(desc
);
7551 uint32_t *d
= vd
, *n
= vn
, *m
= vm
;
7553 for (i
= 0; i
< opr_sz
; ++i
) {
7554 d
[i
] = ror32(n
[i
] ^ m
[i
], shr
);
/*
 * FMMLA (single precision): for each 128-bit segment, treat N, M and A
 * as 2x2 float32 matrices and compute D = A + N * M using softfloat
 * (status carries rounding mode and accumulates exception flags).
 */
void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);

    for (s = 0; s < opr_sz; ++s) {
        /* Pointers to the four-element segment for each operand. */
        float32 *n = vn + s * sizeof(float32) * 4;
        float32 *m = vm + s * sizeof(float32) * 4;
        float32 *a = va + s * sizeof(float32) * 4;
        float32 *d = vd + s * sizeof(float32) * 4;
        /* Row-major 2x2 elements: nIJ is row I, column J. */
        float32 n00 = n[H4(0)], n01 = n[H4(1)];
        float32 n10 = n[H4(2)], n11 = n[H4(3)];
        float32 m00 = m[H4(0)], m01 = m[H4(1)];
        float32 m10 = m[H4(2)], m11 = m[H4(3)];
        float32 p0, p1;

        /* i = 0, j = 0 */
        p0 = float32_mul(n00, m00, status);
        p1 = float32_mul(n01, m01, status);
        d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float32_mul(n00, m10, status);
        p1 = float32_mul(n01, m11, status);
        d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float32_mul(n10, m00, status);
        p1 = float32_mul(n11, m01, status);
        d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float32_mul(n10, m10, status);
        p1 = float32_mul(n11, m11, status);
        d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
    }
}
/*
 * FMMLA (double precision): for each 256-bit segment, treat N, M and A
 * as 2x2 float64 matrices and compute D = A + N * M using softfloat
 * (status carries rounding mode and accumulates exception flags).
 */
void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);

    for (s = 0; s < opr_sz; ++s) {
        /* Pointers to the four-element segment for each operand. */
        float64 *n = vn + s * sizeof(float64) * 4;
        float64 *m = vm + s * sizeof(float64) * 4;
        float64 *a = va + s * sizeof(float64) * 4;
        float64 *d = vd + s * sizeof(float64) * 4;
        /* Row-major 2x2 elements: nIJ is row I, column J. */
        float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
        float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
        float64 p0, p1;

        /* i = 0, j = 0 */
        p0 = float64_mul(n00, m00, status);
        p1 = float64_mul(n01, m01, status);
        d[0] = float64_add(a[0], float64_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float64_mul(n00, m10, status);
        p1 = float64_mul(n01, m11, status);
        d[1] = float64_add(a[1], float64_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float64_mul(n10, m00, status);
        p1 = float64_mul(n11, m01, status);
        d[2] = float64_add(a[2], float64_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float64_mul(n10, m10, status);
        p1 = float64_mul(n11, m11, status);
        d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
    }
}
/*
 * Float convert narrow (top): convert each active wide element and
 * store the narrow result in the high half of the wide slot.
 * Iterates from the top of the vector downwards, one predicate word
 * (64 vector bits) at a time, so that vd may alias vn.
 */
#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
{                                                                             \
    intptr_t i = simd_oprsz(desc);                                            \
    uint64_t *g = vg;                                                         \
    do {                                                                      \
        uint64_t pg = g[(i - 1) >> 6];                                        \
        do {                                                                  \
            i -= sizeof(TYPEW);                                               \
            if (likely((pg >> (i & 63)) & 1)) {                               \
                TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
                *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
            }                                                                 \
        } while (i & 63);                                                     \
    } while (i != 0);                                                         \
}

DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
/*
 * Float convert long (top): widen the narrow element held in the high
 * half of each active wide slot and store the wide result in place.
 * Iterates from the top of the vector downwards, one predicate word
 * (64 vector bits) at a time, so that vd may alias vn.
 */
#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
{                                                                             \
    intptr_t i = simd_oprsz(desc);                                            \
    uint64_t *g = vg;                                                         \
    do {                                                                      \
        uint64_t pg = g[(i - 1) >> 6];                                        \
        do {                                                                  \
            i -= sizeof(TYPEW);                                               \
            if (likely((pg >> (i & 63)) & 1)) {                               \
                TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
                *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
            }                                                                 \
        } while (i & 63);                                                     \
    } while (i != 0);                                                         \
}

DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)