target/arm: Rewrite vector gather stores
target/arm/sve_helper.c
1 /*
2 * ARM SVE Operations
4 * Copyright (c) 2018 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/exec-all.h"
23 #include "exec/cpu_ldst.h"
24 #include "exec/helper-proto.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "fpu/softfloat.h"
29 /* Note that vector data is stored in host-endian 64-bit chunks,
30 so addressing units smaller than that need a host-endian fixup. */
31 #ifdef HOST_WORDS_BIGENDIAN
32 #define H1(x) ((x) ^ 7)
33 #define H1_2(x) ((x) ^ 6)
34 #define H1_4(x) ((x) ^ 4)
35 #define H2(x) ((x) ^ 3)
36 #define H4(x) ((x) ^ 1)
37 #else
38 #define H1(x) (x)
39 #define H1_2(x) (x)
40 #define H1_4(x) (x)
41 #define H2(x) (x)
42 #define H4(x) (x)
43 #endif
45 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
47 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
48 * and bit 0 set if C is set. Compare the definitions of these variables
49 * within CPUARMState.
52 /* For no G bits set, NZCV = C. */
53 #define PREDTEST_INIT 1
55 /* This is an iterative function, called for each Pd and Pg word
56 * moving forward.
58 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
60 if (likely(g)) {
61 /* Compute N from first D & G.
62 Use bit 2 to signal first G bit seen. */
63 if (!(flags & 4)) {
64 flags |= ((d & (g & -g)) != 0) << 31;
65 flags |= 4;
68 /* Accumulate Z from each D & G. */
69 flags |= ((d & g) != 0) << 1;
71 /* Compute C from last !(D & G). Replace previous. */
72 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
74 return flags;
77 /* This is an iterative function, called for each Pd and Pg word
78 * moving backward.
80 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
82 if (likely(g)) {
83 /* Compute C from first (i.e. last) !(D & G).
84 Use bit 2 to signal first G bit seen. */
85 if (!(flags & 4)) {
86 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
87 flags |= (d & pow2floor(g)) == 0;
90 /* Accumulate Z from each D & G. */
91 flags |= ((d & g) != 0) << 1;
93 /* Compute N from last (i.e. first) D & G. Replace previous. */
94 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
96 return flags;
99 /* The same for a single word predicate. */
100 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
102 return iter_predtest_fwd(d, g, PREDTEST_INIT);
105 /* The same for a multi-word predicate. */
106 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
108 uint32_t flags = PREDTEST_INIT;
109 uint64_t *d = vd, *g = vg;
110 uintptr_t i = 0;
112 do {
113 flags = iter_predtest_fwd(d[i], g[i], flags);
114 } while (++i < words);
116 return flags;
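/*
 * Illustrative sketch, not part of the original file: how the flag
 * encoding above behaves for one predicate word governing the first
 * eight byte elements (G = 0xff) with only element 0 true (D = 0x01).
 * The first active element is true (N, bit 31), at least one active
 * element is true (Z clear, bit 1), and the last active element is
 * false (C, bit 0).
 */
static inline uint32_t predtest_flags_example(void)
{
    uint32_t flags = iter_predtest_fwd(0x01, 0xff, PREDTEST_INIT);
    /* Expect bits 31, 1 and 0 to be set in the result. */
    return flags;
}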
119 /* Expand active predicate bits to bytes, for byte elements.
120 * for (i = 0; i < 256; ++i) {
121 * unsigned long m = 0;
122 * for (j = 0; j < 8; j++) {
123 * if ((i >> j) & 1) {
124 * m |= 0xfful << (j << 3);
127 * printf("0x%016lx,\n", m);
130 static inline uint64_t expand_pred_b(uint8_t byte)
132 static const uint64_t word[256] = {
133 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
134 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
135 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
136 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
137 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
138 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
139 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
140 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
141 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
142 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
143 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
144 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
145 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
146 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
147 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
148 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
149 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
150 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
151 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
152 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
153 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
154 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
155 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
156 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
157 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
158 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
159 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
160 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
161 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
162 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
163 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
164 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
165 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
166 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
167 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
168 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
169 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
170 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
171 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
172 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
173 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
174 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
175 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
176 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
177 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
178 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
179 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
180 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
181 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
182 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
183 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
184 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
185 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
186 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
187 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
188 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
189 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
190 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
191 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
192 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
193 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
194 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
195 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
196 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
197 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
198 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
199 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
200 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
201 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
202 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
203 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
204 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
205 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
206 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
207 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
208 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
209 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
210 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
211 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
212 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
213 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
214 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
215 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
216 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
217 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
218 0xffffffffffffffff,
220 return word[byte];
223 /* Similarly for half-word elements.
224 * for (i = 0; i < 256; ++i) {
225 * unsigned long m = 0;
226 * if (i & 0xaa) {
227 * continue;
229 * for (j = 0; j < 8; j += 2) {
230 * if ((i >> j) & 1) {
231 * m |= 0xfffful << (j << 3);
234 * printf("[0x%x] = 0x%016lx,\n", i, m);
237 static inline uint64_t expand_pred_h(uint8_t byte)
239 static const uint64_t word[] = {
240 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
241 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
242 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
243 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
244 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
245 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
246 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
247 [0x55] = 0xffffffffffffffff,
249 return word[byte & 0x55];
252 /* Similarly for single word elements. */
253 static inline uint64_t expand_pred_s(uint8_t byte)
255 static const uint64_t word[] = {
256 [0x01] = 0x00000000ffffffffull,
257 [0x10] = 0xffffffff00000000ull,
258 [0x11] = 0xffffffffffffffffull,
260 return word[byte & 0x11];
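/*
 * Illustrative sketch, not part of the original file: sample expansions
 * of a predicate byte.  For byte elements every predicate bit is
 * significant; for halfword and word elements only every 2nd and 4th
 * bit is, which is why the tables above mask with 0x55 and 0x11.
 */
static inline void expand_pred_example(void)
{
    uint64_t b = expand_pred_b(0x81);   /* 0xff000000000000ff */
    uint64_t h = expand_pred_h(0x41);   /* 0xffff00000000ffff */
    uint64_t s = expand_pred_s(0x11);   /* 0xffffffffffffffff */
    (void)b; (void)h; (void)s;
}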
263 /* Swap 16-bit words within a 32-bit word. */
264 static inline uint32_t hswap32(uint32_t h)
266 return rol32(h, 16);
269 /* Swap 16-bit words within a 64-bit word. */
270 static inline uint64_t hswap64(uint64_t h)
272 uint64_t m = 0x0000ffff0000ffffull;
273 h = rol64(h, 32);
274 return ((h & m) << 16) | ((h >> 16) & m);
277 /* Swap 32-bit words within a 64-bit word. */
278 static inline uint64_t wswap64(uint64_t h)
280 return rol64(h, 32);
283 #define LOGICAL_PPPP(NAME, FUNC) \
284 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
286 uintptr_t opr_sz = simd_oprsz(desc); \
287 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
288 uintptr_t i; \
289 for (i = 0; i < opr_sz / 8; ++i) { \
290 d[i] = FUNC(n[i], m[i], g[i]); \
294 #define DO_AND(N, M, G) (((N) & (M)) & (G))
295 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
296 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
297 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
298 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
299 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
300 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
301 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
303 LOGICAL_PPPP(sve_and_pppp, DO_AND)
304 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
305 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
306 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
307 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
308 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
309 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
310 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
312 #undef DO_AND
313 #undef DO_BIC
314 #undef DO_EOR
315 #undef DO_ORR
316 #undef DO_ORN
317 #undef DO_NOR
318 #undef DO_NAND
319 #undef DO_SEL
320 #undef LOGICAL_PPPP
322 /* Fully general three-operand expander, controlled by a predicate.
323 * This is complicated by the host-endian storage of the register file.
325 /* ??? I don't expect the compiler could ever vectorize this itself.
326 * With some tables we can convert bit masks to byte masks, and with
327 * extra care wrt byte/word ordering we could use gcc generic vectors
328 * and do 16 bytes at a time.
330 #define DO_ZPZZ(NAME, TYPE, H, OP) \
331 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
333 intptr_t i, opr_sz = simd_oprsz(desc); \
334 for (i = 0; i < opr_sz; ) { \
335 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
336 do { \
337 if (pg & 1) { \
338 TYPE nn = *(TYPE *)(vn + H(i)); \
339 TYPE mm = *(TYPE *)(vm + H(i)); \
340 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
342 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
343 } while (i & 15); \
347 /* Similarly, specialized for 64-bit operands. */
348 #define DO_ZPZZ_D(NAME, TYPE, OP) \
349 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
351 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
352 TYPE *d = vd, *n = vn, *m = vm; \
353 uint8_t *pg = vg; \
354 for (i = 0; i < opr_sz; i += 1) { \
355 if (pg[H1(i)] & 1) { \
356 TYPE nn = n[i], mm = m[i]; \
357 d[i] = OP(nn, mm); \
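/*
 * Illustrative sketch, not part of the original file: a hand expansion
 * of DO_ZPZZ for a halfword addition (the function name is made up).
 * Sixteen predicate bits are fetched per 16-byte segment; advancing pg
 * by sizeof(TYPE) bits per element means only every 2nd bit is
 * consulted for 2-byte elements, matching the SVE predicate layout.
 */
static inline void example_zpzz_add_h(void *vd, void *vn, void *vm,
                                      void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    for (i = 0; i < opr_sz; ) {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                uint16_t nn = *(uint16_t *)(vn + H1_2(i));
                uint16_t mm = *(uint16_t *)(vm + H1_2(i));
                *(uint16_t *)(vd + H1_2(i)) = nn + mm;
            }
            i += sizeof(uint16_t), pg >>= sizeof(uint16_t);
        } while (i & 15);
    }
}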
362 #define DO_AND(N, M) (N & M)
363 #define DO_EOR(N, M) (N ^ M)
364 #define DO_ORR(N, M) (N | M)
365 #define DO_BIC(N, M) (N & ~M)
366 #define DO_ADD(N, M) (N + M)
367 #define DO_SUB(N, M) (N - M)
368 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
369 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
370 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
371 #define DO_MUL(N, M) (N * M)
375 * We must avoid the C undefined behaviour cases: division by
376 * zero and signed division of INT_MIN by -1. Both of these
377 * have architecturally defined required results for Arm.
378 * We special case all signed divisions by -1 to avoid having
379 * to deduce the minimum integer for the type involved.
381 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
382 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
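/*
 * Illustrative sketch, not part of the original file: the special cases
 * handled by DO_SDIV/DO_UDIV.  Division by zero yields 0, and dividing
 * by -1 goes through the negation path so that INT_MIN / -1 produces
 * the architecturally required wrapped result instead of a C division
 * overflow (the negation itself wraps; QEMU builds with -fwrapv).
 */
static inline void example_sve_div_cases(void)
{
    int32_t q1 = DO_SDIV(100, 0);       /* division by zero -> 0 */
    int32_t q2 = DO_SDIV(100, -1);      /* -> -100 via the -N path */
    uint32_t q3 = DO_UDIV(100u, 0u);    /* unsigned divide by zero -> 0 */
    (void)q1; (void)q2; (void)q3;
}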
384 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
385 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
386 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
387 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
389 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
390 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
391 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
392 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
394 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
395 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
396 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
397 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
399 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
400 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
401 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
402 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
404 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
405 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
406 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
407 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
409 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
410 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
411 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
412 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
414 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
415 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
416 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
417 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
419 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
420 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
421 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
422 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
424 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
425 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
426 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
427 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
429 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
430 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
431 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
432 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
434 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
435 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
436 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
437 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
439 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
440 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
441 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
442 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
444 /* Because the computation type is at least twice as large as required,
445 these work for both signed and unsigned source types. */
446 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
448 return (n * m) >> 8;
451 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
453 return (n * m) >> 16;
456 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
458 return (n * m) >> 32;
461 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
463 uint64_t lo, hi;
464 muls64(&lo, &hi, n, m);
465 return hi;
468 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
470 uint64_t lo, hi;
471 mulu64(&lo, &hi, n, m);
472 return hi;
475 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
476 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
477 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
478 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
480 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
481 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
482 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
483 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
485 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
486 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
487 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
488 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
490 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
491 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
493 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
494 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
496 /* Note that all bits of the shift are significant
497 and not modulo the element size. */
498 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
499 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
500 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
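/*
 * Illustrative sketch, not part of the original file: because the shift
 * amount is not taken modulo the element size, counts of element width
 * or more must be handled explicitly.  LSR/LSL produce 0, while ASR
 * clamps the count so that only the sign bit remains.
 */
static inline void example_sve_shift_range(void)
{
    int8_t  a = DO_ASR((int8_t)-4, 200);   /* -1: count clamped to 7 */
    uint8_t l = DO_LSR((uint8_t)0x80, 8);  /* 0: shift >= width */
    uint8_t s = DO_LSL((uint8_t)0x01, 9);  /* 0: shift >= width */
    (void)a; (void)l; (void)s;
}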
502 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
503 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
504 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
506 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
507 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
508 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
510 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
511 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
512 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
514 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
515 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
516 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
518 #undef DO_ZPZZ
519 #undef DO_ZPZZ_D
521 /* Three-operand expander, controlled by a predicate, in which the
522 * third operand is "wide". That is, for D = N op M, the same 64-bit
523 * value of M is used with all of the narrower values of N.
525 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
526 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
528 intptr_t i, opr_sz = simd_oprsz(desc); \
529 for (i = 0; i < opr_sz; ) { \
530 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
531 TYPEW mm = *(TYPEW *)(vm + i); \
532 do { \
533 if (pg & 1) { \
534 TYPE nn = *(TYPE *)(vn + H(i)); \
535 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
537 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
538 } while (i & 7); \
542 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
543 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
544 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
546 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
547 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
548 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
550 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
551 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
552 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
554 #undef DO_ZPZW
556 /* Fully general two-operand expander, controlled by a predicate.
558 #define DO_ZPZ(NAME, TYPE, H, OP) \
559 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
561 intptr_t i, opr_sz = simd_oprsz(desc); \
562 for (i = 0; i < opr_sz; ) { \
563 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
564 do { \
565 if (pg & 1) { \
566 TYPE nn = *(TYPE *)(vn + H(i)); \
567 *(TYPE *)(vd + H(i)) = OP(nn); \
569 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
570 } while (i & 15); \
574 /* Similarly, specialized for 64-bit operands. */
575 #define DO_ZPZ_D(NAME, TYPE, OP) \
576 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
578 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
579 TYPE *d = vd, *n = vn; \
580 uint8_t *pg = vg; \
581 for (i = 0; i < opr_sz; i += 1) { \
582 if (pg[H1(i)] & 1) { \
583 TYPE nn = n[i]; \
584 d[i] = OP(nn); \
589 #define DO_CLS_B(N) (clrsb32(N) - 24)
590 #define DO_CLS_H(N) (clrsb32(N) - 16)
592 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
593 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
594 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
595 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
597 #define DO_CLZ_B(N) (clz32(N) - 24)
598 #define DO_CLZ_H(N) (clz32(N) - 16)
600 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
601 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
602 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
603 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
605 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
606 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
607 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
608 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
610 #define DO_CNOT(N) (N == 0)
612 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
613 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
614 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
615 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
617 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
619 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
620 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
621 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
623 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
625 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
626 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
627 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
629 #define DO_NOT(N) (~N)
631 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
632 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
633 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
634 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
636 #define DO_SXTB(N) ((int8_t)N)
637 #define DO_SXTH(N) ((int16_t)N)
638 #define DO_SXTS(N) ((int32_t)N)
639 #define DO_UXTB(N) ((uint8_t)N)
640 #define DO_UXTH(N) ((uint16_t)N)
641 #define DO_UXTS(N) ((uint32_t)N)
643 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
644 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
645 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
646 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
647 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
648 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
650 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
651 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
652 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
653 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
654 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
655 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
657 #define DO_ABS(N) (N < 0 ? -N : N)
659 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
660 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
661 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
662 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
664 #define DO_NEG(N) (-N)
666 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
667 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
668 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
669 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
671 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
672 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
673 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
675 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
676 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
678 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
680 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
681 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
682 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
683 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
685 /* Three-operand expander, unpredicated, in which the third operand is "wide".
687 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
688 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
690 intptr_t i, opr_sz = simd_oprsz(desc); \
691 for (i = 0; i < opr_sz; ) { \
692 TYPEW mm = *(TYPEW *)(vm + i); \
693 do { \
694 TYPE nn = *(TYPE *)(vn + H(i)); \
695 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
696 i += sizeof(TYPE); \
697 } while (i & 7); \
701 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
702 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
703 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
705 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
706 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
707 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
709 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
710 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
711 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
713 #undef DO_ZZW
715 #undef DO_CLS_B
716 #undef DO_CLS_H
717 #undef DO_CLZ_B
718 #undef DO_CLZ_H
719 #undef DO_CNOT
720 #undef DO_FABS
721 #undef DO_FNEG
722 #undef DO_ABS
723 #undef DO_NEG
724 #undef DO_ZPZ
725 #undef DO_ZPZ_D
727 /* Two-operand reduction expander, controlled by a predicate.
728 * The difference between TYPERED and TYPERET has to do with
729 * sign-extension. E.g. for SMAX, TYPERED must be signed,
730 * but TYPERET must be unsigned so that e.g. a 32-bit value
731 * is not sign-extended to the ABI uint64_t return type.
733 /* ??? If we were to vectorize this by hand the reduction ordering
734 * would change. For integer operands, this is perfectly fine.
736 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
737 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
739 intptr_t i, opr_sz = simd_oprsz(desc); \
740 TYPERED ret = INIT; \
741 for (i = 0; i < opr_sz; ) { \
742 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
743 do { \
744 if (pg & 1) { \
745 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
746 ret = OP(ret, nn); \
748 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
749 } while (i & 15); \
751 return (TYPERET)ret; \
754 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
755 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
757 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
758 TYPEE *n = vn; \
759 uint8_t *pg = vg; \
760 TYPER ret = INIT; \
761 for (i = 0; i < opr_sz; i += 1) { \
762 if (pg[H1(i)] & 1) { \
763 TYPEE nn = n[i]; \
764 ret = OP(ret, nn); \
767 return ret; \
770 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
771 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
772 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
773 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
775 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
776 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
777 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
778 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
780 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
781 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
782 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
783 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
785 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
786 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
787 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
789 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
790 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
791 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
792 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
794 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
795 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
796 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
797 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
799 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
800 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
801 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
802 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
804 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
805 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
806 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
807 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
809 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
810 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
811 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
812 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
814 #undef DO_VPZ
815 #undef DO_VPZ_D
817 /* Two vector operand, one scalar operand, unpredicated. */
818 #define DO_ZZI(NAME, TYPE, OP) \
819 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
821 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
822 TYPE s = s64, *d = vd, *n = vn; \
823 for (i = 0; i < opr_sz; ++i) { \
824 d[i] = OP(n[i], s); \
828 #define DO_SUBR(X, Y) (Y - X)
830 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
831 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
832 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
833 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
835 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
836 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
837 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
838 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
840 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
841 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
842 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
843 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
845 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
846 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
847 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
848 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
850 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
851 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
852 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
853 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
855 #undef DO_ZZI
857 #undef DO_AND
858 #undef DO_ORR
859 #undef DO_EOR
860 #undef DO_BIC
861 #undef DO_ADD
862 #undef DO_SUB
863 #undef DO_MAX
864 #undef DO_MIN
865 #undef DO_ABD
866 #undef DO_MUL
867 #undef DO_DIV
868 #undef DO_ASR
869 #undef DO_LSR
870 #undef DO_LSL
871 #undef DO_SUBR
873 /* Similar to the ARM LastActiveElement pseudocode function, except the
874 result is multiplied by the element size. This includes the not found
875 indication; e.g. not found for esz=3 is -8. */
876 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
878 uint64_t mask = pred_esz_masks[esz];
879 intptr_t i = words;
881 do {
882 uint64_t this_g = g[--i] & mask;
883 if (this_g) {
884 return i * 64 + (63 - clz64(this_g));
886 } while (i > 0);
887 return (intptr_t)-1 << esz;
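/*
 * Illustrative sketch, not part of the original file: with a single
 * predicate word and esz=2 (word elements), a predicate whose only
 * active element is element 1 has its last active bit at position 4,
 * so the scaled result is 4.  An all-false predicate returns -4, the
 * scaled "not found" indication.
 */
static inline void example_last_active_element(void)
{
    uint64_t some = 0x10, none = 0;
    intptr_t found = last_active_element(&some, 1, 2);      /* == 4 */
    intptr_t not_found = last_active_element(&none, 1, 2);  /* == -4 */
    (void)found; (void)not_found;
}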
890 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
892 uint32_t flags = PREDTEST_INIT;
893 uint64_t *d = vd, *g = vg;
894 intptr_t i = 0;
896 do {
897 uint64_t this_d = d[i];
898 uint64_t this_g = g[i];
900 if (this_g) {
901 if (!(flags & 4)) {
902 /* Set in D the first bit of G. */
903 this_d |= this_g & -this_g;
904 d[i] = this_d;
906 flags = iter_predtest_fwd(this_d, this_g, flags);
908 } while (++i < words);
910 return flags;
913 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
915 intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
916 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
917 uint32_t flags = PREDTEST_INIT;
918 uint64_t *d = vd, *g = vg, esz_mask;
919 intptr_t i, next;
921 next = last_active_element(vd, words, esz) + (1 << esz);
922 esz_mask = pred_esz_masks[esz];
924 /* Similar to the pseudocode for pnext, but scaled by ESZ
925 so that we find the correct bit. */
926 if (next < words * 64) {
927 uint64_t mask = -1;
929 if (next & 63) {
930 mask = ~((1ull << (next & 63)) - 1);
931 next &= -64;
933 do {
934 uint64_t this_g = g[next / 64] & esz_mask & mask;
935 if (this_g != 0) {
936 next = (next & -64) + ctz64(this_g);
937 break;
939 next += 64;
940 mask = -1;
941 } while (next < words * 64);
944 i = 0;
945 do {
946 uint64_t this_d = 0;
947 if (i == next / 64) {
948 this_d = 1ull << (next & 63);
950 d[i] = this_d;
951 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
952 } while (++i < words);
954 return flags;
957 /* Store zero into every active element of Zd. We will use this for two
958 * and three-operand predicated instructions for which logic dictates a
959 * zero result. In particular, logical shift by element size, which is
960 * otherwise undefined on the host.
962 * For element sizes smaller than uint64_t, we use tables to expand
963 * the N bits of the controlling predicate to a byte mask, and clear
964 * those bytes.
966 void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
968 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
969 uint64_t *d = vd;
970 uint8_t *pg = vg;
971 for (i = 0; i < opr_sz; i += 1) {
972 d[i] &= ~expand_pred_b(pg[H1(i)]);
976 void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
978 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
979 uint64_t *d = vd;
980 uint8_t *pg = vg;
981 for (i = 0; i < opr_sz; i += 1) {
982 d[i] &= ~expand_pred_h(pg[H1(i)]);
986 void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
988 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
989 uint64_t *d = vd;
990 uint8_t *pg = vg;
991 for (i = 0; i < opr_sz; i += 1) {
992 d[i] &= ~expand_pred_s(pg[H1(i)]);
996 void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
998 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
999 uint64_t *d = vd;
1000 uint8_t *pg = vg;
1001 for (i = 0; i < opr_sz; i += 1) {
1002 if (pg[H1(i)] & 1) {
1003 d[i] = 0;
1008 /* Copy Zn into Zd, and store zero into inactive elements. */
1009 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1011 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1012 uint64_t *d = vd, *n = vn;
1013 uint8_t *pg = vg;
1014 for (i = 0; i < opr_sz; i += 1) {
1015 d[i] = n[i] & expand_pred_b(pg[H1(i)]);
1019 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1021 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1022 uint64_t *d = vd, *n = vn;
1023 uint8_t *pg = vg;
1024 for (i = 0; i < opr_sz; i += 1) {
1025 d[i] = n[i] & expand_pred_h(pg[H1(i)]);
1029 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1031 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1032 uint64_t *d = vd, *n = vn;
1033 uint8_t *pg = vg;
1034 for (i = 0; i < opr_sz; i += 1) {
1035 d[i] = n[i] & expand_pred_s(pg[H1(i)]);
1039 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1041 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1042 uint64_t *d = vd, *n = vn;
1043 uint8_t *pg = vg;
1044 for (i = 0; i < opr_sz; i += 1) {
1045 d[i] = n[i] & -(uint64_t)(pg[H1(i)] & 1);
1049 /* Three-operand expander, immediate operand, controlled by a predicate.
1051 #define DO_ZPZI(NAME, TYPE, H, OP) \
1052 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1054 intptr_t i, opr_sz = simd_oprsz(desc); \
1055 TYPE imm = simd_data(desc); \
1056 for (i = 0; i < opr_sz; ) { \
1057 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1058 do { \
1059 if (pg & 1) { \
1060 TYPE nn = *(TYPE *)(vn + H(i)); \
1061 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1063 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1064 } while (i & 15); \
1068 /* Similarly, specialized for 64-bit operands. */
1069 #define DO_ZPZI_D(NAME, TYPE, OP) \
1070 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1072 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1073 TYPE *d = vd, *n = vn; \
1074 TYPE imm = simd_data(desc); \
1075 uint8_t *pg = vg; \
1076 for (i = 0; i < opr_sz; i += 1) { \
1077 if (pg[H1(i)] & 1) { \
1078 TYPE nn = n[i]; \
1079 d[i] = OP(nn, imm); \
1084 #define DO_SHR(N, M) (N >> M)
1085 #define DO_SHL(N, M) (N << M)
1087 /* Arithmetic shift right for division. This rounds negative numbers
1088 toward zero as per signed division. Therefore before shifting,
1089 when N is negative, add 2**M-1. */
1090 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
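/*
 * Illustrative sketch, not part of the original file: DO_ASRD versus a
 * plain arithmetic shift.  -7 / 4 truncates toward zero to -1; adding
 * 2**M - 1 before shifting reproduces that, whereas a bare -7 >> 2
 * rounds toward minus infinity and gives -2.
 */
static inline void example_asrd_rounding(void)
{
    int32_t toward_zero = DO_ASRD(-7, 2);   /* == -1 */
    int32_t plain_shift = -7 >> 2;          /* == -2 */
    (void)toward_zero; (void)plain_shift;
}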
1092 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1093 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1094 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1095 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1097 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1098 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1099 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1100 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1102 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1103 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1104 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1105 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1107 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1108 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1109 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1110 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1112 #undef DO_SHR
1113 #undef DO_SHL
1114 #undef DO_ASRD
1115 #undef DO_ZPZI
1116 #undef DO_ZPZI_D
1118 /* Fully general four-operand expander, controlled by a predicate.
1120 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
1121 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1122 void *vg, uint32_t desc) \
1124 intptr_t i, opr_sz = simd_oprsz(desc); \
1125 for (i = 0; i < opr_sz; ) { \
1126 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1127 do { \
1128 if (pg & 1) { \
1129 TYPE nn = *(TYPE *)(vn + H(i)); \
1130 TYPE mm = *(TYPE *)(vm + H(i)); \
1131 TYPE aa = *(TYPE *)(va + H(i)); \
1132 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1134 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1135 } while (i & 15); \
1139 /* Similarly, specialized for 64-bit operands. */
1140 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
1141 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1142 void *vg, uint32_t desc) \
1144 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1145 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1146 uint8_t *pg = vg; \
1147 for (i = 0; i < opr_sz; i += 1) { \
1148 if (pg[H1(i)] & 1) { \
1149 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1150 d[i] = OP(aa, nn, mm); \
1155 #define DO_MLA(A, N, M) (A + N * M)
1156 #define DO_MLS(A, N, M) (A - N * M)
1158 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1159 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1161 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1162 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1164 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1165 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1167 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1168 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1170 #undef DO_MLA
1171 #undef DO_MLS
1172 #undef DO_ZPZZZ
1173 #undef DO_ZPZZZ_D
1175 void HELPER(sve_index_b)(void *vd, uint32_t start,
1176 uint32_t incr, uint32_t desc)
1178 intptr_t i, opr_sz = simd_oprsz(desc);
1179 uint8_t *d = vd;
1180 for (i = 0; i < opr_sz; i += 1) {
1181 d[H1(i)] = start + i * incr;
1185 void HELPER(sve_index_h)(void *vd, uint32_t start,
1186 uint32_t incr, uint32_t desc)
1188 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1189 uint16_t *d = vd;
1190 for (i = 0; i < opr_sz; i += 1) {
1191 d[H2(i)] = start + i * incr;
1195 void HELPER(sve_index_s)(void *vd, uint32_t start,
1196 uint32_t incr, uint32_t desc)
1198 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1199 uint32_t *d = vd;
1200 for (i = 0; i < opr_sz; i += 1) {
1201 d[H4(i)] = start + i * incr;
1205 void HELPER(sve_index_d)(void *vd, uint64_t start,
1206 uint64_t incr, uint32_t desc)
1208 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1209 uint64_t *d = vd;
1210 for (i = 0; i < opr_sz; i += 1) {
1211 d[i] = start + i * incr;
1215 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1217 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1218 uint32_t sh = simd_data(desc);
1219 uint32_t *d = vd, *n = vn, *m = vm;
1220 for (i = 0; i < opr_sz; i += 1) {
1221 d[i] = n[i] + (m[i] << sh);
1225 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1227 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1228 uint64_t sh = simd_data(desc);
1229 uint64_t *d = vd, *n = vn, *m = vm;
1230 for (i = 0; i < opr_sz; i += 1) {
1231 d[i] = n[i] + (m[i] << sh);
1235 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1237 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1238 uint64_t sh = simd_data(desc);
1239 uint64_t *d = vd, *n = vn, *m = vm;
1240 for (i = 0; i < opr_sz; i += 1) {
1241 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1245 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1247 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1248 uint64_t sh = simd_data(desc);
1249 uint64_t *d = vd, *n = vn, *m = vm;
1250 for (i = 0; i < opr_sz; i += 1) {
1251 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1255 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1257 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1258 static const uint16_t coeff[] = {
1259 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1260 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1261 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1262 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1264 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1265 uint16_t *d = vd, *n = vn;
1267 for (i = 0; i < opr_sz; i++) {
1268 uint16_t nn = n[i];
1269 intptr_t idx = extract32(nn, 0, 5);
1270 uint16_t exp = extract32(nn, 5, 5);
1271 d[i] = coeff[idx] | (exp << 10);
1275 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1277 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1278 static const uint32_t coeff[] = {
1279 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1280 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1281 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1282 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1283 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1284 0x1ef532, 0x20b051, 0x227043, 0x243516,
1285 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1286 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1287 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1288 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1289 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1290 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1291 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1292 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1293 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1294 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1296 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1297 uint32_t *d = vd, *n = vn;
1299 for (i = 0; i < opr_sz; i++) {
1300 uint32_t nn = n[i];
1301 intptr_t idx = extract32(nn, 0, 6);
1302 uint32_t exp = extract32(nn, 6, 8);
1303 d[i] = coeff[idx] | (exp << 23);
1307 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1309 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1310 static const uint64_t coeff[] = {
1311 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1312 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1313 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1314 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1315 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1316 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1317 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1318 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1319 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1320 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1321 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1322 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1323 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1324 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1325 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1326 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1327 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1328 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1329 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1330 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1331 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1332 0xFA7C1819E90D8ull,
1334 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1335 uint64_t *d = vd, *n = vn;
1337 for (i = 0; i < opr_sz; i++) {
1338 uint64_t nn = n[i];
1339 intptr_t idx = extract32(nn, 0, 6);
1340 uint64_t exp = extract32(nn, 6, 11);
1341 d[i] = coeff[idx] | (exp << 52);
1345 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1347 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1348 uint16_t *d = vd, *n = vn, *m = vm;
1349 for (i = 0; i < opr_sz; i += 1) {
1350 uint16_t nn = n[i];
1351 uint16_t mm = m[i];
1352 if (mm & 1) {
1353 nn = float16_one;
1355 d[i] = nn ^ (mm & 2) << 14;
1359 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1361 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1362 uint32_t *d = vd, *n = vn, *m = vm;
1363 for (i = 0; i < opr_sz; i += 1) {
1364 uint32_t nn = n[i];
1365 uint32_t mm = m[i];
1366 if (mm & 1) {
1367 nn = float32_one;
1369 d[i] = nn ^ (mm & 2) << 30;
1373 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1375 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1376 uint64_t *d = vd, *n = vn, *m = vm;
1377 for (i = 0; i < opr_sz; i += 1) {
1378 uint64_t nn = n[i];
1379 uint64_t mm = m[i];
1380 if (mm & 1) {
1381 nn = float64_one;
1383 d[i] = nn ^ (mm & 2) << 62;
1388 * Signed saturating addition with scalar operand.
1391 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1393 intptr_t i, oprsz = simd_oprsz(desc);
1395 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1396 int r = *(int8_t *)(a + i) + b;
1397 if (r > INT8_MAX) {
1398 r = INT8_MAX;
1399 } else if (r < INT8_MIN) {
1400 r = INT8_MIN;
1402 *(int8_t *)(d + i) = r;
1406 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1408 intptr_t i, oprsz = simd_oprsz(desc);
1410 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1411 int r = *(int16_t *)(a + i) + b;
1412 if (r > INT16_MAX) {
1413 r = INT16_MAX;
1414 } else if (r < INT16_MIN) {
1415 r = INT16_MIN;
1417 *(int16_t *)(d + i) = r;
1421 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1423 intptr_t i, oprsz = simd_oprsz(desc);
1425 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1426 int64_t r = *(int32_t *)(a + i) + b;
1427 if (r > INT32_MAX) {
1428 r = INT32_MAX;
1429 } else if (r < INT32_MIN) {
1430 r = INT32_MIN;
1432 *(int32_t *)(d + i) = r;
1436 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1438 intptr_t i, oprsz = simd_oprsz(desc);
1440 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1441 int64_t ai = *(int64_t *)(a + i);
1442 int64_t r = ai + b;
1443 if (((r ^ ai) & ~(ai ^ b)) < 0) {
1444 /* Signed overflow. */
1445 r = (r < 0 ? INT64_MAX : INT64_MIN);
1447 *(int64_t *)(d + i) = r;
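/*
 * Illustrative sketch, not part of the original file: the overflow test
 * used above.  (r ^ ai) has its sign bit set when the result's sign
 * differs from the first addend's; ~(ai ^ b) has it set when the two
 * addends share a sign.  Signed addition overflows exactly when both
 * hold.  (The wrapping addition itself is well defined here, since
 * QEMU builds with -fwrapv.)
 */
static inline bool example_sadd64_overflows(int64_t ai, int64_t b)
{
    int64_t r = ai + b;
    return ((r ^ ai) & ~(ai ^ b)) < 0;
}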
1452 * Unsigned saturating addition with scalar operand.
1455 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1457 intptr_t i, oprsz = simd_oprsz(desc);
1459 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1460 int r = *(uint8_t *)(a + i) + b;
1461 if (r > UINT8_MAX) {
1462 r = UINT8_MAX;
1463 } else if (r < 0) {
1464 r = 0;
1466 *(uint8_t *)(d + i) = r;
1470 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1472 intptr_t i, oprsz = simd_oprsz(desc);
1474 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1475 int r = *(uint16_t *)(a + i) + b;
1476 if (r > UINT16_MAX) {
1477 r = UINT16_MAX;
1478 } else if (r < 0) {
1479 r = 0;
1481 *(uint16_t *)(d + i) = r;
1485 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1487 intptr_t i, oprsz = simd_oprsz(desc);
1489 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1490 int64_t r = *(uint32_t *)(a + i) + b;
1491 if (r > UINT32_MAX) {
1492 r = UINT32_MAX;
1493 } else if (r < 0) {
1494 r = 0;
1496 *(uint32_t *)(d + i) = r;
1500 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1502 intptr_t i, oprsz = simd_oprsz(desc);
1504 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1505 uint64_t r = *(uint64_t *)(a + i) + b;
1506 if (r < b) {
1507 r = UINT64_MAX;
1509 *(uint64_t *)(d + i) = r;
1513 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1515 intptr_t i, oprsz = simd_oprsz(desc);
1517 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1518 uint64_t ai = *(uint64_t *)(a + i);
1519 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1523 /* Two operand predicated copy immediate with merge. All valid immediates
1524 * can fit within 17 signed bits in the simd_data field.
1526 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1527 uint64_t mm, uint32_t desc)
1529 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1530 uint64_t *d = vd, *n = vn;
1531 uint8_t *pg = vg;
1533 mm = dup_const(MO_8, mm);
1534 for (i = 0; i < opr_sz; i += 1) {
1535 uint64_t nn = n[i];
1536 uint64_t pp = expand_pred_b(pg[H1(i)]);
1537 d[i] = (mm & pp) | (nn & ~pp);
1541 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1542 uint64_t mm, uint32_t desc)
1544 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1545 uint64_t *d = vd, *n = vn;
1546 uint8_t *pg = vg;
1548 mm = dup_const(MO_16, mm);
1549 for (i = 0; i < opr_sz; i += 1) {
1550 uint64_t nn = n[i];
1551 uint64_t pp = expand_pred_h(pg[H1(i)]);
1552 d[i] = (mm & pp) | (nn & ~pp);
1556 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1557 uint64_t mm, uint32_t desc)
1559 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1560 uint64_t *d = vd, *n = vn;
1561 uint8_t *pg = vg;
1563 mm = dup_const(MO_32, mm);
1564 for (i = 0; i < opr_sz; i += 1) {
1565 uint64_t nn = n[i];
1566 uint64_t pp = expand_pred_s(pg[H1(i)]);
1567 d[i] = (mm & pp) | (nn & ~pp);
1571 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1572 uint64_t mm, uint32_t desc)
1574 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1575 uint64_t *d = vd, *n = vn;
1576 uint8_t *pg = vg;
1578 for (i = 0; i < opr_sz; i += 1) {
1579 uint64_t nn = n[i];
1580 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1584 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1586 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1587 uint64_t *d = vd;
1588 uint8_t *pg = vg;
1590 val = dup_const(MO_8, val);
1591 for (i = 0; i < opr_sz; i += 1) {
1592 d[i] = val & expand_pred_b(pg[H1(i)]);
1596 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1598 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1599 uint64_t *d = vd;
1600 uint8_t *pg = vg;
1602 val = dup_const(MO_16, val);
1603 for (i = 0; i < opr_sz; i += 1) {
1604 d[i] = val & expand_pred_h(pg[H1(i)]);
1608 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1610 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1611 uint64_t *d = vd;
1612 uint8_t *pg = vg;
1614 val = dup_const(MO_32, val);
1615 for (i = 0; i < opr_sz; i += 1) {
1616 d[i] = val & expand_pred_s(pg[H1(i)]);
1620 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1622 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1623 uint64_t *d = vd;
1624 uint8_t *pg = vg;
1626 for (i = 0; i < opr_sz; i += 1) {
1627 d[i] = (pg[H1(i)] & 1 ? val : 0);
1631 /* Big-endian hosts need to frob the byte indices.  If the copy
1632  * happens to be 8-byte aligned, then no frobbing is necessary.
1634 static void swap_memmove(void *vd, void *vs, size_t n)
1636 uintptr_t d = (uintptr_t)vd;
1637 uintptr_t s = (uintptr_t)vs;
1638 uintptr_t o = (d | s | n) & 7;
1639 size_t i;
1641 #ifndef HOST_WORDS_BIGENDIAN
1642 o = 0;
1643 #endif
1644 switch (o) {
1645 case 0:
1646 memmove(vd, vs, n);
1647 break;
1649 case 4:
1650 if (d < s || d >= s + n) {
1651 for (i = 0; i < n; i += 4) {
1652 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1654 } else {
1655 for (i = n; i > 0; ) {
1656 i -= 4;
1657 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1660 break;
1662 case 2:
1663 case 6:
1664 if (d < s || d >= s + n) {
1665 for (i = 0; i < n; i += 2) {
1666 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1668 } else {
1669 for (i = n; i > 0; ) {
1670 i -= 2;
1671 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1674 break;
1676 default:
1677 if (d < s || d >= s + n) {
1678 for (i = 0; i < n; i++) {
1679 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1681 } else {
1682 for (i = n; i > 0; ) {
1683 i -= 1;
1684 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1687 break;
1691 /* Similarly for memset of 0. */
1692 static void swap_memzero(void *vd, size_t n)
1694 uintptr_t d = (uintptr_t)vd;
1695 uintptr_t o = (d | n) & 7;
1696 size_t i;
1698 /* Usually, the first bit of a predicate is set, so N is 0. */
1699 if (likely(n == 0)) {
1700 return;
1703 #ifndef HOST_WORDS_BIGENDIAN
1704 o = 0;
1705 #endif
1706 switch (o) {
1707 case 0:
1708 memset(vd, 0, n);
1709 break;
1711 case 4:
1712 for (i = 0; i < n; i += 4) {
1713 *(uint32_t *)H1_4(d + i) = 0;
1715 break;
1717 case 2:
1718 case 6:
1719 for (i = 0; i < n; i += 2) {
1720 *(uint16_t *)H1_2(d + i) = 0;
1722 break;
1724 default:
1725 for (i = 0; i < n; i++) {
1726 *(uint8_t *)H1(d + i) = 0;
1728 break;
1732 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1734 intptr_t opr_sz = simd_oprsz(desc);
1735 size_t n_ofs = simd_data(desc);
1736 size_t n_siz = opr_sz - n_ofs;
1738 if (vd != vm) {
1739 swap_memmove(vd, vn + n_ofs, n_siz);
1740 swap_memmove(vd + n_siz, vm, n_ofs);
1741 } else if (vd != vn) {
1742 swap_memmove(vd + n_siz, vd, n_ofs);
1743 swap_memmove(vd, vn + n_ofs, n_siz);
1744 } else {
1745 /* vd == vn == vm. Need temp space. */
1746 ARMVectorReg tmp;
1747 swap_memmove(&tmp, vm, n_ofs);
1748 swap_memmove(vd, vd + n_ofs, n_siz);
1749 memcpy(vd + n_siz, &tmp, n_ofs);
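/*
 * Illustrative sketch, not part of the original file (names made up):
 * the byte layout produced by the helper above for a 16-byte vector
 * with n_ofs == 3, ignoring the host-endian frobbing and the overlap
 * handling that swap_memmove provides.  Zd receives bytes 3..15 of Zn
 * followed by bytes 0..2 of Zm, i.e. EXT concatenates Zm:Zn and takes
 * 16 bytes starting at the offset.
 */
static inline void example_sve_ext_layout(uint8_t *zd, const uint8_t *zn,
                                          const uint8_t *zm)
{
    memcpy(zd, zn + 3, 16 - 3);     /* low part: tail of Zn */
    memcpy(zd + 13, zm, 3);         /* high part: head of Zm */
}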
1753 #define DO_INSR(NAME, TYPE, H) \
1754 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1756 intptr_t opr_sz = simd_oprsz(desc); \
1757 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1758 *(TYPE *)(vd + H(0)) = val; \
1761 DO_INSR(sve_insr_b, uint8_t, H1)
1762 DO_INSR(sve_insr_h, uint16_t, H1_2)
1763 DO_INSR(sve_insr_s, uint32_t, H1_4)
1764 DO_INSR(sve_insr_d, uint64_t, )
1766 #undef DO_INSR
1768 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1770 intptr_t i, j, opr_sz = simd_oprsz(desc);
1771 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1772 uint64_t f = *(uint64_t *)(vn + i);
1773 uint64_t b = *(uint64_t *)(vn + j);
1774 *(uint64_t *)(vd + i) = bswap64(b);
1775 *(uint64_t *)(vd + j) = bswap64(f);
1779 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1781 intptr_t i, j, opr_sz = simd_oprsz(desc);
1782 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1783 uint64_t f = *(uint64_t *)(vn + i);
1784 uint64_t b = *(uint64_t *)(vn + j);
1785 *(uint64_t *)(vd + i) = hswap64(b);
1786 *(uint64_t *)(vd + j) = hswap64(f);
1790 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1792 intptr_t i, j, opr_sz = simd_oprsz(desc);
1793 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1794 uint64_t f = *(uint64_t *)(vn + i);
1795 uint64_t b = *(uint64_t *)(vn + j);
1796 *(uint64_t *)(vd + i) = rol64(b, 32);
1797 *(uint64_t *)(vd + j) = rol64(f, 32);
1801 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1803 intptr_t i, j, opr_sz = simd_oprsz(desc);
1804 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1805 uint64_t f = *(uint64_t *)(vn + i);
1806 uint64_t b = *(uint64_t *)(vn + j);
1807 *(uint64_t *)(vd + i) = b;
1808 *(uint64_t *)(vd + j) = f;
1812 #define DO_TBL(NAME, TYPE, H) \
1813 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1815 intptr_t i, opr_sz = simd_oprsz(desc); \
1816 uintptr_t elem = opr_sz / sizeof(TYPE); \
1817 TYPE *d = vd, *n = vn, *m = vm; \
1818 ARMVectorReg tmp; \
1819 if (unlikely(vd == vn)) { \
1820 n = memcpy(&tmp, vn, opr_sz); \
1822 for (i = 0; i < elem; i++) { \
1823 TYPE j = m[H(i)]; \
1824 d[H(i)] = j < elem ? n[H(j)] : 0; \
1828 DO_TBL(sve_tbl_b, uint8_t, H1)
1829 DO_TBL(sve_tbl_h, uint16_t, H2)
1830 DO_TBL(sve_tbl_s, uint32_t, H4)
1831 DO_TBL(sve_tbl_d, uint64_t, )
1833 #undef DO_TBL
1835 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1836 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1838 intptr_t i, opr_sz = simd_oprsz(desc); \
1839 TYPED *d = vd; \
1840 TYPES *n = vn; \
1841 ARMVectorReg tmp; \
1842 if (unlikely(vn - vd < opr_sz)) { \
1843 n = memcpy(&tmp, n, opr_sz / 2); \
1845 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1846 d[HD(i)] = n[HS(i)]; \
1850 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1851 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1852 DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1854 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1855 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1856 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1858 #undef DO_UNPK
1860 /* Mask of bits included in the even-numbered 2**esz-bit units of a predicate.
1861 * We also use this for expand_bits/compress_bits, and so extend the
1862 * same pattern out to 16-bit units.
1864 static const uint64_t even_bit_esz_masks[5] = {
1865 0x5555555555555555ull,
1866 0x3333333333333333ull,
1867 0x0f0f0f0f0f0f0f0full,
1868 0x00ff00ff00ff00ffull,
1869 0x0000ffff0000ffffull,
1872 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1873 * For N==0, this corresponds to the operation that in qemu/bitops.h
1874 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1875 * section 7-2 Shuffling Bits.
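* E.g. expand_bits(0x0b, 0) == 0x45: each bit moves to twice its index;
* expand_bits(0x0b, 1) == 0x23: each 2-bit unit is widened to 4 bits.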
1877 static uint64_t expand_bits(uint64_t x, int n)
1879 int i;
1881 x &= 0xffffffffu;
1882 for (i = 4; i >= n; i--) {
1883 int sh = 1 << i;
1884 x = ((x << sh) | x) & even_bit_esz_masks[i];
1886 return x;
1889 /* Compress units of 2**(N+1) bits to units of 2**N bits.
1890 * For N==0, this corresponds to the operation that in qemu/bitops.h
1891 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1892 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
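* E.g. compress_bits(0x45, 0) == 0x0b, undoing the expansion above;
* bits within the odd-numbered units of the input are discarded.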
1894 static uint64_t compress_bits(uint64_t x, int n)
1896 int i;
1898 for (i = n; i <= 4; i++) {
1899 int sh = 1 << i;
1900 x &= even_bit_esz_masks[i];
1901 x = (x >> sh) | x;
1903 return x & 0xffffffffu;
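/* Interleave the elements of two predicates.  Each 2**esz-bit unit from
* the selected half of N lands in an even-numbered unit of the result,
* and the matching unit of M in the following odd-numbered unit.
* E.g. for byte elements (esz == 0), nn == 0x05 and mm == 0x03
* interleave to 0x1b, i.e. bits n0 m0 n1 m1 n2 m2 n3 m3 from bit 0. */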
1906 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1908 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1909 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1910 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1911 uint64_t *d = vd;
1912 intptr_t i;
1914 if (oprsz <= 8) {
1915 uint64_t nn = *(uint64_t *)vn;
1916 uint64_t mm = *(uint64_t *)vm;
1917 int half = 4 * oprsz;
1919 nn = extract64(nn, high * half, half);
1920 mm = extract64(mm, high * half, half);
1921 nn = expand_bits(nn, esz);
1922 mm = expand_bits(mm, esz);
1923 d[0] = nn + (mm << (1 << esz));
1924 } else {
1925 ARMPredicateReg tmp_n, tmp_m;
1927 /* We produce output faster than we consume input.
1928 Therefore we must be mindful of possible overlap. */
1929 if ((vn - vd) < (uintptr_t)oprsz) {
1930 vn = memcpy(&tmp_n, vn, oprsz);
1932 if ((vm - vd) < (uintptr_t)oprsz) {
1933 vm = memcpy(&tmp_m, vm, oprsz);
1935 if (high) {
1936 high = oprsz >> 1;
1939 if ((high & 3) == 0) {
1940 uint32_t *n = vn, *m = vm;
1941 high >>= 2;
1943 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1944 uint64_t nn = n[H4(high + i)];
1945 uint64_t mm = m[H4(high + i)];
1947 nn = expand_bits(nn, esz);
1948 mm = expand_bits(mm, esz);
1949 d[i] = nn + (mm << (1 << esz));
1951 } else {
1952 uint8_t *n = vn, *m = vm;
1953 uint16_t *d16 = vd;
1955 for (i = 0; i < oprsz / 2; i++) {
1956 uint16_t nn = n[H1(high + i)];
1957 uint16_t mm = m[H1(high + i)];
1959 nn = expand_bits(nn, esz);
1960 mm = expand_bits(mm, esz);
1961 d16[H2(i)] = nn + (mm << (1 << esz));
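/* The inverse of ZIP: treat the concatenation of the two inputs as one
* long predicate and gather either the even-numbered (odd == 0) or
* odd-numbered (odd != 0) 2**esz-bit units, with N supplying the low
* half of the result and M the high half. */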
1967 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1969 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1970 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1971 int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1972 uint64_t *d = vd, *n = vn, *m = vm;
1973 uint64_t l, h;
1974 intptr_t i;
1976 if (oprsz <= 8) {
1977 l = compress_bits(n[0] >> odd, esz);
1978 h = compress_bits(m[0] >> odd, esz);
1979 d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1980 } else {
1981 ARMPredicateReg tmp_m;
1982 intptr_t oprsz_16 = oprsz / 16;
1984 if ((vm - vd) < (uintptr_t)oprsz) {
1985 m = memcpy(&tmp_m, vm, oprsz);
1988 for (i = 0; i < oprsz_16; i++) {
1989 l = n[2 * i + 0];
1990 h = n[2 * i + 1];
1991 l = compress_bits(l >> odd, esz);
1992 h = compress_bits(h >> odd, esz);
1993 d[i] = l + (h << 32);
1996 /* For a VL that is not a power of 2, the results from M do not
1997 align nicely with the uint64_t for D. Put the aligned results
1998 from M into TMP_M and then copy it into place afterward. */
1999 if (oprsz & 15) {
2000 d[i] = compress_bits(n[2 * i] >> odd, esz);
2002 for (i = 0; i < oprsz_16; i++) {
2003 l = m[2 * i + 0];
2004 h = m[2 * i + 1];
2005 l = compress_bits(l >> odd, esz);
2006 h = compress_bits(h >> odd, esz);
2007 tmp_m.p[i] = l + (h << 32);
2009 tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
2011 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
2012 } else {
2013 for (i = 0; i < oprsz_16; i++) {
2014 l = m[2 * i + 0];
2015 h = m[2 * i + 1];
2016 l = compress_bits(l >> odd, esz);
2017 h = compress_bits(h >> odd, esz);
2018 d[oprsz_16 + i] = l + (h << 32);
2024 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2026 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2027 uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2028 bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2029 uint64_t *d = vd, *n = vn, *m = vm;
2030 uint64_t mask;
2031 int shr, shl;
2032 intptr_t i;
2034 shl = 1 << esz;
2035 shr = 0;
2036 mask = even_bit_esz_masks[esz];
2037 if (odd) {
2038 mask <<= shl;
2039 shr = shl;
2040 shl = 0;
2043 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2044 uint64_t nn = (n[i] & mask) >> shr;
2045 uint64_t mm = (m[i] & mask) << shl;
2046 d[i] = nn + mm;
2050 /* Reverse units of 2**N bits. */
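/* E.g. reverse_bits_8(0x01, 0) == 0x80 (full bit reversal), while
* reverse_bits_8(0x12, 2) == 0x21 (nibble-sized units exchanged). */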
2051 static uint64_t reverse_bits_64(uint64_t x, int n)
2053 int i, sh;
2055 x = bswap64(x);
2056 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2057 uint64_t mask = even_bit_esz_masks[i];
2058 x = ((x & mask) << sh) | ((x >> sh) & mask);
2060 return x;
2063 static uint8_t reverse_bits_8(uint8_t x, int n)
2065 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2066 int i, sh;
2068 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2069 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2071 return x;
2074 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2076 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2077 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2078 intptr_t i, oprsz_2 = oprsz / 2;
2080 if (oprsz <= 8) {
2081 uint64_t l = *(uint64_t *)vn;
2082 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2083 *(uint64_t *)vd = l;
2084 } else if ((oprsz & 15) == 0) {
2085 for (i = 0; i < oprsz_2; i += 8) {
2086 intptr_t ih = oprsz - 8 - i;
2087 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2088 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2089 *(uint64_t *)(vd + i) = h;
2090 *(uint64_t *)(vd + ih) = l;
2092 } else {
2093 for (i = 0; i < oprsz_2; i += 1) {
2094 intptr_t il = H1(i);
2095 intptr_t ih = H1(oprsz - 1 - i);
2096 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2097 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2098 *(uint8_t *)(vd + il) = h;
2099 *(uint8_t *)(vd + ih) = l;
2104 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2106 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2107 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2108 uint64_t *d = vd;
2109 intptr_t i;
2111 if (oprsz <= 8) {
2112 uint64_t nn = *(uint64_t *)vn;
2113 int half = 4 * oprsz;
2115 nn = extract64(nn, high * half, half);
2116 nn = expand_bits(nn, 0);
2117 d[0] = nn;
2118 } else {
2119 ARMPredicateReg tmp_n;
2121 /* We produce output faster than we consume input.
2122 Therefore we must be mindful of possible overlap. */
2123 if ((vn - vd) < (uintptr_t)oprsz) {
2124 vn = memcpy(&tmp_n, vn, oprsz);
2126 if (high) {
2127 high = oprsz >> 1;
2130 if ((high & 3) == 0) {
2131 uint32_t *n = vn;
2132 high >>= 2;
2134 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2135 uint64_t nn = n[H4(high + i)];
2136 d[i] = expand_bits(nn, 0);
2138 } else {
2139 uint16_t *d16 = vd;
2140 uint8_t *n = vn;
2142 for (i = 0; i < oprsz / 2; i++) {
2143 uint16_t nn = n[H1(high + i)];
2144 d16[H2(i)] = expand_bits(nn, 0);
2150 #define DO_ZIP(NAME, TYPE, H) \
2151 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2153 intptr_t oprsz = simd_oprsz(desc); \
2154 intptr_t i, oprsz_2 = oprsz / 2; \
2155 ARMVectorReg tmp_n, tmp_m; \
2156 /* We produce output faster than we consume input. \
2157 Therefore we must be mindful of possible overlap. */ \
2158 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2159 vn = memcpy(&tmp_n, vn, oprsz_2); \
2161 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2162 vm = memcpy(&tmp_m, vm, oprsz_2); \
2164 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2165 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2166 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2170 DO_ZIP(sve_zip_b, uint8_t, H1)
2171 DO_ZIP(sve_zip_h, uint16_t, H1_2)
2172 DO_ZIP(sve_zip_s, uint32_t, H1_4)
2173 DO_ZIP(sve_zip_d, uint64_t, )
2175 #define DO_UZP(NAME, TYPE, H) \
2176 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2178 intptr_t oprsz = simd_oprsz(desc); \
2179 intptr_t oprsz_2 = oprsz / 2; \
2180 intptr_t odd_ofs = simd_data(desc); \
2181 intptr_t i; \
2182 ARMVectorReg tmp_m; \
2183 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2184 vm = memcpy(&tmp_m, vm, oprsz); \
2186 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2187 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2189 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2190 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2194 DO_UZP(sve_uzp_b, uint8_t, H1)
2195 DO_UZP(sve_uzp_h, uint16_t, H1_2)
2196 DO_UZP(sve_uzp_s, uint32_t, H1_4)
2197 DO_UZP(sve_uzp_d, uint64_t, )
2199 #define DO_TRN(NAME, TYPE, H) \
2200 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2202 intptr_t oprsz = simd_oprsz(desc); \
2203 intptr_t odd_ofs = simd_data(desc); \
2204 intptr_t i; \
2205 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2206 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2207 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2208 *(TYPE *)(vd + H(i + 0)) = ae; \
2209 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2213 DO_TRN(sve_trn_b, uint8_t, H1)
2214 DO_TRN(sve_trn_h, uint16_t, H1_2)
2215 DO_TRN(sve_trn_s, uint32_t, H1_4)
2216 DO_TRN(sve_trn_d, uint64_t, )
2218 #undef DO_ZIP
2219 #undef DO_UZP
2220 #undef DO_TRN
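/* Copy the active elements of N to the low part of D and zero the
* remaining elements, as per the COMPACT instruction. */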
2222 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2224 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2225 uint32_t *d = vd, *n = vn;
2226 uint8_t *pg = vg;
2228 for (i = j = 0; i < opr_sz; i++) {
2229 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2230 d[H4(j)] = n[H4(i)];
2231 j++;
2234 for (; j < opr_sz; j++) {
2235 d[H4(j)] = 0;
2239 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2241 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2242 uint64_t *d = vd, *n = vn;
2243 uint8_t *pg = vg;
2245 for (i = j = 0; i < opr_sz; i++) {
2246 if (pg[H1(i)] & 1) {
2247 d[j] = n[i];
2248 j++;
2251 for (; j < opr_sz; j++) {
2252 d[j] = 0;
2256 /* Similar to the ARM LastActiveElement pseudocode function, except the
2257 * result is multiplied by the element size. This includes the not found
2258 * indication; e.g. not found for esz=3 is -8.
2260 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2262 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2263 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2265 return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
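/* Copy the active segment of N -- from the first through the last
* active element under G -- to the low part of D, then fill the
* remainder of D with the leading elements of M. */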
2268 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2270 intptr_t opr_sz = simd_oprsz(desc) / 8;
2271 int esz = simd_data(desc);
2272 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2273 intptr_t i, first_i, last_i;
2274 ARMVectorReg tmp;
2276 first_i = last_i = 0;
2277 first_g = last_g = 0;
2279 /* Find the extent of the active elements within VG. */
2280 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2281 pg = *(uint64_t *)(vg + i) & mask;
2282 if (pg) {
2283 if (last_g == 0) {
2284 last_g = pg;
2285 last_i = i;
2287 first_g = pg;
2288 first_i = i;
2292 len = 0;
2293 if (first_g != 0) {
2294 first_i = first_i * 8 + ctz64(first_g);
2295 last_i = last_i * 8 + 63 - clz64(last_g);
2296 len = last_i - first_i + (1 << esz);
2297 if (vd == vm) {
2298 vm = memcpy(&tmp, vm, opr_sz * 8);
2300 swap_memmove(vd, vn + first_i, len);
2302 swap_memmove(vd + len, vm, opr_sz * 8 - len);
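/* Bitwise select between N and M under control of the predicate.
* For the sub-64-bit element sizes, each predicate byte is expanded
* to a 64-bit mask (expand_pred_b/h/s) so a whole chunk can be
* blended at once; for 64-bit elements bit 0 selects directly. */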
2305 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2306 void *vg, uint32_t desc)
2308 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2309 uint64_t *d = vd, *n = vn, *m = vm;
2310 uint8_t *pg = vg;
2312 for (i = 0; i < opr_sz; i += 1) {
2313 uint64_t nn = n[i], mm = m[i];
2314 uint64_t pp = expand_pred_b(pg[H1(i)]);
2315 d[i] = (nn & pp) | (mm & ~pp);
2319 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2320 void *vg, uint32_t desc)
2322 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2323 uint64_t *d = vd, *n = vn, *m = vm;
2324 uint8_t *pg = vg;
2326 for (i = 0; i < opr_sz; i += 1) {
2327 uint64_t nn = n[i], mm = m[i];
2328 uint64_t pp = expand_pred_h(pg[H1(i)]);
2329 d[i] = (nn & pp) | (mm & ~pp);
2333 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2334 void *vg, uint32_t desc)
2336 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2337 uint64_t *d = vd, *n = vn, *m = vm;
2338 uint8_t *pg = vg;
2340 for (i = 0; i < opr_sz; i += 1) {
2341 uint64_t nn = n[i], mm = m[i];
2342 uint64_t pp = expand_pred_s(pg[H1(i)]);
2343 d[i] = (nn & pp) | (mm & ~pp);
2347 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2348 void *vg, uint32_t desc)
2350 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2351 uint64_t *d = vd, *n = vn, *m = vm;
2352 uint8_t *pg = vg;
2354 for (i = 0; i < opr_sz; i += 1) {
2355 uint64_t nn = n[i], mm = m[i];
2356 d[i] = (pg[H1(i)] & 1 ? nn : mm);
2360 /* Two operand comparison controlled by a predicate.
2361 * ??? It is very tempting to expand this inline
2362 * with x86 instructions, e.g.
2364 * vcmpeqw zm, zn, %ymm0
2365 * vpmovmskb %ymm0, %eax
2366 * and $0x5555, %eax
2367 * and pg, %eax
2369 * or even aarch64, e.g.
2371 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2372 * cmeq v0.8h, zn, zm
2373 * and v0.8h, v0.8h, mask
2374 * addv h0, v0.8h
2375 * and v0.8b, pg
2377 * However, coming up with an abstraction that allows vector inputs and
2378 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2379 * scalar outputs, is tricky.
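*
* The loops below walk the vector backward in 64-byte chunks (one
* 64-bit predicate word at a time).  OUT accumulates one result bit
* per element, at that element's byte offset within the chunk; it is
* then ANDed with the guarding predicate (MASK keeps one bit per
* element), stored to the destination predicate, and folded into the
* NZCV flags via iter_predtest_bwd.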
2381 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
2382 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2384 intptr_t opr_sz = simd_oprsz(desc); \
2385 uint32_t flags = PREDTEST_INIT; \
2386 intptr_t i = opr_sz; \
2387 do { \
2388 uint64_t out = 0, pg; \
2389 do { \
2390 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2391 TYPE nn = *(TYPE *)(vn + H(i)); \
2392 TYPE mm = *(TYPE *)(vm + H(i)); \
2393 out |= nn OP mm; \
2394 } while (i & 63); \
2395 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2396 out &= pg; \
2397 *(uint64_t *)(vd + (i >> 3)) = out; \
2398 flags = iter_predtest_bwd(out, pg, flags); \
2399 } while (i > 0); \
2400 return flags; \
2403 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2404 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2405 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2406 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2407 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2408 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2409 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2410 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
2412 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
2413 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2414 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2415 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2417 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
2418 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2419 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2420 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2422 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
2423 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2424 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2425 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2427 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
2428 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2429 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2430 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2432 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
2433 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2434 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2435 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2437 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
2438 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2439 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2440 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2442 #undef DO_CMP_PPZZ_B
2443 #undef DO_CMP_PPZZ_H
2444 #undef DO_CMP_PPZZ_S
2445 #undef DO_CMP_PPZZ_D
2446 #undef DO_CMP_PPZZ
2448 /* Similar, but the second source is "wide". */
2449 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2450 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2452 intptr_t opr_sz = simd_oprsz(desc); \
2453 uint32_t flags = PREDTEST_INIT; \
2454 intptr_t i = opr_sz; \
2455 do { \
2456 uint64_t out = 0, pg; \
2457 do { \
2458 TYPEW mm = *(TYPEW *)(vm + i - 8); \
2459 do { \
2460 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2461 TYPE nn = *(TYPE *)(vn + H(i)); \
2462 out |= nn OP mm; \
2463 } while (i & 7); \
2464 } while (i & 63); \
2465 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2466 out &= pg; \
2467 *(uint64_t *)(vd + (i >> 3)) = out; \
2468 flags = iter_predtest_bwd(out, pg, flags); \
2469 } while (i > 0); \
2470 return flags; \
2473 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2474 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
2475 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2476 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2477 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2478 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2480 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
2481 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
2482 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
2484 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
2485 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
2486 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
2488 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
2489 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
2490 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
2492 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
2493 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
2494 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
2496 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
2497 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2498 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2500 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
2501 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2502 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2504 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
2505 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
2506 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
2508 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
2509 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
2510 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
2512 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
2513 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2514 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2516 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
2517 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2518 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2520 #undef DO_CMP_PPZW_B
2521 #undef DO_CMP_PPZW_H
2522 #undef DO_CMP_PPZW_S
2523 #undef DO_CMP_PPZW
2525 /* Similar, but the second source is immediate. */
2526 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
2527 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2529 intptr_t opr_sz = simd_oprsz(desc); \
2530 uint32_t flags = PREDTEST_INIT; \
2531 TYPE mm = simd_data(desc); \
2532 intptr_t i = opr_sz; \
2533 do { \
2534 uint64_t out = 0, pg; \
2535 do { \
2536 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2537 TYPE nn = *(TYPE *)(vn + H(i)); \
2538 out |= nn OP mm; \
2539 } while (i & 63); \
2540 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2541 out &= pg; \
2542 *(uint64_t *)(vd + (i >> 3)) = out; \
2543 flags = iter_predtest_bwd(out, pg, flags); \
2544 } while (i > 0); \
2545 return flags; \
2548 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2549 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2550 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2551 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2552 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2553 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2554 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2555 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
2557 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
2558 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
2559 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
2560 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
2562 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
2563 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
2564 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
2565 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
2567 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
2568 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
2569 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
2570 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
2572 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
2573 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
2574 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
2575 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
2577 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
2578 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
2579 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
2580 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
2582 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
2583 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
2584 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
2585 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
2587 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
2588 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
2589 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
2590 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
2592 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
2593 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
2594 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
2595 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
2597 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
2598 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
2599 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
2600 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
2602 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
2603 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
2604 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
2605 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
2607 #undef DO_CMP_PPZI_B
2608 #undef DO_CMP_PPZI_H
2609 #undef DO_CMP_PPZI_S
2610 #undef DO_CMP_PPZI_D
2611 #undef DO_CMP_PPZI
2613 /* Similar to the ARM LastActive pseudocode function. */
2614 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
2616 intptr_t i;
2618 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
2619 uint64_t pg = *(uint64_t *)(vg + i);
2620 if (pg) {
2621 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
2624 return 0;
2627 /* Compute a mask into RETB that is true for all G, up to and including
2628 * (if after) or excluding (if !after) the first G & N.
2629 * Return true if BRK found.
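*
* E.g. with g == 0xff and n == 0x10 (first active N bit at bit 4),
* after yields *retb == 0x1f and !after yields *retb == 0x0f; once a
* break has been found, subsequent words produce *retb == 0.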
2631 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
2632 bool brk, bool after)
2634 uint64_t b;
2636 if (brk) {
2637 b = 0;
2638 } else if ((g & n) == 0) {
2639 /* For all G, no N are set; break not found. */
2640 b = g;
2641 } else {
2642 /* Break somewhere in N. Locate it. */
2643 b = g & n; /* guard true, pred true */
2644 b = b & -b; /* first such */
2645 if (after) {
2646 b = b | (b - 1); /* break after same */
2647 } else {
2648 b = b - 1; /* break before same */
2650 brk = true;
2653 *retb = b;
2654 return brk;
2657 /* Compute a zeroing BRK. */
2658 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
2659 intptr_t oprsz, bool after)
2661 bool brk = false;
2662 intptr_t i;
2664 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2665 uint64_t this_b, this_g = g[i];
2667 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2668 d[i] = this_b & this_g;
2672 /* Likewise, but also compute flags. */
2673 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2674 intptr_t oprsz, bool after)
2676 uint32_t flags = PREDTEST_INIT;
2677 bool brk = false;
2678 intptr_t i;
2680 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2681 uint64_t this_b, this_d, this_g = g[i];
2683 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2684 d[i] = this_d = this_b & this_g;
2685 flags = iter_predtest_fwd(this_d, this_g, flags);
2687 return flags;
2690 /* Compute a merging BRK. */
2691 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
2692 intptr_t oprsz, bool after)
2694 bool brk = false;
2695 intptr_t i;
2697 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2698 uint64_t this_b, this_g = g[i];
2700 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2701 d[i] = (this_b & this_g) | (d[i] & ~this_g);
2705 /* Likewise, but also compute flags. */
2706 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2707 intptr_t oprsz, bool after)
2709 uint32_t flags = PREDTEST_INIT;
2710 bool brk = false;
2711 intptr_t i;
2713 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2714 uint64_t this_b, this_d = d[i], this_g = g[i];
2716 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2717 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2718 flags = iter_predtest_fwd(this_d, this_g, flags);
2720 return flags;
2723 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2725 /* It is quicker to zero the whole predicate than to loop over OPRSZ.
2726 * The compiler should turn this into 4 64-bit integer stores.
2728 memset(d, 0, sizeof(ARMPredicateReg));
2729 return PREDTEST_INIT;
2732 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2733 uint32_t pred_desc)
2735 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2736 if (last_active_pred(vn, vg, oprsz)) {
2737 compute_brk_z(vd, vm, vg, oprsz, true);
2738 } else {
2739 do_zero(vd, oprsz);
2743 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2744 uint32_t pred_desc)
2746 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2747 if (last_active_pred(vn, vg, oprsz)) {
2748 return compute_brks_z(vd, vm, vg, oprsz, true);
2749 } else {
2750 return do_zero(vd, oprsz);
2754 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2755 uint32_t pred_desc)
2757 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2758 if (last_active_pred(vn, vg, oprsz)) {
2759 compute_brk_z(vd, vm, vg, oprsz, false);
2760 } else {
2761 do_zero(vd, oprsz);
2765 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2766 uint32_t pred_desc)
2768 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2769 if (last_active_pred(vn, vg, oprsz)) {
2770 return compute_brks_z(vd, vm, vg, oprsz, false);
2771 } else {
2772 return do_zero(vd, oprsz);
2776 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2778 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2779 compute_brk_z(vd, vn, vg, oprsz, true);
2782 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2784 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2785 return compute_brks_z(vd, vn, vg, oprsz, true);
2788 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2790 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2791 compute_brk_z(vd, vn, vg, oprsz, false);
2794 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2796 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2797 return compute_brks_z(vd, vn, vg, oprsz, false);
2800 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2802 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2803 compute_brk_m(vd, vn, vg, oprsz, true);
2806 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2808 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2809 return compute_brks_m(vd, vn, vg, oprsz, true);
2812 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2814 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2815 compute_brk_m(vd, vn, vg, oprsz, false);
2818 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2820 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2821 return compute_brks_m(vd, vn, vg, oprsz, false);
2824 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2826 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2828 if (!last_active_pred(vn, vg, oprsz)) {
2829 do_zero(vd, oprsz);
2833 /* As if PredTest(Ones(PL), D, esz). */
2834 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2835 uint64_t esz_mask)
2837 uint32_t flags = PREDTEST_INIT;
2838 intptr_t i;
2840 for (i = 0; i < oprsz / 8; i++) {
2841 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2843 if (oprsz & 7) {
2844 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2845 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2847 return flags;
2850 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2852 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2854 if (last_active_pred(vn, vg, oprsz)) {
2855 return predtest_ones(vd, oprsz, -1);
2856 } else {
2857 return do_zero(vd, oprsz);
2861 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2863 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2864 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2865 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2866 intptr_t i;
2868 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2869 uint64_t t = n[i] & g[i] & mask;
2870 sum += ctpop64(t);
2872 return sum;
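/* Set the low COUNT bits of the predicate, filtered by esz_mask so
* that only one bit per element remains set, then compute NZCV as if
* by PredTest with an all-ones governing predicate. */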
2875 uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2877 uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2878 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2879 uint64_t esz_mask = pred_esz_masks[esz];
2880 ARMPredicateReg *d = vd;
2881 uint32_t flags;
2882 intptr_t i;
2884 /* Begin with a zero predicate register. */
2885 flags = do_zero(d, oprsz);
2886 if (count == 0) {
2887 return flags;
2890 /* Set all of the requested bits. */
2891 for (i = 0; i < count / 64; ++i) {
2892 d->p[i] = esz_mask;
2894 if (count & 63) {
2895 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2898 return predtest_ones(d, oprsz, esz_mask);
2901 /* Recursive reduction of a vector by a binary function;
2902 * cf. the ARM ARM function ReducePredicated.
2904 * While it would be possible to write this without the DATA temporary,
2905 * it is much simpler to process the predicate register this way.
2906 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
2907 * little to gain with a more complex non-recursive form.
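*
* Inactive elements, and the padding out to MAXSZ, are seeded with
* IDENT so that they cannot affect the result: zero for FADDV,
* +infinity for FMINV, -infinity for FMAXV, and the default NaN for
* FMINNMV and FMAXNMV.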
2909 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
2910 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
2912 if (n == 1) { \
2913 return *data; \
2914 } else { \
2915 uintptr_t half = n / 2; \
2916 TYPE lo = NAME##_reduce(data, status, half); \
2917 TYPE hi = NAME##_reduce(data + half, status, half); \
2918 return TYPE##_##FUNC(lo, hi, status); \
2921 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
2923 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc); \
2924 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
2925 for (i = 0; i < oprsz; ) { \
2926 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2927 do { \
2928 TYPE nn = *(TYPE *)(vn + H(i)); \
2929 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
2930 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2931 } while (i & 15); \
2933 for (; i < maxsz; i += sizeof(TYPE)) { \
2934 *(TYPE *)((void *)data + i) = IDENT; \
2936 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
2939 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
2940 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
2941 DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
2943 /* Identity is floatN_default_nan, without the function call. */
2944 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
2945 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
2946 DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
2948 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
2949 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
2950 DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
2952 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
2953 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
2954 DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
2956 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
2957 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
2958 DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
2960 #undef DO_REDUCE
2962 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
2963 void *status, uint32_t desc)
2965 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2966 float16 result = nn;
2968 do {
2969 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2970 do {
2971 if (pg & 1) {
2972 float16 mm = *(float16 *)(vm + H1_2(i));
2973 result = float16_add(result, mm, status);
2975 i += sizeof(float16), pg >>= sizeof(float16);
2976 } while (i & 15);
2977 } while (i < opr_sz);
2979 return result;
2982 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
2983 void *status, uint32_t desc)
2985 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2986 float32 result = nn;
2988 do {
2989 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2990 do {
2991 if (pg & 1) {
2992 float32 mm = *(float32 *)(vm + H1_2(i));
2993 result = float32_add(result, mm, status);
2995 i += sizeof(float32), pg >>= sizeof(float32);
2996 } while (i & 15);
2997 } while (i < opr_sz);
2999 return result;
3002 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
3003 void *status, uint32_t desc)
3005 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
3006 uint64_t *m = vm;
3007 uint8_t *pg = vg;
3009 for (i = 0; i < opr_sz; i++) {
3010 if (pg[H1(i)] & 1) {
3011 nn = float64_add(nn, m[i], status);
3015 return nn;
3018 /* Fully general three-operand expander, controlled by a predicate,
3019 * with the extra float_status parameter.
3021 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
3022 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3023 void *status, uint32_t desc) \
3025 intptr_t i = simd_oprsz(desc); \
3026 uint64_t *g = vg; \
3027 do { \
3028 uint64_t pg = g[(i - 1) >> 6]; \
3029 do { \
3030 i -= sizeof(TYPE); \
3031 if (likely((pg >> (i & 63)) & 1)) { \
3032 TYPE nn = *(TYPE *)(vn + H(i)); \
3033 TYPE mm = *(TYPE *)(vm + H(i)); \
3034 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3036 } while (i & 63); \
3037 } while (i != 0); \
3040 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3041 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3042 DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
3044 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3045 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3046 DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
3048 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3049 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3050 DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
3052 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3053 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3054 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
3056 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3057 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3058 DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
3060 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3061 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3062 DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
3064 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3065 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3066 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
3068 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3069 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3070 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3072 static inline float16 abd_h(float16 a, float16 b, float_status *s)
3074 return float16_abs(float16_sub(a, b, s));
3077 static inline float32 abd_s(float32 a, float32 b, float_status *s)
3079 return float32_abs(float32_sub(a, b, s));
3082 static inline float64 abd_d(float64 a, float64 b, float_status *s)
3084 return float64_abs(float64_sub(a, b, s));
3087 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
3088 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
3089 DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
3091 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3093 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3094 return float64_scalbn(a, b_int, s);
3097 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
3098 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
3099 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
3101 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
3102 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
3103 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
3105 #undef DO_ZPZZ_FP
3107 /* Three-operand expander, with one scalar operand, controlled by
3108 * a predicate, with the extra float_status parameter.
3110 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3111 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
3112 void *status, uint32_t desc) \
3114 intptr_t i = simd_oprsz(desc); \
3115 uint64_t *g = vg; \
3116 TYPE mm = scalar; \
3117 do { \
3118 uint64_t pg = g[(i - 1) >> 6]; \
3119 do { \
3120 i -= sizeof(TYPE); \
3121 if (likely((pg >> (i & 63)) & 1)) { \
3122 TYPE nn = *(TYPE *)(vn + H(i)); \
3123 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3125 } while (i & 63); \
3126 } while (i != 0); \
3129 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
3130 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
3131 DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
3133 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
3134 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
3135 DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
3137 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
3138 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
3139 DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
3141 static inline float16 subr_h(float16 a, float16 b, float_status *s)
3143 return float16_sub(b, a, s);
3146 static inline float32 subr_s(float32 a, float32 b, float_status *s)
3148 return float32_sub(b, a, s);
3151 static inline float64 subr_d(float64 a, float64 b, float_status *s)
3153 return float64_sub(b, a, s);
3156 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
3157 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
3158 DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
3160 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
3161 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
3162 DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
3164 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
3165 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
3166 DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
3168 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
3169 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
3170 DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
3172 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
3173 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
3174 DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
3176 /* Fully general two-operand expander, controlled by a predicate,
3177 * with the extra float_status parameter.
3179 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
3180 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3182 intptr_t i = simd_oprsz(desc); \
3183 uint64_t *g = vg; \
3184 do { \
3185 uint64_t pg = g[(i - 1) >> 6]; \
3186 do { \
3187 i -= sizeof(TYPE); \
3188 if (likely((pg >> (i & 63)) & 1)) { \
3189 TYPE nn = *(TYPE *)(vn + H(i)); \
3190 *(TYPE *)(vd + H(i)) = OP(nn, status); \
3192 } while (i & 63); \
3193 } while (i != 0); \
3196 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
3197 * FZ16. When converting from fp16, this affects flushing input denormals;
3198 * when converting to fp16, this affects flushing output denormals.
3200 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
3202 flag save = get_flush_inputs_to_zero(fpst);
3203 float32 ret;
3205 set_flush_inputs_to_zero(false, fpst);
3206 ret = float16_to_float32(f, true, fpst);
3207 set_flush_inputs_to_zero(save, fpst);
3208 return ret;
3211 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
3213 flag save = get_flush_inputs_to_zero(fpst);
3214 float64 ret;
3216 set_flush_inputs_to_zero(false, fpst);
3217 ret = float16_to_float64(f, true, fpst);
3218 set_flush_inputs_to_zero(save, fpst);
3219 return ret;
3222 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
3224 flag save = get_flush_to_zero(fpst);
3225 float16 ret;
3227 set_flush_to_zero(false, fpst);
3228 ret = float32_to_float16(f, true, fpst);
3229 set_flush_to_zero(save, fpst);
3230 return ret;
3233 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
3235 flag save = get_flush_to_zero(fpst);
3236 float16 ret;
3238 set_flush_to_zero(false, fpst);
3239 ret = float64_to_float16(f, true, fpst);
3240 set_flush_to_zero(save, fpst);
3241 return ret;
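/* Round-to-zero conversions.  The generic softfloat routines return a
* saturated integer for a NaN input, whereas these helpers must return
* 0 (while still raising Invalid), so NaNs are checked first. */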
3244 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3246 if (float16_is_any_nan(f)) {
3247 float_raise(float_flag_invalid, s);
3248 return 0;
3250 return float16_to_int16_round_to_zero(f, s);
3253 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3255 if (float16_is_any_nan(f)) {
3256 float_raise(float_flag_invalid, s);
3257 return 0;
3259 return float16_to_int64_round_to_zero(f, s);
3262 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3264 if (float32_is_any_nan(f)) {
3265 float_raise(float_flag_invalid, s);
3266 return 0;
3268 return float32_to_int64_round_to_zero(f, s);
3271 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3273 if (float64_is_any_nan(f)) {
3274 float_raise(float_flag_invalid, s);
3275 return 0;
3277 return float64_to_int64_round_to_zero(f, s);
3280 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3282 if (float16_is_any_nan(f)) {
3283 float_raise(float_flag_invalid, s);
3284 return 0;
3286 return float16_to_uint16_round_to_zero(f, s);
3289 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3291 if (float16_is_any_nan(f)) {
3292 float_raise(float_flag_invalid, s);
3293 return 0;
3295 return float16_to_uint64_round_to_zero(f, s);
3298 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3300 if (float32_is_any_nan(f)) {
3301 float_raise(float_flag_invalid, s);
3302 return 0;
3304 return float32_to_uint64_round_to_zero(f, s);
3307 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3309 if (float64_is_any_nan(f)) {
3310 float_raise(float_flag_invalid, s);
3311 return 0;
3313 return float64_to_uint64_round_to_zero(f, s);
3316 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
3317 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
3318 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
3319 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
3320 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
3321 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
3323 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
3324 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
3325 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
3326 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
3327 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
3328 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
3329 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
3331 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
3332 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
3333 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
3334 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
3335 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
3336 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
3337 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
3339 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
3340 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
3341 DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
3343 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
3344 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
3345 DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
3347 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
3348 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
3349 DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
3351 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
3352 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
3353 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
3355 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
3356 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
3357 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
3358 DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
3359 DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
3360 DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
3361 DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
3363 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
3364 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
3365 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
3366 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
3367 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
3368 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
3369 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
3371 #undef DO_ZPZ_FP
3373 /* 4-operand predicated multiply-add. This requires 7 operands to pass
3374 * "properly", so we need to encode some of the registers into DESC.
3376 QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 20 > 32);
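/* NEG1 is xored into the multiplicand taken from Zn and NEG3 into the
* addend taken from Za.  Setting the sign bit in NEG1 gives FMLS, in
* NEG3 gives FNMLS, and in both gives FNMLA. */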
3378 static void do_fmla_zpzzz_h(CPUARMState *env, void *vg, uint32_t desc,
3379 uint16_t neg1, uint16_t neg3)
3381 intptr_t i = simd_oprsz(desc);
3382 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3383 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3384 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3385 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3386 void *vd = &env->vfp.zregs[rd];
3387 void *vn = &env->vfp.zregs[rn];
3388 void *vm = &env->vfp.zregs[rm];
3389 void *va = &env->vfp.zregs[ra];
3390 uint64_t *g = vg;
3392 do {
3393 uint64_t pg = g[(i - 1) >> 6];
3394 do {
3395 i -= 2;
3396 if (likely((pg >> (i & 63)) & 1)) {
3397 float16 e1, e2, e3, r;
3399 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3400 e2 = *(uint16_t *)(vm + H1_2(i));
3401 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
3402 r = float16_muladd(e1, e2, e3, 0, &env->vfp.fp_status_f16);
3403 *(uint16_t *)(vd + H1_2(i)) = r;
3405 } while (i & 63);
3406 } while (i != 0);
3409 void HELPER(sve_fmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3411 do_fmla_zpzzz_h(env, vg, desc, 0, 0);
3414 void HELPER(sve_fmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3416 do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0);
3419 void HELPER(sve_fnmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3421 do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0x8000);
3424 void HELPER(sve_fnmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3426 do_fmla_zpzzz_h(env, vg, desc, 0, 0x8000);
3429 static void do_fmla_zpzzz_s(CPUARMState *env, void *vg, uint32_t desc,
3430 uint32_t neg1, uint32_t neg3)
3432 intptr_t i = simd_oprsz(desc);
3433 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3434 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3435 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3436 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3437 void *vd = &env->vfp.zregs[rd];
3438 void *vn = &env->vfp.zregs[rn];
3439 void *vm = &env->vfp.zregs[rm];
3440 void *va = &env->vfp.zregs[ra];
3441 uint64_t *g = vg;
3443 do {
3444 uint64_t pg = g[(i - 1) >> 6];
3445 do {
3446 i -= 4;
3447 if (likely((pg >> (i & 63)) & 1)) {
3448 float32 e1, e2, e3, r;
3450 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3451 e2 = *(uint32_t *)(vm + H1_4(i));
3452 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
3453 r = float32_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3454 *(uint32_t *)(vd + H1_4(i)) = r;
3456 } while (i & 63);
3457 } while (i != 0);
3460 void HELPER(sve_fmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3462 do_fmla_zpzzz_s(env, vg, desc, 0, 0);
3465 void HELPER(sve_fmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3467 do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0);
3470 void HELPER(sve_fnmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3472 do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0x80000000);
3475 void HELPER(sve_fnmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3477 do_fmla_zpzzz_s(env, vg, desc, 0, 0x80000000);
3480 static void do_fmla_zpzzz_d(CPUARMState *env, void *vg, uint32_t desc,
3481 uint64_t neg1, uint64_t neg3)
3483 intptr_t i = simd_oprsz(desc);
3484 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3485 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3486 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3487 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3488 void *vd = &env->vfp.zregs[rd];
3489 void *vn = &env->vfp.zregs[rn];
3490 void *vm = &env->vfp.zregs[rm];
3491 void *va = &env->vfp.zregs[ra];
3492 uint64_t *g = vg;
3494 do {
3495 uint64_t pg = g[(i - 1) >> 6];
3496 do {
3497 i -= 8;
3498 if (likely((pg >> (i & 63)) & 1)) {
3499 float64 e1, e2, e3, r;
3501 e1 = *(uint64_t *)(vn + i) ^ neg1;
3502 e2 = *(uint64_t *)(vm + i);
3503 e3 = *(uint64_t *)(va + i) ^ neg3;
3504 r = float64_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3505 *(uint64_t *)(vd + i) = r;
3507 } while (i & 63);
3508 } while (i != 0);
3511 void HELPER(sve_fmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3513 do_fmla_zpzzz_d(env, vg, desc, 0, 0);
3516 void HELPER(sve_fmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3518 do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, 0);
3521 void HELPER(sve_fnmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3523 do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, INT64_MIN);
3526 void HELPER(sve_fnmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3528 do_fmla_zpzzz_d(env, vg, desc, 0, INT64_MIN);
3531 /* Two operand floating-point comparison controlled by a predicate.
3532 * Unlike the integer version, we are not allowed to optimistically
3533 * compare operands, since the comparison may have side effects wrt
3534 * the FPSR.
3536 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
3537 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3538 void *status, uint32_t desc) \
3540 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3541 uint64_t *d = vd, *g = vg; \
3542 do { \
3543 uint64_t out = 0, pg = g[j]; \
3544 do { \
3545 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3546 if (likely((pg >> (i & 63)) & 1)) { \
3547 TYPE nn = *(TYPE *)(vn + H(i)); \
3548 TYPE mm = *(TYPE *)(vm + H(i)); \
3549 out |= OP(TYPE, nn, mm, status); \
3551 } while (i & 63); \
3552 d[j--] = out; \
3553 } while (i > 0); \
3556 #define DO_FPCMP_PPZZ_H(NAME, OP) \
3557 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
3558 #define DO_FPCMP_PPZZ_S(NAME, OP) \
3559 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
3560 #define DO_FPCMP_PPZZ_D(NAME, OP) \
3561 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
3563 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
3564 DO_FPCMP_PPZZ_H(NAME, OP) \
3565 DO_FPCMP_PPZZ_S(NAME, OP) \
3566 DO_FPCMP_PPZZ_D(NAME, OP)
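/* FCMGE/FCMGT/FCMLE/FCMLT and the absolute compares FACGE/FACGT use
* signalling comparisons (any NaN operand raises Invalid), while
* FCMEQ/FCMNE/FCMUO use quiet comparisons. */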
3568 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
3569 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
3570 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
3571 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
3572 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
3573 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
3574 #define DO_FCMUO(TYPE, X, Y, ST) \
3575 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
3576 #define DO_FACGE(TYPE, X, Y, ST) \
3577 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
3578 #define DO_FACGT(TYPE, X, Y, ST) \
3579 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
3581 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
3582 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
3583 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
3584 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
3585 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
3586 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
3587 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
3589 #undef DO_FPCMP_PPZZ_ALL
3590 #undef DO_FPCMP_PPZZ_D
3591 #undef DO_FPCMP_PPZZ_S
3592 #undef DO_FPCMP_PPZZ_H
3593 #undef DO_FPCMP_PPZZ
3595 /* One operand floating-point comparison against zero, controlled
3596 * by a predicate.
3598 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
3599 void HELPER(NAME)(void *vd, void *vn, void *vg, \
3600 void *status, uint32_t desc) \
3602 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3603 uint64_t *d = vd, *g = vg; \
3604 do { \
3605 uint64_t out = 0, pg = g[j]; \
3606 do { \
3607 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3608 if ((pg >> (i & 63)) & 1) { \
3609 TYPE nn = *(TYPE *)(vn + H(i)); \
3610 out |= OP(TYPE, nn, 0, status); \
3612 } while (i & 63); \
3613 d[j--] = out; \
3614 } while (i > 0); \
3617 #define DO_FPCMP_PPZ0_H(NAME, OP) \
3618 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
3619 #define DO_FPCMP_PPZ0_S(NAME, OP) \
3620 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
3621 #define DO_FPCMP_PPZ0_D(NAME, OP) \
3622 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
3624 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
3625 DO_FPCMP_PPZ0_H(NAME, OP) \
3626 DO_FPCMP_PPZ0_S(NAME, OP) \
3627 DO_FPCMP_PPZ0_D(NAME, OP)
3629 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
3630 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
3631 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
3632 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
3633 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
3634 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3636 /* FP Trig Multiply-Add. */
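/* The coefficient tables below appear to hold the polynomial terms for
* sine (entries 0-7) and cosine (entries 8-15); the sign of each
* element of M selects between the two halves (hence xx += 8), and the
* immediate in simd_data() selects the term. */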
3638 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3640 static const float16 coeff[16] = {
3641 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3642 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3644 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
3645 intptr_t x = simd_data(desc);
3646 float16 *d = vd, *n = vn, *m = vm;
3647 for (i = 0; i < opr_sz; i++) {
3648 float16 mm = m[i];
3649 intptr_t xx = x;
3650 if (float16_is_neg(mm)) {
3651 mm = float16_abs(mm);
3652 xx += 8;
3654 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
3658 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3660 static const float32 coeff[16] = {
3661 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3662 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3663 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3664 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3666 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
3667 intptr_t x = simd_data(desc);
3668 float32 *d = vd, *n = vn, *m = vm;
3669 for (i = 0; i < opr_sz; i++) {
3670 float32 mm = m[i];
3671 intptr_t xx = x;
3672 if (float32_is_neg(mm)) {
3673 mm = float32_abs(mm);
3674 xx += 8;
3676 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
3680 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3682 static const float64 coeff[16] = {
3683 0x3ff0000000000000ull, 0xbfc5555555555543ull,
3684 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
3685 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
3686 0x3de5d8408868552full, 0x0000000000000000ull,
3687 0x3ff0000000000000ull, 0xbfe0000000000000ull,
3688 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
3689 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
3690 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
3692 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
3693 intptr_t x = simd_data(desc);
3694 float64 *d = vd, *n = vn, *m = vm;
3695 for (i = 0; i < opr_sz; i++) {
3696 float64 mm = m[i];
3697 intptr_t xx = x;
3698 if (float64_is_neg(mm)) {
3699 mm = float64_abs(mm);
3700 xx += 8;
3702 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
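/*
 * Note on the coefficient selection above: the immediate in
 * simd_data(desc) indexes the low half of the table; a negative
 * multiplicand is replaced by its absolute value and selects the
 * matching entry in the high half instead.  As a worked example for
 * the double-precision helper (operand values chosen arbitrarily),
 * x == 1 and m == -0.5 give:
 *
 *     xx = 1 + 8 = 9;  coeff[9] == 0xbfe0000000000000 == -0.5
 *     d  = fma(n, 0.5, -0.5)
 */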
3707 * FP Complex Add
3710 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
3711 void *vs, uint32_t desc)
3713 intptr_t j, i = simd_oprsz(desc);
3714 uint64_t *g = vg;
3715 float16 neg_imag = float16_set_sign(0, simd_data(desc));
3716 float16 neg_real = float16_chs(neg_imag);
3718 do {
3719 uint64_t pg = g[(i - 1) >> 6];
3720 do {
3721 float16 e0, e1, e2, e3;
3723 /* I holds the real index; J holds the imag index. */
3724 j = i - sizeof(float16);
3725 i -= 2 * sizeof(float16);
3727 e0 = *(float16 *)(vn + H1_2(i));
3728 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
3729 e2 = *(float16 *)(vn + H1_2(j));
3730 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
3732 if (likely((pg >> (i & 63)) & 1)) {
3733 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
3735 if (likely((pg >> (j & 63)) & 1)) {
3736 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
3738 } while (i & 63);
3739 } while (i != 0);
3742 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
3743 void *vs, uint32_t desc)
3745 intptr_t j, i = simd_oprsz(desc);
3746 uint64_t *g = vg;
3747 float32 neg_imag = float32_set_sign(0, simd_data(desc));
3748 float32 neg_real = float32_chs(neg_imag);
3750 do {
3751 uint64_t pg = g[(i - 1) >> 6];
3752 do {
3753 float32 e0, e1, e2, e3;
3755 /* I holds the real index; J holds the imag index. */
3756 j = i - sizeof(float32);
3757 i -= 2 * sizeof(float32);
3759 e0 = *(float32 *)(vn + H1_2(i));
3760 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
3761 e2 = *(float32 *)(vn + H1_2(j));
3762 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
3764 if (likely((pg >> (i & 63)) & 1)) {
3765 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
3767 if (likely((pg >> (j & 63)) & 1)) {
3768 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
3770 } while (i & 63);
3771 } while (i != 0);
3774 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
3775 void *vs, uint32_t desc)
3777 intptr_t j, i = simd_oprsz(desc);
3778 uint64_t *g = vg;
3779 float64 neg_imag = float64_set_sign(0, simd_data(desc));
3780 float64 neg_real = float64_chs(neg_imag);
3782 do {
3783 uint64_t pg = g[(i - 1) >> 6];
3784 do {
3785 float64 e0, e1, e2, e3;
3787 /* I holds the real index; J holds the imag index. */
3788 j = i - sizeof(float64);
3789 i -= 2 * sizeof(float64);
3791 e0 = *(float64 *)(vn + H1_2(i));
3792 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
3793 e2 = *(float64 *)(vn + H1_2(j));
3794 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
3796 if (likely((pg >> (i & 63)) & 1)) {
3797 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
3799 if (likely((pg >> (j & 63)) & 1)) {
3800 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
3802 } while (i & 63);
3803 } while (i != 0);
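/*
 * Note on the rotation encoding used by the three helpers above:
 * simd_data(desc) holds a single bit.  With the bit clear, neg_real
 * has its sign bit set, so the real lane computes n_r - m_i and the
 * imaginary lane computes n_i + m_r, i.e. the second operand is
 * rotated by +90 degrees before the add.  With the bit set the signs
 * swap, giving n_r + m_i and n_i - m_r (a 270 degree rotation).
 * E.g. n = 1+2i, m = 3+4i, rotation bit clear:  (1-4) + (2+3)i = -3+5i.
 */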
3807 * FP Complex Multiply
3810 QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 22 > 32);
3812 void HELPER(sve_fcmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3814 intptr_t j, i = simd_oprsz(desc);
3815 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3816 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3817 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3818 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3819 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3820 bool flip = rot & 1;
3821 float16 neg_imag, neg_real;
3822 void *vd = &env->vfp.zregs[rd];
3823 void *vn = &env->vfp.zregs[rn];
3824 void *vm = &env->vfp.zregs[rm];
3825 void *va = &env->vfp.zregs[ra];
3826 uint64_t *g = vg;
3828 neg_imag = float16_set_sign(0, (rot & 2) != 0);
3829 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
3831 do {
3832 uint64_t pg = g[(i - 1) >> 6];
3833 do {
3834 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
3836 /* I holds the real index; J holds the imag index. */
3837 j = i - sizeof(float16);
3838 i -= 2 * sizeof(float16);
3840 nr = *(float16 *)(vn + H1_2(i));
3841 ni = *(float16 *)(vn + H1_2(j));
3842 mr = *(float16 *)(vm + H1_2(i));
3843 mi = *(float16 *)(vm + H1_2(j));
3845 e2 = (flip ? ni : nr);
3846 e1 = (flip ? mi : mr) ^ neg_real;
3847 e4 = e2;
3848 e3 = (flip ? mr : mi) ^ neg_imag;
3850 if (likely((pg >> (i & 63)) & 1)) {
3851 d = *(float16 *)(va + H1_2(i));
3852 d = float16_muladd(e2, e1, d, 0, &env->vfp.fp_status_f16);
3853 *(float16 *)(vd + H1_2(i)) = d;
3855 if (likely((pg >> (j & 63)) & 1)) {
3856 d = *(float16 *)(va + H1_2(j));
3857 d = float16_muladd(e4, e3, d, 0, &env->vfp.fp_status_f16);
3858 *(float16 *)(vd + H1_2(j)) = d;
3860 } while (i & 63);
3861 } while (i != 0);
3864 void HELPER(sve_fcmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3866 intptr_t j, i = simd_oprsz(desc);
3867 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3868 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3869 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3870 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3871 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3872 bool flip = rot & 1;
3873 float32 neg_imag, neg_real;
3874 void *vd = &env->vfp.zregs[rd];
3875 void *vn = &env->vfp.zregs[rn];
3876 void *vm = &env->vfp.zregs[rm];
3877 void *va = &env->vfp.zregs[ra];
3878 uint64_t *g = vg;
3880 neg_imag = float32_set_sign(0, (rot & 2) != 0);
3881 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
3883 do {
3884 uint64_t pg = g[(i - 1) >> 6];
3885 do {
3886 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
3888 /* I holds the real index; J holds the imag index. */
3889 j = i - sizeof(float32);
3890 i -= 2 * sizeof(float32);
3892 nr = *(float32 *)(vn + H1_2(i));
3893 ni = *(float32 *)(vn + H1_2(j));
3894 mr = *(float32 *)(vm + H1_2(i));
3895 mi = *(float32 *)(vm + H1_2(j));
3897 e2 = (flip ? ni : nr);
3898 e1 = (flip ? mi : mr) ^ neg_real;
3899 e4 = e2;
3900 e3 = (flip ? mr : mi) ^ neg_imag;
3902 if (likely((pg >> (i & 63)) & 1)) {
3903 d = *(float32 *)(va + H1_2(i));
3904 d = float32_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3905 *(float32 *)(vd + H1_2(i)) = d;
3907 if (likely((pg >> (j & 63)) & 1)) {
3908 d = *(float32 *)(va + H1_2(j));
3909 d = float32_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3910 *(float32 *)(vd + H1_2(j)) = d;
3912 } while (i & 63);
3913 } while (i != 0);
3916 void HELPER(sve_fcmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3918 intptr_t j, i = simd_oprsz(desc);
3919 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3920 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3921 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3922 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3923 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3924 bool flip = rot & 1;
3925 float64 neg_imag, neg_real;
3926 void *vd = &env->vfp.zregs[rd];
3927 void *vn = &env->vfp.zregs[rn];
3928 void *vm = &env->vfp.zregs[rm];
3929 void *va = &env->vfp.zregs[ra];
3930 uint64_t *g = vg;
3932 neg_imag = float64_set_sign(0, (rot & 2) != 0);
3933 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
3935 do {
3936 uint64_t pg = g[(i - 1) >> 6];
3937 do {
3938 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
3940 /* I holds the real index; J holds the imag index. */
3941 j = i - sizeof(float64);
3942 i -= 2 * sizeof(float64);
3944 nr = *(float64 *)(vn + H1_2(i));
3945 ni = *(float64 *)(vn + H1_2(j));
3946 mr = *(float64 *)(vm + H1_2(i));
3947 mi = *(float64 *)(vm + H1_2(j));
3949 e2 = (flip ? ni : nr);
3950 e1 = (flip ? mi : mr) ^ neg_real;
3951 e4 = e2;
3952 e3 = (flip ? mr : mi) ^ neg_imag;
3954 if (likely((pg >> (i & 63)) & 1)) {
3955 d = *(float64 *)(va + H1_2(i));
3956 d = float64_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3957 *(float64 *)(vd + H1_2(i)) = d;
3959 if (likely((pg >> (j & 63)) & 1)) {
3960 d = *(float64 *)(va + H1_2(j));
3961 d = float64_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3962 *(float64 *)(vd + H1_2(j)) = d;
3964 } while (i & 63);
3965 } while (i != 0);
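/*
 * Summary of the two-bit rotation decode shared by the three helpers
 * above (nr/ni, mr/mi are the real/imag parts of N and M; ar/ai the
 * addend lanes from VA; dr/di the lanes written to VD):
 *
 *     rot  flip  real sign  imag sign  result per complex pair
 *      0     0       +          +      dr = ar + nr*mr;  di = ai + nr*mi
 *      1     1       -          +      dr = ar - ni*mi;  di = ai + ni*mr
 *      2     0       -          -      dr = ar - nr*mr;  di = ai - nr*mi
 *      3     1       +          -      dr = ar + ni*mi;  di = ai - ni*mr
 *
 * FLIP selects whether the real or imaginary part of N feeds both
 * multiplies; the neg_* constants are zeroes with the sign bit
 * optionally set, applied to M by xor before the fused multiply-add.
 */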
3969 * Load contiguous data, protected by a governing predicate.
3973 * Load elements into @vd, controlled by @vg, from @host + @mem_ofs.
3974 * Memory is valid through @host + @mem_max. The register element
3975  * indices are inferred from @mem_ofs, as modified by the types for
3976 * which the helper is built. Return the @mem_ofs of the first element
3977 * not loaded (which is @mem_max if they are all loaded).
3979 * For softmmu, we have fully validated the guest page. For user-only,
3980 * we cannot fully validate without taking the mmap lock, but since we
3981 * know the access is within one host page, if any access is valid they
3982 * all must be valid. However, when @vg is all false, it may be that
3983 * no access is valid.
3985 typedef intptr_t sve_ld1_host_fn(void *vd, void *vg, void *host,
3986 intptr_t mem_ofs, intptr_t mem_max);
3989 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
3990 * The controlling predicate is known to be true.
3992 typedef void sve_ld1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
3993 target_ulong vaddr, int mmu_idx, uintptr_t ra);
3994 typedef sve_ld1_tlb_fn sve_st1_tlb_fn;
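/*
 * A rough sketch (not part of the original code) of how these two
 * primitives are combined by the common loaders below, for the simple
 * case where the whole transfer sits in one guest page:
 *
 *     host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD, mmu_idx);
 *     if (host) {
 *         host_fn(vd, vg, host, 0, mem_max);            // fast path
 *     } else {
 *         tlb_fn(env, vd, reg_off, addr, mmu_idx, ra);  // one element
 *     }
 *
 * host_fn consumes as many contiguous elements as fit in validated
 * host memory; tlb_fn pushes a single element through the full
 * cpu_ldst/softmmu path and may fault.
 */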
3997 * Generate the above primitives.
4000 #define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
4001 static intptr_t sve_##NAME##_host(void *vd, void *vg, void *host, \
4002 intptr_t mem_off, const intptr_t mem_max) \
4004 intptr_t reg_off = mem_off * (sizeof(TYPEE) / sizeof(TYPEM)); \
4005 uint64_t *pg = vg; \
4006 while (mem_off + sizeof(TYPEM) <= mem_max) { \
4007 TYPEM val = 0; \
4008 if (likely((pg[reg_off >> 6] >> (reg_off & 63)) & 1)) { \
4009 val = HOST(host + mem_off); \
4011 *(TYPEE *)(vd + H(reg_off)) = val; \
4012 mem_off += sizeof(TYPEM), reg_off += sizeof(TYPEE); \
4014 return mem_off; \
4017 #ifdef CONFIG_SOFTMMU
4018 #define DO_LD_TLB(NAME, H, TYPEE, TYPEM, HOST, MOEND, TLB) \
4019 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
4020 target_ulong addr, int mmu_idx, uintptr_t ra) \
4022 TCGMemOpIdx oi = make_memop_idx(ctz32(sizeof(TYPEM)) | MOEND, mmu_idx); \
4023 TYPEM val = TLB(env, addr, oi, ra); \
4024 *(TYPEE *)(vd + H(reg_off)) = val; \
4026 #else
4027 #define DO_LD_TLB(NAME, H, TYPEE, TYPEM, HOST, MOEND, TLB) \
4028 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
4029 target_ulong addr, int mmu_idx, uintptr_t ra) \
4031 TYPEM val = HOST(g2h(addr)); \
4032 *(TYPEE *)(vd + H(reg_off)) = val; \
4034 #endif
4036 #define DO_LD_PRIM_1(NAME, H, TE, TM) \
4037 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
4038 DO_LD_TLB(NAME, H, TE, TM, ldub_p, 0, helper_ret_ldub_mmu)
4040 DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
4041 DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
4042 DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
4043 DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
4044 DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
4045 DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t)
4046 DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t)
4048 #define DO_LD_PRIM_2(NAME, end, MOEND, H, TE, TM, PH, PT) \
4049 DO_LD_HOST(NAME##_##end, H, TE, TM, PH##_##end##_p) \
4050 DO_LD_TLB(NAME##_##end, H, TE, TM, PH##_##end##_p, \
4051 MOEND, helper_##end##_##PT##_mmu)
4053 DO_LD_PRIM_2(ld1hh, le, MO_LE, H1_2, uint16_t, uint16_t, lduw, lduw)
4054 DO_LD_PRIM_2(ld1hsu, le, MO_LE, H1_4, uint32_t, uint16_t, lduw, lduw)
4055 DO_LD_PRIM_2(ld1hss, le, MO_LE, H1_4, uint32_t, int16_t, lduw, lduw)
4056 DO_LD_PRIM_2(ld1hdu, le, MO_LE, , uint64_t, uint16_t, lduw, lduw)
4057 DO_LD_PRIM_2(ld1hds, le, MO_LE, , uint64_t, int16_t, lduw, lduw)
4059 DO_LD_PRIM_2(ld1ss, le, MO_LE, H1_4, uint32_t, uint32_t, ldl, ldul)
4060 DO_LD_PRIM_2(ld1sdu, le, MO_LE, , uint64_t, uint32_t, ldl, ldul)
4061 DO_LD_PRIM_2(ld1sds, le, MO_LE, , uint64_t, int32_t, ldl, ldul)
4063 DO_LD_PRIM_2(ld1dd, le, MO_LE, , uint64_t, uint64_t, ldq, ldq)
4065 DO_LD_PRIM_2(ld1hh, be, MO_BE, H1_2, uint16_t, uint16_t, lduw, lduw)
4066 DO_LD_PRIM_2(ld1hsu, be, MO_BE, H1_4, uint32_t, uint16_t, lduw, lduw)
4067 DO_LD_PRIM_2(ld1hss, be, MO_BE, H1_4, uint32_t, int16_t, lduw, lduw)
4068 DO_LD_PRIM_2(ld1hdu, be, MO_BE, , uint64_t, uint16_t, lduw, lduw)
4069 DO_LD_PRIM_2(ld1hds, be, MO_BE, , uint64_t, int16_t, lduw, lduw)
4071 DO_LD_PRIM_2(ld1ss, be, MO_BE, H1_4, uint32_t, uint32_t, ldl, ldul)
4072 DO_LD_PRIM_2(ld1sdu, be, MO_BE, , uint64_t, uint32_t, ldl, ldul)
4073 DO_LD_PRIM_2(ld1sds, be, MO_BE, , uint64_t, int32_t, ldl, ldul)
4075 DO_LD_PRIM_2(ld1dd, be, MO_BE, , uint64_t, uint64_t, ldq, ldq)
4077 #undef DO_LD_TLB
4078 #undef DO_LD_HOST
4079 #undef DO_LD_PRIM_1
4080 #undef DO_LD_PRIM_2
4083 * Skip through a sequence of inactive elements in the guarding predicate @vg,
4084  * beginning at @reg_off bounded by @reg_max. Return the offset of the first
4085  * active element >= @reg_off, or @reg_max if there were no active elements at all.
4087 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
4088 intptr_t reg_max, int esz)
4090 uint64_t pg_mask = pred_esz_masks[esz];
4091 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
4093 /* In normal usage, the first element is active. */
4094 if (likely(pg & 1)) {
4095 return reg_off;
4098 if (pg == 0) {
4099 reg_off &= -64;
4100 do {
4101 reg_off += 64;
4102 if (unlikely(reg_off >= reg_max)) {
4103 /* The entire predicate was false. */
4104 return reg_max;
4106 pg = vg[reg_off >> 6] & pg_mask;
4107 } while (pg == 0);
4109 reg_off += ctz64(pg);
4111 /* We should never see an out of range predicate bit set. */
4112 tcg_debug_assert(reg_off < reg_max);
4113 return reg_off;
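/*
 * Worked example: for esz == 2 (4-byte elements) only every fourth
 * predicate bit is significant.  With reg_off == 0 and
 * vg[0] == 0x1110 (element 0 inactive, elements 1..3 active):
 *
 *     pg = vg[0] & pred_esz_masks[2] = 0x1110
 *     pg & 1 == 0 and pg != 0, so reg_off += ctz64(pg) == 4
 *
 * i.e. the search lands on the byte offset of element 1.
 */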
4117 * Return the maximum offset <= @mem_max which is still within the page
4118 * referenced by @base + @mem_off.
4120 static intptr_t max_for_page(target_ulong base, intptr_t mem_off,
4121 intptr_t mem_max)
4123 target_ulong addr = base + mem_off;
4124 intptr_t split = -(intptr_t)(addr | TARGET_PAGE_MASK);
4125 return MIN(split, mem_max - mem_off) + mem_off;
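/*
 * Worked example, assuming 4KiB target pages: if base + mem_off points
 * at byte 0xffc of a page, then addr | TARGET_PAGE_MASK == -4 and
 * split == 4, i.e. only 4 more bytes can be accessed before crossing
 * into the next page.  The MIN() clamps that against the bytes left in
 * the transfer, and adding back mem_off converts the count into the
 * offset that is returned.
 */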
4128 static inline void set_helper_retaddr(uintptr_t ra)
4130 #ifdef CONFIG_USER_ONLY
4131 helper_retaddr = ra;
4132 #endif
4136 * The result of tlb_vaddr_to_host for user-only is just g2h(x),
4137 * which is always non-null. Elide the useless test.
4139 static inline bool test_host_page(void *host)
4141 #ifdef CONFIG_USER_ONLY
4142 return true;
4143 #else
4144 return likely(host != NULL);
4145 #endif
4149 * Common helper for all contiguous one-register predicated loads.
4151 static void sve_ld1_r(CPUARMState *env, void *vg, const target_ulong addr,
4152 uint32_t desc, const uintptr_t retaddr,
4153 const int esz, const int msz,
4154 sve_ld1_host_fn *host_fn,
4155 sve_ld1_tlb_fn *tlb_fn)
4157 void *vd = &env->vfp.zregs[simd_data(desc)];
4158 const int diffsz = esz - msz;
4159 const intptr_t reg_max = simd_oprsz(desc);
4160 const intptr_t mem_max = reg_max >> diffsz;
4161 const int mmu_idx = cpu_mmu_index(env, false);
4162 ARMVectorReg scratch;
4163 void *host;
4164 intptr_t split, reg_off, mem_off;
4166 /* Find the first active element. */
4167 reg_off = find_next_active(vg, 0, reg_max, esz);
4168 if (unlikely(reg_off == reg_max)) {
4169 /* The entire predicate was false; no load occurs. */
4170 memset(vd, 0, reg_max);
4171 return;
4173 mem_off = reg_off >> diffsz;
4174 set_helper_retaddr(retaddr);
4177 * If the (remaining) load is entirely within a single page, then:
4178  * For softmmu, if the TLB hits then no faults will occur;
4179 * For user-only, either the first load will fault or none will.
4180 * We can thus perform the load directly to the destination and
4181 * Vd will be unmodified on any exception path.
4183 split = max_for_page(addr, mem_off, mem_max);
4184 if (likely(split == mem_max)) {
4185 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4186 if (test_host_page(host)) {
4187 mem_off = host_fn(vd, vg, host - mem_off, mem_off, mem_max);
4188 tcg_debug_assert(mem_off == mem_max);
4189 set_helper_retaddr(0);
4190 /* After having taken any fault, zero leading inactive elements. */
4191 swap_memzero(vd, reg_off);
4192 return;
4197 * Perform the predicated read into a temporary, thus ensuring
4198 * if the load of the last element faults, Vd is not modified.
4200 #ifdef CONFIG_USER_ONLY
4201 swap_memzero(&scratch, reg_off);
4202 host_fn(&scratch, vg, g2h(addr), mem_off, mem_max);
4203 #else
4204 memset(&scratch, 0, reg_max);
4205 goto start;
4206 while (1) {
4207 reg_off = find_next_active(vg, reg_off, reg_max, esz);
4208 if (reg_off >= reg_max) {
4209 break;
4211 mem_off = reg_off >> diffsz;
4212 split = max_for_page(addr, mem_off, mem_max);
4214 start:
4215 if (split - mem_off >= (1 << msz)) {
4216 /* At least one whole element on this page. */
4217 host = tlb_vaddr_to_host(env, addr + mem_off,
4218 MMU_DATA_LOAD, mmu_idx);
4219 if (host) {
4220 mem_off = host_fn(&scratch, vg, host - mem_off,
4221 mem_off, split);
4222 reg_off = mem_off << diffsz;
4223 continue;
4228 * Perform one normal read. This may fault, longjmping out to the
4229 * main loop in order to raise an exception. It may succeed, and
4230 * as a side-effect load the TLB entry for the next round. Finally,
4231 * in the extremely unlikely case we're performing this operation
4232 * on I/O memory, it may succeed but not bring in the TLB entry.
4233 * But even then we have still made forward progress.
4235 tlb_fn(env, &scratch, reg_off, addr + mem_off, mmu_idx, retaddr);
4236 reg_off += 1 << esz;
4238 #endif
4240 set_helper_retaddr(0);
4241 memcpy(vd, &scratch, reg_max);
4244 #define DO_LD1_1(NAME, ESZ) \
4245 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
4246 target_ulong addr, uint32_t desc) \
4248 sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, 0, \
4249 sve_##NAME##_host, sve_##NAME##_tlb); \
4252 #define DO_LD1_2(NAME, ESZ, MSZ) \
4253 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
4254 target_ulong addr, uint32_t desc) \
4256 sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4257 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
4259 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
4260 target_ulong addr, uint32_t desc) \
4262 sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4263 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
4266 DO_LD1_1(ld1bb, 0)
4267 DO_LD1_1(ld1bhu, 1)
4268 DO_LD1_1(ld1bhs, 1)
4269 DO_LD1_1(ld1bsu, 2)
4270 DO_LD1_1(ld1bss, 2)
4271 DO_LD1_1(ld1bdu, 3)
4272 DO_LD1_1(ld1bds, 3)
4274 DO_LD1_2(ld1hh, 1, 1)
4275 DO_LD1_2(ld1hsu, 2, 1)
4276 DO_LD1_2(ld1hss, 2, 1)
4277 DO_LD1_2(ld1hdu, 3, 1)
4278 DO_LD1_2(ld1hds, 3, 1)
4280 DO_LD1_2(ld1ss, 2, 2)
4281 DO_LD1_2(ld1sdu, 3, 2)
4282 DO_LD1_2(ld1sds, 3, 2)
4284 DO_LD1_2(ld1dd, 3, 3)
4286 #undef DO_LD1_1
4287 #undef DO_LD1_2
4290 * Common helpers for all contiguous 2,3,4-register predicated loads.
4292 static void sve_ld2_r(CPUARMState *env, void *vg, target_ulong addr,
4293 uint32_t desc, int size, uintptr_t ra,
4294 sve_ld1_tlb_fn *tlb_fn)
4296 const int mmu_idx = cpu_mmu_index(env, false);
4297 intptr_t i, oprsz = simd_oprsz(desc);
4298 unsigned rd = simd_data(desc);
4299 ARMVectorReg scratch[2] = { };
4301 set_helper_retaddr(ra);
4302 for (i = 0; i < oprsz; ) {
4303 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4304 do {
4305 if (pg & 1) {
4306 tlb_fn(env, &scratch[0], i, addr, mmu_idx, ra);
4307 tlb_fn(env, &scratch[1], i, addr + size, mmu_idx, ra);
4309 i += size, pg >>= size;
4310 addr += 2 * size;
4311 } while (i & 15);
4313 set_helper_retaddr(0);
4315 /* Wait until all exceptions have been raised to write back. */
4316 memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
4317 memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
4320 static void sve_ld3_r(CPUARMState *env, void *vg, target_ulong addr,
4321 uint32_t desc, int size, uintptr_t ra,
4322 sve_ld1_tlb_fn *tlb_fn)
4324 const int mmu_idx = cpu_mmu_index(env, false);
4325 intptr_t i, oprsz = simd_oprsz(desc);
4326 unsigned rd = simd_data(desc);
4327 ARMVectorReg scratch[3] = { };
4329 set_helper_retaddr(ra);
4330 for (i = 0; i < oprsz; ) {
4331 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4332 do {
4333 if (pg & 1) {
4334 tlb_fn(env, &scratch[0], i, addr, mmu_idx, ra);
4335 tlb_fn(env, &scratch[1], i, addr + size, mmu_idx, ra);
4336 tlb_fn(env, &scratch[2], i, addr + 2 * size, mmu_idx, ra);
4338 i += size, pg >>= size;
4339 addr += 3 * size;
4340 } while (i & 15);
4342 set_helper_retaddr(0);
4344 /* Wait until all exceptions have been raised to write back. */
4345 memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
4346 memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
4347 memcpy(&env->vfp.zregs[(rd + 2) & 31], &scratch[2], oprsz);
4350 static void sve_ld4_r(CPUARMState *env, void *vg, target_ulong addr,
4351 uint32_t desc, int size, uintptr_t ra,
4352 sve_ld1_tlb_fn *tlb_fn)
4354 const int mmu_idx = cpu_mmu_index(env, false);
4355 intptr_t i, oprsz = simd_oprsz(desc);
4356 unsigned rd = simd_data(desc);
4357 ARMVectorReg scratch[4] = { };
4359 set_helper_retaddr(ra);
4360 for (i = 0; i < oprsz; ) {
4361 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4362 do {
4363 if (pg & 1) {
4364 tlb_fn(env, &scratch[0], i, addr, mmu_idx, ra);
4365 tlb_fn(env, &scratch[1], i, addr + size, mmu_idx, ra);
4366 tlb_fn(env, &scratch[2], i, addr + 2 * size, mmu_idx, ra);
4367 tlb_fn(env, &scratch[3], i, addr + 3 * size, mmu_idx, ra);
4369 i += size, pg >>= size;
4370 addr += 4 * size;
4371 } while (i & 15);
4373 set_helper_retaddr(0);
4375 /* Wait until all exceptions have been raised to write back. */
4376 memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
4377 memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
4378 memcpy(&env->vfp.zregs[(rd + 2) & 31], &scratch[2], oprsz);
4379 memcpy(&env->vfp.zregs[(rd + 3) & 31], &scratch[3], oprsz);
4382 #define DO_LDN_1(N) \
4383 void __attribute__((flatten)) HELPER(sve_ld##N##bb_r) \
4384 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4386 sve_ld##N##_r(env, vg, addr, desc, 1, GETPC(), sve_ld1bb_tlb); \
4389 #define DO_LDN_2(N, SUFF, SIZE) \
4390 void __attribute__((flatten)) HELPER(sve_ld##N##SUFF##_le_r) \
4391 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4393 sve_ld##N##_r(env, vg, addr, desc, SIZE, GETPC(), \
4394 sve_ld1##SUFF##_le_tlb); \
4396 void __attribute__((flatten)) HELPER(sve_ld##N##SUFF##_be_r) \
4397 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4399 sve_ld##N##_r(env, vg, addr, desc, SIZE, GETPC(), \
4400 sve_ld1##SUFF##_be_tlb); \
4403 DO_LDN_1(2)
4404 DO_LDN_1(3)
4405 DO_LDN_1(4)
4407 DO_LDN_2(2, hh, 2)
4408 DO_LDN_2(3, hh, 2)
4409 DO_LDN_2(4, hh, 2)
4411 DO_LDN_2(2, ss, 4)
4412 DO_LDN_2(3, ss, 4)
4413 DO_LDN_2(4, ss, 4)
4415 DO_LDN_2(2, dd, 8)
4416 DO_LDN_2(3, dd, 8)
4417 DO_LDN_2(4, dd, 8)
4419 #undef DO_LDN_1
4420 #undef DO_LDN_2
4423 * Load contiguous data, first-fault and no-fault.
4425 * For user-only, one could argue that we should hold the mmap_lock during
4426 * the operation so that there is no race between page_check_range and the
4427 * load operation. However, unmapping pages out from under a running thread
4428 * is extraordinarily unlikely. This theoretical race condition also affects
4429 * linux-user/ in its get_user/put_user macros.
4431 * TODO: Construct some helpers, written in assembly, that interact with
4432 * handle_cpu_signal to produce memory ops which can properly report errors
4433 * without racing.
4436 /* Fault on byte I. All bits in FFR from I are cleared. The vector
4437 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
4438 * option, which leaves subsequent data unchanged.
4440 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
4442 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
4444 if (i & 63) {
4445 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
4446 i = ROUND_UP(i, 64);
4448 for (; i < oprsz; i += 64) {
4449 ffr[i / 64] = 0;
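/*
 * Worked example: record_fault(env, 5, 256) keeps FFR bits 0..4
 * (ffr[0] &= MAKE_64BIT_MASK(0, 5) == 0x1f), rounds I up to 64, and
 * then clears ffr[1..3], marking every element from byte 5 onward as
 * not having been loaded.
 */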
4454 * Common helper for all contiguous first-fault loads.
4456 static void sve_ldff1_r(CPUARMState *env, void *vg, const target_ulong addr,
4457 uint32_t desc, const uintptr_t retaddr,
4458 const int esz, const int msz,
4459 sve_ld1_host_fn *host_fn,
4460 sve_ld1_tlb_fn *tlb_fn)
4462 void *vd = &env->vfp.zregs[simd_data(desc)];
4463 const int diffsz = esz - msz;
4464 const intptr_t reg_max = simd_oprsz(desc);
4465 const intptr_t mem_max = reg_max >> diffsz;
4466 const int mmu_idx = cpu_mmu_index(env, false);
4467 intptr_t split, reg_off, mem_off;
4468 void *host;
4470 /* Skip to the first active element. */
4471 reg_off = find_next_active(vg, 0, reg_max, esz);
4472 if (unlikely(reg_off == reg_max)) {
4473 /* The entire predicate was false; no load occurs. */
4474 memset(vd, 0, reg_max);
4475 return;
4477 mem_off = reg_off >> diffsz;
4478 set_helper_retaddr(retaddr);
4481 * If the (remaining) load is entirely within a single page, then:
4482  * For softmmu, if the TLB hits then no faults will occur;
4483 * For user-only, either the first load will fault or none will.
4484 * We can thus perform the load directly to the destination and
4485 * Vd will be unmodified on any exception path.
4487 split = max_for_page(addr, mem_off, mem_max);
4488 if (likely(split == mem_max)) {
4489 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4490 if (test_host_page(host)) {
4491 mem_off = host_fn(vd, vg, host - mem_off, mem_off, mem_max);
4492 tcg_debug_assert(mem_off == mem_max);
4493 set_helper_retaddr(0);
4494 /* After any fault, zero any leading inactive elements. */
4495 swap_memzero(vd, reg_off);
4496 return;
4500 #ifdef CONFIG_USER_ONLY
4502 * The page(s) containing this first element at ADDR+MEM_OFF must
4503 * be valid. Considering that this first element may be misaligned
4504 * and cross a page boundary itself, take the rest of the page from
4505 * the last byte of the element.
4507 split = max_for_page(addr, mem_off + (1 << msz) - 1, mem_max);
4508 mem_off = host_fn(vd, vg, g2h(addr), mem_off, split);
4510 /* After any fault, zero any leading inactive elements. */
4511 swap_memzero(vd, reg_off);
4512 reg_off = mem_off << diffsz;
4513 #else
4515 * Perform one normal read, which will fault or not.
4516 * But it is likely to bring the page into the tlb.
4518 tlb_fn(env, vd, reg_off, addr + mem_off, mmu_idx, retaddr);
4520 /* After any fault, zero any leading predicated false elts. */
4521 swap_memzero(vd, reg_off);
4522 mem_off += 1 << msz;
4523 reg_off += 1 << esz;
4525 /* Try again to read the balance of the page. */
4526 split = max_for_page(addr, mem_off - 1, mem_max);
4527 if (split >= (1 << msz)) {
4528 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4529 if (host) {
4530 mem_off = host_fn(vd, vg, host - mem_off, mem_off, split);
4531 reg_off = mem_off << diffsz;
4534 #endif
4536 set_helper_retaddr(0);
4537 record_fault(env, reg_off, reg_max);
4541 * Common helper for all contiguous no-fault loads.
4543 static void sve_ldnf1_r(CPUARMState *env, void *vg, const target_ulong addr,
4544 uint32_t desc, const int esz, const int msz,
4545 sve_ld1_host_fn *host_fn)
4547 void *vd = &env->vfp.zregs[simd_data(desc)];
4548 const int diffsz = esz - msz;
4549 const intptr_t reg_max = simd_oprsz(desc);
4550 const intptr_t mem_max = reg_max >> diffsz;
4551 const int mmu_idx = cpu_mmu_index(env, false);
4552 intptr_t split, reg_off, mem_off;
4553 void *host;
4555 #ifdef CONFIG_USER_ONLY
4556 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD, mmu_idx);
4557 if (likely(page_check_range(addr, mem_max, PAGE_READ) == 0)) {
4558 /* The entire operation is valid and will not fault. */
4559 host_fn(vd, vg, host, 0, mem_max);
4560 return;
4562 #endif
4564 /* There will be no fault, so we may modify in advance. */
4565 memset(vd, 0, reg_max);
4567 /* Skip to the first active element. */
4568 reg_off = find_next_active(vg, 0, reg_max, esz);
4569 if (unlikely(reg_off == reg_max)) {
4570 /* The entire predicate was false; no load occurs. */
4571 return;
4573 mem_off = reg_off >> diffsz;
4575 #ifdef CONFIG_USER_ONLY
4576 if (page_check_range(addr + mem_off, 1 << msz, PAGE_READ) == 0) {
4577 /* At least one load is valid; take the rest of the page. */
4578 split = max_for_page(addr, mem_off + (1 << msz) - 1, mem_max);
4579 mem_off = host_fn(vd, vg, host, mem_off, split);
4580 reg_off = mem_off << diffsz;
4582 #else
4584 * If the address is not in the TLB, we have no way to bring the
4585 * entry into the TLB without also risking a fault. Note that
4586 * the corollary is that we never load from an address not in RAM.
4588 * This last is out of spec, in a weird corner case.
4589 * Per the MemNF/MemSingleNF pseudocode, a NF load from Device memory
4590 * must not actually hit the bus -- it returns UNKNOWN data instead.
4591 * But if you map non-RAM with Normal memory attributes and do a NF
4592  * load then it should access the bus. (Nobody ought actually to do this
4593 * in the real world, obviously.)
4595 * Then there are the annoying special cases with watchpoints...
4597 * TODO: Add a form of tlb_fill that does not raise an exception,
4598 * with a form of tlb_vaddr_to_host and a set of loads to match.
4599 * The non_fault_vaddr_to_host would handle everything, usually,
4600 * and the loads would handle the iomem path for watchpoints.
4602 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4603 split = max_for_page(addr, mem_off, mem_max);
4604 if (host && split >= (1 << msz)) {
4605 mem_off = host_fn(vd, vg, host - mem_off, mem_off, split);
4606 reg_off = mem_off << diffsz;
4608 #endif
4610 record_fault(env, reg_off, reg_max);
4613 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
4614 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
4615 target_ulong addr, uint32_t desc) \
4617 sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, 0, \
4618 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
4620 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
4621 target_ulong addr, uint32_t desc) \
4623 sve_ldnf1_r(env, vg, addr, desc, ESZ, 0, sve_ld1##PART##_host); \
4626 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
4627 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
4628 target_ulong addr, uint32_t desc) \
4630 sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4631 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
4633 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
4634 target_ulong addr, uint32_t desc) \
4636 sve_ldnf1_r(env, vg, addr, desc, ESZ, MSZ, sve_ld1##PART##_le_host); \
4638 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
4639 target_ulong addr, uint32_t desc) \
4641 sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4642 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
4644 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
4645 target_ulong addr, uint32_t desc) \
4647 sve_ldnf1_r(env, vg, addr, desc, ESZ, MSZ, sve_ld1##PART##_be_host); \
4650 DO_LDFF1_LDNF1_1(bb, 0)
4651 DO_LDFF1_LDNF1_1(bhu, 1)
4652 DO_LDFF1_LDNF1_1(bhs, 1)
4653 DO_LDFF1_LDNF1_1(bsu, 2)
4654 DO_LDFF1_LDNF1_1(bss, 2)
4655 DO_LDFF1_LDNF1_1(bdu, 3)
4656 DO_LDFF1_LDNF1_1(bds, 3)
4658 DO_LDFF1_LDNF1_2(hh, 1, 1)
4659 DO_LDFF1_LDNF1_2(hsu, 2, 1)
4660 DO_LDFF1_LDNF1_2(hss, 2, 1)
4661 DO_LDFF1_LDNF1_2(hdu, 3, 1)
4662 DO_LDFF1_LDNF1_2(hds, 3, 1)
4664 DO_LDFF1_LDNF1_2(ss, 2, 2)
4665 DO_LDFF1_LDNF1_2(sdu, 3, 2)
4666 DO_LDFF1_LDNF1_2(sds, 3, 2)
4668 DO_LDFF1_LDNF1_2(dd, 3, 3)
4670 #undef DO_LDFF1_LDNF1_1
4671 #undef DO_LDFF1_LDNF1_2
4674 * Store contiguous data, protected by a governing predicate.
4677 #ifdef CONFIG_SOFTMMU
4678 #define DO_ST_TLB(NAME, H, TYPEM, HOST, MOEND, TLB) \
4679 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
4680 target_ulong addr, int mmu_idx, uintptr_t ra) \
4682 TCGMemOpIdx oi = make_memop_idx(ctz32(sizeof(TYPEM)) | MOEND, mmu_idx); \
4683 TLB(env, addr, *(TYPEM *)(vd + H(reg_off)), oi, ra); \
4685 #else
4686 #define DO_ST_TLB(NAME, H, TYPEM, HOST, MOEND, TLB) \
4687 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
4688 target_ulong addr, int mmu_idx, uintptr_t ra) \
4690 HOST(g2h(addr), *(TYPEM *)(vd + H(reg_off))); \
4692 #endif
4694 DO_ST_TLB(st1bb, H1, uint8_t, stb_p, 0, helper_ret_stb_mmu)
4695 DO_ST_TLB(st1bh, H1_2, uint16_t, stb_p, 0, helper_ret_stb_mmu)
4696 DO_ST_TLB(st1bs, H1_4, uint32_t, stb_p, 0, helper_ret_stb_mmu)
4697 DO_ST_TLB(st1bd, , uint64_t, stb_p, 0, helper_ret_stb_mmu)
4699 DO_ST_TLB(st1hh_le, H1_2, uint16_t, stw_le_p, MO_LE, helper_le_stw_mmu)
4700 DO_ST_TLB(st1hs_le, H1_4, uint32_t, stw_le_p, MO_LE, helper_le_stw_mmu)
4701 DO_ST_TLB(st1hd_le, , uint64_t, stw_le_p, MO_LE, helper_le_stw_mmu)
4703 DO_ST_TLB(st1ss_le, H1_4, uint32_t, stl_le_p, MO_LE, helper_le_stl_mmu)
4704 DO_ST_TLB(st1sd_le, , uint64_t, stl_le_p, MO_LE, helper_le_stl_mmu)
4706 DO_ST_TLB(st1dd_le, , uint64_t, stq_le_p, MO_LE, helper_le_stq_mmu)
4708 DO_ST_TLB(st1hh_be, H1_2, uint16_t, stw_be_p, MO_BE, helper_be_stw_mmu)
4709 DO_ST_TLB(st1hs_be, H1_4, uint32_t, stw_be_p, MO_BE, helper_be_stw_mmu)
4710 DO_ST_TLB(st1hd_be, , uint64_t, stw_be_p, MO_BE, helper_be_stw_mmu)
4712 DO_ST_TLB(st1ss_be, H1_4, uint32_t, stl_be_p, MO_BE, helper_be_stl_mmu)
4713 DO_ST_TLB(st1sd_be, , uint64_t, stl_be_p, MO_BE, helper_be_stl_mmu)
4715 DO_ST_TLB(st1dd_be, , uint64_t, stq_be_p, MO_BE, helper_be_stq_mmu)
4717 #undef DO_ST_TLB
4720 * Common helpers for all contiguous 1,2,3,4-register predicated stores.
4722 static void sve_st1_r(CPUARMState *env, void *vg, target_ulong addr,
4723 uint32_t desc, const uintptr_t ra,
4724 const int esize, const int msize,
4725 sve_st1_tlb_fn *tlb_fn)
4727 const int mmu_idx = cpu_mmu_index(env, false);
4728 intptr_t i, oprsz = simd_oprsz(desc);
4729 unsigned rd = simd_data(desc);
4730 void *vd = &env->vfp.zregs[rd];
4732 set_helper_retaddr(ra);
4733 for (i = 0; i < oprsz; ) {
4734 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4735 do {
4736 if (pg & 1) {
4737 tlb_fn(env, vd, i, addr, mmu_idx, ra);
4739 i += esize, pg >>= esize;
4740 addr += msize;
4741 } while (i & 15);
4743 set_helper_retaddr(0);
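/*
 * Example of the predicate stepping above: the governing predicate has
 * one bit per vector byte, so for 4-byte elements (esize == 4) each
 * 16-bit predicate chunk covers four elements and the loop advances
 * with i += 4, pg >>= 4, testing only bit 0 of each group.  msize may
 * be smaller than esize: st1bs stores the low byte of each 32-bit
 * element, stepping addr by 1 while i steps by 4.
 */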
4746 static void sve_st2_r(CPUARMState *env, void *vg, target_ulong addr,
4747 uint32_t desc, const uintptr_t ra,
4748 const int esize, const int msize,
4749 sve_st1_tlb_fn *tlb_fn)
4751 const int mmu_idx = cpu_mmu_index(env, false);
4752 intptr_t i, oprsz = simd_oprsz(desc);
4753 unsigned rd = simd_data(desc);
4754 void *d1 = &env->vfp.zregs[rd];
4755 void *d2 = &env->vfp.zregs[(rd + 1) & 31];
4757 set_helper_retaddr(ra);
4758 for (i = 0; i < oprsz; ) {
4759 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4760 do {
4761 if (pg & 1) {
4762 tlb_fn(env, d1, i, addr, mmu_idx, ra);
4763 tlb_fn(env, d2, i, addr + msize, mmu_idx, ra);
4765 i += esize, pg >>= esize;
4766 addr += 2 * msize;
4767 } while (i & 15);
4769 set_helper_retaddr(0);
4772 static void sve_st3_r(CPUARMState *env, void *vg, target_ulong addr,
4773 uint32_t desc, const uintptr_t ra,
4774 const int esize, const int msize,
4775 sve_st1_tlb_fn *tlb_fn)
4777 const int mmu_idx = cpu_mmu_index(env, false);
4778 intptr_t i, oprsz = simd_oprsz(desc);
4779 unsigned rd = simd_data(desc);
4780 void *d1 = &env->vfp.zregs[rd];
4781 void *d2 = &env->vfp.zregs[(rd + 1) & 31];
4782 void *d3 = &env->vfp.zregs[(rd + 2) & 31];
4784 set_helper_retaddr(ra);
4785 for (i = 0; i < oprsz; ) {
4786 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4787 do {
4788 if (pg & 1) {
4789 tlb_fn(env, d1, i, addr, mmu_idx, ra);
4790 tlb_fn(env, d2, i, addr + msize, mmu_idx, ra);
4791 tlb_fn(env, d3, i, addr + 2 * msize, mmu_idx, ra);
4793 i += esize, pg >>= esize;
4794 addr += 3 * msize;
4795 } while (i & 15);
4797 set_helper_retaddr(0);
4800 static void sve_st4_r(CPUARMState *env, void *vg, target_ulong addr,
4801 uint32_t desc, const uintptr_t ra,
4802 const int esize, const int msize,
4803 sve_st1_tlb_fn *tlb_fn)
4805 const int mmu_idx = cpu_mmu_index(env, false);
4806 intptr_t i, oprsz = simd_oprsz(desc);
4807 unsigned rd = simd_data(desc);
4808 void *d1 = &env->vfp.zregs[rd];
4809 void *d2 = &env->vfp.zregs[(rd + 1) & 31];
4810 void *d3 = &env->vfp.zregs[(rd + 2) & 31];
4811 void *d4 = &env->vfp.zregs[(rd + 3) & 31];
4813 set_helper_retaddr(ra);
4814 for (i = 0; i < oprsz; ) {
4815 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4816 do {
4817 if (pg & 1) {
4818 tlb_fn(env, d1, i, addr, mmu_idx, ra);
4819 tlb_fn(env, d2, i, addr + msize, mmu_idx, ra);
4820 tlb_fn(env, d3, i, addr + 2 * msize, mmu_idx, ra);
4821 tlb_fn(env, d4, i, addr + 3 * msize, mmu_idx, ra);
4823 i += esize, pg >>= esize;
4824 addr += 4 * msize;
4825 } while (i & 15);
4827 set_helper_retaddr(0);
4830 #define DO_STN_1(N, NAME, ESIZE) \
4831 void __attribute__((flatten)) HELPER(sve_st##N##NAME##_r) \
4832 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4834 sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, 1, \
4835 sve_st1##NAME##_tlb); \
4838 #define DO_STN_2(N, NAME, ESIZE, MSIZE) \
4839 void __attribute__((flatten)) HELPER(sve_st##N##NAME##_le_r) \
4840 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4842 sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, MSIZE, \
4843 sve_st1##NAME##_le_tlb); \
4845 void __attribute__((flatten)) HELPER(sve_st##N##NAME##_be_r) \
4846 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4848 sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, MSIZE, \
4849 sve_st1##NAME##_be_tlb); \
4852 DO_STN_1(1, bb, 1)
4853 DO_STN_1(1, bh, 2)
4854 DO_STN_1(1, bs, 4)
4855 DO_STN_1(1, bd, 8)
4856 DO_STN_1(2, bb, 1)
4857 DO_STN_1(3, bb, 1)
4858 DO_STN_1(4, bb, 1)
4860 DO_STN_2(1, hh, 2, 2)
4861 DO_STN_2(1, hs, 4, 2)
4862 DO_STN_2(1, hd, 8, 2)
4863 DO_STN_2(2, hh, 2, 2)
4864 DO_STN_2(3, hh, 2, 2)
4865 DO_STN_2(4, hh, 2, 2)
4867 DO_STN_2(1, ss, 4, 4)
4868 DO_STN_2(1, sd, 8, 4)
4869 DO_STN_2(2, ss, 4, 4)
4870 DO_STN_2(3, ss, 4, 4)
4871 DO_STN_2(4, ss, 4, 4)
4873 DO_STN_2(1, dd, 8, 8)
4874 DO_STN_2(2, dd, 8, 8)
4875 DO_STN_2(3, dd, 8, 8)
4876 DO_STN_2(4, dd, 8, 8)
4878 #undef DO_STN_1
4879 #undef DO_STN_2
4882 * Loads with a vector index.
4886 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
4888 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
4890 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
4892 return *(uint32_t *)(reg + H1_4(reg_ofs));
4895 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
4897 return *(int32_t *)(reg + H1_4(reg_ofs));
4900 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
4902 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
4905 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
4907 return (int32_t)*(uint64_t *)(reg + reg_ofs);
4910 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
4912 return *(uint64_t *)(reg + reg_ofs);
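/*
 * Example of the sign handling above: for a 32-bit offset lane holding
 * 0xfffffffc, off_zsu_s returns 0xfffffffc (zero-extended) while
 * off_zss_s returns (target_ulong)-4 (sign-extended), so the gather
 * address base + (off << scale) effectively becomes base - (4 << scale).
 */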
4915 static void sve_ld1_zs(CPUARMState *env, void *vd, void *vg, void *vm,
4916 target_ulong base, uint32_t desc, uintptr_t ra,
4917 zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn)
4919 const int mmu_idx = cpu_mmu_index(env, false);
4920 intptr_t i, oprsz = simd_oprsz(desc);
4921 unsigned scale = simd_data(desc);
4922 ARMVectorReg scratch = { };
4924 set_helper_retaddr(ra);
4925 for (i = 0; i < oprsz; ) {
4926 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4927 do {
4928 if (likely(pg & 1)) {
4929 target_ulong off = off_fn(vm, i);
4930 tlb_fn(env, &scratch, i, base + (off << scale), mmu_idx, ra);
4932 i += 4, pg >>= 4;
4933 } while (i & 15);
4935 set_helper_retaddr(0);
4937 /* Wait until all exceptions have been raised to write back. */
4938 memcpy(vd, &scratch, oprsz);
4941 static void sve_ld1_zd(CPUARMState *env, void *vd, void *vg, void *vm,
4942 target_ulong base, uint32_t desc, uintptr_t ra,
4943 zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn)
4945 const int mmu_idx = cpu_mmu_index(env, false);
4946 intptr_t i, oprsz = simd_oprsz(desc) / 8;
4947 unsigned scale = simd_data(desc);
4948 ARMVectorReg scratch = { };
4950 set_helper_retaddr(ra);
4951 for (i = 0; i < oprsz; i++) {
4952 uint8_t pg = *(uint8_t *)(vg + H1(i));
4953 if (likely(pg & 1)) {
4954 target_ulong off = off_fn(vm, i * 8);
4955 tlb_fn(env, &scratch, i * 8, base + (off << scale), mmu_idx, ra);
4958 set_helper_retaddr(0);
4960 /* Wait until all exceptions have been raised to write back. */
4961 memcpy(vd, &scratch, oprsz * 8);
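/*
 * Note on the two loops above: the 32-bit form walks the predicate in
 * 16-byte chunks, consuming 4 bits per element (i += 4, pg >>= 4),
 * while the 64-bit form tests bit 0 of one predicate byte per element.
 * Both accumulate into a zeroed scratch register so that a fault on
 * any element leaves the architectural Zd unchanged.  E.g. with
 * scale == 3 and a 64-bit offset lane holding 0x10, element 2 is
 * loaded from base + 0x80 into scratch bytes 16..23 before the final
 * memcpy writes the whole register back.
 */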
4964 #define DO_LD1_ZPZ_S(MEM, OFS) \
4965 void __attribute__((flatten)) HELPER(sve_ld##MEM##_##OFS) \
4966 (CPUARMState *env, void *vd, void *vg, void *vm, \
4967 target_ulong base, uint32_t desc) \
4969 sve_ld1_zs(env, vd, vg, vm, base, desc, GETPC(), \
4970 off_##OFS##_s, sve_ld1##MEM##_tlb); \
4973 #define DO_LD1_ZPZ_D(MEM, OFS) \
4974 void __attribute__((flatten)) HELPER(sve_ld##MEM##_##OFS) \
4975 (CPUARMState *env, void *vd, void *vg, void *vm, \
4976 target_ulong base, uint32_t desc) \
4978 sve_ld1_zd(env, vd, vg, vm, base, desc, GETPC(), \
4979 off_##OFS##_d, sve_ld1##MEM##_tlb); \
4982 DO_LD1_ZPZ_S(bsu, zsu)
4983 DO_LD1_ZPZ_S(bsu, zss)
4984 DO_LD1_ZPZ_D(bdu, zsu)
4985 DO_LD1_ZPZ_D(bdu, zss)
4986 DO_LD1_ZPZ_D(bdu, zd)
4988 DO_LD1_ZPZ_S(bss, zsu)
4989 DO_LD1_ZPZ_S(bss, zss)
4990 DO_LD1_ZPZ_D(bds, zsu)
4991 DO_LD1_ZPZ_D(bds, zss)
4992 DO_LD1_ZPZ_D(bds, zd)
4994 DO_LD1_ZPZ_S(hsu_le, zsu)
4995 DO_LD1_ZPZ_S(hsu_le, zss)
4996 DO_LD1_ZPZ_D(hdu_le, zsu)
4997 DO_LD1_ZPZ_D(hdu_le, zss)
4998 DO_LD1_ZPZ_D(hdu_le, zd)
5000 DO_LD1_ZPZ_S(hsu_be, zsu)
5001 DO_LD1_ZPZ_S(hsu_be, zss)
5002 DO_LD1_ZPZ_D(hdu_be, zsu)
5003 DO_LD1_ZPZ_D(hdu_be, zss)
5004 DO_LD1_ZPZ_D(hdu_be, zd)
5006 DO_LD1_ZPZ_S(hss_le, zsu)
5007 DO_LD1_ZPZ_S(hss_le, zss)
5008 DO_LD1_ZPZ_D(hds_le, zsu)
5009 DO_LD1_ZPZ_D(hds_le, zss)
5010 DO_LD1_ZPZ_D(hds_le, zd)
5012 DO_LD1_ZPZ_S(hss_be, zsu)
5013 DO_LD1_ZPZ_S(hss_be, zss)
5014 DO_LD1_ZPZ_D(hds_be, zsu)
5015 DO_LD1_ZPZ_D(hds_be, zss)
5016 DO_LD1_ZPZ_D(hds_be, zd)
5018 DO_LD1_ZPZ_S(ss_le, zsu)
5019 DO_LD1_ZPZ_S(ss_le, zss)
5020 DO_LD1_ZPZ_D(sdu_le, zsu)
5021 DO_LD1_ZPZ_D(sdu_le, zss)
5022 DO_LD1_ZPZ_D(sdu_le, zd)
5024 DO_LD1_ZPZ_S(ss_be, zsu)
5025 DO_LD1_ZPZ_S(ss_be, zss)
5026 DO_LD1_ZPZ_D(sdu_be, zsu)
5027 DO_LD1_ZPZ_D(sdu_be, zss)
5028 DO_LD1_ZPZ_D(sdu_be, zd)
5030 DO_LD1_ZPZ_D(sds_le, zsu)
5031 DO_LD1_ZPZ_D(sds_le, zss)
5032 DO_LD1_ZPZ_D(sds_le, zd)
5034 DO_LD1_ZPZ_D(sds_be, zsu)
5035 DO_LD1_ZPZ_D(sds_be, zss)
5036 DO_LD1_ZPZ_D(sds_be, zd)
5038 DO_LD1_ZPZ_D(dd_le, zsu)
5039 DO_LD1_ZPZ_D(dd_le, zss)
5040 DO_LD1_ZPZ_D(dd_le, zd)
5042 DO_LD1_ZPZ_D(dd_be, zsu)
5043 DO_LD1_ZPZ_D(dd_be, zss)
5044 DO_LD1_ZPZ_D(dd_be, zd)
5046 #undef DO_LD1_ZPZ_S
5047 #undef DO_LD1_ZPZ_D
5049 /* First fault loads with a vector index. */
5051 #ifdef CONFIG_USER_ONLY
5053 #define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H) \
5054 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
5055 target_ulong base, uint32_t desc) \
5057 intptr_t i, oprsz = simd_oprsz(desc); \
5058 unsigned scale = simd_data(desc); \
5059 uintptr_t ra = GETPC(); \
5060 bool first = true; \
5061 mmap_lock(); \
5062 for (i = 0; i < oprsz; ) { \
5063 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
5064 do { \
5065 TYPEM m = 0; \
5066 if (pg & 1) { \
5067 target_ulong off = *(TYPEI *)(vm + H(i)); \
5068 target_ulong addr = base + (off << scale); \
5069 if (!first && \
5070 page_check_range(addr, sizeof(TYPEM), PAGE_READ)) { \
5071 record_fault(env, i, oprsz); \
5072 goto exit; \
5074 m = FN(env, addr, ra); \
5075 first = false; \
5077 *(TYPEE *)(vd + H(i)) = m; \
5078 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
5079 } while (i & 15); \
5081 exit: \
5082 mmap_unlock(); \
5085 #else
5087 #define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H) \
5088 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
5089 target_ulong base, uint32_t desc) \
5091 g_assert_not_reached(); \
5094 #endif
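/*
 * Sketch of the first-fault contract implemented by the user-only
 * macro above: the first active element is always loaded and may fault
 * normally; for each later active element page_check_range() is
 * consulted first, and if the page is not readable record_fault()
 * trims the FFR from that element onward and the helper returns with
 * the remaining lanes untouched (the MERGE choice described earlier).
 */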
5096 #define DO_LDFF1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
5097 DO_LDFF1_ZPZ(NAME, uint32_t, TYPEI, TYPEM, FN, H1_4)
5098 #define DO_LDFF1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
5099 DO_LDFF1_ZPZ(NAME, uint64_t, TYPEI, TYPEM, FN, )
5101 DO_LDFF1_ZPZ_S(sve_ldffbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
5102 DO_LDFF1_ZPZ_S(sve_ldffhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
5103 DO_LDFF1_ZPZ_S(sve_ldffssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
5104 DO_LDFF1_ZPZ_S(sve_ldffbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
5105 DO_LDFF1_ZPZ_S(sve_ldffhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
5107 DO_LDFF1_ZPZ_S(sve_ldffbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
5108 DO_LDFF1_ZPZ_S(sve_ldffhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
5109 DO_LDFF1_ZPZ_S(sve_ldffssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
5110 DO_LDFF1_ZPZ_S(sve_ldffbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
5111 DO_LDFF1_ZPZ_S(sve_ldffhss_zss, int32_t, int16_t, cpu_lduw_data_ra)
5113 DO_LDFF1_ZPZ_D(sve_ldffbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
5114 DO_LDFF1_ZPZ_D(sve_ldffhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
5115 DO_LDFF1_ZPZ_D(sve_ldffsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
5116 DO_LDFF1_ZPZ_D(sve_ldffddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
5117 DO_LDFF1_ZPZ_D(sve_ldffbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
5118 DO_LDFF1_ZPZ_D(sve_ldffhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
5119 DO_LDFF1_ZPZ_D(sve_ldffsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)
5121 DO_LDFF1_ZPZ_D(sve_ldffbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
5122 DO_LDFF1_ZPZ_D(sve_ldffhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
5123 DO_LDFF1_ZPZ_D(sve_ldffsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
5124 DO_LDFF1_ZPZ_D(sve_ldffddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
5125 DO_LDFF1_ZPZ_D(sve_ldffbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
5126 DO_LDFF1_ZPZ_D(sve_ldffhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
5127 DO_LDFF1_ZPZ_D(sve_ldffsds_zss, int32_t, int32_t, cpu_ldl_data_ra)
5129 DO_LDFF1_ZPZ_D(sve_ldffbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
5130 DO_LDFF1_ZPZ_D(sve_ldffhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
5131 DO_LDFF1_ZPZ_D(sve_ldffsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
5132 DO_LDFF1_ZPZ_D(sve_ldffddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
5133 DO_LDFF1_ZPZ_D(sve_ldffbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
5134 DO_LDFF1_ZPZ_D(sve_ldffhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
5135 DO_LDFF1_ZPZ_D(sve_ldffsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
5137 /* Stores with a vector index. */
5139 static void sve_st1_zs(CPUARMState *env, void *vd, void *vg, void *vm,
5140 target_ulong base, uint32_t desc, uintptr_t ra,
5141 zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn)
5143 const int mmu_idx = cpu_mmu_index(env, false);
5144 intptr_t i, oprsz = simd_oprsz(desc);
5145 unsigned scale = simd_data(desc);
5147 set_helper_retaddr(ra);
5148 for (i = 0; i < oprsz; ) {
5149 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
5150 do {
5151 if (likely(pg & 1)) {
5152 target_ulong off = off_fn(vm, i);
5153 tlb_fn(env, vd, i, base + (off << scale), mmu_idx, ra);
5155 i += 4, pg >>= 4;
5156 } while (i & 15);
5158 set_helper_retaddr(0);
5161 static void sve_st1_zd(CPUARMState *env, void *vd, void *vg, void *vm,
5162 target_ulong base, uint32_t desc, uintptr_t ra,
5163 zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn)
5165 const int mmu_idx = cpu_mmu_index(env, false);
5166 intptr_t i, oprsz = simd_oprsz(desc) / 8;
5167 unsigned scale = simd_data(desc);
5169 set_helper_retaddr(ra);
5170 for (i = 0; i < oprsz; i++) {
5171 uint8_t pg = *(uint8_t *)(vg + H1(i));
5172 if (likely(pg & 1)) {
5173 target_ulong off = off_fn(vm, i * 8);
5174 tlb_fn(env, vd, i * 8, base + (off << scale), mmu_idx, ra);
5177 set_helper_retaddr(0);
5180 #define DO_ST1_ZPZ_S(MEM, OFS) \
5181 void __attribute__((flatten)) HELPER(sve_st##MEM##_##OFS) \
5182 (CPUARMState *env, void *vd, void *vg, void *vm, \
5183 target_ulong base, uint32_t desc) \
5185 sve_st1_zs(env, vd, vg, vm, base, desc, GETPC(), \
5186 off_##OFS##_s, sve_st1##MEM##_tlb); \
5189 #define DO_ST1_ZPZ_D(MEM, OFS) \
5190 void __attribute__((flatten)) HELPER(sve_st##MEM##_##OFS) \
5191 (CPUARMState *env, void *vd, void *vg, void *vm, \
5192 target_ulong base, uint32_t desc) \
5194 sve_st1_zd(env, vd, vg, vm, base, desc, GETPC(), \
5195 off_##OFS##_d, sve_st1##MEM##_tlb); \
5198 DO_ST1_ZPZ_S(bs, zsu)
5199 DO_ST1_ZPZ_S(hs_le, zsu)
5200 DO_ST1_ZPZ_S(hs_be, zsu)
5201 DO_ST1_ZPZ_S(ss_le, zsu)
5202 DO_ST1_ZPZ_S(ss_be, zsu)
5204 DO_ST1_ZPZ_S(bs, zss)
5205 DO_ST1_ZPZ_S(hs_le, zss)
5206 DO_ST1_ZPZ_S(hs_be, zss)
5207 DO_ST1_ZPZ_S(ss_le, zss)
5208 DO_ST1_ZPZ_S(ss_be, zss)
5210 DO_ST1_ZPZ_D(bd, zsu)
5211 DO_ST1_ZPZ_D(hd_le, zsu)
5212 DO_ST1_ZPZ_D(hd_be, zsu)
5213 DO_ST1_ZPZ_D(sd_le, zsu)
5214 DO_ST1_ZPZ_D(sd_be, zsu)
5215 DO_ST1_ZPZ_D(dd_le, zsu)
5216 DO_ST1_ZPZ_D(dd_be, zsu)
5218 DO_ST1_ZPZ_D(bd, zss)
5219 DO_ST1_ZPZ_D(hd_le, zss)
5220 DO_ST1_ZPZ_D(hd_be, zss)
5221 DO_ST1_ZPZ_D(sd_le, zss)
5222 DO_ST1_ZPZ_D(sd_be, zss)
5223 DO_ST1_ZPZ_D(dd_le, zss)
5224 DO_ST1_ZPZ_D(dd_be, zss)
5226 DO_ST1_ZPZ_D(bd, zd)
5227 DO_ST1_ZPZ_D(hd_le, zd)
5228 DO_ST1_ZPZ_D(hd_be, zd)
5229 DO_ST1_ZPZ_D(sd_le, zd)
5230 DO_ST1_ZPZ_D(sd_be, zd)
5231 DO_ST1_ZPZ_D(dd_le, zd)
5232 DO_ST1_ZPZ_D(dd_be, zd)
5234 #undef DO_ST1_ZPZ_S
5235 #undef DO_ST1_ZPZ_D