qemu/ar7.git: target/arm/sve_helper.c
1 /*
2 * ARM SVE Operations
4 * Copyright (c) 2018 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/cpu_ldst.h"
25 #include "exec/helper-proto.h"
26 #include "tcg/tcg-gvec-desc.h"
27 #include "fpu/softfloat.h"
28 #include "tcg/tcg.h"
31 /* Note that vector data is stored in host-endian 64-bit chunks,
32 so addressing units smaller than that needs a host-endian fixup. */
33 #ifdef HOST_WORDS_BIGENDIAN
34 #define H1(x) ((x) ^ 7)
35 #define H1_2(x) ((x) ^ 6)
36 #define H1_4(x) ((x) ^ 4)
37 #define H2(x) ((x) ^ 3)
38 #define H4(x) ((x) ^ 1)
39 #else
40 #define H1(x) (x)
41 #define H1_2(x) (x)
42 #define H1_4(x) (x)
43 #define H2(x) (x)
44 #define H4(x) (x)
45 #endif
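/*
 * Illustrative note (editorial addition, not in the upstream file): on a
 * big-endian host these macros redirect accesses into the host-endian
 * 64-bit chunks.  E.g. the byte element at logical offset 0 is read from
 * host byte offset H1(0) == 7, a 16-bit element at byte offset 0 from
 * H1_2(0) == 6, and a 32-bit element from H1_4(0) == 4.  H2 and H4 adjust
 * 16-bit and 32-bit element indices rather than byte offsets.  On
 * little-endian hosts all of them are the identity.
 */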
47 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
49 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
50 * and bit 0 set if C is set. Compare the definitions of these variables
51 * within CPUARMState.
54 /* For no G bits set, NZCV = C. */
55 #define PREDTEST_INIT 1
57 /* This is an iterative function, called for each Pd and Pg word
58 * moving forward.
60 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
62 if (likely(g)) {
63 /* Compute N from first D & G.
64 Use bit 2 to signal first G bit seen. */
65 if (!(flags & 4)) {
66 flags |= ((d & (g & -g)) != 0) << 31;
67 flags |= 4;
70 /* Accumulate Z from each D & G. */
71 flags |= ((d & g) != 0) << 1;
73 /* Compute C from last !(D & G). Replace previous. */
74 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
76 return flags;
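/*
 * Worked example (editorial addition, not in the upstream file): for a
 * single word with g = 0x0f and d = 0x01,
 *   iter_predtest_fwd(0x01, 0x0f, PREDTEST_INIT) == 0x80000007
 * i.e. bit 31 (N: the first active element is set), bit 1 (Z clear: some
 * active element is set), bit 0 (C: the last active element is clear),
 * plus the internal bookkeeping bit 2 ("first G bit seen").
 */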
79 /* This is an iterative function, called for each Pd and Pg word
80 * moving backward.
82 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
84 if (likely(g)) {
85 /* Compute C from first (i.e last) !(D & G).
86 Use bit 2 to signal first G bit seen. */
87 if (!(flags & 4)) {
88 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
89 flags |= (d & pow2floor(g)) == 0;
92 /* Accumulate Z from each D & G. */
93 flags |= ((d & g) != 0) << 1;
95 /* Compute N from last (i.e first) D & G. Replace previous. */
96 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
98 return flags;
101 /* The same for a single word predicate. */
102 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
104 return iter_predtest_fwd(d, g, PREDTEST_INIT);
107 /* The same for a multi-word predicate. */
108 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
110 uint32_t flags = PREDTEST_INIT;
111 uint64_t *d = vd, *g = vg;
112 uintptr_t i = 0;
114 do {
115 flags = iter_predtest_fwd(d[i], g[i], flags);
116 } while (++i < words);
118 return flags;
121 /* Expand active predicate bits to bytes, for byte elements.
122 * for (i = 0; i < 256; ++i) {
123 * unsigned long m = 0;
124 * for (j = 0; j < 8; j++) {
125 * if ((i >> j) & 1) {
126 * m |= 0xfful << (j << 3);
129 * printf("0x%016lx,\n", m);
132 static inline uint64_t expand_pred_b(uint8_t byte)
134 static const uint64_t word[256] = {
135 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
136 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
137 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
138 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
139 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
140 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
141 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
142 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
143 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
144 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
145 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
146 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
147 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
148 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
149 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
150 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
151 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
152 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
153 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
154 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
155 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
156 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
157 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
158 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
159 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
160 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
161 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
162 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
163 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
164 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
165 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
166 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
167 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
168 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
169 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
170 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
171 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
172 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
173 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
174 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
175 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
176 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
177 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
178 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
179 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
180 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
181 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
182 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
183 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
184 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
185 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
186 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
187 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
188 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
189 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
190 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
191 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
192 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
193 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
194 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
195 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
196 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
197 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
198 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
199 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
200 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
201 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
202 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
203 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
204 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
205 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
206 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
207 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
208 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
209 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
210 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
211 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
212 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
213 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
214 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
215 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
216 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
217 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
218 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
219 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
220 0xffffffffffffffff,
222 return word[byte];
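/*
 * Example (editorial addition): expand_pred_b(0x05) == 0x0000000000ff00ff,
 * i.e. predicate bits 0 and 2 select byte elements 0 and 2 of the 64-bit
 * chunk.
 */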
225 /* Similarly for half-word elements.
226 * for (i = 0; i < 256; ++i) {
227 * unsigned long m = 0;
228 * if (i & 0xaa) {
229 * continue;
231 * for (j = 0; j < 8; j += 2) {
232 * if ((i >> j) & 1) {
233 * m |= 0xfffful << (j << 3);
236 * printf("[0x%x] = 0x%016lx,\n", i, m);
239 static inline uint64_t expand_pred_h(uint8_t byte)
241 static const uint64_t word[] = {
242 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
243 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
244 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
245 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
246 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
247 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
248 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
249 [0x55] = 0xffffffffffffffff,
251 return word[byte & 0x55];
254 /* Similarly for single word elements. */
255 static inline uint64_t expand_pred_s(uint8_t byte)
257 static const uint64_t word[] = {
258 [0x01] = 0x00000000ffffffffull,
259 [0x10] = 0xffffffff00000000ull,
260 [0x11] = 0xffffffffffffffffull,
262 return word[byte & 0x11];
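/*
 * Examples (editorial addition): only every second predicate bit is
 * significant for halfword elements and every fourth for word elements,
 * hence the "& 0x55" and "& 0x11" above.  expand_pred_h(0x05) ==
 * 0x00000000ffffffff (halfword elements 0 and 1 active) and
 * expand_pred_s(0x11) == 0xffffffffffffffff (both word elements active).
 */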
265 /* Swap 16-bit words within a 32-bit word. */
266 static inline uint32_t hswap32(uint32_t h)
268 return rol32(h, 16);
271 /* Swap 16-bit words within a 64-bit word. */
272 static inline uint64_t hswap64(uint64_t h)
274 uint64_t m = 0x0000ffff0000ffffull;
275 h = rol64(h, 32);
276 return ((h & m) << 16) | ((h >> 16) & m);
279 /* Swap 32-bit words within a 64-bit word. */
280 static inline uint64_t wswap64(uint64_t h)
282 return rol64(h, 32);
285 #define LOGICAL_PPPP(NAME, FUNC) \
286 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
288 uintptr_t opr_sz = simd_oprsz(desc); \
289 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
290 uintptr_t i; \
291 for (i = 0; i < opr_sz / 8; ++i) { \
292 d[i] = FUNC(n[i], m[i], g[i]); \
296 #define DO_AND(N, M, G) (((N) & (M)) & (G))
297 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
298 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
299 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
300 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
301 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
302 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
303 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
305 LOGICAL_PPPP(sve_and_pppp, DO_AND)
306 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
307 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
308 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
309 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
310 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
311 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
312 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
314 #undef DO_AND
315 #undef DO_BIC
316 #undef DO_EOR
317 #undef DO_ORR
318 #undef DO_ORN
319 #undef DO_NOR
320 #undef DO_NAND
321 #undef DO_SEL
322 #undef LOGICAL_PPPP
324 /* Fully general three-operand expander, controlled by a predicate.
325 * This is complicated by the host-endian storage of the register file.
327 /* ??? I don't expect the compiler could ever vectorize this itself.
328 * With some tables we can convert bit masks to byte masks, and with
329 * extra care wrt byte/word ordering we could use gcc generic vectors
330 * and do 16 bytes at a time.
332 #define DO_ZPZZ(NAME, TYPE, H, OP) \
333 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
335 intptr_t i, opr_sz = simd_oprsz(desc); \
336 for (i = 0; i < opr_sz; ) { \
337 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
338 do { \
339 if (pg & 1) { \
340 TYPE nn = *(TYPE *)(vn + H(i)); \
341 TYPE mm = *(TYPE *)(vm + H(i)); \
342 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
344 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
345 } while (i & 15); \
349 /* Similarly, specialized for 64-bit operands. */
350 #define DO_ZPZZ_D(NAME, TYPE, OP) \
351 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
353 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
354 TYPE *d = vd, *n = vn, *m = vm; \
355 uint8_t *pg = vg; \
356 for (i = 0; i < opr_sz; i += 1) { \
357 if (pg[H1(i)] & 1) { \
358 TYPE nn = n[i], mm = m[i]; \
359 d[i] = OP(nn, mm); \
364 #define DO_AND(N, M) (N & M)
365 #define DO_EOR(N, M) (N ^ M)
366 #define DO_ORR(N, M) (N | M)
367 #define DO_BIC(N, M) (N & ~M)
368 #define DO_ADD(N, M) (N + M)
369 #define DO_SUB(N, M) (N - M)
370 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
371 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
372 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
373 #define DO_MUL(N, M) (N * M)
377 * We must avoid the C undefined behaviour cases: division by
378 * zero and signed division of INT_MIN by -1. Both of these
379 * have architecturally defined required results for Arm.
380 * We special case all signed divisions by -1 to avoid having
381 * to deduce the minimum integer for the type involved.
383 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
384 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
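/*
 * Example (editorial addition): both DO_SDIV(n, 0) and DO_UDIV(n, 0)
 * evaluate to 0, the Arm-architected result for division by zero, and
 * DO_SDIV(n, -1) is computed as plain negation, which for the most
 * negative value wraps back to itself, matching the architected result.
 */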
386 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
387 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
388 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
389 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
391 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
392 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
393 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
394 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
396 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
397 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
398 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
399 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
401 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
402 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
403 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
404 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
406 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
407 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
408 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
409 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
411 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
412 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
413 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
414 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
416 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
417 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
418 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
419 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
421 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
422 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
423 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
424 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
426 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
427 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
428 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
429 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
431 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
432 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
433 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
434 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
436 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
437 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
438 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
439 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
441 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
442 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
443 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
444 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
446 /* Because the computation type is at least twice as large as required,
447 these work for both signed and unsigned source types. */
448 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
450 return (n * m) >> 8;
453 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
455 return (n * m) >> 16;
458 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
460 return (n * m) >> 32;
463 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
465 uint64_t lo, hi;
466 muls64(&lo, &hi, n, m);
467 return hi;
470 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
472 uint64_t lo, hi;
473 mulu64(&lo, &hi, n, m);
474 return hi;
477 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
478 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
479 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
480 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
482 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
483 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
484 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
485 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
487 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
488 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
489 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
490 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
492 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
493 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
495 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
496 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
498 /* Note that all bits of the shift are significant
499 and not modulo the element size. */
500 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
501 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
502 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
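/*
 * Example (editorial addition): because the full shift amount is used,
 * DO_LSR applied to a uint8_t with M == 8 yields 0 and DO_ASR clamps the
 * count to 7 (pure sign replication), whereas a modulo-8 shifter would
 * have treated M == 8 as a shift by 0.
 */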
504 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
 505 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
 506 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
 508 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
 509 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
 510 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
 512 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
 513 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
514 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
516 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
517 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
518 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
520 #undef DO_ZPZZ
521 #undef DO_ZPZZ_D
523 /* Three-operand expander, controlled by a predicate, in which the
524 * third operand is "wide". That is, for D = N op M, the same 64-bit
525 * value of M is used with all of the narrower values of N.
527 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
528 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
530 intptr_t i, opr_sz = simd_oprsz(desc); \
531 for (i = 0; i < opr_sz; ) { \
532 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
533 TYPEW mm = *(TYPEW *)(vm + i); \
534 do { \
535 if (pg & 1) { \
536 TYPE nn = *(TYPE *)(vn + H(i)); \
537 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
539 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
540 } while (i & 7); \
544 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
545 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
546 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
548 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
549 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
550 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
552 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
553 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
554 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
556 #undef DO_ZPZW
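/*
 * Example (editorial addition): in sve_lsr_zpzw_b each group of eight byte
 * elements shares one 64-bit shift count, so bytes 0-7 are all shifted by
 * the count in m[0], bytes 8-15 by the count in m[1], and so on.
 */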
558 /* Fully general two-operand expander, controlled by a predicate.
560 #define DO_ZPZ(NAME, TYPE, H, OP) \
561 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
563 intptr_t i, opr_sz = simd_oprsz(desc); \
564 for (i = 0; i < opr_sz; ) { \
565 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
566 do { \
567 if (pg & 1) { \
568 TYPE nn = *(TYPE *)(vn + H(i)); \
569 *(TYPE *)(vd + H(i)) = OP(nn); \
571 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
572 } while (i & 15); \
576 /* Similarly, specialized for 64-bit operands. */
577 #define DO_ZPZ_D(NAME, TYPE, OP) \
578 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
580 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
581 TYPE *d = vd, *n = vn; \
582 uint8_t *pg = vg; \
583 for (i = 0; i < opr_sz; i += 1) { \
584 if (pg[H1(i)] & 1) { \
585 TYPE nn = n[i]; \
586 d[i] = OP(nn); \
591 #define DO_CLS_B(N) (clrsb32(N) - 24)
592 #define DO_CLS_H(N) (clrsb32(N) - 16)
594 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
595 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
596 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
597 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
599 #define DO_CLZ_B(N) (clz32(N) - 24)
600 #define DO_CLZ_H(N) (clz32(N) - 16)
602 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
603 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
604 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
605 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
607 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
608 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
609 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
610 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
612 #define DO_CNOT(N) (N == 0)
614 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
615 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
616 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
617 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
619 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
621 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
622 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
623 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
625 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
627 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
628 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
629 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
631 #define DO_NOT(N) (~N)
633 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
634 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
635 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
636 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
638 #define DO_SXTB(N) ((int8_t)N)
639 #define DO_SXTH(N) ((int16_t)N)
640 #define DO_SXTS(N) ((int32_t)N)
641 #define DO_UXTB(N) ((uint8_t)N)
642 #define DO_UXTH(N) ((uint16_t)N)
643 #define DO_UXTS(N) ((uint32_t)N)
645 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
646 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
647 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
648 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
649 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
650 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
652 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
653 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
654 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
655 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
656 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
657 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
659 #define DO_ABS(N) (N < 0 ? -N : N)
661 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
662 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
663 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
664 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
666 #define DO_NEG(N) (-N)
668 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
669 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
670 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
671 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
673 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
674 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
675 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
677 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
678 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
680 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
682 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
683 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
684 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
685 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
687 /* Three-operand expander, unpredicated, in which the third operand is "wide".
689 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
690 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
692 intptr_t i, opr_sz = simd_oprsz(desc); \
693 for (i = 0; i < opr_sz; ) { \
694 TYPEW mm = *(TYPEW *)(vm + i); \
695 do { \
696 TYPE nn = *(TYPE *)(vn + H(i)); \
697 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
698 i += sizeof(TYPE); \
699 } while (i & 7); \
703 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
704 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
705 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
707 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
708 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
709 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
711 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
712 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
713 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
715 #undef DO_ZZW
717 #undef DO_CLS_B
718 #undef DO_CLS_H
719 #undef DO_CLZ_B
720 #undef DO_CLZ_H
721 #undef DO_CNOT
722 #undef DO_FABS
723 #undef DO_FNEG
724 #undef DO_ABS
725 #undef DO_NEG
726 #undef DO_ZPZ
727 #undef DO_ZPZ_D
729 /* Two-operand reduction expander, controlled by a predicate.
730 * The difference between TYPERED and TYPERET has to do with
731 * sign-extension. E.g. for SMAX, TYPERED must be signed,
732 * but TYPERET must be unsigned so that e.g. a 32-bit value
733 * is not sign-extended to the ABI uint64_t return type.
735 /* ??? If we were to vectorize this by hand the reduction ordering
736 * would change. For integer operands, this is perfectly fine.
738 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
739 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
741 intptr_t i, opr_sz = simd_oprsz(desc); \
742 TYPERED ret = INIT; \
743 for (i = 0; i < opr_sz; ) { \
744 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
745 do { \
746 if (pg & 1) { \
747 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
748 ret = OP(ret, nn); \
750 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
751 } while (i & 15); \
753 return (TYPERET)ret; \
756 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
757 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
759 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
760 TYPEE *n = vn; \
761 uint8_t *pg = vg; \
762 TYPER ret = INIT; \
763 for (i = 0; i < opr_sz; i += 1) { \
764 if (pg[H1(i)] & 1) { \
765 TYPEE nn = n[i]; \
766 ret = OP(ret, nn); \
769 return ret; \
772 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
773 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
774 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
775 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
777 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
778 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
779 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
780 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
782 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
783 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
784 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
785 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
787 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
788 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
789 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
791 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
792 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
793 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
794 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
796 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
797 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
798 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
799 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
801 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
802 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
803 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
804 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
806 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
807 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
808 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
809 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
811 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
812 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
813 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
814 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
816 #undef DO_VPZ
817 #undef DO_VPZ_D
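/*
 * Example (editorial addition): with no active elements, sve_smaxv_s
 * leaves ret == INT32_MIN; the (TYPERET) cast delivers it to the uint64_t
 * ABI return slot as 0x0000000080000000 rather than a sign-extended
 * 0xffffffff80000000, which is why TYPERET is unsigned.
 */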
819 /* Two vector operand, one scalar operand, unpredicated. */
820 #define DO_ZZI(NAME, TYPE, OP) \
821 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
823 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
824 TYPE s = s64, *d = vd, *n = vn; \
825 for (i = 0; i < opr_sz; ++i) { \
826 d[i] = OP(n[i], s); \
830 #define DO_SUBR(X, Y) (Y - X)
832 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
833 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
834 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
835 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
837 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
838 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
839 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
840 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
842 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
843 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
844 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
845 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
847 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
848 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
849 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
850 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
852 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
853 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
854 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
855 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
857 #undef DO_ZZI
859 #undef DO_AND
860 #undef DO_ORR
861 #undef DO_EOR
862 #undef DO_BIC
863 #undef DO_ADD
864 #undef DO_SUB
865 #undef DO_MAX
866 #undef DO_MIN
867 #undef DO_ABD
868 #undef DO_MUL
869 #undef DO_DIV
870 #undef DO_ASR
871 #undef DO_LSR
872 #undef DO_LSL
873 #undef DO_SUBR
875 /* Similar to the ARM LastActiveElement pseudocode function, except the
876 result is multiplied by the element size. This includes the not found
877 indication; e.g. not found for esz=3 is -8. */
878 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
880 uint64_t mask = pred_esz_masks[esz];
881 intptr_t i = words;
883 do {
884 uint64_t this_g = g[--i] & mask;
885 if (this_g) {
886 return i * 64 + (63 - clz64(this_g));
888 } while (i > 0);
889 return (intptr_t)-1 << esz;
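/*
 * Example (editorial addition): for esz == 2 (word elements) the predicate
 * bit for element j is bit 4*j, so a single word with only bit 4 set makes
 * last_active_element() return 4, i.e. element 1 scaled by the 4-byte
 * element size; with no active bits set it returns -4.
 */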
892 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
894 uint32_t flags = PREDTEST_INIT;
895 uint64_t *d = vd, *g = vg;
896 intptr_t i = 0;
898 do {
899 uint64_t this_d = d[i];
900 uint64_t this_g = g[i];
902 if (this_g) {
903 if (!(flags & 4)) {
904 /* Set in D the first bit of G. */
905 this_d |= this_g & -this_g;
906 d[i] = this_d;
908 flags = iter_predtest_fwd(this_d, this_g, flags);
910 } while (++i < words);
912 return flags;
915 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
917 intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
918 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
919 uint32_t flags = PREDTEST_INIT;
920 uint64_t *d = vd, *g = vg, esz_mask;
921 intptr_t i, next;
923 next = last_active_element(vd, words, esz) + (1 << esz);
924 esz_mask = pred_esz_masks[esz];
926 /* Similar to the pseudocode for pnext, but scaled by ESZ
927 so that we find the correct bit. */
928 if (next < words * 64) {
929 uint64_t mask = -1;
931 if (next & 63) {
932 mask = ~((1ull << (next & 63)) - 1);
933 next &= -64;
935 do {
936 uint64_t this_g = g[next / 64] & esz_mask & mask;
937 if (this_g != 0) {
938 next = (next & -64) + ctz64(this_g);
939 break;
941 next += 64;
942 mask = -1;
943 } while (next < words * 64);
946 i = 0;
947 do {
948 uint64_t this_d = 0;
949 if (i == next / 64) {
950 this_d = 1ull << (next & 63);
952 d[i] = this_d;
953 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
954 } while (++i < words);
956 return flags;
960 * Copy Zn into Zd, and store zero into inactive elements.
961 * If inv, store zeros into the active elements.
963 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
965 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
966 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
967 uint64_t *d = vd, *n = vn;
968 uint8_t *pg = vg;
970 for (i = 0; i < opr_sz; i += 1) {
971 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
975 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
977 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
978 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
979 uint64_t *d = vd, *n = vn;
980 uint8_t *pg = vg;
982 for (i = 0; i < opr_sz; i += 1) {
983 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
987 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
989 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
990 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
991 uint64_t *d = vd, *n = vn;
992 uint8_t *pg = vg;
994 for (i = 0; i < opr_sz; i += 1) {
995 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
999 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1001 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1002 uint64_t *d = vd, *n = vn;
1003 uint8_t *pg = vg;
1004 uint8_t inv = simd_data(desc);
1006 for (i = 0; i < opr_sz; i += 1) {
1007 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
1011 /* Three-operand expander, immediate operand, controlled by a predicate.
1013 #define DO_ZPZI(NAME, TYPE, H, OP) \
1014 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1016 intptr_t i, opr_sz = simd_oprsz(desc); \
1017 TYPE imm = simd_data(desc); \
1018 for (i = 0; i < opr_sz; ) { \
1019 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1020 do { \
1021 if (pg & 1) { \
1022 TYPE nn = *(TYPE *)(vn + H(i)); \
1023 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1025 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1026 } while (i & 15); \
1030 /* Similarly, specialized for 64-bit operands. */
1031 #define DO_ZPZI_D(NAME, TYPE, OP) \
1032 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1034 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1035 TYPE *d = vd, *n = vn; \
1036 TYPE imm = simd_data(desc); \
1037 uint8_t *pg = vg; \
1038 for (i = 0; i < opr_sz; i += 1) { \
1039 if (pg[H1(i)] & 1) { \
1040 TYPE nn = n[i]; \
1041 d[i] = OP(nn, imm); \
1046 #define DO_SHR(N, M) (N >> M)
1047 #define DO_SHL(N, M) (N << M)
1049 /* Arithmetic shift right for division. This rounds negative numbers
1050 toward zero as per signed division. Therefore before shifting,
1051 when N is negative, add 2**M-1. */
1052 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
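/*
 * Worked example (editorial addition): DO_ASRD(-7, 2) first adds
 * (1 << 2) - 1 == 3, giving -4, then shifts right to get -1, matching
 * -7/4 rounded toward zero; a plain arithmetic shift would produce -2.
 */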
1054 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1055 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1056 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1057 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1059 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1060 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1061 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1062 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1064 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1065 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1066 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1067 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1069 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1070 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1071 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1072 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1074 #undef DO_SHR
1075 #undef DO_SHL
1076 #undef DO_ASRD
1077 #undef DO_ZPZI
1078 #undef DO_ZPZI_D
1080 /* Fully general four-operand expander, controlled by a predicate.
1082 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
1083 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1084 void *vg, uint32_t desc) \
1086 intptr_t i, opr_sz = simd_oprsz(desc); \
1087 for (i = 0; i < opr_sz; ) { \
1088 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1089 do { \
1090 if (pg & 1) { \
1091 TYPE nn = *(TYPE *)(vn + H(i)); \
1092 TYPE mm = *(TYPE *)(vm + H(i)); \
1093 TYPE aa = *(TYPE *)(va + H(i)); \
1094 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1096 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1097 } while (i & 15); \
1101 /* Similarly, specialized for 64-bit operands. */
1102 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
1103 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1104 void *vg, uint32_t desc) \
1106 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1107 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1108 uint8_t *pg = vg; \
1109 for (i = 0; i < opr_sz; i += 1) { \
1110 if (pg[H1(i)] & 1) { \
1111 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1112 d[i] = OP(aa, nn, mm); \
1117 #define DO_MLA(A, N, M) (A + N * M)
1118 #define DO_MLS(A, N, M) (A - N * M)
1120 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1121 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1123 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1124 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1126 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1127 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1129 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1130 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1132 #undef DO_MLA
1133 #undef DO_MLS
1134 #undef DO_ZPZZZ
1135 #undef DO_ZPZZZ_D
1137 void HELPER(sve_index_b)(void *vd, uint32_t start,
1138 uint32_t incr, uint32_t desc)
1140 intptr_t i, opr_sz = simd_oprsz(desc);
1141 uint8_t *d = vd;
1142 for (i = 0; i < opr_sz; i += 1) {
1143 d[H1(i)] = start + i * incr;
1147 void HELPER(sve_index_h)(void *vd, uint32_t start,
1148 uint32_t incr, uint32_t desc)
1150 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1151 uint16_t *d = vd;
1152 for (i = 0; i < opr_sz; i += 1) {
1153 d[H2(i)] = start + i * incr;
1157 void HELPER(sve_index_s)(void *vd, uint32_t start,
1158 uint32_t incr, uint32_t desc)
1160 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1161 uint32_t *d = vd;
1162 for (i = 0; i < opr_sz; i += 1) {
1163 d[H4(i)] = start + i * incr;
1167 void HELPER(sve_index_d)(void *vd, uint64_t start,
1168 uint64_t incr, uint32_t desc)
1170 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1171 uint64_t *d = vd;
1172 for (i = 0; i < opr_sz; i += 1) {
1173 d[i] = start + i * incr;
1177 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1179 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1180 uint32_t sh = simd_data(desc);
1181 uint32_t *d = vd, *n = vn, *m = vm;
1182 for (i = 0; i < opr_sz; i += 1) {
1183 d[i] = n[i] + (m[i] << sh);
1187 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1189 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1190 uint64_t sh = simd_data(desc);
1191 uint64_t *d = vd, *n = vn, *m = vm;
1192 for (i = 0; i < opr_sz; i += 1) {
1193 d[i] = n[i] + (m[i] << sh);
1197 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1199 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1200 uint64_t sh = simd_data(desc);
1201 uint64_t *d = vd, *n = vn, *m = vm;
1202 for (i = 0; i < opr_sz; i += 1) {
1203 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1207 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1209 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1210 uint64_t sh = simd_data(desc);
1211 uint64_t *d = vd, *n = vn, *m = vm;
1212 for (i = 0; i < opr_sz; i += 1) {
1213 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1217 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1219 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1220 static const uint16_t coeff[] = {
1221 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1222 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1223 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1224 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1226 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1227 uint16_t *d = vd, *n = vn;
1229 for (i = 0; i < opr_sz; i++) {
1230 uint16_t nn = n[i];
1231 intptr_t idx = extract32(nn, 0, 5);
1232 uint16_t exp = extract32(nn, 5, 5);
1233 d[i] = coeff[idx] | (exp << 10);
1237 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1239 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1240 static const uint32_t coeff[] = {
1241 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1242 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1243 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1244 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1245 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1246 0x1ef532, 0x20b051, 0x227043, 0x243516,
1247 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1248 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1249 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1250 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1251 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1252 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1253 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1254 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1255 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1256 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1258 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1259 uint32_t *d = vd, *n = vn;
1261 for (i = 0; i < opr_sz; i++) {
1262 uint32_t nn = n[i];
1263 intptr_t idx = extract32(nn, 0, 6);
1264 uint32_t exp = extract32(nn, 6, 8);
1265 d[i] = coeff[idx] | (exp << 23);
1269 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1271 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1272 static const uint64_t coeff[] = {
1273 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1274 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1275 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1276 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1277 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1278 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1279 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1280 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1281 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1282 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1283 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1284 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1285 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1286 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1287 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1288 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1289 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1290 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1291 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1292 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1293 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1294 0xFA7C1819E90D8ull,
1296 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1297 uint64_t *d = vd, *n = vn;
1299 for (i = 0; i < opr_sz; i++) {
1300 uint64_t nn = n[i];
1301 intptr_t idx = extract32(nn, 0, 6);
1302 uint64_t exp = extract32(nn, 6, 11);
1303 d[i] = coeff[idx] | (exp << 52);
1307 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1309 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1310 uint16_t *d = vd, *n = vn, *m = vm;
1311 for (i = 0; i < opr_sz; i += 1) {
1312 uint16_t nn = n[i];
1313 uint16_t mm = m[i];
1314 if (mm & 1) {
1315 nn = float16_one;
1317 d[i] = nn ^ (mm & 2) << 14;
1321 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1323 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1324 uint32_t *d = vd, *n = vn, *m = vm;
1325 for (i = 0; i < opr_sz; i += 1) {
1326 uint32_t nn = n[i];
1327 uint32_t mm = m[i];
1328 if (mm & 1) {
1329 nn = float32_one;
1331 d[i] = nn ^ (mm & 2) << 30;
1335 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1337 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1338 uint64_t *d = vd, *n = vn, *m = vm;
1339 for (i = 0; i < opr_sz; i += 1) {
1340 uint64_t nn = n[i];
1341 uint64_t mm = m[i];
1342 if (mm & 1) {
1343 nn = float64_one;
1345 d[i] = nn ^ (mm & 2) << 62;
1350 * Signed saturating addition with scalar operand.
1353 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1355 intptr_t i, oprsz = simd_oprsz(desc);
1357 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1358 int r = *(int8_t *)(a + i) + b;
1359 if (r > INT8_MAX) {
1360 r = INT8_MAX;
1361 } else if (r < INT8_MIN) {
1362 r = INT8_MIN;
1364 *(int8_t *)(d + i) = r;
1368 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1370 intptr_t i, oprsz = simd_oprsz(desc);
1372 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1373 int r = *(int16_t *)(a + i) + b;
1374 if (r > INT16_MAX) {
1375 r = INT16_MAX;
1376 } else if (r < INT16_MIN) {
1377 r = INT16_MIN;
1379 *(int16_t *)(d + i) = r;
1383 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1385 intptr_t i, oprsz = simd_oprsz(desc);
1387 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1388 int64_t r = *(int32_t *)(a + i) + b;
1389 if (r > INT32_MAX) {
1390 r = INT32_MAX;
1391 } else if (r < INT32_MIN) {
1392 r = INT32_MIN;
1394 *(int32_t *)(d + i) = r;
1398 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1400 intptr_t i, oprsz = simd_oprsz(desc);
1402 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1403 int64_t ai = *(int64_t *)(a + i);
1404 int64_t r = ai + b;
1405 if (((r ^ ai) & ~(ai ^ b)) < 0) {
1406 /* Signed overflow. */
1407 r = (r < 0 ? INT64_MAX : INT64_MIN);
1409 *(int64_t *)(d + i) = r;
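/*
 * Editorial note: the overflow test above fires when the operands share a
 * sign (~(ai ^ b) has the sign bit set) but the result's sign differs from
 * ai's (r ^ ai has the sign bit set), which is exactly signed-add overflow.
 */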
1414 * Unsigned saturating addition with scalar operand.
1417 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1419 intptr_t i, oprsz = simd_oprsz(desc);
1421 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1422 int r = *(uint8_t *)(a + i) + b;
1423 if (r > UINT8_MAX) {
1424 r = UINT8_MAX;
1425 } else if (r < 0) {
1426 r = 0;
1428 *(uint8_t *)(d + i) = r;
1432 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1434 intptr_t i, oprsz = simd_oprsz(desc);
1436 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1437 int r = *(uint16_t *)(a + i) + b;
1438 if (r > UINT16_MAX) {
1439 r = UINT16_MAX;
1440 } else if (r < 0) {
1441 r = 0;
1443 *(uint16_t *)(d + i) = r;
1447 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1449 intptr_t i, oprsz = simd_oprsz(desc);
1451 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1452 int64_t r = *(uint32_t *)(a + i) + b;
1453 if (r > UINT32_MAX) {
1454 r = UINT32_MAX;
1455 } else if (r < 0) {
1456 r = 0;
1458 *(uint32_t *)(d + i) = r;
1462 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1464 intptr_t i, oprsz = simd_oprsz(desc);
1466 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1467 uint64_t r = *(uint64_t *)(a + i) + b;
1468 if (r < b) {
1469 r = UINT64_MAX;
1471 *(uint64_t *)(d + i) = r;
1475 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1477 intptr_t i, oprsz = simd_oprsz(desc);
1479 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1480 uint64_t ai = *(uint64_t *)(a + i);
1481 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1485 /* Two operand predicated copy immediate with merge. All valid immediates
1486 * can fit within 17 signed bits in the simd_data field.
1488 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1489 uint64_t mm, uint32_t desc)
1491 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1492 uint64_t *d = vd, *n = vn;
1493 uint8_t *pg = vg;
1495 mm = dup_const(MO_8, mm);
1496 for (i = 0; i < opr_sz; i += 1) {
1497 uint64_t nn = n[i];
1498 uint64_t pp = expand_pred_b(pg[H1(i)]);
1499 d[i] = (mm & pp) | (nn & ~pp);
1503 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1504 uint64_t mm, uint32_t desc)
1506 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1507 uint64_t *d = vd, *n = vn;
1508 uint8_t *pg = vg;
1510 mm = dup_const(MO_16, mm);
1511 for (i = 0; i < opr_sz; i += 1) {
1512 uint64_t nn = n[i];
1513 uint64_t pp = expand_pred_h(pg[H1(i)]);
1514 d[i] = (mm & pp) | (nn & ~pp);
1518 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1519 uint64_t mm, uint32_t desc)
1521 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1522 uint64_t *d = vd, *n = vn;
1523 uint8_t *pg = vg;
1525 mm = dup_const(MO_32, mm);
1526 for (i = 0; i < opr_sz; i += 1) {
1527 uint64_t nn = n[i];
1528 uint64_t pp = expand_pred_s(pg[H1(i)]);
1529 d[i] = (mm & pp) | (nn & ~pp);
1533 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1534 uint64_t mm, uint32_t desc)
1536 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1537 uint64_t *d = vd, *n = vn;
1538 uint8_t *pg = vg;
1540 for (i = 0; i < opr_sz; i += 1) {
1541 uint64_t nn = n[i];
1542 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1546 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1548 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1549 uint64_t *d = vd;
1550 uint8_t *pg = vg;
1552 val = dup_const(MO_8, val);
1553 for (i = 0; i < opr_sz; i += 1) {
1554 d[i] = val & expand_pred_b(pg[H1(i)]);
1558 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1560 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1561 uint64_t *d = vd;
1562 uint8_t *pg = vg;
1564 val = dup_const(MO_16, val);
1565 for (i = 0; i < opr_sz; i += 1) {
1566 d[i] = val & expand_pred_h(pg[H1(i)]);
1570 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1572 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1573 uint64_t *d = vd;
1574 uint8_t *pg = vg;
1576 val = dup_const(MO_32, val);
1577 for (i = 0; i < opr_sz; i += 1) {
1578 d[i] = val & expand_pred_s(pg[H1(i)]);
1582 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1584 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1585 uint64_t *d = vd;
1586 uint8_t *pg = vg;
1588 for (i = 0; i < opr_sz; i += 1) {
1589 d[i] = (pg[H1(i)] & 1 ? val : 0);
1593 /* Big-endian hosts need to frob the byte indices. If the copy
1594 * happens to be 8-byte aligned, then no frobbing necessary.
1596 static void swap_memmove(void *vd, void *vs, size_t n)
1598 uintptr_t d = (uintptr_t)vd;
1599 uintptr_t s = (uintptr_t)vs;
1600 uintptr_t o = (d | s | n) & 7;
1601 size_t i;
1603 #ifndef HOST_WORDS_BIGENDIAN
1604 o = 0;
1605 #endif
1606 switch (o) {
1607 case 0:
1608 memmove(vd, vs, n);
1609 break;
1611 case 4:
1612 if (d < s || d >= s + n) {
1613 for (i = 0; i < n; i += 4) {
1614 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1616 } else {
1617 for (i = n; i > 0; ) {
1618 i -= 4;
1619 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1622 break;
1624 case 2:
1625 case 6:
1626 if (d < s || d >= s + n) {
1627 for (i = 0; i < n; i += 2) {
1628 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1630 } else {
1631 for (i = n; i > 0; ) {
1632 i -= 2;
1633 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1636 break;
1638 default:
1639 if (d < s || d >= s + n) {
1640 for (i = 0; i < n; i++) {
1641 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1643 } else {
1644 for (i = n; i > 0; ) {
1645 i -= 1;
1646 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1649 break;
1653 /* Similarly for memset of 0. */
1654 static void swap_memzero(void *vd, size_t n)
1656 uintptr_t d = (uintptr_t)vd;
1657 uintptr_t o = (d | n) & 7;
1658 size_t i;
1660 /* Usually, the first bit of a predicate is set, so N is 0. */
1661 if (likely(n == 0)) {
1662 return;
1665 #ifndef HOST_WORDS_BIGENDIAN
1666 o = 0;
1667 #endif
1668 switch (o) {
1669 case 0:
1670 memset(vd, 0, n);
1671 break;
1673 case 4:
1674 for (i = 0; i < n; i += 4) {
1675 *(uint32_t *)H1_4(d + i) = 0;
1677 break;
1679 case 2:
1680 case 6:
1681 for (i = 0; i < n; i += 2) {
1682 *(uint16_t *)H1_2(d + i) = 0;
1684 break;
1686 default:
1687 for (i = 0; i < n; i++) {
1688 *(uint8_t *)H1(d + i) = 0;
1690 break;
1694 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1696 intptr_t opr_sz = simd_oprsz(desc);
1697 size_t n_ofs = simd_data(desc);
1698 size_t n_siz = opr_sz - n_ofs;
1700 if (vd != vm) {
1701 swap_memmove(vd, vn + n_ofs, n_siz);
1702 swap_memmove(vd + n_siz, vm, n_ofs);
1703 } else if (vd != vn) {
1704 swap_memmove(vd + n_siz, vd, n_ofs);
1705 swap_memmove(vd, vn + n_ofs, n_siz);
1706 } else {
1707 /* vd == vn == vm. Need temp space. */
1708 ARMVectorReg tmp;
1709 swap_memmove(&tmp, vm, n_ofs);
1710 swap_memmove(vd, vd + n_ofs, n_siz);
1711 memcpy(vd + n_siz, &tmp, n_ofs);
1715 #define DO_INSR(NAME, TYPE, H) \
1716 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1718 intptr_t opr_sz = simd_oprsz(desc); \
1719 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1720 *(TYPE *)(vd + H(0)) = val; \
1723 DO_INSR(sve_insr_b, uint8_t, H1)
1724 DO_INSR(sve_insr_h, uint16_t, H1_2)
1725 DO_INSR(sve_insr_s, uint32_t, H1_4)
1726 DO_INSR(sve_insr_d, uint64_t, )
1728 #undef DO_INSR
1730 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1732 intptr_t i, j, opr_sz = simd_oprsz(desc);
1733 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1734 uint64_t f = *(uint64_t *)(vn + i);
1735 uint64_t b = *(uint64_t *)(vn + j);
1736 *(uint64_t *)(vd + i) = bswap64(b);
1737 *(uint64_t *)(vd + j) = bswap64(f);
1741 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1743 intptr_t i, j, opr_sz = simd_oprsz(desc);
1744 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1745 uint64_t f = *(uint64_t *)(vn + i);
1746 uint64_t b = *(uint64_t *)(vn + j);
1747 *(uint64_t *)(vd + i) = hswap64(b);
1748 *(uint64_t *)(vd + j) = hswap64(f);
1752 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1754 intptr_t i, j, opr_sz = simd_oprsz(desc);
1755 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1756 uint64_t f = *(uint64_t *)(vn + i);
1757 uint64_t b = *(uint64_t *)(vn + j);
1758 *(uint64_t *)(vd + i) = rol64(b, 32);
1759 *(uint64_t *)(vd + j) = rol64(f, 32);
1763 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1765 intptr_t i, j, opr_sz = simd_oprsz(desc);
1766 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1767 uint64_t f = *(uint64_t *)(vn + i);
1768 uint64_t b = *(uint64_t *)(vn + j);
1769 *(uint64_t *)(vd + i) = b;
1770 *(uint64_t *)(vd + j) = f;
1774 #define DO_TBL(NAME, TYPE, H) \
1775 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1777 intptr_t i, opr_sz = simd_oprsz(desc); \
1778 uintptr_t elem = opr_sz / sizeof(TYPE); \
1779 TYPE *d = vd, *n = vn, *m = vm; \
1780 ARMVectorReg tmp; \
1781 if (unlikely(vd == vn)) { \
1782 n = memcpy(&tmp, vn, opr_sz); \
1784 for (i = 0; i < elem; i++) { \
1785 TYPE j = m[H(i)]; \
1786 d[H(i)] = j < elem ? n[H(j)] : 0; \
1790 DO_TBL(sve_tbl_b, uint8_t, H1)
1791 DO_TBL(sve_tbl_h, uint16_t, H2)
1792 DO_TBL(sve_tbl_s, uint32_t, H4)
1793 DO_TBL(sve_tbl_d, uint64_t, )
 1795 #undef DO_TBL
1797 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1798 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1800 intptr_t i, opr_sz = simd_oprsz(desc); \
1801 TYPED *d = vd; \
1802 TYPES *n = vn; \
1803 ARMVectorReg tmp; \
1804 if (unlikely(vn - vd < opr_sz)) { \
1805 n = memcpy(&tmp, n, opr_sz / 2); \
1807 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1808 d[HD(i)] = n[HS(i)]; \
1812 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1813 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1814 DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1816 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1817 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1818 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1820 #undef DO_UNPK
1822 /* Mask of bits included in the even numbered predicates of width esz.
1823 * We also use this for expand_bits/compress_bits, and so extend the
1824 * same pattern out to 16-bit units.
1826 static const uint64_t even_bit_esz_masks[5] = {
1827 0x5555555555555555ull,
1828 0x3333333333333333ull,
1829 0x0f0f0f0f0f0f0f0full,
1830 0x00ff00ff00ff00ffull,
1831 0x0000ffff0000ffffull,
1834 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1835 * For N==0, this corresponds to the operation that in qemu/bitops.h
1836 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1837 * section 7-2 Shuffling Bits.
1839 static uint64_t expand_bits(uint64_t x, int n)
1841 int i;
1843 x &= 0xffffffffu;
1844 for (i = 4; i >= n; i--) {
1845 int sh = 1 << i;
1846 x = ((x << sh) | x) & even_bit_esz_masks[i];
1848 return x;
1851 /* Compress units of 2**(N+1) bits to units of 2**N bits.
1852 * For N==0, this corresponds to the operation that in qemu/bitops.h
1853 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1854 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
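 * For example, with N == 0 this is the inverse of the expand_bits
 * example above: compress_bits(0x45, 0) == 0x0b.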
1856 static uint64_t compress_bits(uint64_t x, int n)
1858 int i;
1860 for (i = n; i <= 4; i++) {
1861 int sh = 1 << i;
1862 x &= even_bit_esz_masks[i];
1863 x = (x >> sh) | x;
1865 return x & 0xffffffffu;
1868 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1870 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1871 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1872 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1873 uint64_t *d = vd;
1874 intptr_t i;
1876 if (oprsz <= 8) {
1877 uint64_t nn = *(uint64_t *)vn;
1878 uint64_t mm = *(uint64_t *)vm;
1879 int half = 4 * oprsz;
1881 nn = extract64(nn, high * half, half);
1882 mm = extract64(mm, high * half, half);
1883 nn = expand_bits(nn, esz);
1884 mm = expand_bits(mm, esz);
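/* The expanded NN occupies the even-numbered 2**esz-bit predicate slots;
 * shifting the expanded MM by one element's worth of predicate bits
 * (1 << esz) fills the odd slots, so the addition interleaves the two
 * source predicates element by element. */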
1885 d[0] = nn + (mm << (1 << esz));
1886 } else {
1887 ARMPredicateReg tmp_n, tmp_m;
1889 /* We produce output faster than we consume input.
1890 Therefore we must be mindful of possible overlap. */
1891 if ((vn - vd) < (uintptr_t)oprsz) {
1892 vn = memcpy(&tmp_n, vn, oprsz);
1894 if ((vm - vd) < (uintptr_t)oprsz) {
1895 vm = memcpy(&tmp_m, vm, oprsz);
1897 if (high) {
1898 high = oprsz >> 1;
1901 if ((high & 3) == 0) {
1902 uint32_t *n = vn, *m = vm;
1903 high >>= 2;
1905 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1906 uint64_t nn = n[H4(high + i)];
1907 uint64_t mm = m[H4(high + i)];
1909 nn = expand_bits(nn, esz);
1910 mm = expand_bits(mm, esz);
1911 d[i] = nn + (mm << (1 << esz));
1913 } else {
1914 uint8_t *n = vn, *m = vm;
1915 uint16_t *d16 = vd;
1917 for (i = 0; i < oprsz / 2; i++) {
1918 uint16_t nn = n[H1(high + i)];
1919 uint16_t mm = m[H1(high + i)];
1921 nn = expand_bits(nn, esz);
1922 mm = expand_bits(mm, esz);
1923 d16[H2(i)] = nn + (mm << (1 << esz));
1929 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1931 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1932 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1933 int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1934 uint64_t *d = vd, *n = vn, *m = vm;
1935 uint64_t l, h;
1936 intptr_t i;
1938 if (oprsz <= 8) {
1939 l = compress_bits(n[0] >> odd, esz);
1940 h = compress_bits(m[0] >> odd, esz);
1941 d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1942 } else {
1943 ARMPredicateReg tmp_m;
1944 intptr_t oprsz_16 = oprsz / 16;
1946 if ((vm - vd) < (uintptr_t)oprsz) {
1947 m = memcpy(&tmp_m, vm, oprsz);
1950 for (i = 0; i < oprsz_16; i++) {
1951 l = n[2 * i + 0];
1952 h = n[2 * i + 1];
1953 l = compress_bits(l >> odd, esz);
1954 h = compress_bits(h >> odd, esz);
1955 d[i] = l + (h << 32);
1958 /* For VL which is not a power of 2, the results from M do not
1959 align nicely with the uint64_t for D. Put the aligned results
1960 from M into TMP_M and then copy it into place afterward. */
1961 if (oprsz & 15) {
1962 d[i] = compress_bits(n[2 * i] >> odd, esz);
1964 for (i = 0; i < oprsz_16; i++) {
1965 l = m[2 * i + 0];
1966 h = m[2 * i + 1];
1967 l = compress_bits(l >> odd, esz);
1968 h = compress_bits(h >> odd, esz);
1969 tmp_m.p[i] = l + (h << 32);
1971 tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
1973 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
1974 } else {
1975 for (i = 0; i < oprsz_16; i++) {
1976 l = m[2 * i + 0];
1977 h = m[2 * i + 1];
1978 l = compress_bits(l >> odd, esz);
1979 h = compress_bits(h >> odd, esz);
1980 d[oprsz_16 + i] = l + (h << 32);
1986 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1988 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1989 uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1990 bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1991 uint64_t *d = vd, *n = vn, *m = vm;
1992 uint64_t mask;
1993 int shr, shl;
1994 intptr_t i;
1996 shl = 1 << esz;
1997 shr = 0;
1998 mask = even_bit_esz_masks[esz];
1999 if (odd) {
2000 mask <<= shl;
2001 shr = shl;
2002 shl = 0;
2005 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2006 uint64_t nn = (n[i] & mask) >> shr;
2007 uint64_t mm = (m[i] & mask) << shl;
2008 d[i] = nn + mm;
2012 /* Reverse units of 2**N bits. */
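/* E.g. in reverse_bits_64, N == 3 degenerates to bswap64 (reversing the
 * byte-sized units), while N == 0 performs a full 64-bit bit reversal. */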
2013 static uint64_t reverse_bits_64(uint64_t x, int n)
2015 int i, sh;
2017 x = bswap64(x);
2018 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2019 uint64_t mask = even_bit_esz_masks[i];
2020 x = ((x & mask) << sh) | ((x >> sh) & mask);
2022 return x;
2025 static uint8_t reverse_bits_8(uint8_t x, int n)
2027 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2028 int i, sh;
2030 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2031 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2033 return x;
2036 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2038 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2039 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2040 intptr_t i, oprsz_2 = oprsz / 2;
2042 if (oprsz <= 8) {
2043 uint64_t l = *(uint64_t *)vn;
2044 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2045 *(uint64_t *)vd = l;
2046 } else if ((oprsz & 15) == 0) {
2047 for (i = 0; i < oprsz_2; i += 8) {
2048 intptr_t ih = oprsz - 8 - i;
2049 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2050 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2051 *(uint64_t *)(vd + i) = h;
2052 *(uint64_t *)(vd + ih) = l;
2054 } else {
2055 for (i = 0; i < oprsz_2; i += 1) {
2056 intptr_t il = H1(i);
2057 intptr_t ih = H1(oprsz - 1 - i);
2058 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2059 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2060 *(uint8_t *)(vd + il) = h;
2061 *(uint8_t *)(vd + ih) = l;
2066 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2068 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2069 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2070 uint64_t *d = vd;
2071 intptr_t i;
2073 if (oprsz <= 8) {
2074 uint64_t nn = *(uint64_t *)vn;
2075 int half = 4 * oprsz;
2077 nn = extract64(nn, high * half, half);
2078 nn = expand_bits(nn, 0);
2079 d[0] = nn;
2080 } else {
2081 ARMPredicateReg tmp_n;
2083 /* We produce output faster than we consume input.
2084 Therefore we must be mindful of possible overlap. */
2085 if ((vn - vd) < (uintptr_t)oprsz) {
2086 vn = memcpy(&tmp_n, vn, oprsz);
2088 if (high) {
2089 high = oprsz >> 1;
2092 if ((high & 3) == 0) {
2093 uint32_t *n = vn;
2094 high >>= 2;
2096 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2097 uint64_t nn = n[H4(high + i)];
2098 d[i] = expand_bits(nn, 0);
2100 } else {
2101 uint16_t *d16 = vd;
2102 uint8_t *n = vn;
2104 for (i = 0; i < oprsz / 2; i++) {
2105 uint16_t nn = n[H1(high + i)];
2106 d16[H2(i)] = expand_bits(nn, 0);
2112 #define DO_ZIP(NAME, TYPE, H) \
2113 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2115 intptr_t oprsz = simd_oprsz(desc); \
2116 intptr_t i, oprsz_2 = oprsz / 2; \
2117 ARMVectorReg tmp_n, tmp_m; \
2118 /* We produce output faster than we consume input. \
2119 Therefore we must be mindful of possible overlap. */ \
2120 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2121 vn = memcpy(&tmp_n, vn, oprsz_2); \
2123 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2124 vm = memcpy(&tmp_m, vm, oprsz_2); \
2126 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2127 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2128 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2132 DO_ZIP(sve_zip_b, uint8_t, H1)
2133 DO_ZIP(sve_zip_h, uint16_t, H1_2)
2134 DO_ZIP(sve_zip_s, uint32_t, H1_4)
2135 DO_ZIP(sve_zip_d, uint64_t, )
2137 #define DO_UZP(NAME, TYPE, H) \
2138 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2140 intptr_t oprsz = simd_oprsz(desc); \
2141 intptr_t oprsz_2 = oprsz / 2; \
2142 intptr_t odd_ofs = simd_data(desc); \
2143 intptr_t i; \
2144 ARMVectorReg tmp_m; \
2145 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2146 vm = memcpy(&tmp_m, vm, oprsz); \
2148 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2149 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2151 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2152 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2156 DO_UZP(sve_uzp_b, uint8_t, H1)
2157 DO_UZP(sve_uzp_h, uint16_t, H1_2)
2158 DO_UZP(sve_uzp_s, uint32_t, H1_4)
2159 DO_UZP(sve_uzp_d, uint64_t, )
2161 #define DO_TRN(NAME, TYPE, H) \
2162 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2164 intptr_t oprsz = simd_oprsz(desc); \
2165 intptr_t odd_ofs = simd_data(desc); \
2166 intptr_t i; \
2167 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2168 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2169 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2170 *(TYPE *)(vd + H(i + 0)) = ae; \
2171 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2175 DO_TRN(sve_trn_b, uint8_t, H1)
2176 DO_TRN(sve_trn_h, uint16_t, H1_2)
2177 DO_TRN(sve_trn_s, uint32_t, H1_4)
2178 DO_TRN(sve_trn_d, uint64_t, )
2180 #undef DO_ZIP
2181 #undef DO_UZP
2182 #undef DO_TRN
2184 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2186 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2187 uint32_t *d = vd, *n = vn;
2188 uint8_t *pg = vg;
2190 for (i = j = 0; i < opr_sz; i++) {
2191 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2192 d[H4(j)] = n[H4(i)];
2193 j++;
2196 for (; j < opr_sz; j++) {
2197 d[H4(j)] = 0;
2201 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2203 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2204 uint64_t *d = vd, *n = vn;
2205 uint8_t *pg = vg;
2207 for (i = j = 0; i < opr_sz; i++) {
2208 if (pg[H1(i)] & 1) {
2209 d[j] = n[i];
2210 j++;
2213 for (; j < opr_sz; j++) {
2214 d[j] = 0;
2218 /* Similar to the ARM LastActiveElement pseudocode function, except the
2219 * result is multiplied by the element size. This includes the not found
2220 * indication; e.g. not found for esz=3 is -8.
2222 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2224 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2225 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2227 return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
2230 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2232 intptr_t opr_sz = simd_oprsz(desc) / 8;
2233 int esz = simd_data(desc);
2234 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2235 intptr_t i, first_i, last_i;
2236 ARMVectorReg tmp;
2238 first_i = last_i = 0;
2239 first_g = last_g = 0;
2241 /* Find the extent of the active elements within VG. */
2242 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2243 pg = *(uint64_t *)(vg + i) & mask;
2244 if (pg) {
2245 if (last_g == 0) {
2246 last_g = pg;
2247 last_i = i;
2249 first_g = pg;
2250 first_i = i;
2254 len = 0;
2255 if (first_g != 0) {
2256 first_i = first_i * 8 + ctz64(first_g);
2257 last_i = last_i * 8 + 63 - clz64(last_g);
2258 len = last_i - first_i + (1 << esz);
2259 if (vd == vm) {
2260 vm = memcpy(&tmp, vm, opr_sz * 8);
2262 swap_memmove(vd, vn + first_i, len);
2264 swap_memmove(vd + len, vm, opr_sz * 8 - len);
2267 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2268 void *vg, uint32_t desc)
2270 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2271 uint64_t *d = vd, *n = vn, *m = vm;
2272 uint8_t *pg = vg;
2274 for (i = 0; i < opr_sz; i += 1) {
2275 uint64_t nn = n[i], mm = m[i];
2276 uint64_t pp = expand_pred_b(pg[H1(i)]);
2277 d[i] = (nn & pp) | (mm & ~pp);
2281 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2282 void *vg, uint32_t desc)
2284 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2285 uint64_t *d = vd, *n = vn, *m = vm;
2286 uint8_t *pg = vg;
2288 for (i = 0; i < opr_sz; i += 1) {
2289 uint64_t nn = n[i], mm = m[i];
2290 uint64_t pp = expand_pred_h(pg[H1(i)]);
2291 d[i] = (nn & pp) | (mm & ~pp);
2295 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2296 void *vg, uint32_t desc)
2298 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2299 uint64_t *d = vd, *n = vn, *m = vm;
2300 uint8_t *pg = vg;
2302 for (i = 0; i < opr_sz; i += 1) {
2303 uint64_t nn = n[i], mm = m[i];
2304 uint64_t pp = expand_pred_s(pg[H1(i)]);
2305 d[i] = (nn & pp) | (mm & ~pp);
2309 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2310 void *vg, uint32_t desc)
2312 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2313 uint64_t *d = vd, *n = vn, *m = vm;
2314 uint8_t *pg = vg;
2316 for (i = 0; i < opr_sz; i += 1) {
2317 uint64_t nn = n[i], mm = m[i];
2318 d[i] = (pg[H1(i)] & 1 ? nn : mm);
2322 /* Two operand comparison controlled by a predicate.
2323 * ??? It is very tempting to want to be able to expand this inline
2324 * with x86 instructions, e.g.
2326 * vcmpeqw zm, zn, %ymm0
2327 * vpmovmskb %ymm0, %eax
2328 * and $0x5555, %eax
2329 * and pg, %eax
2331 * or even aarch64, e.g.
2333 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2334 * cmeq v0.8h, zn, zm
2335 * and v0.8h, v0.8h, mask
2336 * addv h0, v0.8h
2337 * and v0.8b, pg
2339 * However, coming up with an abstraction that allows vector inputs and
2340 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2341 * scalar outputs, is tricky.
2343 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
2344 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2346 intptr_t opr_sz = simd_oprsz(desc); \
2347 uint32_t flags = PREDTEST_INIT; \
2348 intptr_t i = opr_sz; \
2349 do { \
2350 uint64_t out = 0, pg; \
2351 do { \
2352 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2353 TYPE nn = *(TYPE *)(vn + H(i)); \
2354 TYPE mm = *(TYPE *)(vm + H(i)); \
2355 out |= nn OP mm; \
2356 } while (i & 63); \
2357 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2358 out &= pg; \
2359 *(uint64_t *)(vd + (i >> 3)) = out; \
2360 flags = iter_predtest_bwd(out, pg, flags); \
2361 } while (i > 0); \
2362 return flags; \
2365 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2366 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2367 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2368 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2369 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2370 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2371 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2372 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
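/* The MASK arguments above reflect the predicate layout: a predicate has
 * one bit per byte of the vector, and each element's result bit lands at
 * the byte offset of the element's first byte.  Hence all bits are
 * significant for byte elements, every 2nd bit for halfwords (0x5555...),
 * every 4th for words (0x1111...) and every 8th for doublewords
 * (0x0101...). */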
2374 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
2375 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2376 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2377 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2379 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
2380 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2381 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2382 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2384 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
2385 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2386 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2387 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2389 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
2390 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2391 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2392 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2394 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
2395 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2396 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2397 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2399 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
2400 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2401 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2402 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2404 #undef DO_CMP_PPZZ_B
2405 #undef DO_CMP_PPZZ_H
2406 #undef DO_CMP_PPZZ_S
2407 #undef DO_CMP_PPZZ_D
2408 #undef DO_CMP_PPZZ
2410 /* Similar, but the second source is "wide". */
2411 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2412 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2414 intptr_t opr_sz = simd_oprsz(desc); \
2415 uint32_t flags = PREDTEST_INIT; \
2416 intptr_t i = opr_sz; \
2417 do { \
2418 uint64_t out = 0, pg; \
2419 do { \
2420 TYPEW mm = *(TYPEW *)(vm + i - 8); \
2421 do { \
2422 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2423 TYPE nn = *(TYPE *)(vn + H(i)); \
2424 out |= nn OP mm; \
2425 } while (i & 7); \
2426 } while (i & 63); \
2427 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2428 out &= pg; \
2429 *(uint64_t *)(vd + (i >> 3)) = out; \
2430 flags = iter_predtest_bwd(out, pg, flags); \
2431 } while (i > 0); \
2432 return flags; \
2435 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2436 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
2437 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2438 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2439 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2440 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2442 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
2443 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
2444 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
2446 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
2447 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
2448 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
2450 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
2451 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
2452 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
2454 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
2455 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
2456 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
2458 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
2459 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2460 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2462 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
2463 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2464 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2466 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
2467 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
2468 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
2470 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
2471 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
2472 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
2474 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
2475 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2476 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2478 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
2479 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2480 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2482 #undef DO_CMP_PPZW_B
2483 #undef DO_CMP_PPZW_H
2484 #undef DO_CMP_PPZW_S
2485 #undef DO_CMP_PPZW
2487 /* Similar, but the second source is immediate. */
2488 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
2489 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2491 intptr_t opr_sz = simd_oprsz(desc); \
2492 uint32_t flags = PREDTEST_INIT; \
2493 TYPE mm = simd_data(desc); \
2494 intptr_t i = opr_sz; \
2495 do { \
2496 uint64_t out = 0, pg; \
2497 do { \
2498 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2499 TYPE nn = *(TYPE *)(vn + H(i)); \
2500 out |= nn OP mm; \
2501 } while (i & 63); \
2502 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2503 out &= pg; \
2504 *(uint64_t *)(vd + (i >> 3)) = out; \
2505 flags = iter_predtest_bwd(out, pg, flags); \
2506 } while (i > 0); \
2507 return flags; \
2510 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2511 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2512 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2513 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2514 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2515 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2516 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2517 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
2519 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
2520 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
2521 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
2522 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
2524 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
2525 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
2526 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
2527 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
2529 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
2530 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
2531 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
2532 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
2534 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
2535 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
2536 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
2537 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
2539 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
2540 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
2541 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
2542 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
2544 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
2545 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
2546 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
2547 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
2549 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
2550 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
2551 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
2552 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
2554 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
2555 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
2556 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
2557 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
2559 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
2560 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
2561 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
2562 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
2564 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
2565 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
2566 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
2567 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
2569 #undef DO_CMP_PPZI_B
2570 #undef DO_CMP_PPZI_H
2571 #undef DO_CMP_PPZI_S
2572 #undef DO_CMP_PPZI_D
2573 #undef DO_CMP_PPZI
2575 /* Similar to the ARM LastActive pseudocode function. */
2576 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
2578 intptr_t i;
2580 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
2581 uint64_t pg = *(uint64_t *)(vg + i);
2582 if (pg) {
2583 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
2586 return false;
2589 /* Compute a mask into RETB that is true for all G, up to and including
2590 * (if after) or excluding (if !after) the first G & N.
2591 * Return true if BRK found.
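 * For example, with G = 0xff and N = 0x10 the first G & N bit is 0x10,
 * so RETB is 0x1f when AFTER (break element included) and 0x0f when
 * !AFTER (break element excluded).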
2593 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
2594 bool brk, bool after)
2596 uint64_t b;
2598 if (brk) {
2599 b = 0;
2600 } else if ((g & n) == 0) {
2601 /* For all G, no N are set; break not found. */
2602 b = g;
2603 } else {
2604 /* Break somewhere in N. Locate it. */
2605 b = g & n; /* guard true, pred true */
2606 b = b & -b; /* first such */
2607 if (after) {
2608 b = b | (b - 1); /* break after same */
2609 } else {
2610 b = b - 1; /* break before same */
2612 brk = true;
2615 *retb = b;
2616 return brk;
2619 /* Compute a zeroing BRK. */
2620 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
2621 intptr_t oprsz, bool after)
2623 bool brk = false;
2624 intptr_t i;
2626 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2627 uint64_t this_b, this_g = g[i];
2629 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2630 d[i] = this_b & this_g;
2634 /* Likewise, but also compute flags. */
2635 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2636 intptr_t oprsz, bool after)
2638 uint32_t flags = PREDTEST_INIT;
2639 bool brk = false;
2640 intptr_t i;
2642 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2643 uint64_t this_b, this_d, this_g = g[i];
2645 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2646 d[i] = this_d = this_b & this_g;
2647 flags = iter_predtest_fwd(this_d, this_g, flags);
2649 return flags;
2652 /* Compute a merging BRK. */
2653 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
2654 intptr_t oprsz, bool after)
2656 bool brk = false;
2657 intptr_t i;
2659 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2660 uint64_t this_b, this_g = g[i];
2662 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2663 d[i] = (this_b & this_g) | (d[i] & ~this_g);
2667 /* Likewise, but also compute flags. */
2668 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2669 intptr_t oprsz, bool after)
2671 uint32_t flags = PREDTEST_INIT;
2672 bool brk = false;
2673 intptr_t i;
2675 for (i = 0; i < oprsz / 8; ++i) {
2676 uint64_t this_b, this_d = d[i], this_g = g[i];
2678 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2679 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2680 flags = iter_predtest_fwd(this_d, this_g, flags);
2682 return flags;
2685 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2687 /* It is quicker to zero the whole predicate than loop on OPRSZ.
2688 * The compiler should turn this into 4 64-bit integer stores.
2690 memset(d, 0, sizeof(ARMPredicateReg));
2691 return PREDTEST_INIT;
2694 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2695 uint32_t pred_desc)
2697 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2698 if (last_active_pred(vn, vg, oprsz)) {
2699 compute_brk_z(vd, vm, vg, oprsz, true);
2700 } else {
2701 do_zero(vd, oprsz);
2705 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2706 uint32_t pred_desc)
2708 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2709 if (last_active_pred(vn, vg, oprsz)) {
2710 return compute_brks_z(vd, vm, vg, oprsz, true);
2711 } else {
2712 return do_zero(vd, oprsz);
2716 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2717 uint32_t pred_desc)
2719 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2720 if (last_active_pred(vn, vg, oprsz)) {
2721 compute_brk_z(vd, vm, vg, oprsz, false);
2722 } else {
2723 do_zero(vd, oprsz);
2727 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2728 uint32_t pred_desc)
2730 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2731 if (last_active_pred(vn, vg, oprsz)) {
2732 return compute_brks_z(vd, vm, vg, oprsz, false);
2733 } else {
2734 return do_zero(vd, oprsz);
2738 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2740 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2741 compute_brk_z(vd, vn, vg, oprsz, true);
2744 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2746 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2747 return compute_brks_z(vd, vn, vg, oprsz, true);
2750 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2752 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2753 compute_brk_z(vd, vn, vg, oprsz, false);
2756 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2758 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2759 return compute_brks_z(vd, vn, vg, oprsz, false);
2762 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2764 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2765 compute_brk_m(vd, vn, vg, oprsz, true);
2768 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2770 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2771 return compute_brks_m(vd, vn, vg, oprsz, true);
2774 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2776 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2777 compute_brk_m(vd, vn, vg, oprsz, false);
2780 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2782 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2783 return compute_brks_m(vd, vn, vg, oprsz, false);
2786 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2788 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2790 if (!last_active_pred(vn, vg, oprsz)) {
2791 do_zero(vd, oprsz);
2795 /* As if PredTest(Ones(PL), D, esz). */
2796 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2797 uint64_t esz_mask)
2799 uint32_t flags = PREDTEST_INIT;
2800 intptr_t i;
2802 for (i = 0; i < oprsz / 8; i++) {
2803 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2805 if (oprsz & 7) {
2806 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2807 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2809 return flags;
2812 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2814 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2816 if (last_active_pred(vn, vg, oprsz)) {
2817 return predtest_ones(vd, oprsz, -1);
2818 } else {
2819 return do_zero(vd, oprsz);
2823 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2825 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2826 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2827 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2828 intptr_t i;
2830 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2831 uint64_t t = n[i] & g[i] & mask;
2832 sum += ctpop64(t);
2834 return sum;
2837 uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2839 uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2840 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2841 uint64_t esz_mask = pred_esz_masks[esz];
2842 ARMPredicateReg *d = vd;
2843 uint32_t flags;
2844 intptr_t i;
2846 /* Begin with a zero predicate register. */
2847 flags = do_zero(d, oprsz);
2848 if (count == 0) {
2849 return flags;
2852 /* Set all of the requested bits. */
2853 for (i = 0; i < count / 64; ++i) {
2854 d->p[i] = esz_mask;
2856 if (count & 63) {
2857 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2860 return predtest_ones(d, oprsz, esz_mask);
2863 /* Recursive pairwise reduction of a vector with a two-operand function;
2864 * cf. the ARM ARM pseudocode function ReducePredicated.
2866 * While it would be possible to write this without the DATA temporary,
2867 * it is much simpler to process the predicate register this way.
2868 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
2869 * little to gain with a more complex non-recursive form.
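 * For example, four elements reduce as FUNC(FUNC(d0, d1), FUNC(d2, d3));
 * inactive and trailing elements are preloaded with IDENT so that they do
 * not affect the result (the element count is assumed to be padded to a
 * power of two by the caller).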
2871 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
2872 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
2874 if (n == 1) { \
2875 return *data; \
2876 } else { \
2877 uintptr_t half = n / 2; \
2878 TYPE lo = NAME##_reduce(data, status, half); \
2879 TYPE hi = NAME##_reduce(data + half, status, half); \
2880 return TYPE##_##FUNC(lo, hi, status); \
2883 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
2885 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc); \
2886 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
2887 for (i = 0; i < oprsz; ) { \
2888 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2889 do { \
2890 TYPE nn = *(TYPE *)(vn + H(i)); \
2891 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
2892 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2893 } while (i & 15); \
2895 for (; i < maxsz; i += sizeof(TYPE)) { \
2896 *(TYPE *)((void *)data + i) = IDENT; \
2898 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
2901 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
2902 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
2903 DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
2905 /* Identity is floatN_default_nan, without the function call. */
2906 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
2907 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
2908 DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
2910 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
2911 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
2912 DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
2914 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
2915 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
2916 DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
2918 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
2919 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
2920 DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
2922 #undef DO_REDUCE
2924 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
2925 void *status, uint32_t desc)
2927 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2928 float16 result = nn;
2930 do {
2931 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2932 do {
2933 if (pg & 1) {
2934 float16 mm = *(float16 *)(vm + H1_2(i));
2935 result = float16_add(result, mm, status);
2937 i += sizeof(float16), pg >>= sizeof(float16);
2938 } while (i & 15);
2939 } while (i < opr_sz);
2941 return result;
2944 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
2945 void *status, uint32_t desc)
2947 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2948 float32 result = nn;
2950 do {
2951 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2952 do {
2953 if (pg & 1) {
2954 float32 mm = *(float32 *)(vm + H1_2(i));
2955 result = float32_add(result, mm, status);
2957 i += sizeof(float32), pg >>= sizeof(float32);
2958 } while (i & 15);
2959 } while (i < opr_sz);
2961 return result;
2964 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
2965 void *status, uint32_t desc)
2967 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
2968 uint64_t *m = vm;
2969 uint8_t *pg = vg;
2971 for (i = 0; i < opr_sz; i++) {
2972 if (pg[H1(i)] & 1) {
2973 nn = float64_add(nn, m[i], status);
2977 return nn;
2980 /* Fully general three-operand expander, controlled by a predicate,
2981 * with the extra float_status parameter.
2983 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
2984 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
2985 void *status, uint32_t desc) \
2987 intptr_t i = simd_oprsz(desc); \
2988 uint64_t *g = vg; \
2989 do { \
2990 uint64_t pg = g[(i - 1) >> 6]; \
2991 do { \
2992 i -= sizeof(TYPE); \
2993 if (likely((pg >> (i & 63)) & 1)) { \
2994 TYPE nn = *(TYPE *)(vn + H(i)); \
2995 TYPE mm = *(TYPE *)(vm + H(i)); \
2996 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
2998 } while (i & 63); \
2999 } while (i != 0); \
3002 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3003 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3004 DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
3006 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3007 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3008 DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
3010 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3011 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3012 DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
3014 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3015 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3016 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
3018 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3019 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3020 DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
3022 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3023 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3024 DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
3026 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3027 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3028 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
3030 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3031 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3032 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3034 static inline float16 abd_h(float16 a, float16 b, float_status *s)
3036 return float16_abs(float16_sub(a, b, s));
3039 static inline float32 abd_s(float32 a, float32 b, float_status *s)
3041 return float32_abs(float32_sub(a, b, s));
3044 static inline float64 abd_d(float64 a, float64 b, float_status *s)
3046 return float64_abs(float64_sub(a, b, s));
3049 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
3050 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
3051 DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
3053 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3055 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3056 return float64_scalbn(a, b_int, s);
3059 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
3060 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
3061 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
3063 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
3064 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
3065 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
3067 #undef DO_ZPZZ_FP
3069 /* Three-operand expander, with one scalar operand, controlled by
3070 * a predicate, with the extra float_status parameter.
3072 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3073 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
3074 void *status, uint32_t desc) \
3076 intptr_t i = simd_oprsz(desc); \
3077 uint64_t *g = vg; \
3078 TYPE mm = scalar; \
3079 do { \
3080 uint64_t pg = g[(i - 1) >> 6]; \
3081 do { \
3082 i -= sizeof(TYPE); \
3083 if (likely((pg >> (i & 63)) & 1)) { \
3084 TYPE nn = *(TYPE *)(vn + H(i)); \
3085 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3087 } while (i & 63); \
3088 } while (i != 0); \
3091 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
3092 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
3093 DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
3095 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
3096 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
3097 DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
3099 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
3100 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
3101 DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
3103 static inline float16 subr_h(float16 a, float16 b, float_status *s)
3105 return float16_sub(b, a, s);
3108 static inline float32 subr_s(float32 a, float32 b, float_status *s)
3110 return float32_sub(b, a, s);
3113 static inline float64 subr_d(float64 a, float64 b, float_status *s)
3115 return float64_sub(b, a, s);
3118 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
3119 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
3120 DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
3122 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
3123 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
3124 DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
3126 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
3127 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
3128 DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
3130 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
3131 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
3132 DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
3134 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
3135 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
3136 DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
3138 /* Fully general two-operand expander, controlled by a predicate,
3139 * with the extra float_status parameter.
3141 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
3142 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3144 intptr_t i = simd_oprsz(desc); \
3145 uint64_t *g = vg; \
3146 do { \
3147 uint64_t pg = g[(i - 1) >> 6]; \
3148 do { \
3149 i -= sizeof(TYPE); \
3150 if (likely((pg >> (i & 63)) & 1)) { \
3151 TYPE nn = *(TYPE *)(vn + H(i)); \
3152 *(TYPE *)(vd + H(i)) = OP(nn, status); \
3154 } while (i & 63); \
3155 } while (i != 0); \
3158 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
3159 * FZ16. When converting from fp16, this affects flushing input denormals;
3160 * when converting to fp16, this affects flushing output denormals.
3162 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
3164 bool save = get_flush_inputs_to_zero(fpst);
3165 float32 ret;
3167 set_flush_inputs_to_zero(false, fpst);
3168 ret = float16_to_float32(f, true, fpst);
3169 set_flush_inputs_to_zero(save, fpst);
3170 return ret;
3173 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
3175 bool save = get_flush_inputs_to_zero(fpst);
3176 float64 ret;
3178 set_flush_inputs_to_zero(false, fpst);
3179 ret = float16_to_float64(f, true, fpst);
3180 set_flush_inputs_to_zero(save, fpst);
3181 return ret;
3184 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
3186 bool save = get_flush_to_zero(fpst);
3187 float16 ret;
3189 set_flush_to_zero(false, fpst);
3190 ret = float32_to_float16(f, true, fpst);
3191 set_flush_to_zero(save, fpst);
3192 return ret;
3195 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
3197 bool save = get_flush_to_zero(fpst);
3198 float16 ret;
3200 set_flush_to_zero(false, fpst);
3201 ret = float64_to_float16(f, true, fpst);
3202 set_flush_to_zero(save, fpst);
3203 return ret;
3206 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3208 if (float16_is_any_nan(f)) {
3209 float_raise(float_flag_invalid, s);
3210 return 0;
3212 return float16_to_int16_round_to_zero(f, s);
3215 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3217 if (float16_is_any_nan(f)) {
3218 float_raise(float_flag_invalid, s);
3219 return 0;
3221 return float16_to_int64_round_to_zero(f, s);
3224 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3226 if (float32_is_any_nan(f)) {
3227 float_raise(float_flag_invalid, s);
3228 return 0;
3230 return float32_to_int64_round_to_zero(f, s);
3233 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3235 if (float64_is_any_nan(f)) {
3236 float_raise(float_flag_invalid, s);
3237 return 0;
3239 return float64_to_int64_round_to_zero(f, s);
3242 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3244 if (float16_is_any_nan(f)) {
3245 float_raise(float_flag_invalid, s);
3246 return 0;
3248 return float16_to_uint16_round_to_zero(f, s);
3251 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3253 if (float16_is_any_nan(f)) {
3254 float_raise(float_flag_invalid, s);
3255 return 0;
3257 return float16_to_uint64_round_to_zero(f, s);
3260 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3262 if (float32_is_any_nan(f)) {
3263 float_raise(float_flag_invalid, s);
3264 return 0;
3266 return float32_to_uint64_round_to_zero(f, s);
3269 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3271 if (float64_is_any_nan(f)) {
3272 float_raise(float_flag_invalid, s);
3273 return 0;
3275 return float64_to_uint64_round_to_zero(f, s);
3278 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
3279 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
3280 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
3281 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
3282 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
3283 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
3285 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
3286 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
3287 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
3288 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
3289 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
3290 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
3291 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
3293 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
3294 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
3295 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
3296 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
3297 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
3298 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
3299 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
3301 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
3302 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
3303 DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
3305 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
3306 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
3307 DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
3309 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
3310 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
3311 DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
3313 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
3314 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
3315 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
3317 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
3318 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
3319 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
3320 DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
3321 DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
3322 DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
3323 DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
3325 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
3326 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
3327 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
3328 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
3329 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
3330 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
3331 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
3333 #undef DO_ZPZ_FP
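/* The fused multiply-add family below shares one implementation per
 * element size: NEG1 and NEG3 are XOR masks applied to the sign bit of
 * the first multiplicand and of the addend respectively, which is how
 * the FMLS/FNMLA/FNMLS variants are derived from the plain FMLA loop. */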
3335 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
3336 float_status *status, uint32_t desc,
3337 uint16_t neg1, uint16_t neg3)
3339 intptr_t i = simd_oprsz(desc);
3340 uint64_t *g = vg;
3342 do {
3343 uint64_t pg = g[(i - 1) >> 6];
3344 do {
3345 i -= 2;
3346 if (likely((pg >> (i & 63)) & 1)) {
3347 float16 e1, e2, e3, r;
3349 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3350 e2 = *(uint16_t *)(vm + H1_2(i));
3351 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
3352 r = float16_muladd(e1, e2, e3, 0, status);
3353 *(uint16_t *)(vd + H1_2(i)) = r;
3355 } while (i & 63);
3356 } while (i != 0);
3359 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3360 void *vg, void *status, uint32_t desc)
3362 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
3365 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3366 void *vg, void *status, uint32_t desc)
3368 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
3371 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3372 void *vg, void *status, uint32_t desc)
3374 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
3377 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3378 void *vg, void *status, uint32_t desc)
3380 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
3383 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
3384 float_status *status, uint32_t desc,
3385 uint32_t neg1, uint32_t neg3)
3387 intptr_t i = simd_oprsz(desc);
3388 uint64_t *g = vg;
3390 do {
3391 uint64_t pg = g[(i - 1) >> 6];
3392 do {
3393 i -= 4;
3394 if (likely((pg >> (i & 63)) & 1)) {
3395 float32 e1, e2, e3, r;
3397 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3398 e2 = *(uint32_t *)(vm + H1_4(i));
3399 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
3400 r = float32_muladd(e1, e2, e3, 0, status);
3401 *(uint32_t *)(vd + H1_4(i)) = r;
3403 } while (i & 63);
3404 } while (i != 0);
3407 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3408 void *vg, void *status, uint32_t desc)
3410 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
3413 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3414 void *vg, void *status, uint32_t desc)
3416 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
3419 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3420 void *vg, void *status, uint32_t desc)
3422 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
3425 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3426 void *vg, void *status, uint32_t desc)
3428 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
3431 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
3432 float_status *status, uint32_t desc,
3433 uint64_t neg1, uint64_t neg3)
3435 intptr_t i = simd_oprsz(desc);
3436 uint64_t *g = vg;
3438 do {
3439 uint64_t pg = g[(i - 1) >> 6];
3440 do {
3441 i -= 8;
3442 if (likely((pg >> (i & 63)) & 1)) {
3443 float64 e1, e2, e3, r;
3445 e1 = *(uint64_t *)(vn + i) ^ neg1;
3446 e2 = *(uint64_t *)(vm + i);
3447 e3 = *(uint64_t *)(va + i) ^ neg3;
3448 r = float64_muladd(e1, e2, e3, 0, status);
3449 *(uint64_t *)(vd + i) = r;
3451 } while (i & 63);
3452 } while (i != 0);
3455 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3456 void *vg, void *status, uint32_t desc)
3458 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
3461 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3462 void *vg, void *status, uint32_t desc)
3464 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
3467 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3468 void *vg, void *status, uint32_t desc)
3470 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
3473 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3474 void *vg, void *status, uint32_t desc)
3476 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
3479 /* Two-operand floating-point comparison controlled by a predicate.
3480 * Unlike the integer version, we are not allowed to optimistically
3481 * compare operands, since the comparison may have side effects with
3482 * respect to the FPSR.
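 * For example, a signaling NaN in an inactive element must not set the
 * cumulative Invalid Operation flag, which is why each element's guard
 * bit is tested before its operands are loaded and compared.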
3484 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
3485 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3486 void *status, uint32_t desc) \
3488 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3489 uint64_t *d = vd, *g = vg; \
3490 do { \
3491 uint64_t out = 0, pg = g[j]; \
3492 do { \
3493 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3494 if (likely((pg >> (i & 63)) & 1)) { \
3495 TYPE nn = *(TYPE *)(vn + H(i)); \
3496 TYPE mm = *(TYPE *)(vm + H(i)); \
3497 out |= OP(TYPE, nn, mm, status); \
3499 } while (i & 63); \
3500 d[j--] = out; \
3501 } while (i > 0); \
3504 #define DO_FPCMP_PPZZ_H(NAME, OP) \
3505 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
3506 #define DO_FPCMP_PPZZ_S(NAME, OP) \
3507 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
3508 #define DO_FPCMP_PPZZ_D(NAME, OP) \
3509 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
3511 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
3512 DO_FPCMP_PPZZ_H(NAME, OP) \
3513 DO_FPCMP_PPZZ_S(NAME, OP) \
3514 DO_FPCMP_PPZZ_D(NAME, OP)
3516 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
3517 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
3518 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
3519 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
3520 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
3521 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
3522 #define DO_FCMUO(TYPE, X, Y, ST) \
3523 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
3524 #define DO_FACGE(TYPE, X, Y, ST) \
3525 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
3526 #define DO_FACGT(TYPE, X, Y, ST) \
3527 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
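/* Note that the greater/less comparisons (DO_FCMGE/GT/LE/LT and the FAC*
 * absolute-value forms) use the signaling TYPE##_compare, which raises
 * Invalid Operation for any NaN input, while DO_FCMEQ, DO_FCMNE and
 * DO_FCMUO use the quiet variant. */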
3529 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
3530 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
3531 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
3532 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
3533 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
3534 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
3535 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
3537 #undef DO_FPCMP_PPZZ_ALL
3538 #undef DO_FPCMP_PPZZ_D
3539 #undef DO_FPCMP_PPZZ_S
3540 #undef DO_FPCMP_PPZZ_H
3541 #undef DO_FPCMP_PPZZ
3543 /* One operand floating-point comparison against zero, controlled
3544 * by a predicate.
3546 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
3547 void HELPER(NAME)(void *vd, void *vn, void *vg, \
3548 void *status, uint32_t desc) \
3550 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3551 uint64_t *d = vd, *g = vg; \
3552 do { \
3553 uint64_t out = 0, pg = g[j]; \
3554 do { \
3555 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3556 if ((pg >> (i & 63)) & 1) { \
3557 TYPE nn = *(TYPE *)(vn + H(i)); \
3558 out |= OP(TYPE, nn, 0, status); \
3560 } while (i & 63); \
3561 d[j--] = out; \
3562 } while (i > 0); \
3565 #define DO_FPCMP_PPZ0_H(NAME, OP) \
3566 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
3567 #define DO_FPCMP_PPZ0_S(NAME, OP) \
3568 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
3569 #define DO_FPCMP_PPZ0_D(NAME, OP) \
3570 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
3572 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
3573 DO_FPCMP_PPZ0_H(NAME, OP) \
3574 DO_FPCMP_PPZ0_S(NAME, OP) \
3575 DO_FPCMP_PPZ0_D(NAME, OP)
3577 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
3578 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
3579 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
3580 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
3581 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
3582 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3584 /* FP Trig Multiply-Add. */
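/* The 16-entry tables below hold, in the corresponding format, the
 * FTMAD coefficients: entries 0-7 approximate the sine series
 * (1, -1/3!, 1/5!, ...) and entries 8-15 the cosine series
 * (1, -1/2!, 1/4!, ...); a negative multiplicand selects the second half
 * by adding 8 to the immediate index. */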
3586 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3588 static const float16 coeff[16] = {
3589 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3590 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3592 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
3593 intptr_t x = simd_data(desc);
3594 float16 *d = vd, *n = vn, *m = vm;
3595 for (i = 0; i < opr_sz; i++) {
3596 float16 mm = m[i];
3597 intptr_t xx = x;
3598 if (float16_is_neg(mm)) {
3599 mm = float16_abs(mm);
3600 xx += 8;
3602 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
3606 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3608 static const float32 coeff[16] = {
3609 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3610 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3611 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3612 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3614 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
3615 intptr_t x = simd_data(desc);
3616 float32 *d = vd, *n = vn, *m = vm;
3617 for (i = 0; i < opr_sz; i++) {
3618 float32 mm = m[i];
3619 intptr_t xx = x;
3620 if (float32_is_neg(mm)) {
3621 mm = float32_abs(mm);
3622 xx += 8;
3624 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
3628 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3630 static const float64 coeff[16] = {
3631 0x3ff0000000000000ull, 0xbfc5555555555543ull,
3632 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
3633 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
3634 0x3de5d8408868552full, 0x0000000000000000ull,
3635 0x3ff0000000000000ull, 0xbfe0000000000000ull,
3636 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
3637 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
3638 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
3640 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
3641 intptr_t x = simd_data(desc);
3642 float64 *d = vd, *n = vn, *m = vm;
3643 for (i = 0; i < opr_sz; i++) {
3644 float64 mm = m[i];
3645 intptr_t xx = x;
3646 if (float64_is_neg(mm)) {
3647 mm = float64_abs(mm);
3648 xx += 8;
3650 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
3655 * FP Complex Add
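 * As implemented here, simd_data(desc) selects the rotation: 0 gives #90
 * (d.real = n.real - m.imag, d.imag = n.imag + m.real) and 1 gives #270
 * (d.real = n.real + m.imag, d.imag = n.imag - m.real); the sign flips
 * are applied by XORing the sign bit into the relevant element of VM
 * before the additions.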
3658 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
3659 void *vs, uint32_t desc)
3661 intptr_t j, i = simd_oprsz(desc);
3662 uint64_t *g = vg;
3663 float16 neg_imag = float16_set_sign(0, simd_data(desc));
3664 float16 neg_real = float16_chs(neg_imag);
3666 do {
3667 uint64_t pg = g[(i - 1) >> 6];
3668 do {
3669 float16 e0, e1, e2, e3;
3671 /* I holds the real index; J holds the imag index. */
3672 j = i - sizeof(float16);
3673 i -= 2 * sizeof(float16);
3675 e0 = *(float16 *)(vn + H1_2(i));
3676 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
3677 e2 = *(float16 *)(vn + H1_2(j));
3678 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
3680 if (likely((pg >> (i & 63)) & 1)) {
3681 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
3683 if (likely((pg >> (j & 63)) & 1)) {
3684 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
3686 } while (i & 63);
3687 } while (i != 0);
3690 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
3691 void *vs, uint32_t desc)
3693 intptr_t j, i = simd_oprsz(desc);
3694 uint64_t *g = vg;
3695 float32 neg_imag = float32_set_sign(0, simd_data(desc));
3696 float32 neg_real = float32_chs(neg_imag);
3698 do {
3699 uint64_t pg = g[(i - 1) >> 6];
3700 do {
3701 float32 e0, e1, e2, e3;
3703 /* I holds the real index; J holds the imag index. */
3704 j = i - sizeof(float32);
3705 i -= 2 * sizeof(float32);
3707 e0 = *(float32 *)(vn + H1_2(i));
3708 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
3709 e2 = *(float32 *)(vn + H1_2(j));
3710 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
3712 if (likely((pg >> (i & 63)) & 1)) {
3713 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
3715 if (likely((pg >> (j & 63)) & 1)) {
3716 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
3718 } while (i & 63);
3719 } while (i != 0);
3722 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
3723 void *vs, uint32_t desc)
3725 intptr_t j, i = simd_oprsz(desc);
3726 uint64_t *g = vg;
3727 float64 neg_imag = float64_set_sign(0, simd_data(desc));
3728 float64 neg_real = float64_chs(neg_imag);
3730 do {
3731 uint64_t pg = g[(i - 1) >> 6];
3732 do {
3733 float64 e0, e1, e2, e3;
3735 /* I holds the real index; J holds the imag index. */
3736 j = i - sizeof(float64);
3737 i -= 2 * sizeof(float64);
3739 e0 = *(float64 *)(vn + H1_2(i));
3740 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
3741 e2 = *(float64 *)(vn + H1_2(j));
3742 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
3744 if (likely((pg >> (i & 63)) & 1)) {
3745 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
3747 if (likely((pg >> (j & 63)) & 1)) {
3748 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
3750 } while (i & 63);
3751 } while (i != 0);
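/*
 * Shape of the three FCADD helpers above, written out for one complex
 * pair (a minimal sketch; "rot" names the 90/270-degree form selected by
 * simd_data(desc), which the code applies as an XOR of the sign bit):
 *
 *     d.real = n.real + (rot == 270 ? +m.imag : -m.imag);
 *     d.imag = n.imag + (rot == 270 ? -m.real : +m.real);
 *
 * Flipping a sign by XOR-ing neg_real/neg_imag avoids a branch per element.
 */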
3755 * FP Complex Multiply
3758 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3759 void *vg, void *status, uint32_t desc)
3761 intptr_t j, i = simd_oprsz(desc);
3762 unsigned rot = simd_data(desc);
3763 bool flip = rot & 1;
3764 float16 neg_imag, neg_real;
3765 uint64_t *g = vg;
3767 neg_imag = float16_set_sign(0, (rot & 2) != 0);
3768 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
3770 do {
3771 uint64_t pg = g[(i - 1) >> 6];
3772 do {
3773 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
3775 /* I holds the real index; J holds the imag index. */
3776 j = i - sizeof(float16);
3777 i -= 2 * sizeof(float16);
3779 nr = *(float16 *)(vn + H1_2(i));
3780 ni = *(float16 *)(vn + H1_2(j));
3781 mr = *(float16 *)(vm + H1_2(i));
3782 mi = *(float16 *)(vm + H1_2(j));
3784 e2 = (flip ? ni : nr);
3785 e1 = (flip ? mi : mr) ^ neg_real;
3786 e4 = e2;
3787 e3 = (flip ? mr : mi) ^ neg_imag;
3789 if (likely((pg >> (i & 63)) & 1)) {
3790 d = *(float16 *)(va + H1_2(i));
3791 d = float16_muladd(e2, e1, d, 0, status);
3792 *(float16 *)(vd + H1_2(i)) = d;
3794 if (likely((pg >> (j & 63)) & 1)) {
3795 d = *(float16 *)(va + H1_2(j));
3796 d = float16_muladd(e4, e3, d, 0, status);
3797 *(float16 *)(vd + H1_2(j)) = d;
3799 } while (i & 63);
3800 } while (i != 0);
3803 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3804 void *vg, void *status, uint32_t desc)
3806 intptr_t j, i = simd_oprsz(desc);
3807 unsigned rot = simd_data(desc);
3808 bool flip = rot & 1;
3809 float32 neg_imag, neg_real;
3810 uint64_t *g = vg;
3812 neg_imag = float32_set_sign(0, (rot & 2) != 0);
3813 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
3815 do {
3816 uint64_t pg = g[(i - 1) >> 6];
3817 do {
3818 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
3820 /* I holds the real index; J holds the imag index. */
3821 j = i - sizeof(float32);
3822 i -= 2 * sizeof(float32);
3824 nr = *(float32 *)(vn + H1_2(i));
3825 ni = *(float32 *)(vn + H1_2(j));
3826 mr = *(float32 *)(vm + H1_2(i));
3827 mi = *(float32 *)(vm + H1_2(j));
3829 e2 = (flip ? ni : nr);
3830 e1 = (flip ? mi : mr) ^ neg_real;
3831 e4 = e2;
3832 e3 = (flip ? mr : mi) ^ neg_imag;
3834 if (likely((pg >> (i & 63)) & 1)) {
3835 d = *(float32 *)(va + H1_2(i));
3836 d = float32_muladd(e2, e1, d, 0, status);
3837 *(float32 *)(vd + H1_2(i)) = d;
3839 if (likely((pg >> (j & 63)) & 1)) {
3840 d = *(float32 *)(va + H1_2(j));
3841 d = float32_muladd(e4, e3, d, 0, status);
3842 *(float32 *)(vd + H1_2(j)) = d;
3844 } while (i & 63);
3845 } while (i != 0);
3848 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3849 void *vg, void *status, uint32_t desc)
3851 intptr_t j, i = simd_oprsz(desc);
3852 unsigned rot = simd_data(desc);
3853 bool flip = rot & 1;
3854 float64 neg_imag, neg_real;
3855 uint64_t *g = vg;
3857 neg_imag = float64_set_sign(0, (rot & 2) != 0);
3858 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
3860 do {
3861 uint64_t pg = g[(i - 1) >> 6];
3862 do {
3863 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
3865 /* I holds the real index; J holds the imag index. */
3866 j = i - sizeof(float64);
3867 i -= 2 * sizeof(float64);
3869 nr = *(float64 *)(vn + H1_2(i));
3870 ni = *(float64 *)(vn + H1_2(j));
3871 mr = *(float64 *)(vm + H1_2(i));
3872 mi = *(float64 *)(vm + H1_2(j));
3874 e2 = (flip ? ni : nr);
3875 e1 = (flip ? mi : mr) ^ neg_real;
3876 e4 = e2;
3877 e3 = (flip ? mr : mi) ^ neg_imag;
3879 if (likely((pg >> (i & 63)) & 1)) {
3880 d = *(float64 *)(va + H1_2(i));
3881 d = float64_muladd(e2, e1, d, 0, status);
3882 *(float64 *)(vd + H1_2(i)) = d;
3884 if (likely((pg >> (j & 63)) & 1)) {
3885 d = *(float64 *)(va + H1_2(j));
3886 d = float64_muladd(e4, e3, d, 0, status);
3887 *(float64 *)(vd + H1_2(j)) = d;
3889 } while (i & 63);
3890 } while (i != 0);
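/*
 * For reference, the rotation encoding used by the three FCMLA helpers
 * above, spelled out per complex pair (an illustrative summary of what
 * flip/neg_real/neg_imag select, not additional code):
 *
 *     rot 0 (#0):    d.real += n.real * m.real;  d.imag += n.real * m.imag;
 *     rot 1 (#90):   d.real -= n.imag * m.imag;  d.imag += n.imag * m.real;
 *     rot 2 (#180):  d.real -= n.real * m.real;  d.imag -= n.real * m.imag;
 *     rot 3 (#270):  d.real += n.imag * m.imag;  d.imag -= n.imag * m.real;
 *
 * Issuing the #0 and #90 forms back to back therefore accumulates a full
 * complex product n * m into the destination.
 */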
3894 * Load contiguous data, protected by a governing predicate.
3898 * Load one element into @vd + @reg_off from @host.
3899 * The controlling predicate is known to be true.
3901 typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);
3904 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
3905 * The controlling predicate is known to be true.
3907 typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
3908 target_ulong vaddr, uintptr_t retaddr);
3911 * Generate the above primitives.
3914 #define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
3915 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
3917 TYPEM val = HOST(host); \
3918 *(TYPEE *)(vd + H(reg_off)) = val; \
3921 #define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
3922 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
3923 { HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }
3925 #define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
3926 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
3927 target_ulong addr, uintptr_t ra) \
3929 *(TYPEE *)(vd + H(reg_off)) = \
3930 (TYPEM)TLB(env, useronly_clean_ptr(addr), ra); \
3933 #define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
3934 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
3935 target_ulong addr, uintptr_t ra) \
3937 TLB(env, useronly_clean_ptr(addr), \
3938 (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \
3941 #define DO_LD_PRIM_1(NAME, H, TE, TM) \
3942 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
3943 DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
3945 DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
3946 DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
3947 DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
3948 DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
3949 DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
3950 DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t)
3951 DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t)
3953 #define DO_ST_PRIM_1(NAME, H, TE, TM) \
3954 DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \
3955 DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
3957 DO_ST_PRIM_1(bb, H1, uint8_t, uint8_t)
3958 DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
3959 DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
3960 DO_ST_PRIM_1(bd, , uint64_t, uint8_t)
3962 #define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
3963 DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \
3964 DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \
3965 DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
3966 DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
3968 #define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
3969 DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p) \
3970 DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p) \
3971 DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
3972 DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
3974 DO_LD_PRIM_2(hh, H1_2, uint16_t, uint16_t, lduw)
3975 DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
3976 DO_LD_PRIM_2(hss, H1_4, uint32_t, int16_t, lduw)
3977 DO_LD_PRIM_2(hdu, , uint64_t, uint16_t, lduw)
3978 DO_LD_PRIM_2(hds, , uint64_t, int16_t, lduw)
3980 DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
3981 DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
3982 DO_ST_PRIM_2(hd, , uint64_t, uint16_t, stw)
3984 DO_LD_PRIM_2(ss, H1_4, uint32_t, uint32_t, ldl)
3985 DO_LD_PRIM_2(sdu, , uint64_t, uint32_t, ldl)
3986 DO_LD_PRIM_2(sds, , uint64_t, int32_t, ldl)
3988 DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
3989 DO_ST_PRIM_2(sd, , uint64_t, uint32_t, stl)
3991 DO_LD_PRIM_2(dd, , uint64_t, uint64_t, ldq)
3992 DO_ST_PRIM_2(dd, , uint64_t, uint64_t, stq)
3994 #undef DO_LD_TLB
3995 #undef DO_ST_TLB
3996 #undef DO_LD_HOST
3997 #undef DO_LD_PRIM_1
3998 #undef DO_ST_PRIM_1
3999 #undef DO_LD_PRIM_2
4000 #undef DO_ST_PRIM_2
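/*
 * Naming convention for the primitives generated above: "ld1bhu" is a
 * one-register load of a byte from memory, zero-extended into a halfword
 * vector element ("bhs" would sign-extend); two-letter names such as "hh"
 * or "dd" access the element at its natural size, with _le/_be giving the
 * memory endianness.
 *
 * As a concrete example (expansion written out by hand, to show the shape
 * only), DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t) produces:
 *
 *     static void sve_ld1bhu_host(void *vd, intptr_t reg_off, void *host)
 *     {
 *         uint8_t val = ldub_p(host);
 *         *(uint16_t *)(vd + H1_2(reg_off)) = val;
 *     }
 *     static void sve_ld1bhu_tlb(CPUARMState *env, void *vd,
 *                                intptr_t reg_off, target_ulong addr,
 *                                uintptr_t ra)
 *     {
 *         *(uint16_t *)(vd + H1_2(reg_off)) =
 *             (uint8_t)cpu_ldub_data_ra(env, useronly_clean_ptr(addr), ra);
 *     }
 */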
4003 * Skip through a sequence of inactive elements in the guarding predicate @vg,
4004 * beginning at @reg_off, bounded by @reg_max. Return the offset of the first active
4005 * element >= @reg_off, or @reg_max if there were no active elements at all.
4007 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
4008 intptr_t reg_max, int esz)
4010 uint64_t pg_mask = pred_esz_masks[esz];
4011 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
4013 /* In normal usage, the first element is active. */
4014 if (likely(pg & 1)) {
4015 return reg_off;
4018 if (pg == 0) {
4019 reg_off &= -64;
4020 do {
4021 reg_off += 64;
4022 if (unlikely(reg_off >= reg_max)) {
4023 /* The entire predicate was false. */
4024 return reg_max;
4026 pg = vg[reg_off >> 6] & pg_mask;
4027 } while (pg == 0);
4029 reg_off += ctz64(pg);
4031 /* We should never see an out of range predicate bit set. */
4032 tcg_debug_assert(reg_off < reg_max);
4033 return reg_off;
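/*
 * Example (illustrative values): with esz == MO_32 the predicate mask is
 * 0x1111111111111111, so only every fourth bit is significant.  For
 * vg[0] == 0x100 and reg_off == 0, pg is 0x100: bit 0 is clear, pg is
 * non-zero, and ctz64(pg) == 8, so the function returns 8 -- the byte
 * offset of the first active 32-bit element (element index 2).
 */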
4037 * Resolve the guest virtual address to info->host and info->flags.
4038 * If @nofault, return false if the page is invalid, otherwise
4039 * exit via page fault exception.
4042 typedef struct {
4043 void *host;
4044 int flags;
4045 MemTxAttrs attrs;
4046 } SVEHostPage;
4048 static bool sve_probe_page(SVEHostPage *info, bool nofault,
4049 CPUARMState *env, target_ulong addr,
4050 int mem_off, MMUAccessType access_type,
4051 int mmu_idx, uintptr_t retaddr)
4053 int flags;
4055 addr += mem_off;
4058 * User-only currently always issues with TBI. See the comment
4059 * above useronly_clean_ptr. Usually we clean this top byte away
4060 * during translation, but we can't do that for e.g. vector + imm
4061 * addressing modes.
4063 * We currently always enable TBI for user-only, and do not provide
4064 * a way to turn it off. So clean the pointer unconditionally here,
4065 * rather than look it up here, or pass it down from above.
4067 addr = useronly_clean_ptr(addr);
4069 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
4070 &info->host, retaddr);
4071 info->flags = flags;
4073 if (flags & TLB_INVALID_MASK) {
4074 g_assert(nofault);
4075 return false;
4078 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
4079 info->host -= mem_off;
4081 #ifdef CONFIG_USER_ONLY
4082 memset(&info->attrs, 0, sizeof(info->attrs));
4083 #else
4085 * Find the iotlbentry for addr and return the transaction attributes.
4086 * This *must* be present in the TLB because we just found the mapping.
4089 uintptr_t index = tlb_index(env, mmu_idx, addr);
4091 # ifdef CONFIG_DEBUG_TCG
4092 CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
4093 target_ulong comparator = (access_type == MMU_DATA_LOAD
4094 ? entry->addr_read
4095 : tlb_addr_write(entry));
4096 g_assert(tlb_hit(comparator, addr));
4097 # endif
4099 CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
4100 info->attrs = iotlbentry->attrs;
4102 #endif
4104 return true;
4109 * Analyse contiguous data, protected by a governing predicate.
4112 typedef enum {
4113 FAULT_NO,
4114 FAULT_FIRST,
4115 FAULT_ALL,
4116 } SVEContFault;
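/*
 * FAULT_ALL is used by the ordinary predicated loads and stores (every
 * active element may fault), FAULT_FIRST by LDFF1 (only the first active
 * element may take a fault), and FAULT_NO by LDNF1 (no element may fault;
 * failures are reported through the FFR instead).
 */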
4118 typedef struct {
4120 * First and last element wholly contained within the two pages.
4121 * mem_off_first[0] and reg_off_first[0] are always set >= 0.
4122 * reg_off_last[0] may be < 0 if the first element crosses pages.
4123 * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
4124 * are set >= 0 only if there are complete elements on a second page.
4126 * The reg_off_* offsets are relative to the internal vector register.
4127 * The mem_off_first offset is relative to the memory address; the
4128 * two offsets are different when a load operation extends, a store
4129 * operation truncates, or for multi-register operations.
4131 int16_t mem_off_first[2];
4132 int16_t reg_off_first[2];
4133 int16_t reg_off_last[2];
4136 * One element that is misaligned and spans both pages,
4137 * or -1 if there is no such active element.
4139 int16_t mem_off_split;
4140 int16_t reg_off_split;
4143 * The byte offset at which the entire operation crosses a page boundary.
4144 * Set >= 0 if and only if the entire operation spans two pages.
4146 int16_t page_split;
4148 /* TLB data for the two pages. */
4149 SVEHostPage page[2];
4150 } SVEContLdSt;
4153 * Find first active element on each page, and a loose bound for the
4154 * final element on each page. Identify any single element that spans
4155 * the page boundary. Return true if there are any active elements.
4157 static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
4158 uint64_t *vg, intptr_t reg_max,
4159 int esz, int msize)
4161 const int esize = 1 << esz;
4162 const uint64_t pg_mask = pred_esz_masks[esz];
4163 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
4164 intptr_t mem_off_last, mem_off_split;
4165 intptr_t page_split, elt_split;
4166 intptr_t i;
4168 /* Set all of the element indices to -1, and the TLB data to 0. */
4169 memset(info, -1, offsetof(SVEContLdSt, page));
4170 memset(info->page, 0, sizeof(info->page));
4172 /* Gross scan over the entire predicate to find bounds. */
4173 i = 0;
4174 do {
4175 uint64_t pg = vg[i] & pg_mask;
4176 if (pg) {
4177 reg_off_last = i * 64 + 63 - clz64(pg);
4178 if (reg_off_first < 0) {
4179 reg_off_first = i * 64 + ctz64(pg);
4182 } while (++i * 64 < reg_max);
4184 if (unlikely(reg_off_first < 0)) {
4185 /* No active elements, no pages touched. */
4186 return false;
4188 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
4190 info->reg_off_first[0] = reg_off_first;
4191 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
4192 mem_off_last = (reg_off_last >> esz) * msize;
4194 page_split = -(addr | TARGET_PAGE_MASK);
4195 if (likely(mem_off_last + msize <= page_split)) {
4196 /* The entire operation fits within a single page. */
4197 info->reg_off_last[0] = reg_off_last;
4198 return true;
4201 info->page_split = page_split;
4202 elt_split = page_split / msize;
4203 reg_off_split = elt_split << esz;
4204 mem_off_split = elt_split * msize;
4207 * This is the last full element on the first page, but it is not
4208 * necessarily active. If there is no full element, i.e. the first
4209 * active element is the one that's split, this value remains -1.
4210 * It is useful as iteration bounds.
4212 if (elt_split != 0) {
4213 info->reg_off_last[0] = reg_off_split - esize;
4216 /* Determine if an unaligned element spans the pages. */
4217 if (page_split % msize != 0) {
4218 /* It is helpful to know if the split element is active. */
4219 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
4220 info->reg_off_split = reg_off_split;
4221 info->mem_off_split = mem_off_split;
4223 if (reg_off_split == reg_off_last) {
4224 /* The page crossing element is last. */
4225 return true;
4228 reg_off_split += esize;
4229 mem_off_split += msize;
4233 * We do want the first active element on the second page, because
4234 * this may affect the address reported in an exception.
4236 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
4237 tcg_debug_assert(reg_off_split <= reg_off_last);
4238 info->reg_off_first[1] = reg_off_split;
4239 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
4240 info->reg_off_last[1] = reg_off_last;
4241 return true;
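/*
 * Worked example (illustrative numbers): a 16-byte vector of 32-bit
 * elements (esz == MO_32, msize == 4), all predicate bits true, with the
 * base address 6 bytes below a page boundary.  Then page_split == 6, the
 * element at offset 4 straddles the boundary, and the result is
 *
 *     reg_off_first = { 0, 8 }    mem_off_first = { 0, 8 }
 *     reg_off_last  = { 0, 12 }   reg_off_split = mem_off_split = 4
 *
 * i.e. one whole element on the first page, one split element, and two
 * whole elements on the second page.
 */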
4245 * Resolve the guest virtual addresses to info->page[].
4246 * Control the generation of page faults with @fault. Return false if
4247 * there is no work to do, which can only happen with @fault == FAULT_NO.
4249 static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
4250 CPUARMState *env, target_ulong addr,
4251 MMUAccessType access_type, uintptr_t retaddr)
4253 int mmu_idx = cpu_mmu_index(env, false);
4254 int mem_off = info->mem_off_first[0];
4255 bool nofault = fault == FAULT_NO;
4256 bool have_work = true;
4258 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
4259 access_type, mmu_idx, retaddr)) {
4260 /* No work to be done. */
4261 return false;
4264 if (likely(info->page_split < 0)) {
4265 /* The entire operation was on the one page. */
4266 return true;
4270 * If the second page is invalid, then we want the fault address to be
4271 * the first byte on that page which is accessed.
4273 if (info->mem_off_split >= 0) {
4275 * There is an element split across the pages. The fault address
4276 * should be the first byte of the second page.
4278 mem_off = info->page_split;
4280 * If the split element is also the first active element
4281 * of the vector, then: For first-fault we should continue
4282 * to generate faults for the second page. For no-fault,
4283 * we have work only if the second page is valid.
4285 if (info->mem_off_first[0] < info->mem_off_split) {
4286 nofault = FAULT_FIRST;
4287 have_work = false;
4289 } else {
4291 * There is no element split across the pages. The fault address
4292 * should be the first active element on the second page.
4294 mem_off = info->mem_off_first[1];
4296 * There must have been one active element on the first page,
4297 * so we're out of first-fault territory.
4299 nofault = fault != FAULT_ALL;
4302 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
4303 access_type, mmu_idx, retaddr);
4304 return have_work;
4307 static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
4308 uint64_t *vg, target_ulong addr,
4309 int esize, int msize, int wp_access,
4310 uintptr_t retaddr)
4312 #ifndef CONFIG_USER_ONLY
4313 intptr_t mem_off, reg_off, reg_last;
4314 int flags0 = info->page[0].flags;
4315 int flags1 = info->page[1].flags;
4317 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
4318 return;
4321 /* Indicate that watchpoints are handled. */
4322 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
4323 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
4325 if (flags0 & TLB_WATCHPOINT) {
4326 mem_off = info->mem_off_first[0];
4327 reg_off = info->reg_off_first[0];
4328 reg_last = info->reg_off_last[0];
4330 while (reg_off <= reg_last) {
4331 uint64_t pg = vg[reg_off >> 6];
4332 do {
4333 if ((pg >> (reg_off & 63)) & 1) {
4334 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
4335 msize, info->page[0].attrs,
4336 wp_access, retaddr);
4338 reg_off += esize;
4339 mem_off += msize;
4340 } while (reg_off <= reg_last && (reg_off & 63));
4344 mem_off = info->mem_off_split;
4345 if (mem_off >= 0) {
4346 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
4347 info->page[0].attrs, wp_access, retaddr);
4350 mem_off = info->mem_off_first[1];
4351 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
4352 reg_off = info->reg_off_first[1];
4353 reg_last = info->reg_off_last[1];
4355 do {
4356 uint64_t pg = vg[reg_off >> 6];
4357 do {
4358 if ((pg >> (reg_off & 63)) & 1) {
4359 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
4360 msize, info->page[1].attrs,
4361 wp_access, retaddr);
4363 reg_off += esize;
4364 mem_off += msize;
4365 } while (reg_off & 63);
4366 } while (reg_off <= reg_last);
4368 #endif
4371 typedef uint64_t mte_check_fn(CPUARMState *, uint32_t, uint64_t, uintptr_t);
4373 static inline QEMU_ALWAYS_INLINE
4374 void sve_cont_ldst_mte_check_int(SVEContLdSt *info, CPUARMState *env,
4375 uint64_t *vg, target_ulong addr, int esize,
4376 int msize, uint32_t mtedesc, uintptr_t ra,
4377 mte_check_fn *check)
4379 intptr_t mem_off, reg_off, reg_last;
4381 /* Process the page only if MemAttr == Tagged. */
4382 if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
4383 mem_off = info->mem_off_first[0];
4384 reg_off = info->reg_off_first[0];
4385 reg_last = info->reg_off_split;
4386 if (reg_last < 0) {
4387 reg_last = info->reg_off_last[0];
4390 do {
4391 uint64_t pg = vg[reg_off >> 6];
4392 do {
4393 if ((pg >> (reg_off & 63)) & 1) {
4394 check(env, mtedesc, addr, ra);
4396 reg_off += esize;
4397 mem_off += msize;
4398 } while (reg_off <= reg_last && (reg_off & 63));
4399 } while (reg_off <= reg_last);
4402 mem_off = info->mem_off_first[1];
4403 if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
4404 reg_off = info->reg_off_first[1];
4405 reg_last = info->reg_off_last[1];
4407 do {
4408 uint64_t pg = vg[reg_off >> 6];
4409 do {
4410 if ((pg >> (reg_off & 63)) & 1) {
4411 check(env, mtedesc, addr, ra);
4413 reg_off += esize;
4414 mem_off += msize;
4415 } while (reg_off & 63);
4416 } while (reg_off <= reg_last);
4420 typedef void sve_cont_ldst_mte_check_fn(SVEContLdSt *info, CPUARMState *env,
4421 uint64_t *vg, target_ulong addr,
4422 int esize, int msize, uint32_t mtedesc,
4423 uintptr_t ra);
4425 static void sve_cont_ldst_mte_check1(SVEContLdSt *info, CPUARMState *env,
4426 uint64_t *vg, target_ulong addr,
4427 int esize, int msize, uint32_t mtedesc,
4428 uintptr_t ra)
4430 sve_cont_ldst_mte_check_int(info, env, vg, addr, esize, msize,
4431 mtedesc, ra, mte_check1);
4434 static void sve_cont_ldst_mte_checkN(SVEContLdSt *info, CPUARMState *env,
4435 uint64_t *vg, target_ulong addr,
4436 int esize, int msize, uint32_t mtedesc,
4437 uintptr_t ra)
4439 sve_cont_ldst_mte_check_int(info, env, vg, addr, esize, msize,
4440 mtedesc, ra, mte_checkN);
4445 * Common helper for all contiguous 1,2,3,4-register predicated loads.
4447 static inline QEMU_ALWAYS_INLINE
4448 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
4449 uint32_t desc, const uintptr_t retaddr,
4450 const int esz, const int msz, const int N, uint32_t mtedesc,
4451 sve_ldst1_host_fn *host_fn,
4452 sve_ldst1_tlb_fn *tlb_fn,
4453 sve_cont_ldst_mte_check_fn *mte_check_fn)
4455 const unsigned rd = simd_data(desc);
4456 const intptr_t reg_max = simd_oprsz(desc);
4457 intptr_t reg_off, reg_last, mem_off;
4458 SVEContLdSt info;
4459 void *host;
4460 int flags, i;
4462 /* Find the active elements. */
4463 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
4464 /* The entire predicate was false; no load occurs. */
4465 for (i = 0; i < N; ++i) {
4466 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
4468 return;
4471 /* Probe the page(s). Exit with exception for any invalid page. */
4472 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
4474 /* Handle watchpoints for all active elements. */
4475 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
4476 BP_MEM_READ, retaddr);
4479 * Handle mte checks for all active elements.
4480 * Since TBI must be set for MTE, !mtedesc => !mte_active.
4482 if (mte_check_fn && mtedesc) {
4483 mte_check_fn(&info, env, vg, addr, 1 << esz, N << msz,
4484 mtedesc, retaddr);
4487 flags = info.page[0].flags | info.page[1].flags;
4488 if (unlikely(flags != 0)) {
4489 #ifdef CONFIG_USER_ONLY
4490 g_assert_not_reached();
4491 #else
4493 * At least one page includes MMIO.
4494 * Any bus operation can fail with cpu_transaction_failed,
4495 * which for ARM will raise SyncExternal. Perform the load
4496 * into scratch memory to preserve register state until the end.
4498 ARMVectorReg scratch[4] = { };
4500 mem_off = info.mem_off_first[0];
4501 reg_off = info.reg_off_first[0];
4502 reg_last = info.reg_off_last[1];
4503 if (reg_last < 0) {
4504 reg_last = info.reg_off_split;
4505 if (reg_last < 0) {
4506 reg_last = info.reg_off_last[0];
4510 do {
4511 uint64_t pg = vg[reg_off >> 6];
4512 do {
4513 if ((pg >> (reg_off & 63)) & 1) {
4514 for (i = 0; i < N; ++i) {
4515 tlb_fn(env, &scratch[i], reg_off,
4516 addr + mem_off + (i << msz), retaddr);
4519 reg_off += 1 << esz;
4520 mem_off += N << msz;
4521 } while (reg_off & 63);
4522 } while (reg_off <= reg_last);
4524 for (i = 0; i < N; ++i) {
4525 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
4527 return;
4528 #endif
4531 /* The entire operation is in RAM, on valid pages. */
4533 for (i = 0; i < N; ++i) {
4534 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
4537 mem_off = info.mem_off_first[0];
4538 reg_off = info.reg_off_first[0];
4539 reg_last = info.reg_off_last[0];
4540 host = info.page[0].host;
4542 while (reg_off <= reg_last) {
4543 uint64_t pg = vg[reg_off >> 6];
4544 do {
4545 if ((pg >> (reg_off & 63)) & 1) {
4546 for (i = 0; i < N; ++i) {
4547 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
4548 host + mem_off + (i << msz));
4551 reg_off += 1 << esz;
4552 mem_off += N << msz;
4553 } while (reg_off <= reg_last && (reg_off & 63));
4557 * Use the slow path to manage the cross-page misalignment.
4558 * But we know this is RAM and cannot trap.
4560 mem_off = info.mem_off_split;
4561 if (unlikely(mem_off >= 0)) {
4562 reg_off = info.reg_off_split;
4563 for (i = 0; i < N; ++i) {
4564 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
4565 addr + mem_off + (i << msz), retaddr);
4569 mem_off = info.mem_off_first[1];
4570 if (unlikely(mem_off >= 0)) {
4571 reg_off = info.reg_off_first[1];
4572 reg_last = info.reg_off_last[1];
4573 host = info.page[1].host;
4575 do {
4576 uint64_t pg = vg[reg_off >> 6];
4577 do {
4578 if ((pg >> (reg_off & 63)) & 1) {
4579 for (i = 0; i < N; ++i) {
4580 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
4581 host + mem_off + (i << msz));
4584 reg_off += 1 << esz;
4585 mem_off += N << msz;
4586 } while (reg_off & 63);
4587 } while (reg_off <= reg_last);
4591 static inline QEMU_ALWAYS_INLINE
4592 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
4593 uint32_t desc, const uintptr_t ra,
4594 const int esz, const int msz, const int N,
4595 sve_ldst1_host_fn *host_fn,
4596 sve_ldst1_tlb_fn *tlb_fn)
4598 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4599 int bit55 = extract64(addr, 55, 1);
4601 /* Remove mtedesc from the normal sve descriptor. */
4602 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4604 /* Perform gross MTE suppression early. */
4605 if (!tbi_check(desc, bit55) ||
4606 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
4607 mtedesc = 0;
4610 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn,
4611 N == 1 ? sve_cont_ldst_mte_check1 : sve_cont_ldst_mte_checkN);
4614 #define DO_LD1_1(NAME, ESZ) \
4615 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
4616 target_ulong addr, uint32_t desc) \
4618 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
4619 sve_##NAME##_host, sve_##NAME##_tlb, NULL); \
4621 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
4622 target_ulong addr, uint32_t desc) \
4624 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
4625 sve_##NAME##_host, sve_##NAME##_tlb); \
4628 #define DO_LD1_2(NAME, ESZ, MSZ) \
4629 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
4630 target_ulong addr, uint32_t desc) \
4632 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
4633 sve_##NAME##_le_host, sve_##NAME##_le_tlb, NULL); \
4635 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
4636 target_ulong addr, uint32_t desc) \
4638 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
4639 sve_##NAME##_be_host, sve_##NAME##_be_tlb, NULL); \
4641 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
4642 target_ulong addr, uint32_t desc) \
4644 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
4645 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
4647 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
4648 target_ulong addr, uint32_t desc) \
4650 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
4651 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
4654 DO_LD1_1(ld1bb, MO_8)
4655 DO_LD1_1(ld1bhu, MO_16)
4656 DO_LD1_1(ld1bhs, MO_16)
4657 DO_LD1_1(ld1bsu, MO_32)
4658 DO_LD1_1(ld1bss, MO_32)
4659 DO_LD1_1(ld1bdu, MO_64)
4660 DO_LD1_1(ld1bds, MO_64)
4662 DO_LD1_2(ld1hh, MO_16, MO_16)
4663 DO_LD1_2(ld1hsu, MO_32, MO_16)
4664 DO_LD1_2(ld1hss, MO_32, MO_16)
4665 DO_LD1_2(ld1hdu, MO_64, MO_16)
4666 DO_LD1_2(ld1hds, MO_64, MO_16)
4668 DO_LD1_2(ld1ss, MO_32, MO_32)
4669 DO_LD1_2(ld1sdu, MO_64, MO_32)
4670 DO_LD1_2(ld1sds, MO_64, MO_32)
4672 DO_LD1_2(ld1dd, MO_64, MO_64)
4674 #undef DO_LD1_1
4675 #undef DO_LD1_2
4677 #define DO_LDN_1(N) \
4678 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
4679 target_ulong addr, uint32_t desc) \
4681 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
4682 sve_ld1bb_host, sve_ld1bb_tlb, NULL); \
4684 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
4685 target_ulong addr, uint32_t desc) \
4687 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
4688 sve_ld1bb_host, sve_ld1bb_tlb); \
4691 #define DO_LDN_2(N, SUFF, ESZ) \
4692 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
4693 target_ulong addr, uint32_t desc) \
4695 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
4696 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb, NULL); \
4698 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
4699 target_ulong addr, uint32_t desc) \
4701 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
4702 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb, NULL); \
4704 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
4705 target_ulong addr, uint32_t desc) \
4707 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
4708 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
4710 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
4711 target_ulong addr, uint32_t desc) \
4713 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
4714 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
4717 DO_LDN_1(2)
4718 DO_LDN_1(3)
4719 DO_LDN_1(4)
4721 DO_LDN_2(2, hh, MO_16)
4722 DO_LDN_2(3, hh, MO_16)
4723 DO_LDN_2(4, hh, MO_16)
4725 DO_LDN_2(2, ss, MO_32)
4726 DO_LDN_2(3, ss, MO_32)
4727 DO_LDN_2(4, ss, MO_32)
4729 DO_LDN_2(2, dd, MO_64)
4730 DO_LDN_2(3, dd, MO_64)
4731 DO_LDN_2(4, dd, MO_64)
4733 #undef DO_LDN_1
4734 #undef DO_LDN_2
4737 * Load contiguous data, first-fault and no-fault.
4739 * For user-only, one could argue that we should hold the mmap_lock during
4740 * the operation so that there is no race between page_check_range and the
4741 * load operation. However, unmapping pages out from under a running thread
4742 * is extraordinarily unlikely. This theoretical race condition also affects
4743 * linux-user/ in its get_user/put_user macros.
4745 * TODO: Construct some helpers, written in assembly, that interact with
4746 * handle_cpu_signal to produce memory ops which can properly report errors
4747 * without racing.
4750 /* Fault on byte I. All bits in FFR from I are cleared. The vector
4751 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
4752 * option, which leaves subsequent data unchanged.
4754 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
4756 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
4758 if (i & 63) {
4759 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
4760 i = ROUND_UP(i, 64);
4762 for (; i < oprsz; i += 64) {
4763 ffr[i / 64] = 0;
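/*
 * Example (illustrative): record_fault(env, 72, 256) keeps bits 0-7 of
 * ffr[1] (the elements below byte offset 72 that completed), clears
 * bits 8-63 of ffr[1], and zeroes ffr[2] and ffr[3]; ffr[0] is untouched.
 */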
4768 * Common helper for all contiguous no-fault and first-fault loads.
4770 static inline QEMU_ALWAYS_INLINE
4771 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
4772 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
4773 const int esz, const int msz, const SVEContFault fault,
4774 sve_ldst1_host_fn *host_fn,
4775 sve_ldst1_tlb_fn *tlb_fn)
4777 const unsigned rd = simd_data(desc);
4778 void *vd = &env->vfp.zregs[rd];
4779 const intptr_t reg_max = simd_oprsz(desc);
4780 intptr_t reg_off, mem_off, reg_last;
4781 SVEContLdSt info;
4782 int flags;
4783 void *host;
4785 /* Find the active elements. */
4786 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
4787 /* The entire predicate was false; no load occurs. */
4788 memset(vd, 0, reg_max);
4789 return;
4791 reg_off = info.reg_off_first[0];
4793 /* Probe the page(s). */
4794 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
4795 /* Fault on first element. */
4796 tcg_debug_assert(fault == FAULT_NO);
4797 memset(vd, 0, reg_max);
4798 goto do_fault;
4801 mem_off = info.mem_off_first[0];
4802 flags = info.page[0].flags;
4805 * Disable MTE checking if the Tagged bit is not set. Since TBI must
4806 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
4808 if (!arm_tlb_mte_tagged(&info.page[0].attrs)) {
4809 mtedesc = 0;
4812 if (fault == FAULT_FIRST) {
4813 /* Trapping mte check for the first-fault element. */
4814 if (mtedesc) {
4815 mte_check1(env, mtedesc, addr + mem_off, retaddr);
4819 * Special handling of the first active element,
4820 * if it crosses a page boundary or is MMIO.
4822 bool is_split = mem_off == info.mem_off_split;
4823 if (unlikely(flags != 0) || unlikely(is_split)) {
4825 * Use the slow path for cross-page handling.
4826 * Might trap for MMIO or watchpoints.
4828 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
4830 /* After any fault, zero the other elements. */
4831 swap_memzero(vd, reg_off);
4832 reg_off += 1 << esz;
4833 mem_off += 1 << msz;
4834 swap_memzero(vd + reg_off, reg_max - reg_off);
4836 if (is_split) {
4837 goto second_page;
4839 } else {
4840 memset(vd, 0, reg_max);
4842 } else {
4843 memset(vd, 0, reg_max);
4844 if (unlikely(mem_off == info.mem_off_split)) {
4845 /* The first active element crosses a page boundary. */
4846 flags |= info.page[1].flags;
4847 if (unlikely(flags & TLB_MMIO)) {
4848 /* Some page is MMIO, see below. */
4849 goto do_fault;
4851 if (unlikely(flags & TLB_WATCHPOINT) &&
4852 (cpu_watchpoint_address_matches
4853 (env_cpu(env), addr + mem_off, 1 << msz)
4854 & BP_MEM_READ)) {
4855 /* Watchpoint hit, see below. */
4856 goto do_fault;
4858 if (mtedesc && !mte_probe1(env, mtedesc, addr + mem_off)) {
4859 goto do_fault;
4862 * Use the slow path for cross-page handling.
4863 * This is RAM, without a watchpoint, and will not trap.
4865 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
4866 goto second_page;
4871 * From this point on, all memory operations are MemSingleNF.
4873 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
4874 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
4876 * Unfortunately we do not have access to the memory attributes from the
4877 * PTE to tell Device memory from Normal memory. So we make a mostly
4878 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
4879 * This gives the right answer for the common cases of "Normal memory,
4880 * backed by host RAM" and "Device memory, backed by MMIO".
4881 * The architecture allows us to suppress an NF load and return
4882 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
4883 * case of "Normal memory, backed by MMIO" is permitted. The case we
4884 * get wrong is "Device memory, backed by host RAM", for which we
4885 * should return (UNKNOWN, FAULT) but do not.
4887 * Similarly, CPU_BP breakpoints would raise exceptions, and so
4888 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
4889 * architectural breakpoints the same.
4891 if (unlikely(flags & TLB_MMIO)) {
4892 goto do_fault;
4895 reg_last = info.reg_off_last[0];
4896 host = info.page[0].host;
4898 do {
4899 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
4900 do {
4901 if ((pg >> (reg_off & 63)) & 1) {
4902 if (unlikely(flags & TLB_WATCHPOINT) &&
4903 (cpu_watchpoint_address_matches
4904 (env_cpu(env), addr + mem_off, 1 << msz)
4905 & BP_MEM_READ)) {
4906 goto do_fault;
4908 if (mtedesc && !mte_probe1(env, mtedesc, addr + mem_off)) {
4909 goto do_fault;
4911 host_fn(vd, reg_off, host + mem_off);
4913 reg_off += 1 << esz;
4914 mem_off += 1 << msz;
4915 } while (reg_off <= reg_last && (reg_off & 63));
4916 } while (reg_off <= reg_last);
4919 * MemSingleNF is allowed to fail for any reason. We have special
4920 * code above to handle the first element crossing a page boundary.
4921 * As an implementation choice, decline to handle a cross-page element
4922 * in any other position.
4924 reg_off = info.reg_off_split;
4925 if (reg_off >= 0) {
4926 goto do_fault;
4929 second_page:
4930 reg_off = info.reg_off_first[1];
4931 if (likely(reg_off < 0)) {
4932 /* No active elements on the second page. All done. */
4933 return;
4937 * MemSingleNF is allowed to fail for any reason. As an implementation
4938 * choice, decline to handle elements on the second page. This should
4939 * be low frequency as the guest walks through memory -- the next
4940 * iteration of the guest's loop should be aligned on the page boundary,
4941 * and then all following iterations will stay aligned.
4944 do_fault:
4945 record_fault(env, reg_off, reg_max);
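/*
 * The guest-visible contract of the helper above: for LDFF1 the first
 * active element may still take a real fault; after that, and for LDNF1
 * throughout, a problem element stops the load, record_fault() clears the
 * FFR bits from that element onwards, and the guest is expected to consult
 * the FFR and retry the remainder.  Declining to handle the awkward cases
 * (second page, mid-vector page crossing) therefore only costs an extra
 * iteration of that guest retry loop.
 */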
4948 static inline QEMU_ALWAYS_INLINE
4949 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
4950 uint32_t desc, const uintptr_t retaddr,
4951 const int esz, const int msz, const SVEContFault fault,
4952 sve_ldst1_host_fn *host_fn,
4953 sve_ldst1_tlb_fn *tlb_fn)
4955 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4956 int bit55 = extract64(addr, 55, 1);
4958 /* Remove mtedesc from the normal sve descriptor. */
4959 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4961 /* Perform gross MTE suppression early. */
4962 if (!tbi_check(desc, bit55) ||
4963 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
4964 mtedesc = 0;
4967 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
4968 esz, msz, fault, host_fn, tlb_fn);
4971 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
4972 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
4973 target_ulong addr, uint32_t desc) \
4975 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
4976 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
4978 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
4979 target_ulong addr, uint32_t desc) \
4981 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
4982 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
4984 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
4985 target_ulong addr, uint32_t desc) \
4987 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
4988 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
4990 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
4991 target_ulong addr, uint32_t desc) \
4993 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
4994 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
4997 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
4998 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
4999 target_ulong addr, uint32_t desc) \
5001 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
5002 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5004 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
5005 target_ulong addr, uint32_t desc) \
5007 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
5008 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5010 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
5011 target_ulong addr, uint32_t desc) \
5013 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
5014 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5016 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
5017 target_ulong addr, uint32_t desc) \
5019 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
5020 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5022 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
5023 target_ulong addr, uint32_t desc) \
5025 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
5026 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5028 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
5029 target_ulong addr, uint32_t desc) \
5031 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
5032 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5034 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
5035 target_ulong addr, uint32_t desc) \
5037 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
5038 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5040 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
5041 target_ulong addr, uint32_t desc) \
5043 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
5044 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5047 DO_LDFF1_LDNF1_1(bb, MO_8)
5048 DO_LDFF1_LDNF1_1(bhu, MO_16)
5049 DO_LDFF1_LDNF1_1(bhs, MO_16)
5050 DO_LDFF1_LDNF1_1(bsu, MO_32)
5051 DO_LDFF1_LDNF1_1(bss, MO_32)
5052 DO_LDFF1_LDNF1_1(bdu, MO_64)
5053 DO_LDFF1_LDNF1_1(bds, MO_64)
5055 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
5056 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
5057 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
5058 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
5059 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
5061 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
5062 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
5063 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
5065 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
5067 #undef DO_LDFF1_LDNF1_1
5068 #undef DO_LDFF1_LDNF1_2
5071 * Common helper for all contiguous 1,2,3,4-register predicated stores.
5074 static inline QEMU_ALWAYS_INLINE
5075 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
5076 uint32_t desc, const uintptr_t retaddr,
5077 const int esz, const int msz, const int N, uint32_t mtedesc,
5078 sve_ldst1_host_fn *host_fn,
5079 sve_ldst1_tlb_fn *tlb_fn,
5080 sve_cont_ldst_mte_check_fn *mte_check_fn)
5082 const unsigned rd = simd_data(desc);
5083 const intptr_t reg_max = simd_oprsz(desc);
5084 intptr_t reg_off, reg_last, mem_off;
5085 SVEContLdSt info;
5086 void *host;
5087 int i, flags;
5089 /* Find the active elements. */
5090 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5091 /* The entire predicate was false; no store occurs. */
5092 return;
5095 /* Probe the page(s). Exit with exception for any invalid page. */
5096 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
5098 /* Handle watchpoints for all active elements. */
5099 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5100 BP_MEM_WRITE, retaddr);
5103 * Handle mte checks for all active elements.
5104 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5106 if (mte_check_fn && mtedesc) {
5107 mte_check_fn(&info, env, vg, addr, 1 << esz, N << msz,
5108 mtedesc, retaddr);
5111 flags = info.page[0].flags | info.page[1].flags;
5112 if (unlikely(flags != 0)) {
5113 #ifdef CONFIG_USER_ONLY
5114 g_assert_not_reached();
5115 #else
5117 * At least one page includes MMIO.
5118 * Any bus operation can fail with cpu_transaction_failed,
5119 * which for ARM will raise SyncExternal. We cannot avoid
5120 * this fault and will leave with the store incomplete.
5122 mem_off = info.mem_off_first[0];
5123 reg_off = info.reg_off_first[0];
5124 reg_last = info.reg_off_last[1];
5125 if (reg_last < 0) {
5126 reg_last = info.reg_off_split;
5127 if (reg_last < 0) {
5128 reg_last = info.reg_off_last[0];
5132 do {
5133 uint64_t pg = vg[reg_off >> 6];
5134 do {
5135 if ((pg >> (reg_off & 63)) & 1) {
5136 for (i = 0; i < N; ++i) {
5137 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5138 addr + mem_off + (i << msz), retaddr);
5141 reg_off += 1 << esz;
5142 mem_off += N << msz;
5143 } while (reg_off & 63);
5144 } while (reg_off <= reg_last);
5145 return;
5146 #endif
5149 mem_off = info.mem_off_first[0];
5150 reg_off = info.reg_off_first[0];
5151 reg_last = info.reg_off_last[0];
5152 host = info.page[0].host;
5154 while (reg_off <= reg_last) {
5155 uint64_t pg = vg[reg_off >> 6];
5156 do {
5157 if ((pg >> (reg_off & 63)) & 1) {
5158 for (i = 0; i < N; ++i) {
5159 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5160 host + mem_off + (i << msz));
5163 reg_off += 1 << esz;
5164 mem_off += N << msz;
5165 } while (reg_off <= reg_last && (reg_off & 63));
5169 * Use the slow path to manage the cross-page misalignment.
5170 * But we know this is RAM and cannot trap.
5172 mem_off = info.mem_off_split;
5173 if (unlikely(mem_off >= 0)) {
5174 reg_off = info.reg_off_split;
5175 for (i = 0; i < N; ++i) {
5176 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5177 addr + mem_off + (i << msz), retaddr);
5181 mem_off = info.mem_off_first[1];
5182 if (unlikely(mem_off >= 0)) {
5183 reg_off = info.reg_off_first[1];
5184 reg_last = info.reg_off_last[1];
5185 host = info.page[1].host;
5187 do {
5188 uint64_t pg = vg[reg_off >> 6];
5189 do {
5190 if ((pg >> (reg_off & 63)) & 1) {
5191 for (i = 0; i < N; ++i) {
5192 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5193 host + mem_off + (i << msz));
5196 reg_off += 1 << esz;
5197 mem_off += N << msz;
5198 } while (reg_off & 63);
5199 } while (reg_off <= reg_last);
5203 static inline QEMU_ALWAYS_INLINE
5204 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5205 uint32_t desc, const uintptr_t ra,
5206 const int esz, const int msz, const int N,
5207 sve_ldst1_host_fn *host_fn,
5208 sve_ldst1_tlb_fn *tlb_fn)
5210 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5211 int bit55 = extract64(addr, 55, 1);
5213 /* Remove mtedesc from the normal sve descriptor. */
5214 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5216 /* Perform gross MTE suppression early. */
5217 if (!tbi_check(desc, bit55) ||
5218 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5219 mtedesc = 0;
5222 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn,
5223 N == 1 ? sve_cont_ldst_mte_check1 : sve_cont_ldst_mte_checkN);
5226 #define DO_STN_1(N, NAME, ESZ) \
5227 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
5228 target_ulong addr, uint32_t desc) \
5230 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
5231 sve_st1##NAME##_host, sve_st1##NAME##_tlb, NULL); \
5233 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
5234 target_ulong addr, uint32_t desc) \
5236 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
5237 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
5240 #define DO_STN_2(N, NAME, ESZ, MSZ) \
5241 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
5242 target_ulong addr, uint32_t desc) \
5244 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
5245 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb, NULL); \
5247 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
5248 target_ulong addr, uint32_t desc) \
5250 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
5251 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb, NULL); \
5253 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
5254 target_ulong addr, uint32_t desc) \
5256 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
5257 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
5259 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
5260 target_ulong addr, uint32_t desc) \
5262 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
5263 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
5266 DO_STN_1(1, bb, MO_8)
5267 DO_STN_1(1, bh, MO_16)
5268 DO_STN_1(1, bs, MO_32)
5269 DO_STN_1(1, bd, MO_64)
5270 DO_STN_1(2, bb, MO_8)
5271 DO_STN_1(3, bb, MO_8)
5272 DO_STN_1(4, bb, MO_8)
5274 DO_STN_2(1, hh, MO_16, MO_16)
5275 DO_STN_2(1, hs, MO_32, MO_16)
5276 DO_STN_2(1, hd, MO_64, MO_16)
5277 DO_STN_2(2, hh, MO_16, MO_16)
5278 DO_STN_2(3, hh, MO_16, MO_16)
5279 DO_STN_2(4, hh, MO_16, MO_16)
5281 DO_STN_2(1, ss, MO_32, MO_32)
5282 DO_STN_2(1, sd, MO_64, MO_32)
5283 DO_STN_2(2, ss, MO_32, MO_32)
5284 DO_STN_2(3, ss, MO_32, MO_32)
5285 DO_STN_2(4, ss, MO_32, MO_32)
5287 DO_STN_2(1, dd, MO_64, MO_64)
5288 DO_STN_2(2, dd, MO_64, MO_64)
5289 DO_STN_2(3, dd, MO_64, MO_64)
5290 DO_STN_2(4, dd, MO_64, MO_64)
5292 #undef DO_STN_1
5293 #undef DO_STN_2
5296 * Loads with a vector index.
5300 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
5302 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
5304 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
5306 return *(uint32_t *)(reg + H1_4(reg_ofs));
5309 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
5311 return *(int32_t *)(reg + H1_4(reg_ofs));
5314 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
5316 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
5319 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
5321 return (int32_t)*(uint64_t *)(reg + reg_ofs);
5324 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
5326 return *(uint64_t *)(reg + reg_ofs);
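/*
 * These five accessors cover the gather/scatter index forms: 32-bit
 * unsigned or signed indices packed in a 32-bit element vector (off_zsu_s,
 * off_zss_s), the same 32-bit indices held in 64-bit elements (off_zsu_d,
 * off_zss_d), and full 64-bit indices (off_zd_d).  The element address is
 * then base + (index << scale); e.g. with scale == 2, an index of 3 from
 * off_zss_s addresses base + 12, and an index of -1 addresses, in effect,
 * base - 4.
 */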
5329 static inline QEMU_ALWAYS_INLINE
5330 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5331 target_ulong base, uint32_t desc, uintptr_t retaddr,
5332 uint32_t mtedesc, int esize, int msize,
5333 zreg_off_fn *off_fn,
5334 sve_ldst1_host_fn *host_fn,
5335 sve_ldst1_tlb_fn *tlb_fn)
5337 const int mmu_idx = cpu_mmu_index(env, false);
5338 const intptr_t reg_max = simd_oprsz(desc);
5339 const int scale = simd_data(desc);
5340 ARMVectorReg scratch;
5341 intptr_t reg_off;
5342 SVEHostPage info, info2;
5344 memset(&scratch, 0, reg_max);
5345 reg_off = 0;
5346 do {
5347 uint64_t pg = vg[reg_off >> 6];
5348 do {
5349 if (likely(pg & 1)) {
5350 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
5351 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
5353 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
5354 mmu_idx, retaddr);
5356 if (likely(in_page >= msize)) {
5357 if (unlikely(info.flags & TLB_WATCHPOINT)) {
5358 cpu_check_watchpoint(env_cpu(env), addr, msize,
5359 info.attrs, BP_MEM_READ, retaddr);
5361 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
5362 mte_check1(env, mtedesc, addr, retaddr);
5364 host_fn(&scratch, reg_off, info.host);
5365 } else {
5366 /* Element crosses the page boundary. */
5367 sve_probe_page(&info2, false, env, addr + in_page, 0,
5368 MMU_DATA_LOAD, mmu_idx, retaddr);
5369 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
5370 cpu_check_watchpoint(env_cpu(env), addr,
5371 msize, info.attrs,
5372 BP_MEM_READ, retaddr);
5374 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
5375 mte_check1(env, mtedesc, addr, retaddr);
5377 tlb_fn(env, &scratch, reg_off, addr, retaddr);
5380 reg_off += esize;
5381 pg >>= esize;
5382 } while (reg_off & 63);
5383 } while (reg_off < reg_max);
5385 /* Wait until all exceptions have been raised to write back. */
5386 memcpy(vd, &scratch, reg_max);
5389 static inline QEMU_ALWAYS_INLINE
5390 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5391 target_ulong base, uint32_t desc, uintptr_t retaddr,
5392 int esize, int msize, zreg_off_fn *off_fn,
5393 sve_ldst1_host_fn *host_fn,
5394 sve_ldst1_tlb_fn *tlb_fn)
5396 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5397 /* Remove mtedesc from the normal sve descriptor. */
5398 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5401 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
5402 * offset base entirely over the address space hole to change the
5403 * pointer tag, or change the bit55 selector. So we could here
5404 * examine TBI + TCMA like we do for sve_ldN_r_mte().
5406 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
5407 esize, msize, off_fn, host_fn, tlb_fn);
5410 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
5411 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5412 void *vm, target_ulong base, uint32_t desc) \
5414 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
5415 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5417 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
5418 void *vm, target_ulong base, uint32_t desc) \
5420 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
5421 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5424 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
5425 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5426 void *vm, target_ulong base, uint32_t desc) \
5428 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
5429 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5431 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
5432 void *vm, target_ulong base, uint32_t desc) \
5434 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
5435 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5438 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
5439 DO_LD1_ZPZ_S(bsu, zss, MO_8)
5440 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
5441 DO_LD1_ZPZ_D(bdu, zss, MO_8)
5442 DO_LD1_ZPZ_D(bdu, zd, MO_8)
5444 DO_LD1_ZPZ_S(bss, zsu, MO_8)
5445 DO_LD1_ZPZ_S(bss, zss, MO_8)
5446 DO_LD1_ZPZ_D(bds, zsu, MO_8)
5447 DO_LD1_ZPZ_D(bds, zss, MO_8)
5448 DO_LD1_ZPZ_D(bds, zd, MO_8)
5450 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
5451 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
5452 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
5453 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
5454 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
5456 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
5457 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
5458 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
5459 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
5460 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
5462 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
5463 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
5464 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
5465 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
5466 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
5468 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
5469 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
5470 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
5471 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
5472 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
5474 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
5475 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
5476 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
5477 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
5478 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
5480 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
5481 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
5482 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
5483 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
5484 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
5486 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
5487 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
5488 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
5490 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
5491 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
5492 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
5494 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
5495 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
5496 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
5498 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
5499 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
5500 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
5502 #undef DO_LD1_ZPZ_S
5503 #undef DO_LD1_ZPZ_D
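/*
 * Decoding the helper names generated above: the MEM part is the usual
 * memory/element/extend/endian tag (e.g. "hdu_le" is a little-endian
 * halfword loaded and zero-extended into a doubleword element), while the
 * OFS suffix selects the index form -- zsu/zss for 32-bit unsigned/signed
 * indices and zd for 64-bit indices.  So sve_ldhdu_le_zss is the gather
 * LD1H into 64-bit elements using sign-extended 32-bit offsets.
 */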
5505 /* First fault loads with a vector index. */
5508 * Common helpers for all gather first-faulting loads.
5511 static inline QEMU_ALWAYS_INLINE
5512 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5513 target_ulong base, uint32_t desc, uintptr_t retaddr,
5514 uint32_t mtedesc, const int esz, const int msz,
5515 zreg_off_fn *off_fn,
5516 sve_ldst1_host_fn *host_fn,
5517 sve_ldst1_tlb_fn *tlb_fn)
5519 const int mmu_idx = cpu_mmu_index(env, false);
5520 const intptr_t reg_max = simd_oprsz(desc);
5521 const int scale = simd_data(desc);
5522 const int esize = 1 << esz;
5523 const int msize = 1 << msz;
5524 intptr_t reg_off;
5525 SVEHostPage info;
5526 target_ulong addr, in_page;
5528 /* Skip to the first true predicate. */
5529 reg_off = find_next_active(vg, 0, reg_max, esz);
5530 if (unlikely(reg_off >= reg_max)) {
5531 /* The entire predicate was false; no load occurs. */
5532 memset(vd, 0, reg_max);
5533 return;
5537 * Probe the first element, allowing faults.
5539 addr = base + (off_fn(vm, reg_off) << scale);
5540 if (mtedesc) {
5541 mte_check1(env, mtedesc, addr, retaddr);
5543 tlb_fn(env, vd, reg_off, addr, retaddr);
5545 /* After any fault, zero the other elements. */
5546 swap_memzero(vd, reg_off);
5547 reg_off += esize;
5548 swap_memzero(vd + reg_off, reg_max - reg_off);
5551 * Probe the remaining elements, not allowing faults.
    while (reg_off < reg_max) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if (likely((pg >> (reg_off & 63)) & 1)) {
                addr = base + (off_fn(vm, reg_off) << scale);
                in_page = -(addr | TARGET_PAGE_MASK);

                if (unlikely(in_page < msize)) {
                    /* Stop if the element crosses a page boundary. */
                    goto fault;
                }

                sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
                               mmu_idx, retaddr);
                if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
                    goto fault;
                }
                if (unlikely(info.flags & TLB_WATCHPOINT) &&
                    (cpu_watchpoint_address_matches
                     (env_cpu(env), addr, msize) & BP_MEM_READ)) {
                    goto fault;
                }
                if (mtedesc &&
                    arm_tlb_mte_tagged(&info.attrs) &&
                    !mte_probe1(env, mtedesc, addr)) {
                    goto fault;
                }

                host_fn(vd, reg_off, info.host);
            }
            reg_off += esize;
        } while (reg_off & 63);
    }
    return;

 fault:
    record_fault(env, reg_off, reg_max);
}
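/*
 * The *_mte entry points carry the MTE descriptor in the upper bits of
 * the simd descriptor; the wrapper below peels it off and forwards it.
 * The non-MTE entry points pass mtedesc == 0, which disables the
 * mte_check1()/mte_probe1() paths above.
 */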
static inline QEMU_ALWAYS_INLINE
void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                     target_ulong base, uint32_t desc, uintptr_t retaddr,
                     const int esz, const int msz,
                     zreg_off_fn *off_fn,
                     sve_ldst1_host_fn *host_fn,
                     sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
     * offset base entirely over the address space hole to change the
     * pointer tag, or change the bit55 selector.  So we could here
     * examine TBI + TCMA like we do for sve_ldN_r_mte().
     */
    sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
                esz, msz, off_fn, host_fn, tlb_fn);
}
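/*
 * In the helper names generated below, MEM encodes the memory access:
 * the first letter gives the memory element size (b/h/s/d), the second
 * the destination element size (s = 32-bit, d = 64-bit), u/s selects
 * zero- or sign-extension (omitted when the sizes match), and _le/_be
 * the memory endianness.  OFS selects how the index vector is read:
 * zsu and zss are zero- and sign-extended 32-bit offsets, zd is 64-bit
 * offsets.
 */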
#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
void HELPER(sve_ldff##MEM##_##OFS)                                      \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ,    \
                off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}                                                                       \
void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,   \
                    off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}

#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
void HELPER(sve_ldff##MEM##_##OFS)                                      \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ,    \
                off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}                                                                       \
void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,   \
                    off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
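/*
 * For example, DO_LDFF1_ZPZ_D(bdu, zsu, MO_8) expands to
 * helper_sve_ldffbdu_zsu() and helper_sve_ldffbdu_zsu_mte(), loading
 * bytes zero-extended into 64-bit elements from addresses formed with
 * zero-extended 32-bit offsets:
 *
 *   void HELPER(sve_ldffbdu_zsu)(CPUARMState *env, void *vd, void *vg,
 *                                void *vm, target_ulong base, uint32_t desc)
 *   {
 *       sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MO_8,
 *                   off_zsu_d, sve_ld1bdu_host, sve_ld1bdu_tlb);
 *   }
 */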
DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zd, MO_8)

DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
DO_LDFF1_ZPZ_S(bss, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
DO_LDFF1_ZPZ_D(bds, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zd, MO_8)

DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)

DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)

DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)

DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)

DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
/* Stores with a vector index. */
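/*
 * sve_st1_z() below makes two passes over the elements: it first probes
 * every active element's address, so that any exception is raised before
 * memory is modified, and only then performs the stores.
 */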
static inline QEMU_ALWAYS_INLINE
void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    void *host[ARM_MAX_VQ * 4];
    intptr_t reg_off, i;
    SVEHostPage info, info2;
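    /*
     * host[] holds one host pointer per element (NULL for inactive or
     * page-crossing elements).  ARM_MAX_VQ * 4 entries suffice because
     * the smallest element handled here is 4 bytes, giving at most
     * 4 * ARM_MAX_VQ elements in a 16 * ARM_MAX_VQ byte vector.
     */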
    /*
     * Probe all of the elements for host addresses and flags.
     */
    i = reg_off = 0;
    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            target_ulong in_page = -(addr | TARGET_PAGE_MASK);

            host[i] = NULL;
            if (likely((pg >> (reg_off & 63)) & 1)) {
                if (likely(in_page >= msize)) {
                    sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
                                   mmu_idx, retaddr);
                    host[i] = info.host;
                } else {
                    /*
                     * Element crosses the page boundary.
                     * Probe both pages, but do not record the host address,
                     * so that we use the slow path.
                     */
                    sve_probe_page(&info, false, env, addr, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    info.flags |= info2.flags;
                }

                if (unlikely(info.flags & TLB_WATCHPOINT)) {
                    cpu_check_watchpoint(env_cpu(env), addr, msize,
                                         info.attrs, BP_MEM_WRITE, retaddr);
                }

                if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
                    mte_check1(env, mtedesc, addr, retaddr);
                }
            }
            i += 1;
            reg_off += esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);
    /*
     * Now that we have recognized all exceptions except SyncExternal
     * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
     *
     * Note for the common case of an element in RAM, not crossing a page
     * boundary, we have stored the host address in host[].  This doubles
     * as a first-level check against the predicate, since only enabled
     * elements have non-null host addresses.
     */
    i = reg_off = 0;
    do {
        void *h = host[i];
        if (likely(h != NULL)) {
            host_fn(vd, reg_off, h);
        } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            tlb_fn(env, vd, reg_off, addr, retaddr);
        }
        i += 1;
        reg_off += esize;
    } while (reg_off < reg_max);
}
static inline QEMU_ALWAYS_INLINE
void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                   target_ulong base, uint32_t desc, uintptr_t retaddr,
                   int esize, int msize, zreg_off_fn *off_fn,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
     * offset base entirely over the address space hole to change the
     * pointer tag, or change the bit55 selector.  So we could here
     * examine TBI + TCMA like we do for sve_ldN_r_mte().
     */
    sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
              esize, msize, off_fn, host_fn, tlb_fn);
}
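/*
 * Unlike the load helpers above, which take log2 sizes (esz/msz), the
 * store helpers take the element and memory sizes in bytes: the macros
 * below pass 4 or 8 for esize and 1 << MSZ for msize.
 */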
#define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
              off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
                  off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}

#define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
              off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
                  off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}
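/*
 * For example, DO_ST1_ZPZ_D(dd_le, zd, MO_64) defines
 * helper_sve_stdd_le_zd() and helper_sve_stdd_le_zd_mte(): little-endian
 * doubleword stores from 64-bit elements, addressed with 64-bit offsets.
 */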
DO_ST1_ZPZ_S(bs, zsu, MO_8)
DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
DO_ST1_ZPZ_S(ss_be, zsu, MO_32)

DO_ST1_ZPZ_S(bs, zss, MO_8)
DO_ST1_ZPZ_S(hs_le, zss, MO_16)
DO_ST1_ZPZ_S(hs_be, zss, MO_16)
DO_ST1_ZPZ_S(ss_le, zss, MO_32)
DO_ST1_ZPZ_S(ss_be, zss, MO_32)

DO_ST1_ZPZ_D(bd, zsu, MO_8)
DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
DO_ST1_ZPZ_D(dd_be, zsu, MO_64)

DO_ST1_ZPZ_D(bd, zss, MO_8)
DO_ST1_ZPZ_D(hd_le, zss, MO_16)
DO_ST1_ZPZ_D(hd_be, zss, MO_16)
DO_ST1_ZPZ_D(sd_le, zss, MO_32)
DO_ST1_ZPZ_D(sd_be, zss, MO_32)
DO_ST1_ZPZ_D(dd_le, zss, MO_64)
DO_ST1_ZPZ_D(dd_be, zss, MO_64)

DO_ST1_ZPZ_D(bd, zd, MO_8)
DO_ST1_ZPZ_D(hd_le, zd, MO_16)
DO_ST1_ZPZ_D(hd_be, zd, MO_16)
DO_ST1_ZPZ_D(sd_le, zd, MO_32)
DO_ST1_ZPZ_D(sd_be, zd, MO_32)
DO_ST1_ZPZ_D(dd_le, zd, MO_64)
DO_ST1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_ST1_ZPZ_S
#undef DO_ST1_ZPZ_D