target/arm/sve_helper.c
1 /*
2 * ARM SVE Operations
4 * Copyright (c) 2018 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/cpu_ldst.h"
25 #include "exec/helper-proto.h"
26 #include "tcg/tcg-gvec-desc.h"
27 #include "fpu/softfloat.h"
28 #include "tcg/tcg.h"
31 /* Note that vector data is stored in host-endian 64-bit chunks,
32 so addressing units smaller than that need a host-endian fixup. */
33 #ifdef HOST_WORDS_BIGENDIAN
34 #define H1(x) ((x) ^ 7)
35 #define H1_2(x) ((x) ^ 6)
36 #define H1_4(x) ((x) ^ 4)
37 #define H2(x) ((x) ^ 3)
38 #define H4(x) ((x) ^ 1)
39 #else
40 #define H1(x) (x)
41 #define H1_2(x) (x)
42 #define H1_4(x) (x)
43 #define H2(x) (x)
44 #define H4(x) (x)
45 #endif
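/* For example, on a big-endian host byte element 0 lives at offset
 * H1(0) == 7 within its 64-bit chunk, 16-bit element 0 at byte offset
 * H1_2(0) == 6 (halfword index H2(0) == 3), and 32-bit element 0 at
 * byte offset H1_4(0) == 4 (word index H4(0) == 1).  On little-endian
 * hosts all of these are the identity.
 */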
47 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
49 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
50 * and bit 0 set if C is set. Compare the definitions of these variables
51 * within CPUARMState.
54 /* For no G bits set, NZCV = C. */
55 #define PREDTEST_INIT 1
57 /* This is an iterative function, called for each Pd and Pg word
58 * moving forward.
60 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
62 if (likely(g)) {
63 /* Compute N from first D & G.
64 Use bit 2 to signal first G bit seen. */
65 if (!(flags & 4)) {
66 flags |= ((d & (g & -g)) != 0) << 31;
67 flags |= 4;
70 /* Accumulate Z from each D & G. */
71 flags |= ((d & g) != 0) << 1;
73 /* Compute C from last !(D & G). Replace previous. */
74 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
76 return flags;
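/* For example, iter_predtest_fwd(1, 1, PREDTEST_INIT) returns 0x80000006:
 * bit 31 (N) because the first active element is true, bit 1 because Z
 * must be clear (some active element is true), bit 0 clear because the
 * last active element is true (C clear); bit 2 is only the internal
 * "first G bit seen" marker.
 */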
79 /* This is an iterative function, called for each Pd and Pg word
80 * moving backward.
82 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
84 if (likely(g)) {
85 /* Compute C from first (i.e. last) !(D & G).
86 Use bit 2 to signal first G bit seen. */
87 if (!(flags & 4)) {
88 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
89 flags |= (d & pow2floor(g)) == 0;
92 /* Accumulate Z from each D & G. */
93 flags |= ((d & g) != 0) << 1;
95 /* Compute N from last (i.e. first) D & G. Replace previous. */
96 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
98 return flags;
101 /* The same for a single word predicate. */
102 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
104 return iter_predtest_fwd(d, g, PREDTEST_INIT);
107 /* The same for a multi-word predicate. */
108 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
110 uint32_t flags = PREDTEST_INIT;
111 uint64_t *d = vd, *g = vg;
112 uintptr_t i = 0;
114 do {
115 flags = iter_predtest_fwd(d[i], g[i], flags);
116 } while (++i < words);
118 return flags;
121 /* Expand active predicate bits to bytes, for byte elements.
122 * for (i = 0; i < 256; ++i) {
123 * unsigned long m = 0;
124 * for (j = 0; j < 8; j++) {
125 * if ((i >> j) & 1) {
126 * m |= 0xfful << (j << 3);
129 * printf("0x%016lx,\n", m);
132 static inline uint64_t expand_pred_b(uint8_t byte)
134 static const uint64_t word[256] = {
135 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
136 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
137 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
138 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
139 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
140 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
141 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
142 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
143 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
144 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
145 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
146 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
147 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
148 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
149 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
150 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
151 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
152 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
153 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
154 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
155 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
156 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
157 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
158 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
159 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
160 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
161 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
162 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
163 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
164 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
165 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
166 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
167 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
168 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
169 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
170 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
171 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
172 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
173 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
174 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
175 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
176 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
177 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
178 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
179 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
180 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
181 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
182 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
183 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
184 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
185 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
186 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
187 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
188 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
189 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
190 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
191 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
192 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
193 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
194 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
195 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
196 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
197 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
198 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
199 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
200 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
201 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
202 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
203 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
204 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
205 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
206 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
207 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
208 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
209 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
210 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
211 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
212 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
213 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
214 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
215 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
216 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
217 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
218 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
219 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
220 0xffffffffffffffff,
222 return word[byte];
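/* For example, expand_pred_b(0x05) == 0x0000000000ff00ff: predicate
 * bits 0 and 2 are set, so byte elements 0 and 2 become all-ones masks.
 */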
225 /* Similarly for half-word elements.
226 * for (i = 0; i < 256; ++i) {
227 * unsigned long m = 0;
228 * if (i & 0xaa) {
229 * continue;
231 * for (j = 0; j < 8; j += 2) {
232 * if ((i >> j) & 1) {
233 * m |= 0xfffful << (j << 3);
236 * printf("[0x%x] = 0x%016lx,\n", i, m);
239 static inline uint64_t expand_pred_h(uint8_t byte)
241 static const uint64_t word[] = {
242 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
243 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
244 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
245 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
246 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
247 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
248 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
249 [0x55] = 0xffffffffffffffff,
251 return word[byte & 0x55];
254 /* Similarly for single word elements. */
255 static inline uint64_t expand_pred_s(uint8_t byte)
257 static const uint64_t word[] = {
258 [0x01] = 0x00000000ffffffffull,
259 [0x10] = 0xffffffff00000000ull,
260 [0x11] = 0xffffffffffffffffull,
262 return word[byte & 0x11];
265 /* Swap 16-bit words within a 32-bit word. */
266 static inline uint32_t hswap32(uint32_t h)
268 return rol32(h, 16);
271 /* Swap 16-bit words within a 64-bit word. */
272 static inline uint64_t hswap64(uint64_t h)
274 uint64_t m = 0x0000ffff0000ffffull;
275 h = rol64(h, 32);
276 return ((h & m) << 16) | ((h >> 16) & m);
279 /* Swap 32-bit words within a 64-bit word. */
280 static inline uint64_t wswap64(uint64_t h)
282 return rol64(h, 32);
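/* For example, hswap64(0x0011223344556677) == 0x6677445522330011 and
 * wswap64(0x0011223344556677) == 0x4455667700112233.
 */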
285 #define LOGICAL_PPPP(NAME, FUNC) \
286 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
288 uintptr_t opr_sz = simd_oprsz(desc); \
289 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
290 uintptr_t i; \
291 for (i = 0; i < opr_sz / 8; ++i) { \
292 d[i] = FUNC(n[i], m[i], g[i]); \
296 #define DO_AND(N, M, G) (((N) & (M)) & (G))
297 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
298 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
299 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
300 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
301 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
302 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
303 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
305 LOGICAL_PPPP(sve_and_pppp, DO_AND)
306 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
307 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
308 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
309 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
310 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
311 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
312 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
314 #undef DO_AND
315 #undef DO_BIC
316 #undef DO_EOR
317 #undef DO_ORR
318 #undef DO_ORN
319 #undef DO_NOR
320 #undef DO_NAND
321 #undef DO_SEL
322 #undef LOGICAL_PPPP
324 /* Fully general three-operand expander, controlled by a predicate.
325 * This is complicated by the host-endian storage of the register file.
327 /* ??? I don't expect the compiler could ever vectorize this itself.
328 * With some tables we can convert bit masks to byte masks, and with
329 * extra care wrt byte/word ordering we could use gcc generic vectors
330 * and do 16 bytes at a time.
332 #define DO_ZPZZ(NAME, TYPE, H, OP) \
333 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
335 intptr_t i, opr_sz = simd_oprsz(desc); \
336 for (i = 0; i < opr_sz; ) { \
337 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
338 do { \
339 if (pg & 1) { \
340 TYPE nn = *(TYPE *)(vn + H(i)); \
341 TYPE mm = *(TYPE *)(vm + H(i)); \
342 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
344 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
345 } while (i & 15); \
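/* The outer loop reads the predicate two bytes (16 bits) at a time, which
 * governs 16 bytes of vector data; the inner loop advances one element per
 * iteration, shifting PG right by sizeof(TYPE) so that the low bit always
 * corresponds to the current element.
 */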
349 /* Similarly, specialized for 64-bit operands. */
350 #define DO_ZPZZ_D(NAME, TYPE, OP) \
351 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
353 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
354 TYPE *d = vd, *n = vn, *m = vm; \
355 uint8_t *pg = vg; \
356 for (i = 0; i < opr_sz; i += 1) { \
357 if (pg[H1(i)] & 1) { \
358 TYPE nn = n[i], mm = m[i]; \
359 d[i] = OP(nn, mm); \
364 #define DO_AND(N, M) (N & M)
365 #define DO_EOR(N, M) (N ^ M)
366 #define DO_ORR(N, M) (N | M)
367 #define DO_BIC(N, M) (N & ~M)
368 #define DO_ADD(N, M) (N + M)
369 #define DO_SUB(N, M) (N - M)
370 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
371 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
372 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
373 #define DO_MUL(N, M) (N * M)
376 /*
377 * We must avoid the C undefined behaviour cases: division by
378 * zero and signed division of INT_MIN by -1. Both of these
379 * have architecturally defined required results for Arm.
380 * We special case all signed divisions by -1 to avoid having
381 * to deduce the minimum integer for the type involved.
383 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
384 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
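/* For example, SDIV of INT32_MIN by -1 must produce INT32_MIN (the exact
 * quotient 2**31 does not fit in the element); the M == -1 case yields -N,
 * which wraps to that value in two's complement.  A zero divisor yields 0
 * for both SDIV and UDIV, as the architecture requires.
 */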
386 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
387 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
388 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
389 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
391 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
392 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
393 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
394 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
396 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
397 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
398 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
399 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
401 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
402 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
403 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
404 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
406 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
407 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
408 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
409 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
411 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
412 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
413 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
414 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
416 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
417 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
418 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
419 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
421 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
422 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
423 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
424 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
426 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
427 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
428 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
429 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
431 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
432 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
433 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
434 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
436 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
437 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
438 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
439 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
441 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
442 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
443 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
444 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
446 /* Because the computation type is at least twice as large as required,
447 these work for both signed and unsigned source types. */
448 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
450 return (n * m) >> 8;
453 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
455 return (n * m) >> 16;
458 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
460 return (n * m) >> 32;
463 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
465 uint64_t lo, hi;
466 muls64(&lo, &hi, n, m);
467 return hi;
470 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
472 uint64_t lo, hi;
473 mulu64(&lo, &hi, n, m);
474 return hi;
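/* For example, do_mulh_b serves both SMULH and UMULH at byte size because
 * the element TYPE of the caller (int8_t vs uint8_t) chooses sign- or
 * zero-extension into the int32_t parameters: signed (-128) * 2 == -256,
 * whose high byte is 0xff, while unsigned 128 * 2 == 256, whose high byte
 * is 0x01.
 */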
477 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
478 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
479 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
480 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
482 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
483 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
484 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
485 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
487 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
488 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
489 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
490 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
492 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
493 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
495 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
496 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
498 /* Note that all bits of the shift are significant
499 and not modulo the element size. */
500 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
501 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
502 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
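/* For example, with 8-bit elements DO_LSR(n, 8) and DO_LSL(n, 8) are 0
 * rather than n (as a modulo-8 shift count would give), and DO_ASR clamps
 * the count, so DO_ASR(n, 200) == n >> 7, replicating the sign bit.
 */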
504 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
505 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
506 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
508 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
509 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
510 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
512 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
513 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
514 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
516 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
517 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
518 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
520 #undef DO_ZPZZ
521 #undef DO_ZPZZ_D
523 /* Three-operand expander, controlled by a predicate, in which the
524 * third operand is "wide". That is, for D = N op M, the same 64-bit
525 * value of M is used with all of the narrower values of N.
527 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
528 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
530 intptr_t i, opr_sz = simd_oprsz(desc); \
531 for (i = 0; i < opr_sz; ) { \
532 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
533 TYPEW mm = *(TYPEW *)(vm + i); \
534 do { \
535 if (pg & 1) { \
536 TYPE nn = *(TYPE *)(vn + H(i)); \
537 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
539 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
540 } while (i & 7); \
544 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
545 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
546 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
548 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
549 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
550 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
552 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
553 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
554 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
556 #undef DO_ZPZW
558 /* Fully general two-operand expander, controlled by a predicate.
560 #define DO_ZPZ(NAME, TYPE, H, OP) \
561 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
563 intptr_t i, opr_sz = simd_oprsz(desc); \
564 for (i = 0; i < opr_sz; ) { \
565 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
566 do { \
567 if (pg & 1) { \
568 TYPE nn = *(TYPE *)(vn + H(i)); \
569 *(TYPE *)(vd + H(i)) = OP(nn); \
571 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
572 } while (i & 15); \
576 /* Similarly, specialized for 64-bit operands. */
577 #define DO_ZPZ_D(NAME, TYPE, OP) \
578 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
580 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
581 TYPE *d = vd, *n = vn; \
582 uint8_t *pg = vg; \
583 for (i = 0; i < opr_sz; i += 1) { \
584 if (pg[H1(i)] & 1) { \
585 TYPE nn = n[i]; \
586 d[i] = OP(nn); \
591 #define DO_CLS_B(N) (clrsb32(N) - 24)
592 #define DO_CLS_H(N) (clrsb32(N) - 16)
594 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
595 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
596 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
597 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
599 #define DO_CLZ_B(N) (clz32(N) - 24)
600 #define DO_CLZ_H(N) (clz32(N) - 16)
602 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
603 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
604 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
605 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
607 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
608 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
609 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
610 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
612 #define DO_CNOT(N) (N == 0)
614 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
615 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
616 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
617 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
619 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
621 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
622 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
623 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
625 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
627 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
628 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
629 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
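/* ((__typeof(N))-1 >> 1) is the element's all-ones value with the sign bit
 * clear, so for 32-bit elements DO_FABS computes N & 0x7fffffff and DO_FNEG
 * computes N ^ 0x80000000; the 16- and 64-bit cases use bit 15 and bit 63.
 */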
631 #define DO_NOT(N) (~N)
633 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
634 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
635 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
636 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
638 #define DO_SXTB(N) ((int8_t)N)
639 #define DO_SXTH(N) ((int16_t)N)
640 #define DO_SXTS(N) ((int32_t)N)
641 #define DO_UXTB(N) ((uint8_t)N)
642 #define DO_UXTH(N) ((uint16_t)N)
643 #define DO_UXTS(N) ((uint32_t)N)
645 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
646 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
647 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
648 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
649 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
650 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
652 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
653 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
654 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
655 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
656 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
657 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
659 #define DO_ABS(N) (N < 0 ? -N : N)
661 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
662 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
663 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
664 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
666 #define DO_NEG(N) (-N)
668 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
669 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
670 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
671 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
673 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
674 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
675 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
677 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
678 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
680 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
682 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
683 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
684 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
685 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
687 /* Three-operand expander, unpredicated, in which the third operand is "wide".
689 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
690 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
692 intptr_t i, opr_sz = simd_oprsz(desc); \
693 for (i = 0; i < opr_sz; ) { \
694 TYPEW mm = *(TYPEW *)(vm + i); \
695 do { \
696 TYPE nn = *(TYPE *)(vn + H(i)); \
697 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
698 i += sizeof(TYPE); \
699 } while (i & 7); \
703 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
704 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
705 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
707 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
708 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
709 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
711 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
712 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
713 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
715 #undef DO_ZZW
717 #undef DO_CLS_B
718 #undef DO_CLS_H
719 #undef DO_CLZ_B
720 #undef DO_CLZ_H
721 #undef DO_CNOT
722 #undef DO_FABS
723 #undef DO_FNEG
724 #undef DO_ABS
725 #undef DO_NEG
726 #undef DO_ZPZ
727 #undef DO_ZPZ_D
729 /* Two-operand reduction expander, controlled by a predicate.
730 * The difference between TYPERED and TYPERET has to do with
731 * sign-extension. E.g. for SMAX, TYPERED must be signed,
732 * but TYPERET must be unsigned so that e.g. a 32-bit value
733 * is not sign-extended to the ABI uint64_t return type.
735 /* ??? If we were to vectorize this by hand the reduction ordering
736 * would change. For integer operands, this is perfectly fine.
738 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
739 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
741 intptr_t i, opr_sz = simd_oprsz(desc); \
742 TYPERED ret = INIT; \
743 for (i = 0; i < opr_sz; ) { \
744 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
745 do { \
746 if (pg & 1) { \
747 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
748 ret = OP(ret, nn); \
750 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
751 } while (i & 15); \
753 return (TYPERET)ret; \
756 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
757 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
759 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
760 TYPEE *n = vn; \
761 uint8_t *pg = vg; \
762 TYPER ret = INIT; \
763 for (i = 0; i < opr_sz; i += 1) { \
764 if (pg[H1(i)] & 1) { \
765 TYPEE nn = n[i]; \
766 ret = OP(ret, nn); \
769 return ret; \
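/* For example, sve_smaxv_b below reduces with TYPERED = int8_t and
 * INIT = INT8_MIN so that the comparison is signed, but returns through
 * TYPERET = uint8_t so that a negative maximum (e.g. 0xff for -1) is not
 * sign-extended into the uint64_t ABI return value.
 */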
772 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
773 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
774 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
775 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
777 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
778 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
779 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
780 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
782 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
783 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
784 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
785 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
787 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
788 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
789 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
791 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
792 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
793 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
794 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
796 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
797 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
798 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
799 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
801 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
802 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
803 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
804 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
806 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
807 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
808 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
809 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
811 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
812 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
813 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
814 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
816 #undef DO_VPZ
817 #undef DO_VPZ_D
819 /* Two vector operand, one scalar operand, unpredicated. */
820 #define DO_ZZI(NAME, TYPE, OP) \
821 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
823 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
824 TYPE s = s64, *d = vd, *n = vn; \
825 for (i = 0; i < opr_sz; ++i) { \
826 d[i] = OP(n[i], s); \
830 #define DO_SUBR(X, Y) (Y - X)
832 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
833 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
834 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
835 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
837 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
838 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
839 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
840 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
842 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
843 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
844 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
845 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
847 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
848 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
849 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
850 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
852 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
853 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
854 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
855 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
857 #undef DO_ZZI
859 #undef DO_AND
860 #undef DO_ORR
861 #undef DO_EOR
862 #undef DO_BIC
863 #undef DO_ADD
864 #undef DO_SUB
865 #undef DO_MAX
866 #undef DO_MIN
867 #undef DO_ABD
868 #undef DO_MUL
869 #undef DO_DIV
870 #undef DO_ASR
871 #undef DO_LSR
872 #undef DO_LSL
873 #undef DO_SUBR
875 /* Similar to the ARM LastActiveElement pseudocode function, except the
876 result is multiplied by the element size. This includes the not found
877 indication; e.g. not found for esz=3 is -8. */
878 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
880 uint64_t mask = pred_esz_masks[esz];
881 intptr_t i = words;
883 do {
884 uint64_t this_g = g[--i] & mask;
885 if (this_g) {
886 return i * 64 + (63 - clz64(this_g));
888 } while (i > 0);
889 return (intptr_t)-1 << esz;
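/* For example, with esz == 2 (word elements) and only element 1 active in
 * the first predicate word (bit 4 set), the result is 4, i.e. element
 * index 1 multiplied by the 4-byte element size; with no active elements
 * the result is -4.
 */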
892 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
894 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
895 uint32_t flags = PREDTEST_INIT;
896 uint64_t *d = vd, *g = vg;
897 intptr_t i = 0;
899 do {
900 uint64_t this_d = d[i];
901 uint64_t this_g = g[i];
903 if (this_g) {
904 if (!(flags & 4)) {
905 /* Set in D the first bit of G. */
906 this_d |= this_g & -this_g;
907 d[i] = this_d;
909 flags = iter_predtest_fwd(this_d, this_g, flags);
911 } while (++i < words);
913 return flags;
916 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
918 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
919 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
920 uint32_t flags = PREDTEST_INIT;
921 uint64_t *d = vd, *g = vg, esz_mask;
922 intptr_t i, next;
924 next = last_active_element(vd, words, esz) + (1 << esz);
925 esz_mask = pred_esz_masks[esz];
927 /* Similar to the pseudocode for pnext, but scaled by ESZ
928 so that we find the correct bit. */
929 if (next < words * 64) {
930 uint64_t mask = -1;
932 if (next & 63) {
933 mask = ~((1ull << (next & 63)) - 1);
934 next &= -64;
936 do {
937 uint64_t this_g = g[next / 64] & esz_mask & mask;
938 if (this_g != 0) {
939 next = (next & -64) + ctz64(this_g);
940 break;
942 next += 64;
943 mask = -1;
944 } while (next < words * 64);
947 i = 0;
948 do {
949 uint64_t this_d = 0;
950 if (i == next / 64) {
951 this_d = 1ull << (next & 63);
953 d[i] = this_d;
954 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
955 } while (++i < words);
957 return flags;
960 /*
961 * Copy Zn into Zd, and store zero into inactive elements.
962 * If inv, store zeros into the active elements.
963 */
964 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
966 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
967 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
968 uint64_t *d = vd, *n = vn;
969 uint8_t *pg = vg;
971 for (i = 0; i < opr_sz; i += 1) {
972 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
976 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
978 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
979 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
980 uint64_t *d = vd, *n = vn;
981 uint8_t *pg = vg;
983 for (i = 0; i < opr_sz; i += 1) {
984 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
988 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
990 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
991 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
992 uint64_t *d = vd, *n = vn;
993 uint8_t *pg = vg;
995 for (i = 0; i < opr_sz; i += 1) {
996 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
1000 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1002 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1003 uint64_t *d = vd, *n = vn;
1004 uint8_t *pg = vg;
1005 uint8_t inv = simd_data(desc);
1007 for (i = 0; i < opr_sz; i += 1) {
1008 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
1012 /* Three-operand expander, immediate operand, controlled by a predicate.
1014 #define DO_ZPZI(NAME, TYPE, H, OP) \
1015 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1017 intptr_t i, opr_sz = simd_oprsz(desc); \
1018 TYPE imm = simd_data(desc); \
1019 for (i = 0; i < opr_sz; ) { \
1020 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1021 do { \
1022 if (pg & 1) { \
1023 TYPE nn = *(TYPE *)(vn + H(i)); \
1024 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1026 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1027 } while (i & 15); \
1031 /* Similarly, specialized for 64-bit operands. */
1032 #define DO_ZPZI_D(NAME, TYPE, OP) \
1033 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1035 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1036 TYPE *d = vd, *n = vn; \
1037 TYPE imm = simd_data(desc); \
1038 uint8_t *pg = vg; \
1039 for (i = 0; i < opr_sz; i += 1) { \
1040 if (pg[H1(i)] & 1) { \
1041 TYPE nn = n[i]; \
1042 d[i] = OP(nn, imm); \
1047 #define DO_SHR(N, M) (N >> M)
1048 #define DO_SHL(N, M) (N << M)
1050 /* Arithmetic shift right for division. This rounds negative numbers
1051 toward zero as per signed division. Therefore before shifting,
1052 when N is negative, add 2**M-1. */
1053 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
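/* For example, DO_ASRD(-7, 2) computes (-7 + 3) >> 2 == -1, matching
 * -7 / 4 truncated toward zero, where a plain arithmetic shift right
 * would give -2.
 */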
1055 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1056 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1057 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1058 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1060 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1061 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1062 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1063 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1065 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1066 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1067 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1068 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1070 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1071 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1072 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1073 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1075 #undef DO_SHR
1076 #undef DO_SHL
1077 #undef DO_ASRD
1078 #undef DO_ZPZI
1079 #undef DO_ZPZI_D
1081 /* Fully general four-operand expander, controlled by a predicate.
1083 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
1084 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1085 void *vg, uint32_t desc) \
1087 intptr_t i, opr_sz = simd_oprsz(desc); \
1088 for (i = 0; i < opr_sz; ) { \
1089 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1090 do { \
1091 if (pg & 1) { \
1092 TYPE nn = *(TYPE *)(vn + H(i)); \
1093 TYPE mm = *(TYPE *)(vm + H(i)); \
1094 TYPE aa = *(TYPE *)(va + H(i)); \
1095 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1097 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1098 } while (i & 15); \
1102 /* Similarly, specialized for 64-bit operands. */
1103 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
1104 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1105 void *vg, uint32_t desc) \
1107 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1108 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1109 uint8_t *pg = vg; \
1110 for (i = 0; i < opr_sz; i += 1) { \
1111 if (pg[H1(i)] & 1) { \
1112 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1113 d[i] = OP(aa, nn, mm); \
1118 #define DO_MLA(A, N, M) (A + N * M)
1119 #define DO_MLS(A, N, M) (A - N * M)
1121 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1122 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1124 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1125 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1127 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1128 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1130 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1131 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1133 #undef DO_MLA
1134 #undef DO_MLS
1135 #undef DO_ZPZZZ
1136 #undef DO_ZPZZZ_D
1138 void HELPER(sve_index_b)(void *vd, uint32_t start,
1139 uint32_t incr, uint32_t desc)
1141 intptr_t i, opr_sz = simd_oprsz(desc);
1142 uint8_t *d = vd;
1143 for (i = 0; i < opr_sz; i += 1) {
1144 d[H1(i)] = start + i * incr;
1148 void HELPER(sve_index_h)(void *vd, uint32_t start,
1149 uint32_t incr, uint32_t desc)
1151 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1152 uint16_t *d = vd;
1153 for (i = 0; i < opr_sz; i += 1) {
1154 d[H2(i)] = start + i * incr;
1158 void HELPER(sve_index_s)(void *vd, uint32_t start,
1159 uint32_t incr, uint32_t desc)
1161 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1162 uint32_t *d = vd;
1163 for (i = 0; i < opr_sz; i += 1) {
1164 d[H4(i)] = start + i * incr;
1168 void HELPER(sve_index_d)(void *vd, uint64_t start,
1169 uint64_t incr, uint32_t desc)
1171 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1172 uint64_t *d = vd;
1173 for (i = 0; i < opr_sz; i += 1) {
1174 d[i] = start + i * incr;
1178 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1180 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1181 uint32_t sh = simd_data(desc);
1182 uint32_t *d = vd, *n = vn, *m = vm;
1183 for (i = 0; i < opr_sz; i += 1) {
1184 d[i] = n[i] + (m[i] << sh);
1188 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1190 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1191 uint64_t sh = simd_data(desc);
1192 uint64_t *d = vd, *n = vn, *m = vm;
1193 for (i = 0; i < opr_sz; i += 1) {
1194 d[i] = n[i] + (m[i] << sh);
1198 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1200 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1201 uint64_t sh = simd_data(desc);
1202 uint64_t *d = vd, *n = vn, *m = vm;
1203 for (i = 0; i < opr_sz; i += 1) {
1204 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1208 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1210 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1211 uint64_t sh = simd_data(desc);
1212 uint64_t *d = vd, *n = vn, *m = vm;
1213 for (i = 0; i < opr_sz; i += 1) {
1214 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1218 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1220 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1221 static const uint16_t coeff[] = {
1222 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1223 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1224 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1225 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1227 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1228 uint16_t *d = vd, *n = vn;
1230 for (i = 0; i < opr_sz; i++) {
1231 uint16_t nn = n[i];
1232 intptr_t idx = extract32(nn, 0, 5);
1233 uint16_t exp = extract32(nn, 5, 5);
1234 d[i] = coeff[idx] | (exp << 10);
1238 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1240 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1241 static const uint32_t coeff[] = {
1242 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1243 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1244 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1245 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1246 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1247 0x1ef532, 0x20b051, 0x227043, 0x243516,
1248 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1249 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1250 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1251 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1252 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1253 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1254 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1255 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1256 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1257 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1259 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1260 uint32_t *d = vd, *n = vn;
1262 for (i = 0; i < opr_sz; i++) {
1263 uint32_t nn = n[i];
1264 intptr_t idx = extract32(nn, 0, 6);
1265 uint32_t exp = extract32(nn, 6, 8);
1266 d[i] = coeff[idx] | (exp << 23);
1270 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1272 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1273 static const uint64_t coeff[] = {
1274 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1275 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1276 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1277 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1278 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1279 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1280 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1281 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1282 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1283 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1284 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1285 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1286 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1287 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1288 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1289 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1290 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1291 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1292 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1293 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1294 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1295 0xFA7C1819E90D8ull,
1297 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1298 uint64_t *d = vd, *n = vn;
1300 for (i = 0; i < opr_sz; i++) {
1301 uint64_t nn = n[i];
1302 intptr_t idx = extract32(nn, 0, 6);
1303 uint64_t exp = extract32(nn, 6, 11);
1304 d[i] = coeff[idx] | (exp << 52);
1308 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1310 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1311 uint16_t *d = vd, *n = vn, *m = vm;
1312 for (i = 0; i < opr_sz; i += 1) {
1313 uint16_t nn = n[i];
1314 uint16_t mm = m[i];
1315 if (mm & 1) {
1316 nn = float16_one;
1318 d[i] = nn ^ (mm & 2) << 14;
1322 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1324 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1325 uint32_t *d = vd, *n = vn, *m = vm;
1326 for (i = 0; i < opr_sz; i += 1) {
1327 uint32_t nn = n[i];
1328 uint32_t mm = m[i];
1329 if (mm & 1) {
1330 nn = float32_one;
1332 d[i] = nn ^ (mm & 2) << 30;
1336 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1338 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1339 uint64_t *d = vd, *n = vn, *m = vm;
1340 for (i = 0; i < opr_sz; i += 1) {
1341 uint64_t nn = n[i];
1342 uint64_t mm = m[i];
1343 if (mm & 1) {
1344 nn = float64_one;
1346 d[i] = nn ^ (mm & 2) << 62;
1350 /*
1351 * Signed saturating addition with scalar operand.
1352 */
1354 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1356 intptr_t i, oprsz = simd_oprsz(desc);
1358 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1359 int r = *(int8_t *)(a + i) + b;
1360 if (r > INT8_MAX) {
1361 r = INT8_MAX;
1362 } else if (r < INT8_MIN) {
1363 r = INT8_MIN;
1365 *(int8_t *)(d + i) = r;
1369 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1371 intptr_t i, oprsz = simd_oprsz(desc);
1373 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1374 int r = *(int16_t *)(a + i) + b;
1375 if (r > INT16_MAX) {
1376 r = INT16_MAX;
1377 } else if (r < INT16_MIN) {
1378 r = INT16_MIN;
1380 *(int16_t *)(d + i) = r;
1384 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1386 intptr_t i, oprsz = simd_oprsz(desc);
1388 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1389 int64_t r = *(int32_t *)(a + i) + b;
1390 if (r > INT32_MAX) {
1391 r = INT32_MAX;
1392 } else if (r < INT32_MIN) {
1393 r = INT32_MIN;
1395 *(int32_t *)(d + i) = r;
1399 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1401 intptr_t i, oprsz = simd_oprsz(desc);
1403 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1404 int64_t ai = *(int64_t *)(a + i);
1405 int64_t r = ai + b;
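/* Two's-complement overflow test: the addends have the same sign
 * (~(ai ^ b) has its sign bit set) while the sum's sign differs from
 * AI's ((r ^ ai) has its sign bit set).  Saturate away from the sign
 * of the wrapped result. */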
1406 if (((r ^ ai) & ~(ai ^ b)) < 0) {
1407 /* Signed overflow. */
1408 r = (r < 0 ? INT64_MAX : INT64_MIN);
1410 *(int64_t *)(d + i) = r;
1414 /*
1415 * Unsigned saturating addition with scalar operand.
1416 */
1418 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1420 intptr_t i, oprsz = simd_oprsz(desc);
1422 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1423 int r = *(uint8_t *)(a + i) + b;
1424 if (r > UINT8_MAX) {
1425 r = UINT8_MAX;
1426 } else if (r < 0) {
1427 r = 0;
1429 *(uint8_t *)(d + i) = r;
1433 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1435 intptr_t i, oprsz = simd_oprsz(desc);
1437 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1438 int r = *(uint16_t *)(a + i) + b;
1439 if (r > UINT16_MAX) {
1440 r = UINT16_MAX;
1441 } else if (r < 0) {
1442 r = 0;
1444 *(uint16_t *)(d + i) = r;
1448 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1450 intptr_t i, oprsz = simd_oprsz(desc);
1452 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1453 int64_t r = *(uint32_t *)(a + i) + b;
1454 if (r > UINT32_MAX) {
1455 r = UINT32_MAX;
1456 } else if (r < 0) {
1457 r = 0;
1459 *(uint32_t *)(d + i) = r;
1463 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1465 intptr_t i, oprsz = simd_oprsz(desc);
1467 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1468 uint64_t r = *(uint64_t *)(a + i) + b;
1469 if (r < b) {
1470 r = UINT64_MAX;
1472 *(uint64_t *)(d + i) = r;
1476 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1478 intptr_t i, oprsz = simd_oprsz(desc);
1480 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1481 uint64_t ai = *(uint64_t *)(a + i);
1482 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1486 /* Two operand predicated copy immediate with merge. All valid immediates
1487 * can fit within 17 signed bits in the simd_data field.
1489 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1490 uint64_t mm, uint32_t desc)
1492 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1493 uint64_t *d = vd, *n = vn;
1494 uint8_t *pg = vg;
1496 mm = dup_const(MO_8, mm);
1497 for (i = 0; i < opr_sz; i += 1) {
1498 uint64_t nn = n[i];
1499 uint64_t pp = expand_pred_b(pg[H1(i)]);
1500 d[i] = (mm & pp) | (nn & ~pp);
1504 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1505 uint64_t mm, uint32_t desc)
1507 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1508 uint64_t *d = vd, *n = vn;
1509 uint8_t *pg = vg;
1511 mm = dup_const(MO_16, mm);
1512 for (i = 0; i < opr_sz; i += 1) {
1513 uint64_t nn = n[i];
1514 uint64_t pp = expand_pred_h(pg[H1(i)]);
1515 d[i] = (mm & pp) | (nn & ~pp);
1519 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1520 uint64_t mm, uint32_t desc)
1522 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1523 uint64_t *d = vd, *n = vn;
1524 uint8_t *pg = vg;
1526 mm = dup_const(MO_32, mm);
1527 for (i = 0; i < opr_sz; i += 1) {
1528 uint64_t nn = n[i];
1529 uint64_t pp = expand_pred_s(pg[H1(i)]);
1530 d[i] = (mm & pp) | (nn & ~pp);
1534 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1535 uint64_t mm, uint32_t desc)
1537 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1538 uint64_t *d = vd, *n = vn;
1539 uint8_t *pg = vg;
1541 for (i = 0; i < opr_sz; i += 1) {
1542 uint64_t nn = n[i];
1543 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1547 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1549 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1550 uint64_t *d = vd;
1551 uint8_t *pg = vg;
1553 val = dup_const(MO_8, val);
1554 for (i = 0; i < opr_sz; i += 1) {
1555 d[i] = val & expand_pred_b(pg[H1(i)]);
1559 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1561 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1562 uint64_t *d = vd;
1563 uint8_t *pg = vg;
1565 val = dup_const(MO_16, val);
1566 for (i = 0; i < opr_sz; i += 1) {
1567 d[i] = val & expand_pred_h(pg[H1(i)]);
1571 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1573 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1574 uint64_t *d = vd;
1575 uint8_t *pg = vg;
1577 val = dup_const(MO_32, val);
1578 for (i = 0; i < opr_sz; i += 1) {
1579 d[i] = val & expand_pred_s(pg[H1(i)]);
1583 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1585 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1586 uint64_t *d = vd;
1587 uint8_t *pg = vg;
1589 for (i = 0; i < opr_sz; i += 1) {
1590 d[i] = (pg[H1(i)] & 1 ? val : 0);
1594 /* Big-endian hosts need to frob the byte indices. If the copy
1595 * happens to be 8-byte aligned, then no frobbing necessary.
1597 static void swap_memmove(void *vd, void *vs, size_t n)
1599 uintptr_t d = (uintptr_t)vd;
1600 uintptr_t s = (uintptr_t)vs;
1601 uintptr_t o = (d | s | n) & 7;
1602 size_t i;
1604 #ifndef HOST_WORDS_BIGENDIAN
1605 o = 0;
1606 #endif
1607 switch (o) {
1608 case 0:
1609 memmove(vd, vs, n);
1610 break;
1612 case 4:
1613 if (d < s || d >= s + n) {
1614 for (i = 0; i < n; i += 4) {
1615 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1617 } else {
1618 for (i = n; i > 0; ) {
1619 i -= 4;
1620 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1623 break;
1625 case 2:
1626 case 6:
1627 if (d < s || d >= s + n) {
1628 for (i = 0; i < n; i += 2) {
1629 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1631 } else {
1632 for (i = n; i > 0; ) {
1633 i -= 2;
1634 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1637 break;
1639 default:
1640 if (d < s || d >= s + n) {
1641 for (i = 0; i < n; i++) {
1642 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1644 } else {
1645 for (i = n; i > 0; ) {
1646 i -= 1;
1647 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1650 break;
1654 /* Similarly for memset of 0. */
1655 static void swap_memzero(void *vd, size_t n)
1657 uintptr_t d = (uintptr_t)vd;
1658 uintptr_t o = (d | n) & 7;
1659 size_t i;
1661 /* Usually, the first bit of a predicate is set, so N is 0. */
1662 if (likely(n == 0)) {
1663 return;
1666 #ifndef HOST_WORDS_BIGENDIAN
1667 o = 0;
1668 #endif
1669 switch (o) {
1670 case 0:
1671 memset(vd, 0, n);
1672 break;
1674 case 4:
1675 for (i = 0; i < n; i += 4) {
1676 *(uint32_t *)H1_4(d + i) = 0;
1678 break;
1680 case 2:
1681 case 6:
1682 for (i = 0; i < n; i += 2) {
1683 *(uint16_t *)H1_2(d + i) = 0;
1685 break;
1687 default:
1688 for (i = 0; i < n; i++) {
1689 *(uint8_t *)H1(d + i) = 0;
1691 break;
1695 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1697 intptr_t opr_sz = simd_oprsz(desc);
1698 size_t n_ofs = simd_data(desc);
1699 size_t n_siz = opr_sz - n_ofs;
1701 if (vd != vm) {
1702 swap_memmove(vd, vn + n_ofs, n_siz);
1703 swap_memmove(vd + n_siz, vm, n_ofs);
1704 } else if (vd != vn) {
1705 swap_memmove(vd + n_siz, vd, n_ofs);
1706 swap_memmove(vd, vn + n_ofs, n_siz);
1707 } else {
1708 /* vd == vn == vm. Need temp space. */
1709 ARMVectorReg tmp;
1710 swap_memmove(&tmp, vm, n_ofs);
1711 swap_memmove(vd, vd + n_ofs, n_siz);
1712 memcpy(vd + n_siz, &tmp, n_ofs);
1716 #define DO_INSR(NAME, TYPE, H) \
1717 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1719 intptr_t opr_sz = simd_oprsz(desc); \
1720 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1721 *(TYPE *)(vd + H(0)) = val; \
1724 DO_INSR(sve_insr_b, uint8_t, H1)
1725 DO_INSR(sve_insr_h, uint16_t, H1_2)
1726 DO_INSR(sve_insr_s, uint32_t, H1_4)
1727 DO_INSR(sve_insr_d, uint64_t, )
1729 #undef DO_INSR
1731 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1733 intptr_t i, j, opr_sz = simd_oprsz(desc);
1734 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1735 uint64_t f = *(uint64_t *)(vn + i);
1736 uint64_t b = *(uint64_t *)(vn + j);
1737 *(uint64_t *)(vd + i) = bswap64(b);
1738 *(uint64_t *)(vd + j) = bswap64(f);
1742 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1744 intptr_t i, j, opr_sz = simd_oprsz(desc);
1745 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1746 uint64_t f = *(uint64_t *)(vn + i);
1747 uint64_t b = *(uint64_t *)(vn + j);
1748 *(uint64_t *)(vd + i) = hswap64(b);
1749 *(uint64_t *)(vd + j) = hswap64(f);
1753 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1755 intptr_t i, j, opr_sz = simd_oprsz(desc);
1756 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1757 uint64_t f = *(uint64_t *)(vn + i);
1758 uint64_t b = *(uint64_t *)(vn + j);
1759 *(uint64_t *)(vd + i) = rol64(b, 32);
1760 *(uint64_t *)(vd + j) = rol64(f, 32);
1764 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1766 intptr_t i, j, opr_sz = simd_oprsz(desc);
1767 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1768 uint64_t f = *(uint64_t *)(vn + i);
1769 uint64_t b = *(uint64_t *)(vn + j);
1770 *(uint64_t *)(vd + i) = b;
1771 *(uint64_t *)(vd + j) = f;
1775 #define DO_TBL(NAME, TYPE, H) \
1776 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1778 intptr_t i, opr_sz = simd_oprsz(desc); \
1779 uintptr_t elem = opr_sz / sizeof(TYPE); \
1780 TYPE *d = vd, *n = vn, *m = vm; \
1781 ARMVectorReg tmp; \
1782 if (unlikely(vd == vn)) { \
1783 n = memcpy(&tmp, vn, opr_sz); \
1785 for (i = 0; i < elem; i++) { \
1786 TYPE j = m[H(i)]; \
1787 d[H(i)] = j < elem ? n[H(j)] : 0; \
1791 DO_TBL(sve_tbl_b, uint8_t, H1)
1792 DO_TBL(sve_tbl_h, uint16_t, H2)
1793 DO_TBL(sve_tbl_s, uint32_t, H4)
1794 DO_TBL(sve_tbl_d, uint64_t, )
1796 #undef TBL
1798 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1799 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1801 intptr_t i, opr_sz = simd_oprsz(desc); \
1802 TYPED *d = vd; \
1803 TYPES *n = vn; \
1804 ARMVectorReg tmp; \
1805 if (unlikely(vn - vd < opr_sz)) { \
1806 n = memcpy(&tmp, n, opr_sz / 2); \
1808 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1809 d[HD(i)] = n[HS(i)]; \
1813 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1814 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1815 DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1817 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1818 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1819 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1821 #undef DO_UNPK
1823 /* Mask of bits included in the even numbered predicates of width esz.
1824 * We also use this for expand_bits/compress_bits, and so extend the
1825 * same pattern out to 16-bit units.
1827 static const uint64_t even_bit_esz_masks[5] = {
1828 0x5555555555555555ull,
1829 0x3333333333333333ull,
1830 0x0f0f0f0f0f0f0f0full,
1831 0x00ff00ff00ff00ffull,
1832 0x0000ffff0000ffffull,
1835 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1836 * For N==0, this corresponds to the operation that in qemu/bitops.h
1837 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1838 * section 7-2 Shuffling Bits.
1840 static uint64_t expand_bits(uint64_t x, int n)
1842 int i;
1844 x &= 0xffffffffu;
1845 for (i = 4; i >= n; i--) {
1846 int sh = 1 << i;
1847 x = ((x << sh) | x) & even_bit_esz_masks[i];
1849 return x;
1852 /* Compress units of 2**(N+1) bits to units of 2**N bits.
1853 * For N==0, this corresponds to the operation that in qemu/bitops.h
1854 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1855 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1857 static uint64_t compress_bits(uint64_t x, int n)
1859 int i;
1861 for (i = n; i <= 4; i++) {
1862 int sh = 1 << i;
1863 x &= even_bit_esz_masks[i];
1864 x = (x >> sh) | x;
1866 return x & 0xffffffffu;
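/*
 * Illustrative sketch (hypothetical self-check, not an upstream helper):
 * expand_bits() and compress_bits() are inverses over the low 32 bits.
 * For N == 0, bit K of the input moves to bit 2*K of the output, so
 * 0b1011 expands to 0b0100_0101 and compressing that recovers 0b1011.
 */
static void G_GNUC_UNUSED expand_compress_example(void)
{
    uint64_t x = 0xb;                        /* bits 0, 1, 3 */
    g_assert(expand_bits(x, 0) == 0x45);     /* bits 0, 2, 6 */
    g_assert(compress_bits(0x45, 0) == x);
}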
1869 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1871 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
1872 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1873 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
1874 uint64_t *d = vd;
1875 intptr_t i;
1877 if (oprsz <= 8) {
1878 uint64_t nn = *(uint64_t *)vn;
1879 uint64_t mm = *(uint64_t *)vm;
1880 int half = 4 * oprsz;
1882 nn = extract64(nn, high * half, half);
1883 mm = extract64(mm, high * half, half);
1884 nn = expand_bits(nn, esz);
1885 mm = expand_bits(mm, esz);
1886 d[0] = nn + (mm << (1 << esz));
1887 } else {
1888 ARMPredicateReg tmp_n, tmp_m;
1890 /* We produce output faster than we consume input.
1891 Therefore we must be mindful of possible overlap. */
1892 if ((vn - vd) < (uintptr_t)oprsz) {
1893 vn = memcpy(&tmp_n, vn, oprsz);
1895 if ((vm - vd) < (uintptr_t)oprsz) {
1896 vm = memcpy(&tmp_m, vm, oprsz);
1898 if (high) {
1899 high = oprsz >> 1;
1902 if ((high & 3) == 0) {
1903 uint32_t *n = vn, *m = vm;
1904 high >>= 2;
1906 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1907 uint64_t nn = n[H4(high + i)];
1908 uint64_t mm = m[H4(high + i)];
1910 nn = expand_bits(nn, esz);
1911 mm = expand_bits(mm, esz);
1912 d[i] = nn + (mm << (1 << esz));
1914 } else {
1915 uint8_t *n = vn, *m = vm;
1916 uint16_t *d16 = vd;
1918 for (i = 0; i < oprsz / 2; i++) {
1919 uint16_t nn = n[H1(high + i)];
1920 uint16_t mm = m[H1(high + i)];
1922 nn = expand_bits(nn, esz);
1923 mm = expand_bits(mm, esz);
1924 d16[H2(i)] = nn + (mm << (1 << esz));
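/*
 * Illustrative sketch (hypothetical self-check, not an upstream helper):
 * for byte elements (esz == 0), zipping two predicates is a plain
 * bitwise interleave of the selected halves, built from expand_bits:
 * n = 0b1111 and m = 0b0101 interleave to 0b0111_0111.
 */
static void G_GNUC_UNUSED zip_p_bits_example(void)
{
    uint64_t nn = expand_bits(0xf, 0);       /* 0x55 */
    uint64_t mm = expand_bits(0x5, 0);       /* 0x11 */
    g_assert(nn + (mm << 1) == 0x77);
}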
1930 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1932 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
1933 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1934 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
1935 uint64_t *d = vd, *n = vn, *m = vm;
1936 uint64_t l, h;
1937 intptr_t i;
1939 if (oprsz <= 8) {
1940 l = compress_bits(n[0] >> odd, esz);
1941 h = compress_bits(m[0] >> odd, esz);
1942 d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1943 } else {
1944 ARMPredicateReg tmp_m;
1945 intptr_t oprsz_16 = oprsz / 16;
1947 if ((vm - vd) < (uintptr_t)oprsz) {
1948 m = memcpy(&tmp_m, vm, oprsz);
1951 for (i = 0; i < oprsz_16; i++) {
1952 l = n[2 * i + 0];
1953 h = n[2 * i + 1];
1954 l = compress_bits(l >> odd, esz);
1955 h = compress_bits(h >> odd, esz);
1956 d[i] = l + (h << 32);
1959 /* For VL which is not a power of 2, the results from M do not
1960 align nicely with the uint64_t for D. Put the aligned results
1961 from M into TMP_M and then copy it into place afterward. */
1962 if (oprsz & 15) {
1963 d[i] = compress_bits(n[2 * i] >> odd, esz);
1965 for (i = 0; i < oprsz_16; i++) {
1966 l = m[2 * i + 0];
1967 h = m[2 * i + 1];
1968 l = compress_bits(l >> odd, esz);
1969 h = compress_bits(h >> odd, esz);
1970 tmp_m.p[i] = l + (h << 32);
1972 tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
1974 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
1975 } else {
1976 for (i = 0; i < oprsz_16; i++) {
1977 l = m[2 * i + 0];
1978 h = m[2 * i + 1];
1979 l = compress_bits(l >> odd, esz);
1980 h = compress_bits(h >> odd, esz);
1981 d[oprsz_16 + i] = l + (h << 32);
1987 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1989 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
1990 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1991 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
1992 uint64_t *d = vd, *n = vn, *m = vm;
1993 uint64_t mask;
1994 int shr, shl;
1995 intptr_t i;
1997 shl = 1 << esz;
1998 shr = 0;
1999 mask = even_bit_esz_masks[esz];
2000 if (odd) {
2001 mask <<= shl;
2002 shr = shl;
2003 shl = 0;
2006 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2007 uint64_t nn = (n[i] & mask) >> shr;
2008 uint64_t mm = (m[i] & mask) << shl;
2009 d[i] = nn + mm;
2013 /* Reverse units of 2**N bits. */
2014 static uint64_t reverse_bits_64(uint64_t x, int n)
2016 int i, sh;
2018 x = bswap64(x);
2019 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2020 uint64_t mask = even_bit_esz_masks[i];
2021 x = ((x & mask) << sh) | ((x >> sh) & mask);
2023 return x;
2026 static uint8_t reverse_bits_8(uint8_t x, int n)
2028 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2029 int i, sh;
2031 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2032 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2034 return x;
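/*
 * Illustrative sketch (hypothetical self-check, not an upstream helper):
 * with N == 0 the helpers above perform a full bit reversal, while a
 * larger N reverses the order of 2**N-bit groups without disturbing
 * the bits inside each group.
 */
static void G_GNUC_UNUSED reverse_bits_example(void)
{
    g_assert(reverse_bits_8(0xb1, 0) == 0x8d);  /* 10110001 -> 10001101 */
    g_assert(reverse_bits_8(0xb1, 2) == 0x1b);  /* swap the two nibbles */
}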
2037 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2039 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2040 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2041 intptr_t i, oprsz_2 = oprsz / 2;
2043 if (oprsz <= 8) {
2044 uint64_t l = *(uint64_t *)vn;
2045 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2046 *(uint64_t *)vd = l;
2047 } else if ((oprsz & 15) == 0) {
2048 for (i = 0; i < oprsz_2; i += 8) {
2049 intptr_t ih = oprsz - 8 - i;
2050 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2051 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2052 *(uint64_t *)(vd + i) = h;
2053 *(uint64_t *)(vd + ih) = l;
2055 } else {
2056 for (i = 0; i < oprsz_2; i += 1) {
2057 intptr_t il = H1(i);
2058 intptr_t ih = H1(oprsz - 1 - i);
2059 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2060 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2061 *(uint8_t *)(vd + il) = h;
2062 *(uint8_t *)(vd + ih) = l;
2067 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2069 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2070 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
2071 uint64_t *d = vd;
2072 intptr_t i;
2074 if (oprsz <= 8) {
2075 uint64_t nn = *(uint64_t *)vn;
2076 int half = 4 * oprsz;
2078 nn = extract64(nn, high * half, half);
2079 nn = expand_bits(nn, 0);
2080 d[0] = nn;
2081 } else {
2082 ARMPredicateReg tmp_n;
2084 /* We produce output faster than we consume input.
2085 Therefore we must be mindful of possible overlap. */
2086 if ((vn - vd) < (uintptr_t)oprsz) {
2087 vn = memcpy(&tmp_n, vn, oprsz);
2089 if (high) {
2090 high = oprsz >> 1;
2093 if ((high & 3) == 0) {
2094 uint32_t *n = vn;
2095 high >>= 2;
2097 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2098 uint64_t nn = n[H4(high + i)];
2099 d[i] = expand_bits(nn, 0);
2101 } else {
2102 uint16_t *d16 = vd;
2103 uint8_t *n = vn;
2105 for (i = 0; i < oprsz / 2; i++) {
2106 uint16_t nn = n[H1(high + i)];
2107 d16[H2(i)] = expand_bits(nn, 0);
2113 #define DO_ZIP(NAME, TYPE, H) \
2114 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2116 intptr_t oprsz = simd_oprsz(desc); \
2117 intptr_t i, oprsz_2 = oprsz / 2; \
2118 ARMVectorReg tmp_n, tmp_m; \
2119 /* We produce output faster than we consume input. \
2120 Therefore we must be mindful of possible overlap. */ \
2121 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2122 vn = memcpy(&tmp_n, vn, oprsz_2); \
2124 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2125 vm = memcpy(&tmp_m, vm, oprsz_2); \
2127 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2128 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2129 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2133 DO_ZIP(sve_zip_b, uint8_t, H1)
2134 DO_ZIP(sve_zip_h, uint16_t, H1_2)
2135 DO_ZIP(sve_zip_s, uint32_t, H1_4)
2136 DO_ZIP(sve_zip_d, uint64_t, )
2138 #define DO_UZP(NAME, TYPE, H) \
2139 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2141 intptr_t oprsz = simd_oprsz(desc); \
2142 intptr_t oprsz_2 = oprsz / 2; \
2143 intptr_t odd_ofs = simd_data(desc); \
2144 intptr_t i; \
2145 ARMVectorReg tmp_m; \
2146 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2147 vm = memcpy(&tmp_m, vm, oprsz); \
2149 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2150 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2152 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2153 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2157 DO_UZP(sve_uzp_b, uint8_t, H1)
2158 DO_UZP(sve_uzp_h, uint16_t, H1_2)
2159 DO_UZP(sve_uzp_s, uint32_t, H1_4)
2160 DO_UZP(sve_uzp_d, uint64_t, )
2162 #define DO_TRN(NAME, TYPE, H) \
2163 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2165 intptr_t oprsz = simd_oprsz(desc); \
2166 intptr_t odd_ofs = simd_data(desc); \
2167 intptr_t i; \
2168 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2169 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2170 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2171 *(TYPE *)(vd + H(i + 0)) = ae; \
2172 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2176 DO_TRN(sve_trn_b, uint8_t, H1)
2177 DO_TRN(sve_trn_h, uint16_t, H1_2)
2178 DO_TRN(sve_trn_s, uint32_t, H1_4)
2179 DO_TRN(sve_trn_d, uint64_t, )
2181 #undef DO_ZIP
2182 #undef DO_UZP
2183 #undef DO_TRN
2185 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2187 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2188 uint32_t *d = vd, *n = vn;
2189 uint8_t *pg = vg;
2191 for (i = j = 0; i < opr_sz; i++) {
2192 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2193 d[H4(j)] = n[H4(i)];
2194 j++;
2197 for (; j < opr_sz; j++) {
2198 d[H4(j)] = 0;
2202 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2204 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2205 uint64_t *d = vd, *n = vn;
2206 uint8_t *pg = vg;
2208 for (i = j = 0; i < opr_sz; i++) {
2209 if (pg[H1(i)] & 1) {
2210 d[j] = n[i];
2211 j++;
2214 for (; j < opr_sz; j++) {
2215 d[j] = 0;
2219 /* Similar to the ARM LastActiveElement pseudocode function, except the
2220 * result is multiplied by the element size. This includes the not found
2221 * indication; e.g. not found for esz=3 is -8.
2223 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2225 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2226 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2228 return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
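/*
 * Illustrative sketch (hypothetical caller, not an upstream helper):
 * because the return value above is a byte offset rather than an
 * element index, a single arithmetic shift recovers both the index
 * and the "none active" indication (-(1 << esz) >> esz == -1).
 */
static int32_t G_GNUC_UNUSED last_active_to_index(int32_t last_active_ofs,
                                                  int esz)
{
    return last_active_ofs >> esz;   /* -1 when no element is active */
}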
2231 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2233 intptr_t opr_sz = simd_oprsz(desc) / 8;
2234 int esz = simd_data(desc);
2235 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2236 intptr_t i, first_i, last_i;
2237 ARMVectorReg tmp;
2239 first_i = last_i = 0;
2240 first_g = last_g = 0;
2242 /* Find the extent of the active elements within VG. */
2243 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2244 pg = *(uint64_t *)(vg + i) & mask;
2245 if (pg) {
2246 if (last_g == 0) {
2247 last_g = pg;
2248 last_i = i;
2250 first_g = pg;
2251 first_i = i;
2255 len = 0;
2256 if (first_g != 0) {
2257 first_i = first_i * 8 + ctz64(first_g);
2258 last_i = last_i * 8 + 63 - clz64(last_g);
2259 len = last_i - first_i + (1 << esz);
2260 if (vd == vm) {
2261 vm = memcpy(&tmp, vm, opr_sz * 8);
2263 swap_memmove(vd, vn + first_i, len);
2265 swap_memmove(vd + len, vm, opr_sz * 8 - len);
2268 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2269 void *vg, uint32_t desc)
2271 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2272 uint64_t *d = vd, *n = vn, *m = vm;
2273 uint8_t *pg = vg;
2275 for (i = 0; i < opr_sz; i += 1) {
2276 uint64_t nn = n[i], mm = m[i];
2277 uint64_t pp = expand_pred_b(pg[H1(i)]);
2278 d[i] = (nn & pp) | (mm & ~pp);
2282 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2283 void *vg, uint32_t desc)
2285 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2286 uint64_t *d = vd, *n = vn, *m = vm;
2287 uint8_t *pg = vg;
2289 for (i = 0; i < opr_sz; i += 1) {
2290 uint64_t nn = n[i], mm = m[i];
2291 uint64_t pp = expand_pred_h(pg[H1(i)]);
2292 d[i] = (nn & pp) | (mm & ~pp);
2296 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2297 void *vg, uint32_t desc)
2299 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2300 uint64_t *d = vd, *n = vn, *m = vm;
2301 uint8_t *pg = vg;
2303 for (i = 0; i < opr_sz; i += 1) {
2304 uint64_t nn = n[i], mm = m[i];
2305 uint64_t pp = expand_pred_s(pg[H1(i)]);
2306 d[i] = (nn & pp) | (mm & ~pp);
2310 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2311 void *vg, uint32_t desc)
2313 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2314 uint64_t *d = vd, *n = vn, *m = vm;
2315 uint8_t *pg = vg;
2317 for (i = 0; i < opr_sz; i += 1) {
2318 uint64_t nn = n[i], mm = m[i];
2319 d[i] = (pg[H1(i)] & 1 ? nn : mm);
2323 /* Two-operand comparison controlled by a predicate.
2324 * ??? It is very tempting to want to be able to expand this inline
2325 * with x86 instructions, e.g.
2327 * vcmpeqw zm, zn, %ymm0
2328 * vpmovmskb %ymm0, %eax
2329 * and $0x5555, %eax
2330 * and pg, %eax
2332 * or even aarch64, e.g.
2334 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2335 * cmeq v0.8h, zn, zm
2336 * and v0.8h, v0.8h, mask
2337 * addv h0, v0.8h
2338 * and v0.8b, pg
2340 * However, coming up with an abstraction that allows vector inputs and
2341 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2342 * scalar outputs, is tricky.
2344 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
2345 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2347 intptr_t opr_sz = simd_oprsz(desc); \
2348 uint32_t flags = PREDTEST_INIT; \
2349 intptr_t i = opr_sz; \
2350 do { \
2351 uint64_t out = 0, pg; \
2352 do { \
2353 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2354 TYPE nn = *(TYPE *)(vn + H(i)); \
2355 TYPE mm = *(TYPE *)(vm + H(i)); \
2356 out |= nn OP mm; \
2357 } while (i & 63); \
2358 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2359 out &= pg; \
2360 *(uint64_t *)(vd + (i >> 3)) = out; \
2361 flags = iter_predtest_bwd(out, pg, flags); \
2362 } while (i > 0); \
2363 return flags; \
2366 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2367 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2368 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2369 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2370 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2371 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2372 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2373 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
2375 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
2376 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2377 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2378 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2380 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
2381 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2382 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2383 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2385 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
2386 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2387 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2388 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2390 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
2391 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2392 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2393 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2395 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
2396 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2397 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2398 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2400 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
2401 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2402 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2403 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2405 #undef DO_CMP_PPZZ_B
2406 #undef DO_CMP_PPZZ_H
2407 #undef DO_CMP_PPZZ_S
2408 #undef DO_CMP_PPZZ_D
2409 #undef DO_CMP_PPZZ
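/*
 * Illustrative sketch (hypothetical self-check, not an upstream helper):
 * the DO_CMP_PPZZ inner loop packs one boolean per element onto the
 * predicate bit of that element's first byte: OUT shifts left by
 * sizeof(TYPE) per element and the per-size MASK keeps only those
 * positions.  Walking two 32-bit elements from the top down, "true"
 * for element 1 and "false" for element 0 yields out == 0x10.
 */
static void G_GNUC_UNUSED cmp_ppzz_pack_example(void)
{
    uint64_t out = 0;
    out <<= sizeof(uint32_t);       /* element 1, processed first */
    out |= 1;                       /* nn OP mm was true */
    out <<= sizeof(uint32_t);       /* element 0 */
    out |= 0;                       /* nn OP mm was false */
    g_assert(out == 0x10);
}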
2411 /* Similar, but the second source is "wide". */
2412 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2413 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2415 intptr_t opr_sz = simd_oprsz(desc); \
2416 uint32_t flags = PREDTEST_INIT; \
2417 intptr_t i = opr_sz; \
2418 do { \
2419 uint64_t out = 0, pg; \
2420 do { \
2421 TYPEW mm = *(TYPEW *)(vm + i - 8); \
2422 do { \
2423 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2424 TYPE nn = *(TYPE *)(vn + H(i)); \
2425 out |= nn OP mm; \
2426 } while (i & 7); \
2427 } while (i & 63); \
2428 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2429 out &= pg; \
2430 *(uint64_t *)(vd + (i >> 3)) = out; \
2431 flags = iter_predtest_bwd(out, pg, flags); \
2432 } while (i > 0); \
2433 return flags; \
2436 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2437 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
2438 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2439 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2440 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2441 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2443 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
2444 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
2445 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
2447 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
2448 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
2449 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
2451 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
2452 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
2453 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
2455 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
2456 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
2457 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
2459 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
2460 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2461 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2463 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
2464 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2465 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2467 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
2468 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
2469 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
2471 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
2472 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
2473 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
2475 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
2476 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2477 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2479 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
2480 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2481 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2483 #undef DO_CMP_PPZW_B
2484 #undef DO_CMP_PPZW_H
2485 #undef DO_CMP_PPZW_S
2486 #undef DO_CMP_PPZW
2488 /* Similar, but the second source is immediate. */
2489 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
2490 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2492 intptr_t opr_sz = simd_oprsz(desc); \
2493 uint32_t flags = PREDTEST_INIT; \
2494 TYPE mm = simd_data(desc); \
2495 intptr_t i = opr_sz; \
2496 do { \
2497 uint64_t out = 0, pg; \
2498 do { \
2499 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2500 TYPE nn = *(TYPE *)(vn + H(i)); \
2501 out |= nn OP mm; \
2502 } while (i & 63); \
2503 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2504 out &= pg; \
2505 *(uint64_t *)(vd + (i >> 3)) = out; \
2506 flags = iter_predtest_bwd(out, pg, flags); \
2507 } while (i > 0); \
2508 return flags; \
2511 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2512 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2513 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2514 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2515 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2516 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2517 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2518 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
2520 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
2521 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
2522 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
2523 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
2525 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
2526 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
2527 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
2528 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
2530 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
2531 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
2532 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
2533 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
2535 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
2536 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
2537 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
2538 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
2540 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
2541 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
2542 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
2543 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
2545 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
2546 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
2547 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
2548 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
2550 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
2551 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
2552 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
2553 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
2555 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
2556 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
2557 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
2558 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
2560 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
2561 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
2562 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
2563 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
2565 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
2566 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
2567 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
2568 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
2570 #undef DO_CMP_PPZI_B
2571 #undef DO_CMP_PPZI_H
2572 #undef DO_CMP_PPZI_S
2573 #undef DO_CMP_PPZI_D
2574 #undef DO_CMP_PPZI
2576 /* Similar to the ARM LastActive pseudocode function. */
2577 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
2579 intptr_t i;
2581 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
2582 uint64_t pg = *(uint64_t *)(vg + i);
2583 if (pg) {
2584 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
2587 return 0;
2590 /* Compute a mask into RETB that is true for all G, up to and including
2591 * (if after) or excluding (if !after) the first G & N.
2592 * Return true if BRK found.
2594 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
2595 bool brk, bool after)
2597 uint64_t b;
2599 if (brk) {
2600 b = 0;
2601 } else if ((g & n) == 0) {
2602 /* For all G, no N are set; break not found. */
2603 b = g;
2604 } else {
2605 /* Break somewhere in N. Locate it. */
2606 b = g & n; /* guard true, pred true */
2607 b = b & -b; /* first such */
2608 if (after) {
2609 b = b | (b - 1); /* break after same */
2610 } else {
2611 b = b - 1; /* break before same */
2613 brk = true;
2616 *retb = b;
2617 return brk;
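/*
 * Illustrative sketch (hypothetical self-check, not an upstream helper):
 * with all guard bits set and the first active N bit at position 4,
 * the "break after" mask keeps bits 0..4, the "break before" mask
 * keeps bits 0..3, and both report that a break was found.
 */
static void G_GNUC_UNUSED compute_brk_example(void)
{
    uint64_t b;
    g_assert(compute_brk(&b, 0x10, 0xff, false, true));
    g_assert(b == 0x1f);
    g_assert(compute_brk(&b, 0x10, 0xff, false, false));
    g_assert(b == 0x0f);
}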
2620 /* Compute a zeroing BRK. */
2621 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
2622 intptr_t oprsz, bool after)
2624 bool brk = false;
2625 intptr_t i;
2627 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2628 uint64_t this_b, this_g = g[i];
2630 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2631 d[i] = this_b & this_g;
2635 /* Likewise, but also compute flags. */
2636 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2637 intptr_t oprsz, bool after)
2639 uint32_t flags = PREDTEST_INIT;
2640 bool brk = false;
2641 intptr_t i;
2643 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2644 uint64_t this_b, this_d, this_g = g[i];
2646 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2647 d[i] = this_d = this_b & this_g;
2648 flags = iter_predtest_fwd(this_d, this_g, flags);
2650 return flags;
2653 /* Compute a merging BRK. */
2654 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
2655 intptr_t oprsz, bool after)
2657 bool brk = false;
2658 intptr_t i;
2660 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2661 uint64_t this_b, this_g = g[i];
2663 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2664 d[i] = (this_b & this_g) | (d[i] & ~this_g);
2668 /* Likewise, but also compute flags. */
2669 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2670 intptr_t oprsz, bool after)
2672 uint32_t flags = PREDTEST_INIT;
2673 bool brk = false;
2674 intptr_t i;
2676 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2677 uint64_t this_b, this_d = d[i], this_g = g[i];
2679 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2680 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2681 flags = iter_predtest_fwd(this_d, this_g, flags);
2683 return flags;
2686 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2688 /* It is quicker to zero the whole predicate than loop on OPRSZ.
2689 * The compiler should turn this into 4 64-bit integer stores.
2691 memset(d, 0, sizeof(ARMPredicateReg));
2692 return PREDTEST_INIT;
2695 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2696 uint32_t pred_desc)
2698 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2699 if (last_active_pred(vn, vg, oprsz)) {
2700 compute_brk_z(vd, vm, vg, oprsz, true);
2701 } else {
2702 do_zero(vd, oprsz);
2706 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2707 uint32_t pred_desc)
2709 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2710 if (last_active_pred(vn, vg, oprsz)) {
2711 return compute_brks_z(vd, vm, vg, oprsz, true);
2712 } else {
2713 return do_zero(vd, oprsz);
2717 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2718 uint32_t pred_desc)
2720 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2721 if (last_active_pred(vn, vg, oprsz)) {
2722 compute_brk_z(vd, vm, vg, oprsz, false);
2723 } else {
2724 do_zero(vd, oprsz);
2728 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2729 uint32_t pred_desc)
2731 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2732 if (last_active_pred(vn, vg, oprsz)) {
2733 return compute_brks_z(vd, vm, vg, oprsz, false);
2734 } else {
2735 return do_zero(vd, oprsz);
2739 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2741 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2742 compute_brk_z(vd, vn, vg, oprsz, true);
2745 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2747 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2748 return compute_brks_z(vd, vn, vg, oprsz, true);
2751 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2753 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2754 compute_brk_z(vd, vn, vg, oprsz, false);
2757 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2759 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2760 return compute_brks_z(vd, vn, vg, oprsz, false);
2763 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2765 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2766 compute_brk_m(vd, vn, vg, oprsz, true);
2769 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2771 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2772 return compute_brks_m(vd, vn, vg, oprsz, true);
2775 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2777 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2778 compute_brk_m(vd, vn, vg, oprsz, false);
2781 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2783 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2784 return compute_brks_m(vd, vn, vg, oprsz, false);
2787 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2789 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2791 if (!last_active_pred(vn, vg, oprsz)) {
2792 do_zero(vd, oprsz);
2796 /* As if PredTest(Ones(PL), D, esz). */
2797 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2798 uint64_t esz_mask)
2800 uint32_t flags = PREDTEST_INIT;
2801 intptr_t i;
2803 for (i = 0; i < oprsz / 8; i++) {
2804 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2806 if (oprsz & 7) {
2807 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2808 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2810 return flags;
2813 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2815 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2817 if (last_active_pred(vn, vg, oprsz)) {
2818 return predtest_ones(vd, oprsz, -1);
2819 } else {
2820 return do_zero(vd, oprsz);
2824 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2826 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2827 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2828 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2829 intptr_t i;
2831 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2832 uint64_t t = n[i] & g[i] & mask;
2833 sum += ctpop64(t);
2835 return sum;
2838 uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2840 uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2841 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2842 uint64_t esz_mask = pred_esz_masks[esz];
2843 ARMPredicateReg *d = vd;
2844 uint32_t flags;
2845 intptr_t i;
2847 /* Begin with a zero predicate register. */
2848 flags = do_zero(d, oprsz);
2849 if (count == 0) {
2850 return flags;
2853 /* Set all of the requested bits. */
2854 for (i = 0; i < count / 64; ++i) {
2855 d->p[i] = esz_mask;
2857 if (count & 63) {
2858 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2861 return predtest_ones(d, oprsz, esz_mask);
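/*
 * Illustrative sketch (hypothetical self-check, not an upstream helper):
 * COUNT is measured in predicate bits (elements << esz).  For esz == 2
 * and count == 12, MAKE_64BIT_MASK(0, 12) & pred_esz_masks[2] leaves
 * bits 0, 4 and 8 set, i.e. the first three 32-bit elements active.
 */
static void G_GNUC_UNUSED while_mask_example(void)
{
    g_assert((MAKE_64BIT_MASK(0, 12) & pred_esz_masks[2]) == 0x111);
}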
2864 /* Recursive reduction using a binary function;
2865 * cf. the ARM ARM function ReducePredicated.
2867 * While it would be possible to write this without the DATA temporary,
2868 * it is much simpler to process the predicate register this way.
2869 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
2870 * little to gain with a more complex non-recursive form.
2872 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
2873 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
2875 if (n == 1) { \
2876 return *data; \
2877 } else { \
2878 uintptr_t half = n / 2; \
2879 TYPE lo = NAME##_reduce(data, status, half); \
2880 TYPE hi = NAME##_reduce(data + half, status, half); \
2881 return TYPE##_##FUNC(lo, hi, status); \
2884 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
2886 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc); \
2887 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
2888 for (i = 0; i < oprsz; ) { \
2889 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2890 do { \
2891 TYPE nn = *(TYPE *)(vn + H(i)); \
2892 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
2893 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2894 } while (i & 15); \
2896 for (; i < maxsz; i += sizeof(TYPE)) { \
2897 *(TYPE *)((void *)data + i) = IDENT; \
2899 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
2902 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
2903 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
2904 DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
2906 /* Identity is floatN_default_nan, without the function call. */
2907 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
2908 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
2909 DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
2911 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
2912 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
2913 DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
2915 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
2916 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
2917 DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
2919 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
2920 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
2921 DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
2923 #undef DO_REDUCE
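/*
 * Illustrative sketch (scalar model, not an upstream helper): the
 * DO_REDUCE recursion combines lanes as a balanced tree, e.g. four
 * lanes reduce as (d0 op d1) op (d2 op d3), with IDENT filling any
 * lane that is predicated off or lies beyond the vector length.
 * Plain double addition stands in for TYPE##_FUNC here.
 */
static double G_GNUC_UNUSED reduce_tree_model(const double *data, size_t n)
{
    if (n == 1) {
        return data[0];
    } else {
        size_t half = n / 2;    /* n is a power of two, as in DO_REDUCE */
        return reduce_tree_model(data, half)
             + reduce_tree_model(data + half, half);
    }
}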
2925 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
2926 void *status, uint32_t desc)
2928 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2929 float16 result = nn;
2931 do {
2932 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2933 do {
2934 if (pg & 1) {
2935 float16 mm = *(float16 *)(vm + H1_2(i));
2936 result = float16_add(result, mm, status);
2938 i += sizeof(float16), pg >>= sizeof(float16);
2939 } while (i & 15);
2940 } while (i < opr_sz);
2942 return result;
2945 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
2946 void *status, uint32_t desc)
2948 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2949 float32 result = nn;
2951 do {
2952 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2953 do {
2954 if (pg & 1) {
2955 float32 mm = *(float32 *)(vm + H1_2(i));
2956 result = float32_add(result, mm, status);
2958 i += sizeof(float32), pg >>= sizeof(float32);
2959 } while (i & 15);
2960 } while (i < opr_sz);
2962 return result;
2965 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
2966 void *status, uint32_t desc)
2968 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
2969 uint64_t *m = vm;
2970 uint8_t *pg = vg;
2972 for (i = 0; i < opr_sz; i++) {
2973 if (pg[H1(i)] & 1) {
2974 nn = float64_add(nn, m[i], status);
2978 return nn;
2981 /* Fully general three-operand expander, controlled by a predicate,
2982 * with the extra float_status parameter.
2984 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
2985 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
2986 void *status, uint32_t desc) \
2988 intptr_t i = simd_oprsz(desc); \
2989 uint64_t *g = vg; \
2990 do { \
2991 uint64_t pg = g[(i - 1) >> 6]; \
2992 do { \
2993 i -= sizeof(TYPE); \
2994 if (likely((pg >> (i & 63)) & 1)) { \
2995 TYPE nn = *(TYPE *)(vn + H(i)); \
2996 TYPE mm = *(TYPE *)(vm + H(i)); \
2997 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
2999 } while (i & 63); \
3000 } while (i != 0); \
3003 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3004 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3005 DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
3007 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3008 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3009 DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
3011 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3012 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3013 DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
3015 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3016 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3017 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
3019 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3020 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3021 DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
3023 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3024 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3025 DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
3027 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3028 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3029 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
3031 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3032 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3033 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3035 static inline float16 abd_h(float16 a, float16 b, float_status *s)
3037 return float16_abs(float16_sub(a, b, s));
3040 static inline float32 abd_s(float32 a, float32 b, float_status *s)
3042 return float32_abs(float32_sub(a, b, s));
3045 static inline float64 abd_d(float64 a, float64 b, float_status *s)
3047 return float64_abs(float64_sub(a, b, s));
3050 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
3051 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
3052 DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
3054 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3056 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3057 return float64_scalbn(a, b_int, s);
3060 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
3061 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
3062 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
3064 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
3065 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
3066 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
3068 #undef DO_ZPZZ_FP
3070 /* Three-operand expander, with one scalar operand, controlled by
3071 * a predicate, with the extra float_status parameter.
3073 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3074 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
3075 void *status, uint32_t desc) \
3077 intptr_t i = simd_oprsz(desc); \
3078 uint64_t *g = vg; \
3079 TYPE mm = scalar; \
3080 do { \
3081 uint64_t pg = g[(i - 1) >> 6]; \
3082 do { \
3083 i -= sizeof(TYPE); \
3084 if (likely((pg >> (i & 63)) & 1)) { \
3085 TYPE nn = *(TYPE *)(vn + H(i)); \
3086 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3088 } while (i & 63); \
3089 } while (i != 0); \
3092 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
3093 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
3094 DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
3096 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
3097 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
3098 DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
3100 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
3101 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
3102 DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
3104 static inline float16 subr_h(float16 a, float16 b, float_status *s)
3106 return float16_sub(b, a, s);
3109 static inline float32 subr_s(float32 a, float32 b, float_status *s)
3111 return float32_sub(b, a, s);
3114 static inline float64 subr_d(float64 a, float64 b, float_status *s)
3116 return float64_sub(b, a, s);
3119 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
3120 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
3121 DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
3123 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
3124 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
3125 DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
3127 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
3128 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
3129 DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
3131 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
3132 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
3133 DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
3135 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
3136 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
3137 DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
3139 /* Fully general two-operand expander, controlled by a predicate,
3140 * with the extra float_status parameter.
3142 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
3143 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3145 intptr_t i = simd_oprsz(desc); \
3146 uint64_t *g = vg; \
3147 do { \
3148 uint64_t pg = g[(i - 1) >> 6]; \
3149 do { \
3150 i -= sizeof(TYPE); \
3151 if (likely((pg >> (i & 63)) & 1)) { \
3152 TYPE nn = *(TYPE *)(vn + H(i)); \
3153 *(TYPE *)(vd + H(i)) = OP(nn, status); \
3155 } while (i & 63); \
3156 } while (i != 0); \
3159 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
3160 * FZ16. When converting from fp16, this affects flushing input denormals;
3161 * when converting to fp16, this affects flushing output denormals.
3163 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
3165 bool save = get_flush_inputs_to_zero(fpst);
3166 float32 ret;
3168 set_flush_inputs_to_zero(false, fpst);
3169 ret = float16_to_float32(f, true, fpst);
3170 set_flush_inputs_to_zero(save, fpst);
3171 return ret;
3174 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
3176 bool save = get_flush_inputs_to_zero(fpst);
3177 float64 ret;
3179 set_flush_inputs_to_zero(false, fpst);
3180 ret = float16_to_float64(f, true, fpst);
3181 set_flush_inputs_to_zero(save, fpst);
3182 return ret;
3185 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
3187 bool save = get_flush_to_zero(fpst);
3188 float16 ret;
3190 set_flush_to_zero(false, fpst);
3191 ret = float32_to_float16(f, true, fpst);
3192 set_flush_to_zero(save, fpst);
3193 return ret;
3196 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
3198 bool save = get_flush_to_zero(fpst);
3199 float16 ret;
3201 set_flush_to_zero(false, fpst);
3202 ret = float64_to_float16(f, true, fpst);
3203 set_flush_to_zero(save, fpst);
3204 return ret;
3207 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3209 if (float16_is_any_nan(f)) {
3210 float_raise(float_flag_invalid, s);
3211 return 0;
3213 return float16_to_int16_round_to_zero(f, s);
3216 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3218 if (float16_is_any_nan(f)) {
3219 float_raise(float_flag_invalid, s);
3220 return 0;
3222 return float16_to_int64_round_to_zero(f, s);
3225 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3227 if (float32_is_any_nan(f)) {
3228 float_raise(float_flag_invalid, s);
3229 return 0;
3231 return float32_to_int64_round_to_zero(f, s);
3234 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3236 if (float64_is_any_nan(f)) {
3237 float_raise(float_flag_invalid, s);
3238 return 0;
3240 return float64_to_int64_round_to_zero(f, s);
3243 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3245 if (float16_is_any_nan(f)) {
3246 float_raise(float_flag_invalid, s);
3247 return 0;
3249 return float16_to_uint16_round_to_zero(f, s);
3252 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3254 if (float16_is_any_nan(f)) {
3255 float_raise(float_flag_invalid, s);
3256 return 0;
3258 return float16_to_uint64_round_to_zero(f, s);
3261 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3263 if (float32_is_any_nan(f)) {
3264 float_raise(float_flag_invalid, s);
3265 return 0;
3267 return float32_to_uint64_round_to_zero(f, s);
3270 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3272 if (float64_is_any_nan(f)) {
3273 float_raise(float_flag_invalid, s);
3274 return 0;
3276 return float64_to_uint64_round_to_zero(f, s);
3279 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
3280 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
3281 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
3282 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
3283 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
3284 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
3286 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
3287 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
3288 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
3289 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
3290 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
3291 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
3292 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
3294 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
3295 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
3296 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
3297 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
3298 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
3299 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
3300 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
3302 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
3303 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
3304 DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
3306 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
3307 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
3308 DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
3310 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
3311 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
3312 DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
3314 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
3315 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
3316 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
3318 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
3319 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
3320 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
3321 DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
3322 DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
3323 DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
3324 DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
3326 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
3327 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
3328 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
3329 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
3330 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
3331 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
3332 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
3334 #undef DO_ZPZ_FP
3336 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
3337 float_status *status, uint32_t desc,
3338 uint16_t neg1, uint16_t neg3)
3340 intptr_t i = simd_oprsz(desc);
3341 uint64_t *g = vg;
3343 do {
3344 uint64_t pg = g[(i - 1) >> 6];
3345 do {
3346 i -= 2;
3347 if (likely((pg >> (i & 63)) & 1)) {
3348 float16 e1, e2, e3, r;
3350 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3351 e2 = *(uint16_t *)(vm + H1_2(i));
3352 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
3353 r = float16_muladd(e1, e2, e3, 0, status);
3354 *(uint16_t *)(vd + H1_2(i)) = r;
3356 } while (i & 63);
3357 } while (i != 0);
3360 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3361 void *vg, void *status, uint32_t desc)
3363 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
3366 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3367 void *vg, void *status, uint32_t desc)
3369 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
3372 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3373 void *vg, void *status, uint32_t desc)
3375 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
3378 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3379 void *vg, void *status, uint32_t desc)
3381 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
3384 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
3385 float_status *status, uint32_t desc,
3386 uint32_t neg1, uint32_t neg3)
3388 intptr_t i = simd_oprsz(desc);
3389 uint64_t *g = vg;
3391 do {
3392 uint64_t pg = g[(i - 1) >> 6];
3393 do {
3394 i -= 4;
3395 if (likely((pg >> (i & 63)) & 1)) {
3396 float32 e1, e2, e3, r;
3398 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3399 e2 = *(uint32_t *)(vm + H1_4(i));
3400 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
3401 r = float32_muladd(e1, e2, e3, 0, status);
3402 *(uint32_t *)(vd + H1_4(i)) = r;
3404 } while (i & 63);
3405 } while (i != 0);
3408 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3409 void *vg, void *status, uint32_t desc)
3411 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
3414 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3415 void *vg, void *status, uint32_t desc)
3417 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
3420 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3421 void *vg, void *status, uint32_t desc)
3423 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
3426 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3427 void *vg, void *status, uint32_t desc)
3429 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
3432 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
3433 float_status *status, uint32_t desc,
3434 uint64_t neg1, uint64_t neg3)
3436 intptr_t i = simd_oprsz(desc);
3437 uint64_t *g = vg;
3439 do {
3440 uint64_t pg = g[(i - 1) >> 6];
3441 do {
3442 i -= 8;
3443 if (likely((pg >> (i & 63)) & 1)) {
3444 float64 e1, e2, e3, r;
3446 e1 = *(uint64_t *)(vn + i) ^ neg1;
3447 e2 = *(uint64_t *)(vm + i);
3448 e3 = *(uint64_t *)(va + i) ^ neg3;
3449 r = float64_muladd(e1, e2, e3, 0, status);
3450 *(uint64_t *)(vd + i) = r;
3452 } while (i & 63);
3453 } while (i != 0);
3456 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3457 void *vg, void *status, uint32_t desc)
3459 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
3462 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3463 void *vg, void *status, uint32_t desc)
3465 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
3468 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3469 void *vg, void *status, uint32_t desc)
3471 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
3474 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3475 void *vg, void *status, uint32_t desc)
3477 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
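/*
 * Illustrative sketch (hypothetical self-check, not an upstream helper):
 * NEG1/NEG3 are XORed into the first and third operands, so FMLS,
 * FNMLA and FNMLS are all FMLA with selected sign bits flipped.
 * Flipping the sign bit is an exact negation of the bit pattern,
 * identical to floatN_chs() on the operand.
 */
static void G_GNUC_UNUSED fmla_neg_example(void)
{
    float64 one = float64_one;
    g_assert((one ^ (uint64_t)INT64_MIN) == float64_chs(one));
}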
3480 /* Two-operand floating-point comparison controlled by a predicate.
3481 * Unlike the integer version, we are not allowed to optimistically
3482 * compare operands, since the comparison may have side effects with
3483 * respect to the FPSR.
3485 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
3486 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3487 void *status, uint32_t desc) \
3489 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3490 uint64_t *d = vd, *g = vg; \
3491 do { \
3492 uint64_t out = 0, pg = g[j]; \
3493 do { \
3494 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3495 if (likely((pg >> (i & 63)) & 1)) { \
3496 TYPE nn = *(TYPE *)(vn + H(i)); \
3497 TYPE mm = *(TYPE *)(vm + H(i)); \
3498 out |= OP(TYPE, nn, mm, status); \
3500 } while (i & 63); \
3501 d[j--] = out; \
3502 } while (i > 0); \
3505 #define DO_FPCMP_PPZZ_H(NAME, OP) \
3506 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
3507 #define DO_FPCMP_PPZZ_S(NAME, OP) \
3508 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
3509 #define DO_FPCMP_PPZZ_D(NAME, OP) \
3510 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
3512 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
3513 DO_FPCMP_PPZZ_H(NAME, OP) \
3514 DO_FPCMP_PPZZ_S(NAME, OP) \
3515 DO_FPCMP_PPZZ_D(NAME, OP)
3517 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
3518 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
3519 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
3520 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
3521 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
3522 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
3523 #define DO_FCMUO(TYPE, X, Y, ST) \
3524 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
3525 #define DO_FACGE(TYPE, X, Y, ST) \
3526 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
3527 #define DO_FACGT(TYPE, X, Y, ST) \
3528 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
3530 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
3531 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
3532 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
3533 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
3534 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
3535 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
3536 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
3538 #undef DO_FPCMP_PPZZ_ALL
3539 #undef DO_FPCMP_PPZZ_D
3540 #undef DO_FPCMP_PPZZ_S
3541 #undef DO_FPCMP_PPZZ_H
3542 #undef DO_FPCMP_PPZZ
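/*
 * Illustrative sketch (hypothetical self-check, not an upstream helper):
 * the ordered predicates above use TYPE##_compare (signaling), while
 * FCMEQ/FCMNE/FCMUO use TYPE##_compare_quiet, so a quiet NaN input
 * satisfies FCMUO and FCMNE but none of the ordered comparisons.
 */
static void G_GNUC_UNUSED fp_compare_nan_example(float_status *status)
{
    float32 qnan = float32_default_nan(status);
    g_assert(float32_compare_quiet(qnan, float32_one, status)
             == float_relation_unordered);
    g_assert(!(float32_compare(qnan, float32_one, status) <= 0));
}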
3544 /* One-operand floating-point comparison against zero, controlled
3545 * by a predicate.
3547 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
3548 void HELPER(NAME)(void *vd, void *vn, void *vg, \
3549 void *status, uint32_t desc) \
3551 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3552 uint64_t *d = vd, *g = vg; \
3553 do { \
3554 uint64_t out = 0, pg = g[j]; \
3555 do { \
3556 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3557 if ((pg >> (i & 63)) & 1) { \
3558 TYPE nn = *(TYPE *)(vn + H(i)); \
3559 out |= OP(TYPE, nn, 0, status); \
3561 } while (i & 63); \
3562 d[j--] = out; \
3563 } while (i > 0); \
3566 #define DO_FPCMP_PPZ0_H(NAME, OP) \
3567 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
3568 #define DO_FPCMP_PPZ0_S(NAME, OP) \
3569 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
3570 #define DO_FPCMP_PPZ0_D(NAME, OP) \
3571 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
3573 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
3574 DO_FPCMP_PPZ0_H(NAME, OP) \
3575 DO_FPCMP_PPZ0_S(NAME, OP) \
3576 DO_FPCMP_PPZ0_D(NAME, OP)
3578 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
3579 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
3580 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
3581 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
3582 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
3583 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3585 /* FP Trig Multiply-Add. */
3587 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3589 static const float16 coeff[16] = {
3590 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3591 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3593 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
3594 intptr_t x = simd_data(desc);
3595 float16 *d = vd, *n = vn, *m = vm;
3596 for (i = 0; i < opr_sz; i++) {
3597 float16 mm = m[i];
3598 intptr_t xx = x;
3599 if (float16_is_neg(mm)) {
3600 mm = float16_abs(mm);
3601 xx += 8;
3603 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
3607 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3609 static const float32 coeff[16] = {
3610 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3611 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3612 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3613 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3615 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
3616 intptr_t x = simd_data(desc);
3617 float32 *d = vd, *n = vn, *m = vm;
3618 for (i = 0; i < opr_sz; i++) {
3619 float32 mm = m[i];
3620 intptr_t xx = x;
3621 if (float32_is_neg(mm)) {
3622 mm = float32_abs(mm);
3623 xx += 8;
3625 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
3629 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3631 static const float64 coeff[16] = {
3632 0x3ff0000000000000ull, 0xbfc5555555555543ull,
3633 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
3634 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
3635 0x3de5d8408868552full, 0x0000000000000000ull,
3636 0x3ff0000000000000ull, 0xbfe0000000000000ull,
3637 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
3638 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
3639 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
3641 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
3642 intptr_t x = simd_data(desc);
3643 float64 *d = vd, *n = vn, *m = vm;
3644 for (i = 0; i < opr_sz; i++) {
3645 float64 mm = m[i];
3646 intptr_t xx = x;
3647 if (float64_is_neg(mm)) {
3648 mm = float64_abs(mm);
3649 xx += 8;
3651 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
3656 * FP Complex Add
3659 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
3660 void *vs, uint32_t desc)
3662 intptr_t j, i = simd_oprsz(desc);
3663 uint64_t *g = vg;
3664 float16 neg_imag = float16_set_sign(0, simd_data(desc));
3665 float16 neg_real = float16_chs(neg_imag);
3667 do {
3668 uint64_t pg = g[(i - 1) >> 6];
3669 do {
3670 float16 e0, e1, e2, e3;
3672 /* I holds the real index; J holds the imag index. */
3673 j = i - sizeof(float16);
3674 i -= 2 * sizeof(float16);
3676 e0 = *(float16 *)(vn + H1_2(i));
3677 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
3678 e2 = *(float16 *)(vn + H1_2(j));
3679 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
3681 if (likely((pg >> (i & 63)) & 1)) {
3682 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
3684 if (likely((pg >> (j & 63)) & 1)) {
3685 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
3687 } while (i & 63);
3688 } while (i != 0);
3691 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
3692 void *vs, uint32_t desc)
3694 intptr_t j, i = simd_oprsz(desc);
3695 uint64_t *g = vg;
3696 float32 neg_imag = float32_set_sign(0, simd_data(desc));
3697 float32 neg_real = float32_chs(neg_imag);
3699 do {
3700 uint64_t pg = g[(i - 1) >> 6];
3701 do {
3702 float32 e0, e1, e2, e3;
3704 /* I holds the real index; J holds the imag index. */
3705 j = i - sizeof(float32);
3706 i -= 2 * sizeof(float32);
3708 e0 = *(float32 *)(vn + H1_2(i));
3709 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
3710 e2 = *(float32 *)(vn + H1_2(j));
3711 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
3713 if (likely((pg >> (i & 63)) & 1)) {
3714 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
3716 if (likely((pg >> (j & 63)) & 1)) {
3717 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
3719 } while (i & 63);
3720 } while (i != 0);
3723 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
3724 void *vs, uint32_t desc)
3726 intptr_t j, i = simd_oprsz(desc);
3727 uint64_t *g = vg;
3728 float64 neg_imag = float64_set_sign(0, simd_data(desc));
3729 float64 neg_real = float64_chs(neg_imag);
3731 do {
3732 uint64_t pg = g[(i - 1) >> 6];
3733 do {
3734 float64 e0, e1, e2, e3;
3736 /* I holds the real index; J holds the imag index. */
3737 j = i - sizeof(float64);
3738 i -= 2 * sizeof(float64);
3740 e0 = *(float64 *)(vn + H1_2(i));
3741 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
3742 e2 = *(float64 *)(vn + H1_2(j));
3743 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
3745 if (likely((pg >> (i & 63)) & 1)) {
3746 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
3748 if (likely((pg >> (j & 63)) & 1)) {
3749 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
3751 } while (i & 63);
3752 } while (i != 0);
3756 * FP Complex Multiply
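/*
 * Illustrative decode of the rotation immediate, derived from the code
 * below rather than the original comments: rot is the two-bit FCMLA
 * rotation, 0..3 for #0/#90/#180/#270.  Bit 0 ("flip") selects whether
 * the real or imaginary part of Zn feeds both products, and the two sign
 * constants negate the Zm factors, giving per element pair roughly:
 *     #0  : d[re] += nr*mr,  d[im] += nr*mi
 *     #90 : d[re] -= ni*mi,  d[im] += ni*mr
 *     #180: d[re] -= nr*mr,  d[im] -= nr*mi
 *     #270: d[re] += ni*mi,  d[im] -= ni*mr
 */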
3759 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3760 void *vg, void *status, uint32_t desc)
3762 intptr_t j, i = simd_oprsz(desc);
3763 unsigned rot = simd_data(desc);
3764 bool flip = rot & 1;
3765 float16 neg_imag, neg_real;
3766 uint64_t *g = vg;
3768 neg_imag = float16_set_sign(0, (rot & 2) != 0);
3769 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
3771 do {
3772 uint64_t pg = g[(i - 1) >> 6];
3773 do {
3774 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
3776 /* I holds the real index; J holds the imag index. */
3777 j = i - sizeof(float16);
3778 i -= 2 * sizeof(float16);
3780 nr = *(float16 *)(vn + H1_2(i));
3781 ni = *(float16 *)(vn + H1_2(j));
3782 mr = *(float16 *)(vm + H1_2(i));
3783 mi = *(float16 *)(vm + H1_2(j));
3785 e2 = (flip ? ni : nr);
3786 e1 = (flip ? mi : mr) ^ neg_real;
3787 e4 = e2;
3788 e3 = (flip ? mr : mi) ^ neg_imag;
3790 if (likely((pg >> (i & 63)) & 1)) {
3791 d = *(float16 *)(va + H1_2(i));
3792 d = float16_muladd(e2, e1, d, 0, status);
3793 *(float16 *)(vd + H1_2(i)) = d;
3795 if (likely((pg >> (j & 63)) & 1)) {
3796 d = *(float16 *)(va + H1_2(j));
3797 d = float16_muladd(e4, e3, d, 0, status);
3798 *(float16 *)(vd + H1_2(j)) = d;
3800 } while (i & 63);
3801 } while (i != 0);
3804 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3805 void *vg, void *status, uint32_t desc)
3807 intptr_t j, i = simd_oprsz(desc);
3808 unsigned rot = simd_data(desc);
3809 bool flip = rot & 1;
3810 float32 neg_imag, neg_real;
3811 uint64_t *g = vg;
3813 neg_imag = float32_set_sign(0, (rot & 2) != 0);
3814 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
3816 do {
3817 uint64_t pg = g[(i - 1) >> 6];
3818 do {
3819 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
3821 /* I holds the real index; J holds the imag index. */
3822 j = i - sizeof(float32);
3823 i -= 2 * sizeof(float32);
3825 nr = *(float32 *)(vn + H1_2(i));
3826 ni = *(float32 *)(vn + H1_2(j));
3827 mr = *(float32 *)(vm + H1_2(i));
3828 mi = *(float32 *)(vm + H1_2(j));
3830 e2 = (flip ? ni : nr);
3831 e1 = (flip ? mi : mr) ^ neg_real;
3832 e4 = e2;
3833 e3 = (flip ? mr : mi) ^ neg_imag;
3835 if (likely((pg >> (i & 63)) & 1)) {
3836 d = *(float32 *)(va + H1_2(i));
3837 d = float32_muladd(e2, e1, d, 0, status);
3838 *(float32 *)(vd + H1_2(i)) = d;
3840 if (likely((pg >> (j & 63)) & 1)) {
3841 d = *(float32 *)(va + H1_2(j));
3842 d = float32_muladd(e4, e3, d, 0, status);
3843 *(float32 *)(vd + H1_2(j)) = d;
3845 } while (i & 63);
3846 } while (i != 0);
3849 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3850 void *vg, void *status, uint32_t desc)
3852 intptr_t j, i = simd_oprsz(desc);
3853 unsigned rot = simd_data(desc);
3854 bool flip = rot & 1;
3855 float64 neg_imag, neg_real;
3856 uint64_t *g = vg;
3858 neg_imag = float64_set_sign(0, (rot & 2) != 0);
3859 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
3861 do {
3862 uint64_t pg = g[(i - 1) >> 6];
3863 do {
3864 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
3866 /* I holds the real index; J holds the imag index. */
3867 j = i - sizeof(float64);
3868 i -= 2 * sizeof(float64);
3870 nr = *(float64 *)(vn + H1_2(i));
3871 ni = *(float64 *)(vn + H1_2(j));
3872 mr = *(float64 *)(vm + H1_2(i));
3873 mi = *(float64 *)(vm + H1_2(j));
3875 e2 = (flip ? ni : nr);
3876 e1 = (flip ? mi : mr) ^ neg_real;
3877 e4 = e2;
3878 e3 = (flip ? mr : mi) ^ neg_imag;
3880 if (likely((pg >> (i & 63)) & 1)) {
3881 d = *(float64 *)(va + H1_2(i));
3882 d = float64_muladd(e2, e1, d, 0, status);
3883 *(float64 *)(vd + H1_2(i)) = d;
3885 if (likely((pg >> (j & 63)) & 1)) {
3886 d = *(float64 *)(va + H1_2(j));
3887 d = float64_muladd(e4, e3, d, 0, status);
3888 *(float64 *)(vd + H1_2(j)) = d;
3890 } while (i & 63);
3891 } while (i != 0);
3895 * Load contiguous data, protected by a governing predicate.
3899 * Load one element into @vd + @reg_off from @host.
3900 * The controlling predicate is known to be true.
3902 typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);
3905 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
3906 * The controlling predicate is known to be true.
3908 typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
3909 target_ulong vaddr, uintptr_t retaddr);
3912 * Generate the above primitives.
3915 #define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
3916 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
3918 TYPEM val = HOST(host); \
3919 *(TYPEE *)(vd + H(reg_off)) = val; \
3922 #define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
3923 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
3924 { HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }
3926 #define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
3927 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
3928 target_ulong addr, uintptr_t ra) \
3930 *(TYPEE *)(vd + H(reg_off)) = \
3931 (TYPEM)TLB(env, useronly_clean_ptr(addr), ra); \
3934 #define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
3935 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
3936 target_ulong addr, uintptr_t ra) \
3938 TLB(env, useronly_clean_ptr(addr), \
3939 (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \
3942 #define DO_LD_PRIM_1(NAME, H, TE, TM) \
3943 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
3944 DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
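/*
 * For illustration, DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t) below
 * expands to approximately the following pair of functions, i.e. "load
 * one byte and zero-extend it into a 16-bit element":
 *
 *   static void sve_ld1bhu_host(void *vd, intptr_t reg_off, void *host)
 *   {
 *       uint8_t val = ldub_p(host);
 *       *(uint16_t *)(vd + H1_2(reg_off)) = val;
 *   }
 *   static void sve_ld1bhu_tlb(CPUARMState *env, void *vd,
 *                              intptr_t reg_off, target_ulong addr,
 *                              uintptr_t ra)
 *   {
 *       *(uint16_t *)(vd + H1_2(reg_off)) =
 *           (uint8_t)cpu_ldub_data_ra(env, useronly_clean_ptr(addr), ra);
 *   }
 */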
3946 DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
3947 DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
3948 DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
3949 DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
3950 DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
3951 DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t)
3952 DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t)
3954 #define DO_ST_PRIM_1(NAME, H, TE, TM) \
3955 DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \
3956 DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
3958 DO_ST_PRIM_1(bb, H1, uint8_t, uint8_t)
3959 DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
3960 DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
3961 DO_ST_PRIM_1(bd, , uint64_t, uint8_t)
3963 #define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
3964 DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \
3965 DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \
3966 DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
3967 DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
3969 #define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
3970 DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p) \
3971 DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p) \
3972 DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
3973 DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
3975 DO_LD_PRIM_2(hh, H1_2, uint16_t, uint16_t, lduw)
3976 DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
3977 DO_LD_PRIM_2(hss, H1_4, uint32_t, int16_t, lduw)
3978 DO_LD_PRIM_2(hdu, , uint64_t, uint16_t, lduw)
3979 DO_LD_PRIM_2(hds, , uint64_t, int16_t, lduw)
3981 DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
3982 DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
3983 DO_ST_PRIM_2(hd, , uint64_t, uint16_t, stw)
3985 DO_LD_PRIM_2(ss, H1_4, uint32_t, uint32_t, ldl)
3986 DO_LD_PRIM_2(sdu, , uint64_t, uint32_t, ldl)
3987 DO_LD_PRIM_2(sds, , uint64_t, int32_t, ldl)
3989 DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
3990 DO_ST_PRIM_2(sd, , uint64_t, uint32_t, stl)
3992 DO_LD_PRIM_2(dd, , uint64_t, uint64_t, ldq)
3993 DO_ST_PRIM_2(dd, , uint64_t, uint64_t, stq)
3995 #undef DO_LD_TLB
3996 #undef DO_ST_TLB
3997 #undef DO_LD_HOST
3998 #undef DO_LD_PRIM_1
3999 #undef DO_ST_PRIM_1
4000 #undef DO_LD_PRIM_2
4001 #undef DO_ST_PRIM_2
4004 * Skip through a sequence of inactive elements in the guarding predicate @vg,
4005 * beginning at @reg_off, bounded by @reg_max. Return the offset of the first active
4006 * element >= @reg_off, or @reg_max if there were no active elements at all.
4008 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
4009 intptr_t reg_max, int esz)
4011 uint64_t pg_mask = pred_esz_masks[esz];
4012 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
4014 /* In normal usage, the first element is active. */
4015 if (likely(pg & 1)) {
4016 return reg_off;
4019 if (pg == 0) {
4020 reg_off &= -64;
4021 do {
4022 reg_off += 64;
4023 if (unlikely(reg_off >= reg_max)) {
4024 /* The entire predicate was false. */
4025 return reg_max;
4027 pg = vg[reg_off >> 6] & pg_mask;
4028 } while (pg == 0);
4030 reg_off += ctz64(pg);
4032 /* We should never see an out of range predicate bit set. */
4033 tcg_debug_assert(reg_off < reg_max);
4034 return reg_off;
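/*
 * Worked example (illustrative values only): for 4-byte elements
 * (esz == 2), pred_esz_masks[esz] keeps one predicate bit per element,
 * so with vg[0] == 0x10 and reg_off == 0 the initial "pg & 1" test
 * fails, ctz64() skips to bit 4, and the function returns reg_off == 4,
 * i.e. the element occupying bytes 4..7 of the vector.
 */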
4038 * Resolve the guest virtual address to info->host and info->flags.
4039 * If @nofault, return false if the page is invalid, otherwise
4040 * exit via page fault exception.
4043 typedef struct {
4044 void *host;
4045 int flags;
4046 MemTxAttrs attrs;
4047 } SVEHostPage;
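/*
 * Rough field summary, inferred from the uses below: @host is the host
 * pointer returned by probe_access_flags() (adjusted so that
 * host + mem_off addresses the element), @flags carries the TLB_* bits
 * such as TLB_MMIO, TLB_WATCHPOINT and TLB_INVALID_MASK, and @attrs are
 * the transaction attributes looked up from the TLB entry.
 */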
4049 static bool sve_probe_page(SVEHostPage *info, bool nofault,
4050 CPUARMState *env, target_ulong addr,
4051 int mem_off, MMUAccessType access_type,
4052 int mmu_idx, uintptr_t retaddr)
4054 int flags;
4056 addr += mem_off;
4059 * User-only currently always runs with TBI enabled. See the comment
4060 * above useronly_clean_ptr. Usually we clean this top byte away
4061 * during translation, but we can't do that for e.g. vector + imm
4062 * addressing modes.
4064 * We currently always enable TBI for user-only, and do not provide
4065 * a way to turn it off. So clean the pointer unconditionally here,
4066 * rather than look it up here, or pass it down from above.
4068 addr = useronly_clean_ptr(addr);
4070 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
4071 &info->host, retaddr);
4072 info->flags = flags;
4074 if (flags & TLB_INVALID_MASK) {
4075 g_assert(nofault);
4076 return false;
4079 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
4080 info->host -= mem_off;
4082 #ifdef CONFIG_USER_ONLY
4083 memset(&info->attrs, 0, sizeof(info->attrs));
4084 #else
4086 * Find the iotlbentry for addr and return the transaction attributes.
4087 * This *must* be present in the TLB because we just found the mapping.
4090 uintptr_t index = tlb_index(env, mmu_idx, addr);
4092 # ifdef CONFIG_DEBUG_TCG
4093 CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
4094 target_ulong comparator = (access_type == MMU_DATA_LOAD
4095 ? entry->addr_read
4096 : tlb_addr_write(entry));
4097 g_assert(tlb_hit(comparator, addr));
4098 # endif
4100 CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
4101 info->attrs = iotlbentry->attrs;
4103 #endif
4105 return true;
4110 * Analyse contiguous data, protected by a governing predicate.
4113 typedef enum {
4114 FAULT_NO,
4115 FAULT_FIRST,
4116 FAULT_ALL,
4117 } SVEContFault;
4119 typedef struct {
4121 * First and last element wholly contained within the two pages.
4122 * mem_off_first[0] and reg_off_first[0] are always set >= 0.
4123 * reg_off_last[0] may be < 0 if the first element crosses pages.
4124 * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
4125 * are set >= 0 only if there are complete elements on a second page.
4127 * The reg_off_* offsets are relative to the internal vector register.
4128 * The mem_off_first offset is relative to the memory address; the
4129 * two offsets are different when a load operation extends, a store
4130 * operation truncates, or for multi-register operations.
4132 int16_t mem_off_first[2];
4133 int16_t reg_off_first[2];
4134 int16_t reg_off_last[2];
4137 * One element that is misaligned and spans both pages,
4138 * or -1 if there is no such active element.
4140 int16_t mem_off_split;
4141 int16_t reg_off_split;
4144 * The byte offset at which the entire operation crosses a page boundary.
4145 * Set >= 0 if and only if the entire operation spans two pages.
4147 int16_t page_split;
4149 /* TLB data for the two pages. */
4150 SVEHostPage page[2];
4151 } SVEContLdSt;
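/*
 * Worked example with hypothetical numbers: a 16-byte vector of 4-byte
 * elements, all predicate bits true, with only 6 bytes left on the first
 * page (page_split == 6).  Element 0 (bytes 0..3) fits, so
 * reg_off_first[0] == 0 and reg_off_last[0] == 0; element 1 straddles
 * the boundary, so reg_off_split == 4 and mem_off_split == 4; elements
 * 2 and 3 sit wholly on the second page, giving reg_off_first[1] == 8
 * and reg_off_last[1] == 12.
 */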
4154 * Find first active element on each page, and a loose bound for the
4155 * final element on each page. Identify any single element that spans
4156 * the page boundary. Return true if there are any active elements.
4158 static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
4159 uint64_t *vg, intptr_t reg_max,
4160 int esz, int msize)
4162 const int esize = 1 << esz;
4163 const uint64_t pg_mask = pred_esz_masks[esz];
4164 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
4165 intptr_t mem_off_last, mem_off_split;
4166 intptr_t page_split, elt_split;
4167 intptr_t i;
4169 /* Set all of the element indices to -1, and the TLB data to 0. */
4170 memset(info, -1, offsetof(SVEContLdSt, page));
4171 memset(info->page, 0, sizeof(info->page));
4173 /* Gross scan over the entire predicate to find bounds. */
4174 i = 0;
4175 do {
4176 uint64_t pg = vg[i] & pg_mask;
4177 if (pg) {
4178 reg_off_last = i * 64 + 63 - clz64(pg);
4179 if (reg_off_first < 0) {
4180 reg_off_first = i * 64 + ctz64(pg);
4183 } while (++i * 64 < reg_max);
4185 if (unlikely(reg_off_first < 0)) {
4186 /* No active elements, no pages touched. */
4187 return false;
4189 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
4191 info->reg_off_first[0] = reg_off_first;
4192 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
4193 mem_off_last = (reg_off_last >> esz) * msize;
4195 page_split = -(addr | TARGET_PAGE_MASK);
4196 if (likely(mem_off_last + msize <= page_split)) {
4197 /* The entire operation fits within a single page. */
4198 info->reg_off_last[0] = reg_off_last;
4199 return true;
4202 info->page_split = page_split;
4203 elt_split = page_split / msize;
4204 reg_off_split = elt_split << esz;
4205 mem_off_split = elt_split * msize;
4208 * This is the last full element on the first page, but it is not
4209 * necessarily active. If there is no full element, i.e. the first
4210 * active element is the one that's split, this value remains -1.
4211 * It is useful as an iteration bound.
4213 if (elt_split != 0) {
4214 info->reg_off_last[0] = reg_off_split - esize;
4217 /* Determine if an unaligned element spans the pages. */
4218 if (page_split % msize != 0) {
4219 /* It is helpful to know if the split element is active. */
4220 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
4221 info->reg_off_split = reg_off_split;
4222 info->mem_off_split = mem_off_split;
4224 if (reg_off_split == reg_off_last) {
4225 /* The page crossing element is last. */
4226 return true;
4229 reg_off_split += esize;
4230 mem_off_split += msize;
4234 * We do want the first active element on the second page, because
4235 * this may affect the address reported in an exception.
4237 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
4238 tcg_debug_assert(reg_off_split <= reg_off_last);
4239 info->reg_off_first[1] = reg_off_split;
4240 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
4241 info->reg_off_last[1] = reg_off_last;
4242 return true;
4246 * Resolve the guest virtual addresses to info->page[].
4247 * Control the generation of page faults with @fault. Return false if
4248 * there is no work to do, which can only happen with @fault == FAULT_NO.
4250 static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
4251 CPUARMState *env, target_ulong addr,
4252 MMUAccessType access_type, uintptr_t retaddr)
4254 int mmu_idx = cpu_mmu_index(env, false);
4255 int mem_off = info->mem_off_first[0];
4256 bool nofault = fault == FAULT_NO;
4257 bool have_work = true;
4259 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
4260 access_type, mmu_idx, retaddr)) {
4261 /* No work to be done. */
4262 return false;
4265 if (likely(info->page_split < 0)) {
4266 /* The entire operation was on the one page. */
4267 return true;
4271 * If the second page is invalid, then we want the fault address to be
4272 * the first byte on that page which is accessed.
4274 if (info->mem_off_split >= 0) {
4276 * There is an element split across the pages. The fault address
4277 * should be the first byte of the second page.
4279 mem_off = info->page_split;
4281 * If the split element is also the first active element
4282 * of the vector, then: For first-fault we should continue
4283 * to generate faults for the second page. For no-fault,
4284 * we have work only if the second page is valid.
4286 if (info->mem_off_first[0] < info->mem_off_split) {
4287 nofault = FAULT_FIRST;
4288 have_work = false;
4290 } else {
4292 * There is no element split across the pages. The fault address
4293 * should be the first active element on the second page.
4295 mem_off = info->mem_off_first[1];
4297 * There must have been one active element on the first page,
4298 * so we're out of first-fault territory.
4300 nofault = fault != FAULT_ALL;
4303 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
4304 access_type, mmu_idx, retaddr);
4305 return have_work;
4308 static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
4309 uint64_t *vg, target_ulong addr,
4310 int esize, int msize, int wp_access,
4311 uintptr_t retaddr)
4313 #ifndef CONFIG_USER_ONLY
4314 intptr_t mem_off, reg_off, reg_last;
4315 int flags0 = info->page[0].flags;
4316 int flags1 = info->page[1].flags;
4318 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
4319 return;
4322 /* Indicate that watchpoints are handled. */
4323 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
4324 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
4326 if (flags0 & TLB_WATCHPOINT) {
4327 mem_off = info->mem_off_first[0];
4328 reg_off = info->reg_off_first[0];
4329 reg_last = info->reg_off_last[0];
4331 while (reg_off <= reg_last) {
4332 uint64_t pg = vg[reg_off >> 6];
4333 do {
4334 if ((pg >> (reg_off & 63)) & 1) {
4335 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
4336 msize, info->page[0].attrs,
4337 wp_access, retaddr);
4339 reg_off += esize;
4340 mem_off += msize;
4341 } while (reg_off <= reg_last && (reg_off & 63));
4345 mem_off = info->mem_off_split;
4346 if (mem_off >= 0) {
4347 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
4348 info->page[0].attrs, wp_access, retaddr);
4351 mem_off = info->mem_off_first[1];
4352 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
4353 reg_off = info->reg_off_first[1];
4354 reg_last = info->reg_off_last[1];
4356 do {
4357 uint64_t pg = vg[reg_off >> 6];
4358 do {
4359 if ((pg >> (reg_off & 63)) & 1) {
4360 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
4361 msize, info->page[1].attrs,
4362 wp_access, retaddr);
4364 reg_off += esize;
4365 mem_off += msize;
4366 } while (reg_off & 63);
4367 } while (reg_off <= reg_last);
4369 #endif
4372 typedef uint64_t mte_check_fn(CPUARMState *, uint32_t, uint64_t, uintptr_t);
4374 static inline QEMU_ALWAYS_INLINE
4375 void sve_cont_ldst_mte_check_int(SVEContLdSt *info, CPUARMState *env,
4376 uint64_t *vg, target_ulong addr, int esize,
4377 int msize, uint32_t mtedesc, uintptr_t ra,
4378 mte_check_fn *check)
4380 intptr_t mem_off, reg_off, reg_last;
4382 /* Process the page only if MemAttr == Tagged. */
4383 if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
4384 mem_off = info->mem_off_first[0];
4385 reg_off = info->reg_off_first[0];
4386 reg_last = info->reg_off_split;
4387 if (reg_last < 0) {
4388 reg_last = info->reg_off_last[0];
4391 do {
4392 uint64_t pg = vg[reg_off >> 6];
4393 do {
4394 if ((pg >> (reg_off & 63)) & 1) {
4395 check(env, mtedesc, addr + mem_off, ra);
4397 reg_off += esize;
4398 mem_off += msize;
4399 } while (reg_off <= reg_last && (reg_off & 63));
4400 } while (reg_off <= reg_last);
4403 mem_off = info->mem_off_first[1];
4404 if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
4405 reg_off = info->reg_off_first[1];
4406 reg_last = info->reg_off_last[1];
4408 do {
4409 uint64_t pg = vg[reg_off >> 6];
4410 do {
4411 if ((pg >> (reg_off & 63)) & 1) {
4412 check(env, mtedesc, addr + mem_off, ra);
4414 reg_off += esize;
4415 mem_off += msize;
4416 } while (reg_off & 63);
4417 } while (reg_off <= reg_last);
4421 typedef void sve_cont_ldst_mte_check_fn(SVEContLdSt *info, CPUARMState *env,
4422 uint64_t *vg, target_ulong addr,
4423 int esize, int msize, uint32_t mtedesc,
4424 uintptr_t ra);
4426 static void sve_cont_ldst_mte_check1(SVEContLdSt *info, CPUARMState *env,
4427 uint64_t *vg, target_ulong addr,
4428 int esize, int msize, uint32_t mtedesc,
4429 uintptr_t ra)
4431 sve_cont_ldst_mte_check_int(info, env, vg, addr, esize, msize,
4432 mtedesc, ra, mte_check1);
4435 static void sve_cont_ldst_mte_checkN(SVEContLdSt *info, CPUARMState *env,
4436 uint64_t *vg, target_ulong addr,
4437 int esize, int msize, uint32_t mtedesc,
4438 uintptr_t ra)
4440 sve_cont_ldst_mte_check_int(info, env, vg, addr, esize, msize,
4441 mtedesc, ra, mte_checkN);
4446 * Common helper for all contiguous 1,2,3,4-register predicated loads.
4448 static inline QEMU_ALWAYS_INLINE
4449 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
4450 uint32_t desc, const uintptr_t retaddr,
4451 const int esz, const int msz, const int N, uint32_t mtedesc,
4452 sve_ldst1_host_fn *host_fn,
4453 sve_ldst1_tlb_fn *tlb_fn,
4454 sve_cont_ldst_mte_check_fn *mte_check_fn)
4456 const unsigned rd = simd_data(desc);
4457 const intptr_t reg_max = simd_oprsz(desc);
4458 intptr_t reg_off, reg_last, mem_off;
4459 SVEContLdSt info;
4460 void *host;
4461 int flags, i;
4463 /* Find the active elements. */
4464 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
4465 /* The entire predicate was false; no load occurs. */
4466 for (i = 0; i < N; ++i) {
4467 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
4469 return;
4472 /* Probe the page(s). Exit with exception for any invalid page. */
4473 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
4475 /* Handle watchpoints for all active elements. */
4476 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
4477 BP_MEM_READ, retaddr);
4480 * Handle MTE checks for all active elements.
4481 * Since TBI must be set for MTE, !mtedesc => !mte_active.
4483 if (mte_check_fn && mtedesc) {
4484 mte_check_fn(&info, env, vg, addr, 1 << esz, N << msz,
4485 mtedesc, retaddr);
4488 flags = info.page[0].flags | info.page[1].flags;
4489 if (unlikely(flags != 0)) {
4490 #ifdef CONFIG_USER_ONLY
4491 g_assert_not_reached();
4492 #else
4494 * At least one page includes MMIO.
4495 * Any bus operation can fail with cpu_transaction_failed,
4496 * which for ARM will raise SyncExternal. Perform the load
4497 * into scratch memory to preserve register state until the end.
4499 ARMVectorReg scratch[4] = { };
4501 mem_off = info.mem_off_first[0];
4502 reg_off = info.reg_off_first[0];
4503 reg_last = info.reg_off_last[1];
4504 if (reg_last < 0) {
4505 reg_last = info.reg_off_split;
4506 if (reg_last < 0) {
4507 reg_last = info.reg_off_last[0];
4511 do {
4512 uint64_t pg = vg[reg_off >> 6];
4513 do {
4514 if ((pg >> (reg_off & 63)) & 1) {
4515 for (i = 0; i < N; ++i) {
4516 tlb_fn(env, &scratch[i], reg_off,
4517 addr + mem_off + (i << msz), retaddr);
4520 reg_off += 1 << esz;
4521 mem_off += N << msz;
4522 } while (reg_off & 63);
4523 } while (reg_off <= reg_last);
4525 for (i = 0; i < N; ++i) {
4526 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
4528 return;
4529 #endif
4532 /* The entire operation is in RAM, on valid pages. */
4534 for (i = 0; i < N; ++i) {
4535 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
4538 mem_off = info.mem_off_first[0];
4539 reg_off = info.reg_off_first[0];
4540 reg_last = info.reg_off_last[0];
4541 host = info.page[0].host;
4543 while (reg_off <= reg_last) {
4544 uint64_t pg = vg[reg_off >> 6];
4545 do {
4546 if ((pg >> (reg_off & 63)) & 1) {
4547 for (i = 0; i < N; ++i) {
4548 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
4549 host + mem_off + (i << msz));
4552 reg_off += 1 << esz;
4553 mem_off += N << msz;
4554 } while (reg_off <= reg_last && (reg_off & 63));
4558 * Use the slow path to manage the cross-page misalignment.
4559 * But we know this is RAM and cannot trap.
4561 mem_off = info.mem_off_split;
4562 if (unlikely(mem_off >= 0)) {
4563 reg_off = info.reg_off_split;
4564 for (i = 0; i < N; ++i) {
4565 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
4566 addr + mem_off + (i << msz), retaddr);
4570 mem_off = info.mem_off_first[1];
4571 if (unlikely(mem_off >= 0)) {
4572 reg_off = info.reg_off_first[1];
4573 reg_last = info.reg_off_last[1];
4574 host = info.page[1].host;
4576 do {
4577 uint64_t pg = vg[reg_off >> 6];
4578 do {
4579 if ((pg >> (reg_off & 63)) & 1) {
4580 for (i = 0; i < N; ++i) {
4581 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
4582 host + mem_off + (i << msz));
4585 reg_off += 1 << esz;
4586 mem_off += N << msz;
4587 } while (reg_off & 63);
4588 } while (reg_off <= reg_last);
4592 static inline QEMU_ALWAYS_INLINE
4593 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
4594 uint32_t desc, const uintptr_t ra,
4595 const int esz, const int msz, const int N,
4596 sve_ldst1_host_fn *host_fn,
4597 sve_ldst1_tlb_fn *tlb_fn)
4599 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4600 int bit55 = extract64(addr, 55, 1);
4602 /* Remove mtedesc from the normal sve descriptor. */
4603 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4605 /* Perform gross MTE suppression early. */
4606 if (!tbi_check(desc, bit55) ||
4607 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
4608 mtedesc = 0;
4611 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn,
4612 N == 1 ? sve_cont_ldst_mte_check1 : sve_cont_ldst_mte_checkN);
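/*
 * Sketch of the descriptor layout assumed here, derived from the
 * shift/extract arithmetic above rather than a definition in this file:
 * bits [0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT) hold the normal SVE simd
 * descriptor (operation size plus the low data field), and everything
 * above that holds MTEDESC, which is peeled off into mtedesc and zeroed
 * out of desc before calling the common helper.
 */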
4615 #define DO_LD1_1(NAME, ESZ) \
4616 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
4617 target_ulong addr, uint32_t desc) \
4619 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
4620 sve_##NAME##_host, sve_##NAME##_tlb, NULL); \
4622 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
4623 target_ulong addr, uint32_t desc) \
4625 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
4626 sve_##NAME##_host, sve_##NAME##_tlb); \
4629 #define DO_LD1_2(NAME, ESZ, MSZ) \
4630 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
4631 target_ulong addr, uint32_t desc) \
4633 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
4634 sve_##NAME##_le_host, sve_##NAME##_le_tlb, NULL); \
4636 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
4637 target_ulong addr, uint32_t desc) \
4639 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
4640 sve_##NAME##_be_host, sve_##NAME##_be_tlb, NULL); \
4642 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
4643 target_ulong addr, uint32_t desc) \
4645 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
4646 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
4648 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
4649 target_ulong addr, uint32_t desc) \
4651 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
4652 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
4655 DO_LD1_1(ld1bb, MO_8)
4656 DO_LD1_1(ld1bhu, MO_16)
4657 DO_LD1_1(ld1bhs, MO_16)
4658 DO_LD1_1(ld1bsu, MO_32)
4659 DO_LD1_1(ld1bss, MO_32)
4660 DO_LD1_1(ld1bdu, MO_64)
4661 DO_LD1_1(ld1bds, MO_64)
4663 DO_LD1_2(ld1hh, MO_16, MO_16)
4664 DO_LD1_2(ld1hsu, MO_32, MO_16)
4665 DO_LD1_2(ld1hss, MO_32, MO_16)
4666 DO_LD1_2(ld1hdu, MO_64, MO_16)
4667 DO_LD1_2(ld1hds, MO_64, MO_16)
4669 DO_LD1_2(ld1ss, MO_32, MO_32)
4670 DO_LD1_2(ld1sdu, MO_64, MO_32)
4671 DO_LD1_2(ld1sds, MO_64, MO_32)
4673 DO_LD1_2(ld1dd, MO_64, MO_64)
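/*
 * As an illustration, DO_LD1_2(ld1hh, MO_16, MO_16) above emits four
 * entry points -- sve_ld1hh_le_r, sve_ld1hh_be_r and their _mte
 * variants -- all of which funnel into sve_ldN_r()/sve_ldN_r_mte()
 * with N == 1, esz == msz == MO_16 and the matching little- or
 * big-endian host/tlb primitives generated earlier by DO_LD_PRIM_2.
 */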
4675 #undef DO_LD1_1
4676 #undef DO_LD1_2
4678 #define DO_LDN_1(N) \
4679 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
4680 target_ulong addr, uint32_t desc) \
4682 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
4683 sve_ld1bb_host, sve_ld1bb_tlb, NULL); \
4685 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
4686 target_ulong addr, uint32_t desc) \
4688 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
4689 sve_ld1bb_host, sve_ld1bb_tlb); \
4692 #define DO_LDN_2(N, SUFF, ESZ) \
4693 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
4694 target_ulong addr, uint32_t desc) \
4696 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
4697 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb, NULL); \
4699 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
4700 target_ulong addr, uint32_t desc) \
4702 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
4703 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb, NULL); \
4705 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
4706 target_ulong addr, uint32_t desc) \
4708 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
4709 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
4711 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
4712 target_ulong addr, uint32_t desc) \
4714 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
4715 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
4718 DO_LDN_1(2)
4719 DO_LDN_1(3)
4720 DO_LDN_1(4)
4722 DO_LDN_2(2, hh, MO_16)
4723 DO_LDN_2(3, hh, MO_16)
4724 DO_LDN_2(4, hh, MO_16)
4726 DO_LDN_2(2, ss, MO_32)
4727 DO_LDN_2(3, ss, MO_32)
4728 DO_LDN_2(4, ss, MO_32)
4730 DO_LDN_2(2, dd, MO_64)
4731 DO_LDN_2(3, dd, MO_64)
4732 DO_LDN_2(4, dd, MO_64)
4734 #undef DO_LDN_1
4735 #undef DO_LDN_2
4738 * Load contiguous data, first-fault and no-fault.
4740 * For user-only, one could argue that we should hold the mmap_lock during
4741 * the operation so that there is no race between page_check_range and the
4742 * load operation. However, unmapping pages out from under a running thread
4743 * is extraordinarily unlikely. This theoretical race condition also affects
4744 * linux-user/ in its get_user/put_user macros.
4746 * TODO: Construct some helpers, written in assembly, that interact with
4747 * handle_cpu_signal to produce memory ops which can properly report errors
4748 * without racing.
4751 /* Fault on byte I. All bits in FFR from I are cleared. The vector
4752 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
4753 * option, which leaves subsequent data unchanged.
4755 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
4757 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
4759 if (i & 63) {
4760 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
4761 i = ROUND_UP(i, 64);
4763 for (; i < oprsz; i += 64) {
4764 ffr[i / 64] = 0;
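/*
 * Worked example with hypothetical operands: record_fault(env, 40, 128)
 * keeps FFR bits 0..39 (ffr[0] &= MAKE_64BIT_MASK(0, 40)), rounds i up
 * to 64, and then clears ffr[1] outright, so predicate bits 40..127 all
 * read as false afterwards.
 */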
4769 * Common helper for all contiguous no-fault and first-fault loads.
4771 static inline QEMU_ALWAYS_INLINE
4772 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
4773 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
4774 const int esz, const int msz, const SVEContFault fault,
4775 sve_ldst1_host_fn *host_fn,
4776 sve_ldst1_tlb_fn *tlb_fn)
4778 const unsigned rd = simd_data(desc);
4779 void *vd = &env->vfp.zregs[rd];
4780 const intptr_t reg_max = simd_oprsz(desc);
4781 intptr_t reg_off, mem_off, reg_last;
4782 SVEContLdSt info;
4783 int flags;
4784 void *host;
4786 /* Find the active elements. */
4787 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
4788 /* The entire predicate was false; no load occurs. */
4789 memset(vd, 0, reg_max);
4790 return;
4792 reg_off = info.reg_off_first[0];
4794 /* Probe the page(s). */
4795 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
4796 /* Fault on first element. */
4797 tcg_debug_assert(fault == FAULT_NO);
4798 memset(vd, 0, reg_max);
4799 goto do_fault;
4802 mem_off = info.mem_off_first[0];
4803 flags = info.page[0].flags;
4806 * Disable MTE checking if the Tagged bit is not set. Since TBI must
4807 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
4809 if (!arm_tlb_mte_tagged(&info.page[0].attrs)) {
4810 mtedesc = 0;
4813 if (fault == FAULT_FIRST) {
4814 /* Trapping mte check for the first-fault element. */
4815 if (mtedesc) {
4816 mte_check1(env, mtedesc, addr + mem_off, retaddr);
4820 * Special handling of the first active element,
4821 * if it crosses a page boundary or is MMIO.
4823 bool is_split = mem_off == info.mem_off_split;
4824 if (unlikely(flags != 0) || unlikely(is_split)) {
4826 * Use the slow path for cross-page handling.
4827 * Might trap for MMIO or watchpoints.
4829 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
4831 /* After any fault, zero the other elements. */
4832 swap_memzero(vd, reg_off);
4833 reg_off += 1 << esz;
4834 mem_off += 1 << msz;
4835 swap_memzero(vd + reg_off, reg_max - reg_off);
4837 if (is_split) {
4838 goto second_page;
4840 } else {
4841 memset(vd, 0, reg_max);
4843 } else {
4844 memset(vd, 0, reg_max);
4845 if (unlikely(mem_off == info.mem_off_split)) {
4846 /* The first active element crosses a page boundary. */
4847 flags |= info.page[1].flags;
4848 if (unlikely(flags & TLB_MMIO)) {
4849 /* Some page is MMIO, see below. */
4850 goto do_fault;
4852 if (unlikely(flags & TLB_WATCHPOINT) &&
4853 (cpu_watchpoint_address_matches
4854 (env_cpu(env), addr + mem_off, 1 << msz)
4855 & BP_MEM_READ)) {
4856 /* Watchpoint hit, see below. */
4857 goto do_fault;
4859 if (mtedesc && !mte_probe1(env, mtedesc, addr + mem_off)) {
4860 goto do_fault;
4863 * Use the slow path for cross-page handling.
4864 * This is RAM, without a watchpoint, and will not trap.
4866 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
4867 goto second_page;
4872 * From this point on, all memory operations are MemSingleNF.
4874 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
4875 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
4877 * Unfortunately we do not have access to the memory attributes from the
4878 * PTE to tell Device memory from Normal memory. So we make a mostly
4879 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
4880 * This gives the right answer for the common cases of "Normal memory,
4881 * backed by host RAM" and "Device memory, backed by MMIO".
4882 * The architecture allows us to suppress an NF load and return
4883 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
4884 * case of "Normal memory, backed by MMIO" is permitted. The case we
4885 * get wrong is "Device memory, backed by host RAM", for which we
4886 * should return (UNKNOWN, FAULT) but do not.
4888 * Similarly, CPU_BP breakpoints would raise exceptions, and so
4889 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
4890 * architectural breakpoints the same.
4892 if (unlikely(flags & TLB_MMIO)) {
4893 goto do_fault;
4896 reg_last = info.reg_off_last[0];
4897 host = info.page[0].host;
4899 do {
4900 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
4901 do {
4902 if ((pg >> (reg_off & 63)) & 1) {
4903 if (unlikely(flags & TLB_WATCHPOINT) &&
4904 (cpu_watchpoint_address_matches
4905 (env_cpu(env), addr + mem_off, 1 << msz)
4906 & BP_MEM_READ)) {
4907 goto do_fault;
4909 if (mtedesc && !mte_probe1(env, mtedesc, addr + mem_off)) {
4910 goto do_fault;
4912 host_fn(vd, reg_off, host + mem_off);
4914 reg_off += 1 << esz;
4915 mem_off += 1 << msz;
4916 } while (reg_off <= reg_last && (reg_off & 63));
4917 } while (reg_off <= reg_last);
4920 * MemSingleNF is allowed to fail for any reason. We have special
4921 * code above to handle the first element crossing a page boundary.
4922 * As an implementation choice, decline to handle a cross-page element
4923 * in any other position.
4925 reg_off = info.reg_off_split;
4926 if (reg_off >= 0) {
4927 goto do_fault;
4930 second_page:
4931 reg_off = info.reg_off_first[1];
4932 if (likely(reg_off < 0)) {
4933 /* No active elements on the second page. All done. */
4934 return;
4938 * MemSingleNF is allowed to fail for any reason. As an implementation
4939 * choice, decline to handle elements on the second page. This should
4940 * be low frequency as the guest walks through memory -- the next
4941 * iteration of the guest's loop should be aligned on the page boundary,
4942 * and then all following iterations will stay aligned.
4945 do_fault:
4946 record_fault(env, reg_off, reg_max);
4949 static inline QEMU_ALWAYS_INLINE
4950 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
4951 uint32_t desc, const uintptr_t retaddr,
4952 const int esz, const int msz, const SVEContFault fault,
4953 sve_ldst1_host_fn *host_fn,
4954 sve_ldst1_tlb_fn *tlb_fn)
4956 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4957 int bit55 = extract64(addr, 55, 1);
4959 /* Remove mtedesc from the normal sve descriptor. */
4960 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4962 /* Perform gross MTE suppression early. */
4963 if (!tbi_check(desc, bit55) ||
4964 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
4965 mtedesc = 0;
4968 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
4969 esz, msz, fault, host_fn, tlb_fn);
4972 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
4973 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
4974 target_ulong addr, uint32_t desc) \
4976 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
4977 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
4979 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
4980 target_ulong addr, uint32_t desc) \
4982 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
4983 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
4985 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
4986 target_ulong addr, uint32_t desc) \
4988 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
4989 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
4991 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
4992 target_ulong addr, uint32_t desc) \
4994 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
4995 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
4998 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
4999 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
5000 target_ulong addr, uint32_t desc) \
5002 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
5003 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5005 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
5006 target_ulong addr, uint32_t desc) \
5008 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
5009 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5011 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
5012 target_ulong addr, uint32_t desc) \
5014 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
5015 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5017 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
5018 target_ulong addr, uint32_t desc) \
5020 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
5021 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5023 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
5024 target_ulong addr, uint32_t desc) \
5026 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
5027 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5029 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
5030 target_ulong addr, uint32_t desc) \
5032 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
5033 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5035 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
5036 target_ulong addr, uint32_t desc) \
5038 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
5039 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5041 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
5042 target_ulong addr, uint32_t desc) \
5044 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
5045 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5048 DO_LDFF1_LDNF1_1(bb, MO_8)
5049 DO_LDFF1_LDNF1_1(bhu, MO_16)
5050 DO_LDFF1_LDNF1_1(bhs, MO_16)
5051 DO_LDFF1_LDNF1_1(bsu, MO_32)
5052 DO_LDFF1_LDNF1_1(bss, MO_32)
5053 DO_LDFF1_LDNF1_1(bdu, MO_64)
5054 DO_LDFF1_LDNF1_1(bds, MO_64)
5056 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
5057 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
5058 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
5059 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
5060 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
5062 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
5063 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
5064 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
5066 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
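/*
 * As a concrete example, DO_LDFF1_LDNF1_1(bb, MO_8) above produces
 * sve_ldff1bb_r (first-fault) and sve_ldnf1bb_r (no-fault), plus the
 * _mte variants; all four route into sve_ldnfff1_r()/sve_ldnfff1_r_mte()
 * with the FAULT_FIRST or FAULT_NO policy and the ld1bb host/tlb
 * primitives.
 */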
5068 #undef DO_LDFF1_LDNF1_1
5069 #undef DO_LDFF1_LDNF1_2
5072 * Common helper for all contiguous 1,2,3,4-register predicated stores.
5075 static inline QEMU_ALWAYS_INLINE
5076 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
5077 uint32_t desc, const uintptr_t retaddr,
5078 const int esz, const int msz, const int N, uint32_t mtedesc,
5079 sve_ldst1_host_fn *host_fn,
5080 sve_ldst1_tlb_fn *tlb_fn,
5081 sve_cont_ldst_mte_check_fn *mte_check_fn)
5083 const unsigned rd = simd_data(desc);
5084 const intptr_t reg_max = simd_oprsz(desc);
5085 intptr_t reg_off, reg_last, mem_off;
5086 SVEContLdSt info;
5087 void *host;
5088 int i, flags;
5090 /* Find the active elements. */
5091 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5092 /* The entire predicate was false; no store occurs. */
5093 return;
5096 /* Probe the page(s). Exit with exception for any invalid page. */
5097 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
5099 /* Handle watchpoints for all active elements. */
5100 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5101 BP_MEM_WRITE, retaddr);
5104 * Handle MTE checks for all active elements.
5105 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5107 if (mte_check_fn && mtedesc) {
5108 mte_check_fn(&info, env, vg, addr, 1 << esz, N << msz,
5109 mtedesc, retaddr);
5112 flags = info.page[0].flags | info.page[1].flags;
5113 if (unlikely(flags != 0)) {
5114 #ifdef CONFIG_USER_ONLY
5115 g_assert_not_reached();
5116 #else
5118 * At least one page includes MMIO.
5119 * Any bus operation can fail with cpu_transaction_failed,
5120 * which for ARM will raise SyncExternal. We cannot avoid
5121 * this fault and will leave with the store incomplete.
5123 mem_off = info.mem_off_first[0];
5124 reg_off = info.reg_off_first[0];
5125 reg_last = info.reg_off_last[1];
5126 if (reg_last < 0) {
5127 reg_last = info.reg_off_split;
5128 if (reg_last < 0) {
5129 reg_last = info.reg_off_last[0];
5133 do {
5134 uint64_t pg = vg[reg_off >> 6];
5135 do {
5136 if ((pg >> (reg_off & 63)) & 1) {
5137 for (i = 0; i < N; ++i) {
5138 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5139 addr + mem_off + (i << msz), retaddr);
5142 reg_off += 1 << esz;
5143 mem_off += N << msz;
5144 } while (reg_off & 63);
5145 } while (reg_off <= reg_last);
5146 return;
5147 #endif
5150 mem_off = info.mem_off_first[0];
5151 reg_off = info.reg_off_first[0];
5152 reg_last = info.reg_off_last[0];
5153 host = info.page[0].host;
5155 while (reg_off <= reg_last) {
5156 uint64_t pg = vg[reg_off >> 6];
5157 do {
5158 if ((pg >> (reg_off & 63)) & 1) {
5159 for (i = 0; i < N; ++i) {
5160 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5161 host + mem_off + (i << msz));
5164 reg_off += 1 << esz;
5165 mem_off += N << msz;
5166 } while (reg_off <= reg_last && (reg_off & 63));
5170 * Use the slow path to manage the cross-page misalignment.
5171 * But we know this is RAM and cannot trap.
5173 mem_off = info.mem_off_split;
5174 if (unlikely(mem_off >= 0)) {
5175 reg_off = info.reg_off_split;
5176 for (i = 0; i < N; ++i) {
5177 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5178 addr + mem_off + (i << msz), retaddr);
5182 mem_off = info.mem_off_first[1];
5183 if (unlikely(mem_off >= 0)) {
5184 reg_off = info.reg_off_first[1];
5185 reg_last = info.reg_off_last[1];
5186 host = info.page[1].host;
5188 do {
5189 uint64_t pg = vg[reg_off >> 6];
5190 do {
5191 if ((pg >> (reg_off & 63)) & 1) {
5192 for (i = 0; i < N; ++i) {
5193 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5194 host + mem_off + (i << msz));
5197 reg_off += 1 << esz;
5198 mem_off += N << msz;
5199 } while (reg_off & 63);
5200 } while (reg_off <= reg_last);
5204 static inline QEMU_ALWAYS_INLINE
5205 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5206 uint32_t desc, const uintptr_t ra,
5207 const int esz, const int msz, const int N,
5208 sve_ldst1_host_fn *host_fn,
5209 sve_ldst1_tlb_fn *tlb_fn)
5211 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5212 int bit55 = extract64(addr, 55, 1);
5214 /* Remove mtedesc from the normal sve descriptor. */
5215 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5217 /* Perform gross MTE suppression early. */
5218 if (!tbi_check(desc, bit55) ||
5219 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5220 mtedesc = 0;
5223 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn,
5224 N == 1 ? sve_cont_ldst_mte_check1 : sve_cont_ldst_mte_checkN);
5227 #define DO_STN_1(N, NAME, ESZ) \
5228 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
5229 target_ulong addr, uint32_t desc) \
5231 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
5232 sve_st1##NAME##_host, sve_st1##NAME##_tlb, NULL); \
5234 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
5235 target_ulong addr, uint32_t desc) \
5237 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
5238 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
5241 #define DO_STN_2(N, NAME, ESZ, MSZ) \
5242 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
5243 target_ulong addr, uint32_t desc) \
5245 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
5246 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb, NULL); \
5248 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
5249 target_ulong addr, uint32_t desc) \
5251 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
5252 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb, NULL); \
5254 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
5255 target_ulong addr, uint32_t desc) \
5257 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
5258 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
5260 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
5261 target_ulong addr, uint32_t desc) \
5263 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
5264 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
5267 DO_STN_1(1, bb, MO_8)
5268 DO_STN_1(1, bh, MO_16)
5269 DO_STN_1(1, bs, MO_32)
5270 DO_STN_1(1, bd, MO_64)
5271 DO_STN_1(2, bb, MO_8)
5272 DO_STN_1(3, bb, MO_8)
5273 DO_STN_1(4, bb, MO_8)
5275 DO_STN_2(1, hh, MO_16, MO_16)
5276 DO_STN_2(1, hs, MO_32, MO_16)
5277 DO_STN_2(1, hd, MO_64, MO_16)
5278 DO_STN_2(2, hh, MO_16, MO_16)
5279 DO_STN_2(3, hh, MO_16, MO_16)
5280 DO_STN_2(4, hh, MO_16, MO_16)
5282 DO_STN_2(1, ss, MO_32, MO_32)
5283 DO_STN_2(1, sd, MO_64, MO_32)
5284 DO_STN_2(2, ss, MO_32, MO_32)
5285 DO_STN_2(3, ss, MO_32, MO_32)
5286 DO_STN_2(4, ss, MO_32, MO_32)
5288 DO_STN_2(1, dd, MO_64, MO_64)
5289 DO_STN_2(2, dd, MO_64, MO_64)
5290 DO_STN_2(3, dd, MO_64, MO_64)
5291 DO_STN_2(4, dd, MO_64, MO_64)
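/*
 * For example, DO_STN_2(1, hs, MO_32, MO_16) above produces
 * sve_st1hs_le_r and sve_st1hs_be_r (plus _mte variants), i.e. a
 * predicated store that truncates each 32-bit element to a 16-bit value
 * in memory, using the st1hs host/tlb primitives generated by
 * DO_ST_PRIM_2.
 */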
5293 #undef DO_STN_1
5294 #undef DO_STN_2
5297 * Loads with a vector index.
5301 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
5303 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
5305 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
5307 return *(uint32_t *)(reg + H1_4(reg_ofs));
5310 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
5312 return *(int32_t *)(reg + H1_4(reg_ofs));
5315 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
5317 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
5320 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
5322 return (int32_t)*(uint64_t *)(reg + reg_ofs);
5325 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
5327 return *(uint64_t *)(reg + reg_ofs);
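/*
 * Quick summary of the off_* accessors above: the _s variants read
 * 32-bit offsets from a .S index vector (off_zsu_s zero-extends,
 * off_zss_s sign-extends), while the _d variants read from a .D index
 * vector, either truncating to an unsigned/signed 32-bit offset
 * (off_zsu_d / off_zss_d) or using the full 64-bit value (off_zd_d).
 */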
5330 static inline QEMU_ALWAYS_INLINE
5331 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5332 target_ulong base, uint32_t desc, uintptr_t retaddr,
5333 uint32_t mtedesc, int esize, int msize,
5334 zreg_off_fn *off_fn,
5335 sve_ldst1_host_fn *host_fn,
5336 sve_ldst1_tlb_fn *tlb_fn)
5338 const int mmu_idx = cpu_mmu_index(env, false);
5339 const intptr_t reg_max = simd_oprsz(desc);
5340 const int scale = simd_data(desc);
5341 ARMVectorReg scratch;
5342 intptr_t reg_off;
5343 SVEHostPage info, info2;
5345 memset(&scratch, 0, reg_max);
5346 reg_off = 0;
5347 do {
5348 uint64_t pg = vg[reg_off >> 6];
5349 do {
5350 if (likely(pg & 1)) {
5351 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
5352 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
5354 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
5355 mmu_idx, retaddr);
5357 if (likely(in_page >= msize)) {
5358 if (unlikely(info.flags & TLB_WATCHPOINT)) {
5359 cpu_check_watchpoint(env_cpu(env), addr, msize,
5360 info.attrs, BP_MEM_READ, retaddr);
5362 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
5363 mte_check1(env, mtedesc, addr, retaddr);
5365 host_fn(&scratch, reg_off, info.host);
5366 } else {
5367 /* Element crosses the page boundary. */
5368 sve_probe_page(&info2, false, env, addr + in_page, 0,
5369 MMU_DATA_LOAD, mmu_idx, retaddr);
5370 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
5371 cpu_check_watchpoint(env_cpu(env), addr,
5372 msize, info.attrs,
5373 BP_MEM_READ, retaddr);
5375 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
5376 mte_check1(env, mtedesc, addr, retaddr);
5378 tlb_fn(env, &scratch, reg_off, addr, retaddr);
5381 reg_off += esize;
5382 pg >>= esize;
5383 } while (reg_off & 63);
5384 } while (reg_off < reg_max);
5386 /* Wait until all exceptions have been raised to write back. */
5387 memcpy(vd, &scratch, reg_max);
5390 static inline QEMU_ALWAYS_INLINE
5391 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5392 target_ulong base, uint32_t desc, uintptr_t retaddr,
5393 int esize, int msize, zreg_off_fn *off_fn,
5394 sve_ldst1_host_fn *host_fn,
5395 sve_ldst1_tlb_fn *tlb_fn)
5397 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5398 /* Remove mtedesc from the normal sve descriptor. */
5399 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5402 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
5403 * offset base entirely over the address space hole to change the
5404 * pointer tag, or change the bit55 selector. So we could here
5405 * examine TBI + TCMA like we do for sve_ldN_r_mte().
5407 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
5408 esize, msize, off_fn, host_fn, tlb_fn);
5411 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
5412 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5413 void *vm, target_ulong base, uint32_t desc) \
5415 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
5416 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5418 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
5419 void *vm, target_ulong base, uint32_t desc) \
5421 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
5422 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5425 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
5426 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5427 void *vm, target_ulong base, uint32_t desc) \
5429 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
5430 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5432 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
5433 void *vm, target_ulong base, uint32_t desc) \
5435 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
5436 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5439 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
5440 DO_LD1_ZPZ_S(bsu, zss, MO_8)
5441 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
5442 DO_LD1_ZPZ_D(bdu, zss, MO_8)
5443 DO_LD1_ZPZ_D(bdu, zd, MO_8)
5445 DO_LD1_ZPZ_S(bss, zsu, MO_8)
5446 DO_LD1_ZPZ_S(bss, zss, MO_8)
5447 DO_LD1_ZPZ_D(bds, zsu, MO_8)
5448 DO_LD1_ZPZ_D(bds, zss, MO_8)
5449 DO_LD1_ZPZ_D(bds, zd, MO_8)
5451 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
5452 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
5453 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
5454 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
5455 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
5457 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
5458 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
5459 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
5460 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
5461 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
5463 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
5464 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
5465 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
5466 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
5467 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
5469 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
5470 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
5471 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
5472 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
5473 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
5475 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
5476 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
5477 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
5478 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
5479 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
5481 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
5482 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
5483 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
5484 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
5485 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
5487 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
5488 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
5489 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
5491 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
5492 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
5493 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
5495 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
5496 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
5497 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
5499 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
5500 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
5501 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
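/*
 * Reading one expansion as an example: DO_LD1_ZPZ_D(bdu, zss, MO_8)
 * above defines the sve_ldbdu_zss helper, a gather load of bytes
 * zero-extended into 64-bit elements, with each address formed from the
 * base plus a sign-extended 32-bit index (off_zss_d) shifted left by the
 * scale held in simd_data(desc).
 */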
5503 #undef DO_LD1_ZPZ_S
5504 #undef DO_LD1_ZPZ_D
5506 /* First fault loads with a vector index. */
5509 * Common helpers for all gather first-faulting loads.
5512 static inline QEMU_ALWAYS_INLINE
5513 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5514 target_ulong base, uint32_t desc, uintptr_t retaddr,
5515 uint32_t mtedesc, const int esz, const int msz,
5516 zreg_off_fn *off_fn,
5517 sve_ldst1_host_fn *host_fn,
5518 sve_ldst1_tlb_fn *tlb_fn)
5520 const int mmu_idx = cpu_mmu_index(env, false);
5521 const intptr_t reg_max = simd_oprsz(desc);
5522 const int scale = simd_data(desc);
5523 const int esize = 1 << esz;
5524 const int msize = 1 << msz;
5525 intptr_t reg_off;
5526 SVEHostPage info;
5527 target_ulong addr, in_page;
5529 /* Skip to the first true predicate. */
5530 reg_off = find_next_active(vg, 0, reg_max, esz);
5531 if (unlikely(reg_off >= reg_max)) {
5532 /* The entire predicate was false; no load occurs. */
5533 memset(vd, 0, reg_max);
5534 return;
5538 * Probe the first element, allowing faults.
5540 addr = base + (off_fn(vm, reg_off) << scale);
5541 if (mtedesc) {
5542 mte_check1(env, mtedesc, addr, retaddr);
5544 tlb_fn(env, vd, reg_off, addr, retaddr);
5546 /* After any fault, zero the other elements. */
5547 swap_memzero(vd, reg_off);
5548 reg_off += esize;
5549 swap_memzero(vd + reg_off, reg_max - reg_off);
5552 * Probe the remaining elements, not allowing faults.
5554 while (reg_off < reg_max) {
5555 uint64_t pg = vg[reg_off >> 6];
5556 do {
5557 if (likely((pg >> (reg_off & 63)) & 1)) {
5558 addr = base + (off_fn(vm, reg_off) << scale);
5559 in_page = -(addr | TARGET_PAGE_MASK);
5561 if (unlikely(in_page < msize)) {
5562 /* Stop if the element crosses a page boundary. */
5563 goto fault;
5566 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
5567 mmu_idx, retaddr);
5568 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
5569 goto fault;
5571 if (unlikely(info.flags & TLB_WATCHPOINT) &&
5572 (cpu_watchpoint_address_matches
5573 (env_cpu(env), addr, msize) & BP_MEM_READ)) {
5574 goto fault;
5576 if (mtedesc &&
5577 arm_tlb_mte_tagged(&info.attrs) &&
5578 !mte_probe1(env, mtedesc, addr)) {
5579 goto fault;
5582 host_fn(vd, reg_off, info.host);
5584 reg_off += esize;
5585 } while (reg_off & 63);
5587 return;
5589 fault:
5590 record_fault(env, reg_off, reg_max);
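/*
 * A worked example of the page-crossing test above (an editorial
 * illustration, assuming the usual definition of TARGET_PAGE_MASK as
 * -TARGET_PAGE_SIZE).  With 4KiB pages:
 *
 *   addr                       = 0x0000000012345ff9
 *   addr | TARGET_PAGE_MASK    = 0xfffffffffffffff9
 *   -(addr | TARGET_PAGE_MASK) = 0x7
 *
 * i.e. in_page is the number of bytes from addr to the end of its page
 * (7 here), so "in_page < msize" is true exactly when the element would
 * straddle a page boundary.
 */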
5593 static inline QEMU_ALWAYS_INLINE
5594 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5595 target_ulong base, uint32_t desc, uintptr_t retaddr,
5596 const int esz, const int msz,
5597 zreg_off_fn *off_fn,
5598 sve_ldst1_host_fn *host_fn,
5599 sve_ldst1_tlb_fn *tlb_fn)
5601 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5602 /* Remove mtedesc from the normal sve descriptor. */
5603 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5605 /*
5606 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
5607 * offset base entirely over the address space hole to change the
5608 * pointer tag, or change the bit55 selector. So we could here
5609 * examine TBI + TCMA like we do for sve_ldN_r_mte().
5610 */
5611 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
5612 esz, msz, off_fn, host_fn, tlb_fn);
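/*
 * Editorial sketch of the descriptor split performed above (not an
 * authoritative layout): the 32-bit desc keeps the ordinary SVE simd
 * descriptor (operand size, plus the scale in the simd_data field) in
 * its low SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT bits and the MTE check
 * descriptor in the bits above that, so
 *
 *   mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 *   desc    = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 *
 * leaves desc usable by simd_oprsz()/simd_data() exactly as in the
 * non-MTE path, while a zero mtedesc (as passed by the non-MTE entry
 * points) disables the MTE checks inside sve_ldff1_z().
 */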
5615 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
5616 void HELPER(sve_ldff##MEM##_##OFS) \
5617 (CPUARMState *env, void *vd, void *vg, \
5618 void *vm, target_ulong base, uint32_t desc) \
5620 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
5621 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5623 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
5624 (CPUARMState *env, void *vd, void *vg, \
5625 void *vm, target_ulong base, uint32_t desc) \
5627 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
5628 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5631 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
5632 void HELPER(sve_ldff##MEM##_##OFS) \
5633 (CPUARMState *env, void *vd, void *vg, \
5634 void *vm, target_ulong base, uint32_t desc) \
5636 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
5637 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5639 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
5640 (CPUARMState *env, void *vd, void *vg, \
5641 void *vm, target_ulong base, uint32_t desc) \
5643 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
5644 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5647 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
5648 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
5649 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
5650 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
5651 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
5653 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
5654 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
5655 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
5656 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
5657 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
5659 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
5660 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
5661 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
5662 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
5663 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
5665 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
5666 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
5667 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
5668 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
5669 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
5671 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
5672 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
5673 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
5674 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
5675 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
5677 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
5678 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
5679 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
5680 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
5681 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
5683 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
5684 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
5685 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
5686 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
5687 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
5689 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
5690 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
5691 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
5692 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
5693 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
5695 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
5696 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
5697 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
5699 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
5700 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
5701 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
5703 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
5704 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
5705 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
5707 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
5708 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
5709 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
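/*
 * For reference, a mechanical expansion of one of the instantiations
 * above; DO_LDFF1_ZPZ_S(bsu, zsu, MO_8) produces (sketch, with the
 * macro's line continuations removed):
 *
 *   void HELPER(sve_ldffbsu_zsu)(CPUARMState *env, void *vd, void *vg,
 *                                void *vm, target_ulong base, uint32_t desc)
 *   {
 *       sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MO_8,
 *                   off_zsu_s, sve_ld1bsu_host, sve_ld1bsu_tlb);
 *   }
 *   void HELPER(sve_ldffbsu_zsu_mte)(CPUARMState *env, void *vd, void *vg,
 *                                    void *vm, target_ulong base, uint32_t desc)
 *   {
 *       sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MO_8,
 *                       off_zsu_s, sve_ld1bsu_host, sve_ld1bsu_tlb);
 *   }
 */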
5711 /* Stores with a vector index. */
5713 static inline QEMU_ALWAYS_INLINE
5714 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5715 target_ulong base, uint32_t desc, uintptr_t retaddr,
5716 uint32_t mtedesc, int esize, int msize,
5717 zreg_off_fn *off_fn,
5718 sve_ldst1_host_fn *host_fn,
5719 sve_ldst1_tlb_fn *tlb_fn)
5721 const int mmu_idx = cpu_mmu_index(env, false);
5722 const intptr_t reg_max = simd_oprsz(desc);
5723 const int scale = simd_data(desc);
5724 void *host[ARM_MAX_VQ * 4];
5725 intptr_t reg_off, i;
5726 SVEHostPage info, info2;
5728 /*
5729 * Probe all of the elements for host addresses and flags.
5730 */
5731 i = reg_off = 0;
5732 do {
5733 uint64_t pg = vg[reg_off >> 6];
5734 do {
5735 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
5736 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
5738 host[i] = NULL;
5739 if (likely((pg >> (reg_off & 63)) & 1)) {
5740 if (likely(in_page >= msize)) {
5741 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
5742 mmu_idx, retaddr);
5743 host[i] = info.host;
5744 } else {
5745 /*
5746 * Element crosses the page boundary.
5747 * Probe both pages, but do not record the host address,
5748 * so that we use the slow path.
5749 */
5750 sve_probe_page(&info, false, env, addr, 0,
5751 MMU_DATA_STORE, mmu_idx, retaddr);
5752 sve_probe_page(&info2, false, env, addr + in_page, 0,
5753 MMU_DATA_STORE, mmu_idx, retaddr);
5754 info.flags |= info2.flags;
5757 if (unlikely(info.flags & TLB_WATCHPOINT)) {
5758 cpu_check_watchpoint(env_cpu(env), addr, msize,
5759 info.attrs, BP_MEM_WRITE, retaddr);
5762 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
5763 mte_check1(env, mtedesc, addr, retaddr);
5766 i += 1;
5767 reg_off += esize;
5768 } while (reg_off & 63);
5769 } while (reg_off < reg_max);
5771 /*
5772 * Now that we have recognized all exceptions except SyncExternal
5773 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
5774 *
5775 * Note for the common case of an element in RAM, not crossing a page
5776 * boundary, we have stored the host address in host[]. This doubles
5777 * as a first-level check against the predicate, since only enabled
5778 * elements have non-null host addresses.
5779 */
5780 i = reg_off = 0;
5781 do {
5782 void *h = host[i];
5783 if (likely(h != NULL)) {
5784 host_fn(vd, reg_off, h);
5785 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
5786 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
5787 tlb_fn(env, vd, reg_off, addr, retaddr);
5789 i += 1;
5790 reg_off += esize;
5791 } while (reg_off < reg_max);
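/*
 * Editorial summary of the control flow above: the helper is split into
 * two passes so that every fault it can recognise (translation,
 * permission, watchpoint, MTE) is raised before any byte is written;
 * only a synchronous external abort from a TLB_MMIO page can still
 * occur during the second pass.  The second pass stores through the
 * cached host pointer when one was recorded, and falls back to tlb_fn()
 * for the remaining predicated-on elements, such as those that cross a
 * page boundary.
 */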
5794 static inline QEMU_ALWAYS_INLINE
5795 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5796 target_ulong base, uint32_t desc, uintptr_t retaddr,
5797 int esize, int msize, zreg_off_fn *off_fn,
5798 sve_ldst1_host_fn *host_fn,
5799 sve_ldst1_tlb_fn *tlb_fn)
5801 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5802 /* Remove mtedesc from the normal sve descriptor. */
5803 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5805 /*
5806 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
5807 * offset base entirely over the address space hole to change the
5808 * pointer tag, or change the bit55 selector. So we could here
5809 * examine TBI + TCMA like we do for sve_ldN_r_mte().
5810 */
5811 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
5812 esize, msize, off_fn, host_fn, tlb_fn);
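/*
 * As with the gather loads, the non-MTE store entry points generated
 * below pass mtedesc == 0, so the "mtedesc && arm_tlb_mte_tagged()"
 * test in sve_st1_z() skips mte_check1() entirely; only the *_mte
 * helpers route through this wrapper with a non-zero mtedesc.
 */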
5815 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
5816 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5817 void *vm, target_ulong base, uint32_t desc) \
5819 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
5820 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
5822 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
5823 void *vm, target_ulong base, uint32_t desc) \
5825 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
5826 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
5829 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
5830 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5831 void *vm, target_ulong base, uint32_t desc) \
5833 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
5834 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
5836 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
5837 void *vm, target_ulong base, uint32_t desc) \
5839 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
5840 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
5843 DO_ST1_ZPZ_S(bs, zsu, MO_8)
5844 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
5845 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
5846 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
5847 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
5849 DO_ST1_ZPZ_S(bs, zss, MO_8)
5850 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
5851 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
5852 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
5853 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
5855 DO_ST1_ZPZ_D(bd, zsu, MO_8)
5856 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
5857 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
5858 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
5859 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
5860 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
5861 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
5863 DO_ST1_ZPZ_D(bd, zss, MO_8)
5864 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
5865 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
5866 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
5867 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
5868 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
5869 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
5871 DO_ST1_ZPZ_D(bd, zd, MO_8)
5872 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
5873 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
5874 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
5875 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
5876 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
5877 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
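/*
 * Note on the parameters passed by the two macro families: the
 * first-fault load macros hand sve_ldff1_z() log2 sizes (MO_32/MO_64
 * and MSZ, matching its esz/msz parameters), while the store macros
 * above hand sve_st1_z() byte sizes (4/8 and 1 << MSZ, matching its
 * esize/msize parameters), so here the MO_* argument is shifted before
 * use rather than passed through.
 */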
5879 #undef DO_ST1_ZPZ_S
5880 #undef DO_ST1_ZPZ_D
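/*
 * How these helpers become visible to the translator is outside this
 * file; as a hedged sketch only, each is assumed to be declared in
 * target/arm/helper-sve.h with the usual DEF_HELPER_FLAGS_* pattern
 * matching its C signature, e.g. for sve_stbs_zsu above:
 *
 *   DEF_HELPER_FLAGS_6(sve_stbs_zsu, TCG_CALL_NO_WG,
 *                      void, env, ptr, ptr, ptr, tl, i32)
 *
 * (six arguments: CPUARMState *, the three vector/predicate pointers,
 * the target_ulong base and the uint32_t descriptor).
 */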