target/arm: Reorganize SVE WHILE
[qemu/ar7.git] / target/arm/sve_helper.c
1 /*
2 * ARM SVE Operations
4 * Copyright (c) 2018 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/exec-all.h"
23 #include "exec/cpu_ldst.h"
24 #include "exec/helper-proto.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "fpu/softfloat.h"
29 /* Note that vector data is stored in host-endian 64-bit chunks,
30 so addressing units smaller than that needs a host-endian fixup. */
31 #ifdef HOST_WORDS_BIGENDIAN
32 #define H1(x) ((x) ^ 7)
33 #define H1_2(x) ((x) ^ 6)
34 #define H1_4(x) ((x) ^ 4)
35 #define H2(x) ((x) ^ 3)
36 #define H4(x) ((x) ^ 1)
37 #else
38 #define H1(x) (x)
39 #define H1_2(x) (x)
40 #define H1_4(x) (x)
41 #define H2(x) (x)
42 #define H4(x) (x)
43 #endif
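/* For example, on a big-endian host H1(0) == 7 and H1(1) == 6, so the
 * logical first byte of a 64-bit chunk is read from host byte 7; likewise
 * H2(0) == 3 maps logical half-word 0 to host half-word 3 of the chunk,
 * and H4(0) == 1 maps logical word 0 to host word 1.  On a little-endian
 * host every H macro is the identity, so no fixup is applied.
 */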
45 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
47 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
48 * and bit 0 set if C is set. Compare the definitions of these variables
49 * within CPUARMState.
52 /* For no G bits set, NZCV = C. */
53 #define PREDTEST_INIT 1
55 /* This is an iterative function, called for each Pd and Pg word
56 * moving forward.
58 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
60 if (likely(g)) {
61 /* Compute N from first D & G.
62 Use bit 2 to signal first G bit seen. */
63 if (!(flags & 4)) {
64 flags |= ((d & (g & -g)) != 0) << 31;
65 flags |= 4;
68 /* Accumulate Z from each D & G. */
69 flags |= ((d & g) != 0) << 1;
71 /* Compute C from last !(D & G). Replace previous. */
72 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
74 return flags;
77 /* This is an iterative function, called for each Pd and Pg word
78 * moving backward.
80 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
82 if (likely(g)) {
83 /* Compute C from first (i.e last) !(D & G).
84 Use bit 2 to signal first G bit seen. */
85 if (!(flags & 4)) {
86 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
87 flags |= (d & pow2floor(g)) == 0;
90 /* Accumulate Z from each D & G. */
91 flags |= ((d & g) != 0) << 1;
93 /* Compute N from last (i.e first) D & G. Replace previous. */
94 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
96 return flags;
99 /* The same for a single word predicate. */
100 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
102 return iter_predtest_fwd(d, g, PREDTEST_INIT);
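/* Illustrative case: sve_predtest1(1, 1) -- one active element which is
 * true -- returns 0x80000006.  Bit 31 (N) is set because the first active
 * bit of D is set, bit 1 is set because some active bit of D is set
 * (Z clear), and bit 0 is clear because the last active bit of D is set
 * (C clear); bit 2 is only the internal "first G bit seen" marker.
 */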
105 /* The same for a multi-word predicate. */
106 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
108 uint32_t flags = PREDTEST_INIT;
109 uint64_t *d = vd, *g = vg;
110 uintptr_t i = 0;
112 do {
113 flags = iter_predtest_fwd(d[i], g[i], flags);
114 } while (++i < words);
116 return flags;
119 /* Expand active predicate bits to bytes, for byte elements.
120 * for (i = 0; i < 256; ++i) {
121 * unsigned long m = 0;
122 * for (j = 0; j < 8; j++) {
123 * if ((i >> j) & 1) {
124 * m |= 0xfful << (j << 3);
127 * printf("0x%016lx,\n", m);
130 static inline uint64_t expand_pred_b(uint8_t byte)
132 static const uint64_t word[256] = {
133 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
134 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
135 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
136 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
137 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
138 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
139 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
140 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
141 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
142 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
143 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
144 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
145 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
146 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
147 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
148 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
149 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
150 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
151 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
152 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
153 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
154 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
155 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
156 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
157 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
158 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
159 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
160 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
161 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
162 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
163 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
164 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
165 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
166 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
167 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
168 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
169 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
170 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
171 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
172 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
173 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
174 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
175 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
176 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
177 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
178 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
179 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
180 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
181 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
182 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
183 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
184 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
185 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
186 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
187 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
188 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
189 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
190 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
191 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
192 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
193 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
194 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
195 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
196 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
197 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
198 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
199 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
200 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
201 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
202 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
203 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
204 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
205 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
206 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
207 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
208 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
209 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
210 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
211 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
212 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
213 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
214 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
215 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
216 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
217 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
218 0xffffffffffffffff,
220 return word[byte];
223 /* Similarly for half-word elements.
224 * for (i = 0; i < 256; ++i) {
225 * unsigned long m = 0;
226 * if (i & 0xaa) {
227 * continue;
229 * for (j = 0; j < 8; j += 2) {
230 * if ((i >> j) & 1) {
231 * m |= 0xfffful << (j << 3);
234 * printf("[0x%x] = 0x%016lx,\n", i, m);
237 static inline uint64_t expand_pred_h(uint8_t byte)
239 static const uint64_t word[] = {
240 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
241 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
242 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
243 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
244 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
245 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
246 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
247 [0x55] = 0xffffffffffffffff,
249 return word[byte & 0x55];
252 /* Similarly for single word elements. */
253 static inline uint64_t expand_pred_s(uint8_t byte)
255 static const uint64_t word[] = {
256 [0x01] = 0x00000000ffffffffull,
257 [0x10] = 0xffffffff00000000ull,
258 [0x11] = 0xffffffffffffffffull,
260 return word[byte & 0x11];
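/* A concrete case, as used by the predicated helpers below: predicate
 * byte 0x05 (byte elements 0 and 2 active) gives
 * expand_pred_b(0x05) == 0x0000000000ff00ff, a mask that can be applied
 * to a whole 64-bit data chunk at once, e.g. as a merging select:
 *
 *     uint64_t pp = expand_pred_b(pg[H1(i)]);
 *     d[i] = (n[i] & pp) | (d[i] & ~pp);
 *
 * expand_pred_h and expand_pred_s first mask away the predicate bits that
 * do not correspond to the first byte of a 2- or 4-byte element.
 */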
263 /* Swap 16-bit words within a 32-bit word. */
264 static inline uint32_t hswap32(uint32_t h)
266 return rol32(h, 16);
269 /* Swap 16-bit words within a 64-bit word. */
270 static inline uint64_t hswap64(uint64_t h)
272 uint64_t m = 0x0000ffff0000ffffull;
273 h = rol64(h, 32);
274 return ((h & m) << 16) | ((h >> 16) & m);
277 /* Swap 32-bit words within a 64-bit word. */
278 static inline uint64_t wswap64(uint64_t h)
280 return rol64(h, 32);
283 #define LOGICAL_PPPP(NAME, FUNC) \
284 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
286 uintptr_t opr_sz = simd_oprsz(desc); \
287 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
288 uintptr_t i; \
289 for (i = 0; i < opr_sz / 8; ++i) { \
290 d[i] = FUNC(n[i], m[i], g[i]); \
294 #define DO_AND(N, M, G) (((N) & (M)) & (G))
295 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
296 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
297 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
298 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
299 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
300 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
301 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
303 LOGICAL_PPPP(sve_and_pppp, DO_AND)
304 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
305 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
306 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
307 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
308 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
309 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
310 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
312 #undef DO_AND
313 #undef DO_BIC
314 #undef DO_EOR
315 #undef DO_ORR
316 #undef DO_ORN
317 #undef DO_NOR
318 #undef DO_NAND
319 #undef DO_SEL
320 #undef LOGICAL_PPPP
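/* Each generated helper operates on whole 64-bit predicate words; e.g.
 * sve_sel_pppp computes, per word,
 *     d[i] = (n[i] & g[i]) | (m[i] & ~g[i])
 * taking bits of N where the governing predicate G is set and bits of M
 * where it is clear.
 */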
322 /* Fully general three-operand expander, controlled by a predicate.
323 * This is complicated by the host-endian storage of the register file.
325 /* ??? I don't expect the compiler could ever vectorize this itself.
326 * With some tables we can convert bit masks to byte masks, and with
327 * extra care wrt byte/word ordering we could use gcc generic vectors
328 * and do 16 bytes at a time.
330 #define DO_ZPZZ(NAME, TYPE, H, OP) \
331 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
333 intptr_t i, opr_sz = simd_oprsz(desc); \
334 for (i = 0; i < opr_sz; ) { \
335 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
336 do { \
337 if (pg & 1) { \
338 TYPE nn = *(TYPE *)(vn + H(i)); \
339 TYPE mm = *(TYPE *)(vm + H(i)); \
340 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
342 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
343 } while (i & 15); \
347 /* Similarly, specialized for 64-bit operands. */
348 #define DO_ZPZZ_D(NAME, TYPE, OP) \
349 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
351 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
352 TYPE *d = vd, *n = vn, *m = vm; \
353 uint8_t *pg = vg; \
354 for (i = 0; i < opr_sz; i += 1) { \
355 if (pg[H1(i)] & 1) { \
356 TYPE nn = n[i], mm = m[i]; \
357 d[i] = OP(nn, mm); \
362 #define DO_AND(N, M) (N & M)
363 #define DO_EOR(N, M) (N ^ M)
364 #define DO_ORR(N, M) (N | M)
365 #define DO_BIC(N, M) (N & ~M)
366 #define DO_ADD(N, M) (N + M)
367 #define DO_SUB(N, M) (N - M)
368 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
369 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
370 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
371 #define DO_MUL(N, M) (N * M)
375 * We must avoid the C undefined behaviour cases: division by
376 * zero and signed division of INT_MIN by -1. Both of these
377 * have architecturally defined required results for Arm.
378 * We special case all signed divisions by -1 to avoid having
379 * to deduce the minimum integer for the type involved.
381 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
382 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
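/* E.g. DO_SDIV(INT32_MIN, -1) takes the M == -1 branch and negates N,
 * wrapping back to INT32_MIN, the result the architecture requires for
 * this overflowing division; DO_SDIV(x, 0) and DO_UDIV(x, 0) both return
 * 0, likewise as architecturally defined.
 */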
384 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
385 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
386 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
387 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
389 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
390 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
391 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
392 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
394 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
395 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
396 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
397 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
399 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
400 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
401 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
402 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
404 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
405 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
406 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
407 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
409 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
410 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
411 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
412 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
414 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
415 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
416 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
417 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
419 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
420 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
421 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
422 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
424 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
425 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
426 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
427 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
429 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
430 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
431 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
432 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
434 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
435 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
436 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
437 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
439 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
440 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
441 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
442 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
444 /* Because the computation type is at least twice as large as required,
445 these work for both signed and unsigned source types. */
446 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
448 return (n * m) >> 8;
451 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
453 return (n * m) >> 16;
456 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
458 return (n * m) >> 32;
461 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
463 uint64_t lo, hi;
464 muls64(&lo, &hi, n, m);
465 return hi;
468 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
470 uint64_t lo, hi;
471 mulu64(&lo, &hi, n, m);
472 return hi;
475 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
476 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
477 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
478 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
480 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
481 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
482 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
483 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
485 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
486 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
487 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
488 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
490 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
491 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
493 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
494 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
496 /* Note that all bits of the shift are significant
497 and not modulo the element size. */
498 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
499 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
500 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
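/* E.g. DO_LSR((uint8_t)0x80, 8) and DO_LSL((uint8_t)1, 9) are both 0,
 * while DO_ASR((int8_t)-2, 200) clamps the count to 7 and still yields
 * -1; none of these rely on the C behaviour of over-wide shifts.
 */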
502 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
503 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
504 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
506 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
507 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
508 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
510 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
511 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
512 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
514 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
515 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
516 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
518 #undef DO_ZPZZ
519 #undef DO_ZPZZ_D
521 /* Three-operand expander, controlled by a predicate, in which the
522 * third operand is "wide". That is, for D = N op M, the same 64-bit
523 * value of M is used with all of the narrower values of N.
525 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
526 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
528 intptr_t i, opr_sz = simd_oprsz(desc); \
529 for (i = 0; i < opr_sz; ) { \
530 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
531 TYPEW mm = *(TYPEW *)(vm + i); \
532 do { \
533 if (pg & 1) { \
534 TYPE nn = *(TYPE *)(vn + H(i)); \
535 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
537 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
538 } while (i & 7); \
542 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
543 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
544 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
546 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
547 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
548 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
550 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
551 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
552 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
554 #undef DO_ZPZW
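/* In the wide forms above, one 64-bit count from M is applied to every
 * narrow element of N within the same 64-bit chunk: for sve_lsr_zpzw_b,
 * the uint64_t at m[0..7] shifts each of the bytes n[0..7], the next
 * uint64_t shifts n[8..15], and so on, with the predicate still
 * consulted per byte.
 */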
556 /* Fully general two-operand expander, controlled by a predicate.
558 #define DO_ZPZ(NAME, TYPE, H, OP) \
559 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
561 intptr_t i, opr_sz = simd_oprsz(desc); \
562 for (i = 0; i < opr_sz; ) { \
563 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
564 do { \
565 if (pg & 1) { \
566 TYPE nn = *(TYPE *)(vn + H(i)); \
567 *(TYPE *)(vd + H(i)) = OP(nn); \
569 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
570 } while (i & 15); \
574 /* Similarly, specialized for 64-bit operands. */
575 #define DO_ZPZ_D(NAME, TYPE, OP) \
576 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
578 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
579 TYPE *d = vd, *n = vn; \
580 uint8_t *pg = vg; \
581 for (i = 0; i < opr_sz; i += 1) { \
582 if (pg[H1(i)] & 1) { \
583 TYPE nn = n[i]; \
584 d[i] = OP(nn); \
589 #define DO_CLS_B(N) (clrsb32(N) - 24)
590 #define DO_CLS_H(N) (clrsb32(N) - 16)
592 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
593 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
594 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
595 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
597 #define DO_CLZ_B(N) (clz32(N) - 24)
598 #define DO_CLZ_H(N) (clz32(N) - 16)
600 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
601 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
602 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
603 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
605 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
606 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
607 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
608 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
610 #define DO_CNOT(N) (N == 0)
612 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
613 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
614 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
615 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
617 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
619 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
620 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
621 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
623 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
625 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
626 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
627 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
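/* These work on the raw IEEE bit patterns: for the 32-bit case,
 * ((__typeof(N))-1 >> 1) is 0x7fffffff, so DO_FABS clears the sign bit
 * and DO_FNEG flips it with 0x80000000; no floating-point exception can
 * be raised.
 */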
629 #define DO_NOT(N) (~N)
631 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
632 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
633 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
634 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
636 #define DO_SXTB(N) ((int8_t)N)
637 #define DO_SXTH(N) ((int16_t)N)
638 #define DO_SXTS(N) ((int32_t)N)
639 #define DO_UXTB(N) ((uint8_t)N)
640 #define DO_UXTH(N) ((uint16_t)N)
641 #define DO_UXTS(N) ((uint32_t)N)
643 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
644 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
645 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
646 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
647 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
648 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
650 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
651 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
652 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
653 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
654 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
655 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
657 #define DO_ABS(N) (N < 0 ? -N : N)
659 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
660 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
661 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
662 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
664 #define DO_NEG(N) (-N)
666 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
667 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
668 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
669 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
671 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
672 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
673 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
675 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
676 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
678 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
680 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
681 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
682 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
683 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
685 /* Three-operand expander, unpredicated, in which the third operand is "wide".
687 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
688 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
690 intptr_t i, opr_sz = simd_oprsz(desc); \
691 for (i = 0; i < opr_sz; ) { \
692 TYPEW mm = *(TYPEW *)(vm + i); \
693 do { \
694 TYPE nn = *(TYPE *)(vn + H(i)); \
695 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
696 i += sizeof(TYPE); \
697 } while (i & 7); \
701 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
702 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
703 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
705 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
706 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
707 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
709 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
710 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
711 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
713 #undef DO_ZZW
715 #undef DO_CLS_B
716 #undef DO_CLS_H
717 #undef DO_CLZ_B
718 #undef DO_CLZ_H
719 #undef DO_CNOT
720 #undef DO_FABS
721 #undef DO_FNEG
722 #undef DO_ABS
723 #undef DO_NEG
724 #undef DO_ZPZ
725 #undef DO_ZPZ_D
727 /* Two-operand reduction expander, controlled by a predicate.
728 * The difference between TYPERED and TYPERET has to do with
729 * sign-extension. E.g. for SMAX, TYPERED must be signed,
730 * but TYPERET must be unsigned so that e.g. a 32-bit value
731 * is not sign-extended to the ABI uint64_t return type.
733 /* ??? If we were to vectorize this by hand the reduction ordering
734 * would change. For integer operands, this is perfectly fine.
736 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
737 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
739 intptr_t i, opr_sz = simd_oprsz(desc); \
740 TYPERED ret = INIT; \
741 for (i = 0; i < opr_sz; ) { \
742 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
743 do { \
744 if (pg & 1) { \
745 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
746 ret = OP(ret, nn); \
748 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
749 } while (i & 15); \
751 return (TYPERET)ret; \
754 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
755 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
757 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
758 TYPEE *n = vn; \
759 uint8_t *pg = vg; \
760 TYPER ret = INIT; \
761 for (i = 0; i < opr_sz; i += 1) { \
762 if (pg[H1(i)] & 1) { \
763 TYPEE nn = n[i]; \
764 ret = OP(ret, nn); \
767 return ret; \
770 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
771 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
772 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
773 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
775 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
776 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
777 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
778 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
780 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
781 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
782 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
783 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
785 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
786 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
787 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
789 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
790 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
791 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
792 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
794 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
795 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
796 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
797 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
799 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
800 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
801 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
802 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
804 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
805 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
806 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
807 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
809 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
810 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
811 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
812 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
814 #undef DO_VPZ
815 #undef DO_VPZ_D
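/* Why TYPERET matters: sve_smaxv_b over active elements that are all -1
 * reduces to (int8_t)-1; returning it through uint8_t yields 0xff in the
 * uint64_t ABI return value rather than a sign-extended
 * 0xffffffffffffffff.
 */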
817 /* Two vector operand, one scalar operand, unpredicated. */
818 #define DO_ZZI(NAME, TYPE, OP) \
819 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
821 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
822 TYPE s = s64, *d = vd, *n = vn; \
823 for (i = 0; i < opr_sz; ++i) { \
824 d[i] = OP(n[i], s); \
828 #define DO_SUBR(X, Y) (Y - X)
830 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
831 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
832 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
833 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
835 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
836 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
837 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
838 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
840 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
841 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
842 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
843 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
845 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
846 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
847 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
848 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
850 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
851 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
852 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
853 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
855 #undef DO_ZZI
857 #undef DO_AND
858 #undef DO_ORR
859 #undef DO_EOR
860 #undef DO_BIC
861 #undef DO_ADD
862 #undef DO_SUB
863 #undef DO_MAX
864 #undef DO_MIN
865 #undef DO_ABD
866 #undef DO_MUL
867 #undef DO_DIV
868 #undef DO_ASR
869 #undef DO_LSR
870 #undef DO_LSL
871 #undef DO_SUBR
873 /* Similar to the ARM LastActiveElement pseudocode function, except the
874 result is multiplied by the element size. This includes the not found
875 indication; e.g. not found for esz=3 is -8. */
876 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
878 uint64_t mask = pred_esz_masks[esz];
879 intptr_t i = words;
881 do {
882 uint64_t this_g = g[--i] & mask;
883 if (this_g) {
884 return i * 64 + (63 - clz64(this_g));
886 } while (i > 0);
887 return (intptr_t)-1 << esz;
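/* Example: for esz == 2 (word elements), a predicate whose only set bit
 * is bit 8 means element 2 is active, and the function returns
 * 8 == 2 * 4, the element index pre-multiplied by the element size; with
 * no active elements it returns -4, the "not found" value for words.
 */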
890 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
892 uint32_t flags = PREDTEST_INIT;
893 uint64_t *d = vd, *g = vg;
894 intptr_t i = 0;
896 do {
897 uint64_t this_d = d[i];
898 uint64_t this_g = g[i];
900 if (this_g) {
901 if (!(flags & 4)) {
902 /* Set in D the first bit of G. */
903 this_d |= this_g & -this_g;
904 d[i] = this_d;
906 flags = iter_predtest_fwd(this_d, this_g, flags);
908 } while (++i < words);
910 return flags;
913 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
915 intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
916 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
917 uint32_t flags = PREDTEST_INIT;
918 uint64_t *d = vd, *g = vg, esz_mask;
919 intptr_t i, next;
921 next = last_active_element(vd, words, esz) + (1 << esz);
922 esz_mask = pred_esz_masks[esz];
924 /* Similar to the pseudocode for pnext, but scaled by ESZ
925 so that we find the correct bit. */
926 if (next < words * 64) {
927 uint64_t mask = -1;
929 if (next & 63) {
930 mask = ~((1ull << (next & 63)) - 1);
931 next &= -64;
933 do {
934 uint64_t this_g = g[next / 64] & esz_mask & mask;
935 if (this_g != 0) {
936 next = (next & -64) + ctz64(this_g);
937 break;
939 next += 64;
940 mask = -1;
941 } while (next < words * 64);
944 i = 0;
945 do {
946 uint64_t this_d = 0;
947 if (i == next / 64) {
948 this_d = 1ull << (next & 63);
950 d[i] = this_d;
951 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
952 } while (++i < words);
954 return flags;
957 /* Store zero into every active element of Zd. We will use this for two
958 * and three-operand predicated instructions for which logic dictates a
959 * zero result. In particular, logical shift by element size, which is
960 * otherwise undefined on the host.
962 * For element sizes smaller than uint64_t, we use tables to expand
963 * the N bits of the controlling predicate to a byte mask, and clear
964 * those bytes.
966 void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
968 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
969 uint64_t *d = vd;
970 uint8_t *pg = vg;
971 for (i = 0; i < opr_sz; i += 1) {
972 d[i] &= ~expand_pred_b(pg[H1(i)]);
976 void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
978 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
979 uint64_t *d = vd;
980 uint8_t *pg = vg;
981 for (i = 0; i < opr_sz; i += 1) {
982 d[i] &= ~expand_pred_h(pg[H1(i)]);
986 void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
988 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
989 uint64_t *d = vd;
990 uint8_t *pg = vg;
991 for (i = 0; i < opr_sz; i += 1) {
992 d[i] &= ~expand_pred_s(pg[H1(i)]);
996 void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
998 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
999 uint64_t *d = vd;
1000 uint8_t *pg = vg;
1001 for (i = 0; i < opr_sz; i += 1) {
1002 if (pg[H1(i)] & 1) {
1003 d[i] = 0;
1008 /* Copy Zn into Zd, and store zero into inactive elements. */
1009 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1011 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1012 uint64_t *d = vd, *n = vn;
1013 uint8_t *pg = vg;
1014 for (i = 0; i < opr_sz; i += 1) {
1015 d[i] = n[i] & expand_pred_b(pg[H1(i)]);
1019 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1021 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1022 uint64_t *d = vd, *n = vn;
1023 uint8_t *pg = vg;
1024 for (i = 0; i < opr_sz; i += 1) {
1025 d[i] = n[i] & expand_pred_h(pg[H1(i)]);
1029 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1031 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1032 uint64_t *d = vd, *n = vn;
1033 uint8_t *pg = vg;
1034 for (i = 0; i < opr_sz; i += 1) {
1035 d[i] = n[i] & expand_pred_s(pg[H1(i)]);
1039 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1041 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1042 uint64_t *d = vd, *n = vn;
1043 uint8_t *pg = vg;
1044 for (i = 0; i < opr_sz; i += 1) {
1045 d[i] = n[i] & -(uint64_t)(pg[H1(i)] & 1);
1049 /* Three-operand expander, immediate operand, controlled by a predicate.
1051 #define DO_ZPZI(NAME, TYPE, H, OP) \
1052 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1054 intptr_t i, opr_sz = simd_oprsz(desc); \
1055 TYPE imm = simd_data(desc); \
1056 for (i = 0; i < opr_sz; ) { \
1057 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1058 do { \
1059 if (pg & 1) { \
1060 TYPE nn = *(TYPE *)(vn + H(i)); \
1061 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1063 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1064 } while (i & 15); \
1068 /* Similarly, specialized for 64-bit operands. */
1069 #define DO_ZPZI_D(NAME, TYPE, OP) \
1070 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1072 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1073 TYPE *d = vd, *n = vn; \
1074 TYPE imm = simd_data(desc); \
1075 uint8_t *pg = vg; \
1076 for (i = 0; i < opr_sz; i += 1) { \
1077 if (pg[H1(i)] & 1) { \
1078 TYPE nn = n[i]; \
1079 d[i] = OP(nn, imm); \
1084 #define DO_SHR(N, M) (N >> M)
1085 #define DO_SHL(N, M) (N << M)
1087 /* Arithmetic shift right for division. This rounds negative numbers
1088 toward zero as per signed division. Therefore before shifting,
1089 when N is negative, add 2**M-1. */
1090 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
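/* E.g. DO_ASRD(-5, 1) computes (-5 + 1) >> 1 == -2, matching -5 / 2
 * truncated toward zero, where a plain arithmetic shift would give -3.
 */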
1092 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1093 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1094 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1095 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1097 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1098 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1099 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1100 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1102 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1103 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1104 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1105 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1107 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1108 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1109 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1110 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1112 #undef DO_SHR
1113 #undef DO_SHL
1114 #undef DO_ASRD
1115 #undef DO_ZPZI
1116 #undef DO_ZPZI_D
1118 /* Fully general four-operand expander, controlled by a predicate.
1120 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
1121 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1122 void *vg, uint32_t desc) \
1124 intptr_t i, opr_sz = simd_oprsz(desc); \
1125 for (i = 0; i < opr_sz; ) { \
1126 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1127 do { \
1128 if (pg & 1) { \
1129 TYPE nn = *(TYPE *)(vn + H(i)); \
1130 TYPE mm = *(TYPE *)(vm + H(i)); \
1131 TYPE aa = *(TYPE *)(va + H(i)); \
1132 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1134 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1135 } while (i & 15); \
1139 /* Similarly, specialized for 64-bit operands. */
1140 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
1141 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1142 void *vg, uint32_t desc) \
1144 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1145 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1146 uint8_t *pg = vg; \
1147 for (i = 0; i < opr_sz; i += 1) { \
1148 if (pg[H1(i)] & 1) { \
1149 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1150 d[i] = OP(aa, nn, mm); \
1155 #define DO_MLA(A, N, M) (A + N * M)
1156 #define DO_MLS(A, N, M) (A - N * M)
1158 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1159 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1161 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1162 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1164 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1165 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1167 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1168 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1170 #undef DO_MLA
1171 #undef DO_MLS
1172 #undef DO_ZPZZZ
1173 #undef DO_ZPZZZ_D
1175 void HELPER(sve_index_b)(void *vd, uint32_t start,
1176 uint32_t incr, uint32_t desc)
1178 intptr_t i, opr_sz = simd_oprsz(desc);
1179 uint8_t *d = vd;
1180 for (i = 0; i < opr_sz; i += 1) {
1181 d[H1(i)] = start + i * incr;
1185 void HELPER(sve_index_h)(void *vd, uint32_t start,
1186 uint32_t incr, uint32_t desc)
1188 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1189 uint16_t *d = vd;
1190 for (i = 0; i < opr_sz; i += 1) {
1191 d[H2(i)] = start + i * incr;
1195 void HELPER(sve_index_s)(void *vd, uint32_t start,
1196 uint32_t incr, uint32_t desc)
1198 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1199 uint32_t *d = vd;
1200 for (i = 0; i < opr_sz; i += 1) {
1201 d[H4(i)] = start + i * incr;
1205 void HELPER(sve_index_d)(void *vd, uint64_t start,
1206 uint64_t incr, uint32_t desc)
1208 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1209 uint64_t *d = vd;
1210 for (i = 0; i < opr_sz; i += 1) {
1211 d[i] = start + i * incr;
1215 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1217 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1218 uint32_t sh = simd_data(desc);
1219 uint32_t *d = vd, *n = vn, *m = vm;
1220 for (i = 0; i < opr_sz; i += 1) {
1221 d[i] = n[i] + (m[i] << sh);
1225 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1227 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1228 uint64_t sh = simd_data(desc);
1229 uint64_t *d = vd, *n = vn, *m = vm;
1230 for (i = 0; i < opr_sz; i += 1) {
1231 d[i] = n[i] + (m[i] << sh);
1235 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1237 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1238 uint64_t sh = simd_data(desc);
1239 uint64_t *d = vd, *n = vn, *m = vm;
1240 for (i = 0; i < opr_sz; i += 1) {
1241 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1245 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1247 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1248 uint64_t sh = simd_data(desc);
1249 uint64_t *d = vd, *n = vn, *m = vm;
1250 for (i = 0; i < opr_sz; i += 1) {
1251 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1255 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1257 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1258 static const uint16_t coeff[] = {
1259 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1260 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1261 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1262 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1264 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1265 uint16_t *d = vd, *n = vn;
1267 for (i = 0; i < opr_sz; i++) {
1268 uint16_t nn = n[i];
1269 intptr_t idx = extract32(nn, 0, 5);
1270 uint16_t exp = extract32(nn, 5, 5);
1271 d[i] = coeff[idx] | (exp << 10);
1275 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1277 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1278 static const uint32_t coeff[] = {
1279 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1280 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1281 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1282 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1283 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1284 0x1ef532, 0x20b051, 0x227043, 0x243516,
1285 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1286 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1287 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1288 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1289 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1290 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1291 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1292 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1293 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1294 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1296 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1297 uint32_t *d = vd, *n = vn;
1299 for (i = 0; i < opr_sz; i++) {
1300 uint32_t nn = n[i];
1301 intptr_t idx = extract32(nn, 0, 6);
1302 uint32_t exp = extract32(nn, 6, 8);
1303 d[i] = coeff[idx] | (exp << 23);
1307 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1309 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1310 static const uint64_t coeff[] = {
1311 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1312 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1313 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1314 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1315 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1316 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1317 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1318 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1319 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1320 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1321 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1322 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1323 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1324 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1325 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1326 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1327 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1328 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1329 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1330 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1331 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1332 0xFA7C1819E90D8ull,
1334 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1335 uint64_t *d = vd, *n = vn;
1337 for (i = 0; i < opr_sz; i++) {
1338 uint64_t nn = n[i];
1339 intptr_t idx = extract32(nn, 0, 6);
1340 uint64_t exp = extract32(nn, 6, 11);
1341 d[i] = coeff[idx] | (exp << 52);
1345 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1347 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1348 uint16_t *d = vd, *n = vn, *m = vm;
1349 for (i = 0; i < opr_sz; i += 1) {
1350 uint16_t nn = n[i];
1351 uint16_t mm = m[i];
1352 if (mm & 1) {
1353 nn = float16_one;
1355 d[i] = nn ^ (mm & 2) << 14;
1359 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1361 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1362 uint32_t *d = vd, *n = vn, *m = vm;
1363 for (i = 0; i < opr_sz; i += 1) {
1364 uint32_t nn = n[i];
1365 uint32_t mm = m[i];
1366 if (mm & 1) {
1367 nn = float32_one;
1369 d[i] = nn ^ (mm & 2) << 30;
1373 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1375 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1376 uint64_t *d = vd, *n = vn, *m = vm;
1377 for (i = 0; i < opr_sz; i += 1) {
1378 uint64_t nn = n[i];
1379 uint64_t mm = m[i];
1380 if (mm & 1) {
1381 nn = float64_one;
1383 d[i] = nn ^ (mm & 2) << 62;
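/* The two low bits of M select between +/-N and +/-1.0: e.g. in the
 * 64-bit form, mm == 3 yields 0x3ff0000000000000 ^ 0x8000000000000000
 * == 0xbff0000000000000, i.e. -1.0, regardless of N.
 */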
1388 * Signed saturating addition with scalar operand.
1391 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1393 intptr_t i, oprsz = simd_oprsz(desc);
1395 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1396 int r = *(int8_t *)(a + i) + b;
1397 if (r > INT8_MAX) {
1398 r = INT8_MAX;
1399 } else if (r < INT8_MIN) {
1400 r = INT8_MIN;
1402 *(int8_t *)(d + i) = r;
1406 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1408 intptr_t i, oprsz = simd_oprsz(desc);
1410 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1411 int r = *(int16_t *)(a + i) + b;
1412 if (r > INT16_MAX) {
1413 r = INT16_MAX;
1414 } else if (r < INT16_MIN) {
1415 r = INT16_MIN;
1417 *(int16_t *)(d + i) = r;
1421 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1423 intptr_t i, oprsz = simd_oprsz(desc);
1425 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1426 int64_t r = *(int32_t *)(a + i) + b;
1427 if (r > INT32_MAX) {
1428 r = INT32_MAX;
1429 } else if (r < INT32_MIN) {
1430 r = INT32_MIN;
1432 *(int32_t *)(d + i) = r;
1436 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1438 intptr_t i, oprsz = simd_oprsz(desc);
1440 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1441 int64_t ai = *(int64_t *)(a + i);
1442 int64_t r = ai + b;
1443 if (((r ^ ai) & ~(ai ^ b)) < 0) {
1444 /* Signed overflow. */
1445 r = (r < 0 ? INT64_MAX : INT64_MIN);
1447 *(int64_t *)(d + i) = r;
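/* The test above is the usual two's-complement rule: overflow occurred
 * iff both addends have the same sign and the result's sign differs.
 * E.g. ai == INT64_MAX, b == 1 wraps r negative and saturates to
 * INT64_MAX.
 */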
1452 * Unsigned saturating addition with scalar operand.
1455 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1457 intptr_t i, oprsz = simd_oprsz(desc);
1459 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1460 int r = *(uint8_t *)(a + i) + b;
1461 if (r > UINT8_MAX) {
1462 r = UINT8_MAX;
1463 } else if (r < 0) {
1464 r = 0;
1466 *(uint8_t *)(d + i) = r;
1470 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1472 intptr_t i, oprsz = simd_oprsz(desc);
1474 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1475 int r = *(uint16_t *)(a + i) + b;
1476 if (r > UINT16_MAX) {
1477 r = UINT16_MAX;
1478 } else if (r < 0) {
1479 r = 0;
1481 *(uint16_t *)(d + i) = r;
1485 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1487 intptr_t i, oprsz = simd_oprsz(desc);
1489 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1490 int64_t r = *(uint32_t *)(a + i) + b;
1491 if (r > UINT32_MAX) {
1492 r = UINT32_MAX;
1493 } else if (r < 0) {
1494 r = 0;
1496 *(uint32_t *)(d + i) = r;
1500 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1502 intptr_t i, oprsz = simd_oprsz(desc);
1504 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1505 uint64_t r = *(uint64_t *)(a + i) + b;
1506 if (r < b) {
1507 r = UINT64_MAX;
1509 *(uint64_t *)(d + i) = r;
1513 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1515 intptr_t i, oprsz = simd_oprsz(desc);
1517 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1518 uint64_t ai = *(uint64_t *)(a + i);
1519 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1523 /* Two operand predicated copy immediate with merge. All valid immediates
1524 * can fit within 17 signed bits in the simd_data field.
1526 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1527 uint64_t mm, uint32_t desc)
1529 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1530 uint64_t *d = vd, *n = vn;
1531 uint8_t *pg = vg;
1533 mm = dup_const(MO_8, mm);
1534 for (i = 0; i < opr_sz; i += 1) {
1535 uint64_t nn = n[i];
1536 uint64_t pp = expand_pred_b(pg[H1(i)]);
1537 d[i] = (mm & pp) | (nn & ~pp);
1541 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1542 uint64_t mm, uint32_t desc)
1544 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1545 uint64_t *d = vd, *n = vn;
1546 uint8_t *pg = vg;
1548 mm = dup_const(MO_16, mm);
1549 for (i = 0; i < opr_sz; i += 1) {
1550 uint64_t nn = n[i];
1551 uint64_t pp = expand_pred_h(pg[H1(i)]);
1552 d[i] = (mm & pp) | (nn & ~pp);
1556 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1557 uint64_t mm, uint32_t desc)
1559 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1560 uint64_t *d = vd, *n = vn;
1561 uint8_t *pg = vg;
1563 mm = dup_const(MO_32, mm);
1564 for (i = 0; i < opr_sz; i += 1) {
1565 uint64_t nn = n[i];
1566 uint64_t pp = expand_pred_s(pg[H1(i)]);
1567 d[i] = (mm & pp) | (nn & ~pp);
1571 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1572 uint64_t mm, uint32_t desc)
1574 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1575 uint64_t *d = vd, *n = vn;
1576 uint8_t *pg = vg;
1578 for (i = 0; i < opr_sz; i += 1) {
1579 uint64_t nn = n[i];
1580 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1584 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1586 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1587 uint64_t *d = vd;
1588 uint8_t *pg = vg;
1590 val = dup_const(MO_8, val);
1591 for (i = 0; i < opr_sz; i += 1) {
1592 d[i] = val & expand_pred_b(pg[H1(i)]);
1596 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1598 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1599 uint64_t *d = vd;
1600 uint8_t *pg = vg;
1602 val = dup_const(MO_16, val);
1603 for (i = 0; i < opr_sz; i += 1) {
1604 d[i] = val & expand_pred_h(pg[H1(i)]);
1608 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1610 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1611 uint64_t *d = vd;
1612 uint8_t *pg = vg;
1614 val = dup_const(MO_32, val);
1615 for (i = 0; i < opr_sz; i += 1) {
1616 d[i] = val & expand_pred_s(pg[H1(i)]);
1620 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1622 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1623 uint64_t *d = vd;
1624 uint8_t *pg = vg;
1626 for (i = 0; i < opr_sz; i += 1) {
1627 d[i] = (pg[H1(i)] & 1 ? val : 0);
1631 /* Big-endian hosts need to frob the byte indices. If the copy
1632 * happens to be 8-byte aligned, then no frobbing necessary.
1634 static void swap_memmove(void *vd, void *vs, size_t n)
1636 uintptr_t d = (uintptr_t)vd;
1637 uintptr_t s = (uintptr_t)vs;
1638 uintptr_t o = (d | s | n) & 7;
1639 size_t i;
1641 #ifndef HOST_WORDS_BIGENDIAN
1642 o = 0;
1643 #endif
1644 switch (o) {
1645 case 0:
1646 memmove(vd, vs, n);
1647 break;
1649 case 4:
1650 if (d < s || d >= s + n) {
1651 for (i = 0; i < n; i += 4) {
1652 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1654 } else {
1655 for (i = n; i > 0; ) {
1656 i -= 4;
1657 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1660 break;
1662 case 2:
1663 case 6:
1664 if (d < s || d >= s + n) {
1665 for (i = 0; i < n; i += 2) {
1666 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1668 } else {
1669 for (i = n; i > 0; ) {
1670 i -= 2;
1671 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1674 break;
1676 default:
1677 if (d < s || d >= s + n) {
1678 for (i = 0; i < n; i++) {
1679 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1681 } else {
1682 for (i = n; i > 0; ) {
1683 i -= 1;
1684 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1687 break;
1691 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1693 intptr_t opr_sz = simd_oprsz(desc);
1694 size_t n_ofs = simd_data(desc);
1695 size_t n_siz = opr_sz - n_ofs;
1697 if (vd != vm) {
1698 swap_memmove(vd, vn + n_ofs, n_siz);
1699 swap_memmove(vd + n_siz, vm, n_ofs);
1700 } else if (vd != vn) {
1701 swap_memmove(vd + n_siz, vd, n_ofs);
1702 swap_memmove(vd, vn + n_ofs, n_siz);
1703 } else {
1704 /* vd == vn == vm. Need temp space. */
1705 ARMVectorReg tmp;
1706 swap_memmove(&tmp, vm, n_ofs);
1707 swap_memmove(vd, vd + n_ofs, n_siz);
1708 memcpy(vd + n_siz, &tmp, n_ofs);
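/* The result of EXT is thus the upper opr_sz - n_ofs bytes of N
 * (starting at byte n_ofs) followed by the low n_ofs bytes of M: e.g.
 * with n_ofs == 1, byte 0 of the destination is N's byte 1 and the final
 * byte is M's byte 0.
 */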
1712 #define DO_INSR(NAME, TYPE, H) \
1713 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1715 intptr_t opr_sz = simd_oprsz(desc); \
1716 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1717 *(TYPE *)(vd + H(0)) = val; \
1720 DO_INSR(sve_insr_b, uint8_t, H1)
1721 DO_INSR(sve_insr_h, uint16_t, H1_2)
1722 DO_INSR(sve_insr_s, uint32_t, H1_4)
1723 DO_INSR(sve_insr_d, uint64_t, )
1725 #undef DO_INSR
1727 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1729 intptr_t i, j, opr_sz = simd_oprsz(desc);
1730 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1731 uint64_t f = *(uint64_t *)(vn + i);
1732 uint64_t b = *(uint64_t *)(vn + j);
1733 *(uint64_t *)(vd + i) = bswap64(b);
1734 *(uint64_t *)(vd + j) = bswap64(f);
1738 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1740 intptr_t i, j, opr_sz = simd_oprsz(desc);
1741 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1742 uint64_t f = *(uint64_t *)(vn + i);
1743 uint64_t b = *(uint64_t *)(vn + j);
1744 *(uint64_t *)(vd + i) = hswap64(b);
1745 *(uint64_t *)(vd + j) = hswap64(f);
1749 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1751 intptr_t i, j, opr_sz = simd_oprsz(desc);
1752 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1753 uint64_t f = *(uint64_t *)(vn + i);
1754 uint64_t b = *(uint64_t *)(vn + j);
1755 *(uint64_t *)(vd + i) = rol64(b, 32);
1756 *(uint64_t *)(vd + j) = rol64(f, 32);
1760 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1762 intptr_t i, j, opr_sz = simd_oprsz(desc);
1763 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1764 uint64_t f = *(uint64_t *)(vn + i);
1765 uint64_t b = *(uint64_t *)(vn + j);
1766 *(uint64_t *)(vd + i) = b;
1767 *(uint64_t *)(vd + j) = f;
1771 #define DO_TBL(NAME, TYPE, H) \
1772 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1774 intptr_t i, opr_sz = simd_oprsz(desc); \
1775 uintptr_t elem = opr_sz / sizeof(TYPE); \
1776 TYPE *d = vd, *n = vn, *m = vm; \
1777 ARMVectorReg tmp; \
1778 if (unlikely(vd == vn)) { \
1779 n = memcpy(&tmp, vn, opr_sz); \
1781 for (i = 0; i < elem; i++) { \
1782 TYPE j = m[H(i)]; \
1783 d[H(i)] = j < elem ? n[H(j)] : 0; \
1787 DO_TBL(sve_tbl_b, uint8_t, H1)
1788 DO_TBL(sve_tbl_h, uint16_t, H2)
1789 DO_TBL(sve_tbl_s, uint32_t, H4)
1790 DO_TBL(sve_tbl_d, uint64_t, )
1792 #undef DO_TBL
1794 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1795 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1797 intptr_t i, opr_sz = simd_oprsz(desc); \
1798 TYPED *d = vd; \
1799 TYPES *n = vn; \
1800 ARMVectorReg tmp; \
1801 if (unlikely(vn - vd < opr_sz)) { \
1802 n = memcpy(&tmp, n, opr_sz / 2); \
1804 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1805 d[HD(i)] = n[HS(i)]; \
1809 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1810 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1811 DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1813 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1814 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1815 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1817 #undef DO_UNPK
1819 /* Mask of bits included in the even numbered predicates of width esz.
1820 * We also use this for expand_bits/compress_bits, and so extend the
1821 * same pattern out to 16-bit units.
1823 static const uint64_t even_bit_esz_masks[5] = {
1824 0x5555555555555555ull,
1825 0x3333333333333333ull,
1826 0x0f0f0f0f0f0f0f0full,
1827 0x00ff00ff00ff00ffull,
1828 0x0000ffff0000ffffull,
1831 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1832 * For N==0, this corresponds to the operation that in qemu/bitops.h
1833 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1834 * section 7-2 Shuffling Bits.
1836 static uint64_t expand_bits(uint64_t x, int n)
1838 int i;
1840 x &= 0xffffffffu;
1841 for (i = 4; i >= n; i--) {
1842 int sh = 1 << i;
1843 x = ((x << sh) | x) & even_bit_esz_masks[i];
1845 return x;
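/* E.g. expand_bits(0x0b, 0) == 0x45: input bits {0,1,3} move to the even
 * bit positions {0,2,6}, with zeros interleaved between them.  For n == 1,
 * 2-bit units spread into 4-bit units: expand_bits(0xe, 1) == 0x32.
 */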
1848 /* Compress units of 2**(N+1) bits to units of 2**N bits.
1849 * For N==0, this corresponds to the operation that in qemu/bitops.h
1850 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1851 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1853 static uint64_t compress_bits(uint64_t x, int n)
1855 int i;
1857 for (i = n; i <= 4; i++) {
1858 int sh = 1 << i;
1859 x &= even_bit_esz_masks[i];
1860 x = (x >> sh) | x;
1862 return x & 0xffffffffu;
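/* This is the inverse of expand_bits for values produced by it:
 * compress_bits(0x45, 0) == 0x0b and compress_bits(0x32, 1) == 0x0e.
 * Bits outside the positions selected by the first mask are discarded.
 */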
1865 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1867 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1868 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1869 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1870 uint64_t *d = vd;
1871 intptr_t i;
1873 if (oprsz <= 8) {
1874 uint64_t nn = *(uint64_t *)vn;
1875 uint64_t mm = *(uint64_t *)vm;
1876 int half = 4 * oprsz;
1878 nn = extract64(nn, high * half, half);
1879 mm = extract64(mm, high * half, half);
1880 nn = expand_bits(nn, esz);
1881 mm = expand_bits(mm, esz);
1882 d[0] = nn + (mm << (1 << esz));
1883 } else {
1884 ARMPredicateReg tmp_n, tmp_m;
1886 /* We produce output faster than we consume input.
1887 Therefore we must be mindful of possible overlap. */
1888 if ((vn - vd) < (uintptr_t)oprsz) {
1889 vn = memcpy(&tmp_n, vn, oprsz);
1891 if ((vm - vd) < (uintptr_t)oprsz) {
1892 vm = memcpy(&tmp_m, vm, oprsz);
1894 if (high) {
1895 high = oprsz >> 1;
1898 if ((high & 3) == 0) {
1899 uint32_t *n = vn, *m = vm;
1900 high >>= 2;
1902 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1903 uint64_t nn = n[H4(high + i)];
1904 uint64_t mm = m[H4(high + i)];
1906 nn = expand_bits(nn, esz);
1907 mm = expand_bits(mm, esz);
1908 d[i] = nn + (mm << (1 << esz));
1910 } else {
1911 uint8_t *n = vn, *m = vm;
1912 uint16_t *d16 = vd;
1914 for (i = 0; i < oprsz / 2; i++) {
1915 uint16_t nn = n[H1(high + i)];
1916 uint16_t mm = m[H1(high + i)];
1918 nn = expand_bits(nn, esz);
1919 mm = expand_bits(mm, esz);
1920 d16[H2(i)] = nn + (mm << (1 << esz));
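/* E.g. in the short-vector path above, with esz == 0, nn == 0b0011 and
 * mm == 0b0101 expand to 0x05 and 0x11, so d[0] == 0x05 + (0x11 << 1)
 * == 0x27, interleaving the predicate bits of N and M from bit 0 upward.
 */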
1926 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1928 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1929 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1930 int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1931 uint64_t *d = vd, *n = vn, *m = vm;
1932 uint64_t l, h;
1933 intptr_t i;
1935 if (oprsz <= 8) {
1936 l = compress_bits(n[0] >> odd, esz);
1937 h = compress_bits(m[0] >> odd, esz);
1938 d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1939 } else {
1940 ARMPredicateReg tmp_m;
1941 intptr_t oprsz_16 = oprsz / 16;
1943 if ((vm - vd) < (uintptr_t)oprsz) {
1944 m = memcpy(&tmp_m, vm, oprsz);
1947 for (i = 0; i < oprsz_16; i++) {
1948 l = n[2 * i + 0];
1949 h = n[2 * i + 1];
1950 l = compress_bits(l >> odd, esz);
1951 h = compress_bits(h >> odd, esz);
1952 d[i] = l + (h << 32);
1955 /* For a VL that is not a power of 2, the results from M do not
1956 align nicely with the uint64_t for D. Put the aligned results
1957 from M into TMP_M and then copy it into place afterward. */
1958 if (oprsz & 15) {
1959 d[i] = compress_bits(n[2 * i] >> odd, esz);
1961 for (i = 0; i < oprsz_16; i++) {
1962 l = m[2 * i + 0];
1963 h = m[2 * i + 1];
1964 l = compress_bits(l >> odd, esz);
1965 h = compress_bits(h >> odd, esz);
1966 tmp_m.p[i] = l + (h << 32);
1968 tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
1970 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
1971 } else {
1972 for (i = 0; i < oprsz_16; i++) {
1973 l = m[2 * i + 0];
1974 h = m[2 * i + 1];
1975 l = compress_bits(l >> odd, esz);
1976 h = compress_bits(h >> odd, esz);
1977 d[oprsz_16 + i] = l + (h << 32);
1983 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1985 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1986 uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1987 bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1988 uint64_t *d = vd, *n = vn, *m = vm;
1989 uint64_t mask;
1990 int shr, shl;
1991 intptr_t i;
1993 shl = 1 << esz;
1994 shr = 0;
1995 mask = even_bit_esz_masks[esz];
1996 if (odd) {
1997 mask <<= shl;
1998 shr = shl;
1999 shl = 0;
2002 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2003 uint64_t nn = (n[i] & mask) >> shr;
2004 uint64_t mm = (m[i] & mask) << shl;
2005 d[i] = nn + mm;
2009 /* Reverse units of 2**N bits. */
2010 static uint64_t reverse_bits_64(uint64_t x, int n)
2012 int i, sh;
2014 x = bswap64(x);
2015 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2016 uint64_t mask = even_bit_esz_masks[i];
2017 x = ((x & mask) << sh) | ((x >> sh) & mask);
2019 return x;
2022 static uint8_t reverse_bits_8(uint8_t x, int n)
2024 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2025 int i, sh;
2027 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2028 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2030 return x;
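/* E.g. reverse_bits_8(0xd2, 0) == 0x4b reverses individual bits, while
 * reverse_bits_8(0xd2, 1) reverses 2-bit units: 11 01 00 10 -> 10 00 01 11,
 * i.e. 0x87.
 */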
2033 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2035 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2036 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2037 intptr_t i, oprsz_2 = oprsz / 2;
2039 if (oprsz <= 8) {
2040 uint64_t l = *(uint64_t *)vn;
2041 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2042 *(uint64_t *)vd = l;
2043 } else if ((oprsz & 15) == 0) {
2044 for (i = 0; i < oprsz_2; i += 8) {
2045 intptr_t ih = oprsz - 8 - i;
2046 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2047 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2048 *(uint64_t *)(vd + i) = h;
2049 *(uint64_t *)(vd + ih) = l;
2051 } else {
2052 for (i = 0; i < oprsz_2; i += 1) {
2053 intptr_t il = H1(i);
2054 intptr_t ih = H1(oprsz - 1 - i);
2055 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2056 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2057 *(uint8_t *)(vd + il) = h;
2058 *(uint8_t *)(vd + ih) = l;
2063 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2065 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2066 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2067 uint64_t *d = vd;
2068 intptr_t i;
2070 if (oprsz <= 8) {
2071 uint64_t nn = *(uint64_t *)vn;
2072 int half = 4 * oprsz;
2074 nn = extract64(nn, high * half, half);
2075 nn = expand_bits(nn, 0);
2076 d[0] = nn;
2077 } else {
2078 ARMPredicateReg tmp_n;
2080 /* We produce output faster than we consume input.
2081 Therefore we must be mindful of possible overlap. */
2082 if ((vn - vd) < (uintptr_t)oprsz) {
2083 vn = memcpy(&tmp_n, vn, oprsz);
2085 if (high) {
2086 high = oprsz >> 1;
2089 if ((high & 3) == 0) {
2090 uint32_t *n = vn;
2091 high >>= 2;
2093 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2094 uint64_t nn = n[H4(high + i)];
2095 d[i] = expand_bits(nn, 0);
2097 } else {
2098 uint16_t *d16 = vd;
2099 uint8_t *n = vn;
2101 for (i = 0; i < oprsz / 2; i++) {
2102 uint16_t nn = n[H1(high + i)];
2103 d16[H2(i)] = expand_bits(nn, 0);
2109 #define DO_ZIP(NAME, TYPE, H) \
2110 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2112 intptr_t oprsz = simd_oprsz(desc); \
2113 intptr_t i, oprsz_2 = oprsz / 2; \
2114 ARMVectorReg tmp_n, tmp_m; \
2115 /* We produce output faster than we consume input. \
2116 Therefore we must be mindful of possible overlap. */ \
2117 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2118 vn = memcpy(&tmp_n, vn, oprsz_2); \
2120 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2121 vm = memcpy(&tmp_m, vm, oprsz_2); \
2123 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2124 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2125 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2129 DO_ZIP(sve_zip_b, uint8_t, H1)
2130 DO_ZIP(sve_zip_h, uint16_t, H1_2)
2131 DO_ZIP(sve_zip_s, uint32_t, H1_4)
2132 DO_ZIP(sve_zip_d, uint64_t, )
2134 #define DO_UZP(NAME, TYPE, H) \
2135 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2137 intptr_t oprsz = simd_oprsz(desc); \
2138 intptr_t oprsz_2 = oprsz / 2; \
2139 intptr_t odd_ofs = simd_data(desc); \
2140 intptr_t i; \
2141 ARMVectorReg tmp_m; \
2142 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2143 vm = memcpy(&tmp_m, vm, oprsz); \
2145 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2146 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2148 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2149 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2153 DO_UZP(sve_uzp_b, uint8_t, H1)
2154 DO_UZP(sve_uzp_h, uint16_t, H1_2)
2155 DO_UZP(sve_uzp_s, uint32_t, H1_4)
2156 DO_UZP(sve_uzp_d, uint64_t, )
2158 #define DO_TRN(NAME, TYPE, H) \
2159 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2161 intptr_t oprsz = simd_oprsz(desc); \
2162 intptr_t odd_ofs = simd_data(desc); \
2163 intptr_t i; \
2164 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2165 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2166 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2167 *(TYPE *)(vd + H(i + 0)) = ae; \
2168 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2172 DO_TRN(sve_trn_b, uint8_t, H1)
2173 DO_TRN(sve_trn_h, uint16_t, H1_2)
2174 DO_TRN(sve_trn_s, uint32_t, H1_4)
2175 DO_TRN(sve_trn_d, uint64_t, )
2177 #undef DO_ZIP
2178 #undef DO_UZP
2179 #undef DO_TRN
2181 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2183 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2184 uint32_t *d = vd, *n = vn;
2185 uint8_t *pg = vg;
2187 for (i = j = 0; i < opr_sz; i++) {
2188 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2189 d[H4(j)] = n[H4(i)];
2190 j++;
2193 for (; j < opr_sz; j++) {
2194 d[H4(j)] = 0;
2198 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2200 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2201 uint64_t *d = vd, *n = vn;
2202 uint8_t *pg = vg;
2204 for (i = j = 0; i < opr_sz; i++) {
2205 if (pg[H1(i)] & 1) {
2206 d[j] = n[i];
2207 j++;
2210 for (; j < opr_sz; j++) {
2211 d[j] = 0;
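/* E.g. for sve_compact_d with the predicate selecting elements {1, 3},
 * the result is { n[1], n[3], 0, 0, ... }: active elements are packed
 * toward element 0 and the rest of the destination is zeroed.
 */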
2215 /* Similar to the ARM LastActiveElement pseudocode function, except the
2216 * result is multiplied by the element size. This includes the not found
2217 * indication; e.g. not found for esz=3 is -8.
2219 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2221 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2222 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2224 return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
2227 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2229 intptr_t opr_sz = simd_oprsz(desc) / 8;
2230 int esz = simd_data(desc);
2231 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2232 intptr_t i, first_i, last_i;
2233 ARMVectorReg tmp;
2235 first_i = last_i = 0;
2236 first_g = last_g = 0;
2238 /* Find the extent of the active elements within VG. */
2239 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2240 pg = *(uint64_t *)(vg + i) & mask;
2241 if (pg) {
2242 if (last_g == 0) {
2243 last_g = pg;
2244 last_i = i;
2246 first_g = pg;
2247 first_i = i;
2251 len = 0;
2252 if (first_g != 0) {
2253 first_i = first_i * 8 + ctz64(first_g);
2254 last_i = last_i * 8 + 63 - clz64(last_g);
2255 len = last_i - first_i + (1 << esz);
2256 if (vd == vm) {
2257 vm = memcpy(&tmp, vm, opr_sz * 8);
2259 swap_memmove(vd, vn + first_i, len);
2261 swap_memmove(vd + len, vm, opr_sz * 8 - len);
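/* E.g. with esz == 0 and active predicate bits 3 through 6, FIRST_I is 3
 * and LEN is 4, so bytes n[3..6] are copied to d[0..3] and the remainder
 * of D is filled from m[0] onward.
 */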
2264 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2265 void *vg, uint32_t desc)
2267 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2268 uint64_t *d = vd, *n = vn, *m = vm;
2269 uint8_t *pg = vg;
2271 for (i = 0; i < opr_sz; i += 1) {
2272 uint64_t nn = n[i], mm = m[i];
2273 uint64_t pp = expand_pred_b(pg[H1(i)]);
2274 d[i] = (nn & pp) | (mm & ~pp);
2278 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2279 void *vg, uint32_t desc)
2281 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2282 uint64_t *d = vd, *n = vn, *m = vm;
2283 uint8_t *pg = vg;
2285 for (i = 0; i < opr_sz; i += 1) {
2286 uint64_t nn = n[i], mm = m[i];
2287 uint64_t pp = expand_pred_h(pg[H1(i)]);
2288 d[i] = (nn & pp) | (mm & ~pp);
2292 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2293 void *vg, uint32_t desc)
2295 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2296 uint64_t *d = vd, *n = vn, *m = vm;
2297 uint8_t *pg = vg;
2299 for (i = 0; i < opr_sz; i += 1) {
2300 uint64_t nn = n[i], mm = m[i];
2301 uint64_t pp = expand_pred_s(pg[H1(i)]);
2302 d[i] = (nn & pp) | (mm & ~pp);
2306 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2307 void *vg, uint32_t desc)
2309 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2310 uint64_t *d = vd, *n = vn, *m = vm;
2311 uint8_t *pg = vg;
2313 for (i = 0; i < opr_sz; i += 1) {
2314 uint64_t nn = n[i], mm = m[i];
2315 d[i] = (pg[H1(i)] & 1 ? nn : mm);
2319 /* Two operand comparison controlled by a predicate.
2320 * ??? It is very tempting to want to be able to expand this inline
2321 * with x86 instructions, e.g.
2323 * vcmpeqw zm, zn, %ymm0
2324 * vpmovmskb %ymm0, %eax
2325 * and $0x5555, %eax
2326 * and pg, %eax
2328 * or even aarch64, e.g.
2330 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2331 * cmeq v0.8h, zn, zm
2332 * and v0.8h, v0.8h, mask
2333 * addv h0, v0.8h
2334 * and v0.8b, pg
2336 * However, coming up with an abstraction that allows vector inputs and
2337 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2338 * scalar outputs, is tricky.
2340 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
2341 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2343 intptr_t opr_sz = simd_oprsz(desc); \
2344 uint32_t flags = PREDTEST_INIT; \
2345 intptr_t i = opr_sz; \
2346 do { \
2347 uint64_t out = 0, pg; \
2348 do { \
2349 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2350 TYPE nn = *(TYPE *)(vn + H(i)); \
2351 TYPE mm = *(TYPE *)(vm + H(i)); \
2352 out |= nn OP mm; \
2353 } while (i & 63); \
2354 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2355 out &= pg; \
2356 *(uint64_t *)(vd + (i >> 3)) = out; \
2357 flags = iter_predtest_bwd(out, pg, flags); \
2358 } while (i > 0); \
2359 return flags; \
2362 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2363 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2364 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2365 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2366 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2367 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2368 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2369 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
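/* The MASK constants restrict both the comparison results and the governing
 * predicate to the lowest bit of each element: every bit for bytes, every
 * second bit for halfwords, every fourth for words and every eighth for
 * doublewords, matching the SVE predicate layout.
 */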
2371 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
2372 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2373 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2374 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2376 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
2377 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2378 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2379 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2381 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
2382 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2383 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2384 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2386 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
2387 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2388 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2389 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2391 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
2392 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2393 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2394 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2396 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
2397 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2398 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2399 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2401 #undef DO_CMP_PPZZ_B
2402 #undef DO_CMP_PPZZ_H
2403 #undef DO_CMP_PPZZ_S
2404 #undef DO_CMP_PPZZ_D
2405 #undef DO_CMP_PPZZ
2407 /* Similar, but the second source is "wide". */
2408 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2409 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2411 intptr_t opr_sz = simd_oprsz(desc); \
2412 uint32_t flags = PREDTEST_INIT; \
2413 intptr_t i = opr_sz; \
2414 do { \
2415 uint64_t out = 0, pg; \
2416 do { \
2417 TYPEW mm = *(TYPEW *)(vm + i - 8); \
2418 do { \
2419 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2420 TYPE nn = *(TYPE *)(vn + H(i)); \
2421 out |= nn OP mm; \
2422 } while (i & 7); \
2423 } while (i & 63); \
2424 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2425 out &= pg; \
2426 *(uint64_t *)(vd + (i >> 3)) = out; \
2427 flags = iter_predtest_bwd(out, pg, flags); \
2428 } while (i > 0); \
2429 return flags; \
2432 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2433 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
2434 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2435 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2436 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2437 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2439 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
2440 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
2441 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
2443 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
2444 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
2445 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
2447 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
2448 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
2449 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
2451 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
2452 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
2453 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
2455 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
2456 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2457 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2459 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
2460 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2461 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2463 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
2464 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
2465 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
2467 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
2468 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
2469 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
2471 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
2472 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2473 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2475 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
2476 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2477 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2479 #undef DO_CMP_PPZW_B
2480 #undef DO_CMP_PPZW_H
2481 #undef DO_CMP_PPZW_S
2482 #undef DO_CMP_PPZW
2484 /* Similar, but the second source is immediate. */
2485 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
2486 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2488 intptr_t opr_sz = simd_oprsz(desc); \
2489 uint32_t flags = PREDTEST_INIT; \
2490 TYPE mm = simd_data(desc); \
2491 intptr_t i = opr_sz; \
2492 do { \
2493 uint64_t out = 0, pg; \
2494 do { \
2495 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2496 TYPE nn = *(TYPE *)(vn + H(i)); \
2497 out |= nn OP mm; \
2498 } while (i & 63); \
2499 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2500 out &= pg; \
2501 *(uint64_t *)(vd + (i >> 3)) = out; \
2502 flags = iter_predtest_bwd(out, pg, flags); \
2503 } while (i > 0); \
2504 return flags; \
2507 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2508 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2509 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2510 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2511 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2512 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2513 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2514 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
2516 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
2517 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
2518 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
2519 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
2521 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
2522 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
2523 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
2524 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
2526 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
2527 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
2528 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
2529 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
2531 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
2532 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
2533 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
2534 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
2536 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
2537 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
2538 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
2539 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
2541 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
2542 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
2543 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
2544 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
2546 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
2547 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
2548 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
2549 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
2551 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
2552 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
2553 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
2554 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
2556 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
2557 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
2558 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
2559 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
2561 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
2562 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
2563 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
2564 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
2566 #undef DO_CMP_PPZI_B
2567 #undef DO_CMP_PPZI_H
2568 #undef DO_CMP_PPZI_S
2569 #undef DO_CMP_PPZI_D
2570 #undef DO_CMP_PPZI
2572 /* Similar to the ARM LastActive pseudocode function. */
2573 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
2575 intptr_t i;
2577 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
2578 uint64_t pg = *(uint64_t *)(vg + i);
2579 if (pg) {
2580 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
2583 return 0;
2586 /* Compute a mask into RETB that is true for all G, up to and including
2587 * (if after) or excluding (if !after) the first G & N.
2588 * Return true if BRK found.
2590 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
2591 bool brk, bool after)
2593 uint64_t b;
2595 if (brk) {
2596 b = 0;
2597 } else if ((g & n) == 0) {
2598 /* For all G, no N are set; break not found. */
2599 b = g;
2600 } else {
2601 /* Break somewhere in N. Locate it. */
2602 b = g & n; /* guard true, pred true */
2603 b = b & -b; /* first such */
2604 if (after) {
2605 b = b | (b - 1); /* break after same */
2606 } else {
2607 b = b - 1; /* break before same */
2609 brk = true;
2612 *retb = b;
2613 return brk;
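/* E.g. with g == 0xff and n == 0x10, the first active N is at bit 4:
 * after == true yields b == 0x1f (the breaking element stays active),
 * while after == false yields b == 0x0f (break before it).  With BRK
 * already set on entry the result is all-zero regardless of N.
 */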
2616 /* Compute a zeroing BRK. */
2617 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
2618 intptr_t oprsz, bool after)
2620 bool brk = false;
2621 intptr_t i;
2623 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2624 uint64_t this_b, this_g = g[i];
2626 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2627 d[i] = this_b & this_g;
2631 /* Likewise, but also compute flags. */
2632 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2633 intptr_t oprsz, bool after)
2635 uint32_t flags = PREDTEST_INIT;
2636 bool brk = false;
2637 intptr_t i;
2639 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2640 uint64_t this_b, this_d, this_g = g[i];
2642 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2643 d[i] = this_d = this_b & this_g;
2644 flags = iter_predtest_fwd(this_d, this_g, flags);
2646 return flags;
2649 /* Compute a merging BRK. */
2650 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
2651 intptr_t oprsz, bool after)
2653 bool brk = false;
2654 intptr_t i;
2656 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2657 uint64_t this_b, this_g = g[i];
2659 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2660 d[i] = (this_b & this_g) | (d[i] & ~this_g);
2664 /* Likewise, but also compute flags. */
2665 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2666 intptr_t oprsz, bool after)
2668 uint32_t flags = PREDTEST_INIT;
2669 bool brk = false;
2670 intptr_t i;
2672 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2673 uint64_t this_b, this_d = d[i], this_g = g[i];
2675 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2676 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2677 flags = iter_predtest_fwd(this_d, this_g, flags);
2679 return flags;
2682 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2684 /* It is quicker to zero the whole predicate than loop on OPRSZ.
2685 * The compiler should turn this into 4 64-bit integer stores.
2687 memset(d, 0, sizeof(ARMPredicateReg));
2688 return PREDTEST_INIT;
2691 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2692 uint32_t pred_desc)
2694 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2695 if (last_active_pred(vn, vg, oprsz)) {
2696 compute_brk_z(vd, vm, vg, oprsz, true);
2697 } else {
2698 do_zero(vd, oprsz);
2702 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2703 uint32_t pred_desc)
2705 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2706 if (last_active_pred(vn, vg, oprsz)) {
2707 return compute_brks_z(vd, vm, vg, oprsz, true);
2708 } else {
2709 return do_zero(vd, oprsz);
2713 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2714 uint32_t pred_desc)
2716 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2717 if (last_active_pred(vn, vg, oprsz)) {
2718 compute_brk_z(vd, vm, vg, oprsz, false);
2719 } else {
2720 do_zero(vd, oprsz);
2724 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2725 uint32_t pred_desc)
2727 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2728 if (last_active_pred(vn, vg, oprsz)) {
2729 return compute_brks_z(vd, vm, vg, oprsz, false);
2730 } else {
2731 return do_zero(vd, oprsz);
2735 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2737 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2738 compute_brk_z(vd, vn, vg, oprsz, true);
2741 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2743 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2744 return compute_brks_z(vd, vn, vg, oprsz, true);
2747 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2749 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2750 compute_brk_z(vd, vn, vg, oprsz, false);
2753 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2755 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2756 return compute_brks_z(vd, vn, vg, oprsz, false);
2759 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2761 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2762 compute_brk_m(vd, vn, vg, oprsz, true);
2765 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2767 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2768 return compute_brks_m(vd, vn, vg, oprsz, true);
2771 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2773 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2774 compute_brk_m(vd, vn, vg, oprsz, false);
2777 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2779 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2780 return compute_brks_m(vd, vn, vg, oprsz, false);
2783 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2785 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2787 if (!last_active_pred(vn, vg, oprsz)) {
2788 do_zero(vd, oprsz);
2792 /* As if PredTest(Ones(PL), D, esz). */
2793 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2794 uint64_t esz_mask)
2796 uint32_t flags = PREDTEST_INIT;
2797 intptr_t i;
2799 for (i = 0; i < oprsz / 8; i++) {
2800 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2802 if (oprsz & 7) {
2803 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2804 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2806 return flags;
2809 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2811 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2813 if (last_active_pred(vn, vg, oprsz)) {
2814 return predtest_ones(vd, oprsz, -1);
2815 } else {
2816 return do_zero(vd, oprsz);
2820 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2822 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2823 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2824 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2825 intptr_t i;
2827 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2828 uint64_t t = n[i] & g[i] & mask;
2829 sum += ctpop64(t);
2831 return sum;
2834 uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2836 uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2837 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2838 uint64_t esz_mask = pred_esz_masks[esz];
2839 ARMPredicateReg *d = vd;
2840 uint32_t flags;
2841 intptr_t i;
2843 /* Begin with a zero predicate register. */
2844 flags = do_zero(d, oprsz);
2845 if (count == 0) {
2846 return flags;
2849 /* Set all of the requested bits. */
2850 for (i = 0; i < count / 64; ++i) {
2851 d->p[i] = esz_mask;
2853 if (count & 63) {
2854 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2857 return predtest_ones(d, oprsz, esz_mask);
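/* E.g. for esz == 1 (halfwords) and count == 10 predicate bits, i.e. five
 * active elements, d->p[0] == MAKE_64BIT_MASK(0, 10) & 0x5555... == 0x155,
 * setting the low bit of each of the first five halfword element slots.
 * COUNT is expected to be the element count already scaled to predicate
 * bits by the caller.
 */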
2860 /* Recursive reduction on a function;
2861 * C.f. the ARM ARM function ReducePredicated.
2863 * While it would be possible to write this without the DATA temporary,
2864 * it is much simpler to process the predicate register this way.
2865 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
2866 * little to gain with a more complex non-recursive form.
2868 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
2869 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
2871 if (n == 1) { \
2872 return *data; \
2873 } else { \
2874 uintptr_t half = n / 2; \
2875 TYPE lo = NAME##_reduce(data, status, half); \
2876 TYPE hi = NAME##_reduce(data + half, status, half); \
2877 return TYPE##_##FUNC(lo, hi, status); \
2880 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
2882 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc); \
2883 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
2884 for (i = 0; i < oprsz; ) { \
2885 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2886 do { \
2887 TYPE nn = *(TYPE *)(vn + H(i)); \
2888 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
2889 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2890 } while (i & 15); \
2892 for (; i < maxsz; i += sizeof(TYPE)) { \
2893 *(TYPE *)((void *)data + i) = IDENT; \
2895 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
2898 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
2899 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
2900 DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
2902 /* Identity is floatN_default_nan, without the function call. */
2903 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
2904 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
2905 DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
2907 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
2908 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
2909 DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
2911 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
2912 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
2913 DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
2915 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
2916 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
2917 DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
2919 #undef DO_REDUCE
2921 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
2922 void *status, uint32_t desc)
2924 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2925 float16 result = nn;
2927 do {
2928 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2929 do {
2930 if (pg & 1) {
2931 float16 mm = *(float16 *)(vm + H1_2(i));
2932 result = float16_add(result, mm, status);
2934 i += sizeof(float16), pg >>= sizeof(float16);
2935 } while (i & 15);
2936 } while (i < opr_sz);
2938 return result;
2941 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
2942 void *status, uint32_t desc)
2944 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2945 float32 result = nn;
2947 do {
2948 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2949 do {
2950 if (pg & 1) {
2951 float32 mm = *(float32 *)(vm + H1_2(i));
2952 result = float32_add(result, mm, status);
2954 i += sizeof(float32), pg >>= sizeof(float32);
2955 } while (i & 15);
2956 } while (i < opr_sz);
2958 return result;
2961 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
2962 void *status, uint32_t desc)
2964 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
2965 uint64_t *m = vm;
2966 uint8_t *pg = vg;
2968 for (i = 0; i < opr_sz; i++) {
2969 if (pg[H1(i)] & 1) {
2970 nn = float64_add(nn, m[i], status);
2974 return nn;
2977 /* Fully general three-operand expander, controlled by a predicate,
2978 * with the extra float_status parameter.
2980 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
2981 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
2982 void *status, uint32_t desc) \
2984 intptr_t i = simd_oprsz(desc); \
2985 uint64_t *g = vg; \
2986 do { \
2987 uint64_t pg = g[(i - 1) >> 6]; \
2988 do { \
2989 i -= sizeof(TYPE); \
2990 if (likely((pg >> (i & 63)) & 1)) { \
2991 TYPE nn = *(TYPE *)(vn + H(i)); \
2992 TYPE mm = *(TYPE *)(vm + H(i)); \
2993 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
2995 } while (i & 63); \
2996 } while (i != 0); \
2999 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3000 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3001 DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
3003 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3004 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3005 DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
3007 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3008 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3009 DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
3011 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3012 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3013 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
3015 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3016 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3017 DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
3019 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3020 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3021 DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
3023 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3024 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3025 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
3027 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3028 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3029 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3031 static inline float16 abd_h(float16 a, float16 b, float_status *s)
3033 return float16_abs(float16_sub(a, b, s));
3036 static inline float32 abd_s(float32 a, float32 b, float_status *s)
3038 return float32_abs(float32_sub(a, b, s));
3041 static inline float64 abd_d(float64 a, float64 b, float_status *s)
3043 return float64_abs(float64_sub(a, b, s));
3046 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
3047 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
3048 DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
3050 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3052 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3053 return float64_scalbn(a, b_int, s);
3056 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
3057 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
3058 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
3060 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
3061 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
3062 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
3064 #undef DO_ZPZZ_FP
3066 /* Three-operand expander, with one scalar operand, controlled by
3067 * a predicate, with the extra float_status parameter.
3069 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3070 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
3071 void *status, uint32_t desc) \
3073 intptr_t i = simd_oprsz(desc); \
3074 uint64_t *g = vg; \
3075 TYPE mm = scalar; \
3076 do { \
3077 uint64_t pg = g[(i - 1) >> 6]; \
3078 do { \
3079 i -= sizeof(TYPE); \
3080 if (likely((pg >> (i & 63)) & 1)) { \
3081 TYPE nn = *(TYPE *)(vn + H(i)); \
3082 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3084 } while (i & 63); \
3085 } while (i != 0); \
3088 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
3089 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
3090 DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
3092 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
3093 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
3094 DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
3096 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
3097 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
3098 DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
3100 static inline float16 subr_h(float16 a, float16 b, float_status *s)
3102 return float16_sub(b, a, s);
3105 static inline float32 subr_s(float32 a, float32 b, float_status *s)
3107 return float32_sub(b, a, s);
3110 static inline float64 subr_d(float64 a, float64 b, float_status *s)
3112 return float64_sub(b, a, s);
3115 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
3116 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
3117 DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
3119 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
3120 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
3121 DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
3123 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
3124 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
3125 DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
3127 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
3128 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
3129 DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
3131 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
3132 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
3133 DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
3135 /* Fully general two-operand expander, controlled by a predicate,
3136 * with the extra float_status parameter.
3138 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
3139 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3141 intptr_t i = simd_oprsz(desc); \
3142 uint64_t *g = vg; \
3143 do { \
3144 uint64_t pg = g[(i - 1) >> 6]; \
3145 do { \
3146 i -= sizeof(TYPE); \
3147 if (likely((pg >> (i & 63)) & 1)) { \
3148 TYPE nn = *(TYPE *)(vn + H(i)); \
3149 *(TYPE *)(vd + H(i)) = OP(nn, status); \
3151 } while (i & 63); \
3152 } while (i != 0); \
3155 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
3156 * FZ16. When converting from fp16, this affects flushing input denormals;
3157 * when converting to fp16, this affects flushing output denormals.
3159 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
3161 flag save = get_flush_inputs_to_zero(fpst);
3162 float32 ret;
3164 set_flush_inputs_to_zero(false, fpst);
3165 ret = float16_to_float32(f, true, fpst);
3166 set_flush_inputs_to_zero(save, fpst);
3167 return ret;
3170 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
3172 flag save = get_flush_inputs_to_zero(fpst);
3173 float64 ret;
3175 set_flush_inputs_to_zero(false, fpst);
3176 ret = float16_to_float64(f, true, fpst);
3177 set_flush_inputs_to_zero(save, fpst);
3178 return ret;
3181 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
3183 flag save = get_flush_to_zero(fpst);
3184 float16 ret;
3186 set_flush_to_zero(false, fpst);
3187 ret = float32_to_float16(f, true, fpst);
3188 set_flush_to_zero(save, fpst);
3189 return ret;
3192 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
3194 flag save = get_flush_to_zero(fpst);
3195 float16 ret;
3197 set_flush_to_zero(false, fpst);
3198 ret = float64_to_float16(f, true, fpst);
3199 set_flush_to_zero(save, fpst);
3200 return ret;
3203 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3205 if (float16_is_any_nan(f)) {
3206 float_raise(float_flag_invalid, s);
3207 return 0;
3209 return float16_to_int16_round_to_zero(f, s);
3212 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3214 if (float16_is_any_nan(f)) {
3215 float_raise(float_flag_invalid, s);
3216 return 0;
3218 return float16_to_int64_round_to_zero(f, s);
3221 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3223 if (float32_is_any_nan(f)) {
3224 float_raise(float_flag_invalid, s);
3225 return 0;
3227 return float32_to_int64_round_to_zero(f, s);
3230 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3232 if (float64_is_any_nan(f)) {
3233 float_raise(float_flag_invalid, s);
3234 return 0;
3236 return float64_to_int64_round_to_zero(f, s);
3239 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3241 if (float16_is_any_nan(f)) {
3242 float_raise(float_flag_invalid, s);
3243 return 0;
3245 return float16_to_uint16_round_to_zero(f, s);
3248 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3250 if (float16_is_any_nan(f)) {
3251 float_raise(float_flag_invalid, s);
3252 return 0;
3254 return float16_to_uint64_round_to_zero(f, s);
3257 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3259 if (float32_is_any_nan(f)) {
3260 float_raise(float_flag_invalid, s);
3261 return 0;
3263 return float32_to_uint64_round_to_zero(f, s);
3266 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3268 if (float64_is_any_nan(f)) {
3269 float_raise(float_flag_invalid, s);
3270 return 0;
3272 return float64_to_uint64_round_to_zero(f, s);
3275 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
3276 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
3277 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
3278 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
3279 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
3280 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
3282 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
3283 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
3284 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
3285 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
3286 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
3287 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
3288 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
3290 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
3291 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
3292 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
3293 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
3294 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
3295 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
3296 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
3298 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
3299 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
3300 DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
3302 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
3303 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
3304 DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
3306 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
3307 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
3308 DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
3310 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
3311 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
3312 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
3314 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
3315 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
3316 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
3317 DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
3318 DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
3319 DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
3320 DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
3322 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
3323 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
3324 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
3325 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
3326 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
3327 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
3328 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
3330 #undef DO_ZPZ_FP
3332 /* 4-operand predicated multiply-add. This requires 7 operands to pass
3333 * "properly", so we need to encode some of the registers into DESC.
3335 QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 20 > 32);
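/* The helpers below pull the four register numbers out of DESC: Zd in the
 * 5-bit field at SIMD_DATA_SHIFT, followed by Zn, Zm and Za in the next
 * three 5-bit fields, which is why SIMD_DATA_SHIFT + 20 must still fit in
 * the 32-bit descriptor.
 */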
3337 static void do_fmla_zpzzz_h(CPUARMState *env, void *vg, uint32_t desc,
3338 uint16_t neg1, uint16_t neg3)
3340 intptr_t i = simd_oprsz(desc);
3341 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3342 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3343 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3344 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3345 void *vd = &env->vfp.zregs[rd];
3346 void *vn = &env->vfp.zregs[rn];
3347 void *vm = &env->vfp.zregs[rm];
3348 void *va = &env->vfp.zregs[ra];
3349 uint64_t *g = vg;
3351 do {
3352 uint64_t pg = g[(i - 1) >> 6];
3353 do {
3354 i -= 2;
3355 if (likely((pg >> (i & 63)) & 1)) {
3356 float16 e1, e2, e3, r;
3358 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3359 e2 = *(uint16_t *)(vm + H1_2(i));
3360 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
3361 r = float16_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3362 *(uint16_t *)(vd + H1_2(i)) = r;
3364 } while (i & 63);
3365 } while (i != 0);
3368 void HELPER(sve_fmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3370 do_fmla_zpzzz_h(env, vg, desc, 0, 0);
3373 void HELPER(sve_fmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3375 do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0);
3378 void HELPER(sve_fnmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3380 do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0x8000);
3383 void HELPER(sve_fnmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3385 do_fmla_zpzzz_h(env, vg, desc, 0, 0x8000);
3388 static void do_fmla_zpzzz_s(CPUARMState *env, void *vg, uint32_t desc,
3389 uint32_t neg1, uint32_t neg3)
3391 intptr_t i = simd_oprsz(desc);
3392 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3393 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3394 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3395 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3396 void *vd = &env->vfp.zregs[rd];
3397 void *vn = &env->vfp.zregs[rn];
3398 void *vm = &env->vfp.zregs[rm];
3399 void *va = &env->vfp.zregs[ra];
3400 uint64_t *g = vg;
3402 do {
3403 uint64_t pg = g[(i - 1) >> 6];
3404 do {
3405 i -= 4;
3406 if (likely((pg >> (i & 63)) & 1)) {
3407 float32 e1, e2, e3, r;
3409 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3410 e2 = *(uint32_t *)(vm + H1_4(i));
3411 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
3412 r = float32_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3413 *(uint32_t *)(vd + H1_4(i)) = r;
3415 } while (i & 63);
3416 } while (i != 0);
3419 void HELPER(sve_fmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3421 do_fmla_zpzzz_s(env, vg, desc, 0, 0);
3424 void HELPER(sve_fmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3426 do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0);
3429 void HELPER(sve_fnmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3431 do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0x80000000);
3434 void HELPER(sve_fnmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3436 do_fmla_zpzzz_s(env, vg, desc, 0, 0x80000000);
3439 static void do_fmla_zpzzz_d(CPUARMState *env, void *vg, uint32_t desc,
3440 uint64_t neg1, uint64_t neg3)
3442 intptr_t i = simd_oprsz(desc);
3443 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3444 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3445 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3446 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3447 void *vd = &env->vfp.zregs[rd];
3448 void *vn = &env->vfp.zregs[rn];
3449 void *vm = &env->vfp.zregs[rm];
3450 void *va = &env->vfp.zregs[ra];
3451 uint64_t *g = vg;
3453 do {
3454 uint64_t pg = g[(i - 1) >> 6];
3455 do {
3456 i -= 8;
3457 if (likely((pg >> (i & 63)) & 1)) {
3458 float64 e1, e2, e3, r;
3460 e1 = *(uint64_t *)(vn + i) ^ neg1;
3461 e2 = *(uint64_t *)(vm + i);
3462 e3 = *(uint64_t *)(va + i) ^ neg3;
3463 r = float64_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3464 *(uint64_t *)(vd + i) = r;
3466 } while (i & 63);
3467 } while (i != 0);
3470 void HELPER(sve_fmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3472 do_fmla_zpzzz_d(env, vg, desc, 0, 0);
3475 void HELPER(sve_fmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3477 do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, 0);
3480 void HELPER(sve_fnmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3482 do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, INT64_MIN);
3485 void HELPER(sve_fnmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3487 do_fmla_zpzzz_d(env, vg, desc, 0, INT64_MIN);
3490 /* Two operand floating-point comparison controlled by a predicate.
3491 * Unlike the integer version, we are not allowed to optimistically
3492 * compare operands, since the comparison may have side effects on
3493 * the FPSR.
3495 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
3496 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3497 void *status, uint32_t desc) \
3499 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3500 uint64_t *d = vd, *g = vg; \
3501 do { \
3502 uint64_t out = 0, pg = g[j]; \
3503 do { \
3504 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3505 if (likely((pg >> (i & 63)) & 1)) { \
3506 TYPE nn = *(TYPE *)(vn + H(i)); \
3507 TYPE mm = *(TYPE *)(vm + H(i)); \
3508 out |= OP(TYPE, nn, mm, status); \
3510 } while (i & 63); \
3511 d[j--] = out; \
3512 } while (i > 0); \
3515 #define DO_FPCMP_PPZZ_H(NAME, OP) \
3516 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
3517 #define DO_FPCMP_PPZZ_S(NAME, OP) \
3518 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
3519 #define DO_FPCMP_PPZZ_D(NAME, OP) \
3520 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
3522 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
3523 DO_FPCMP_PPZZ_H(NAME, OP) \
3524 DO_FPCMP_PPZZ_S(NAME, OP) \
3525 DO_FPCMP_PPZZ_D(NAME, OP)
3527 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
3528 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
3529 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
3530 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
3531 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
3532 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
3533 #define DO_FCMUO(TYPE, X, Y, ST) \
3534 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
3535 #define DO_FACGE(TYPE, X, Y, ST) \
3536 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
3537 #define DO_FACGT(TYPE, X, Y, ST) \
3538 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
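/* Note that GE, GT, LE, LT, ACGE and ACGT use the signaling compare, which
 * raises Invalid Operation for any NaN input, while EQ, NE and UO use the
 * quiet compare, which signals only for signaling NaNs.
 */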
3540 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
3541 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
3542 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
3543 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
3544 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
3545 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
3546 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
3548 #undef DO_FPCMP_PPZZ_ALL
3549 #undef DO_FPCMP_PPZZ_D
3550 #undef DO_FPCMP_PPZZ_S
3551 #undef DO_FPCMP_PPZZ_H
3552 #undef DO_FPCMP_PPZZ
3554 /* One operand floating-point comparison against zero, controlled
3555 * by a predicate.
3557 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
3558 void HELPER(NAME)(void *vd, void *vn, void *vg, \
3559 void *status, uint32_t desc) \
3561 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3562 uint64_t *d = vd, *g = vg; \
3563 do { \
3564 uint64_t out = 0, pg = g[j]; \
3565 do { \
3566 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3567 if ((pg >> (i & 63)) & 1) { \
3568 TYPE nn = *(TYPE *)(vn + H(i)); \
3569 out |= OP(TYPE, nn, 0, status); \
3571 } while (i & 63); \
3572 d[j--] = out; \
3573 } while (i > 0); \
3576 #define DO_FPCMP_PPZ0_H(NAME, OP) \
3577 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
3578 #define DO_FPCMP_PPZ0_S(NAME, OP) \
3579 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
3580 #define DO_FPCMP_PPZ0_D(NAME, OP) \
3581 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
3583 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
3584 DO_FPCMP_PPZ0_H(NAME, OP) \
3585 DO_FPCMP_PPZ0_S(NAME, OP) \
3586 DO_FPCMP_PPZ0_D(NAME, OP)
3588 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
3589 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
3590 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
3591 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
3592 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
3593 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3595 /* FP Trig Multiply-Add. */
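/* The coefficient tables below hold the FTMAD polynomial constants: entries
 * 0-7 are the sine-series coefficients and entries 8-15 the cosine-series
 * coefficients, with 8 added to the index when the multiplicand is negative.
 */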
3597 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3599 static const float16 coeff[16] = {
3600 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3601 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3603 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
3604 intptr_t x = simd_data(desc);
3605 float16 *d = vd, *n = vn, *m = vm;
3606 for (i = 0; i < opr_sz; i++) {
3607 float16 mm = m[i];
3608 intptr_t xx = x;
3609 if (float16_is_neg(mm)) {
3610 mm = float16_abs(mm);
3611 xx += 8;
3613 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
3617 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3619 static const float32 coeff[16] = {
3620 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3621 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3622 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3623 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3625 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
3626 intptr_t x = simd_data(desc);
3627 float32 *d = vd, *n = vn, *m = vm;
3628 for (i = 0; i < opr_sz; i++) {
3629 float32 mm = m[i];
3630 intptr_t xx = x;
3631 if (float32_is_neg(mm)) {
3632 mm = float32_abs(mm);
3633 xx += 8;
3635 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
3639 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3641 static const float64 coeff[16] = {
3642 0x3ff0000000000000ull, 0xbfc5555555555543ull,
3643 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
3644 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
3645 0x3de5d8408868552full, 0x0000000000000000ull,
3646 0x3ff0000000000000ull, 0xbfe0000000000000ull,
3647 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
3648 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
3649 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
3651 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
3652 intptr_t x = simd_data(desc);
3653 float64 *d = vd, *n = vn, *m = vm;
3654 for (i = 0; i < opr_sz; i++) {
3655 float64 mm = m[i];
3656 intptr_t xx = x;
3657 if (float64_is_neg(mm)) {
3658 mm = float64_abs(mm);
3659 xx += 8;
3661 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
3666 * FP Complex Add
3669 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
3670 void *vs, uint32_t desc)
3672 intptr_t j, i = simd_oprsz(desc);
3673 uint64_t *g = vg;
3674 float16 neg_imag = float16_set_sign(0, simd_data(desc));
3675 float16 neg_real = float16_chs(neg_imag);
3677 do {
3678 uint64_t pg = g[(i - 1) >> 6];
3679 do {
3680 float16 e0, e1, e2, e3;
3682 /* I holds the real index; J holds the imag index. */
3683 j = i - sizeof(float16);
3684 i -= 2 * sizeof(float16);
3686 e0 = *(float16 *)(vn + H1_2(i));
3687 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
3688 e2 = *(float16 *)(vn + H1_2(j));
3689 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
3691 if (likely((pg >> (i & 63)) & 1)) {
3692 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
3694 if (likely((pg >> (j & 63)) & 1)) {
3695 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
3697 } while (i & 63);
3698 } while (i != 0);
3701 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
3702 void *vs, uint32_t desc)
3704 intptr_t j, i = simd_oprsz(desc);
3705 uint64_t *g = vg;
3706 float32 neg_imag = float32_set_sign(0, simd_data(desc));
3707 float32 neg_real = float32_chs(neg_imag);
3709 do {
3710 uint64_t pg = g[(i - 1) >> 6];
3711 do {
3712 float32 e0, e1, e2, e3;
3714 /* I holds the real index; J holds the imag index. */
3715 j = i - sizeof(float32);
3716 i -= 2 * sizeof(float32);
3718 e0 = *(float32 *)(vn + H1_2(i));
3719 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
3720 e2 = *(float32 *)(vn + H1_2(j));
3721 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
3723 if (likely((pg >> (i & 63)) & 1)) {
3724 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
3726 if (likely((pg >> (j & 63)) & 1)) {
3727 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
3729 } while (i & 63);
3730 } while (i != 0);
3733 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
3734 void *vs, uint32_t desc)
3736 intptr_t j, i = simd_oprsz(desc);
3737 uint64_t *g = vg;
3738 float64 neg_imag = float64_set_sign(0, simd_data(desc));
3739 float64 neg_real = float64_chs(neg_imag);
3741 do {
3742 uint64_t pg = g[(i - 1) >> 6];
3743 do {
3744 float64 e0, e1, e2, e3;
3746 /* I holds the real index; J holds the imag index. */
3747 j = i - sizeof(float64);
3748 i -= 2 * sizeof(float64);
3750 e0 = *(float64 *)(vn + H1_2(i));
3751 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
3752 e2 = *(float64 *)(vn + H1_2(j));
3753 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
3755 if (likely((pg >> (i & 63)) & 1)) {
3756 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
3758 if (likely((pg >> (j & 63)) & 1)) {
3759 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
3761 } while (i & 63);
3762 } while (i != 0);
3765 /*
3766 * FP Complex Multiply
3767 */
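/* A worked expansion of the 2-bit rot field used by the fcmla helpers
 * below, derived from flip/neg_real/neg_imag as computed in the code
 * (illustrative only).  Each (real, imag) pair accumulates into the
 * addend register:
 *
 *   rot 0:  d.real += n.real * m.real;   d.imag += n.real * m.imag;
 *   rot 1:  d.real -= n.imag * m.imag;   d.imag += n.imag * m.real;
 *   rot 2:  d.real -= n.real * m.real;   d.imag -= n.real * m.imag;
 *   rot 3:  d.real += n.imag * m.imag;   d.imag -= n.imag * m.real;
 *
 * The "negation" is a sign-bit XOR on one multiplier, so issuing rot 0
 * then rot 1 (or rot 2 then rot 3) yields a full complex
 * multiply-accumulate (or its negation).
 */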
3769 QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 22 > 32);
3771 void HELPER(sve_fcmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3773 intptr_t j, i = simd_oprsz(desc);
3774 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3775 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3776 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3777 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3778 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3779 bool flip = rot & 1;
3780 float16 neg_imag, neg_real;
3781 void *vd = &env->vfp.zregs[rd];
3782 void *vn = &env->vfp.zregs[rn];
3783 void *vm = &env->vfp.zregs[rm];
3784 void *va = &env->vfp.zregs[ra];
3785 uint64_t *g = vg;
3787 neg_imag = float16_set_sign(0, (rot & 2) != 0);
3788 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
3790 do {
3791 uint64_t pg = g[(i - 1) >> 6];
3792 do {
3793 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
3795 /* I holds the real index; J holds the imag index. */
3796 j = i - sizeof(float16);
3797 i -= 2 * sizeof(float16);
3799 nr = *(float16 *)(vn + H1_2(i));
3800 ni = *(float16 *)(vn + H1_2(j));
3801 mr = *(float16 *)(vm + H1_2(i));
3802 mi = *(float16 *)(vm + H1_2(j));
3804 e2 = (flip ? ni : nr);
3805 e1 = (flip ? mi : mr) ^ neg_real;
3806 e4 = e2;
3807 e3 = (flip ? mr : mi) ^ neg_imag;
3809 if (likely((pg >> (i & 63)) & 1)) {
3810 d = *(float16 *)(va + H1_2(i));
3811 d = float16_muladd(e2, e1, d, 0, &env->vfp.fp_status_f16);
3812 *(float16 *)(vd + H1_2(i)) = d;
3814 if (likely((pg >> (j & 63)) & 1)) {
3815 d = *(float16 *)(va + H1_2(j));
3816 d = float16_muladd(e4, e3, d, 0, &env->vfp.fp_status_f16);
3817 *(float16 *)(vd + H1_2(j)) = d;
3819 } while (i & 63);
3820 } while (i != 0);
3823 void HELPER(sve_fcmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3825 intptr_t j, i = simd_oprsz(desc);
3826 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3827 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3828 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3829 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3830 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3831 bool flip = rot & 1;
3832 float32 neg_imag, neg_real;
3833 void *vd = &env->vfp.zregs[rd];
3834 void *vn = &env->vfp.zregs[rn];
3835 void *vm = &env->vfp.zregs[rm];
3836 void *va = &env->vfp.zregs[ra];
3837 uint64_t *g = vg;
3839 neg_imag = float32_set_sign(0, (rot & 2) != 0);
3840 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
3842 do {
3843 uint64_t pg = g[(i - 1) >> 6];
3844 do {
3845 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
3847 /* I holds the real index; J holds the imag index. */
3848 j = i - sizeof(float32);
3849 i -= 2 * sizeof(float32);
3851 nr = *(float32 *)(vn + H1_2(i));
3852 ni = *(float32 *)(vn + H1_2(j));
3853 mr = *(float32 *)(vm + H1_2(i));
3854 mi = *(float32 *)(vm + H1_2(j));
3856 e2 = (flip ? ni : nr);
3857 e1 = (flip ? mi : mr) ^ neg_real;
3858 e4 = e2;
3859 e3 = (flip ? mr : mi) ^ neg_imag;
3861 if (likely((pg >> (i & 63)) & 1)) {
3862 d = *(float32 *)(va + H1_2(i));
3863 d = float32_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3864 *(float32 *)(vd + H1_2(i)) = d;
3866 if (likely((pg >> (j & 63)) & 1)) {
3867 d = *(float32 *)(va + H1_2(j));
3868 d = float32_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3869 *(float32 *)(vd + H1_2(j)) = d;
3871 } while (i & 63);
3872 } while (i != 0);
3875 void HELPER(sve_fcmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3877 intptr_t j, i = simd_oprsz(desc);
3878 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3879 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3880 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3881 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3882 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3883 bool flip = rot & 1;
3884 float64 neg_imag, neg_real;
3885 void *vd = &env->vfp.zregs[rd];
3886 void *vn = &env->vfp.zregs[rn];
3887 void *vm = &env->vfp.zregs[rm];
3888 void *va = &env->vfp.zregs[ra];
3889 uint64_t *g = vg;
3891 neg_imag = float64_set_sign(0, (rot & 2) != 0);
3892 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
3894 do {
3895 uint64_t pg = g[(i - 1) >> 6];
3896 do {
3897 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
3899 /* I holds the real index; J holds the imag index. */
3900 j = i - sizeof(float64);
3901 i -= 2 * sizeof(float64);
3903 nr = *(float64 *)(vn + H1_2(i));
3904 ni = *(float64 *)(vn + H1_2(j));
3905 mr = *(float64 *)(vm + H1_2(i));
3906 mi = *(float64 *)(vm + H1_2(j));
3908 e2 = (flip ? ni : nr);
3909 e1 = (flip ? mi : mr) ^ neg_real;
3910 e4 = e2;
3911 e3 = (flip ? mr : mi) ^ neg_imag;
3913 if (likely((pg >> (i & 63)) & 1)) {
3914 d = *(float64 *)(va + H1_2(i));
3915 d = float64_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3916 *(float64 *)(vd + H1_2(i)) = d;
3918 if (likely((pg >> (j & 63)) & 1)) {
3919 d = *(float64 *)(va + H1_2(j));
3920 d = float64_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3921 *(float64 *)(vd + H1_2(j)) = d;
3923 } while (i & 63);
3924 } while (i != 0);
3927 /*
3928 * Load contiguous data, protected by a governing predicate.
3929 */
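/* A sketch of the predicate walk used by the contiguous load/store
 * macros below (illustrative only).  The governing predicate has one
 * bit per vector byte, and an element is controlled by the bit of its
 * least significant byte, hence the  i += sizeof(TYPEE), pg >>=
 * sizeof(TYPEE)  stride.  For 32-bit elements, for example, only bits
 * 0, 4, 8 and 12 of each 16-bit predicate chunk are consulted:
 *
 *   for (i = 0; i < oprsz; ) {
 *       uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
 *       do {
 *           if (pg & 1) {
 *               ... element at byte offset i is active ...
 *           }
 *           i += 4, pg >>= 4;
 *       } while (i & 15);
 *   }
 */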
3930 #define DO_LD1(NAME, FN, TYPEE, TYPEM, H) \
3931 static void do_##NAME(CPUARMState *env, void *vd, void *vg, \
3932 target_ulong addr, intptr_t oprsz, \
3933 uintptr_t ra) \
3935 intptr_t i = 0; \
3936 do { \
3937 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3938 do { \
3939 TYPEM m = 0; \
3940 if (pg & 1) { \
3941 m = FN(env, addr, ra); \
3943 *(TYPEE *)(vd + H(i)) = m; \
3944 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
3945 addr += sizeof(TYPEM); \
3946 } while (i & 15); \
3947 } while (i < oprsz); \
3949 void HELPER(NAME)(CPUARMState *env, void *vg, \
3950 target_ulong addr, uint32_t desc) \
3952 do_##NAME(env, &env->vfp.zregs[simd_data(desc)], vg, \
3953 addr, simd_oprsz(desc), GETPC()); \
3956 #define DO_LD2(NAME, FN, TYPEE, TYPEM, H) \
3957 void HELPER(NAME)(CPUARMState *env, void *vg, \
3958 target_ulong addr, uint32_t desc) \
3960 intptr_t i, oprsz = simd_oprsz(desc); \
3961 intptr_t ra = GETPC(); \
3962 unsigned rd = simd_data(desc); \
3963 void *d1 = &env->vfp.zregs[rd]; \
3964 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
3965 for (i = 0; i < oprsz; ) { \
3966 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3967 do { \
3968 TYPEM m1 = 0, m2 = 0; \
3969 if (pg & 1) { \
3970 m1 = FN(env, addr, ra); \
3971 m2 = FN(env, addr + sizeof(TYPEM), ra); \
3973 *(TYPEE *)(d1 + H(i)) = m1; \
3974 *(TYPEE *)(d2 + H(i)) = m2; \
3975 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
3976 addr += 2 * sizeof(TYPEM); \
3977 } while (i & 15); \
3981 #define DO_LD3(NAME, FN, TYPEE, TYPEM, H) \
3982 void HELPER(NAME)(CPUARMState *env, void *vg, \
3983 target_ulong addr, uint32_t desc) \
3985 intptr_t i, oprsz = simd_oprsz(desc); \
3986 intptr_t ra = GETPC(); \
3987 unsigned rd = simd_data(desc); \
3988 void *d1 = &env->vfp.zregs[rd]; \
3989 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
3990 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
3991 for (i = 0; i < oprsz; ) { \
3992 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3993 do { \
3994 TYPEM m1 = 0, m2 = 0, m3 = 0; \
3995 if (pg & 1) { \
3996 m1 = FN(env, addr, ra); \
3997 m2 = FN(env, addr + sizeof(TYPEM), ra); \
3998 m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
4000 *(TYPEE *)(d1 + H(i)) = m1; \
4001 *(TYPEE *)(d2 + H(i)) = m2; \
4002 *(TYPEE *)(d3 + H(i)) = m3; \
4003 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4004 addr += 3 * sizeof(TYPEM); \
4005 } while (i & 15); \
4009 #define DO_LD4(NAME, FN, TYPEE, TYPEM, H) \
4010 void HELPER(NAME)(CPUARMState *env, void *vg, \
4011 target_ulong addr, uint32_t desc) \
4013 intptr_t i, oprsz = simd_oprsz(desc); \
4014 intptr_t ra = GETPC(); \
4015 unsigned rd = simd_data(desc); \
4016 void *d1 = &env->vfp.zregs[rd]; \
4017 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
4018 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
4019 void *d4 = &env->vfp.zregs[(rd + 3) & 31]; \
4020 for (i = 0; i < oprsz; ) { \
4021 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4022 do { \
4023 TYPEM m1 = 0, m2 = 0, m3 = 0, m4 = 0; \
4024 if (pg & 1) { \
4025 m1 = FN(env, addr, ra); \
4026 m2 = FN(env, addr + sizeof(TYPEM), ra); \
4027 m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
4028 m4 = FN(env, addr + 3 * sizeof(TYPEM), ra); \
4030 *(TYPEE *)(d1 + H(i)) = m1; \
4031 *(TYPEE *)(d2 + H(i)) = m2; \
4032 *(TYPEE *)(d3 + H(i)) = m3; \
4033 *(TYPEE *)(d4 + H(i)) = m4; \
4034 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4035 addr += 4 * sizeof(TYPEM); \
4036 } while (i & 15); \
4040 DO_LD1(sve_ld1bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
4041 DO_LD1(sve_ld1bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
4042 DO_LD1(sve_ld1bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
4043 DO_LD1(sve_ld1bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
4044 DO_LD1(sve_ld1bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
4045 DO_LD1(sve_ld1bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
4047 DO_LD1(sve_ld1hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
4048 DO_LD1(sve_ld1hss_r, cpu_ldsw_data_ra, uint32_t, int16_t, H1_4)
4049 DO_LD1(sve_ld1hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
4050 DO_LD1(sve_ld1hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
4052 DO_LD1(sve_ld1sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
4053 DO_LD1(sve_ld1sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
4055 DO_LD1(sve_ld1bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4056 DO_LD2(sve_ld2bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4057 DO_LD3(sve_ld3bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4058 DO_LD4(sve_ld4bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4060 DO_LD1(sve_ld1hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4061 DO_LD2(sve_ld2hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4062 DO_LD3(sve_ld3hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4063 DO_LD4(sve_ld4hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4065 DO_LD1(sve_ld1ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4066 DO_LD2(sve_ld2ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4067 DO_LD3(sve_ld3ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4068 DO_LD4(sve_ld4ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4070 DO_LD1(sve_ld1dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4071 DO_LD2(sve_ld2dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4072 DO_LD3(sve_ld3dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4073 DO_LD4(sve_ld4dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4075 #undef DO_LD1
4076 #undef DO_LD2
4077 #undef DO_LD3
4078 #undef DO_LD4
4080 /*
4081 * Load contiguous data, first-fault and no-fault.
4082 */
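/* Illustrative example (not part of the code): with byte elements, an
 * all-true predicate and a 32-byte vector, suppose the page backing
 * elements 5..31 is unmapped.  A first-fault load completes elements
 * 0..4, leaves the remaining destination bytes unchanged (the MERGE
 * choice described below) and rewrites FFR so that only bits 0..4 stay
 * set.  A no-fault load behaves the same way, except that even element
 * 0 may fail without raising an exception.
 */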
4084 #ifdef CONFIG_USER_ONLY
4086 /* Fault on byte I. All bits in FFR from I are cleared. The vector
4087 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
4088 * option, which leaves subsequent data unchanged.
4089 */
4090 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
4091 {
4092 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
4094 if (i & 63) {
4095 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
4096 i = ROUND_UP(i, 64);
4097 }
4098 for (; i < oprsz; i += 64) {
4099 ffr[i / 64] = 0;
4100 }
4101 }
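/* Worked example (illustrative): record_fault(env, 20, 256) keeps FFR
 * bits 0..19 via  ffr[0] &= MAKE_64BIT_MASK(0, 20)  and then clears
 * ffr[1], ffr[2] and ffr[3] outright, so every element at byte offset
 * 20 or above drops out of the rest of the first-fault sequence.
 */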
4103 /* Hold the mmap lock during the operation so that there is no race
4104 * between page_check_range and the load operation. We expect the
4105 * usual case to have no faults at all, so we check the whole range
4106 * first and if successful defer to the normal load operation.
4108 * TODO: Change mmap_lock to a rwlock so that multiple readers
4109 * can run simultaneously. This will probably help other uses
4110 * within QEMU as well.
4111 */
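/* In outline, the user-only first-fault path below is (illustrative):
 *
 *   mmap_lock();
 *   if (page_check_range(addr, oprsz, PAGE_READ) == 0) {
 *       do_sve_ld1*(...);      ... fast path: whole range readable
 *   } else {
 *       do_sve_ldff1*(...);    ... per-element checks, may stop early
 *   }
 *   mmap_unlock();
 */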
4112 #define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
4113 static void do_sve_ldff1##PART(CPUARMState *env, void *vd, void *vg, \
4114 target_ulong addr, intptr_t oprsz, \
4115 bool first, uintptr_t ra) \
4117 intptr_t i = 0; \
4118 do { \
4119 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4120 do { \
4121 TYPEM m = 0; \
4122 if (pg & 1) { \
4123 if (!first && \
4124 unlikely(page_check_range(addr, sizeof(TYPEM), \
4125 PAGE_READ))) { \
4126 record_fault(env, i, oprsz); \
4127 return; \
4129 m = FN(env, addr, ra); \
4130 first = false; \
4132 *(TYPEE *)(vd + H(i)) = m; \
4133 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4134 addr += sizeof(TYPEM); \
4135 } while (i & 15); \
4136 } while (i < oprsz); \
4138 void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \
4139 target_ulong addr, uint32_t desc) \
4141 intptr_t oprsz = simd_oprsz(desc); \
4142 unsigned rd = simd_data(desc); \
4143 void *vd = &env->vfp.zregs[rd]; \
4144 mmap_lock(); \
4145 if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \
4146 do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \
4147 } else { \
4148 do_sve_ldff1##PART(env, vd, vg, addr, oprsz, true, GETPC()); \
4150 mmap_unlock(); \
4153 /* No-fault loads are like first-fault loads without the
4154 * first faulting special case.
4155 */
4156 #define DO_LDNF1(PART) \
4157 void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \
4158 target_ulong addr, uint32_t desc) \
4160 intptr_t oprsz = simd_oprsz(desc); \
4161 unsigned rd = simd_data(desc); \
4162 void *vd = &env->vfp.zregs[rd]; \
4163 mmap_lock(); \
4164 if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \
4165 do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \
4166 } else { \
4167 do_sve_ldff1##PART(env, vd, vg, addr, oprsz, false, GETPC()); \
4169 mmap_unlock(); \
4172 #else
4174 /* TODO: System mode is not yet supported.
4175 * This would probably use tlb_vaddr_to_host.
4176 */
4177 #define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
4178 void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \
4179 target_ulong addr, uint32_t desc) \
4181 g_assert_not_reached(); \
4184 #define DO_LDNF1(PART) \
4185 void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \
4186 target_ulong addr, uint32_t desc) \
4188 g_assert_not_reached(); \
4191 #endif
4193 DO_LDFF1(bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4194 DO_LDFF1(bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
4195 DO_LDFF1(bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
4196 DO_LDFF1(bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
4197 DO_LDFF1(bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
4198 DO_LDFF1(bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
4199 DO_LDFF1(bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
4201 DO_LDFF1(hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4202 DO_LDFF1(hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
4203 DO_LDFF1(hss_r, cpu_ldsw_data_ra, uint32_t, int16_t, H1_4)
4204 DO_LDFF1(hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
4205 DO_LDFF1(hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
4207 DO_LDFF1(ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4208 DO_LDFF1(sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
4209 DO_LDFF1(sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
4211 DO_LDFF1(dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4213 #undef DO_LDFF1
4215 DO_LDNF1(bb_r)
4216 DO_LDNF1(bhu_r)
4217 DO_LDNF1(bhs_r)
4218 DO_LDNF1(bsu_r)
4219 DO_LDNF1(bss_r)
4220 DO_LDNF1(bdu_r)
4221 DO_LDNF1(bds_r)
4223 DO_LDNF1(hh_r)
4224 DO_LDNF1(hsu_r)
4225 DO_LDNF1(hss_r)
4226 DO_LDNF1(hdu_r)
4227 DO_LDNF1(hds_r)
4229 DO_LDNF1(ss_r)
4230 DO_LDNF1(sdu_r)
4231 DO_LDNF1(sds_r)
4233 DO_LDNF1(dd_r)
4235 #undef DO_LDNF1
4237 /*
4238 * Store contiguous data, protected by a governing predicate.
4239 */
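/* A sketch of the store walk below (illustrative only).  Stores have no
 * destination to merge into: an inactive element is simply not written,
 * but the address still advances by the memory element size, so active
 * elements land at their architectural offsets:
 *
 *   for (i = 0; i < oprsz; i += sizeof(TYPEE)) {
 *       if (predicate bit for byte offset i) {
 *           FN(env, addr, *(TYPEE *)(vd + H(i)), ra);
 *       }
 *       addr += sizeof(TYPEM);
 *   }
 */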
4240 #define DO_ST1(NAME, FN, TYPEE, TYPEM, H) \
4241 void HELPER(NAME)(CPUARMState *env, void *vg, \
4242 target_ulong addr, uint32_t desc) \
4244 intptr_t i, oprsz = simd_oprsz(desc); \
4245 intptr_t ra = GETPC(); \
4246 unsigned rd = simd_data(desc); \
4247 void *vd = &env->vfp.zregs[rd]; \
4248 for (i = 0; i < oprsz; ) { \
4249 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4250 do { \
4251 if (pg & 1) { \
4252 TYPEM m = *(TYPEE *)(vd + H(i)); \
4253 FN(env, addr, m, ra); \
4255 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4256 addr += sizeof(TYPEM); \
4257 } while (i & 15); \
4261 #define DO_ST1_D(NAME, FN, TYPEM) \
4262 void HELPER(NAME)(CPUARMState *env, void *vg, \
4263 target_ulong addr, uint32_t desc) \
4265 intptr_t i, oprsz = simd_oprsz(desc) / 8; \
4266 intptr_t ra = GETPC(); \
4267 unsigned rd = simd_data(desc); \
4268 uint64_t *d = &env->vfp.zregs[rd].d[0]; \
4269 uint8_t *pg = vg; \
4270 for (i = 0; i < oprsz; i += 1) { \
4271 if (pg[H1(i)] & 1) { \
4272 FN(env, addr, d[i], ra); \
4274 addr += sizeof(TYPEM); \
4278 #define DO_ST2(NAME, FN, TYPEE, TYPEM, H) \
4279 void HELPER(NAME)(CPUARMState *env, void *vg, \
4280 target_ulong addr, uint32_t desc) \
4282 intptr_t i, oprsz = simd_oprsz(desc); \
4283 intptr_t ra = GETPC(); \
4284 unsigned rd = simd_data(desc); \
4285 void *d1 = &env->vfp.zregs[rd]; \
4286 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
4287 for (i = 0; i < oprsz; ) { \
4288 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4289 do { \
4290 if (pg & 1) { \
4291 TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
4292 TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
4293 FN(env, addr, m1, ra); \
4294 FN(env, addr + sizeof(TYPEM), m2, ra); \
4296 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4297 addr += 2 * sizeof(TYPEM); \
4298 } while (i & 15); \
4302 #define DO_ST3(NAME, FN, TYPEE, TYPEM, H) \
4303 void HELPER(NAME)(CPUARMState *env, void *vg, \
4304 target_ulong addr, uint32_t desc) \
4306 intptr_t i, oprsz = simd_oprsz(desc); \
4307 intptr_t ra = GETPC(); \
4308 unsigned rd = simd_data(desc); \
4309 void *d1 = &env->vfp.zregs[rd]; \
4310 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
4311 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
4312 for (i = 0; i < oprsz; ) { \
4313 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4314 do { \
4315 if (pg & 1) { \
4316 TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
4317 TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
4318 TYPEM m3 = *(TYPEE *)(d3 + H(i)); \
4319 FN(env, addr, m1, ra); \
4320 FN(env, addr + sizeof(TYPEM), m2, ra); \
4321 FN(env, addr + 2 * sizeof(TYPEM), m3, ra); \
4323 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4324 addr += 3 * sizeof(TYPEM); \
4325 } while (i & 15); \
4329 #define DO_ST4(NAME, FN, TYPEE, TYPEM, H) \
4330 void HELPER(NAME)(CPUARMState *env, void *vg, \
4331 target_ulong addr, uint32_t desc) \
4333 intptr_t i, oprsz = simd_oprsz(desc); \
4334 intptr_t ra = GETPC(); \
4335 unsigned rd = simd_data(desc); \
4336 void *d1 = &env->vfp.zregs[rd]; \
4337 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
4338 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
4339 void *d4 = &env->vfp.zregs[(rd + 3) & 31]; \
4340 for (i = 0; i < oprsz; ) { \
4341 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4342 do { \
4343 if (pg & 1) { \
4344 TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
4345 TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
4346 TYPEM m3 = *(TYPEE *)(d3 + H(i)); \
4347 TYPEM m4 = *(TYPEE *)(d4 + H(i)); \
4348 FN(env, addr, m1, ra); \
4349 FN(env, addr + sizeof(TYPEM), m2, ra); \
4350 FN(env, addr + 2 * sizeof(TYPEM), m3, ra); \
4351 FN(env, addr + 3 * sizeof(TYPEM), m4, ra); \
4353 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4354 addr += 4 * sizeof(TYPEM); \
4355 } while (i & 15); \
4359 DO_ST1(sve_st1bh_r, cpu_stb_data_ra, uint16_t, uint8_t, H1_2)
4360 DO_ST1(sve_st1bs_r, cpu_stb_data_ra, uint32_t, uint8_t, H1_4)
4361 DO_ST1_D(sve_st1bd_r, cpu_stb_data_ra, uint8_t)
4363 DO_ST1(sve_st1hs_r, cpu_stw_data_ra, uint32_t, uint16_t, H1_4)
4364 DO_ST1_D(sve_st1hd_r, cpu_stw_data_ra, uint16_t)
4366 DO_ST1_D(sve_st1sd_r, cpu_stl_data_ra, uint32_t)
4368 DO_ST1(sve_st1bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
4369 DO_ST2(sve_st2bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
4370 DO_ST3(sve_st3bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
4371 DO_ST4(sve_st4bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
4373 DO_ST1(sve_st1hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
4374 DO_ST2(sve_st2hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
4375 DO_ST3(sve_st3hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
4376 DO_ST4(sve_st4hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
4378 DO_ST1(sve_st1ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
4379 DO_ST2(sve_st2ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
4380 DO_ST3(sve_st3ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
4381 DO_ST4(sve_st4ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
4383 DO_ST1_D(sve_st1dd_r, cpu_stq_data_ra, uint64_t)
4385 void HELPER(sve_st2dd_r)(CPUARMState *env, void *vg,
4386 target_ulong addr, uint32_t desc)
4388 intptr_t i, oprsz = simd_oprsz(desc) / 8;
4389 intptr_t ra = GETPC();
4390 unsigned rd = simd_data(desc);
4391 uint64_t *d1 = &env->vfp.zregs[rd].d[0];
4392 uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
4393 uint8_t *pg = vg;
4395 for (i = 0; i < oprsz; i += 1) {
4396 if (pg[H1(i)] & 1) {
4397 cpu_stq_data_ra(env, addr, d1[i], ra);
4398 cpu_stq_data_ra(env, addr + 8, d2[i], ra);
4400 addr += 2 * 8;
4404 void HELPER(sve_st3dd_r)(CPUARMState *env, void *vg,
4405 target_ulong addr, uint32_t desc)
4407 intptr_t i, oprsz = simd_oprsz(desc) / 8;
4408 intptr_t ra = GETPC();
4409 unsigned rd = simd_data(desc);
4410 uint64_t *d1 = &env->vfp.zregs[rd].d[0];
4411 uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
4412 uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0];
4413 uint8_t *pg = vg;
4415 for (i = 0; i < oprsz; i += 1) {
4416 if (pg[H1(i)] & 1) {
4417 cpu_stq_data_ra(env, addr, d1[i], ra);
4418 cpu_stq_data_ra(env, addr + 8, d2[i], ra);
4419 cpu_stq_data_ra(env, addr + 16, d3[i], ra);
4421 addr += 3 * 8;
4425 void HELPER(sve_st4dd_r)(CPUARMState *env, void *vg,
4426 target_ulong addr, uint32_t desc)
4428 intptr_t i, oprsz = simd_oprsz(desc) / 8;
4429 intptr_t ra = GETPC();
4430 unsigned rd = simd_data(desc);
4431 uint64_t *d1 = &env->vfp.zregs[rd].d[0];
4432 uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
4433 uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0];
4434 uint64_t *d4 = &env->vfp.zregs[(rd + 3) & 31].d[0];
4435 uint8_t *pg = vg;
4437 for (i = 0; i < oprsz; i += 1) {
4438 if (pg[H1(i)] & 1) {
4439 cpu_stq_data_ra(env, addr, d1[i], ra);
4440 cpu_stq_data_ra(env, addr + 8, d2[i], ra);
4441 cpu_stq_data_ra(env, addr + 16, d3[i], ra);
4442 cpu_stq_data_ra(env, addr + 24, d4[i], ra);
4444 addr += 4 * 8;
4448 /* Loads with a vector index. */
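/* Illustrative example (not part of the code): a gather load computes a
 * separate address for every active element,  base + (zm[e] << scale),
 * where zm[e] is read from the index vector (vm below) with TYPEI
 * signedness, and scale is 0 for the unscaled forms.  With 32-bit
 * indices { 3, 0, 5 } and scale = 2, the three elements are loaded from
 * base + 12, base + 0 and base + 20 respectively.
 */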
4450 #define DO_LD1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
4451 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4452 target_ulong base, uint32_t desc) \
4454 intptr_t i, oprsz = simd_oprsz(desc); \
4455 unsigned scale = simd_data(desc); \
4456 uintptr_t ra = GETPC(); \
4457 for (i = 0; i < oprsz; ) { \
4458 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4459 do { \
4460 TYPEM m = 0; \
4461 if (pg & 1) { \
4462 target_ulong off = *(TYPEI *)(vm + H1_4(i)); \
4463 m = FN(env, base + (off << scale), ra); \
4465 *(uint32_t *)(vd + H1_4(i)) = m; \
4466 i += 4, pg >>= 4; \
4467 } while (i & 15); \
4471 #define DO_LD1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
4472 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4473 target_ulong base, uint32_t desc) \
4475 intptr_t i, oprsz = simd_oprsz(desc) / 8; \
4476 unsigned scale = simd_data(desc); \
4477 uintptr_t ra = GETPC(); \
4478 uint64_t *d = vd, *m = vm; uint8_t *pg = vg; \
4479 for (i = 0; i < oprsz; i++) { \
4480 TYPEM mm = 0; \
4481 if (pg[H1(i)] & 1) { \
4482 target_ulong off = (TYPEI)m[i]; \
4483 mm = FN(env, base + (off << scale), ra); \
4485 d[i] = mm; \
4489 DO_LD1_ZPZ_S(sve_ldbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
4490 DO_LD1_ZPZ_S(sve_ldhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
4491 DO_LD1_ZPZ_S(sve_ldssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
4492 DO_LD1_ZPZ_S(sve_ldbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
4493 DO_LD1_ZPZ_S(sve_ldhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
4495 DO_LD1_ZPZ_S(sve_ldbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
4496 DO_LD1_ZPZ_S(sve_ldhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
4497 DO_LD1_ZPZ_S(sve_ldssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
4498 DO_LD1_ZPZ_S(sve_ldbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
4499 DO_LD1_ZPZ_S(sve_ldhss_zss, int32_t, int16_t, cpu_lduw_data_ra)
4501 DO_LD1_ZPZ_D(sve_ldbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
4502 DO_LD1_ZPZ_D(sve_ldhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
4503 DO_LD1_ZPZ_D(sve_ldsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
4504 DO_LD1_ZPZ_D(sve_ldddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
4505 DO_LD1_ZPZ_D(sve_ldbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
4506 DO_LD1_ZPZ_D(sve_ldhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
4507 DO_LD1_ZPZ_D(sve_ldsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)
4509 DO_LD1_ZPZ_D(sve_ldbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
4510 DO_LD1_ZPZ_D(sve_ldhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
4511 DO_LD1_ZPZ_D(sve_ldsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
4512 DO_LD1_ZPZ_D(sve_ldddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
4513 DO_LD1_ZPZ_D(sve_ldbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
4514 DO_LD1_ZPZ_D(sve_ldhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
4515 DO_LD1_ZPZ_D(sve_ldsds_zss, int32_t, int32_t, cpu_ldl_data_ra)
4517 DO_LD1_ZPZ_D(sve_ldbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
4518 DO_LD1_ZPZ_D(sve_ldhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
4519 DO_LD1_ZPZ_D(sve_ldsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
4520 DO_LD1_ZPZ_D(sve_ldddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
4521 DO_LD1_ZPZ_D(sve_ldbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
4522 DO_LD1_ZPZ_D(sve_ldhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
4523 DO_LD1_ZPZ_D(sve_ldsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
4525 /* First fault loads with a vector index. */
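/* Illustrative example (not part of the code): with 64-bit elements,
 * scale = 3, indices { 0, 1, 2 } all active, and the page at base + 16
 * unmapped, elements 0 and 1 load normally; element 2 fails the
 * page_check_range test, record_fault(env, 16, oprsz) keeps only FFR
 * bits 0..15, and the remaining destination elements are left
 * unchanged.
 */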
4527 #ifdef CONFIG_USER_ONLY
4529 #define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H) \
4530 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4531 target_ulong base, uint32_t desc) \
4533 intptr_t i, oprsz = simd_oprsz(desc); \
4534 unsigned scale = simd_data(desc); \
4535 uintptr_t ra = GETPC(); \
4536 bool first = true; \
4537 mmap_lock(); \
4538 for (i = 0; i < oprsz; ) { \
4539 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4540 do { \
4541 TYPEM m = 0; \
4542 if (pg & 1) { \
4543 target_ulong off = *(TYPEI *)(vm + H(i)); \
4544 target_ulong addr = base + (off << scale); \
4545 if (!first && \
4546 page_check_range(addr, sizeof(TYPEM), PAGE_READ)) { \
4547 record_fault(env, i, oprsz); \
4548 goto exit; \
4550 m = FN(env, addr, ra); \
4551 first = false; \
4553 *(TYPEE *)(vd + H(i)) = m; \
4554 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4555 } while (i & 15); \
4557 exit: \
4558 mmap_unlock(); \
4561 #else
4563 #define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H) \
4564 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4565 target_ulong base, uint32_t desc) \
4567 g_assert_not_reached(); \
4570 #endif
4572 #define DO_LDFF1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
4573 DO_LDFF1_ZPZ(NAME, uint32_t, TYPEI, TYPEM, FN, H1_4)
4574 #define DO_LDFF1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
4575 DO_LDFF1_ZPZ(NAME, uint64_t, TYPEI, TYPEM, FN, )
4577 DO_LDFF1_ZPZ_S(sve_ldffbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
4578 DO_LDFF1_ZPZ_S(sve_ldffhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
4579 DO_LDFF1_ZPZ_S(sve_ldffssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
4580 DO_LDFF1_ZPZ_S(sve_ldffbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
4581 DO_LDFF1_ZPZ_S(sve_ldffhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
4583 DO_LDFF1_ZPZ_S(sve_ldffbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
4584 DO_LDFF1_ZPZ_S(sve_ldffhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
4585 DO_LDFF1_ZPZ_S(sve_ldffssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
4586 DO_LDFF1_ZPZ_S(sve_ldffbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
4587 DO_LDFF1_ZPZ_S(sve_ldffhss_zss, int32_t, int16_t, cpu_lduw_data_ra)
4589 DO_LDFF1_ZPZ_D(sve_ldffbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
4590 DO_LDFF1_ZPZ_D(sve_ldffhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
4591 DO_LDFF1_ZPZ_D(sve_ldffsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
4592 DO_LDFF1_ZPZ_D(sve_ldffddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
4593 DO_LDFF1_ZPZ_D(sve_ldffbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
4594 DO_LDFF1_ZPZ_D(sve_ldffhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
4595 DO_LDFF1_ZPZ_D(sve_ldffsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)
4597 DO_LDFF1_ZPZ_D(sve_ldffbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
4598 DO_LDFF1_ZPZ_D(sve_ldffhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
4599 DO_LDFF1_ZPZ_D(sve_ldffsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
4600 DO_LDFF1_ZPZ_D(sve_ldffddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
4601 DO_LDFF1_ZPZ_D(sve_ldffbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
4602 DO_LDFF1_ZPZ_D(sve_ldffhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
4603 DO_LDFF1_ZPZ_D(sve_ldffsds_zss, int32_t, int32_t, cpu_ldl_data_ra)
4605 DO_LDFF1_ZPZ_D(sve_ldffbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
4606 DO_LDFF1_ZPZ_D(sve_ldffhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
4607 DO_LDFF1_ZPZ_D(sve_ldffsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
4608 DO_LDFF1_ZPZ_D(sve_ldffddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
4609 DO_LDFF1_ZPZ_D(sve_ldffbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
4610 DO_LDFF1_ZPZ_D(sve_ldffhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
4611 DO_LDFF1_ZPZ_D(sve_ldffsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
4613 /* Stores with a vector index. */
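/* Scatter stores use the same per-element addressing as the gather
 * loads above, base + (index << scale); inactive elements are simply
 * skipped.  For example, storing 32-bit elements with indices { 1, 3 }
 * and scale = 2 writes to base + 4 and base + 12.  There is no
 * first-fault form for stores, so a faulting active element takes its
 * exception in the usual way.  (Illustrative note.)
 */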
4615 #define DO_ST1_ZPZ_S(NAME, TYPEI, FN) \
4616 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4617 target_ulong base, uint32_t desc) \
4619 intptr_t i, oprsz = simd_oprsz(desc); \
4620 unsigned scale = simd_data(desc); \
4621 uintptr_t ra = GETPC(); \
4622 for (i = 0; i < oprsz; ) { \
4623 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4624 do { \
4625 if (likely(pg & 1)) { \
4626 target_ulong off = *(TYPEI *)(vm + H1_4(i)); \
4627 uint32_t d = *(uint32_t *)(vd + H1_4(i)); \
4628 FN(env, base + (off << scale), d, ra); \
4630 i += sizeof(uint32_t), pg >>= sizeof(uint32_t); \
4631 } while (i & 15); \
4635 #define DO_ST1_ZPZ_D(NAME, TYPEI, FN) \
4636 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4637 target_ulong base, uint32_t desc) \
4639 intptr_t i, oprsz = simd_oprsz(desc) / 8; \
4640 unsigned scale = simd_data(desc); \
4641 uintptr_t ra = GETPC(); \
4642 uint64_t *d = vd, *m = vm; uint8_t *pg = vg; \
4643 for (i = 0; i < oprsz; i++) { \
4644 if (likely(pg[H1(i)] & 1)) { \
4645 target_ulong off = (target_ulong)(TYPEI)m[i] << scale; \
4646 FN(env, base + off, d[i], ra); \
4651 DO_ST1_ZPZ_S(sve_stbs_zsu, uint32_t, cpu_stb_data_ra)
4652 DO_ST1_ZPZ_S(sve_sths_zsu, uint32_t, cpu_stw_data_ra)
4653 DO_ST1_ZPZ_S(sve_stss_zsu, uint32_t, cpu_stl_data_ra)
4655 DO_ST1_ZPZ_S(sve_stbs_zss, int32_t, cpu_stb_data_ra)
4656 DO_ST1_ZPZ_S(sve_sths_zss, int32_t, cpu_stw_data_ra)
4657 DO_ST1_ZPZ_S(sve_stss_zss, int32_t, cpu_stl_data_ra)
4659 DO_ST1_ZPZ_D(sve_stbd_zsu, uint32_t, cpu_stb_data_ra)
4660 DO_ST1_ZPZ_D(sve_sthd_zsu, uint32_t, cpu_stw_data_ra)
4661 DO_ST1_ZPZ_D(sve_stsd_zsu, uint32_t, cpu_stl_data_ra)
4662 DO_ST1_ZPZ_D(sve_stdd_zsu, uint32_t, cpu_stq_data_ra)
4664 DO_ST1_ZPZ_D(sve_stbd_zss, int32_t, cpu_stb_data_ra)
4665 DO_ST1_ZPZ_D(sve_sthd_zss, int32_t, cpu_stw_data_ra)
4666 DO_ST1_ZPZ_D(sve_stsd_zss, int32_t, cpu_stl_data_ra)
4667 DO_ST1_ZPZ_D(sve_stdd_zss, int32_t, cpu_stq_data_ra)
4669 DO_ST1_ZPZ_D(sve_stbd_zd, uint64_t, cpu_stb_data_ra)
4670 DO_ST1_ZPZ_D(sve_sthd_zd, uint64_t, cpu_stw_data_ra)
4671 DO_ST1_ZPZ_D(sve_stsd_zd, uint64_t, cpu_stl_data_ra)
4672 DO_ST1_ZPZ_D(sve_stdd_zd, uint64_t, cpu_stq_data_ra)