target/arm: Implement SVE floating-point unary operations
[qemu.git] / target / arm / sve_helper.c
blob 83bd8c426904222f71f062a3d94315e4480ef8a0
1 /*
2 * ARM SVE Operations
4 * Copyright (c) 2018 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/exec-all.h"
23 #include "exec/cpu_ldst.h"
24 #include "exec/helper-proto.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "fpu/softfloat.h"
29 /* Note that vector data is stored in host-endian 64-bit chunks,
30 so addressing units smaller than that needs a host-endian fixup. */
31 #ifdef HOST_WORDS_BIGENDIAN
32 #define H1(x) ((x) ^ 7)
33 #define H1_2(x) ((x) ^ 6)
34 #define H1_4(x) ((x) ^ 4)
35 #define H2(x) ((x) ^ 3)
36 #define H4(x) ((x) ^ 1)
37 #else
38 #define H1(x) (x)
39 #define H1_2(x) (x)
40 #define H1_4(x) (x)
41 #define H2(x) (x)
42 #define H4(x) (x)
43 #endif
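/* For illustration (hand-worked values, assuming a big-endian host):
 * the least significant guest byte of each 64-bit chunk lives at host
 * byte offset 7, so H1(0) == 7 and H1(1) == 6; the least significant
 * 16-bit element starts at host byte offset 6, so H1_2(0) == 6; viewed
 * as an array of uint16_t or uint32_t it sits at index H2(0) == 3 or
 * H4(0) == 1 respectively.  On little-endian hosts every H macro is
 * the identity and no fixup occurs.
 */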
45 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
47 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
48 * and bit 0 set if C is set. Compare the definitions of these variables
49 * within CPUARMState.
52 /* For no G bits set, NZCV = C. */
53 #define PREDTEST_INIT 1
55 /* This is an iterative function, called for each Pd and Pg word
56 * moving forward.
58 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
60 if (likely(g)) {
61 /* Compute N from first D & G.
62 Use bit 2 to signal first G bit seen. */
63 if (!(flags & 4)) {
64 flags |= ((d & (g & -g)) != 0) << 31;
65 flags |= 4;
68 /* Accumulate Z from each D & G. */
69 flags |= ((d & g) != 0) << 1;
71 /* Compute C from last !(D & G). Replace previous. */
72 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
74 return flags;
77 /* This is an iterative function, called for each Pd and Pg word
78 * moving backward.
80 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
82 if (likely(g)) {
83 /* Compute C from first (i.e last) !(D & G).
84 Use bit 2 to signal first G bit seen. */
85 if (!(flags & 4)) {
86 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
87 flags |= (d & pow2floor(g)) == 0;
90 /* Accumulate Z from each D & G. */
91 flags |= ((d & g) != 0) << 1;
93 /* Compute N from last (i.e first) D & G. Replace previous. */
94 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
96 return flags;
99 /* The same for a single word predicate. */
100 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
102 return iter_predtest_fwd(d, g, PREDTEST_INIT);
105 /* The same for a multi-word predicate. */
106 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
108 uint32_t flags = PREDTEST_INIT;
109 uint64_t *d = vd, *g = vg;
110 uintptr_t i = 0;
112 do {
113 flags = iter_predtest_fwd(d[i], g[i], flags);
114 } while (++i < words);
116 return flags;
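/* Worked example (hand-computed): for a single predicate word with
 * g = 0x0f (byte elements 0..3 governed) and d = 0x01 (only element 0
 * active), sve_predtest1(d, g) returns 0x80000007:
 *   bit 31 set -> N set    (the first governed element is active)
 *   bit 1 set  -> Z clear  (some governed element is active)
 *   bit 0 set  -> C set    (the last governed element is inactive)
 * Bit 2 is only the internal "first G bit seen" marker described above.
 */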
119 /* Expand active predicate bits to bytes, for byte elements.
120 * for (i = 0; i < 256; ++i) {
121 * unsigned long m = 0;
122 * for (j = 0; j < 8; j++) {
123 * if ((i >> j) & 1) {
124 * m |= 0xfful << (j << 3);
127 * printf("0x%016lx,\n", m);
130 static inline uint64_t expand_pred_b(uint8_t byte)
132 static const uint64_t word[256] = {
133 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
134 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
135 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
136 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
137 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
138 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
139 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
140 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
141 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
142 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
143 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
144 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
145 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
146 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
147 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
148 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
149 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
150 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
151 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
152 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
153 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
154 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
155 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
156 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
157 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
158 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
159 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
160 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
161 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
162 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
163 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
164 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
165 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
166 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
167 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
168 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
169 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
170 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
171 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
172 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
173 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
174 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
175 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
176 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
177 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
178 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
179 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
180 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
181 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
182 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
183 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
184 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
185 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
186 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
187 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
188 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
189 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
190 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
191 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
192 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
193 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
194 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
195 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
196 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
197 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
198 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
199 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
200 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
201 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
202 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
203 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
204 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
205 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
206 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
207 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
208 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
209 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
210 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
211 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
212 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
213 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
214 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
215 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
216 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
217 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
218 0xffffffffffffffff,
220 return word[byte];
223 /* Similarly for half-word elements.
224 * for (i = 0; i < 256; ++i) {
225 * unsigned long m = 0;
226 * if (i & 0xaa) {
227 * continue;
229 * for (j = 0; j < 8; j += 2) {
230 * if ((i >> j) & 1) {
231 * m |= 0xfffful << (j << 3);
234 * printf("[0x%x] = 0x%016lx,\n", i, m);
237 static inline uint64_t expand_pred_h(uint8_t byte)
239 static const uint64_t word[] = {
240 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
241 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
242 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
243 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
244 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
245 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
246 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
247 [0x55] = 0xffffffffffffffff,
249 return word[byte & 0x55];
252 /* Similarly for single word elements. */
253 static inline uint64_t expand_pred_s(uint8_t byte)
255 static const uint64_t word[] = {
256 [0x01] = 0x00000000ffffffffull,
257 [0x10] = 0xffffffff00000000ull,
258 [0x11] = 0xffffffffffffffffull,
260 return word[byte & 0x11];
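/* Example values (read off the tables above): for a predicate byte 0x05,
 * i.e. bits 0 and 2 set,
 *   expand_pred_b(0x05) == 0x0000000000ff00ff   (byte elements 0 and 2)
 *   expand_pred_h(0x05) == 0x00000000ffffffff   (halfword elements 0 and 1)
 * and for 0x11 (bits 0 and 4 set)
 *   expand_pred_s(0x11) == 0xffffffffffffffff   (word elements 0 and 1)
 * since the predicate bit governing a 2**N-byte element is every
 * 2**N'th bit.
 */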
263 /* Swap 16-bit words within a 32-bit word. */
264 static inline uint32_t hswap32(uint32_t h)
266 return rol32(h, 16);
269 /* Swap 16-bit words within a 64-bit word. */
270 static inline uint64_t hswap64(uint64_t h)
272 uint64_t m = 0x0000ffff0000ffffull;
273 h = rol64(h, 32);
274 return ((h & m) << 16) | ((h >> 16) & m);
277 /* Swap 32-bit words within a 64-bit word. */
278 static inline uint64_t wswap64(uint64_t h)
280 return rol64(h, 32);
283 #define LOGICAL_PPPP(NAME, FUNC) \
284 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
286 uintptr_t opr_sz = simd_oprsz(desc); \
287 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
288 uintptr_t i; \
289 for (i = 0; i < opr_sz / 8; ++i) { \
290 d[i] = FUNC(n[i], m[i], g[i]); \
294 #define DO_AND(N, M, G) (((N) & (M)) & (G))
295 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
296 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
297 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
298 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
299 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
300 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
301 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
303 LOGICAL_PPPP(sve_and_pppp, DO_AND)
304 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
305 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
306 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
307 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
308 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
309 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
310 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
312 #undef DO_AND
313 #undef DO_BIC
314 #undef DO_EOR
315 #undef DO_ORR
316 #undef DO_ORN
317 #undef DO_NOR
318 #undef DO_NAND
319 #undef DO_SEL
320 #undef LOGICAL_PPPP
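/* For reference, the first instantiation above expands to roughly:
 *
 * void HELPER(sve_and_pppp)(void *vd, void *vn, void *vm, void *vg,
 *                           uint32_t desc)
 * {
 *     uintptr_t opr_sz = simd_oprsz(desc);
 *     uint64_t *d = vd, *n = vn, *m = vm, *g = vg;
 *     uintptr_t i;
 *     for (i = 0; i < opr_sz / 8; ++i) {
 *         d[i] = (n[i] & m[i]) & g[i];
 *     }
 * }
 *
 * i.e. a plain 64-bit-at-a-time loop, since whole predicate words need
 * no host-endian fixup at this granularity.
 */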
322 /* Fully general three-operand expander, controlled by a predicate.
323 * This is complicated by the host-endian storage of the register file.
325 /* ??? I don't expect the compiler could ever vectorize this itself.
326 * With some tables we can convert bit masks to byte masks, and with
327 * extra care wrt byte/word ordering we could use gcc generic vectors
328 * and do 16 bytes at a time.
330 #define DO_ZPZZ(NAME, TYPE, H, OP) \
331 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
333 intptr_t i, opr_sz = simd_oprsz(desc); \
334 for (i = 0; i < opr_sz; ) { \
335 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
336 do { \
337 if (pg & 1) { \
338 TYPE nn = *(TYPE *)(vn + H(i)); \
339 TYPE mm = *(TYPE *)(vm + H(i)); \
340 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
342 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
343 } while (i & 15); \
347 /* Similarly, specialized for 64-bit operands. */
348 #define DO_ZPZZ_D(NAME, TYPE, OP) \
349 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
351 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
352 TYPE *d = vd, *n = vn, *m = vm; \
353 uint8_t *pg = vg; \
354 for (i = 0; i < opr_sz; i += 1) { \
355 if (pg[H1(i)] & 1) { \
356 TYPE nn = n[i], mm = m[i]; \
357 d[i] = OP(nn, mm); \
362 #define DO_AND(N, M) (N & M)
363 #define DO_EOR(N, M) (N ^ M)
364 #define DO_ORR(N, M) (N | M)
365 #define DO_BIC(N, M) (N & ~M)
366 #define DO_ADD(N, M) (N + M)
367 #define DO_SUB(N, M) (N - M)
368 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
369 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
370 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
371 #define DO_MUL(N, M) (N * M)
372 #define DO_DIV(N, M) (M ? N / M : 0)
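/* To make the control flow concrete, DO_ZPZZ(sve_add_zpzz_h, uint16_t,
 * H1_2, DO_ADD) below expands to roughly:
 *
 * void HELPER(sve_add_zpzz_h)(void *vd, void *vn, void *vm, void *vg,
 *                             uint32_t desc)
 * {
 *     intptr_t i, opr_sz = simd_oprsz(desc);
 *     for (i = 0; i < opr_sz; ) {
 *         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
 *         do {
 *             if (pg & 1) {
 *                 uint16_t nn = *(uint16_t *)(vn + H1_2(i));
 *                 uint16_t mm = *(uint16_t *)(vm + H1_2(i));
 *                 *(uint16_t *)(vd + H1_2(i)) = nn + mm;
 *             }
 *             i += sizeof(uint16_t), pg >>= sizeof(uint16_t);
 *         } while (i & 15);
 *     }
 * }
 *
 * The predicate has one bit per vector byte, so 16 bits are fetched per
 * 16 bytes of vector data, and shifting pg right by sizeof(TYPE) brings
 * the bit governing the next element into bit 0.
 */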
374 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
375 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
376 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
377 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
379 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
380 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
381 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
382 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
384 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
385 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
386 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
387 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
389 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
390 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
391 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
392 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
394 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
395 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
396 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
397 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
399 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
400 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
401 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
402 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
404 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
405 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
406 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
407 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
409 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
410 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
411 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
412 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
414 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
415 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
416 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
417 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
419 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
420 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
421 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
422 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
424 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
425 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
426 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
427 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
429 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
430 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
431 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
432 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
434 /* Because the computation type is at least twice as large as required,
435 these work for both signed and unsigned source types. */
436 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
438 return (n * m) >> 8;
441 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
443 return (n * m) >> 16;
446 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
448 return (n * m) >> 32;
451 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
453 uint64_t lo, hi;
454 muls64(&lo, &hi, n, m);
455 return hi;
458 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
460 uint64_t lo, hi;
461 mulu64(&lo, &hi, n, m);
462 return hi;
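/* Worked example of the comment above: for SMULH.B the element type is
 * int8_t, so the arguments arrive sign-extended and
 *     do_mulh_b(-1, -1)   == (1 >> 8)      == 0x00;
 * for UMULH.B the element type is uint8_t, so they arrive zero-extended
 * and
 *     do_mulh_b(255, 255) == (0xfe01 >> 8) == 0xfe.
 * Both are the correct high byte of the product, from the same helper.
 */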
465 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
466 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
467 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
468 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
470 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
471 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
472 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
473 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
475 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
476 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
477 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
478 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
480 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_DIV)
481 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_DIV)
483 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_DIV)
484 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_DIV)
486 /* Note that all bits of the shift are significant
487 and not modulo the element size. */
488 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
489 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
490 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
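/* E.g. DO_LSR on a uint64_t element with a shift count of 64 yields 0,
 * and DO_ASR on a negative int64_t element with any count >= 63 yields
 * -1, whereas evaluating "n >> 64" directly would be undefined behaviour
 * on the host (and typically comes out modulo the width on x86).
 */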
492 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
493 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
494 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
496 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
497 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
498 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
500 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
501 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
502 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
504 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
505 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
506 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
508 #undef DO_ZPZZ
509 #undef DO_ZPZZ_D
511 /* Three-operand expander, controlled by a predicate, in which the
512 * third operand is "wide". That is, for D = N op M, the same 64-bit
513 * value of M is used with all of the narrower values of N.
515 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
516 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
518 intptr_t i, opr_sz = simd_oprsz(desc); \
519 for (i = 0; i < opr_sz; ) { \
520 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
521 TYPEW mm = *(TYPEW *)(vm + i); \
522 do { \
523 if (pg & 1) { \
524 TYPE nn = *(TYPE *)(vn + H(i)); \
525 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
527 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
528 } while (i & 7); \
532 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
533 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
534 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
536 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
537 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
538 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
540 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
541 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
542 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
544 #undef DO_ZPZW
546 /* Fully general two-operand expander, controlled by a predicate.
548 #define DO_ZPZ(NAME, TYPE, H, OP) \
549 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
551 intptr_t i, opr_sz = simd_oprsz(desc); \
552 for (i = 0; i < opr_sz; ) { \
553 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
554 do { \
555 if (pg & 1) { \
556 TYPE nn = *(TYPE *)(vn + H(i)); \
557 *(TYPE *)(vd + H(i)) = OP(nn); \
559 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
560 } while (i & 15); \
564 /* Similarly, specialized for 64-bit operands. */
565 #define DO_ZPZ_D(NAME, TYPE, OP) \
566 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
568 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
569 TYPE *d = vd, *n = vn; \
570 uint8_t *pg = vg; \
571 for (i = 0; i < opr_sz; i += 1) { \
572 if (pg[H1(i)] & 1) { \
573 TYPE nn = n[i]; \
574 d[i] = OP(nn); \
579 #define DO_CLS_B(N) (clrsb32(N) - 24)
580 #define DO_CLS_H(N) (clrsb32(N) - 16)
582 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
583 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
584 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
585 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
587 #define DO_CLZ_B(N) (clz32(N) - 24)
588 #define DO_CLZ_H(N) (clz32(N) - 16)
590 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
591 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
592 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
593 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
595 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
596 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
597 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
598 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
600 #define DO_CNOT(N) (N == 0)
602 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
603 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
604 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
605 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
607 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
609 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
610 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
611 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
613 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
615 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
616 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
617 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
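/* FABS and FNEG are pure sign-bit manipulations on the raw encoding,
 * e.g. for a 32-bit element:
 *     DO_FABS(0xff800000) == 0x7f800000    -inf -> +inf
 *     DO_FNEG(0x3f800000) == 0xbf800000    1.0f -> -1.0f
 * No floating-point status is read or written, and NaN payloads pass
 * through unchanged, as the architecture requires for these insns.
 */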
619 #define DO_NOT(N) (~N)
621 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
622 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
623 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
624 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
626 #define DO_SXTB(N) ((int8_t)N)
627 #define DO_SXTH(N) ((int16_t)N)
628 #define DO_SXTS(N) ((int32_t)N)
629 #define DO_UXTB(N) ((uint8_t)N)
630 #define DO_UXTH(N) ((uint16_t)N)
631 #define DO_UXTS(N) ((uint32_t)N)
633 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
634 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
635 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
636 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
637 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
638 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
640 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
641 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
642 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
643 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
644 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
645 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
647 #define DO_ABS(N) (N < 0 ? -N : N)
649 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
650 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
651 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
652 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
654 #define DO_NEG(N) (-N)
656 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
657 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
658 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
659 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
661 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
662 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
663 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
665 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
666 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
668 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
670 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
671 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
672 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
673 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
675 /* Three-operand expander, unpredicated, in which the third operand is "wide".
677 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
678 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
680 intptr_t i, opr_sz = simd_oprsz(desc); \
681 for (i = 0; i < opr_sz; ) { \
682 TYPEW mm = *(TYPEW *)(vm + i); \
683 do { \
684 TYPE nn = *(TYPE *)(vn + H(i)); \
685 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
686 i += sizeof(TYPE); \
687 } while (i & 7); \
691 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
692 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
693 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
695 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
696 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
697 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
699 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
700 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
701 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
703 #undef DO_ZZW
705 #undef DO_CLS_B
706 #undef DO_CLS_H
707 #undef DO_CLZ_B
708 #undef DO_CLZ_H
709 #undef DO_CNOT
710 #undef DO_FABS
711 #undef DO_FNEG
712 #undef DO_ABS
713 #undef DO_NEG
714 #undef DO_ZPZ
715 #undef DO_ZPZ_D
717 /* Two-operand reduction expander, controlled by a predicate.
718 * The difference between TYPERED and TYPERET has to do with
719 * sign-extension. E.g. for SMAX, TYPERED must be signed,
720 * but TYPERET must be unsigned so that e.g. a 32-bit value
721 * is not sign-extended to the ABI uint64_t return type.
723 /* ??? If we were to vectorize this by hand the reduction ordering
724 * would change. For integer operands, this is perfectly fine.
726 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
727 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
729 intptr_t i, opr_sz = simd_oprsz(desc); \
730 TYPERED ret = INIT; \
731 for (i = 0; i < opr_sz; ) { \
732 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
733 do { \
734 if (pg & 1) { \
735 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
736 ret = OP(ret, nn); \
738 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
739 } while (i & 15); \
741 return (TYPERET)ret; \
744 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
745 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
747 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
748 TYPEE *n = vn; \
749 uint8_t *pg = vg; \
750 TYPER ret = INIT; \
751 for (i = 0; i < opr_sz; i += 1) { \
752 if (pg[H1(i)] & 1) { \
753 TYPEE nn = n[i]; \
754 ret = OP(ret, nn); \
757 return ret; \
760 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
761 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
762 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
763 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
765 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
766 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
767 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
768 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
770 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
771 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
772 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
773 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
775 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
776 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
777 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
779 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
780 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
781 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
782 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
784 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
785 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
786 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
787 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
789 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
790 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
791 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
792 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
794 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
795 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
796 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
797 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
799 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
800 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
801 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
802 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
804 #undef DO_VPZ
805 #undef DO_VPZ_D
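/* As an example of the TYPERED/TYPERET distinction: sve_smaxv_b reduces
 * in int8_t so that the comparisons are signed, but casts the result to
 * uint8_t, so a maximum of -1 is returned as the uint64_t value 0xff
 * (zero-extended), ready to be written back as the scalar result.
 */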
807 /* Two vector operand, one scalar operand, unpredicated. */
808 #define DO_ZZI(NAME, TYPE, OP) \
809 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
811 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
812 TYPE s = s64, *d = vd, *n = vn; \
813 for (i = 0; i < opr_sz; ++i) { \
814 d[i] = OP(n[i], s); \
818 #define DO_SUBR(X, Y) (Y - X)
820 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
821 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
822 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
823 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
825 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
826 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
827 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
828 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
830 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
831 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
832 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
833 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
835 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
836 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
837 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
838 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
840 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
841 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
842 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
843 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
845 #undef DO_ZZI
847 #undef DO_AND
848 #undef DO_ORR
849 #undef DO_EOR
850 #undef DO_BIC
851 #undef DO_ADD
852 #undef DO_SUB
853 #undef DO_MAX
854 #undef DO_MIN
855 #undef DO_ABD
856 #undef DO_MUL
857 #undef DO_DIV
858 #undef DO_ASR
859 #undef DO_LSR
860 #undef DO_LSL
861 #undef DO_SUBR
863 /* Similar to the ARM LastActiveElement pseudocode function, except the
864 result is multiplied by the element size. This includes the not found
865 indication; e.g. not found for esz=3 is -8. */
866 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
868 uint64_t mask = pred_esz_masks[esz];
869 intptr_t i = words;
871 do {
872 uint64_t this_g = g[--i] & mask;
873 if (this_g) {
874 return i * 64 + (63 - clz64(this_g));
876 } while (i > 0);
877 return (intptr_t)-1 << esz;
880 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
882 uint32_t flags = PREDTEST_INIT;
883 uint64_t *d = vd, *g = vg;
884 intptr_t i = 0;
886 do {
887 uint64_t this_d = d[i];
888 uint64_t this_g = g[i];
890 if (this_g) {
891 if (!(flags & 4)) {
892 /* Set in D the first bit of G. */
893 this_d |= this_g & -this_g;
894 d[i] = this_d;
896 flags = iter_predtest_fwd(this_d, this_g, flags);
898 } while (++i < words);
900 return flags;
903 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
905 intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
906 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
907 uint32_t flags = PREDTEST_INIT;
908 uint64_t *d = vd, *g = vg, esz_mask;
909 intptr_t i, next;
911 next = last_active_element(vd, words, esz) + (1 << esz);
912 esz_mask = pred_esz_masks[esz];
914 /* Similar to the pseudocode for pnext, but scaled by ESZ
915 so that we find the correct bit. */
916 if (next < words * 64) {
917 uint64_t mask = -1;
919 if (next & 63) {
920 mask = ~((1ull << (next & 63)) - 1);
921 next &= -64;
923 do {
924 uint64_t this_g = g[next / 64] & esz_mask & mask;
925 if (this_g != 0) {
926 next = (next & -64) + ctz64(this_g);
927 break;
929 next += 64;
930 mask = -1;
931 } while (next < words * 64);
934 i = 0;
935 do {
936 uint64_t this_d = 0;
937 if (i == next / 64) {
938 this_d = 1ull << (next & 63);
940 d[i] = this_d;
941 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
942 } while (++i < words);
944 return flags;
947 /* Store zero into every active element of Zd. We will use this for two
948 * and three-operand predicated instructions for which logic dictates a
949 * zero result. In particular, logical shift by element size, which is
950 * otherwise undefined on the host.
952 * For element sizes smaller than uint64_t, we use tables to expand
953 * the N bits of the controlling predicate to a byte mask, and clear
954 * those bytes.
956 void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
958 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
959 uint64_t *d = vd;
960 uint8_t *pg = vg;
961 for (i = 0; i < opr_sz; i += 1) {
962 d[i] &= ~expand_pred_b(pg[H1(i)]);
966 void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
968 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
969 uint64_t *d = vd;
970 uint8_t *pg = vg;
971 for (i = 0; i < opr_sz; i += 1) {
972 d[i] &= ~expand_pred_h(pg[H1(i)]);
976 void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
978 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
979 uint64_t *d = vd;
980 uint8_t *pg = vg;
981 for (i = 0; i < opr_sz; i += 1) {
982 d[i] &= ~expand_pred_s(pg[H1(i)]);
986 void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
988 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
989 uint64_t *d = vd;
990 uint8_t *pg = vg;
991 for (i = 0; i < opr_sz; i += 1) {
992 if (pg[H1(i)] & 1) {
993 d[i] = 0;
998 /* Copy Zn into Zd, and store zero into inactive elements. */
999 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1001 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1002 uint64_t *d = vd, *n = vn;
1003 uint8_t *pg = vg;
1004 for (i = 0; i < opr_sz; i += 1) {
1005 d[i] = n[i] & expand_pred_b(pg[H1(i)]);
1009 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1011 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1012 uint64_t *d = vd, *n = vn;
1013 uint8_t *pg = vg;
1014 for (i = 0; i < opr_sz; i += 1) {
1015 d[i] = n[i] & expand_pred_h(pg[H1(i)]);
1019 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1021 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1022 uint64_t *d = vd, *n = vn;
1023 uint8_t *pg = vg;
1024 for (i = 0; i < opr_sz; i += 1) {
1025 d[i] = n[i] & expand_pred_s(pg[H1(i)]);
1029 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1031 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1032 uint64_t *d = vd, *n = vn;
1033 uint8_t *pg = vg;
1034 for (i = 0; i < opr_sz; i += 1) {
1035 d[i] = n[i] & -(uint64_t)(pg[H1(i)] & 1);
1039 /* Three-operand expander, immediate operand, controlled by a predicate.
1041 #define DO_ZPZI(NAME, TYPE, H, OP) \
1042 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1044 intptr_t i, opr_sz = simd_oprsz(desc); \
1045 TYPE imm = simd_data(desc); \
1046 for (i = 0; i < opr_sz; ) { \
1047 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1048 do { \
1049 if (pg & 1) { \
1050 TYPE nn = *(TYPE *)(vn + H(i)); \
1051 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1053 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1054 } while (i & 15); \
1058 /* Similarly, specialized for 64-bit operands. */
1059 #define DO_ZPZI_D(NAME, TYPE, OP) \
1060 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1062 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1063 TYPE *d = vd, *n = vn; \
1064 TYPE imm = simd_data(desc); \
1065 uint8_t *pg = vg; \
1066 for (i = 0; i < opr_sz; i += 1) { \
1067 if (pg[H1(i)] & 1) { \
1068 TYPE nn = n[i]; \
1069 d[i] = OP(nn, imm); \
1074 #define DO_SHR(N, M) (N >> M)
1075 #define DO_SHL(N, M) (N << M)
1077 /* Arithmetic shift right for division. This rounds negative numbers
1078 toward zero as per signed division. Therefore before shifting,
1079 when N is negative, add 2**M-1. */
1080 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
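/* E.g. DO_ASRD(-7, 2) computes (-7 + 3) >> 2 == -1, matching -7 / 4
 * rounded toward zero, where a plain arithmetic shift right would
 * give -2.
 */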
1082 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1083 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1084 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1085 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1087 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1088 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1089 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1090 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1092 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1093 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1094 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1095 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1097 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1098 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1099 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1100 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1102 #undef DO_SHR
1103 #undef DO_SHL
1104 #undef DO_ASRD
1105 #undef DO_ZPZI
1106 #undef DO_ZPZI_D
1108 /* Fully general four-operand expander, controlled by a predicate.
1110 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
1111 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1112 void *vg, uint32_t desc) \
1114 intptr_t i, opr_sz = simd_oprsz(desc); \
1115 for (i = 0; i < opr_sz; ) { \
1116 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1117 do { \
1118 if (pg & 1) { \
1119 TYPE nn = *(TYPE *)(vn + H(i)); \
1120 TYPE mm = *(TYPE *)(vm + H(i)); \
1121 TYPE aa = *(TYPE *)(va + H(i)); \
1122 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1124 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1125 } while (i & 15); \
1129 /* Similarly, specialized for 64-bit operands. */
1130 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
1131 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1132 void *vg, uint32_t desc) \
1134 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1135 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1136 uint8_t *pg = vg; \
1137 for (i = 0; i < opr_sz; i += 1) { \
1138 if (pg[H1(i)] & 1) { \
1139 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1140 d[i] = OP(aa, nn, mm); \
1145 #define DO_MLA(A, N, M) (A + N * M)
1146 #define DO_MLS(A, N, M) (A - N * M)
1148 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1149 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1151 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1152 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1154 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1155 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1157 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1158 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1160 #undef DO_MLA
1161 #undef DO_MLS
1162 #undef DO_ZPZZZ
1163 #undef DO_ZPZZZ_D
1165 void HELPER(sve_index_b)(void *vd, uint32_t start,
1166 uint32_t incr, uint32_t desc)
1168 intptr_t i, opr_sz = simd_oprsz(desc);
1169 uint8_t *d = vd;
1170 for (i = 0; i < opr_sz; i += 1) {
1171 d[H1(i)] = start + i * incr;
1175 void HELPER(sve_index_h)(void *vd, uint32_t start,
1176 uint32_t incr, uint32_t desc)
1178 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1179 uint16_t *d = vd;
1180 for (i = 0; i < opr_sz; i += 1) {
1181 d[H2(i)] = start + i * incr;
1185 void HELPER(sve_index_s)(void *vd, uint32_t start,
1186 uint32_t incr, uint32_t desc)
1188 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1189 uint32_t *d = vd;
1190 for (i = 0; i < opr_sz; i += 1) {
1191 d[H4(i)] = start + i * incr;
1195 void HELPER(sve_index_d)(void *vd, uint64_t start,
1196 uint64_t incr, uint32_t desc)
1198 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1199 uint64_t *d = vd;
1200 for (i = 0; i < opr_sz; i += 1) {
1201 d[i] = start + i * incr;
1205 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1207 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1208 uint32_t sh = simd_data(desc);
1209 uint32_t *d = vd, *n = vn, *m = vm;
1210 for (i = 0; i < opr_sz; i += 1) {
1211 d[i] = n[i] + (m[i] << sh);
1215 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1217 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1218 uint64_t sh = simd_data(desc);
1219 uint64_t *d = vd, *n = vn, *m = vm;
1220 for (i = 0; i < opr_sz; i += 1) {
1221 d[i] = n[i] + (m[i] << sh);
1225 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1227 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1228 uint64_t sh = simd_data(desc);
1229 uint64_t *d = vd, *n = vn, *m = vm;
1230 for (i = 0; i < opr_sz; i += 1) {
1231 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1235 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1237 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1238 uint64_t sh = simd_data(desc);
1239 uint64_t *d = vd, *n = vn, *m = vm;
1240 for (i = 0; i < opr_sz; i += 1) {
1241 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1245 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1247 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1248 static const uint16_t coeff[] = {
1249 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1250 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1251 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1252 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1254 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1255 uint16_t *d = vd, *n = vn;
1257 for (i = 0; i < opr_sz; i++) {
1258 uint16_t nn = n[i];
1259 intptr_t idx = extract32(nn, 0, 5);
1260 uint16_t exp = extract32(nn, 5, 5);
1261 d[i] = coeff[idx] | (exp << 10);
1265 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1267 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1268 static const uint32_t coeff[] = {
1269 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1270 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1271 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1272 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1273 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1274 0x1ef532, 0x20b051, 0x227043, 0x243516,
1275 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1276 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1277 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1278 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1279 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1280 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1281 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1282 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1283 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1284 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1286 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1287 uint32_t *d = vd, *n = vn;
1289 for (i = 0; i < opr_sz; i++) {
1290 uint32_t nn = n[i];
1291 intptr_t idx = extract32(nn, 0, 6);
1292 uint32_t exp = extract32(nn, 6, 8);
1293 d[i] = coeff[idx] | (exp << 23);
1297 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1299 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1300 static const uint64_t coeff[] = {
1301 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1302 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1303 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1304 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1305 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1306 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1307 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1308 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1309 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1310 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1311 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1312 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1313 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1314 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1315 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1316 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1317 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1318 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1319 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1320 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1321 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1322 0xFA7C1819E90D8ull,
1324 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1325 uint64_t *d = vd, *n = vn;
1327 for (i = 0; i < opr_sz; i++) {
1328 uint64_t nn = n[i];
1329 intptr_t idx = extract32(nn, 0, 6);
1330 uint64_t exp = extract32(nn, 6, 11);
1331 d[i] = coeff[idx] | (exp << 52);
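/* Worked example for the 32-bit form above (hand-computed): the result
 * is assembled directly from bit fields of the input, with coeff[]
 * supplying the fraction of 2^(idx/64).  For nn = (127 << 6) | 32,
 * i.e. idx == 32 and a biased exponent of 127, the result is
 * 0x3fb504f3 ~= 1.41421 == 2^0.5; with idx == 0 it would be
 * 0x3f800000 == 1.0.
 */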
1335 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1337 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1338 uint16_t *d = vd, *n = vn, *m = vm;
1339 for (i = 0; i < opr_sz; i += 1) {
1340 uint16_t nn = n[i];
1341 uint16_t mm = m[i];
1342 if (mm & 1) {
1343 nn = float16_one;
1345 d[i] = nn ^ (mm & 2) << 14;
1349 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1351 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1352 uint32_t *d = vd, *n = vn, *m = vm;
1353 for (i = 0; i < opr_sz; i += 1) {
1354 uint32_t nn = n[i];
1355 uint32_t mm = m[i];
1356 if (mm & 1) {
1357 nn = float32_one;
1359 d[i] = nn ^ (mm & 2) << 30;
1363 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1365 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1366 uint64_t *d = vd, *n = vn, *m = vm;
1367 for (i = 0; i < opr_sz; i += 1) {
1368 uint64_t nn = n[i];
1369 uint64_t mm = m[i];
1370 if (mm & 1) {
1371 nn = float64_one;
1373 d[i] = nn ^ (mm & 2) << 62;
1378 * Signed saturating addition with scalar operand.
1381 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1383 intptr_t i, oprsz = simd_oprsz(desc);
1385 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1386 int r = *(int8_t *)(a + i) + b;
1387 if (r > INT8_MAX) {
1388 r = INT8_MAX;
1389 } else if (r < INT8_MIN) {
1390 r = INT8_MIN;
1392 *(int8_t *)(d + i) = r;
1396 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1398 intptr_t i, oprsz = simd_oprsz(desc);
1400 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1401 int r = *(int16_t *)(a + i) + b;
1402 if (r > INT16_MAX) {
1403 r = INT16_MAX;
1404 } else if (r < INT16_MIN) {
1405 r = INT16_MIN;
1407 *(int16_t *)(d + i) = r;
1411 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1413 intptr_t i, oprsz = simd_oprsz(desc);
1415 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1416 int64_t r = *(int32_t *)(a + i) + b;
1417 if (r > INT32_MAX) {
1418 r = INT32_MAX;
1419 } else if (r < INT32_MIN) {
1420 r = INT32_MIN;
1422 *(int32_t *)(d + i) = r;
1426 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1428 intptr_t i, oprsz = simd_oprsz(desc);
1430 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1431 int64_t ai = *(int64_t *)(a + i);
1432 int64_t r = ai + b;
1433 if (((r ^ ai) & ~(ai ^ b)) < 0) {
1434 /* Signed overflow. */
1435 r = (r < 0 ? INT64_MAX : INT64_MIN);
1437 *(int64_t *)(d + i) = r;
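/* The expression ((r ^ ai) & ~(ai ^ b)) < 0 is the usual "same-signed
 * operands, differently-signed result" overflow test: e.g. with
 * ai == INT64_MAX and b == 1 the sum wraps to INT64_MIN, the expression
 * is negative, and the result saturates to INT64_MAX.
 */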
1442 * Unsigned saturating addition with scalar operand.
1445 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1447 intptr_t i, oprsz = simd_oprsz(desc);
1449 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1450 int r = *(uint8_t *)(a + i) + b;
1451 if (r > UINT8_MAX) {
1452 r = UINT8_MAX;
1453 } else if (r < 0) {
1454 r = 0;
1456 *(uint8_t *)(d + i) = r;
1460 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1462 intptr_t i, oprsz = simd_oprsz(desc);
1464 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1465 int r = *(uint16_t *)(a + i) + b;
1466 if (r > UINT16_MAX) {
1467 r = UINT16_MAX;
1468 } else if (r < 0) {
1469 r = 0;
1471 *(uint16_t *)(d + i) = r;
1475 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1477 intptr_t i, oprsz = simd_oprsz(desc);
1479 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1480 int64_t r = *(uint32_t *)(a + i) + b;
1481 if (r > UINT32_MAX) {
1482 r = UINT32_MAX;
1483 } else if (r < 0) {
1484 r = 0;
1486 *(uint32_t *)(d + i) = r;
1490 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1492 intptr_t i, oprsz = simd_oprsz(desc);
1494 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1495 uint64_t r = *(uint64_t *)(a + i) + b;
1496 if (r < b) {
1497 r = UINT64_MAX;
1499 *(uint64_t *)(d + i) = r;
1503 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1505 intptr_t i, oprsz = simd_oprsz(desc);
1507 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1508 uint64_t ai = *(uint64_t *)(a + i);
1509 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1513 /* Two operand predicated copy immediate with merge. All valid immediates
1514 * can fit within 17 signed bits in the simd_data field.
1516 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1517 uint64_t mm, uint32_t desc)
1519 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1520 uint64_t *d = vd, *n = vn;
1521 uint8_t *pg = vg;
1523 mm = dup_const(MO_8, mm);
1524 for (i = 0; i < opr_sz; i += 1) {
1525 uint64_t nn = n[i];
1526 uint64_t pp = expand_pred_b(pg[H1(i)]);
1527 d[i] = (mm & pp) | (nn & ~pp);
1531 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1532 uint64_t mm, uint32_t desc)
1534 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1535 uint64_t *d = vd, *n = vn;
1536 uint8_t *pg = vg;
1538 mm = dup_const(MO_16, mm);
1539 for (i = 0; i < opr_sz; i += 1) {
1540 uint64_t nn = n[i];
1541 uint64_t pp = expand_pred_h(pg[H1(i)]);
1542 d[i] = (mm & pp) | (nn & ~pp);
1546 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1547 uint64_t mm, uint32_t desc)
1549 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1550 uint64_t *d = vd, *n = vn;
1551 uint8_t *pg = vg;
1553 mm = dup_const(MO_32, mm);
1554 for (i = 0; i < opr_sz; i += 1) {
1555 uint64_t nn = n[i];
1556 uint64_t pp = expand_pred_s(pg[H1(i)]);
1557 d[i] = (mm & pp) | (nn & ~pp);
1561 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1562 uint64_t mm, uint32_t desc)
1564 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1565 uint64_t *d = vd, *n = vn;
1566 uint8_t *pg = vg;
1568 for (i = 0; i < opr_sz; i += 1) {
1569 uint64_t nn = n[i];
1570 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1574 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1576 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1577 uint64_t *d = vd;
1578 uint8_t *pg = vg;
1580 val = dup_const(MO_8, val);
1581 for (i = 0; i < opr_sz; i += 1) {
1582 d[i] = val & expand_pred_b(pg[H1(i)]);
1586 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1588 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1589 uint64_t *d = vd;
1590 uint8_t *pg = vg;
1592 val = dup_const(MO_16, val);
1593 for (i = 0; i < opr_sz; i += 1) {
1594 d[i] = val & expand_pred_h(pg[H1(i)]);
1598 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1600 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1601 uint64_t *d = vd;
1602 uint8_t *pg = vg;
1604 val = dup_const(MO_32, val);
1605 for (i = 0; i < opr_sz; i += 1) {
1606 d[i] = val & expand_pred_s(pg[H1(i)]);
1610 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1612 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1613 uint64_t *d = vd;
1614 uint8_t *pg = vg;
1616 for (i = 0; i < opr_sz; i += 1) {
1617 d[i] = (pg[H1(i)] & 1 ? val : 0);
1621 /* Big-endian hosts need to frob the byte indices. If the copy
1622 * happens to be 8-byte aligned, then no frobbing necessary.
1624 static void swap_memmove(void *vd, void *vs, size_t n)
1626 uintptr_t d = (uintptr_t)vd;
1627 uintptr_t s = (uintptr_t)vs;
1628 uintptr_t o = (d | s | n) & 7;
1629 size_t i;
1631 #ifndef HOST_WORDS_BIGENDIAN
1632 o = 0;
1633 #endif
1634 switch (o) {
1635 case 0:
1636 memmove(vd, vs, n);
1637 break;
1639 case 4:
1640 if (d < s || d >= s + n) {
1641 for (i = 0; i < n; i += 4) {
1642 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1644 } else {
1645 for (i = n; i > 0; ) {
1646 i -= 4;
1647 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1650 break;
1652 case 2:
1653 case 6:
1654 if (d < s || d >= s + n) {
1655 for (i = 0; i < n; i += 2) {
1656 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1658 } else {
1659 for (i = n; i > 0; ) {
1660 i -= 2;
1661 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1664 break;
1666 default:
1667 if (d < s || d >= s + n) {
1668 for (i = 0; i < n; i++) {
1669 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1671 } else {
1672 for (i = n; i > 0; ) {
1673 i -= 1;
1674 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1677 break;
1681 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1683 intptr_t opr_sz = simd_oprsz(desc);
1684 size_t n_ofs = simd_data(desc);
1685 size_t n_siz = opr_sz - n_ofs;
1687 if (vd != vm) {
1688 swap_memmove(vd, vn + n_ofs, n_siz);
1689 swap_memmove(vd + n_siz, vm, n_ofs);
1690 } else if (vd != vn) {
1691 swap_memmove(vd + n_siz, vd, n_ofs);
1692 swap_memmove(vd, vn + n_ofs, n_siz);
1693 } else {
1694 /* vd == vn == vm. Need temp space. */
1695 ARMVectorReg tmp;
1696 swap_memmove(&tmp, vm, n_ofs);
1697 swap_memmove(vd, vd + n_ofs, n_siz);
1698 memcpy(vd + n_siz, &tmp, n_ofs);
1702 #define DO_INSR(NAME, TYPE, H) \
1703 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1705 intptr_t opr_sz = simd_oprsz(desc); \
1706 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1707 *(TYPE *)(vd + H(0)) = val; \
1710 DO_INSR(sve_insr_b, uint8_t, H1)
1711 DO_INSR(sve_insr_h, uint16_t, H1_2)
1712 DO_INSR(sve_insr_s, uint32_t, H1_4)
1713 DO_INSR(sve_insr_d, uint64_t, )
1715 #undef DO_INSR
1717 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1719 intptr_t i, j, opr_sz = simd_oprsz(desc);
1720 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1721 uint64_t f = *(uint64_t *)(vn + i);
1722 uint64_t b = *(uint64_t *)(vn + j);
1723 *(uint64_t *)(vd + i) = bswap64(b);
1724 *(uint64_t *)(vd + j) = bswap64(f);
1728 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1730 intptr_t i, j, opr_sz = simd_oprsz(desc);
1731 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1732 uint64_t f = *(uint64_t *)(vn + i);
1733 uint64_t b = *(uint64_t *)(vn + j);
1734 *(uint64_t *)(vd + i) = hswap64(b);
1735 *(uint64_t *)(vd + j) = hswap64(f);
1739 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1741 intptr_t i, j, opr_sz = simd_oprsz(desc);
1742 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1743 uint64_t f = *(uint64_t *)(vn + i);
1744 uint64_t b = *(uint64_t *)(vn + j);
1745 *(uint64_t *)(vd + i) = rol64(b, 32);
1746 *(uint64_t *)(vd + j) = rol64(f, 32);
1750 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1752 intptr_t i, j, opr_sz = simd_oprsz(desc);
1753 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1754 uint64_t f = *(uint64_t *)(vn + i);
1755 uint64_t b = *(uint64_t *)(vn + j);
1756 *(uint64_t *)(vd + i) = b;
1757 *(uint64_t *)(vd + j) = f;
1761 #define DO_TBL(NAME, TYPE, H) \
1762 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1764 intptr_t i, opr_sz = simd_oprsz(desc); \
1765 uintptr_t elem = opr_sz / sizeof(TYPE); \
1766 TYPE *d = vd, *n = vn, *m = vm; \
1767 ARMVectorReg tmp; \
1768 if (unlikely(vd == vn)) { \
1769 n = memcpy(&tmp, vn, opr_sz); \
1771 for (i = 0; i < elem; i++) { \
1772 TYPE j = m[H(i)]; \
1773 d[H(i)] = j < elem ? n[H(j)] : 0; \
1777 DO_TBL(sve_tbl_b, uint8_t, H1)
1778 DO_TBL(sve_tbl_h, uint16_t, H2)
1779 DO_TBL(sve_tbl_s, uint32_t, H4)
1780 DO_TBL(sve_tbl_d, uint64_t, )
1782 #undef DO_TBL
1784 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1785 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1787 intptr_t i, opr_sz = simd_oprsz(desc); \
1788 TYPED *d = vd; \
1789 TYPES *n = vn; \
1790 ARMVectorReg tmp; \
1791 if (unlikely(vn - vd < opr_sz)) { \
1792 n = memcpy(&tmp, n, opr_sz / 2); \
1794 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1795 d[HD(i)] = n[HS(i)]; \
1799 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1800 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1801 DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1803 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1804 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1805 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1807 #undef DO_UNPK
1809 /* Mask of bits included in the even numbered predicates of width esz.
1810 * We also use this for expand_bits/compress_bits, and so extend the
1811 * same pattern out to 16-bit units.
1813 static const uint64_t even_bit_esz_masks[5] = {
1814 0x5555555555555555ull,
1815 0x3333333333333333ull,
1816 0x0f0f0f0f0f0f0f0full,
1817 0x00ff00ff00ff00ffull,
1818 0x0000ffff0000ffffull,
1821 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1822 * For N==0, this corresponds to the operation that in qemu/bitops.h
1823 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1824 * section 7-2 Shuffling Bits.
1826 static uint64_t expand_bits(uint64_t x, int n)
1828 int i;
1830 x &= 0xffffffffu;
1831 for (i = 4; i >= n; i--) {
1832 int sh = 1 << i;
1833 x = ((x << sh) | x) & even_bit_esz_masks[i];
1835 return x;
1838 /* Compress units of 2**(N+1) bits to units of 2**N bits.
1839 * For N==0, this corresponds to the operation that in qemu/bitops.h
1840 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1841 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1843 static uint64_t compress_bits(uint64_t x, int n)
1845 int i;
1847 for (i = n; i <= 4; i++) {
1848 int sh = 1 << i;
1849 x &= even_bit_esz_masks[i];
1850 x = (x >> sh) | x;
1852 return x & 0xffffffffu;
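/* This undoes expand_bits: the mask keeps only the low half of each
 * 2**(n+1)-bit unit and the shift-or folds the units back together,
 * e.g. compress_bits(0b1000101, 0) == 0b1011.
 */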
1855 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1857 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1858 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1859 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1860 uint64_t *d = vd;
1861 intptr_t i;
1863 if (oprsz <= 8) {
1864 uint64_t nn = *(uint64_t *)vn;
1865 uint64_t mm = *(uint64_t *)vm;
1866 int half = 4 * oprsz;
1868 nn = extract64(nn, high * half, half);
1869 mm = extract64(mm, high * half, half);
1870 nn = expand_bits(nn, esz);
1871 mm = expand_bits(mm, esz);
1872 d[0] = nn + (mm << (1 << esz));
1873 } else {
1874 ARMPredicateReg tmp_n, tmp_m;
1876 /* We produce output faster than we consume input.
1877 Therefore we must be mindful of possible overlap. */
1878 if ((vn - vd) < (uintptr_t)oprsz) {
1879 vn = memcpy(&tmp_n, vn, oprsz);
1881 if ((vm - vd) < (uintptr_t)oprsz) {
1882 vm = memcpy(&tmp_m, vm, oprsz);
1884 if (high) {
1885 high = oprsz >> 1;
1888 if ((high & 3) == 0) {
1889 uint32_t *n = vn, *m = vm;
1890 high >>= 2;
1892 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1893 uint64_t nn = n[H4(high + i)];
1894 uint64_t mm = m[H4(high + i)];
1896 nn = expand_bits(nn, esz);
1897 mm = expand_bits(mm, esz);
1898 d[i] = nn + (mm << (1 << esz));
1900 } else {
1901 uint8_t *n = vn, *m = vm;
1902 uint16_t *d16 = vd;
1904 for (i = 0; i < oprsz / 2; i++) {
1905 uint16_t nn = n[H1(high + i)];
1906 uint16_t mm = m[H1(high + i)];
1908 nn = expand_bits(nn, esz);
1909 mm = expand_bits(mm, esz);
1910 d16[H2(i)] = nn + (mm << (1 << esz));
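/* To illustrate the oprsz <= 8 path with byte predicates (esz = 0):
 * nn = 0b0011 (elements 0 and 1 of Pn active) and mm = 0b0101
 * (elements 0 and 2 of Pm active) expand to 0b0101 and 0b010001, so
 * d[0] = 0b0101 + (0b010001 << 1) = 0b100111, i.e. the interleaved
 * sequence n0,m0,n1,m1,n2,m2 = 1,1,1,0,0,1 that ZIP requires.
 */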
1916 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1918 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1919 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1920 int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1921 uint64_t *d = vd, *n = vn, *m = vm;
1922 uint64_t l, h;
1923 intptr_t i;
1925 if (oprsz <= 8) {
1926 l = compress_bits(n[0] >> odd, esz);
1927 h = compress_bits(m[0] >> odd, esz);
1928 d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1929 } else {
1930 ARMPredicateReg tmp_m;
1931 intptr_t oprsz_16 = oprsz / 16;
1933 if ((vm - vd) < (uintptr_t)oprsz) {
1934 m = memcpy(&tmp_m, vm, oprsz);
1937 for (i = 0; i < oprsz_16; i++) {
1938 l = n[2 * i + 0];
1939 h = n[2 * i + 1];
1940 l = compress_bits(l >> odd, esz);
1941 h = compress_bits(h >> odd, esz);
1942 d[i] = l + (h << 32);
1945 /* For VL which is not a power of 2, the results from M do not
1946 align nicely with the uint64_t for D. Put the aligned results
1947 from M into TMP_M and then copy it into place afterward. */
1948 if (oprsz & 15) {
1949 d[i] = compress_bits(n[2 * i] >> odd, esz);
1951 for (i = 0; i < oprsz_16; i++) {
1952 l = m[2 * i + 0];
1953 h = m[2 * i + 1];
1954 l = compress_bits(l >> odd, esz);
1955 h = compress_bits(h >> odd, esz);
1956 tmp_m.p[i] = l + (h << 32);
1958 tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
1960 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
1961 } else {
1962 for (i = 0; i < oprsz_16; i++) {
1963 l = m[2 * i + 0];
1964 h = m[2 * i + 1];
1965 l = compress_bits(l >> odd, esz);
1966 h = compress_bits(h >> odd, esz);
1967 d[oprsz_16 + i] = l + (h << 32);
1973 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1975 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1976 uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1977 bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1978 uint64_t *d = vd, *n = vn, *m = vm;
1979 uint64_t mask;
1980 int shr, shl;
1981 intptr_t i;
1983 shl = 1 << esz;
1984 shr = 0;
1985 mask = even_bit_esz_masks[esz];
1986 if (odd) {
1987 mask <<= shl;
1988 shr = shl;
1989 shl = 0;
1992 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1993 uint64_t nn = (n[i] & mask) >> shr;
1994 uint64_t mm = (m[i] & mask) << shl;
1995 d[i] = nn + mm;
1999 /* Reverse units of 2**N bits. */
2000 static uint64_t reverse_bits_64(uint64_t x, int n)
2002 int i, sh;
2004 x = bswap64(x);
2005 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2006 uint64_t mask = even_bit_esz_masks[i];
2007 x = ((x & mask) << sh) | ((x >> sh) & mask);
2009 return x;
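/* For n == 3 (byte-sized units) the loop body never executes and this
 * reduces to bswap64 alone; for n == 0 the byte swap followed by the
 * nibble, 2-bit and 1-bit exchanges amounts to a full 64-bit reversal.
 */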
2012 static uint8_t reverse_bits_8(uint8_t x, int n)
2014 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2015 int i, sh;
2017 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2018 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2020 return x;
2023 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2025 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2026 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2027 intptr_t i, oprsz_2 = oprsz / 2;
2029 if (oprsz <= 8) {
2030 uint64_t l = *(uint64_t *)vn;
2031 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2032 *(uint64_t *)vd = l;
2033 } else if ((oprsz & 15) == 0) {
2034 for (i = 0; i < oprsz_2; i += 8) {
2035 intptr_t ih = oprsz - 8 - i;
2036 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2037 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2038 *(uint64_t *)(vd + i) = h;
2039 *(uint64_t *)(vd + ih) = l;
2041 } else {
2042 for (i = 0; i < oprsz_2; i += 1) {
2043 intptr_t il = H1(i);
2044 intptr_t ih = H1(oprsz - 1 - i);
2045 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2046 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2047 *(uint8_t *)(vd + il) = h;
2048 *(uint8_t *)(vd + ih) = l;
2053 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2055 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2056 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2057 uint64_t *d = vd;
2058 intptr_t i;
2060 if (oprsz <= 8) {
2061 uint64_t nn = *(uint64_t *)vn;
2062 int half = 4 * oprsz;
2064 nn = extract64(nn, high * half, half);
2065 nn = expand_bits(nn, 0);
2066 d[0] = nn;
2067 } else {
2068 ARMPredicateReg tmp_n;
2070 /* We produce output faster than we consume input.
2071 Therefore we must be mindful of possible overlap. */
2072 if ((vn - vd) < (uintptr_t)oprsz) {
2073 vn = memcpy(&tmp_n, vn, oprsz);
2075 if (high) {
2076 high = oprsz >> 1;
2079 if ((high & 3) == 0) {
2080 uint32_t *n = vn;
2081 high >>= 2;
2083 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2084 uint64_t nn = n[H4(high + i)];
2085 d[i] = expand_bits(nn, 0);
2087 } else {
2088 uint16_t *d16 = vd;
2089 uint8_t *n = vn;
2091 for (i = 0; i < oprsz / 2; i++) {
2092 uint16_t nn = n[H1(high + i)];
2093 d16[H2(i)] = expand_bits(nn, 0);
2099 #define DO_ZIP(NAME, TYPE, H) \
2100 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2102 intptr_t oprsz = simd_oprsz(desc); \
2103 intptr_t i, oprsz_2 = oprsz / 2; \
2104 ARMVectorReg tmp_n, tmp_m; \
2105 /* We produce output faster than we consume input. \
2106 Therefore we must be mindful of possible overlap. */ \
2107 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2108 vn = memcpy(&tmp_n, vn, oprsz_2); \
2110 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2111 vm = memcpy(&tmp_m, vm, oprsz_2); \
2113 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2114 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2115 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2119 DO_ZIP(sve_zip_b, uint8_t, H1)
2120 DO_ZIP(sve_zip_h, uint16_t, H1_2)
2121 DO_ZIP(sve_zip_s, uint32_t, H1_4)
2122 DO_ZIP(sve_zip_d, uint64_t, )
2124 #define DO_UZP(NAME, TYPE, H) \
2125 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2127 intptr_t oprsz = simd_oprsz(desc); \
2128 intptr_t oprsz_2 = oprsz / 2; \
2129 intptr_t odd_ofs = simd_data(desc); \
2130 intptr_t i; \
2131 ARMVectorReg tmp_m; \
2132 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2133 vm = memcpy(&tmp_m, vm, oprsz); \
2135 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2136 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2138 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2139 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2143 DO_UZP(sve_uzp_b, uint8_t, H1)
2144 DO_UZP(sve_uzp_h, uint16_t, H1_2)
2145 DO_UZP(sve_uzp_s, uint32_t, H1_4)
2146 DO_UZP(sve_uzp_d, uint64_t, )
2148 #define DO_TRN(NAME, TYPE, H) \
2149 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2151 intptr_t oprsz = simd_oprsz(desc); \
2152 intptr_t odd_ofs = simd_data(desc); \
2153 intptr_t i; \
2154 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2155 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2156 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2157 *(TYPE *)(vd + H(i + 0)) = ae; \
2158 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2162 DO_TRN(sve_trn_b, uint8_t, H1)
2163 DO_TRN(sve_trn_h, uint16_t, H1_2)
2164 DO_TRN(sve_trn_s, uint32_t, H1_4)
2165 DO_TRN(sve_trn_d, uint64_t, )
2167 #undef DO_ZIP
2168 #undef DO_UZP
2169 #undef DO_TRN
2171 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2173 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2174 uint32_t *d = vd, *n = vn;
2175 uint8_t *pg = vg;
2177 for (i = j = 0; i < opr_sz; i++) {
2178 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2179 d[H4(j)] = n[H4(i)];
2180 j++;
2183 for (; j < opr_sz; j++) {
2184 d[H4(j)] = 0;
2188 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2190 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2191 uint64_t *d = vd, *n = vn;
2192 uint8_t *pg = vg;
2194 for (i = j = 0; i < opr_sz; i++) {
2195 if (pg[H1(i)] & 1) {
2196 d[j] = n[i];
2197 j++;
2200 for (; j < opr_sz; j++) {
2201 d[j] = 0;
2205 /* Similar to the ARM LastActiveElement pseudocode function, except the
2206 * result is multiplied by the element size. This includes the not found
2207 * indication; e.g. not found for esz=3 is -8.
2209 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2211 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2212 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2214 return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
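/* For example, with esz = 2 (word elements) and the last active element
 * at index 3 this returns 3 << 2 = 12; with no active elements it
 * returns -(1 << 2) = -4, matching the "not found" convention above.
 */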
2217 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2219 intptr_t opr_sz = simd_oprsz(desc) / 8;
2220 int esz = simd_data(desc);
2221 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2222 intptr_t i, first_i, last_i;
2223 ARMVectorReg tmp;
2225 first_i = last_i = 0;
2226 first_g = last_g = 0;
2228 /* Find the extent of the active elements within VG. */
2229 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2230 pg = *(uint64_t *)(vg + i) & mask;
2231 if (pg) {
2232 if (last_g == 0) {
2233 last_g = pg;
2234 last_i = i;
2236 first_g = pg;
2237 first_i = i;
2241 len = 0;
2242 if (first_g != 0) {
2243 first_i = first_i * 8 + ctz64(first_g);
2244 last_i = last_i * 8 + 63 - clz64(last_g);
2245 len = last_i - first_i + (1 << esz);
2246 if (vd == vm) {
2247 vm = memcpy(&tmp, vm, opr_sz * 8);
2249 swap_memmove(vd, vn + first_i, len);
2251 swap_memmove(vd + len, vm, opr_sz * 8 - len);
2254 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2255 void *vg, uint32_t desc)
2257 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2258 uint64_t *d = vd, *n = vn, *m = vm;
2259 uint8_t *pg = vg;
2261 for (i = 0; i < opr_sz; i += 1) {
2262 uint64_t nn = n[i], mm = m[i];
2263 uint64_t pp = expand_pred_b(pg[H1(i)]);
2264 d[i] = (nn & pp) | (mm & ~pp);
2268 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2269 void *vg, uint32_t desc)
2271 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2272 uint64_t *d = vd, *n = vn, *m = vm;
2273 uint8_t *pg = vg;
2275 for (i = 0; i < opr_sz; i += 1) {
2276 uint64_t nn = n[i], mm = m[i];
2277 uint64_t pp = expand_pred_h(pg[H1(i)]);
2278 d[i] = (nn & pp) | (mm & ~pp);
2282 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2283 void *vg, uint32_t desc)
2285 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2286 uint64_t *d = vd, *n = vn, *m = vm;
2287 uint8_t *pg = vg;
2289 for (i = 0; i < opr_sz; i += 1) {
2290 uint64_t nn = n[i], mm = m[i];
2291 uint64_t pp = expand_pred_s(pg[H1(i)]);
2292 d[i] = (nn & pp) | (mm & ~pp);
2296 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2297 void *vg, uint32_t desc)
2299 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2300 uint64_t *d = vd, *n = vn, *m = vm;
2301 uint8_t *pg = vg;
2303 for (i = 0; i < opr_sz; i += 1) {
2304 uint64_t nn = n[i], mm = m[i];
2305 d[i] = (pg[H1(i)] & 1 ? nn : mm);
2309 /* Two operand comparison controlled by a predicate.
2310 * ??? It is very tempting to want to be able to expand this inline
2311 * with x86 instructions, e.g.
2313 * vcmpeqw zm, zn, %ymm0
2314 * vpmovmskb %ymm0, %eax
2315 * and $0x5555, %eax
2316 * and pg, %eax
2318 * or even aarch64, e.g.
2320 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2321 * cmeq v0.8h, zn, zm
2322 * and v0.8h, v0.8h, mask
2323 * addv h0, v0.8h
2324 * and v0.8b, pg
2326 * However, coming up with an abstraction that allows vector inputs and
2327 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2328 * scalar outputs, is tricky.
2330 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
2331 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2333 intptr_t opr_sz = simd_oprsz(desc); \
2334 uint32_t flags = PREDTEST_INIT; \
2335 intptr_t i = opr_sz; \
2336 do { \
2337 uint64_t out = 0, pg; \
2338 do { \
2339 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2340 TYPE nn = *(TYPE *)(vn + H(i)); \
2341 TYPE mm = *(TYPE *)(vm + H(i)); \
2342 out |= nn OP mm; \
2343 } while (i & 63); \
2344 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2345 out &= pg; \
2346 *(uint64_t *)(vd + (i >> 3)) = out; \
2347 flags = iter_predtest_bwd(out, pg, flags); \
2348 } while (i > 0); \
2349 return flags; \
2352 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2353 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2354 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2355 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2356 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2357 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2358 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2359 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
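/* To illustrate the packing in DO_CMP_PPZZ: each element contributes one
 * result bit at the bit position equal to its byte offset, and MASK
 * clears the remaining positions covered by the element.  With 32-bit
 * elements, for example, a true result for element k sets bit 4*k, so
 * elements 0 and 2 true (and the rest false) gives OUT = 0x101 before
 * it is ANDed with the governing predicate.
 */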
2361 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
2362 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2363 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2364 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2366 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
2367 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2368 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2369 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2371 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
2372 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2373 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2374 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2376 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
2377 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2378 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2379 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2381 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
2382 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2383 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2384 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2386 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
2387 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2388 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2389 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2391 #undef DO_CMP_PPZZ_B
2392 #undef DO_CMP_PPZZ_H
2393 #undef DO_CMP_PPZZ_S
2394 #undef DO_CMP_PPZZ_D
2395 #undef DO_CMP_PPZZ
2397 /* Similar, but the second source is "wide". */
2398 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2399 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2401 intptr_t opr_sz = simd_oprsz(desc); \
2402 uint32_t flags = PREDTEST_INIT; \
2403 intptr_t i = opr_sz; \
2404 do { \
2405 uint64_t out = 0, pg; \
2406 do { \
2407 TYPEW mm = *(TYPEW *)(vm + i - 8); \
2408 do { \
2409 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2410 TYPE nn = *(TYPE *)(vn + H(i)); \
2411 out |= nn OP mm; \
2412 } while (i & 7); \
2413 } while (i & 63); \
2414 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2415 out &= pg; \
2416 *(uint64_t *)(vd + (i >> 3)) = out; \
2417 flags = iter_predtest_bwd(out, pg, flags); \
2418 } while (i > 0); \
2419 return flags; \
2422 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2423 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
2424 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2425 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2426 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2427 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2429 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, uint8_t, uint64_t, ==)
2430 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, uint16_t, uint64_t, ==)
2431 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, uint32_t, uint64_t, ==)
2433 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, uint8_t, uint64_t, !=)
2434 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, uint16_t, uint64_t, !=)
2435 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, uint32_t, uint64_t, !=)
2437 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
2438 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
2439 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
2441 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
2442 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
2443 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
2445 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
2446 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2447 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2449 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
2450 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2451 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2453 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
2454 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
2455 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
2457 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
2458 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
2459 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
2461 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
2462 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2463 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2465 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
2466 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2467 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2469 #undef DO_CMP_PPZW_B
2470 #undef DO_CMP_PPZW_H
2471 #undef DO_CMP_PPZW_S
2472 #undef DO_CMP_PPZW
2474 /* Similar, but the second source is immediate. */
2475 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
2476 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2478 intptr_t opr_sz = simd_oprsz(desc); \
2479 uint32_t flags = PREDTEST_INIT; \
2480 TYPE mm = simd_data(desc); \
2481 intptr_t i = opr_sz; \
2482 do { \
2483 uint64_t out = 0, pg; \
2484 do { \
2485 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2486 TYPE nn = *(TYPE *)(vn + H(i)); \
2487 out |= nn OP mm; \
2488 } while (i & 63); \
2489 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2490 out &= pg; \
2491 *(uint64_t *)(vd + (i >> 3)) = out; \
2492 flags = iter_predtest_bwd(out, pg, flags); \
2493 } while (i > 0); \
2494 return flags; \
2497 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2498 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2499 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2500 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2501 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2502 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2503 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2504 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
2506 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
2507 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
2508 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
2509 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
2511 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
2512 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
2513 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
2514 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
2516 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
2517 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
2518 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
2519 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
2521 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
2522 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
2523 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
2524 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
2526 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
2527 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
2528 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
2529 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
2531 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
2532 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
2533 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
2534 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
2536 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
2537 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
2538 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
2539 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
2541 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
2542 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
2543 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
2544 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
2546 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
2547 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
2548 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
2549 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
2551 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
2552 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
2553 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
2554 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
2556 #undef DO_CMP_PPZI_B
2557 #undef DO_CMP_PPZI_H
2558 #undef DO_CMP_PPZI_S
2559 #undef DO_CMP_PPZI_D
2560 #undef DO_CMP_PPZI
2562 /* Similar to the ARM LastActive pseudocode function. */
2563 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
2565 intptr_t i;
2567 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
2568 uint64_t pg = *(uint64_t *)(vg + i);
2569 if (pg) {
2570 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
2573 return 0;
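/* Note that pow2floor(pg) isolates the most significant set bit of the
 * guard, i.e. the last active guard position within this 64-bit chunk,
 * so the test above asks whether that same position is set in VD.
 */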
2576 /* Compute a mask into RETB that is true for all G, up to and including
2577 * (if after) or excluding (if !after) the first G & N.
2578 * Return true if BRK found.
2580 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
2581 bool brk, bool after)
2583 uint64_t b;
2585 if (brk) {
2586 b = 0;
2587 } else if ((g & n) == 0) {
2588 /* For all G, no N are set; break not found. */
2589 b = g;
2590 } else {
2591 /* Break somewhere in N. Locate it. */
2592 b = g & n; /* guard true, pred true */
2593 b = b & -b; /* first such */
2594 if (after) {
2595 b = b | (b - 1); /* break after same */
2596 } else {
2597 b = b - 1; /* break before same */
2599 brk = true;
2602 *retb = b;
2603 return brk;
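/* Worked example: with g = 0b1111 and n = 0b0100 the first active break
 * condition is at element 2, so b = 0b0111 when AFTER (the breaking
 * element is kept) and b = 0b0011 when !AFTER (it is excluded); if BRK
 * was already set on entry, b = 0 regardless.
 */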
2606 /* Compute a zeroing BRK. */
2607 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
2608 intptr_t oprsz, bool after)
2610 bool brk = false;
2611 intptr_t i;
2613 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2614 uint64_t this_b, this_g = g[i];
2616 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2617 d[i] = this_b & this_g;
2621 /* Likewise, but also compute flags. */
2622 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2623 intptr_t oprsz, bool after)
2625 uint32_t flags = PREDTEST_INIT;
2626 bool brk = false;
2627 intptr_t i;
2629 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2630 uint64_t this_b, this_d, this_g = g[i];
2632 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2633 d[i] = this_d = this_b & this_g;
2634 flags = iter_predtest_fwd(this_d, this_g, flags);
2636 return flags;
2639 /* Compute a merging BRK. */
2640 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
2641 intptr_t oprsz, bool after)
2643 bool brk = false;
2644 intptr_t i;
2646 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2647 uint64_t this_b, this_g = g[i];
2649 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2650 d[i] = (this_b & this_g) | (d[i] & ~this_g);
2654 /* Likewise, but also compute flags. */
2655 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2656 intptr_t oprsz, bool after)
2658 uint32_t flags = PREDTEST_INIT;
2659 bool brk = false;
2660 intptr_t i;
2662 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2663 uint64_t this_b, this_d = d[i], this_g = g[i];
2665 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2666 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2667 flags = iter_predtest_fwd(this_d, this_g, flags);
2669 return flags;
2672 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2674 /* It is quicker to zero the whole predicate than loop on OPRSZ.
2675 * The compiler should turn this into 4 64-bit integer stores.
2677 memset(d, 0, sizeof(ARMPredicateReg));
2678 return PREDTEST_INIT;
2681 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2682 uint32_t pred_desc)
2684 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2685 if (last_active_pred(vn, vg, oprsz)) {
2686 compute_brk_z(vd, vm, vg, oprsz, true);
2687 } else {
2688 do_zero(vd, oprsz);
2692 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2693 uint32_t pred_desc)
2695 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2696 if (last_active_pred(vn, vg, oprsz)) {
2697 return compute_brks_z(vd, vm, vg, oprsz, true);
2698 } else {
2699 return do_zero(vd, oprsz);
2703 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2704 uint32_t pred_desc)
2706 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2707 if (last_active_pred(vn, vg, oprsz)) {
2708 compute_brk_z(vd, vm, vg, oprsz, false);
2709 } else {
2710 do_zero(vd, oprsz);
2714 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2715 uint32_t pred_desc)
2717 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2718 if (last_active_pred(vn, vg, oprsz)) {
2719 return compute_brks_z(vd, vm, vg, oprsz, false);
2720 } else {
2721 return do_zero(vd, oprsz);
2725 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2727 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2728 compute_brk_z(vd, vn, vg, oprsz, true);
2731 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2733 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2734 return compute_brks_z(vd, vn, vg, oprsz, true);
2737 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2739 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2740 compute_brk_z(vd, vn, vg, oprsz, false);
2743 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2745 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2746 return compute_brks_z(vd, vn, vg, oprsz, false);
2749 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2751 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2752 compute_brk_m(vd, vn, vg, oprsz, true);
2755 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2757 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2758 return compute_brks_m(vd, vn, vg, oprsz, true);
2761 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2763 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2764 compute_brk_m(vd, vn, vg, oprsz, false);
2767 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2769 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2770 return compute_brks_m(vd, vn, vg, oprsz, false);
2773 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2775 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2777 if (!last_active_pred(vn, vg, oprsz)) {
2778 do_zero(vd, oprsz);
2782 /* As if PredTest(Ones(PL), D, esz). */
2783 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2784 uint64_t esz_mask)
2786 uint32_t flags = PREDTEST_INIT;
2787 intptr_t i;
2789 for (i = 0; i < oprsz / 8; i++) {
2790 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2792 if (oprsz & 7) {
2793 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2794 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2796 return flags;
2799 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2801 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2803 if (last_active_pred(vn, vg, oprsz)) {
2804 return predtest_ones(vd, oprsz, -1);
2805 } else {
2806 return do_zero(vd, oprsz);
2810 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2812 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2813 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2814 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2815 intptr_t i;
2817 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2818 uint64_t t = n[i] & g[i] & mask;
2819 sum += ctpop64(t);
2821 return sum;
2824 uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2826 uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2827 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2828 uint64_t esz_mask = pred_esz_masks[esz];
2829 ARMPredicateReg *d = vd;
2830 uint32_t flags;
2831 intptr_t i;
2833 /* Begin with a zero predicate register. */
2834 flags = do_zero(d, oprsz);
2835 if (count == 0) {
2836 return flags;
2839 /* Scale from predicate element count to bits. */
2840 count <<= esz;
2841 /* Bound to the bits in the predicate. */
2842 count = MIN(count, oprsz * 8);
2844 /* Set all of the requested bits. */
2845 for (i = 0; i < count / 64; ++i) {
2846 d->p[i] = esz_mask;
2848 if (count & 63) {
2849 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2852 return predtest_ones(d, oprsz, esz_mask);
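/* For example, a WHILE result of count = 3 halfword elements (esz = 1)
 * scales to 6 predicate bits; assuming pred_esz_masks[1] is the
 * per-halfword mask 0x5555555555555555ull as defined earlier in this
 * file, d->p[0] becomes MAKE_64BIT_MASK(0, 6) & that mask = 0b010101,
 * i.e. the first three halfword elements active.
 */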
2855 /* Recursive reduction on a function;
2856 * C.f. the ARM ARM function ReducePredicated.
2858 * While it would be possible to write this without the DATA temporary,
2859 * it is much simpler to process the predicate register this way.
2860 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
2861 * little to gain with a more complex non-recursive form.
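 *
 * As an illustration, for n == 4 elements the recursion evaluates
 * FUNC(FUNC(data[0], data[1]), FUNC(data[2], data[3])), a balanced
 * pairwise tree; inactive lanes have already been replaced by IDENT,
 * so they do not disturb the result.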
2863 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
2864 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
2866 if (n == 1) { \
2867 return *data; \
2868 } else { \
2869 uintptr_t half = n / 2; \
2870 TYPE lo = NAME##_reduce(data, status, half); \
2871 TYPE hi = NAME##_reduce(data + half, status, half); \
2872 return TYPE##_##FUNC(lo, hi, status); \
2875 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
2877 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc); \
2878 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
2879 for (i = 0; i < oprsz; ) { \
2880 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2881 do { \
2882 TYPE nn = *(TYPE *)(vn + H(i)); \
2883 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
2884 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2885 } while (i & 15); \
2887 for (; i < maxsz; i += sizeof(TYPE)) { \
2888 *(TYPE *)((void *)data + i) = IDENT; \
2890 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
2893 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
2894 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
2895 DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
2897 /* Identity is floatN_default_nan, without the function call. */
2898 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
2899 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
2900 DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
2902 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
2903 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
2904 DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
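/* (A quiet NaN serves as the identity here because minnum/maxnum return
 * the other operand when exactly one input is a NaN.) */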
2906 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
2907 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
2908 DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
2910 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
2911 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
2912 DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
2914 #undef DO_REDUCE
2916 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
2917 void *status, uint32_t desc)
2919 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2920 float16 result = nn;
2922 do {
2923 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2924 do {
2925 if (pg & 1) {
2926 float16 mm = *(float16 *)(vm + H1_2(i));
2927 result = float16_add(result, mm, status);
2929 i += sizeof(float16), pg >>= sizeof(float16);
2930 } while (i & 15);
2931 } while (i < opr_sz);
2933 return result;
2936 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
2937 void *status, uint32_t desc)
2939 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2940 float32 result = nn;
2942 do {
2943 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2944 do {
2945 if (pg & 1) {
2946 float32 mm = *(float32 *)(vm + H1_2(i));
2947 result = float32_add(result, mm, status);
2949 i += sizeof(float32), pg >>= sizeof(float32);
2950 } while (i & 15);
2951 } while (i < opr_sz);
2953 return result;
2956 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
2957 void *status, uint32_t desc)
2959 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
2960 uint64_t *m = vm;
2961 uint8_t *pg = vg;
2963 for (i = 0; i < opr_sz; i++) {
2964 if (pg[H1(i)] & 1) {
2965 nn = float64_add(nn, m[i], status);
2969 return nn;
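/* Unlike FADDV above, FADDA is an ordered reduction: the scalar
 * accumulator is added to each active element strictly from the lowest
 * element upward, which is why these helpers use a plain sequential
 * loop rather than the pairwise tree of DO_REDUCE.
 */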
2972 /* Fully general three-operand expander, controlled by a predicate,
2973  * with the extra float_status parameter.
2975 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
2976 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
2977 void *status, uint32_t desc) \
2979 intptr_t i = simd_oprsz(desc); \
2980 uint64_t *g = vg; \
2981 do { \
2982 uint64_t pg = g[(i - 1) >> 6]; \
2983 do { \
2984 i -= sizeof(TYPE); \
2985 if (likely((pg >> (i & 63)) & 1)) { \
2986 TYPE nn = *(TYPE *)(vn + H(i)); \
2987 TYPE mm = *(TYPE *)(vm + H(i)); \
2988 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
2990 } while (i & 63); \
2991 } while (i != 0); \
2994 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
2995 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
2996 DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
2998 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
2999 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3000 DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
3002 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3003 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3004 DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
3006 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3007 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3008 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
3010 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3011 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3012 DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
3014 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3015 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3016 DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
3018 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3019 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3020 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
3022 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3023 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3024 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3026 static inline float16 abd_h(float16 a, float16 b, float_status *s)
3028 return float16_abs(float16_sub(a, b, s));
3031 static inline float32 abd_s(float32 a, float32 b, float_status *s)
3033 return float32_abs(float32_sub(a, b, s));
3036 static inline float64 abd_d(float64 a, float64 b, float_status *s)
3038 return float64_abs(float64_sub(a, b, s));
3041 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
3042 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
3043 DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
3045 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3047 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3048 return float64_scalbn(a, b_int, s);
3051 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
3052 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
3053 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
3055 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
3056 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
3057 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
3059 #undef DO_ZPZZ_FP
3061 /* Three-operand expander, with one scalar operand, controlled by
3062 * a predicate, with the extra float_status parameter.
3064 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3065 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
3066 void *status, uint32_t desc) \
3068 intptr_t i = simd_oprsz(desc); \
3069 uint64_t *g = vg; \
3070 TYPE mm = scalar; \
3071 do { \
3072 uint64_t pg = g[(i - 1) >> 6]; \
3073 do { \
3074 i -= sizeof(TYPE); \
3075 if (likely((pg >> (i & 63)) & 1)) { \
3076 TYPE nn = *(TYPE *)(vn + H(i)); \
3077 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3079 } while (i & 63); \
3080 } while (i != 0); \
3083 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
3084 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
3085 DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
3087 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
3088 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
3089 DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
3091 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
3092 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
3093 DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
3095 static inline float16 subr_h(float16 a, float16 b, float_status *s)
3097 return float16_sub(b, a, s);
3100 static inline float32 subr_s(float32 a, float32 b, float_status *s)
3102 return float32_sub(b, a, s);
3105 static inline float64 subr_d(float64 a, float64 b, float_status *s)
3107 return float64_sub(b, a, s);
3110 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
3111 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
3112 DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
3114 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
3115 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
3116 DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
3118 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
3119 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
3120 DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
3122 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
3123 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
3124 DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
3126 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
3127 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
3128 DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
3130 /* Fully general two-operand expander, controlled by a predicate,
3131  * with the extra float_status parameter.
3133 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
3134 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3136 intptr_t i = simd_oprsz(desc); \
3137 uint64_t *g = vg; \
3138 do { \
3139 uint64_t pg = g[(i - 1) >> 6]; \
3140 do { \
3141 i -= sizeof(TYPE); \
3142 if (likely((pg >> (i & 63)) & 1)) { \
3143 TYPE nn = *(TYPE *)(vn + H(i)); \
3144 *(TYPE *)(vd + H(i)) = OP(nn, status); \
3146 } while (i & 63); \
3147 } while (i != 0); \
3150 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
3151 * FZ16. When converting from fp16, this affects flushing input denormals;
3152 * when converting to fp16, this affects flushing output denormals.
3154 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
3156 flag save = get_flush_inputs_to_zero(fpst);
3157 float32 ret;
3159 set_flush_inputs_to_zero(false, fpst);
3160 ret = float16_to_float32(f, true, fpst);
3161 set_flush_inputs_to_zero(save, fpst);
3162 return ret;
3165 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
3167 flag save = get_flush_inputs_to_zero(fpst);
3168 float64 ret;
3170 set_flush_inputs_to_zero(false, fpst);
3171 ret = float16_to_float64(f, true, fpst);
3172 set_flush_inputs_to_zero(save, fpst);
3173 return ret;
3176 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
3178 flag save = get_flush_to_zero(fpst);
3179 float16 ret;
3181 set_flush_to_zero(false, fpst);
3182 ret = float32_to_float16(f, true, fpst);
3183 set_flush_to_zero(save, fpst);
3184 return ret;
3187 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
3189 flag save = get_flush_to_zero(fpst);
3190 float16 ret;
3192 set_flush_to_zero(false, fpst);
3193 ret = float64_to_float16(f, true, fpst);
3194 set_flush_to_zero(save, fpst);
3195 return ret;
3198 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3200 if (float16_is_any_nan(f)) {
3201 float_raise(float_flag_invalid, s);
3202 return 0;
3204 return float16_to_int16_round_to_zero(f, s);
3207 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3209 if (float16_is_any_nan(f)) {
3210 float_raise(float_flag_invalid, s);
3211 return 0;
3213 return float16_to_int64_round_to_zero(f, s);
3216 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3218 if (float32_is_any_nan(f)) {
3219 float_raise(float_flag_invalid, s);
3220 return 0;
3222 return float32_to_int64_round_to_zero(f, s);
3225 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3227 if (float64_is_any_nan(f)) {
3228 float_raise(float_flag_invalid, s);
3229 return 0;
3231 return float64_to_int64_round_to_zero(f, s);
3234 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3236 if (float16_is_any_nan(f)) {
3237 float_raise(float_flag_invalid, s);
3238 return 0;
3240 return float16_to_uint16_round_to_zero(f, s);
3243 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3245 if (float16_is_any_nan(f)) {
3246 float_raise(float_flag_invalid, s);
3247 return 0;
3249 return float16_to_uint64_round_to_zero(f, s);
3252 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3254 if (float32_is_any_nan(f)) {
3255 float_raise(float_flag_invalid, s);
3256 return 0;
3258 return float32_to_uint64_round_to_zero(f, s);
3261 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3263 if (float64_is_any_nan(f)) {
3264 float_raise(float_flag_invalid, s);
3265 return 0;
3267 return float64_to_uint64_round_to_zero(f, s);
3270 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
3271 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
3272 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
3273 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
3274 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
3275 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
3277 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
3278 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
3279 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
3280 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
3281 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
3282 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
3283 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
3285 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
3286 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
3287 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
3288 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
3289 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
3290 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
3291 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
3293 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
3294 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
3295 DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
3297 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
3298 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
3299 DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
3301 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
3302 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
3303 DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
3305 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
3306 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
3307 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
3309 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
3310 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
3311 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
3312 DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
3313 DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
3314 DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
3315 DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
3317 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
3318 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
3319 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
3320 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
3321 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
3322 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
3323 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
3325 #undef DO_ZPZ_FP
3327 /* 4-operand predicated multiply-add. This requires 7 operands to pass
3328 * "properly", so we need to encode some of the registers into DESC.
3330 QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 20 > 32);
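/* The extract32 calls below assume the layout Zd at SIMD_DATA_SHIFT,
 * Zn at +5, Zm at +10 and Za at +15: four 5-bit register numbers in
 * 20 bits of descriptor data, which is what the assertion above checks
 * still fits.
 */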
3332 static void do_fmla_zpzzz_h(CPUARMState *env, void *vg, uint32_t desc,
3333 uint16_t neg1, uint16_t neg3)
3335 intptr_t i = simd_oprsz(desc);
3336 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3337 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3338 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3339 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3340 void *vd = &env->vfp.zregs[rd];
3341 void *vn = &env->vfp.zregs[rn];
3342 void *vm = &env->vfp.zregs[rm];
3343 void *va = &env->vfp.zregs[ra];
3344 uint64_t *g = vg;
3346 do {
3347 uint64_t pg = g[(i - 1) >> 6];
3348 do {
3349 i -= 2;
3350 if (likely((pg >> (i & 63)) & 1)) {
3351 float16 e1, e2, e3, r;
3353 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3354 e2 = *(uint16_t *)(vm + H1_2(i));
3355 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
3356 r = float16_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3357 *(uint16_t *)(vd + H1_2(i)) = r;
3359 } while (i & 63);
3360 } while (i != 0);
3363 void HELPER(sve_fmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3365 do_fmla_zpzzz_h(env, vg, desc, 0, 0);
3368 void HELPER(sve_fmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3370 do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0);
3373 void HELPER(sve_fnmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3375 do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0x8000);
3378 void HELPER(sve_fnmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3380 do_fmla_zpzzz_h(env, vg, desc, 0, 0x8000);
3383 static void do_fmla_zpzzz_s(CPUARMState *env, void *vg, uint32_t desc,
3384 uint32_t neg1, uint32_t neg3)
3386 intptr_t i = simd_oprsz(desc);
3387 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3388 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3389 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3390 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3391 void *vd = &env->vfp.zregs[rd];
3392 void *vn = &env->vfp.zregs[rn];
3393 void *vm = &env->vfp.zregs[rm];
3394 void *va = &env->vfp.zregs[ra];
3395 uint64_t *g = vg;
3397 do {
3398 uint64_t pg = g[(i - 1) >> 6];
3399 do {
3400 i -= 4;
3401 if (likely((pg >> (i & 63)) & 1)) {
3402 float32 e1, e2, e3, r;
3404 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3405 e2 = *(uint32_t *)(vm + H1_4(i));
3406 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
3407 r = float32_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3408 *(uint32_t *)(vd + H1_4(i)) = r;
3410 } while (i & 63);
3411 } while (i != 0);
3414 void HELPER(sve_fmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3416 do_fmla_zpzzz_s(env, vg, desc, 0, 0);
3419 void HELPER(sve_fmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3421 do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0);
3424 void HELPER(sve_fnmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3426 do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0x80000000);
3429 void HELPER(sve_fnmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3431 do_fmla_zpzzz_s(env, vg, desc, 0, 0x80000000);
3434 static void do_fmla_zpzzz_d(CPUARMState *env, void *vg, uint32_t desc,
3435 uint64_t neg1, uint64_t neg3)
3437 intptr_t i = simd_oprsz(desc);
3438 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3439 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3440 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3441 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3442 void *vd = &env->vfp.zregs[rd];
3443 void *vn = &env->vfp.zregs[rn];
3444 void *vm = &env->vfp.zregs[rm];
3445 void *va = &env->vfp.zregs[ra];
3446 uint64_t *g = vg;
3448 do {
3449 uint64_t pg = g[(i - 1) >> 6];
3450 do {
3451 i -= 8;
3452 if (likely((pg >> (i & 63)) & 1)) {
3453 float64 e1, e2, e3, r;
3455 e1 = *(uint64_t *)(vn + i) ^ neg1;
3456 e2 = *(uint64_t *)(vm + i);
3457 e3 = *(uint64_t *)(va + i) ^ neg3;
3458 r = float64_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3459 *(uint64_t *)(vd + i) = r;
3461 } while (i & 63);
3462 } while (i != 0);
3465 void HELPER(sve_fmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3467 do_fmla_zpzzz_d(env, vg, desc, 0, 0);
3470 void HELPER(sve_fmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3472 do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, 0);
3475 void HELPER(sve_fnmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3477 do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, INT64_MIN);
3480 void HELPER(sve_fnmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3482 do_fmla_zpzzz_d(env, vg, desc, 0, INT64_MIN);
3485 /* Two operand floating-point comparison controlled by a predicate.
3486 * Unlike the integer version, we are not allowed to optimistically
3487 * compare operands, since the comparison may have side effects wrt
3488 * the FPSR.
3490 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
3491 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3492 void *status, uint32_t desc) \
3494 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3495 uint64_t *d = vd, *g = vg; \
3496 do { \
3497 uint64_t out = 0, pg = g[j]; \
3498 do { \
3499 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3500 if (likely((pg >> (i & 63)) & 1)) { \
3501 TYPE nn = *(TYPE *)(vn + H(i)); \
3502 TYPE mm = *(TYPE *)(vm + H(i)); \
3503 out |= OP(TYPE, nn, mm, status); \
3505 } while (i & 63); \
3506 d[j--] = out; \
3507 } while (i > 0); \
3510 #define DO_FPCMP_PPZZ_H(NAME, OP) \
3511 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
3512 #define DO_FPCMP_PPZZ_S(NAME, OP) \
3513 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
3514 #define DO_FPCMP_PPZZ_D(NAME, OP) \
3515 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
3517 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
3518 DO_FPCMP_PPZZ_H(NAME, OP) \
3519 DO_FPCMP_PPZZ_S(NAME, OP) \
3520 DO_FPCMP_PPZZ_D(NAME, OP)
3522 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
3523 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
3524 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
3525 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
3526 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
3527 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
3528 #define DO_FCMUO(TYPE, X, Y, ST) \
3529 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
3530 #define DO_FACGE(TYPE, X, Y, ST) \
3531 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
3532 #define DO_FACGT(TYPE, X, Y, ST) \
3533 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
3535 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
3536 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
3537 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
3538 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
3539 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
3540 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
3541 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
3543 #undef DO_FPCMP_PPZZ_ALL
3544 #undef DO_FPCMP_PPZZ_D
3545 #undef DO_FPCMP_PPZZ_S
3546 #undef DO_FPCMP_PPZZ_H
3547 #undef DO_FPCMP_PPZZ
3549 /* One operand floating-point comparison against zero, controlled
3550 * by a predicate.
3552 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
3553 void HELPER(NAME)(void *vd, void *vn, void *vg, \
3554 void *status, uint32_t desc) \
3556 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3557 uint64_t *d = vd, *g = vg; \
3558 do { \
3559 uint64_t out = 0, pg = g[j]; \
3560 do { \
3561 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3562 if ((pg >> (i & 63)) & 1) { \
3563 TYPE nn = *(TYPE *)(vn + H(i)); \
3564 out |= OP(TYPE, nn, 0, status); \
3566 } while (i & 63); \
3567 d[j--] = out; \
3568 } while (i > 0); \
3571 #define DO_FPCMP_PPZ0_H(NAME, OP) \
3572 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
3573 #define DO_FPCMP_PPZ0_S(NAME, OP) \
3574 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
3575 #define DO_FPCMP_PPZ0_D(NAME, OP) \
3576 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
3578 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
3579 DO_FPCMP_PPZ0_H(NAME, OP) \
3580 DO_FPCMP_PPZ0_S(NAME, OP) \
3581 DO_FPCMP_PPZ0_D(NAME, OP)
3583 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
3584 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
3585 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
3586 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
3587 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
3588 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3590 /* FP Trig Multiply-Add. */
3592 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3594 static const float16 coeff[16] = {
3595 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3596 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3598 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
3599 intptr_t x = simd_data(desc);
3600 float16 *d = vd, *n = vn, *m = vm;
3601 for (i = 0; i < opr_sz; i++) {
3602 float16 mm = m[i];
3603 intptr_t xx = x;
3604 if (float16_is_neg(mm)) {
3605 mm = float16_abs(mm);
3606 xx += 8;
3608 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
3612 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3614 static const float32 coeff[16] = {
3615 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3616 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3617 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3618 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3620 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
3621 intptr_t x = simd_data(desc);
3622 float32 *d = vd, *n = vn, *m = vm;
3623 for (i = 0; i < opr_sz; i++) {
3624 float32 mm = m[i];
3625 intptr_t xx = x;
3626 if (float32_is_neg(mm)) {
3627 mm = float32_abs(mm);
3628 xx += 8;
3630 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
3634 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3636 static const float64 coeff[16] = {
3637 0x3ff0000000000000ull, 0xbfc5555555555543ull,
3638 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
3639 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
3640 0x3de5d8408868552full, 0x0000000000000000ull,
3641 0x3ff0000000000000ull, 0xbfe0000000000000ull,
3642 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
3643 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
3644 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
3646 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
3647 intptr_t x = simd_data(desc);
3648 float64 *d = vd, *n = vn, *m = vm;
3649 for (i = 0; i < opr_sz; i++) {
3650 float64 mm = m[i];
3651 intptr_t xx = x;
3652 if (float64_is_neg(mm)) {
3653 mm = float64_abs(mm);
3654 xx += 8;
3656 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
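/* The coefficient tables above appear to be the FTMAD polynomial
 * constants: the first eight entries of each table are sine-series
 * terms (1, -1/3!, 1/5!, ...) and the second eight are cosine-series
 * terms (1, -1/2!, 1/4!, ...), which matches the xx += 8 adjustment
 * applied when the multiplicand is negative.
 */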
3661 * Load contiguous data, protected by a governing predicate.
3663 #define DO_LD1(NAME, FN, TYPEE, TYPEM, H) \
3664 static void do_##NAME(CPUARMState *env, void *vd, void *vg, \
3665 target_ulong addr, intptr_t oprsz, \
3666 uintptr_t ra) \
3668 intptr_t i = 0; \
3669 do { \
3670 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3671 do { \
3672 TYPEM m = 0; \
3673 if (pg & 1) { \
3674 m = FN(env, addr, ra); \
3676 *(TYPEE *)(vd + H(i)) = m; \
3677 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
3678 addr += sizeof(TYPEM); \
3679 } while (i & 15); \
3680 } while (i < oprsz); \
3682 void HELPER(NAME)(CPUARMState *env, void *vg, \
3683 target_ulong addr, uint32_t desc) \
3685 do_##NAME(env, &env->vfp.zregs[simd_data(desc)], vg, \
3686 addr, simd_oprsz(desc), GETPC()); \
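/* Here simd_oprsz(desc) is the vector length in bytes and simd_data(desc)
 * carries the destination register number, so the wrapper resolves
 * zregs[rd] itself.  As an illustrative hand expansion (a sketch, not
 * generated code), DO_LD1(sve_ld1bhu_r, cpu_ldub_data_ra, uint16_t,
 * uint8_t, H1_2) gives an inner loop body of essentially:
 *
 *     uint8_t m = 0;
 *     if (pg & 1) {
 *         m = cpu_ldub_data_ra(env, addr, ra);
 *     }
 *     *(uint16_t *)(vd + H1_2(i)) = m;
 *     i += 2, pg >>= 2;
 *     addr += 1;
 *
 * i.e. each active byte loaded from memory is zero-extended into a
 * halfword element of the destination.
 */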
3689 #define DO_LD2(NAME, FN, TYPEE, TYPEM, H) \
3690 void HELPER(NAME)(CPUARMState *env, void *vg, \
3691 target_ulong addr, uint32_t desc) \
3693 intptr_t i, oprsz = simd_oprsz(desc); \
3694 intptr_t ra = GETPC(); \
3695 unsigned rd = simd_data(desc); \
3696 void *d1 = &env->vfp.zregs[rd]; \
3697 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
3698 for (i = 0; i < oprsz; ) { \
3699 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3700 do { \
3701 TYPEM m1 = 0, m2 = 0; \
3702 if (pg & 1) { \
3703 m1 = FN(env, addr, ra); \
3704 m2 = FN(env, addr + sizeof(TYPEM), ra); \
3706 *(TYPEE *)(d1 + H(i)) = m1; \
3707 *(TYPEE *)(d2 + H(i)) = m2; \
3708 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
3709 addr += 2 * sizeof(TYPEM); \
3710 } while (i & 15); \
3714 #define DO_LD3(NAME, FN, TYPEE, TYPEM, H) \
3715 void HELPER(NAME)(CPUARMState *env, void *vg, \
3716 target_ulong addr, uint32_t desc) \
3718 intptr_t i, oprsz = simd_oprsz(desc); \
3719 intptr_t ra = GETPC(); \
3720 unsigned rd = simd_data(desc); \
3721 void *d1 = &env->vfp.zregs[rd]; \
3722 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
3723 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
3724 for (i = 0; i < oprsz; ) { \
3725 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3726 do { \
3727 TYPEM m1 = 0, m2 = 0, m3 = 0; \
3728 if (pg & 1) { \
3729 m1 = FN(env, addr, ra); \
3730 m2 = FN(env, addr + sizeof(TYPEM), ra); \
3731 m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
3733 *(TYPEE *)(d1 + H(i)) = m1; \
3734 *(TYPEE *)(d2 + H(i)) = m2; \
3735 *(TYPEE *)(d3 + H(i)) = m3; \
3736 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
3737 addr += 3 * sizeof(TYPEM); \
3738 } while (i & 15); \
3742 #define DO_LD4(NAME, FN, TYPEE, TYPEM, H) \
3743 void HELPER(NAME)(CPUARMState *env, void *vg, \
3744 target_ulong addr, uint32_t desc) \
3746 intptr_t i, oprsz = simd_oprsz(desc); \
3747 intptr_t ra = GETPC(); \
3748 unsigned rd = simd_data(desc); \
3749 void *d1 = &env->vfp.zregs[rd]; \
3750 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
3751 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
3752 void *d4 = &env->vfp.zregs[(rd + 3) & 31]; \
3753 for (i = 0; i < oprsz; ) { \
3754 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3755 do { \
3756 TYPEM m1 = 0, m2 = 0, m3 = 0, m4 = 0; \
3757 if (pg & 1) { \
3758 m1 = FN(env, addr, ra); \
3759 m2 = FN(env, addr + sizeof(TYPEM), ra); \
3760 m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
3761 m4 = FN(env, addr + 3 * sizeof(TYPEM), ra); \
3763 *(TYPEE *)(d1 + H(i)) = m1; \
3764 *(TYPEE *)(d2 + H(i)) = m2; \
3765 *(TYPEE *)(d3 + H(i)) = m3; \
3766 *(TYPEE *)(d4 + H(i)) = m4; \
3767 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
3768 addr += 4 * sizeof(TYPEM); \
3769 } while (i & 15); \
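/* The instantiations below follow the usual SVE helper naming: for
 * example sve_ld1bhu_r loads bytes and zero-extends them into halfword
 * elements, sve_ld1bhs_r sign-extends instead, and sve_ld2hh_r is the
 * two-register interleaved halfword load.  The trailing _r marks the
 * contiguous (scalar-plus-scalar / scalar-plus-immediate) forms.
 */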
3773 DO_LD1(sve_ld1bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
3774 DO_LD1(sve_ld1bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
3775 DO_LD1(sve_ld1bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
3776 DO_LD1(sve_ld1bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
3777 DO_LD1(sve_ld1bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
3778 DO_LD1(sve_ld1bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
3780 DO_LD1(sve_ld1hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
3781 DO_LD1(sve_ld1hss_r, cpu_ldsw_data_ra, uint32_t, int16_t, H1_4)
3782 DO_LD1(sve_ld1hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
3783 DO_LD1(sve_ld1hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
3785 DO_LD1(sve_ld1sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
3786 DO_LD1(sve_ld1sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
3788 DO_LD1(sve_ld1bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
3789 DO_LD2(sve_ld2bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
3790 DO_LD3(sve_ld3bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
3791 DO_LD4(sve_ld4bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
3793 DO_LD1(sve_ld1hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
3794 DO_LD2(sve_ld2hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
3795 DO_LD3(sve_ld3hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
3796 DO_LD4(sve_ld4hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
3798 DO_LD1(sve_ld1ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
3799 DO_LD2(sve_ld2ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
3800 DO_LD3(sve_ld3ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
3801 DO_LD4(sve_ld4ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
3803 DO_LD1(sve_ld1dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
3804 DO_LD2(sve_ld2dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
3805 DO_LD3(sve_ld3dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
3806 DO_LD4(sve_ld4dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
3808 #undef DO_LD1
3809 #undef DO_LD2
3810 #undef DO_LD3
3811 #undef DO_LD4
3814 * Load contiguous data, first-fault and no-fault.
3817 #ifdef CONFIG_USER_ONLY
3819 /* Fault on byte I. All bits in FFR from I are cleared. The vector
3820 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
3821 * option, which leaves subsequent data unchanged.
3823 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
3825 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
3827 if (i & 63) {
3828 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
3829 i = ROUND_UP(i, 64);
3831 for (; i < oprsz; i += 64) {
3832 ffr[i / 64] = 0;
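/* For example, with oprsz = 32 bytes and a fault recorded at element
 * byte i = 20, the mask keeps FFR bits [19:0] and clears bits [63:20]
 * of the first FFR word; ROUND_UP then takes i to 64 >= oprsz, so the
 * trailing loop has nothing left to zero.
 */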
3836 /* Hold the mmap lock during the operation so that there is no race
3837 * between page_check_range and the load operation. We expect the
3838 * usual case to have no faults at all, so we check the whole range
3839 * first and if successful defer to the normal load operation.
3841 * TODO: Change mmap_lock to a rwlock so that multiple readers
3842 * can run simultaneously. This will probably help other uses
3843 * within QEMU as well.
3845 #define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
3846 static void do_sve_ldff1##PART(CPUARMState *env, void *vd, void *vg, \
3847 target_ulong addr, intptr_t oprsz, \
3848 bool first, uintptr_t ra) \
3850 intptr_t i = 0; \
3851 do { \
3852 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3853 do { \
3854 TYPEM m = 0; \
3855 if (pg & 1) { \
3856 if (!first && \
3857 unlikely(page_check_range(addr, sizeof(TYPEM), \
3858 PAGE_READ))) { \
3859 record_fault(env, i, oprsz); \
3860 return; \
3862 m = FN(env, addr, ra); \
3863 first = false; \
3865 *(TYPEE *)(vd + H(i)) = m; \
3866 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
3867 addr += sizeof(TYPEM); \
3868 } while (i & 15); \
3869 } while (i < oprsz); \
3871 void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \
3872 target_ulong addr, uint32_t desc) \
3874 intptr_t oprsz = simd_oprsz(desc); \
3875 unsigned rd = simd_data(desc); \
3876 void *vd = &env->vfp.zregs[rd]; \
3877 mmap_lock(); \
3878 if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \
3879 do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \
3880 } else { \
3881 do_sve_ldff1##PART(env, vd, vg, addr, oprsz, true, GETPC()); \
3883 mmap_unlock(); \
3886 /* No-fault loads are like first-fault loads without the
3887 * first faulting special case.
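/* That is, for LDNF1 even the first active element may have its fault
 * suppressed: do_sve_ldff1* is entered with first == false, so any
 * failing page check simply truncates FFR via record_fault instead of
 * taking the exception.
 */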
3889 #define DO_LDNF1(PART) \
3890 void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \
3891 target_ulong addr, uint32_t desc) \
3893 intptr_t oprsz = simd_oprsz(desc); \
3894 unsigned rd = simd_data(desc); \
3895 void *vd = &env->vfp.zregs[rd]; \
3896 mmap_lock(); \
3897 if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \
3898 do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \
3899 } else { \
3900 do_sve_ldff1##PART(env, vd, vg, addr, oprsz, false, GETPC()); \
3902 mmap_unlock(); \
3905 #else
3907 /* TODO: System mode is not yet supported.
3908 * This would probably use tlb_vaddr_to_host.
3910 #define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
3911 void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \
3912 target_ulong addr, uint32_t desc) \
3914 g_assert_not_reached(); \
3917 #define DO_LDNF1(PART) \
3918 void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \
3919 target_ulong addr, uint32_t desc) \
3921 g_assert_not_reached(); \
3924 #endif
3926 DO_LDFF1(bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
3927 DO_LDFF1(bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
3928 DO_LDFF1(bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
3929 DO_LDFF1(bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
3930 DO_LDFF1(bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
3931 DO_LDFF1(bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
3932 DO_LDFF1(bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
3934 DO_LDFF1(hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
3935 DO_LDFF1(hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
3936 DO_LDFF1(hss_r, cpu_ldsw_data_ra, uint32_t, int16_t, H1_4)
3937 DO_LDFF1(hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
3938 DO_LDFF1(hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
3940 DO_LDFF1(ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
3941 DO_LDFF1(sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
3942 DO_LDFF1(sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
3944 DO_LDFF1(dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
3946 #undef DO_LDFF1
3948 DO_LDNF1(bb_r)
3949 DO_LDNF1(bhu_r)
3950 DO_LDNF1(bhs_r)
3951 DO_LDNF1(bsu_r)
3952 DO_LDNF1(bss_r)
3953 DO_LDNF1(bdu_r)
3954 DO_LDNF1(bds_r)
3956 DO_LDNF1(hh_r)
3957 DO_LDNF1(hsu_r)
3958 DO_LDNF1(hss_r)
3959 DO_LDNF1(hdu_r)
3960 DO_LDNF1(hds_r)
3962 DO_LDNF1(ss_r)
3963 DO_LDNF1(sdu_r)
3964 DO_LDNF1(sds_r)
3966 DO_LDNF1(dd_r)
3968 #undef DO_LDNF1
3971 * Store contiguous data, protected by a governing predicate.
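/* The predicate walk mirrors the contiguous loads above, but inactive
 * elements are simply skipped: no store is issued and the corresponding
 * memory is left untouched.
 */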
3973 #define DO_ST1(NAME, FN, TYPEE, TYPEM, H) \
3974 void HELPER(NAME)(CPUARMState *env, void *vg, \
3975 target_ulong addr, uint32_t desc) \
3977 intptr_t i, oprsz = simd_oprsz(desc); \
3978 intptr_t ra = GETPC(); \
3979 unsigned rd = simd_data(desc); \
3980 void *vd = &env->vfp.zregs[rd]; \
3981 for (i = 0; i < oprsz; ) { \
3982 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3983 do { \
3984 if (pg & 1) { \
3985 TYPEM m = *(TYPEE *)(vd + H(i)); \
3986 FN(env, addr, m, ra); \
3988 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
3989 addr += sizeof(TYPEM); \
3990 } while (i & 15); \
3994 #define DO_ST1_D(NAME, FN, TYPEM) \
3995 void HELPER(NAME)(CPUARMState *env, void *vg, \
3996 target_ulong addr, uint32_t desc) \
3998 intptr_t i, oprsz = simd_oprsz(desc) / 8; \
3999 intptr_t ra = GETPC(); \
4000 unsigned rd = simd_data(desc); \
4001 uint64_t *d = &env->vfp.zregs[rd].d[0]; \
4002 uint8_t *pg = vg; \
4003 for (i = 0; i < oprsz; i += 1) { \
4004 if (pg[H1(i)] & 1) { \
4005 FN(env, addr, d[i], ra); \
4007 addr += sizeof(TYPEM); \
4011 #define DO_ST2(NAME, FN, TYPEE, TYPEM, H) \
4012 void HELPER(NAME)(CPUARMState *env, void *vg, \
4013 target_ulong addr, uint32_t desc) \
4015 intptr_t i, oprsz = simd_oprsz(desc); \
4016 intptr_t ra = GETPC(); \
4017 unsigned rd = simd_data(desc); \
4018 void *d1 = &env->vfp.zregs[rd]; \
4019 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
4020 for (i = 0; i < oprsz; ) { \
4021 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4022 do { \
4023 if (pg & 1) { \
4024 TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
4025 TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
4026 FN(env, addr, m1, ra); \
4027 FN(env, addr + sizeof(TYPEM), m2, ra); \
4029 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4030 addr += 2 * sizeof(TYPEM); \
4031 } while (i & 15); \
4035 #define DO_ST3(NAME, FN, TYPEE, TYPEM, H) \
4036 void HELPER(NAME)(CPUARMState *env, void *vg, \
4037 target_ulong addr, uint32_t desc) \
4039 intptr_t i, oprsz = simd_oprsz(desc); \
4040 intptr_t ra = GETPC(); \
4041 unsigned rd = simd_data(desc); \
4042 void *d1 = &env->vfp.zregs[rd]; \
4043 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
4044 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
4045 for (i = 0; i < oprsz; ) { \
4046 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4047 do { \
4048 if (pg & 1) { \
4049 TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
4050 TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
4051 TYPEM m3 = *(TYPEE *)(d3 + H(i)); \
4052 FN(env, addr, m1, ra); \
4053 FN(env, addr + sizeof(TYPEM), m2, ra); \
4054 FN(env, addr + 2 * sizeof(TYPEM), m3, ra); \
4056 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4057 addr += 3 * sizeof(TYPEM); \
4058 } while (i & 15); \
4062 #define DO_ST4(NAME, FN, TYPEE, TYPEM, H) \
4063 void HELPER(NAME)(CPUARMState *env, void *vg, \
4064 target_ulong addr, uint32_t desc) \
4066 intptr_t i, oprsz = simd_oprsz(desc); \
4067 intptr_t ra = GETPC(); \
4068 unsigned rd = simd_data(desc); \
4069 void *d1 = &env->vfp.zregs[rd]; \
4070 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
4071 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
4072 void *d4 = &env->vfp.zregs[(rd + 3) & 31]; \
4073 for (i = 0; i < oprsz; ) { \
4074 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4075 do { \
4076 if (pg & 1) { \
4077 TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
4078 TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
4079 TYPEM m3 = *(TYPEE *)(d3 + H(i)); \
4080 TYPEM m4 = *(TYPEE *)(d4 + H(i)); \
4081 FN(env, addr, m1, ra); \
4082 FN(env, addr + sizeof(TYPEM), m2, ra); \
4083 FN(env, addr + 2 * sizeof(TYPEM), m3, ra); \
4084 FN(env, addr + 3 * sizeof(TYPEM), m4, ra); \
4086 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4087 addr += 4 * sizeof(TYPEM); \
4088 } while (i & 15); \
4092 DO_ST1(sve_st1bh_r, cpu_stb_data_ra, uint16_t, uint8_t, H1_2)
4093 DO_ST1(sve_st1bs_r, cpu_stb_data_ra, uint32_t, uint8_t, H1_4)
4094 DO_ST1_D(sve_st1bd_r, cpu_stb_data_ra, uint8_t)
4096 DO_ST1(sve_st1hs_r, cpu_stw_data_ra, uint32_t, uint16_t, H1_4)
4097 DO_ST1_D(sve_st1hd_r, cpu_stw_data_ra, uint16_t)
4099 DO_ST1_D(sve_st1sd_r, cpu_stl_data_ra, uint32_t)
4101 DO_ST1(sve_st1bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
4102 DO_ST2(sve_st2bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
4103 DO_ST3(sve_st3bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
4104 DO_ST4(sve_st4bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
4106 DO_ST1(sve_st1hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
4107 DO_ST2(sve_st2hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
4108 DO_ST3(sve_st3hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
4109 DO_ST4(sve_st4hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
4111 DO_ST1(sve_st1ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
4112 DO_ST2(sve_st2ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
4113 DO_ST3(sve_st3ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
4114 DO_ST4(sve_st4ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
4116 DO_ST1_D(sve_st1dd_r, cpu_stq_data_ra, uint64_t)
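/* The multi-register doubleword stores are open-coded below rather than
 * generated from DO_ST2/3/4: with 64-bit elements only bit 0 of every
 * eighth predicate byte matters, so a flat per-element loop over
 * pg[H1(i)] suffices, which is presumably why the generic 16-bit
 * chunked walk is not used here.
 */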
4118 void HELPER(sve_st2dd_r)(CPUARMState *env, void *vg,
4119 target_ulong addr, uint32_t desc)
4121 intptr_t i, oprsz = simd_oprsz(desc) / 8;
4122 intptr_t ra = GETPC();
4123 unsigned rd = simd_data(desc);
4124 uint64_t *d1 = &env->vfp.zregs[rd].d[0];
4125 uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
4126 uint8_t *pg = vg;
4128 for (i = 0; i < oprsz; i += 1) {
4129 if (pg[H1(i)] & 1) {
4130 cpu_stq_data_ra(env, addr, d1[i], ra);
4131 cpu_stq_data_ra(env, addr + 8, d2[i], ra);
4133 addr += 2 * 8;
4137 void HELPER(sve_st3dd_r)(CPUARMState *env, void *vg,
4138 target_ulong addr, uint32_t desc)
4140 intptr_t i, oprsz = simd_oprsz(desc) / 8;
4141 intptr_t ra = GETPC();
4142 unsigned rd = simd_data(desc);
4143 uint64_t *d1 = &env->vfp.zregs[rd].d[0];
4144 uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
4145 uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0];
4146 uint8_t *pg = vg;
4148 for (i = 0; i < oprsz; i += 1) {
4149 if (pg[H1(i)] & 1) {
4150 cpu_stq_data_ra(env, addr, d1[i], ra);
4151 cpu_stq_data_ra(env, addr + 8, d2[i], ra);
4152 cpu_stq_data_ra(env, addr + 16, d3[i], ra);
4154 addr += 3 * 8;
4158 void HELPER(sve_st4dd_r)(CPUARMState *env, void *vg,
4159 target_ulong addr, uint32_t desc)
4161 intptr_t i, oprsz = simd_oprsz(desc) / 8;
4162 intptr_t ra = GETPC();
4163 unsigned rd = simd_data(desc);
4164 uint64_t *d1 = &env->vfp.zregs[rd].d[0];
4165 uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
4166 uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0];
4167 uint64_t *d4 = &env->vfp.zregs[(rd + 3) & 31].d[0];
4168 uint8_t *pg = vg;
4170 for (i = 0; i < oprsz; i += 1) {
4171 if (pg[H1(i)] & 1) {
4172 cpu_stq_data_ra(env, addr, d1[i], ra);
4173 cpu_stq_data_ra(env, addr + 8, d2[i], ra);
4174 cpu_stq_data_ra(env, addr + 16, d3[i], ra);
4175 cpu_stq_data_ra(env, addr + 24, d4[i], ra);
4177 addr += 4 * 8;
4181 /* Loads with a vector index. */
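/* Gather loads: vm supplies one offset per element, interpreted with
 * the width and signedness of TYPEI, shifted left by the scale taken
 * from simd_data(desc) and added to the scalar base.  For example,
 * sve_ldhsu_zsu reads a uint32_t offset from zm, loads a halfword from
 * base + (off << scale), and zero-extends it into the 32-bit lane.
 * Inactive destination elements are zeroed, as for contiguous loads.
 */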
4183 #define DO_LD1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
4184 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4185 target_ulong base, uint32_t desc) \
4187 intptr_t i, oprsz = simd_oprsz(desc); \
4188 unsigned scale = simd_data(desc); \
4189 uintptr_t ra = GETPC(); \
4190 for (i = 0; i < oprsz; ) { \
4191 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4192 do { \
4193 TYPEM m = 0; \
4194 if (pg & 1) { \
4195 target_ulong off = *(TYPEI *)(vm + H1_4(i)); \
4196 m = FN(env, base + (off << scale), ra); \
4198 *(uint32_t *)(vd + H1_4(i)) = m; \
4199 i += 4, pg >>= 4; \
4200 } while (i & 15); \
4204 #define DO_LD1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
4205 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4206 target_ulong base, uint32_t desc) \
4208 intptr_t i, oprsz = simd_oprsz(desc) / 8; \
4209 unsigned scale = simd_data(desc); \
4210 uintptr_t ra = GETPC(); \
4211 uint64_t *d = vd, *m = vm; uint8_t *pg = vg; \
4212 for (i = 0; i < oprsz; i++) { \
4213 TYPEM mm = 0; \
4214 if (pg[H1(i)] & 1) { \
4215 target_ulong off = (TYPEI)m[i]; \
4216 mm = FN(env, base + (off << scale), ra); \
4218 d[i] = mm; \
4222 DO_LD1_ZPZ_S(sve_ldbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
4223 DO_LD1_ZPZ_S(sve_ldhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
4224 DO_LD1_ZPZ_S(sve_ldssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
4225 DO_LD1_ZPZ_S(sve_ldbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
4226 DO_LD1_ZPZ_S(sve_ldhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
4228 DO_LD1_ZPZ_S(sve_ldbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
4229 DO_LD1_ZPZ_S(sve_ldhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
4230 DO_LD1_ZPZ_S(sve_ldssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
4231 DO_LD1_ZPZ_S(sve_ldbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
4232 DO_LD1_ZPZ_S(sve_ldhss_zss, int32_t, int16_t, cpu_lduw_data_ra)
4234 DO_LD1_ZPZ_D(sve_ldbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
4235 DO_LD1_ZPZ_D(sve_ldhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
4236 DO_LD1_ZPZ_D(sve_ldsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
4237 DO_LD1_ZPZ_D(sve_ldddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
4238 DO_LD1_ZPZ_D(sve_ldbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
4239 DO_LD1_ZPZ_D(sve_ldhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
4240 DO_LD1_ZPZ_D(sve_ldsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)
4242 DO_LD1_ZPZ_D(sve_ldbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
4243 DO_LD1_ZPZ_D(sve_ldhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
4244 DO_LD1_ZPZ_D(sve_ldsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
4245 DO_LD1_ZPZ_D(sve_ldddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
4246 DO_LD1_ZPZ_D(sve_ldbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
4247 DO_LD1_ZPZ_D(sve_ldhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
4248 DO_LD1_ZPZ_D(sve_ldsds_zss, int32_t, int32_t, cpu_ldl_data_ra)
4250 DO_LD1_ZPZ_D(sve_ldbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
4251 DO_LD1_ZPZ_D(sve_ldhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
4252 DO_LD1_ZPZ_D(sve_ldsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
4253 DO_LD1_ZPZ_D(sve_ldddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
4254 DO_LD1_ZPZ_D(sve_ldbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
4255 DO_LD1_ZPZ_D(sve_ldhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
4256 DO_LD1_ZPZ_D(sve_ldsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
4258 /* First fault loads with a vector index. */
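/* These combine the gather addressing above with the first-fault
 * handling of the contiguous LDFF1 helpers: only the first active
 * element may fault normally, and any later failing page_check_range
 * trims FFR via record_fault.  As with the contiguous variants, only
 * CONFIG_USER_ONLY is implemented.
 */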
4260 #ifdef CONFIG_USER_ONLY
4262 #define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H) \
4263 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4264 target_ulong base, uint32_t desc) \
4266 intptr_t i, oprsz = simd_oprsz(desc); \
4267 unsigned scale = simd_data(desc); \
4268 uintptr_t ra = GETPC(); \
4269 bool first = true; \
4270 mmap_lock(); \
4271 for (i = 0; i < oprsz; ) { \
4272 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4273 do { \
4274 TYPEM m = 0; \
4275 if (pg & 1) { \
4276 target_ulong off = *(TYPEI *)(vm + H(i)); \
4277 target_ulong addr = base + (off << scale); \
4278 if (!first && \
4279 page_check_range(addr, sizeof(TYPEM), PAGE_READ)) { \
4280 record_fault(env, i, oprsz); \
4281 goto exit; \
4283 m = FN(env, addr, ra); \
4284 first = false; \
4286 *(TYPEE *)(vd + H(i)) = m; \
4287 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4288 } while (i & 15); \
4290 exit: \
4291 mmap_unlock(); \
4294 #else
4296 #define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H) \
4297 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4298 target_ulong base, uint32_t desc) \
4300 g_assert_not_reached(); \
4303 #endif
4305 #define DO_LDFF1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
4306 DO_LDFF1_ZPZ(NAME, uint32_t, TYPEI, TYPEM, FN, H1_4)
4307 #define DO_LDFF1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
4308 DO_LDFF1_ZPZ(NAME, uint64_t, TYPEI, TYPEM, FN, )
4310 DO_LDFF1_ZPZ_S(sve_ldffbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
4311 DO_LDFF1_ZPZ_S(sve_ldffhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
4312 DO_LDFF1_ZPZ_S(sve_ldffssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
4313 DO_LDFF1_ZPZ_S(sve_ldffbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
4314 DO_LDFF1_ZPZ_S(sve_ldffhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
4316 DO_LDFF1_ZPZ_S(sve_ldffbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
4317 DO_LDFF1_ZPZ_S(sve_ldffhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
4318 DO_LDFF1_ZPZ_S(sve_ldffssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
4319 DO_LDFF1_ZPZ_S(sve_ldffbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
4320 DO_LDFF1_ZPZ_S(sve_ldffhss_zss, int32_t, int16_t, cpu_lduw_data_ra)
4322 DO_LDFF1_ZPZ_D(sve_ldffbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
4323 DO_LDFF1_ZPZ_D(sve_ldffhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
4324 DO_LDFF1_ZPZ_D(sve_ldffsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
4325 DO_LDFF1_ZPZ_D(sve_ldffddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
4326 DO_LDFF1_ZPZ_D(sve_ldffbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
4327 DO_LDFF1_ZPZ_D(sve_ldffhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
4328 DO_LDFF1_ZPZ_D(sve_ldffsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)
4330 DO_LDFF1_ZPZ_D(sve_ldffbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
4331 DO_LDFF1_ZPZ_D(sve_ldffhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
4332 DO_LDFF1_ZPZ_D(sve_ldffsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
4333 DO_LDFF1_ZPZ_D(sve_ldffddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
4334 DO_LDFF1_ZPZ_D(sve_ldffbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
4335 DO_LDFF1_ZPZ_D(sve_ldffhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
4336 DO_LDFF1_ZPZ_D(sve_ldffsds_zss, int32_t, int32_t, cpu_ldl_data_ra)
4338 DO_LDFF1_ZPZ_D(sve_ldffbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
4339 DO_LDFF1_ZPZ_D(sve_ldffhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
4340 DO_LDFF1_ZPZ_D(sve_ldffsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
4341 DO_LDFF1_ZPZ_D(sve_ldffddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
4342 DO_LDFF1_ZPZ_D(sve_ldffbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
4343 DO_LDFF1_ZPZ_D(sve_ldffhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
4344 DO_LDFF1_ZPZ_D(sve_ldffsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
4346 /* Stores with a vector index. */
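/* Scatter stores: the per-element address computation matches the
 * gather loads (base + (index << scale)), and elements whose predicate
 * bit is clear store nothing.
 */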
4348 #define DO_ST1_ZPZ_S(NAME, TYPEI, FN) \
4349 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4350 target_ulong base, uint32_t desc) \
4352 intptr_t i, oprsz = simd_oprsz(desc); \
4353 unsigned scale = simd_data(desc); \
4354 uintptr_t ra = GETPC(); \
4355 for (i = 0; i < oprsz; ) { \
4356 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4357 do { \
4358 if (likely(pg & 1)) { \
4359 target_ulong off = *(TYPEI *)(vm + H1_4(i)); \
4360 uint32_t d = *(uint32_t *)(vd + H1_4(i)); \
4361 FN(env, base + (off << scale), d, ra); \
4363 i += sizeof(uint32_t), pg >>= sizeof(uint32_t); \
4364 } while (i & 15); \
4368 #define DO_ST1_ZPZ_D(NAME, TYPEI, FN) \
4369 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4370 target_ulong base, uint32_t desc) \
4372 intptr_t i, oprsz = simd_oprsz(desc) / 8; \
4373 unsigned scale = simd_data(desc); \
4374 uintptr_t ra = GETPC(); \
4375 uint64_t *d = vd, *m = vm; uint8_t *pg = vg; \
4376 for (i = 0; i < oprsz; i++) { \
4377 if (likely(pg[H1(i)] & 1)) { \
4378 target_ulong off = (target_ulong)(TYPEI)m[i] << scale; \
4379 FN(env, base + off, d[i], ra); \
4384 DO_ST1_ZPZ_S(sve_stbs_zsu, uint32_t, cpu_stb_data_ra)
4385 DO_ST1_ZPZ_S(sve_sths_zsu, uint32_t, cpu_stw_data_ra)
4386 DO_ST1_ZPZ_S(sve_stss_zsu, uint32_t, cpu_stl_data_ra)
4388 DO_ST1_ZPZ_S(sve_stbs_zss, int32_t, cpu_stb_data_ra)
4389 DO_ST1_ZPZ_S(sve_sths_zss, int32_t, cpu_stw_data_ra)
4390 DO_ST1_ZPZ_S(sve_stss_zss, int32_t, cpu_stl_data_ra)
4392 DO_ST1_ZPZ_D(sve_stbd_zsu, uint32_t, cpu_stb_data_ra)
4393 DO_ST1_ZPZ_D(sve_sthd_zsu, uint32_t, cpu_stw_data_ra)
4394 DO_ST1_ZPZ_D(sve_stsd_zsu, uint32_t, cpu_stl_data_ra)
4395 DO_ST1_ZPZ_D(sve_stdd_zsu, uint32_t, cpu_stq_data_ra)
4397 DO_ST1_ZPZ_D(sve_stbd_zss, int32_t, cpu_stb_data_ra)
4398 DO_ST1_ZPZ_D(sve_sthd_zss, int32_t, cpu_stw_data_ra)
4399 DO_ST1_ZPZ_D(sve_stsd_zss, int32_t, cpu_stl_data_ra)
4400 DO_ST1_ZPZ_D(sve_stdd_zss, int32_t, cpu_stq_data_ra)
4402 DO_ST1_ZPZ_D(sve_stbd_zd, uint64_t, cpu_stb_data_ra)
4403 DO_ST1_ZPZ_D(sve_sthd_zd, uint64_t, cpu_stw_data_ra)
4404 DO_ST1_ZPZ_D(sve_stsd_zd, uint64_t, cpu_stl_data_ra)
4405 DO_ST1_ZPZ_D(sve_stdd_zd, uint64_t, cpu_stq_data_ra)