target/arm: Implement SVE Select Vectors Group
[qemu/ar7.git] / target/arm/sve_helper.c
blob f55fdc7dbef35f0ba1317d6ac83c33d2b22dc013
1 /*
2 * ARM SVE Operations
4 * Copyright (c) 2018 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/exec-all.h"
23 #include "exec/cpu_ldst.h"
24 #include "exec/helper-proto.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "fpu/softfloat.h"
29 /* Note that vector data is stored in host-endian 64-bit chunks,
30 so addressing units smaller than that needs a host-endian fixup. */
31 #ifdef HOST_WORDS_BIGENDIAN
32 #define H1(x) ((x) ^ 7)
33 #define H1_2(x) ((x) ^ 6)
34 #define H1_4(x) ((x) ^ 4)
35 #define H2(x) ((x) ^ 3)
36 #define H4(x) ((x) ^ 1)
37 #else
38 #define H1(x) (x)
39 #define H1_2(x) (x)
40 #define H1_4(x) (x)
41 #define H2(x) (x)
42 #define H4(x) (x)
43 #endif
45 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
47 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
48 * and bit 0 set if C is set. Compare the definitions of these variables
49 * within CPUARMState.
52 /* For no G bits set, NZCV = C. */
53 #define PREDTEST_INIT 1
55 /* This is an iterative function, called for each Pd and Pg word
56 * moving forward.
58 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
60 if (likely(g)) {
61 /* Compute N from first D & G.
62 Use bit 2 to signal first G bit seen. */
63 if (!(flags & 4)) {
64 flags |= ((d & (g & -g)) != 0) << 31;
65 flags |= 4;
68 /* Accumulate Z from each D & G. */
69 flags |= ((d & g) != 0) << 1;
71 /* Compute C from last !(D & G). Replace previous. */
72 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
74 return flags;
77 /* The same for a single word predicate. */
78 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
80 return iter_predtest_fwd(d, g, PREDTEST_INIT);
83 /* The same for a multi-word predicate. */
84 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
86 uint32_t flags = PREDTEST_INIT;
87 uint64_t *d = vd, *g = vg;
88 uintptr_t i = 0;
90 do {
91 flags = iter_predtest_fwd(d[i], g[i], flags);
92 } while (++i < words);
94 return flags;
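/* For illustration only (not part of the helpers themselves): given the
 * flags encoding described above PREDTEST_INIT, a caller could unpack
 * the result as
 *
 *     uint32_t flags = helper_sve_predtest1(d, g);
 *     bool n = flags >> 31;        N: first active element of D was set
 *     bool z = !(flags & 2);       Z: no active element of D was set
 *     bool c = flags & 1;          C: last active element of D was clear
 *
 * which lines up with the NF/ZF/CF conventions in CPUARMState.
 */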
97 /* Expand active predicate bits to bytes, for byte elements.
98 * for (i = 0; i < 256; ++i) {
99 * unsigned long m = 0;
100 * for (j = 0; j < 8; j++) {
101 * if ((i >> j) & 1) {
102 * m |= 0xfful << (j << 3);
105 * printf("0x%016lx,\n", m);
108 static inline uint64_t expand_pred_b(uint8_t byte)
110 static const uint64_t word[256] = {
111 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
112 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
113 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
114 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
115 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
116 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
117 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
118 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
119 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
120 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
121 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
122 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
123 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
124 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
125 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
126 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
127 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
128 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
129 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
130 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
131 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
132 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
133 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
134 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
135 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
136 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
137 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
138 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
139 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
140 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
141 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
142 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
143 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
144 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
145 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
146 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
147 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
148 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
149 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
150 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
151 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
152 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
153 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
154 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
155 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
156 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
157 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
158 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
159 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
160 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
161 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
162 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
163 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
164 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
165 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
166 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
167 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
168 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
169 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
170 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
171 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
172 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
173 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
174 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
175 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
176 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
177 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
178 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
179 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
180 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
181 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
182 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
183 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
184 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
185 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
186 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
187 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
188 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
189 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
190 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
191 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
192 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
193 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
194 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
195 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
196 0xffffffffffffffff,
198 return word[byte];
201 /* Similarly for half-word elements.
202 * for (i = 0; i < 256; ++i) {
203 * unsigned long m = 0;
204 * if (i & 0xaa) {
205 * continue;
207 * for (j = 0; j < 8; j += 2) {
208 * if ((i >> j) & 1) {
209 * m |= 0xfffful << (j << 3);
212 * printf("[0x%x] = 0x%016lx,\n", i, m);
215 static inline uint64_t expand_pred_h(uint8_t byte)
217 static const uint64_t word[] = {
218 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
219 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
220 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
221 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
222 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
223 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
224 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
225 [0x55] = 0xffffffffffffffff,
227 return word[byte & 0x55];
230 /* Similarly for single word elements. */
231 static inline uint64_t expand_pred_s(uint8_t byte)
233 static const uint64_t word[] = {
234 [0x01] = 0x00000000ffffffffull,
235 [0x10] = 0xffffffff00000000ull,
236 [0x11] = 0xffffffffffffffffull,
238 return word[byte & 0x11];
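/* A table-free equivalent, useful as a cross-check of the tables above
 * (illustrative sketch only; the name expand_pred_b_ref is just for this
 * sketch, the helpers themselves use the table lookups):
 *
 *     static uint64_t expand_pred_b_ref(uint8_t byte)
 *     {
 *         uint64_t r = 0;
 *         for (int j = 0; j < 8; j++) {
 *             if ((byte >> j) & 1) {
 *                 r |= 0xffull << (j * 8);
 *             }
 *         }
 *         return r;
 *     }
 *
 * expand_pred_h and expand_pred_s are the same idea with 16-bit and
 * 32-bit units, consulting only every 2nd and every 4th predicate bit.
 */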
241 /* Swap 16-bit words within a 32-bit word. */
242 static inline uint32_t hswap32(uint32_t h)
244 return rol32(h, 16);
247 /* Swap 16-bit words within a 64-bit word. */
248 static inline uint64_t hswap64(uint64_t h)
250 uint64_t m = 0x0000ffff0000ffffull;
251 h = rol64(h, 32);
252 return ((h & m) << 16) | ((h >> 16) & m);
255 /* Swap 32-bit words within a 64-bit word. */
256 static inline uint64_t wswap64(uint64_t h)
258 return rol64(h, 32);
261 #define LOGICAL_PPPP(NAME, FUNC) \
262 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
264 uintptr_t opr_sz = simd_oprsz(desc); \
265 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
266 uintptr_t i; \
267 for (i = 0; i < opr_sz / 8; ++i) { \
268 d[i] = FUNC(n[i], m[i], g[i]); \
272 #define DO_AND(N, M, G) (((N) & (M)) & (G))
273 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
274 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
275 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
276 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
277 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
278 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
279 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
281 LOGICAL_PPPP(sve_and_pppp, DO_AND)
282 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
283 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
284 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
285 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
286 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
287 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
288 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
290 #undef DO_AND
291 #undef DO_BIC
292 #undef DO_EOR
293 #undef DO_ORR
294 #undef DO_ORN
295 #undef DO_NOR
296 #undef DO_NAND
297 #undef DO_SEL
298 #undef LOGICAL_PPPP
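/* For reference, LOGICAL_PPPP(sve_and_pppp, DO_AND) above expands to
 * (roughly, modulo formatting):
 *
 *     void helper_sve_and_pppp(void *vd, void *vn, void *vm,
 *                              void *vg, uint32_t desc)
 *     {
 *         uintptr_t opr_sz = simd_oprsz(desc);
 *         uint64_t *d = vd, *n = vn, *m = vm, *g = vg;
 *         uintptr_t i;
 *         for (i = 0; i < opr_sz / 8; ++i) {
 *             d[i] = (n[i] & m[i]) & g[i];
 *         }
 *     }
 *
 * i.e. predicate registers are operated on 64 bits at a time, with the
 * governing predicate G applied as a final mask.
 */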
300 /* Fully general three-operand expander, controlled by a predicate.
301 * This is complicated by the host-endian storage of the register file.
303 /* ??? I don't expect the compiler could ever vectorize this itself.
304 * With some tables we can convert bit masks to byte masks, and with
305 * extra care wrt byte/word ordering we could use gcc generic vectors
306 * and do 16 bytes at a time.
308 #define DO_ZPZZ(NAME, TYPE, H, OP) \
309 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
311 intptr_t i, opr_sz = simd_oprsz(desc); \
312 for (i = 0; i < opr_sz; ) { \
313 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
314 do { \
315 if (pg & 1) { \
316 TYPE nn = *(TYPE *)(vn + H(i)); \
317 TYPE mm = *(TYPE *)(vm + H(i)); \
318 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
320 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
321 } while (i & 15); \
325 /* Similarly, specialized for 64-bit operands. */
326 #define DO_ZPZZ_D(NAME, TYPE, OP) \
327 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
329 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
330 TYPE *d = vd, *n = vn, *m = vm; \
331 uint8_t *pg = vg; \
332 for (i = 0; i < opr_sz; i += 1) { \
333 if (pg[H1(i)] & 1) { \
334 TYPE nn = n[i], mm = m[i]; \
335 d[i] = OP(nn, mm); \
340 #define DO_AND(N, M) (N & M)
341 #define DO_EOR(N, M) (N ^ M)
342 #define DO_ORR(N, M) (N | M)
343 #define DO_BIC(N, M) (N & ~M)
344 #define DO_ADD(N, M) (N + M)
345 #define DO_SUB(N, M) (N - M)
346 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
347 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
348 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
349 #define DO_MUL(N, M) (N * M)
350 #define DO_DIV(N, M) (M ? N / M : 0)
352 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
353 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
354 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
355 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
357 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
358 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
359 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
360 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
362 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
363 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
364 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
365 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
367 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
368 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
369 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
370 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
372 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
373 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
374 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
375 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
377 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
378 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
379 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
380 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
382 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
383 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
384 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
385 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
387 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
388 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
389 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
390 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
392 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
393 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
394 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
395 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
397 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
398 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
399 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
400 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
402 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
403 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
404 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
405 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
407 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
408 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
409 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
410 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
412 /* Because the computation type is at least twice as large as required,
413 these work for both signed and unsigned source types. */
414 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
416 return (n * m) >> 8;
419 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
421 return (n * m) >> 16;
424 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
426 return (n * m) >> 32;
429 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
431 uint64_t lo, hi;
432 muls64(&lo, &hi, n, m);
433 return hi;
436 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
438 uint64_t lo, hi;
439 mulu64(&lo, &hi, n, m);
440 return hi;
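/* Worked example for the high-multiply helpers: do_mulh_b(-2, 3) computes
 * the exact 32-bit product -6, and -6 >> 8 == -1, which the uint8_t return
 * type truncates to 0xff -- the high byte of the 16-bit signed product
 * 0xfffa.  For the unsigned case, 200 * 3 = 600 = 0x258 yields 0x02, the
 * high byte of the unsigned product.  Hence one helper per size serves
 * both SMULH and UMULH for elements narrower than 64 bits; only the
 * 64-bit case needs the separate muls64/mulu64 helpers above.
 */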
443 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
444 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
445 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
446 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
448 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
449 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
450 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
451 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
453 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
454 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
455 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
456 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
458 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_DIV)
459 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_DIV)
461 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_DIV)
462 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_DIV)
464 /* Note that all bits of the shift are significant
465 and not modulo the element size. */
466 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
467 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
468 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
470 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
471 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
472 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
474 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
475 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
476 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
478 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
479 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
480 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
482 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
483 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
484 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
486 #undef DO_ZPZZ
487 #undef DO_ZPZZ_D
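/* A note on the predicate layout used by DO_ZPZZ and friends: an SVE
 * predicate register has one bit per byte of the vector, and for an
 * element of size k bytes only the lowest of its k bits is significant.
 * The expanders above therefore load 16 predicate bits per 16 bytes of
 * vector, test bit 0, and then advance with
 *
 *     i += sizeof(TYPE), pg >>= sizeof(TYPE);
 *
 * so that e.g. for 32-bit elements only predicate bits 0, 4, 8 and 12 of
 * each 16-bit chunk are ever consulted.
 */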
489 /* Three-operand expander, controlled by a predicate, in which the
490 * third operand is "wide". That is, for D = N op M, the same 64-bit
491 * value of M is used with all of the narrower values of N.
493 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
494 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
496 intptr_t i, opr_sz = simd_oprsz(desc); \
497 for (i = 0; i < opr_sz; ) { \
498 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
499 TYPEW mm = *(TYPEW *)(vm + i); \
500 do { \
501 if (pg & 1) { \
502 TYPE nn = *(TYPE *)(vn + H(i)); \
503 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
505 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
506 } while (i & 7); \
510 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
511 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
512 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
514 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
515 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
516 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
518 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
519 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
520 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
522 #undef DO_ZPZW
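/* Note the structure of DO_ZPZW above: the inner loop covers exactly one
 * 64-bit chunk (i & 7), so a single 64-bit element mm of Zm is applied to
 * every narrower element of Zn sharing that chunk.  This matches the
 * wide-element forms such as ASR <Zdn>.B, <Pg>/M, <Zdn>.B, <Zm>.D, where
 * each byte is shifted by the 64-bit element occupying the same slot.
 */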
524 /* Fully general two-operand expander, controlled by a predicate.
526 #define DO_ZPZ(NAME, TYPE, H, OP) \
527 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
529 intptr_t i, opr_sz = simd_oprsz(desc); \
530 for (i = 0; i < opr_sz; ) { \
531 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
532 do { \
533 if (pg & 1) { \
534 TYPE nn = *(TYPE *)(vn + H(i)); \
535 *(TYPE *)(vd + H(i)) = OP(nn); \
537 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
538 } while (i & 15); \
542 /* Similarly, specialized for 64-bit operands. */
543 #define DO_ZPZ_D(NAME, TYPE, OP) \
544 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
546 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
547 TYPE *d = vd, *n = vn; \
548 uint8_t *pg = vg; \
549 for (i = 0; i < opr_sz; i += 1) { \
550 if (pg[H1(i)] & 1) { \
551 TYPE nn = n[i]; \
552 d[i] = OP(nn); \
557 #define DO_CLS_B(N) (clrsb32(N) - 24)
558 #define DO_CLS_H(N) (clrsb32(N) - 16)
560 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
561 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
562 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
563 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
565 #define DO_CLZ_B(N) (clz32(N) - 24)
566 #define DO_CLZ_H(N) (clz32(N) - 16)
568 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
569 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
570 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
571 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
573 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
574 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
575 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
576 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
578 #define DO_CNOT(N) (N == 0)
580 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
581 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
582 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
583 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
585 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
587 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
588 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
589 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
591 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
593 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
594 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
595 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
597 #define DO_NOT(N) (~N)
599 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
600 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
601 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
602 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
604 #define DO_SXTB(N) ((int8_t)N)
605 #define DO_SXTH(N) ((int16_t)N)
606 #define DO_SXTS(N) ((int32_t)N)
607 #define DO_UXTB(N) ((uint8_t)N)
608 #define DO_UXTH(N) ((uint16_t)N)
609 #define DO_UXTS(N) ((uint32_t)N)
611 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
612 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
613 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
614 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
615 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
616 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
618 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
619 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
620 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
621 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
622 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
623 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
625 #define DO_ABS(N) (N < 0 ? -N : N)
627 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
628 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
629 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
630 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
632 #define DO_NEG(N) (-N)
634 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
635 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
636 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
637 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
639 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
640 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
641 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
643 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
644 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
646 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
648 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
649 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
650 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
651 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
653 /* Three-operand expander, unpredicated, in which the third operand is "wide".
655 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
656 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
658 intptr_t i, opr_sz = simd_oprsz(desc); \
659 for (i = 0; i < opr_sz; ) { \
660 TYPEW mm = *(TYPEW *)(vm + i); \
661 do { \
662 TYPE nn = *(TYPE *)(vn + H(i)); \
663 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
664 i += sizeof(TYPE); \
665 } while (i & 7); \
669 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
670 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
671 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
673 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
674 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
675 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
677 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
678 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
679 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
681 #undef DO_ZZW
683 #undef DO_CLS_B
684 #undef DO_CLS_H
685 #undef DO_CLZ_B
686 #undef DO_CLZ_H
687 #undef DO_CNOT
688 #undef DO_FABS
689 #undef DO_FNEG
690 #undef DO_ABS
691 #undef DO_NEG
692 #undef DO_ZPZ
693 #undef DO_ZPZ_D
695 /* Two-operand reduction expander, controlled by a predicate.
696 * The difference between TYPERED and TYPERET has to do with
697 * sign-extension. E.g. for SMAX, TYPERED must be signed,
698 * but TYPERET must be unsigned so that e.g. a 32-bit value
699 * is not sign-extended to the ABI uint64_t return type.
701 /* ??? If we were to vectorize this by hand the reduction ordering
702 * would change. For integer operands, this is perfectly fine.
704 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
705 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
707 intptr_t i, opr_sz = simd_oprsz(desc); \
708 TYPERED ret = INIT; \
709 for (i = 0; i < opr_sz; ) { \
710 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
711 do { \
712 if (pg & 1) { \
713 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
714 ret = OP(ret, nn); \
716 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
717 } while (i & 15); \
719 return (TYPERET)ret; \
722 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
723 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
725 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
726 TYPEE *n = vn; \
727 uint8_t *pg = vg; \
728 TYPER ret = INIT; \
729 for (i = 0; i < opr_sz; i += 1) { \
730 if (pg[H1(i)] & 1) { \
731 TYPEE nn = n[i]; \
732 ret = OP(ret, nn); \
735 return ret; \
738 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
739 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
740 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
741 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
743 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
744 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
745 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
746 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
748 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
749 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
750 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
751 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
753 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
754 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
755 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
757 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
758 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
759 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
760 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
762 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
763 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
764 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
765 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
767 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
768 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
769 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
770 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
772 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
773 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
774 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
775 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
777 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
778 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
779 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
780 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
782 #undef DO_VPZ
783 #undef DO_VPZ_D
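/* Concrete example of the TYPERED/TYPERET distinction documented above:
 * sve_smaxv_b reduces with TYPERED = int8_t so that DO_MAX compares
 * signed values, but returns through TYPERET = uint8_t.  A result of -1
 * therefore comes back from the helper as 0x00000000000000ff rather than
 * 0xffffffffffffffff; any further sign-extension is left to the caller.
 */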
785 #undef DO_AND
786 #undef DO_ORR
787 #undef DO_EOR
788 #undef DO_BIC
789 #undef DO_ADD
790 #undef DO_SUB
791 #undef DO_MAX
792 #undef DO_MIN
793 #undef DO_ABD
794 #undef DO_MUL
795 #undef DO_DIV
796 #undef DO_ASR
797 #undef DO_LSR
798 #undef DO_LSL
800 /* Similar to the ARM LastActiveElement pseudocode function, except the
801 result is multiplied by the element size. This includes the not found
802 indication; e.g. not found for esz=3 is -8. */
803 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
805 uint64_t mask = pred_esz_masks[esz];
806 intptr_t i = words;
808 do {
809 uint64_t this_g = g[--i] & mask;
810 if (this_g) {
811 return i * 64 + (63 - clz64(this_g));
813 } while (i > 0);
814 return (intptr_t)-1 << esz;
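/* Example: for esz == 2 (word elements), a predicate whose last active
 * element is element 3 has its highest relevant bit at bit position 12,
 * so the function returns 12 == 3 << 2, i.e. the byte offset of that
 * element.  With no relevant bits set it returns -4, the "not found"
 * indication scaled the same way.
 */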
817 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
819 uint32_t flags = PREDTEST_INIT;
820 uint64_t *d = vd, *g = vg;
821 intptr_t i = 0;
823 do {
824 uint64_t this_d = d[i];
825 uint64_t this_g = g[i];
827 if (this_g) {
828 if (!(flags & 4)) {
829 /* Set in D the first bit of G. */
830 this_d |= this_g & -this_g;
831 d[i] = this_d;
833 flags = iter_predtest_fwd(this_d, this_g, flags);
835 } while (++i < words);
837 return flags;
840 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
842 intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
843 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
844 uint32_t flags = PREDTEST_INIT;
845 uint64_t *d = vd, *g = vg, esz_mask;
846 intptr_t i, next;
848 next = last_active_element(vd, words, esz) + (1 << esz);
849 esz_mask = pred_esz_masks[esz];
851 /* Similar to the pseudocode for pnext, but scaled by ESZ
852 so that we find the correct bit. */
853 if (next < words * 64) {
854 uint64_t mask = -1;
856 if (next & 63) {
857 mask = ~((1ull << (next & 63)) - 1);
858 next &= -64;
860 do {
861 uint64_t this_g = g[next / 64] & esz_mask & mask;
862 if (this_g != 0) {
863 next = (next & -64) + ctz64(this_g);
864 break;
866 next += 64;
867 mask = -1;
868 } while (next < words * 64);
871 i = 0;
872 do {
873 uint64_t this_d = 0;
874 if (i == next / 64) {
875 this_d = 1ull << (next & 63);
877 d[i] = this_d;
878 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
879 } while (++i < words);
881 return flags;
884 /* Store zero into every active element of Zd. We will use this for two
885 * and three-operand predicated instructions for which logic dictates a
886 * zero result. In particular, logical shift by element size, which is
887 * otherwise undefined on the host.
889 * For element sizes smaller than uint64_t, we use tables to expand
890 * the N bits of the controlling predicate to a byte mask, and clear
891 * those bytes.
893 void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
895 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
896 uint64_t *d = vd;
897 uint8_t *pg = vg;
898 for (i = 0; i < opr_sz; i += 1) {
899 d[i] &= ~expand_pred_b(pg[H1(i)]);
903 void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
905 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
906 uint64_t *d = vd;
907 uint8_t *pg = vg;
908 for (i = 0; i < opr_sz; i += 1) {
909 d[i] &= ~expand_pred_h(pg[H1(i)]);
913 void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
915 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
916 uint64_t *d = vd;
917 uint8_t *pg = vg;
918 for (i = 0; i < opr_sz; i += 1) {
919 d[i] &= ~expand_pred_s(pg[H1(i)]);
923 void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
925 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
926 uint64_t *d = vd;
927 uint8_t *pg = vg;
928 for (i = 0; i < opr_sz; i += 1) {
929 if (pg[H1(i)] & 1) {
930 d[i] = 0;
935 /* Three-operand expander, immediate operand, controlled by a predicate.
937 #define DO_ZPZI(NAME, TYPE, H, OP) \
938 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
940 intptr_t i, opr_sz = simd_oprsz(desc); \
941 TYPE imm = simd_data(desc); \
942 for (i = 0; i < opr_sz; ) { \
943 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
944 do { \
945 if (pg & 1) { \
946 TYPE nn = *(TYPE *)(vn + H(i)); \
947 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
949 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
950 } while (i & 15); \
954 /* Similarly, specialized for 64-bit operands. */
955 #define DO_ZPZI_D(NAME, TYPE, OP) \
956 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
958 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
959 TYPE *d = vd, *n = vn; \
960 TYPE imm = simd_data(desc); \
961 uint8_t *pg = vg; \
962 for (i = 0; i < opr_sz; i += 1) { \
963 if (pg[H1(i)] & 1) { \
964 TYPE nn = n[i]; \
965 d[i] = OP(nn, imm); \
970 #define DO_SHR(N, M) (N >> M)
971 #define DO_SHL(N, M) (N << M)
973 /* Arithmetic shift right for division. This rounds negative numbers
974 toward zero as per signed division. Therefore before shifting,
975 when N is negative, add 2**M-1. */
976 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
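/* Worked example: DO_ASRD(-7, 2) computes (-7 + 3) >> 2 = -1, matching
 * -7 / 4 truncated toward zero, whereas a plain -7 >> 2 would give -2
 * (rounding toward minus infinity).  For non-negative N nothing is added
 * and the result is the ordinary shift.
 */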
978 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
979 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
980 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
981 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
983 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
984 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
985 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
986 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
988 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
989 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
990 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
991 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
993 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
994 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
995 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
996 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
998 #undef DO_SHR
999 #undef DO_SHL
1000 #undef DO_ASRD
1001 #undef DO_ZPZI
1002 #undef DO_ZPZI_D
1004 /* Fully general four-operand expander, controlled by a predicate.
1006 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
1007 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1008 void *vg, uint32_t desc) \
1010 intptr_t i, opr_sz = simd_oprsz(desc); \
1011 for (i = 0; i < opr_sz; ) { \
1012 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1013 do { \
1014 if (pg & 1) { \
1015 TYPE nn = *(TYPE *)(vn + H(i)); \
1016 TYPE mm = *(TYPE *)(vm + H(i)); \
1017 TYPE aa = *(TYPE *)(va + H(i)); \
1018 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1020 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1021 } while (i & 15); \
1025 /* Similarly, specialized for 64-bit operands. */
1026 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
1027 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1028 void *vg, uint32_t desc) \
1030 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1031 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1032 uint8_t *pg = vg; \
1033 for (i = 0; i < opr_sz; i += 1) { \
1034 if (pg[H1(i)] & 1) { \
1035 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1036 d[i] = OP(aa, nn, mm); \
1041 #define DO_MLA(A, N, M) (A + N * M)
1042 #define DO_MLS(A, N, M) (A - N * M)
1044 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1045 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1047 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1048 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1050 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1051 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1053 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1054 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1056 #undef DO_MLA
1057 #undef DO_MLS
1058 #undef DO_ZPZZZ
1059 #undef DO_ZPZZZ_D
1061 void HELPER(sve_index_b)(void *vd, uint32_t start,
1062 uint32_t incr, uint32_t desc)
1064 intptr_t i, opr_sz = simd_oprsz(desc);
1065 uint8_t *d = vd;
1066 for (i = 0; i < opr_sz; i += 1) {
1067 d[H1(i)] = start + i * incr;
1071 void HELPER(sve_index_h)(void *vd, uint32_t start,
1072 uint32_t incr, uint32_t desc)
1074 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1075 uint16_t *d = vd;
1076 for (i = 0; i < opr_sz; i += 1) {
1077 d[H2(i)] = start + i * incr;
1081 void HELPER(sve_index_s)(void *vd, uint32_t start,
1082 uint32_t incr, uint32_t desc)
1084 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1085 uint32_t *d = vd;
1086 for (i = 0; i < opr_sz; i += 1) {
1087 d[H4(i)] = start + i * incr;
1091 void HELPER(sve_index_d)(void *vd, uint64_t start,
1092 uint64_t incr, uint32_t desc)
1094 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1095 uint64_t *d = vd;
1096 for (i = 0; i < opr_sz; i += 1) {
1097 d[i] = start + i * incr;
1101 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1103 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1104 uint32_t sh = simd_data(desc);
1105 uint32_t *d = vd, *n = vn, *m = vm;
1106 for (i = 0; i < opr_sz; i += 1) {
1107 d[i] = n[i] + (m[i] << sh);
1111 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1113 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1114 uint64_t sh = simd_data(desc);
1115 uint64_t *d = vd, *n = vn, *m = vm;
1116 for (i = 0; i < opr_sz; i += 1) {
1117 d[i] = n[i] + (m[i] << sh);
1121 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1123 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1124 uint64_t sh = simd_data(desc);
1125 uint64_t *d = vd, *n = vn, *m = vm;
1126 for (i = 0; i < opr_sz; i += 1) {
1127 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1131 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1133 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1134 uint64_t sh = simd_data(desc);
1135 uint64_t *d = vd, *n = vn, *m = vm;
1136 for (i = 0; i < opr_sz; i += 1) {
1137 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1141 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1143 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1144 static const uint16_t coeff[] = {
1145 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1146 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1147 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1148 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1150 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1151 uint16_t *d = vd, *n = vn;
1153 for (i = 0; i < opr_sz; i++) {
1154 uint16_t nn = n[i];
1155 intptr_t idx = extract32(nn, 0, 5);
1156 uint16_t exp = extract32(nn, 5, 5);
1157 d[i] = coeff[idx] | (exp << 10);
1161 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1163 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1164 static const uint32_t coeff[] = {
1165 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1166 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1167 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1168 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1169 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1170 0x1ef532, 0x20b051, 0x227043, 0x243516,
1171 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1172 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1173 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1174 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1175 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1176 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1177 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1178 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1179 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1180 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1182 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1183 uint32_t *d = vd, *n = vn;
1185 for (i = 0; i < opr_sz; i++) {
1186 uint32_t nn = n[i];
1187 intptr_t idx = extract32(nn, 0, 6);
1188 uint32_t exp = extract32(nn, 6, 8);
1189 d[i] = coeff[idx] | (exp << 23);
1193 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1195 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1196 static const uint64_t coeff[] = {
1197 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1198 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1199 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1200 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1201 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1202 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1203 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1204 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1205 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1206 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1207 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1208 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1209 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1210 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1211 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1212 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1213 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1214 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1215 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1216 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1217 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1218 0xFA7C1819E90D8ull,
1220 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1221 uint64_t *d = vd, *n = vn;
1223 for (i = 0; i < opr_sz; i++) {
1224 uint64_t nn = n[i];
1225 intptr_t idx = extract32(nn, 0, 6);
1226 uint64_t exp = extract32(nn, 6, 11);
1227 d[i] = coeff[idx] | (exp << 52);
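/* Viewed as IEEE fields, coeff[idx] holds the fraction bits of 2**(idx/64),
 * and the remaining input bits are placed directly into the exponent field
 * (shifted up by 10, 23 or 52 bits for the three formats).  FEXPA thus
 * assembles an approximation of 2**x for use by the exponential and
 * trigonometric estimation sequences; see the FEXPA pseudocode in the
 * ARM ARM for the precise definition.
 */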
1231 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1233 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1234 uint16_t *d = vd, *n = vn, *m = vm;
1235 for (i = 0; i < opr_sz; i += 1) {
1236 uint16_t nn = n[i];
1237 uint16_t mm = m[i];
1238 if (mm & 1) {
1239 nn = float16_one;
1241 d[i] = nn ^ (mm & 2) << 14;
1245 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1247 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1248 uint32_t *d = vd, *n = vn, *m = vm;
1249 for (i = 0; i < opr_sz; i += 1) {
1250 uint32_t nn = n[i];
1251 uint32_t mm = m[i];
1252 if (mm & 1) {
1253 nn = float32_one;
1255 d[i] = nn ^ (mm & 2) << 30;
1259 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1261 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1262 uint64_t *d = vd, *n = vn, *m = vm;
1263 for (i = 0; i < opr_sz; i += 1) {
1264 uint64_t nn = n[i];
1265 uint64_t mm = m[i];
1266 if (mm & 1) {
1267 nn = float64_one;
1269 d[i] = nn ^ (mm & 2) << 62;
1274 * Signed saturating addition with scalar operand.
1277 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1279 intptr_t i, oprsz = simd_oprsz(desc);
1281 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1282 int r = *(int8_t *)(a + i) + b;
1283 if (r > INT8_MAX) {
1284 r = INT8_MAX;
1285 } else if (r < INT8_MIN) {
1286 r = INT8_MIN;
1288 *(int8_t *)(d + i) = r;
1292 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1294 intptr_t i, oprsz = simd_oprsz(desc);
1296 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1297 int r = *(int16_t *)(a + i) + b;
1298 if (r > INT16_MAX) {
1299 r = INT16_MAX;
1300 } else if (r < INT16_MIN) {
1301 r = INT16_MIN;
1303 *(int16_t *)(d + i) = r;
1307 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1309 intptr_t i, oprsz = simd_oprsz(desc);
1311 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1312 int64_t r = *(int32_t *)(a + i) + b;
1313 if (r > INT32_MAX) {
1314 r = INT32_MAX;
1315 } else if (r < INT32_MIN) {
1316 r = INT32_MIN;
1318 *(int32_t *)(d + i) = r;
1322 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1324 intptr_t i, oprsz = simd_oprsz(desc);
1326 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1327 int64_t ai = *(int64_t *)(a + i);
1328 int64_t r = ai + b;
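/* Overflow occurred iff ai and b have the same sign but r differs in
   sign from them; the expression below has its sign bit set exactly
   in that case. */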
1329 if (((r ^ ai) & ~(ai ^ b)) < 0) {
1330 /* Signed overflow. */
1331 r = (r < 0 ? INT64_MAX : INT64_MIN);
1333 *(int64_t *)(d + i) = r;
1338 * Unsigned saturating addition with scalar operand.
1341 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1343 intptr_t i, oprsz = simd_oprsz(desc);
1345 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1346 int r = *(uint8_t *)(a + i) + b;
1347 if (r > UINT8_MAX) {
1348 r = UINT8_MAX;
1349 } else if (r < 0) {
1350 r = 0;
1352 *(uint8_t *)(d + i) = r;
1356 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1358 intptr_t i, oprsz = simd_oprsz(desc);
1360 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1361 int r = *(uint16_t *)(a + i) + b;
1362 if (r > UINT16_MAX) {
1363 r = UINT16_MAX;
1364 } else if (r < 0) {
1365 r = 0;
1367 *(uint16_t *)(d + i) = r;
1371 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1373 intptr_t i, oprsz = simd_oprsz(desc);
1375 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1376 int64_t r = *(uint32_t *)(a + i) + b;
1377 if (r > UINT32_MAX) {
1378 r = UINT32_MAX;
1379 } else if (r < 0) {
1380 r = 0;
1382 *(uint32_t *)(d + i) = r;
1386 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1388 intptr_t i, oprsz = simd_oprsz(desc);
1390 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1391 uint64_t r = *(uint64_t *)(a + i) + b;
1392 if (r < b) {
1393 r = UINT64_MAX;
1395 *(uint64_t *)(d + i) = r;
1399 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1401 intptr_t i, oprsz = simd_oprsz(desc);
1403 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1404 uint64_t ai = *(uint64_t *)(a + i);
1405 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1409 /* Two operand predicated copy immediate with merge. All valid immediates
1410 * can fit within 17 signed bits in the simd_data field.
1412 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1413 uint64_t mm, uint32_t desc)
1415 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1416 uint64_t *d = vd, *n = vn;
1417 uint8_t *pg = vg;
1419 mm = dup_const(MO_8, mm);
1420 for (i = 0; i < opr_sz; i += 1) {
1421 uint64_t nn = n[i];
1422 uint64_t pp = expand_pred_b(pg[H1(i)]);
1423 d[i] = (mm & pp) | (nn & ~pp);
1427 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1428 uint64_t mm, uint32_t desc)
1430 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1431 uint64_t *d = vd, *n = vn;
1432 uint8_t *pg = vg;
1434 mm = dup_const(MO_16, mm);
1435 for (i = 0; i < opr_sz; i += 1) {
1436 uint64_t nn = n[i];
1437 uint64_t pp = expand_pred_h(pg[H1(i)]);
1438 d[i] = (mm & pp) | (nn & ~pp);
1442 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1443 uint64_t mm, uint32_t desc)
1445 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1446 uint64_t *d = vd, *n = vn;
1447 uint8_t *pg = vg;
1449 mm = dup_const(MO_32, mm);
1450 for (i = 0; i < opr_sz; i += 1) {
1451 uint64_t nn = n[i];
1452 uint64_t pp = expand_pred_s(pg[H1(i)]);
1453 d[i] = (mm & pp) | (nn & ~pp);
1457 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1458 uint64_t mm, uint32_t desc)
1460 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1461 uint64_t *d = vd, *n = vn;
1462 uint8_t *pg = vg;
1464 for (i = 0; i < opr_sz; i += 1) {
1465 uint64_t nn = n[i];
1466 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1470 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1472 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1473 uint64_t *d = vd;
1474 uint8_t *pg = vg;
1476 val = dup_const(MO_8, val);
1477 for (i = 0; i < opr_sz; i += 1) {
1478 d[i] = val & expand_pred_b(pg[H1(i)]);
1482 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1484 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1485 uint64_t *d = vd;
1486 uint8_t *pg = vg;
1488 val = dup_const(MO_16, val);
1489 for (i = 0; i < opr_sz; i += 1) {
1490 d[i] = val & expand_pred_h(pg[H1(i)]);
1494 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1496 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1497 uint64_t *d = vd;
1498 uint8_t *pg = vg;
1500 val = dup_const(MO_32, val);
1501 for (i = 0; i < opr_sz; i += 1) {
1502 d[i] = val & expand_pred_s(pg[H1(i)]);
1506 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1508 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1509 uint64_t *d = vd;
1510 uint8_t *pg = vg;
1512 for (i = 0; i < opr_sz; i += 1) {
1513 d[i] = (pg[H1(i)] & 1 ? val : 0);
1517 /* Big-endian hosts need to frob the byte indices. If the copy
1518 * happens to be 8-byte aligned, then no frobbing is necessary.
1520 static void swap_memmove(void *vd, void *vs, size_t n)
1522 uintptr_t d = (uintptr_t)vd;
1523 uintptr_t s = (uintptr_t)vs;
1524 uintptr_t o = (d | s | n) & 7;
1525 size_t i;
1527 #ifndef HOST_WORDS_BIGENDIAN
1528 o = 0;
1529 #endif
1530 switch (o) {
1531 case 0:
1532 memmove(vd, vs, n);
1533 break;
1535 case 4:
1536 if (d < s || d >= s + n) {
1537 for (i = 0; i < n; i += 4) {
1538 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1540 } else {
1541 for (i = n; i > 0; ) {
1542 i -= 4;
1543 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1546 break;
1548 case 2:
1549 case 6:
1550 if (d < s || d >= s + n) {
1551 for (i = 0; i < n; i += 2) {
1552 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1554 } else {
1555 for (i = n; i > 0; ) {
1556 i -= 2;
1557 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1560 break;
1562 default:
1563 if (d < s || d >= s + n) {
1564 for (i = 0; i < n; i++) {
1565 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1567 } else {
1568 for (i = n; i > 0; ) {
1569 i -= 1;
1570 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1573 break;
1577 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1579 intptr_t opr_sz = simd_oprsz(desc);
1580 size_t n_ofs = simd_data(desc);
1581 size_t n_siz = opr_sz - n_ofs;
1583 if (vd != vm) {
1584 swap_memmove(vd, vn + n_ofs, n_siz);
1585 swap_memmove(vd + n_siz, vm, n_ofs);
1586 } else if (vd != vn) {
1587 swap_memmove(vd + n_siz, vd, n_ofs);
1588 swap_memmove(vd, vn + n_ofs, n_siz);
1589 } else {
1590 /* vd == vn == vm. Need temp space. */
1591 ARMVectorReg tmp;
1592 swap_memmove(&tmp, vm, n_ofs);
1593 swap_memmove(vd, vd + n_ofs, n_siz);
1594 memcpy(vd + n_siz, &tmp, n_ofs);
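/* In other words, EXT forms the 2*VL-byte concatenation Zm:Zn (with Zn in
 * the low half), and the result is the VL-byte window starting n_ofs bytes
 * into it: the top of Zn followed by the bottom of Zm.  The three cases
 * above differ only in how they avoid clobbering an input that overlaps
 * the destination.
 */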
1598 #define DO_INSR(NAME, TYPE, H) \
1599 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1601 intptr_t opr_sz = simd_oprsz(desc); \
1602 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1603 *(TYPE *)(vd + H(0)) = val; \
1606 DO_INSR(sve_insr_b, uint8_t, H1)
1607 DO_INSR(sve_insr_h, uint16_t, H1_2)
1608 DO_INSR(sve_insr_s, uint32_t, H1_4)
1609 DO_INSR(sve_insr_d, uint64_t, )
1611 #undef DO_INSR
1613 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1615 intptr_t i, j, opr_sz = simd_oprsz(desc);
1616 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1617 uint64_t f = *(uint64_t *)(vn + i);
1618 uint64_t b = *(uint64_t *)(vn + j);
1619 *(uint64_t *)(vd + i) = bswap64(b);
1620 *(uint64_t *)(vd + j) = bswap64(f);
1624 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1626 intptr_t i, j, opr_sz = simd_oprsz(desc);
1627 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1628 uint64_t f = *(uint64_t *)(vn + i);
1629 uint64_t b = *(uint64_t *)(vn + j);
1630 *(uint64_t *)(vd + i) = hswap64(b);
1631 *(uint64_t *)(vd + j) = hswap64(f);
1635 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1637 intptr_t i, j, opr_sz = simd_oprsz(desc);
1638 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1639 uint64_t f = *(uint64_t *)(vn + i);
1640 uint64_t b = *(uint64_t *)(vn + j);
1641 *(uint64_t *)(vd + i) = rol64(b, 32);
1642 *(uint64_t *)(vd + j) = rol64(f, 32);
1646 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1648 intptr_t i, j, opr_sz = simd_oprsz(desc);
1649 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1650 uint64_t f = *(uint64_t *)(vn + i);
1651 uint64_t b = *(uint64_t *)(vn + j);
1652 *(uint64_t *)(vd + i) = b;
1653 *(uint64_t *)(vd + j) = f;
1657 #define DO_TBL(NAME, TYPE, H) \
1658 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1660 intptr_t i, opr_sz = simd_oprsz(desc); \
1661 uintptr_t elem = opr_sz / sizeof(TYPE); \
1662 TYPE *d = vd, *n = vn, *m = vm; \
1663 ARMVectorReg tmp; \
1664 if (unlikely(vd == vn)) { \
1665 n = memcpy(&tmp, vn, opr_sz); \
1667 for (i = 0; i < elem; i++) { \
1668 TYPE j = m[H(i)]; \
1669 d[H(i)] = j < elem ? n[H(j)] : 0; \
1673 DO_TBL(sve_tbl_b, uint8_t, H1)
1674 DO_TBL(sve_tbl_h, uint16_t, H2)
1675 DO_TBL(sve_tbl_s, uint32_t, H4)
1676 DO_TBL(sve_tbl_d, uint64_t, )
1678 #undef DO_TBL
1680 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1681 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1683 intptr_t i, opr_sz = simd_oprsz(desc); \
1684 TYPED *d = vd; \
1685 TYPES *n = vn; \
1686 ARMVectorReg tmp; \
1687 if (unlikely(vn - vd < opr_sz)) { \
1688 n = memcpy(&tmp, n, opr_sz / 2); \
1690 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1691 d[HD(i)] = n[HS(i)]; \
1695 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1696 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1697 DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1699 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1700 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1701 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1703 #undef DO_UNPK
1705 /* Mask of bits included in the even numbered predicates of width esz.
1706 * We also use this for expand_bits/compress_bits, and so extend the
1707 * same pattern out to 16-bit units.
1709 static const uint64_t even_bit_esz_masks[5] = {
1710 0x5555555555555555ull,
1711 0x3333333333333333ull,
1712 0x0f0f0f0f0f0f0f0full,
1713 0x00ff00ff00ff00ffull,
1714 0x0000ffff0000ffffull,
1717 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1718 * For N==0, this corresponds to the operation that in qemu/bitops.h
1719 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1720 * section 7-2 Shuffling Bits.
1722 static uint64_t expand_bits(uint64_t x, int n)
1724 int i;
1726 x &= 0xffffffffu;
1727 for (i = 4; i >= n; i--) {
1728 int sh = 1 << i;
1729 x = ((x << sh) | x) & even_bit_esz_masks[i];
1731 return x;
1734 /* Compress units of 2**(N+1) bits to units of 2**N bits.
1735 * For N==0, this corresponds to the operation that in qemu/bitops.h
1736 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1737 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1739 static uint64_t compress_bits(uint64_t x, int n)
1741 int i;
1743 for (i = n; i <= 4; i++) {
1744 int sh = 1 << i;
1745 x &= even_bit_esz_masks[i];
1746 x = (x >> sh) | x;
1748 return x & 0xffffffffu;
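/* Example, for n == 0: expand_bits(0x0b, 0) spreads the bits 1011 to
 * every other position, giving 0x45 (binary 01000101), and
 * compress_bits(0x45, 0) == 0x0b undoes it.  The zip/uzp predicate
 * helpers below use exactly this pair, with n selecting the element
 * size, to interleave and de-interleave predicate bits.
 */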
1751 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1753 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1754 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1755 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1756 uint64_t *d = vd;
1757 intptr_t i;
1759 if (oprsz <= 8) {
1760 uint64_t nn = *(uint64_t *)vn;
1761 uint64_t mm = *(uint64_t *)vm;
1762 int half = 4 * oprsz;
1764 nn = extract64(nn, high * half, half);
1765 mm = extract64(mm, high * half, half);
1766 nn = expand_bits(nn, esz);
1767 mm = expand_bits(mm, esz);
1768 d[0] = nn + (mm << (1 << esz));
1769 } else {
1770 ARMPredicateReg tmp_n, tmp_m;
1772 /* We produce output faster than we consume input.
1773 Therefore we must be mindful of possible overlap. */
1774 if ((vn - vd) < (uintptr_t)oprsz) {
1775 vn = memcpy(&tmp_n, vn, oprsz);
1777 if ((vm - vd) < (uintptr_t)oprsz) {
1778 vm = memcpy(&tmp_m, vm, oprsz);
1780 if (high) {
1781 high = oprsz >> 1;
1784 if ((high & 3) == 0) {
1785 uint32_t *n = vn, *m = vm;
1786 high >>= 2;
1788 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1789 uint64_t nn = n[H4(high + i)];
1790 uint64_t mm = m[H4(high + i)];
1792 nn = expand_bits(nn, esz);
1793 mm = expand_bits(mm, esz);
1794 d[i] = nn + (mm << (1 << esz));
1796 } else {
1797 uint8_t *n = vn, *m = vm;
1798 uint16_t *d16 = vd;
1800 for (i = 0; i < oprsz / 2; i++) {
1801 uint16_t nn = n[H1(high + i)];
1802 uint16_t mm = m[H1(high + i)];
1804 nn = expand_bits(nn, esz);
1805 mm = expand_bits(mm, esz);
1806 d16[H2(i)] = nn + (mm << (1 << esz));

void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t l, h;
    intptr_t i;

    if (oprsz <= 8) {
        l = compress_bits(n[0] >> odd, esz);
        h = compress_bits(m[0] >> odd, esz);
        d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
    } else {
        ARMPredicateReg tmp_m;
        intptr_t oprsz_16 = oprsz / 16;

        if ((vm - vd) < (uintptr_t)oprsz) {
            m = memcpy(&tmp_m, vm, oprsz);
        }

        for (i = 0; i < oprsz_16; i++) {
            l = n[2 * i + 0];
            h = n[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            d[i] = l + (h << 32);
        }

        /* When OPRSZ is not a multiple of 16, the results from M do not
           align nicely with the uint64_t for D.  Put the aligned results
           from M into TMP_M and then copy it into place afterward.  */
        if (oprsz & 15) {
            d[i] = compress_bits(n[2 * i] >> odd, esz);

            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                tmp_m.p[i] = l + (h << 32);
            }
            tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);

            swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
        } else {
            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                d[oprsz_16 + i] = l + (h << 32);
            }
        }
    }
}
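
/* For byte elements (esz == 0), odd == 0 selects the even-numbered
 * predicate bits (UZP1) and odd == 1 selects the odd-numbered bits
 * (UZP2); the compressed bits of Pn form the low half of the result
 * and those of Pm the high half.  E.g. with oprsz == 2, Pn = 0x0041
 * and Pm = 0x0014, the even-bit case produces d[0] = 0x0609.
 */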

void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t mask;
    int shr, shl;
    intptr_t i;

    shl = 1 << esz;
    shr = 0;
    mask = even_bit_esz_masks[esz];
    if (odd) {
        mask <<= shl;
        shr = shl;
        shl = 0;
    }

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
        uint64_t nn = (n[i] & mask) >> shr;
        uint64_t mm = (m[i] & mask) << shl;
        d[i] = nn + mm;
    }
}
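
/* For byte elements (esz == 0) and odd == 0 (TRN1), the result takes the
 * even-numbered flags of Pn in the even positions and the even-numbered
 * flags of Pm in the odd positions; e.g. Pn = 0b0110, Pm = 0b0011 gives
 * d = 0b0110.  With odd == 1 (TRN2) the odd-numbered flags are used
 * instead.
 */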

/* Reverse units of 2**N bits.  */
static uint64_t reverse_bits_64(uint64_t x, int n)
{
    int i, sh;

    x = bswap64(x);
    for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
        uint64_t mask = even_bit_esz_masks[i];
        x = ((x & mask) << sh) | ((x >> sh) & mask);
    }
    return x;
}

static uint8_t reverse_bits_8(uint8_t x, int n)
{
    static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
    int i, sh;

    for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
        x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
    }
    return x;
}
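
/* For n == 0 these reverse individual bits, e.g.
 * reverse_bits_8(0x01, 0) == 0x80 and
 * reverse_bits_64(1, 0) == 0x8000000000000000.
 * For n == 2, reverse_bits_8 simply swaps the two nibbles, and for
 * n == 3, reverse_bits_64 reduces to bswap64 because the loop body
 * never runs.
 */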

void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    intptr_t i, oprsz_2 = oprsz / 2;

    if (oprsz <= 8) {
        uint64_t l = *(uint64_t *)vn;
        l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
        *(uint64_t *)vd = l;
    } else if ((oprsz & 15) == 0) {
        for (i = 0; i < oprsz_2; i += 8) {
            intptr_t ih = oprsz - 8 - i;
            uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
            uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
            *(uint64_t *)(vd + i) = h;
            *(uint64_t *)(vd + ih) = l;
        }
    } else {
        for (i = 0; i < oprsz_2; i += 1) {
            intptr_t il = H1(i);
            intptr_t ih = H1(oprsz - 1 - i);
            uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
            uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
            *(uint8_t *)(vd + il) = h;
            *(uint8_t *)(vd + ih) = l;
        }
    }
}
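
/* sve_rev_p reverses the order of the predicate element flags, swapping
 * chunks in from both ends: 64-bit chunks when the predicate size is a
 * multiple of 16 bytes, otherwise single bytes.
 */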

void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        uint64_t nn = *(uint64_t *)vn;
        int half = 4 * oprsz;

        nn = extract64(nn, high * half, half);
        nn = expand_bits(nn, 0);
        d[0] = nn;
    } else {
        ARMPredicateReg tmp_n;

        /* We produce output faster than we consume input.
           Therefore we must be mindful of possible overlap.  */
        if ((vn - vd) < (uintptr_t)oprsz) {
            vn = memcpy(&tmp_n, vn, oprsz);
        }
        if (high) {
            high = oprsz >> 1;
        }

        if ((high & 3) == 0) {
            uint32_t *n = vn;
            high >>= 2;

            for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
                uint64_t nn = n[H4(high + i)];
                d[i] = expand_bits(nn, 0);
            }
        } else {
            uint16_t *d16 = vd;
            uint8_t *n = vn;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                d16[H2(i)] = expand_bits(nn, 0);
            }
        }
    }
}
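
/* PUNPKLO/PUNPKHI widen each element flag of the selected half to cover
 * an element of twice the size.  E.g. with high == 0 and oprsz <= 8,
 * source bits 0b1101 expand to 0b01010001 (0x51): each flag moves from
 * bit i to bit 2i with a zero inserted above it.
 */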

#define DO_ZIP(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)           \
{                                                                        \
    intptr_t oprsz = simd_oprsz(desc);                                   \
    intptr_t i, oprsz_2 = oprsz / 2;                                     \
    ARMVectorReg tmp_n, tmp_m;                                           \
    /* We produce output faster than we consume input.                   \
       Therefore we must be mindful of possible overlap.  */             \
    if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                        \
        vn = memcpy(&tmp_n, vn, oprsz_2);                                \
    }                                                                    \
    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                        \
        vm = memcpy(&tmp_m, vm, oprsz_2);                                \
    }                                                                    \
    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                        \
        *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i));             \
        *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i));  \
    }                                                                    \
}

DO_ZIP(sve_zip_b, uint8_t, H1)
DO_ZIP(sve_zip_h, uint16_t, H1_2)
DO_ZIP(sve_zip_s, uint32_t, H1_4)
DO_ZIP(sve_zip_d, uint64_t, )
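
/* E.g. sve_zip_b interleaves the low halves of the two inputs:
 * Zn = { a0, a1, a2, ... } and Zm = { b0, b1, b2, ... } produce
 * Zd = { a0, b0, a1, b1, a2, b2, ... }.
 */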

#define DO_UZP(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)             \
{                                                                          \
    intptr_t oprsz = simd_oprsz(desc);                                     \
    intptr_t oprsz_2 = oprsz / 2;                                          \
    intptr_t odd_ofs = simd_data(desc);                                    \
    intptr_t i;                                                            \
    ARMVectorReg tmp_m;                                                    \
    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                          \
        vm = memcpy(&tmp_m, vm, oprsz);                                    \
    }                                                                      \
    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                          \
        *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs));         \
    }                                                                      \
    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                          \
        *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
    }                                                                      \
}

DO_UZP(sve_uzp_b, uint8_t, H1)
DO_UZP(sve_uzp_h, uint16_t, H1_2)
DO_UZP(sve_uzp_s, uint32_t, H1_4)
DO_UZP(sve_uzp_d, uint64_t, )
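
/* E.g. sve_uzp_b with odd_ofs == 0 gathers the even-numbered elements of
 * Zn into the low half of Zd and those of Zm into the high half;
 * odd_ofs == sizeof(TYPE) gathers the odd-numbered elements instead.
 */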

#define DO_TRN(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)           \
{                                                                        \
    intptr_t oprsz = simd_oprsz(desc);                                   \
    intptr_t odd_ofs = simd_data(desc);                                  \
    intptr_t i;                                                          \
    for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                      \
        TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                        \
        TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                        \
        *(TYPE *)(vd + H(i + 0)) = ae;                                   \
        *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                        \
    }                                                                    \
}

DO_TRN(sve_trn_b, uint8_t, H1)
DO_TRN(sve_trn_h, uint16_t, H1_2)
DO_TRN(sve_trn_s, uint32_t, H1_4)
DO_TRN(sve_trn_d, uint64_t, )
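
/* E.g. sve_trn_s with odd_ofs == 0 produces Zd = { n0, m0, n2, m2, ... },
 * and with odd_ofs == sizeof(TYPE) produces Zd = { n1, m1, n3, m3, ... }.
 */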

#undef DO_ZIP
#undef DO_UZP
#undef DO_TRN

void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = j = 0; i < opr_sz; i++) {
        if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
            d[H4(j)] = n[H4(i)];
            j++;
        }
    }
    for (; j < opr_sz; j++) {
        d[H4(j)] = 0;
    }
}

void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = j = 0; i < opr_sz; i++) {
        if (pg[H1(i)] & 1) {
            d[j] = n[i];
            j++;
        }
    }
    for (; j < opr_sz; j++) {
        d[j] = 0;
    }
}
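
/* E.g. for sve_compact_d with a 256-bit vector, n = { 10, 20, 30, 40 }
 * and a governing predicate selecting elements 1 and 3, the active
 * elements are packed toward the low end and the remainder zeroed:
 * d = { 20, 40, 0, 0 }.
 */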

/* Similar to the ARM LastActiveElement pseudocode function, except the
 * result is multiplied by the element size.  This includes the not found
 * indication; e.g. not found for esz=3 is -8.
 */
int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);

    return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
}
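
/* E.g. for word elements (esz == 2), a last active element index of 3
 * yields 3 << 2 == 12, while no active elements yields -(1 << 2) == -4.
 */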

void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc) / 8;
    int esz = simd_data(desc);
    uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
    intptr_t i, first_i, last_i;
    ARMVectorReg tmp;

    first_i = last_i = 0;
    first_g = last_g = 0;

    /* Find the extent of the active elements within VG.  */
    for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
        pg = *(uint64_t *)(vg + i) & mask;
        if (pg) {
            if (last_g == 0) {
                last_g = pg;
                last_i = i;
            }
            first_g = pg;
            first_i = i;
        }
    }

    len = 0;
    if (first_g != 0) {
        first_i = first_i * 8 + ctz64(first_g);
        last_i = last_i * 8 + 63 - clz64(last_g);
        len = last_i - first_i + (1 << esz);
        if (vd == vm) {
            vm = memcpy(&tmp, vm, opr_sz * 8);
        }
        swap_memmove(vd, vn + first_i, len);
    }
    swap_memmove(vd + len, vm, opr_sz * 8 - len);
}
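
/* SPLICE copies the active segment of Zn, from the first through the
 * last active element, to the low end of Zd and fills the remainder
 * from the start of Zm.  E.g. with word elements and a 128-bit vector,
 * Zn = { 1, 2, 3, 4 }, Zm = { 5, 6, 7, 8 } and a predicate selecting
 * elements 1-2 give Zd = { 2, 3, 5, 6 }.
 */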

void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i], mm = m[i];
        uint64_t pp = expand_pred_b(pg[H1(i)]);
        d[i] = (nn & pp) | (mm & ~pp);
    }
}
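
/* Each predicate byte is expanded to a 64-bit mask of 0xff per active
 * byte element, so eight lanes are selected with a single branch-free
 * bitwise mux; e.g. pg[0] = 0b00000101 takes bytes 0 and 2 of the first
 * doubleword from Zn and the rest from Zm.
 */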

void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i], mm = m[i];
        uint64_t pp = expand_pred_h(pg[H1(i)]);
        d[i] = (nn & pp) | (mm & ~pp);
    }
}

void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i], mm = m[i];
        uint64_t pp = expand_pred_s(pg[H1(i)]);
        d[i] = (nn & pp) | (mm & ~pp);
    }
}

void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i], mm = m[i];
        d[i] = (pg[H1(i)] & 1 ? nn : mm);
    }
}