target/arm: Implement SVE Integer Compare - Vectors Group
[qemu.git] / target/arm/sve_helper.c
blob d11f591661e0feda7c225b088c4546ea0952526d
1 /*
2 * ARM SVE Operations
4 * Copyright (c) 2018 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/exec-all.h"
23 #include "exec/cpu_ldst.h"
24 #include "exec/helper-proto.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "fpu/softfloat.h"
29 /* Note that vector data is stored in host-endian 64-bit chunks,
30 so addressing units smaller than that needs a host-endian fixup. */
31 #ifdef HOST_WORDS_BIGENDIAN
32 #define H1(x) ((x) ^ 7)
33 #define H1_2(x) ((x) ^ 6)
34 #define H1_4(x) ((x) ^ 4)
35 #define H2(x) ((x) ^ 3)
36 #define H4(x) ((x) ^ 1)
37 #else
38 #define H1(x) (x)
39 #define H1_2(x) (x)
40 #define H1_4(x) (x)
41 #define H2(x) (x)
42 #define H4(x) (x)
43 #endif
45 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
47 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
48 * and bit 0 set if C is set. Compare the definitions of these variables
49 * within CPUARMState.
52 /* For no G bits set, NZCV = C. */
53 #define PREDTEST_INIT 1
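/* A worked example of the encoding above: for a single predicate word with
 * g = 0x0f (four active elements) and d = 0x05, the first active bit of D
 * is set so N = 1 (bit 31), some active bit of D is set so Z is clear
 * (bit 1 set), and the last active bit of D is clear so C = 1 (bit 0 set).
 * Bit 2 of the flags word is internal bookkeeping only, recording that an
 * active G bit has already been seen.
 */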
55 /* This is an iterative function, called for each Pd and Pg word
56 * moving forward.
58 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
60 if (likely(g)) {
61 /* Compute N from first D & G.
62 Use bit 2 to signal first G bit seen. */
63 if (!(flags & 4)) {
64 flags |= ((d & (g & -g)) != 0) << 31;
65 flags |= 4;
68 /* Accumulate Z from each D & G. */
69 flags |= ((d & g) != 0) << 1;
71 /* Compute C from last !(D & G). Replace previous. */
72 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
74 return flags;
77 /* This is an iterative function, called for each Pd and Pg word
78 * moving backward.
80 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
82 if (likely(g)) {
83 /* Compute C from first (i.e last) !(D & G).
84 Use bit 2 to signal first G bit seen. */
85 if (!(flags & 4)) {
86 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
87 flags |= (d & pow2floor(g)) == 0;
90 /* Accumulate Z from each D & G. */
91 flags |= ((d & g) != 0) << 1;
93 /* Compute N from last (i.e first) D & G. Replace previous. */
94 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
96 return flags;
99 /* The same for a single word predicate. */
100 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
102 return iter_predtest_fwd(d, g, PREDTEST_INIT);
105 /* The same for a multi-word predicate. */
106 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
108 uint32_t flags = PREDTEST_INIT;
109 uint64_t *d = vd, *g = vg;
110 uintptr_t i = 0;
112 do {
113 flags = iter_predtest_fwd(d[i], g[i], flags);
114 } while (++i < words);
116 return flags;
119 /* Expand active predicate bits to bytes, for byte elements.
120 * for (i = 0; i < 256; ++i) {
121 * unsigned long m = 0;
122 * for (j = 0; j < 8; j++) {
123 * if ((i >> j) & 1) {
124 * m |= 0xfful << (j << 3);
127 * printf("0x%016lx,\n", m);
130 static inline uint64_t expand_pred_b(uint8_t byte)
132 static const uint64_t word[256] = {
133 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
134 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
135 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
136 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
137 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
138 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
139 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
140 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
141 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
142 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
143 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
144 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
145 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
146 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
147 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
148 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
149 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
150 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
151 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
152 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
153 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
154 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
155 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
156 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
157 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
158 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
159 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
160 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
161 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
162 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
163 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
164 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
165 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
166 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
167 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
168 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
169 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
170 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
171 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
172 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
173 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
174 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
175 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
176 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
177 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
178 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
179 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
180 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
181 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
182 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
183 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
184 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
185 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
186 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
187 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
188 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
189 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
190 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
191 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
192 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
193 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
194 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
195 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
196 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
197 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
198 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
199 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
200 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
201 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
202 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
203 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
204 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
205 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
206 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
207 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
208 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
209 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
210 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
211 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
212 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
213 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
214 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
215 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
216 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
217 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
218 0xffffffffffffffff,
220 return word[byte];
223 /* Similarly for half-word elements.
224 * for (i = 0; i < 256; ++i) {
225 * unsigned long m = 0;
226 * if (i & 0xaa) {
227 * continue;
229 * for (j = 0; j < 8; j += 2) {
230 * if ((i >> j) & 1) {
231 * m |= 0xfffful << (j << 3);
234 * printf("[0x%x] = 0x%016lx,\n", i, m);
237 static inline uint64_t expand_pred_h(uint8_t byte)
239 static const uint64_t word[] = {
240 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
241 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
242 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
243 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
244 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
245 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
246 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
247 [0x55] = 0xffffffffffffffff,
249 return word[byte & 0x55];
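/* Example: for halfword elements only every other predicate bit is
 * significant, hence the "& 0x55" above.  A predicate byte of 0x17
 * reduces to 0x15, i.e. elements 0, 1 and 2 active, giving the mask
 * 0x0000ffffffffffff from the table.
 */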
252 /* Similarly for single word elements. */
253 static inline uint64_t expand_pred_s(uint8_t byte)
255 static const uint64_t word[] = {
256 [0x01] = 0x00000000ffffffffull,
257 [0x10] = 0xffffffff00000000ull,
258 [0x11] = 0xffffffffffffffffull,
260 return word[byte & 0x11];
263 /* Swap 16-bit words within a 32-bit word. */
264 static inline uint32_t hswap32(uint32_t h)
266 return rol32(h, 16);
269 /* Swap 16-bit words within a 64-bit word. */
270 static inline uint64_t hswap64(uint64_t h)
272 uint64_t m = 0x0000ffff0000ffffull;
273 h = rol64(h, 32);
274 return ((h & m) << 16) | ((h >> 16) & m);
277 /* Swap 32-bit words within a 64-bit word. */
278 static inline uint64_t wswap64(uint64_t h)
280 return rol64(h, 32);
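/* E.g. hswap64(0x0011223344556677) == 0x6677445522330011: the four 16-bit
 * units are reversed in order, while wswap64 merely swaps the two 32-bit
 * halves of its operand.
 */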
283 #define LOGICAL_PPPP(NAME, FUNC) \
284 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
286 uintptr_t opr_sz = simd_oprsz(desc); \
287 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
288 uintptr_t i; \
289 for (i = 0; i < opr_sz / 8; ++i) { \
290 d[i] = FUNC(n[i], m[i], g[i]); \
294 #define DO_AND(N, M, G) (((N) & (M)) & (G))
295 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
296 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
297 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
298 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
299 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
300 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
301 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
303 LOGICAL_PPPP(sve_and_pppp, DO_AND)
304 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
305 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
306 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
307 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
308 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
309 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
310 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
312 #undef DO_AND
313 #undef DO_BIC
314 #undef DO_EOR
315 #undef DO_ORR
316 #undef DO_ORN
317 #undef DO_NOR
318 #undef DO_NAND
319 #undef DO_SEL
320 #undef LOGICAL_PPPP
322 /* Fully general three-operand expander, controlled by a predicate.
323 * This is complicated by the host-endian storage of the register file.
325 /* ??? I don't expect the compiler could ever vectorize this itself.
326 * With some tables we can convert bit masks to byte masks, and with
327 * extra care wrt byte/word ordering we could use gcc generic vectors
328 * and do 16 bytes at a time.
330 #define DO_ZPZZ(NAME, TYPE, H, OP) \
331 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
333 intptr_t i, opr_sz = simd_oprsz(desc); \
334 for (i = 0; i < opr_sz; ) { \
335 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
336 do { \
337 if (pg & 1) { \
338 TYPE nn = *(TYPE *)(vn + H(i)); \
339 TYPE mm = *(TYPE *)(vm + H(i)); \
340 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
342 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
343 } while (i & 15); \
347 /* Similarly, specialized for 64-bit operands. */
348 #define DO_ZPZZ_D(NAME, TYPE, OP) \
349 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
351 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
352 TYPE *d = vd, *n = vn, *m = vm; \
353 uint8_t *pg = vg; \
354 for (i = 0; i < opr_sz; i += 1) { \
355 if (pg[H1(i)] & 1) { \
356 TYPE nn = n[i], mm = m[i]; \
357 d[i] = OP(nn, mm); \
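/* In the expanders above the governing predicate holds one bit per byte of
 * vector data, so stepping with "i += sizeof(TYPE), pg >>= sizeof(TYPE)"
 * tests only the bit that corresponds to the first byte of each element:
 * e.g. for 4-byte elements, bits 0, 4, 8 and 12 of each 16-bit predicate
 * chunk are consulted and the bits in between are ignored.
 */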
362 #define DO_AND(N, M) (N & M)
363 #define DO_EOR(N, M) (N ^ M)
364 #define DO_ORR(N, M) (N | M)
365 #define DO_BIC(N, M) (N & ~M)
366 #define DO_ADD(N, M) (N + M)
367 #define DO_SUB(N, M) (N - M)
368 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
369 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
370 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
371 #define DO_MUL(N, M) (N * M)
372 #define DO_DIV(N, M) (M ? N / M : 0)
374 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
375 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
376 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
377 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
379 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
380 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
381 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
382 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
384 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
385 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
386 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
387 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
389 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
390 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
391 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
392 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
394 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
395 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
396 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
397 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
399 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
400 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
401 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
402 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
404 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
405 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
406 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
407 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
409 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
410 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
411 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
412 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
414 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
415 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
416 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
417 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
419 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
420 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
421 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
422 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
424 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
425 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
426 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
427 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
429 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
430 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
431 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
432 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
434 /* Because the computation type is at least twice as large as required,
435 these work for both signed and unsigned source types. */
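/* For instance, do_mulh_b() below called with signed bytes -2 and 3
 * computes -6 in 32-bit arithmetic and returns 0xff (the high byte of the
 * 16-bit product 0xfffa), while the same routine called with unsigned
 * bytes 254 and 3 computes 762 and returns 0x02 -- both correct, because
 * the 32-bit intermediate cannot overflow for 8-bit inputs of either sign.
 */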
436 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
438 return (n * m) >> 8;
441 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
443 return (n * m) >> 16;
446 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
448 return (n * m) >> 32;
451 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
453 uint64_t lo, hi;
454 muls64(&lo, &hi, n, m);
455 return hi;
458 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
460 uint64_t lo, hi;
461 mulu64(&lo, &hi, n, m);
462 return hi;
465 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
466 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
467 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
468 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
470 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
471 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
472 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
473 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
475 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
476 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
477 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
478 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
480 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_DIV)
481 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_DIV)
483 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_DIV)
484 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_DIV)
486 /* Note that all bits of the shift are significant
487 and not modulo the element size. */
488 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
489 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
490 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
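/* E.g. for an 8-bit element a shift count of 8 or more yields 0 for
 * LSR/LSL, while ASR clamps the count to 7 so the result is 0 or 0xff
 * according to the sign bit -- the count comes from the whole (possibly
 * wider) element and is not reduced modulo the element size.
 */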
492 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
493 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
494 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
496 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
497 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
498 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
500 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
501 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
502 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
504 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
505 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
506 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
508 #undef DO_ZPZZ
509 #undef DO_ZPZZ_D
511 /* Three-operand expander, controlled by a predicate, in which the
512 * third operand is "wide". That is, for D = N op M, the same 64-bit
513 * value of M is used with all of the narrower values of N.
515 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
516 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
518 intptr_t i, opr_sz = simd_oprsz(desc); \
519 for (i = 0; i < opr_sz; ) { \
520 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
521 TYPEW mm = *(TYPEW *)(vm + i); \
522 do { \
523 if (pg & 1) { \
524 TYPE nn = *(TYPE *)(vn + H(i)); \
525 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
527 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
528 } while (i & 7); \
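/* The "while (i & 7)" bound above means each pass of the outer loop covers
 * exactly one 64-bit block: for example with 16-bit elements, the four
 * elements of Zn in a given block are all shifted by the single 64-bit
 * value occupying the corresponding block of Zm.
 */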
532 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
533 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
534 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
536 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
537 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
538 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
540 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
541 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
542 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
544 #undef DO_ZPZW
546 /* Fully general two-operand expander, controlled by a predicate.
548 #define DO_ZPZ(NAME, TYPE, H, OP) \
549 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
551 intptr_t i, opr_sz = simd_oprsz(desc); \
552 for (i = 0; i < opr_sz; ) { \
553 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
554 do { \
555 if (pg & 1) { \
556 TYPE nn = *(TYPE *)(vn + H(i)); \
557 *(TYPE *)(vd + H(i)) = OP(nn); \
559 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
560 } while (i & 15); \
564 /* Similarly, specialized for 64-bit operands. */
565 #define DO_ZPZ_D(NAME, TYPE, OP) \
566 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
568 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
569 TYPE *d = vd, *n = vn; \
570 uint8_t *pg = vg; \
571 for (i = 0; i < opr_sz; i += 1) { \
572 if (pg[H1(i)] & 1) { \
573 TYPE nn = n[i]; \
574 d[i] = OP(nn); \
579 #define DO_CLS_B(N) (clrsb32(N) - 24)
580 #define DO_CLS_H(N) (clrsb32(N) - 16)
582 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
583 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
584 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
585 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
587 #define DO_CLZ_B(N) (clz32(N) - 24)
588 #define DO_CLZ_H(N) (clz32(N) - 16)
590 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
591 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
592 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
593 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
595 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
596 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
597 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
598 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
600 #define DO_CNOT(N) (N == 0)
602 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
603 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
604 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
605 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
607 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
609 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
610 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
611 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
613 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
615 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
616 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
617 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
619 #define DO_NOT(N) (~N)
621 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
622 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
623 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
624 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
626 #define DO_SXTB(N) ((int8_t)N)
627 #define DO_SXTH(N) ((int16_t)N)
628 #define DO_SXTS(N) ((int32_t)N)
629 #define DO_UXTB(N) ((uint8_t)N)
630 #define DO_UXTH(N) ((uint16_t)N)
631 #define DO_UXTS(N) ((uint32_t)N)
633 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
634 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
635 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
636 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
637 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
638 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
640 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
641 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
642 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
643 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
644 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
645 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
647 #define DO_ABS(N) (N < 0 ? -N : N)
649 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
650 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
651 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
652 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
654 #define DO_NEG(N) (-N)
656 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
657 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
658 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
659 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
661 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
662 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
663 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
665 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
666 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
668 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
670 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
671 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
672 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
673 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
675 /* Three-operand expander, unpredicated, in which the third operand is "wide".
677 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
678 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
680 intptr_t i, opr_sz = simd_oprsz(desc); \
681 for (i = 0; i < opr_sz; ) { \
682 TYPEW mm = *(TYPEW *)(vm + i); \
683 do { \
684 TYPE nn = *(TYPE *)(vn + H(i)); \
685 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
686 i += sizeof(TYPE); \
687 } while (i & 7); \
691 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
692 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
693 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
695 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
696 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
697 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
699 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
700 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
701 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
703 #undef DO_ZZW
705 #undef DO_CLS_B
706 #undef DO_CLS_H
707 #undef DO_CLZ_B
708 #undef DO_CLZ_H
709 #undef DO_CNOT
710 #undef DO_FABS
711 #undef DO_FNEG
712 #undef DO_ABS
713 #undef DO_NEG
714 #undef DO_ZPZ
715 #undef DO_ZPZ_D
717 /* Two-operand reduction expander, controlled by a predicate.
718 * The difference between TYPERED and TYPERET has to do with
719 * sign-extension. E.g. for SMAX, TYPERED must be signed,
720 * but TYPERET must be unsigned so that e.g. a 32-bit value
721 * is not sign-extended to the ABI uint64_t return type.
723 /* ??? If we were to vectorize this by hand the reduction ordering
724 * would change. For integer operands, this is perfectly fine.
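/* As an example of the TYPERED/TYPERET distinction: sve_smaxv_s uses
 * TYPERED = int32_t so the comparisons are signed, but TYPERET = uint32_t
 * so a maximum of -1 is returned to the caller as 0x00000000ffffffff
 * rather than being sign-extended across the full 64-bit return value.
 */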
726 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
727 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
729 intptr_t i, opr_sz = simd_oprsz(desc); \
730 TYPERED ret = INIT; \
731 for (i = 0; i < opr_sz; ) { \
732 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
733 do { \
734 if (pg & 1) { \
735 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
736 ret = OP(ret, nn); \
738 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
739 } while (i & 15); \
741 return (TYPERET)ret; \
744 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
745 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
747 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
748 TYPEE *n = vn; \
749 uint8_t *pg = vg; \
750 TYPER ret = INIT; \
751 for (i = 0; i < opr_sz; i += 1) { \
752 if (pg[H1(i)] & 1) { \
753 TYPEE nn = n[i]; \
754 ret = OP(ret, nn); \
757 return ret; \
760 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
761 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
762 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
763 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
765 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
766 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
767 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
768 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
770 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
771 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
772 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
773 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
775 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
776 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
777 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
779 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
780 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
781 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
782 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
784 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
785 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
786 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
787 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
789 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
790 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
791 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
792 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
794 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
795 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
796 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
797 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
799 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
800 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
801 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
802 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
804 #undef DO_VPZ
805 #undef DO_VPZ_D
807 #undef DO_AND
808 #undef DO_ORR
809 #undef DO_EOR
810 #undef DO_BIC
811 #undef DO_ADD
812 #undef DO_SUB
813 #undef DO_MAX
814 #undef DO_MIN
815 #undef DO_ABD
816 #undef DO_MUL
817 #undef DO_DIV
818 #undef DO_ASR
819 #undef DO_LSR
820 #undef DO_LSL
822 /* Similar to the ARM LastActiveElement pseudocode function, except the
823 result is multiplied by the element size. This includes the not found
824 indication; e.g. not found for esz=3 is -8. */
825 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
827 uint64_t mask = pred_esz_masks[esz];
828 intptr_t i = words;
830 do {
831 uint64_t this_g = g[--i] & mask;
832 if (this_g) {
833 return i * 64 + (63 - clz64(this_g));
835 } while (i > 0);
836 return (intptr_t)-1 << esz;
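/* E.g. with esz == 2 (word elements), element k is governed by predicate
 * bit 4 * k, so if the highest active element is element 3 the function
 * returns 12; when no bits are set at all it returns -4, the scaled
 * "not found" value.
 */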
839 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
841 uint32_t flags = PREDTEST_INIT;
842 uint64_t *d = vd, *g = vg;
843 intptr_t i = 0;
845 do {
846 uint64_t this_d = d[i];
847 uint64_t this_g = g[i];
849 if (this_g) {
850 if (!(flags & 4)) {
851 /* Set in D the first bit of G. */
852 this_d |= this_g & -this_g;
853 d[i] = this_d;
855 flags = iter_predtest_fwd(this_d, this_g, flags);
857 } while (++i < words);
859 return flags;
862 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
864 intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
865 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
866 uint32_t flags = PREDTEST_INIT;
867 uint64_t *d = vd, *g = vg, esz_mask;
868 intptr_t i, next;
870 next = last_active_element(vd, words, esz) + (1 << esz);
871 esz_mask = pred_esz_masks[esz];
873 /* Similar to the pseudocode for pnext, but scaled by ESZ
874 so that we find the correct bit. */
875 if (next < words * 64) {
876 uint64_t mask = -1;
878 if (next & 63) {
879 mask = ~((1ull << (next & 63)) - 1);
880 next &= -64;
882 do {
883 uint64_t this_g = g[next / 64] & esz_mask & mask;
884 if (this_g != 0) {
885 next = (next & -64) + ctz64(this_g);
886 break;
888 next += 64;
889 mask = -1;
890 } while (next < words * 64);
893 i = 0;
894 do {
895 uint64_t this_d = 0;
896 if (i == next / 64) {
897 this_d = 1ull << (next & 63);
899 d[i] = this_d;
900 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
901 } while (++i < words);
903 return flags;
906 /* Store zero into every active element of Zd. We will use this for two
907 * and three-operand predicated instructions for which logic dictates a
908 * zero result. In particular, logical shift by element size, which is
909 * otherwise undefined on the host.
911 * For element sizes smaller than uint64_t, we use tables to expand
912 * the N bits of the controlling predicate to a byte mask, and clear
913 * those bytes.
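/* For example, with word elements a predicate byte of 0x11 expands to the
 * mask 0xffffffffffffffff and both 32-bit elements of that chunk are
 * cleared, while 0x01 expands to 0x00000000ffffffff and only the low
 * element is zeroed.
 */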
915 void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
917 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
918 uint64_t *d = vd;
919 uint8_t *pg = vg;
920 for (i = 0; i < opr_sz; i += 1) {
921 d[i] &= ~expand_pred_b(pg[H1(i)]);
925 void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
927 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
928 uint64_t *d = vd;
929 uint8_t *pg = vg;
930 for (i = 0; i < opr_sz; i += 1) {
931 d[i] &= ~expand_pred_h(pg[H1(i)]);
935 void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
937 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
938 uint64_t *d = vd;
939 uint8_t *pg = vg;
940 for (i = 0; i < opr_sz; i += 1) {
941 d[i] &= ~expand_pred_s(pg[H1(i)]);
945 void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
947 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
948 uint64_t *d = vd;
949 uint8_t *pg = vg;
950 for (i = 0; i < opr_sz; i += 1) {
951 if (pg[H1(i)] & 1) {
952 d[i] = 0;
957 /* Three-operand expander, immediate operand, controlled by a predicate.
959 #define DO_ZPZI(NAME, TYPE, H, OP) \
960 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
962 intptr_t i, opr_sz = simd_oprsz(desc); \
963 TYPE imm = simd_data(desc); \
964 for (i = 0; i < opr_sz; ) { \
965 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
966 do { \
967 if (pg & 1) { \
968 TYPE nn = *(TYPE *)(vn + H(i)); \
969 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
971 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
972 } while (i & 15); \
976 /* Similarly, specialized for 64-bit operands. */
977 #define DO_ZPZI_D(NAME, TYPE, OP) \
978 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
980 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
981 TYPE *d = vd, *n = vn; \
982 TYPE imm = simd_data(desc); \
983 uint8_t *pg = vg; \
984 for (i = 0; i < opr_sz; i += 1) { \
985 if (pg[H1(i)] & 1) { \
986 TYPE nn = n[i]; \
987 d[i] = OP(nn, imm); \
992 #define DO_SHR(N, M) (N >> M)
993 #define DO_SHL(N, M) (N << M)
995 /* Arithmetic shift right for division. This rounds negative numbers
996 toward zero as per signed division. Therefore before shifting,
997 when N is negative, add 2**M-1. */
998 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
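/* E.g. DO_ASRD(-7, 2) adds 3 before shifting, giving -4 >> 2 = -1, which
 * is -7 / 4 truncated toward zero; a plain arithmetic shift would give
 * the floor, -2.
 */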
1000 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1001 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1002 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1003 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1005 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1006 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1007 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1008 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1010 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1011 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1012 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1013 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1015 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1016 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1017 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1018 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1020 #undef DO_SHR
1021 #undef DO_SHL
1022 #undef DO_ASRD
1023 #undef DO_ZPZI
1024 #undef DO_ZPZI_D
1026 /* Fully general four-operand expander, controlled by a predicate.
1028 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
1029 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1030 void *vg, uint32_t desc) \
1032 intptr_t i, opr_sz = simd_oprsz(desc); \
1033 for (i = 0; i < opr_sz; ) { \
1034 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1035 do { \
1036 if (pg & 1) { \
1037 TYPE nn = *(TYPE *)(vn + H(i)); \
1038 TYPE mm = *(TYPE *)(vm + H(i)); \
1039 TYPE aa = *(TYPE *)(va + H(i)); \
1040 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1042 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1043 } while (i & 15); \
1047 /* Similarly, specialized for 64-bit operands. */
1048 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
1049 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1050 void *vg, uint32_t desc) \
1052 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1053 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1054 uint8_t *pg = vg; \
1055 for (i = 0; i < opr_sz; i += 1) { \
1056 if (pg[H1(i)] & 1) { \
1057 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1058 d[i] = OP(aa, nn, mm); \
1063 #define DO_MLA(A, N, M) (A + N * M)
1064 #define DO_MLS(A, N, M) (A - N * M)
1066 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1067 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1069 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1070 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1072 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1073 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1075 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1076 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1078 #undef DO_MLA
1079 #undef DO_MLS
1080 #undef DO_ZPZZZ
1081 #undef DO_ZPZZZ_D
1083 void HELPER(sve_index_b)(void *vd, uint32_t start,
1084 uint32_t incr, uint32_t desc)
1086 intptr_t i, opr_sz = simd_oprsz(desc);
1087 uint8_t *d = vd;
1088 for (i = 0; i < opr_sz; i += 1) {
1089 d[H1(i)] = start + i * incr;
1093 void HELPER(sve_index_h)(void *vd, uint32_t start,
1094 uint32_t incr, uint32_t desc)
1096 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1097 uint16_t *d = vd;
1098 for (i = 0; i < opr_sz; i += 1) {
1099 d[H2(i)] = start + i * incr;
1103 void HELPER(sve_index_s)(void *vd, uint32_t start,
1104 uint32_t incr, uint32_t desc)
1106 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1107 uint32_t *d = vd;
1108 for (i = 0; i < opr_sz; i += 1) {
1109 d[H4(i)] = start + i * incr;
1113 void HELPER(sve_index_d)(void *vd, uint64_t start,
1114 uint64_t incr, uint32_t desc)
1116 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1117 uint64_t *d = vd;
1118 for (i = 0; i < opr_sz; i += 1) {
1119 d[i] = start + i * incr;
1123 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1125 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1126 uint32_t sh = simd_data(desc);
1127 uint32_t *d = vd, *n = vn, *m = vm;
1128 for (i = 0; i < opr_sz; i += 1) {
1129 d[i] = n[i] + (m[i] << sh);
1133 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1135 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1136 uint64_t sh = simd_data(desc);
1137 uint64_t *d = vd, *n = vn, *m = vm;
1138 for (i = 0; i < opr_sz; i += 1) {
1139 d[i] = n[i] + (m[i] << sh);
1143 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1145 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1146 uint64_t sh = simd_data(desc);
1147 uint64_t *d = vd, *n = vn, *m = vm;
1148 for (i = 0; i < opr_sz; i += 1) {
1149 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1153 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1155 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1156 uint64_t sh = simd_data(desc);
1157 uint64_t *d = vd, *n = vn, *m = vm;
1158 for (i = 0; i < opr_sz; i += 1) {
1159 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1163 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1165 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1166 static const uint16_t coeff[] = {
1167 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1168 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1169 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1170 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1172 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1173 uint16_t *d = vd, *n = vn;
1175 for (i = 0; i < opr_sz; i++) {
1176 uint16_t nn = n[i];
1177 intptr_t idx = extract32(nn, 0, 5);
1178 uint16_t exp = extract32(nn, 5, 5);
1179 d[i] = coeff[idx] | (exp << 10);
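/* As a sanity check on the table above: entry i holds, to 10 bits, the
 * fractional part of 2^(i/32), e.g. coeff[16] == 0x1a8 == 424, roughly
 * (sqrt(2) - 1) * 1024, so combining it with the exponent field builds a
 * float16 whose significand approximates 2^(idx/32).
 */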
1183 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1185 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1186 static const uint32_t coeff[] = {
1187 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1188 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1189 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1190 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1191 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1192 0x1ef532, 0x20b051, 0x227043, 0x243516,
1193 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1194 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1195 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1196 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1197 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1198 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1199 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1200 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1201 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1202 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1204 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1205 uint32_t *d = vd, *n = vn;
1207 for (i = 0; i < opr_sz; i++) {
1208 uint32_t nn = n[i];
1209 intptr_t idx = extract32(nn, 0, 6);
1210 uint32_t exp = extract32(nn, 6, 8);
1211 d[i] = coeff[idx] | (exp << 23);
1215 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1217 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1218 static const uint64_t coeff[] = {
1219 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1220 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1221 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1222 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1223 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1224 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1225 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1226 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1227 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1228 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1229 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1230 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1231 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1232 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1233 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1234 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1235 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1236 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1237 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1238 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1239 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1240 0xFA7C1819E90D8ull,
1242 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1243 uint64_t *d = vd, *n = vn;
1245 for (i = 0; i < opr_sz; i++) {
1246 uint64_t nn = n[i];
1247 intptr_t idx = extract32(nn, 0, 6);
1248 uint64_t exp = extract32(nn, 6, 11);
1249 d[i] = coeff[idx] | (exp << 52);
1253 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1255 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1256 uint16_t *d = vd, *n = vn, *m = vm;
1257 for (i = 0; i < opr_sz; i += 1) {
1258 uint16_t nn = n[i];
1259 uint16_t mm = m[i];
1260 if (mm & 1) {
1261 nn = float16_one;
1263 d[i] = nn ^ (mm & 2) << 14;
1267 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1269 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1270 uint32_t *d = vd, *n = vn, *m = vm;
1271 for (i = 0; i < opr_sz; i += 1) {
1272 uint32_t nn = n[i];
1273 uint32_t mm = m[i];
1274 if (mm & 1) {
1275 nn = float32_one;
1277 d[i] = nn ^ (mm & 2) << 30;
1281 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1283 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1284 uint64_t *d = vd, *n = vn, *m = vm;
1285 for (i = 0; i < opr_sz; i += 1) {
1286 uint64_t nn = n[i];
1287 uint64_t mm = m[i];
1288 if (mm & 1) {
1289 nn = float64_one;
1291 d[i] = nn ^ (mm & 2) << 62;
1296 * Signed saturating addition with scalar operand.
1299 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1301 intptr_t i, oprsz = simd_oprsz(desc);
1303 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1304 int r = *(int8_t *)(a + i) + b;
1305 if (r > INT8_MAX) {
1306 r = INT8_MAX;
1307 } else if (r < INT8_MIN) {
1308 r = INT8_MIN;
1310 *(int8_t *)(d + i) = r;
1314 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1316 intptr_t i, oprsz = simd_oprsz(desc);
1318 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1319 int r = *(int16_t *)(a + i) + b;
1320 if (r > INT16_MAX) {
1321 r = INT16_MAX;
1322 } else if (r < INT16_MIN) {
1323 r = INT16_MIN;
1325 *(int16_t *)(d + i) = r;
1329 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1331 intptr_t i, oprsz = simd_oprsz(desc);
1333 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1334 int64_t r = *(int32_t *)(a + i) + b;
1335 if (r > INT32_MAX) {
1336 r = INT32_MAX;
1337 } else if (r < INT32_MIN) {
1338 r = INT32_MIN;
1340 *(int32_t *)(d + i) = r;
1344 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1346 intptr_t i, oprsz = simd_oprsz(desc);
1348 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1349 int64_t ai = *(int64_t *)(a + i);
1350 int64_t r = ai + b;
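/* The test below is the classic two's-complement check: overflow can only
 * occur when ai and b share a sign (~(ai ^ b) has its top bit set) and
 * the sum's sign differs from ai (r ^ ai has its top bit set).  E.g.
 * ai == INT64_MAX, b == 1 wraps r negative and saturates to INT64_MAX.
 */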
1351 if (((r ^ ai) & ~(ai ^ b)) < 0) {
1352 /* Signed overflow. */
1353 r = (r < 0 ? INT64_MAX : INT64_MIN);
1355 *(int64_t *)(d + i) = r;
1360 * Unsigned saturating addition with scalar operand.
1363 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1365 intptr_t i, oprsz = simd_oprsz(desc);
1367 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1368 int r = *(uint8_t *)(a + i) + b;
1369 if (r > UINT8_MAX) {
1370 r = UINT8_MAX;
1371 } else if (r < 0) {
1372 r = 0;
1374 *(uint8_t *)(d + i) = r;
1378 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1380 intptr_t i, oprsz = simd_oprsz(desc);
1382 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1383 int r = *(uint16_t *)(a + i) + b;
1384 if (r > UINT16_MAX) {
1385 r = UINT16_MAX;
1386 } else if (r < 0) {
1387 r = 0;
1389 *(uint16_t *)(d + i) = r;
1393 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1395 intptr_t i, oprsz = simd_oprsz(desc);
1397 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1398 int64_t r = *(uint32_t *)(a + i) + b;
1399 if (r > UINT32_MAX) {
1400 r = UINT32_MAX;
1401 } else if (r < 0) {
1402 r = 0;
1404 *(uint32_t *)(d + i) = r;
1408 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1410 intptr_t i, oprsz = simd_oprsz(desc);
1412 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1413 uint64_t r = *(uint64_t *)(a + i) + b;
1414 if (r < b) {
1415 r = UINT64_MAX;
1417 *(uint64_t *)(d + i) = r;
1421 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1423 intptr_t i, oprsz = simd_oprsz(desc);
1425 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1426 uint64_t ai = *(uint64_t *)(a + i);
1427 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1431 /* Two operand predicated copy immediate with merge. All valid immediates
1432 * can fit within 17 signed bits in the simd_data field.
1434 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1435 uint64_t mm, uint32_t desc)
1437 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1438 uint64_t *d = vd, *n = vn;
1439 uint8_t *pg = vg;
1441 mm = dup_const(MO_8, mm);
1442 for (i = 0; i < opr_sz; i += 1) {
1443 uint64_t nn = n[i];
1444 uint64_t pp = expand_pred_b(pg[H1(i)]);
1445 d[i] = (mm & pp) | (nn & ~pp);
1449 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1450 uint64_t mm, uint32_t desc)
1452 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1453 uint64_t *d = vd, *n = vn;
1454 uint8_t *pg = vg;
1456 mm = dup_const(MO_16, mm);
1457 for (i = 0; i < opr_sz; i += 1) {
1458 uint64_t nn = n[i];
1459 uint64_t pp = expand_pred_h(pg[H1(i)]);
1460 d[i] = (mm & pp) | (nn & ~pp);
1464 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1465 uint64_t mm, uint32_t desc)
1467 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1468 uint64_t *d = vd, *n = vn;
1469 uint8_t *pg = vg;
1471 mm = dup_const(MO_32, mm);
1472 for (i = 0; i < opr_sz; i += 1) {
1473 uint64_t nn = n[i];
1474 uint64_t pp = expand_pred_s(pg[H1(i)]);
1475 d[i] = (mm & pp) | (nn & ~pp);
1479 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1480 uint64_t mm, uint32_t desc)
1482 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1483 uint64_t *d = vd, *n = vn;
1484 uint8_t *pg = vg;
1486 for (i = 0; i < opr_sz; i += 1) {
1487 uint64_t nn = n[i];
1488 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1492 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1494 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1495 uint64_t *d = vd;
1496 uint8_t *pg = vg;
1498 val = dup_const(MO_8, val);
1499 for (i = 0; i < opr_sz; i += 1) {
1500 d[i] = val & expand_pred_b(pg[H1(i)]);
1504 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1506 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1507 uint64_t *d = vd;
1508 uint8_t *pg = vg;
1510 val = dup_const(MO_16, val);
1511 for (i = 0; i < opr_sz; i += 1) {
1512 d[i] = val & expand_pred_h(pg[H1(i)]);
1516 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1518 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1519 uint64_t *d = vd;
1520 uint8_t *pg = vg;
1522 val = dup_const(MO_32, val);
1523 for (i = 0; i < opr_sz; i += 1) {
1524 d[i] = val & expand_pred_s(pg[H1(i)]);
1528 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1530 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1531 uint64_t *d = vd;
1532 uint8_t *pg = vg;
1534 for (i = 0; i < opr_sz; i += 1) {
1535 d[i] = (pg[H1(i)] & 1 ? val : 0);
1539 /* Big-endian hosts need to frob the byte indices. If the copy
1540 * happens to be 8-byte aligned, then no frobbing necessary.
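/* Concretely: within each aligned 64-bit chunk, byte i of the SVE view
 * lives at host offset H1(i) = i ^ 7 on a big-endian host, so when the
 * source, destination and length are all multiples of 8 the chunks map
 * one-to-one and a plain memmove (case 0 below) suffices; otherwise the
 * copy is done in 4-, 2- or 1-byte units with the matching H1_4/H1_2/H1
 * fixups.
 */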
1542 static void swap_memmove(void *vd, void *vs, size_t n)
1544 uintptr_t d = (uintptr_t)vd;
1545 uintptr_t s = (uintptr_t)vs;
1546 uintptr_t o = (d | s | n) & 7;
1547 size_t i;
1549 #ifndef HOST_WORDS_BIGENDIAN
1550 o = 0;
1551 #endif
1552 switch (o) {
1553 case 0:
1554 memmove(vd, vs, n);
1555 break;
1557 case 4:
1558 if (d < s || d >= s + n) {
1559 for (i = 0; i < n; i += 4) {
1560 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1562 } else {
1563 for (i = n; i > 0; ) {
1564 i -= 4;
1565 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1568 break;
1570 case 2:
1571 case 6:
1572 if (d < s || d >= s + n) {
1573 for (i = 0; i < n; i += 2) {
1574 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1576 } else {
1577 for (i = n; i > 0; ) {
1578 i -= 2;
1579 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1582 break;
1584 default:
1585 if (d < s || d >= s + n) {
1586 for (i = 0; i < n; i++) {
1587 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1589 } else {
1590 for (i = n; i > 0; ) {
1591 i -= 1;
1592 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1595 break;
1599 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1601 intptr_t opr_sz = simd_oprsz(desc);
1602 size_t n_ofs = simd_data(desc);
1603 size_t n_siz = opr_sz - n_ofs;
1605 if (vd != vm) {
1606 swap_memmove(vd, vn + n_ofs, n_siz);
1607 swap_memmove(vd + n_siz, vm, n_ofs);
1608 } else if (vd != vn) {
1609 swap_memmove(vd + n_siz, vd, n_ofs);
1610 swap_memmove(vd, vn + n_ofs, n_siz);
1611 } else {
1612 /* vd == vn == vm. Need temp space. */
1613 ARMVectorReg tmp;
1614 swap_memmove(&tmp, vm, n_ofs);
1615 swap_memmove(vd, vd + n_ofs, n_siz);
1616 memcpy(vd + n_siz, &tmp, n_ofs);
1620 #define DO_INSR(NAME, TYPE, H) \
1621 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1623 intptr_t opr_sz = simd_oprsz(desc); \
1624 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1625 *(TYPE *)(vd + H(0)) = val; \
1628 DO_INSR(sve_insr_b, uint8_t, H1)
1629 DO_INSR(sve_insr_h, uint16_t, H1_2)
1630 DO_INSR(sve_insr_s, uint32_t, H1_4)
1631 DO_INSR(sve_insr_d, uint64_t, )
1633 #undef DO_INSR
1635 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1637 intptr_t i, j, opr_sz = simd_oprsz(desc);
1638 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1639 uint64_t f = *(uint64_t *)(vn + i);
1640 uint64_t b = *(uint64_t *)(vn + j);
1641 *(uint64_t *)(vd + i) = bswap64(b);
1642 *(uint64_t *)(vd + j) = bswap64(f);
1646 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1648 intptr_t i, j, opr_sz = simd_oprsz(desc);
1649 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1650 uint64_t f = *(uint64_t *)(vn + i);
1651 uint64_t b = *(uint64_t *)(vn + j);
1652 *(uint64_t *)(vd + i) = hswap64(b);
1653 *(uint64_t *)(vd + j) = hswap64(f);
1657 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1659 intptr_t i, j, opr_sz = simd_oprsz(desc);
1660 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1661 uint64_t f = *(uint64_t *)(vn + i);
1662 uint64_t b = *(uint64_t *)(vn + j);
1663 *(uint64_t *)(vd + i) = rol64(b, 32);
1664 *(uint64_t *)(vd + j) = rol64(f, 32);
1668 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1670 intptr_t i, j, opr_sz = simd_oprsz(desc);
1671 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1672 uint64_t f = *(uint64_t *)(vn + i);
1673 uint64_t b = *(uint64_t *)(vn + j);
1674 *(uint64_t *)(vd + i) = b;
1675 *(uint64_t *)(vd + j) = f;
1679 #define DO_TBL(NAME, TYPE, H) \
1680 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1682 intptr_t i, opr_sz = simd_oprsz(desc); \
1683 uintptr_t elem = opr_sz / sizeof(TYPE); \
1684 TYPE *d = vd, *n = vn, *m = vm; \
1685 ARMVectorReg tmp; \
1686 if (unlikely(vd == vn)) { \
1687 n = memcpy(&tmp, vn, opr_sz); \
1689 for (i = 0; i < elem; i++) { \
1690 TYPE j = m[H(i)]; \
1691 d[H(i)] = j < elem ? n[H(j)] : 0; \
1695 DO_TBL(sve_tbl_b, uint8_t, H1)
1696 DO_TBL(sve_tbl_h, uint16_t, H2)
1697 DO_TBL(sve_tbl_s, uint32_t, H4)
1698 DO_TBL(sve_tbl_d, uint64_t, )
1700 #undef DO_TBL
1702 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1703 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1705 intptr_t i, opr_sz = simd_oprsz(desc); \
1706 TYPED *d = vd; \
1707 TYPES *n = vn; \
1708 ARMVectorReg tmp; \
1709 if (unlikely(vn - vd < opr_sz)) { \
1710 n = memcpy(&tmp, n, opr_sz / 2); \
1712 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1713 d[HD(i)] = n[HS(i)]; \
1717 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1718 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1719 DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1721 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1722 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1723 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1725 #undef DO_UNPK
1727 /* Mask of bits included in the even numbered predicates of width esz.
1728 * We also use this for expand_bits/compress_bits, and so extend the
1729 * same pattern out to 16-bit units.
1731 static const uint64_t even_bit_esz_masks[5] = {
1732 0x5555555555555555ull,
1733 0x3333333333333333ull,
1734 0x0f0f0f0f0f0f0f0full,
1735 0x00ff00ff00ff00ffull,
1736 0x0000ffff0000ffffull,
1739 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1740 * For N==0, this corresponds to the operation that in qemu/bitops.h
1741 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1742 * section 7-2 Shuffling Bits.
1744 static uint64_t expand_bits(uint64_t x, int n)
1746 int i;
1748 x &= 0xffffffffu;
1749 for (i = 4; i >= n; i--) {
1750 int sh = 1 << i;
1751 x = ((x << sh) | x) & even_bit_esz_masks[i];
1753 return x;
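/* E.g. expand_bits(0b1011, 0) == 0b1000101: bit k of the input moves to
 * bit 2k, with zeros interleaved.  sve_zip_p below uses this to
 * interleave the active bits of two predicates.
 */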
1756 /* Compress units of 2**(N+1) bits to units of 2**N bits.
1757 * For N==0, this corresponds to the operation that in qemu/bitops.h
1758 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1759 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1761 static uint64_t compress_bits(uint64_t x, int n)
1763 int i;
1765 for (i = n; i <= 4; i++) {
1766 int sh = 1 << i;
1767 x &= even_bit_esz_masks[i];
1768 x = (x >> sh) | x;
1770 return x & 0xffffffffu;
1773 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1775 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1776 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1777 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1778 uint64_t *d = vd;
1779 intptr_t i;
1781 if (oprsz <= 8) {
1782 uint64_t nn = *(uint64_t *)vn;
1783 uint64_t mm = *(uint64_t *)vm;
1784 int half = 4 * oprsz;
1786 nn = extract64(nn, high * half, half);
1787 mm = extract64(mm, high * half, half);
1788 nn = expand_bits(nn, esz);
1789 mm = expand_bits(mm, esz);
1790 d[0] = nn + (mm << (1 << esz));
1791 } else {
1792 ARMPredicateReg tmp_n, tmp_m;
1794 /* We produce output faster than we consume input.
1795 Therefore we must be mindful of possible overlap. */
1796 if ((vn - vd) < (uintptr_t)oprsz) {
1797 vn = memcpy(&tmp_n, vn, oprsz);
1799 if ((vm - vd) < (uintptr_t)oprsz) {
1800 vm = memcpy(&tmp_m, vm, oprsz);
1802 if (high) {
1803 high = oprsz >> 1;
1806 if ((high & 3) == 0) {
1807 uint32_t *n = vn, *m = vm;
1808 high >>= 2;
1810 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1811 uint64_t nn = n[H4(high + i)];
1812 uint64_t mm = m[H4(high + i)];
1814 nn = expand_bits(nn, esz);
1815 mm = expand_bits(mm, esz);
1816 d[i] = nn + (mm << (1 << esz));
1818 } else {
1819 uint8_t *n = vn, *m = vm;
1820 uint16_t *d16 = vd;
1822 for (i = 0; i < oprsz / 2; i++) {
1823 uint16_t nn = n[H1(high + i)];
1824 uint16_t mm = m[H1(high + i)];
1826 nn = expand_bits(nn, esz);
1827 mm = expand_bits(mm, esz);
1828 d16[H2(i)] = nn + (mm << (1 << esz));
1834 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1836 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1837 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1838 int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1839 uint64_t *d = vd, *n = vn, *m = vm;
1840 uint64_t l, h;
1841 intptr_t i;
1843 if (oprsz <= 8) {
1844 l = compress_bits(n[0] >> odd, esz);
1845 h = compress_bits(m[0] >> odd, esz);
1846 d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1847 } else {
1848 ARMPredicateReg tmp_m;
1849 intptr_t oprsz_16 = oprsz / 16;
1851 if ((vm - vd) < (uintptr_t)oprsz) {
1852 m = memcpy(&tmp_m, vm, oprsz);
1855 for (i = 0; i < oprsz_16; i++) {
1856 l = n[2 * i + 0];
1857 h = n[2 * i + 1];
1858 l = compress_bits(l >> odd, esz);
1859 h = compress_bits(h >> odd, esz);
1860 d[i] = l + (h << 32);
1863 /* For VL which is not a power of 2, the results from M do not
1864 align nicely with the uint64_t for D. Put the aligned results
1865 from M into TMP_M and then copy it into place afterward. */
1866 if (oprsz & 15) {
1867 d[i] = compress_bits(n[2 * i] >> odd, esz);
1869 for (i = 0; i < oprsz_16; i++) {
1870 l = m[2 * i + 0];
1871 h = m[2 * i + 1];
1872 l = compress_bits(l >> odd, esz);
1873 h = compress_bits(h >> odd, esz);
1874 tmp_m.p[i] = l + (h << 32);
1876 tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
1878 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
1879 } else {
1880 for (i = 0; i < oprsz_16; i++) {
1881 l = m[2 * i + 0];
1882 h = m[2 * i + 1];
1883 l = compress_bits(l >> odd, esz);
1884 h = compress_bits(h >> odd, esz);
1885 d[oprsz_16 + i] = l + (h << 32);
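/* For the byte case (esz == 0, oprsz <= 8): with odd == 0, compress_bits
 * gathers the even-numbered predicate bits of N and of M, and the two
 * results are concatenated, giving UZP1 for byte elements; odd == 1
 * selects the odd-numbered bits instead, giving UZP2.
 */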
1891 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1893 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1894 uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1895 bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1896 uint64_t *d = vd, *n = vn, *m = vm;
1897 uint64_t mask;
1898 int shr, shl;
1899 intptr_t i;
1901 shl = 1 << esz;
1902 shr = 0;
1903 mask = even_bit_esz_masks[esz];
1904 if (odd) {
1905 mask <<= shl;
1906 shr = shl;
1907 shl = 0;
1910 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1911 uint64_t nn = (n[i] & mask) >> shr;
1912 uint64_t mm = (m[i] & mask) << shl;
1913 d[i] = nn + mm;
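/* For byte elements (esz == 0) with odd == 0: mask == 0x5555..., so nn
 * keeps the even-numbered predicate bits of N in place (shr == 0) while
 * mm moves those of M up by one (shl == 1); the sum packs
 * {n0, m0, n2, m2, ...} into successive bit pairs, i.e. TRN1.  With odd
 * set, the odd-numbered units are taken instead (TRN2).
 */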
1917 /* Reverse units of 2**N bits. */
1918 static uint64_t reverse_bits_64(uint64_t x, int n)
1920 int i, sh;
1922 x = bswap64(x);
1923 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
1924 uint64_t mask = even_bit_esz_masks[i];
1925 x = ((x & mask) << sh) | ((x >> sh) & mask);
1927 return x;
1930 static uint8_t reverse_bits_8(uint8_t x, int n)
1932 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
1933 int i, sh;
1935 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
1936 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
1938 return x;
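/* For example, reverse_bits_64(x, 3) is just bswap64(x), reversing the
 * byte order, and each further iteration halves the unit size, so
 * reverse_bits_64(x, 0) reverses all 64 bits.  Likewise for the 8-bit
 * variant, e.g. reverse_bits_8(0b00000110, 0) == 0b01100000.
 */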
1941 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
1943 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1944 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1945 intptr_t i, oprsz_2 = oprsz / 2;
1947 if (oprsz <= 8) {
1948 uint64_t l = *(uint64_t *)vn;
1949 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
1950 *(uint64_t *)vd = l;
1951 } else if ((oprsz & 15) == 0) {
1952 for (i = 0; i < oprsz_2; i += 8) {
1953 intptr_t ih = oprsz - 8 - i;
1954 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
1955 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
1956 *(uint64_t *)(vd + i) = h;
1957 *(uint64_t *)(vd + ih) = l;
1959 } else {
1960 for (i = 0; i < oprsz_2; i += 1) {
1961 intptr_t il = H1(i);
1962 intptr_t ih = H1(oprsz - 1 - i);
1963 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
1964 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
1965 *(uint8_t *)(vd + il) = h;
1966 *(uint8_t *)(vd + ih) = l;
1971 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
1973 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1974 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1975 uint64_t *d = vd;
1976 intptr_t i;
1978 if (oprsz <= 8) {
1979 uint64_t nn = *(uint64_t *)vn;
1980 int half = 4 * oprsz;
1982 nn = extract64(nn, high * half, half);
1983 nn = expand_bits(nn, 0);
1984 d[0] = nn;
1985 } else {
1986 ARMPredicateReg tmp_n;
1988 /* We produce output faster than we consume input.
1989 Therefore we must be mindful of possible overlap. */
1990 if ((vn - vd) < (uintptr_t)oprsz) {
1991 vn = memcpy(&tmp_n, vn, oprsz);
1993 if (high) {
1994 high = oprsz >> 1;
1997 if ((high & 3) == 0) {
1998 uint32_t *n = vn;
1999 high >>= 2;
2001 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2002 uint64_t nn = n[H4(high + i)];
2003 d[i] = expand_bits(nn, 0);
2005 } else {
2006 uint16_t *d16 = vd;
2007 uint8_t *n = vn;
2009 for (i = 0; i < oprsz / 2; i++) {
2010 uint16_t nn = n[H1(high + i)];
2011 d16[H2(i)] = expand_bits(nn, 0);
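/* For example (oprsz <= 8, illustrative operand): with nn == 0b1011 the
 * low-half unpack is expand_bits(0b1011, 0) == 0b01000101, i.e. each
 * source predicate bit i moves to bit 2*i with a zero above it,
 * matching the doubled element size of PUNPKLO/PUNPKHI.
 */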
2017 #define DO_ZIP(NAME, TYPE, H) \
2018 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2020 intptr_t oprsz = simd_oprsz(desc); \
2021 intptr_t i, oprsz_2 = oprsz / 2; \
2022 ARMVectorReg tmp_n, tmp_m; \
2023 /* We produce output faster than we consume input. \
2024 Therefore we must be mindful of possible overlap. */ \
2025 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2026 vn = memcpy(&tmp_n, vn, oprsz_2); \
2028 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2029 vm = memcpy(&tmp_m, vm, oprsz_2); \
2031 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2032 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2033 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2037 DO_ZIP(sve_zip_b, uint8_t, H1)
2038 DO_ZIP(sve_zip_h, uint16_t, H1_2)
2039 DO_ZIP(sve_zip_s, uint32_t, H1_4)
2040 DO_ZIP(sve_zip_d, uint64_t, )
2042 #define DO_UZP(NAME, TYPE, H) \
2043 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2045 intptr_t oprsz = simd_oprsz(desc); \
2046 intptr_t oprsz_2 = oprsz / 2; \
2047 intptr_t odd_ofs = simd_data(desc); \
2048 intptr_t i; \
2049 ARMVectorReg tmp_m; \
2050 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2051 vm = memcpy(&tmp_m, vm, oprsz); \
2053 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2054 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2056 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2057 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2061 DO_UZP(sve_uzp_b, uint8_t, H1)
2062 DO_UZP(sve_uzp_h, uint16_t, H1_2)
2063 DO_UZP(sve_uzp_s, uint32_t, H1_4)
2064 DO_UZP(sve_uzp_d, uint64_t, )
2066 #define DO_TRN(NAME, TYPE, H) \
2067 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2069 intptr_t oprsz = simd_oprsz(desc); \
2070 intptr_t odd_ofs = simd_data(desc); \
2071 intptr_t i; \
2072 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2073 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2074 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2075 *(TYPE *)(vd + H(i + 0)) = ae; \
2076 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2080 DO_TRN(sve_trn_b, uint8_t, H1)
2081 DO_TRN(sve_trn_h, uint16_t, H1_2)
2082 DO_TRN(sve_trn_s, uint32_t, H1_4)
2083 DO_TRN(sve_trn_d, uint64_t, )
2085 #undef DO_ZIP
2086 #undef DO_UZP
2087 #undef DO_TRN
2089 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2091 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2092 uint32_t *d = vd, *n = vn;
2093 uint8_t *pg = vg;
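/* For 32-bit elements, two elements share each predicate byte; their
 * significant predicate bits are bit 0 and bit 4 respectively.
 */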
2095 for (i = j = 0; i < opr_sz; i++) {
2096 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2097 d[H4(j)] = n[H4(i)];
2098 j++;
2101 for (; j < opr_sz; j++) {
2102 d[H4(j)] = 0;
2106 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2108 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2109 uint64_t *d = vd, *n = vn;
2110 uint8_t *pg = vg;
2112 for (i = j = 0; i < opr_sz; i++) {
2113 if (pg[H1(i)] & 1) {
2114 d[j] = n[i];
2115 j++;
2118 for (; j < opr_sz; j++) {
2119 d[j] = 0;
2123 /* Similar to the ARM LastActiveElement pseudocode function, except the
2124 * result is multiplied by the element size.  This includes the not-found
2125 * indication; e.g. not found for esz=3 is -8.  */
2127 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2129 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2130 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2132 return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
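/* For example, with esz == 2 (word elements) and the last active element
 * at index 3, the result is 3 * 4 == 12; with no active elements it is
 * -(1 << esz) == -4.
 */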
2135 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2137 intptr_t opr_sz = simd_oprsz(desc) / 8;
2138 int esz = simd_data(desc);
2139 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2140 intptr_t i, first_i, last_i;
2141 ARMVectorReg tmp;
2143 first_i = last_i = 0;
2144 first_g = last_g = 0;
2146 /* Find the extent of the active elements within VG. */
2147 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2148 pg = *(uint64_t *)(vg + i) & mask;
2149 if (pg) {
2150 if (last_g == 0) {
2151 last_g = pg;
2152 last_i = i;
2154 first_g = pg;
2155 first_i = i;
2159 len = 0;
2160 if (first_g != 0) {
2161 first_i = first_i * 8 + ctz64(first_g);
2162 last_i = last_i * 8 + 63 - clz64(last_g);
2163 len = last_i - first_i + (1 << esz);
2164 if (vd == vm) {
2165 vm = memcpy(&tmp, vm, opr_sz * 8);
2167 swap_memmove(vd, vn + first_i, len);
2169 swap_memmove(vd + len, vm, opr_sz * 8 - len);
2172 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2173 void *vg, uint32_t desc)
2175 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2176 uint64_t *d = vd, *n = vn, *m = vm;
2177 uint8_t *pg = vg;
2179 for (i = 0; i < opr_sz; i += 1) {
2180 uint64_t nn = n[i], mm = m[i];
2181 uint64_t pp = expand_pred_b(pg[H1(i)]);
2182 d[i] = (nn & pp) | (mm & ~pp);
2186 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2187 void *vg, uint32_t desc)
2189 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2190 uint64_t *d = vd, *n = vn, *m = vm;
2191 uint8_t *pg = vg;
2193 for (i = 0; i < opr_sz; i += 1) {
2194 uint64_t nn = n[i], mm = m[i];
2195 uint64_t pp = expand_pred_h(pg[H1(i)]);
2196 d[i] = (nn & pp) | (mm & ~pp);
2200 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2201 void *vg, uint32_t desc)
2203 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2204 uint64_t *d = vd, *n = vn, *m = vm;
2205 uint8_t *pg = vg;
2207 for (i = 0; i < opr_sz; i += 1) {
2208 uint64_t nn = n[i], mm = m[i];
2209 uint64_t pp = expand_pred_s(pg[H1(i)]);
2210 d[i] = (nn & pp) | (mm & ~pp);
2214 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2215 void *vg, uint32_t desc)
2217 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2218 uint64_t *d = vd, *n = vn, *m = vm;
2219 uint8_t *pg = vg;
2221 for (i = 0; i < opr_sz; i += 1) {
2222 uint64_t nn = n[i], mm = m[i];
2223 d[i] = (pg[H1(i)] & 1 ? nn : mm);
2227 /* Two-operand comparison controlled by a predicate.
2228 * ??? It is very tempting to expand this inline
2229 * with x86 instructions, e.g.
2231 * vcmpeqw zm, zn, %ymm0
2232 * vpmovmskb %ymm0, %eax
2233 * and $0x5555, %eax
2234 * and pg, %eax
2236 * or even aarch64, e.g.
2238 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2239 * cmeq v0.8h, zn, zm
2240 * and v0.8h, v0.8h, mask
2241 * addv h0, v0.8h
2242 * and v0.8b, pg
2244 * However, coming up with an abstraction that allows vector inputs and
2245 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2246 * scalar outputs, is tricky.  */
2248 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
2249 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2251 intptr_t opr_sz = simd_oprsz(desc); \
2252 uint32_t flags = PREDTEST_INIT; \
2253 intptr_t i = opr_sz; \
2254 do { \
2255 uint64_t out = 0, pg; \
2256 do { \
2257 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2258 TYPE nn = *(TYPE *)(vn + H(i)); \
2259 TYPE mm = *(TYPE *)(vm + H(i)); \
2260 out |= nn OP mm; \
2261 } while (i & 63); \
2262 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2263 out &= pg; \
2264 *(uint64_t *)(vd + (i >> 3)) = out; \
2265 flags = iter_predtest_bwd(out, pg, flags); \
2266 } while (i > 0); \
2267 return flags; \
2270 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2271 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2272 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2273 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2274 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2275 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2276 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2277 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
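/* In the predicate layout used here each element owns sizeof(TYPE)
 * predicate bits, of which only the lowest is significant; hence OUT
 * shifts by sizeof(TYPE) per element and MASK keeps only those bits of
 * PG (e.g. 0x1111... selects bits 0, 4, 8, ... for word elements).
 * Conceptually (illustrative scalar equivalent):
 *
 *    for (e = 0; e < elements; ++e) {
 *        pd[e] = pg[e] & (zn[e] OP zm[e]);
 *    }
 *
 * with the flags then computed by PredTest over the result.
 */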
2279 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
2280 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2281 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2282 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2284 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
2285 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2286 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2287 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2289 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
2290 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2291 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2292 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2294 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
2295 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2296 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2297 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2299 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
2300 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2301 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2302 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2304 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
2305 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2306 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2307 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2309 #undef DO_CMP_PPZZ_B
2310 #undef DO_CMP_PPZZ_H
2311 #undef DO_CMP_PPZZ_S
2312 #undef DO_CMP_PPZZ_D
2313 #undef DO_CMP_PPZZ
2315 /* Similar, but the second source is "wide". */
2316 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2317 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2319 intptr_t opr_sz = simd_oprsz(desc); \
2320 uint32_t flags = PREDTEST_INIT; \
2321 intptr_t i = opr_sz; \
2322 do { \
2323 uint64_t out = 0, pg; \
2324 do { \
2325 TYPEW mm = *(TYPEW *)(vm + i - 8); \
2326 do { \
2327 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2328 TYPE nn = *(TYPE *)(vn + H(i)); \
2329 out |= nn OP mm; \
2330 } while (i & 7); \
2331 } while (i & 63); \
2332 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2333 out &= pg; \
2334 *(uint64_t *)(vd + (i >> 3)) = out; \
2335 flags = iter_predtest_bwd(out, pg, flags); \
2336 } while (i > 0); \
2337 return flags; \
2340 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2341 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
2342 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2343 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2344 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2345 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
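/* Each 64-bit element of ZM is compared against all of the narrow
 * elements of ZN sharing that 64-bit slot: e.g. for the byte form one
 * MM value is tested against eight successive NN bytes (the inner loop
 * bounded by "i & 7") before the next wide element is loaded.  There is
 * no 64-bit narrow form, hence no DO_CMP_PPZW_D.
 */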
2347 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, uint8_t, uint64_t, ==)
2348 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, uint16_t, uint64_t, ==)
2349 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, uint32_t, uint64_t, ==)
2351 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, uint8_t, uint64_t, !=)
2352 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, uint16_t, uint64_t, !=)
2353 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, uint32_t, uint64_t, !=)
2355 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
2356 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
2357 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
2359 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
2360 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
2361 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
2363 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
2364 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2365 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2367 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
2368 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2369 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2371 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
2372 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
2373 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
2375 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
2376 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
2377 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
2379 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
2380 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2381 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2383 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
2384 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2385 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2387 #undef DO_CMP_PPZW_B
2388 #undef DO_CMP_PPZW_H
2389 #undef DO_CMP_PPZW_S
2390 #undef DO_CMP_PPZW