target/arm: Implement SVE floating-point trig select coefficient
[qemu/ar7.git] / target / arm / sve_helper.c
blob85a0639e3af8d4434d610bf9f34bea293d15b8f0
1 /*
2 * ARM SVE Operations
4 * Copyright (c) 2018 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/exec-all.h"
23 #include "exec/cpu_ldst.h"
24 #include "exec/helper-proto.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "fpu/softfloat.h"
/* Vector data is stored in host-endian 64-bit chunks, so addressing
 * units smaller than 64 bits needs a host-endian index fixup.
 * H1 is a byte index, H1_2/H1_4 are 16/32-bit indices pre-scaled to
 * bytes, and H2/H4 are 16/32-bit element indices.  On little-endian
 * hosts all of these are the identity.
 */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x)    ((x) ^ 7)
#define H1_2(x)  ((x) ^ 6)
#define H1_4(x)  ((x) ^ 4)
#define H2(x)    ((x) ^ 3)
#define H4(x)    ((x) ^ 1)
#else
#define H1(x)    (x)
#define H1_2(x)  (x)
#define H1_4(x)  (x)
#define H2(x)    (x)
#define H4(x)    (x)
#endif
/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set.  Compare the definitions of these variables
 * within CPUARMState.
 */

/* With no G bits set, PredTest yields NZCV = C alone.  */
#define PREDTEST_INIT  1
/* One step of the iterative PredTest computation, applied to a single
 * 64-bit predicate word D under governing predicate word G, moving
 * forward through the vector one word at a time.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    uint64_t msb;

    if (g == 0) {
        /* No active elements in this word; flags are unchanged.  */
        return flags;
    }

    /* N comes from the first active element overall; bit 2 of FLAGS
     * records that the first active G bit has already been seen.
     */
    if (!(flags & 4)) {
        if (d & (g & -g)) {
            flags |= 1u << 31;
        }
        flags |= 4;
    }

    /* Z is clear if any active element of D is set; accumulate.  */
    if (d & g) {
        flags |= 2;
    }

    /* C comes from the last active element: replace bit 0 with
     * !(D & pow2floor(G)).  Compute pow2floor(g), the highest set
     * bit of G, by smearing the bits downward.
     */
    msb = g;
    msb |= msb >> 1;
    msb |= msb >> 2;
    msb |= msb >> 4;
    msb |= msb >> 8;
    msb |= msb >> 16;
    msb |= msb >> 32;
    msb -= msb >> 1;

    flags = (flags & ~1u) | ((d & msb) == 0);

    return flags;
}
77 /* The same for a single word predicate. */
78 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
80 return iter_predtest_fwd(d, g, PREDTEST_INIT);
83 /* The same for a multi-word predicate. */
84 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
86 uint32_t flags = PREDTEST_INIT;
87 uint64_t *d = vd, *g = vg;
88 uintptr_t i = 0;
90 do {
91 flags = iter_predtest_fwd(d[i], g[i], flags);
92 } while (++i < words);
94 return flags;
/* Expand active predicate bits to bytes, for byte elements: bit J of
 * BYTE becomes byte J of the result, 0xff when set and 0x00 when
 * clear.  Equivalent to indexing a 256-entry lookup table built as:
 * for (i = 0; i < 256; ++i) {
 *     unsigned long m = 0;
 *     for (j = 0; j < 8; j++) {
 *         if ((i >> j) & 1) {
 *             m |= 0xfful << (j << 3);
 *         }
 *     }
 * }
 */
static inline uint64_t expand_pred_b(uint8_t byte)
{
    uint64_t mask = 0;
    int j;

    for (j = 0; j < 8; j++) {
        if (byte & (1u << j)) {
            mask |= 0xffull << (j * 8);
        }
    }
    return mask;
}
/* Similarly for half-word elements: even bit J of BYTE (J = 0, 2, 4, 6,
 * the leading bit of each 2-bit predicate group) selects half-word J/2
 * of the result.  The odd, non-leading predicate bits are ignored,
 * exactly as the table form's "& 0x55" mask did.
 */
static inline uint64_t expand_pred_h(uint8_t byte)
{
    uint64_t mask = 0;
    int j;

    for (j = 0; j < 8; j += 2) {
        if (byte & (1u << j)) {
            mask |= 0xffffull << (j * 8);
        }
    }
    return mask;
}
/* Similarly for single word elements: bits 0 and 4 of BYTE (the
 * leading bit of each 4-bit predicate group) select the low and high
 * words of the result; all other bits are ignored.
 */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    uint64_t mask = 0;

    if (byte & 0x01) {
        mask |= 0x00000000ffffffffull;
    }
    if (byte & 0x10) {
        mask |= 0xffffffff00000000ull;
    }
    return mask;
}
/* Expand a helper for a predicate logical operation: FUNC is applied
 * to each 64-bit word of operands N and M with governing predicate G.
 */
#define LOGICAL_PPPP(NAME, FUNC)                                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    uintptr_t words = simd_oprsz(desc) / 8;                              \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                         \
    uintptr_t i;                                                         \
    for (i = 0; i < words; ++i) {                                        \
        d[i] = FUNC(n[i], m[i], g[i]);                                   \
    }                                                                    \
}
/* Bitwise predicate combinations.  All but SEL mask the result with
 * the governing predicate G; SEL chooses N where G is set and M
 * where it is clear.
 */
#define DO_AND(N, M, G)  (((N) & (M)) & (G))
#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))
/* Instantiate one helper per predicate logical instruction.  */
261 LOGICAL_PPPP(sve_and_pppp, DO_AND)
262 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
263 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
264 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
265 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
266 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
267 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
268 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
/* The predicate-combiner macros are scoped to this group only.  */
270 #undef DO_AND
271 #undef DO_BIC
272 #undef DO_EOR
273 #undef DO_ORR
274 #undef DO_ORN
275 #undef DO_NOR
276 #undef DO_NAND
277 #undef DO_SEL
278 #undef LOGICAL_PPPP
/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 *
 * ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                           \
    while (i < opr_sz) {                                                 \
        /* One 16-bit predicate chunk governs 16 bytes of data.  */      \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                  \
        do {                                                             \
            if (pg & 1) {                                                \
                TYPE nn = *(TYPE *)(vn + H(i));                          \
                TYPE mm = *(TYPE *)(vm + H(i));                          \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                       \
            }                                                            \
            i += sizeof(TYPE);                                           \
            pg >>= sizeof(TYPE);                                         \
        } while (i & 15);                                                \
    }                                                                    \
}
/* Similarly, specialized for 64-bit operands: no endian fixup is
 * needed, and the predicate is consulted one byte per element.
 */
#define DO_ZPZZ_D(NAME, TYPE, OP)                                        \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                           \
    TYPE *d = vd, *n = vn, *m = vm;                                      \
    uint8_t *pg = vg;                                                    \
    for (i = 0; i < opr_sz; ++i) {                                       \
        if (pg[H1(i)] & 1) {                                             \
            TYPE lhs = n[i], rhs = m[i];                                 \
            d[i] = OP(lhs, rhs);                                         \
        }                                                                \
    }                                                                    \
}
/* Element-wise integer operations used with the ZPZZ expanders.  */
#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)

/* Division, with the architectural SVE rule that division by zero
 * yields zero.  The previous form (M ? N / M : 0) invoked the C
 * undefined behavior INT_MIN / -1 for signed types, which raises
 * SIGFPE on x86 hosts.  Evaluate each operand exactly once and
 * special-case signed division by -1 as negation, performed in
 * unsigned arithmetic so that the INT_MIN input wraps to INT_MIN,
 * which is the architectural result.  The signedness test
 * ((__typeof(N))-1 < 0) folds at compile time.
 */
#define DO_DIV(N, M)                                           \
    ({ __typeof(N) div_n = (N), div_m = (M);                   \
       div_m == 0                                              \
       ? (__typeof(N))0                                        \
       : ((__typeof(N))-1 < 0 && div_m == (__typeof(N))-1)     \
       ? (__typeof(N))(-(uint64_t)div_n)                       \
       : (__typeof(N))(div_n / div_m); })
/* Predicated integer logical/arithmetic helpers, one per element size:
 * _b/_h/_s via DO_ZPZZ with the matching H index macro, _d via
 * DO_ZPZZ_D.  Signed variants (smax/smin/sabd) use signed types so
 * the comparison in the OP macro is signed.
 */
332 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
333 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
334 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
335 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
337 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
338 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
339 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
340 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
342 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
343 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
344 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
345 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
347 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
348 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
349 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
350 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
352 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
353 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
354 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
355 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
357 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
358 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
359 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
360 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
362 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
363 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
364 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
365 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
367 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
368 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
369 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
370 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
372 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
373 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
374 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
375 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
377 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
378 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
379 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
380 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
382 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
383 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
384 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
385 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
387 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
388 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
389 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
390 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
/* Because the computation type is at least twice as large as required,
 * these work for both signed and unsigned source types.
 * High half of the 8x8 product.
 */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    int32_t prod = n * m;

    return prod >> 8;
}
/* High half of the 16x16 product.  NOTE(review): for two maximal
 * unsigned inputs the int32 product relies on wrapping overflow;
 * QEMU is built with -fwrapv, which defines that behavior.
 */
static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    int32_t prod = n * m;

    return prod >> 16;
}
/* High half of the 32x32 product, computed at 64 bits.  NOTE(review):
 * for two maximal unsigned inputs the int64 product relies on wrapping
 * overflow; QEMU is built with -fwrapv, which defines that behavior.
 */
static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    int64_t prod = n * m;

    return prod >> 32;
}
/* High half of the signed 64x64->128 product, via the host-utils
 * primitive (portable to 32-bit hosts without __int128).
 */
static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t discard, hi;

    muls64(&discard, &hi, n, m);
    return hi;
}
/* High half of the unsigned 64x64->128 product.  */
static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t discard, hi;

    mulu64(&discard, &hi, n, m);
    return hi;
}
/* Multiply, high-multiply (signed/unsigned) and divide helpers.
 * MUL low half is sign-agnostic; the _d high-multiply variants use
 * the 128-bit host-utils primitives above.  DIV exists only for
 * 32- and 64-bit elements, as in the architecture.
 */
423 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
424 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
425 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
426 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
428 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
429 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
430 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
431 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
433 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
434 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
435 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
436 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
438 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_DIV)
439 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_DIV)
441 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_DIV)
442 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_DIV)
/* Note that all bits of the shift are significant and NOT modulo the
 * element size: an over-wide ASR fills with the sign bit (shift is
 * clamped to width-1), and over-wide LSR/LSL produce zero.
 */
#define DO_ASR(N, M)  ((N) >> ((M) < sizeof(N) * 8 - 1 ? (M) : sizeof(N) * 8 - 1))
#define DO_LSR(N, M)  ((M) < sizeof(N) * 8 ? (N) >> (M) : 0)
#define DO_LSL(N, M)  ((M) < sizeof(N) * 8 ? (N) << (M) : 0)
450 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
451 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
452 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
454 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
455 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
456 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
458 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
459 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
460 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
462 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
463 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
464 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
466 #undef DO_ZPZZ
467 #undef DO_ZPZZ_D
/* Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide".  That is, for D = N op M, the same 64-bit
 * value of M is used with all of the narrower values of N within
 * each 64-bit chunk.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                           \
    while (i < opr_sz) {                                                 \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                      \
        TYPEW mm = *(TYPEW *)(vm + i);                                   \
        do {                                                             \
            if (pg & 1) {                                                \
                TYPE nn = *(TYPE *)(vn + H(i));                          \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                       \
            }                                                            \
            i += sizeof(TYPE);                                           \
            pg >>= sizeof(TYPE);                                         \
        } while (i & 7);                                                 \
    }                                                                    \
}
/* Predicated shifts by a wide (64-bit) shift-count element.  */
490 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
491 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
492 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
494 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
495 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
496 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
498 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
499 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
500 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
502 #undef DO_ZPZW
/* Fully general two-operand expander, controlled by a predicate.  */
#define DO_ZPZ(NAME, TYPE, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{                                                              \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                 \
    while (i < opr_sz) {                                       \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));        \
        do {                                                   \
            if (pg & 1) {                                      \
                TYPE nn = *(TYPE *)(vn + H(i));                \
                *(TYPE *)(vd + H(i)) = OP(nn);                 \
            }                                                  \
            i += sizeof(TYPE);                                 \
            pg >>= sizeof(TYPE);                               \
        } while (i & 15);                                      \
    }                                                          \
}
/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZ_D(NAME, TYPE, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                 \
    TYPE *d = vd, *n = vn;                                     \
    uint8_t *pg = vg;                                          \
    for (i = 0; i < opr_sz; ++i) {                             \
        if (pg[H1(i)] & 1) {                                   \
            TYPE elt = n[i];                                   \
            d[i] = OP(elt);                                    \
        }                                                      \
    }                                                          \
}
/* Count leading sign bits (excluding the sign bit itself); the 8- and
 * 16-bit forms adjust the 32-bit primitive's count.  */
537 #define DO_CLS_B(N) (clrsb32(N) - 24)
538 #define DO_CLS_H(N) (clrsb32(N) - 16)
540 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
541 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
542 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
543 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
/* Count leading zeros, with the same width adjustment.  */
545 #define DO_CLZ_B(N) (clz32(N) - 24)
546 #define DO_CLZ_H(N) (clz32(N) - 16)
548 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
549 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
550 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
551 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
/* Population count per element.  */
553 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
554 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
555 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
556 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
/* Logical invert: 1 when the element is zero, else 0.  */
558 #define DO_CNOT(N) (N == 0)
560 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
561 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
562 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
563 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
/* FP absolute value as a bit operation: clear the sign bit.
 * ((__typeof(N))-1 >> 1) is the all-ones value minus the MSB.  */
565 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
567 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
568 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
569 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
/* FP negate as a bit operation: flip the sign bit.  */
571 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
573 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
574 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
575 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
/* Bitwise invert.  */
577 #define DO_NOT(N) (~N)
579 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
580 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
581 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
582 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
/* Sign/zero extension of the low 8/16/32 bits of each element.  */
584 #define DO_SXTB(N) ((int8_t)N)
585 #define DO_SXTH(N) ((int16_t)N)
586 #define DO_SXTS(N) ((int32_t)N)
587 #define DO_UXTB(N) ((uint8_t)N)
588 #define DO_UXTH(N) ((uint16_t)N)
589 #define DO_UXTS(N) ((uint32_t)N)
591 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
592 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
593 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
594 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
595 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
596 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
598 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
599 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
600 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
601 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
602 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
603 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
/* Integer absolute value / negate.  NOTE(review): -N of the minimum
 * signed value relies on wrapping; QEMU builds with -fwrapv.  */
605 #define DO_ABS(N) (N < 0 ? -N : N)
607 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
608 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
609 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
610 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
612 #define DO_NEG(N) (-N)
614 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
615 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
616 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
617 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
/* Three-operand expander, unpredicated, in which the third operand
 * is "wide": one 64-bit M value per chunk of narrow N elements.
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                 \
    while (i < opr_sz) {                                       \
        TYPEW mm = *(TYPEW *)(vm + i);                         \
        do {                                                   \
            TYPE nn = *(TYPE *)(vn + H(i));                    \
            *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
            i += sizeof(TYPE);                                 \
        } while (i & 7);                                       \
    }                                                          \
}
/* Unpredicated shifts by a wide shift-count element.  */
635 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
636 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
637 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
639 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
640 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
641 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
643 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
644 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
645 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
647 #undef DO_ZZW
/* Retire the per-element OP macros used with DO_ZPZ above.  */
649 #undef DO_CLS_B
650 #undef DO_CLS_H
651 #undef DO_CLZ_B
652 #undef DO_CLZ_H
653 #undef DO_CNOT
654 #undef DO_FABS
655 #undef DO_FNEG
656 #undef DO_ABS
657 #undef DO_NEG
658 #undef DO_ZPZ
659 #undef DO_ZPZ_D
/* Two-operand reduction expander, controlled by a predicate.
 * The difference between TYPERED and TYPERET has to do with
 * sign-extension: e.g. for SMAX, TYPERED must be signed, but TYPERET
 * must be unsigned so that a 32-bit value is not sign-extended to the
 * ABI uint64_t return type.
 *
 * ??? If we were to vectorize this by hand the reduction ordering
 * would change.  For integer operands, this is perfectly fine.
 */
#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)     \
{                                                            \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);               \
    TYPERED ret = INIT;                                      \
    while (i < opr_sz) {                                     \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));      \
        do {                                                 \
            if (pg & 1) {                                    \
                TYPEELT nn = *(TYPEELT *)(vn + H(i));        \
                ret = OP(ret, nn);                           \
            }                                                \
            i += sizeof(TYPEELT);                            \
            pg >>= sizeof(TYPEELT);                          \
        } while (i & 15);                                    \
    }                                                        \
    return (TYPERET)ret;                                     \
}
/* Similarly, specialized for 64-bit operands.  */
#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)           \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
{                                                        \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;           \
    TYPEE *n = vn;                                       \
    uint8_t *pg = vg;                                    \
    TYPER ret = INIT;                                    \
    for (i = 0; i < opr_sz; ++i) {                       \
        if (pg[H1(i)] & 1) {                             \
            TYPEE elt = n[i];                            \
            ret = OP(ret, elt);                          \
        }                                                \
    }                                                    \
    return ret;                                          \
}
/* Horizontal reductions.  INIT is the identity for each operation
 * (all-ones for AND/UMIN, type-MIN/MAX for signed max/min).  Note
 * there is no saddv_d: the 64-bit signed sum is identical to uaddv_d.
 */
704 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
705 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
706 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
707 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
709 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
710 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
711 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
712 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
714 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
715 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
716 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
717 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
719 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
720 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
721 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
723 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
724 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
725 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
726 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
728 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
729 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
730 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
731 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
733 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
734 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
735 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
736 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
738 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
739 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
740 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
741 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
743 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
744 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
745 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
746 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
748 #undef DO_VPZ
749 #undef DO_VPZ_D
/* Retire the shared element OP macros.  */
751 #undef DO_AND
752 #undef DO_ORR
753 #undef DO_EOR
754 #undef DO_BIC
755 #undef DO_ADD
756 #undef DO_SUB
757 #undef DO_MAX
758 #undef DO_MIN
759 #undef DO_ABD
760 #undef DO_MUL
761 #undef DO_DIV
762 #undef DO_ASR
763 #undef DO_LSR
764 #undef DO_LSL
766 /* Similar to the ARM LastActiveElement pseudocode function, except the
767 result is multiplied by the element size. This includes the not found
768 indication; e.g. not found for esz=3 is -8. */
769 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
771 uint64_t mask = pred_esz_masks[esz];
772 intptr_t i = words;
774 do {
775 uint64_t this_g = g[--i] & mask;
776 if (this_g) {
777 return i * 64 + (63 - clz64(this_g));
779 } while (i > 0);
780 return (intptr_t)-1 << esz;
783 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
785 uint32_t flags = PREDTEST_INIT;
786 uint64_t *d = vd, *g = vg;
787 intptr_t i = 0;
789 do {
790 uint64_t this_d = d[i];
791 uint64_t this_g = g[i];
793 if (this_g) {
794 if (!(flags & 4)) {
795 /* Set in D the first bit of G. */
796 this_d |= this_g & -this_g;
797 d[i] = this_d;
799 flags = iter_predtest_fwd(this_d, this_g, flags);
801 } while (++i < words);
803 return flags;
806 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
808 intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
809 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
810 uint32_t flags = PREDTEST_INIT;
811 uint64_t *d = vd, *g = vg, esz_mask;
812 intptr_t i, next;
814 next = last_active_element(vd, words, esz) + (1 << esz);
815 esz_mask = pred_esz_masks[esz];
817 /* Similar to the pseudocode for pnext, but scaled by ESZ
818 so that we find the correct bit. */
819 if (next < words * 64) {
820 uint64_t mask = -1;
822 if (next & 63) {
823 mask = ~((1ull << (next & 63)) - 1);
824 next &= -64;
826 do {
827 uint64_t this_g = g[next / 64] & esz_mask & mask;
828 if (this_g != 0) {
829 next = (next & -64) + ctz64(this_g);
830 break;
832 next += 64;
833 mask = -1;
834 } while (next < words * 64);
837 i = 0;
838 do {
839 uint64_t this_d = 0;
840 if (i == next / 64) {
841 this_d = 1ull << (next & 63);
843 d[i] = this_d;
844 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
845 } while (++i < words);
847 return flags;
850 /* Store zero into every active element of Zd. We will use this for two
851 * and three-operand predicated instructions for which logic dictates a
852 * zero result. In particular, logical shift by element size, which is
853 * otherwise undefined on the host.
855 * For element sizes smaller than uint64_t, we use tables to expand
856 * the N bits of the controlling predicate to a byte mask, and clear
857 * those bytes.
859 void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
861 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
862 uint64_t *d = vd;
863 uint8_t *pg = vg;
864 for (i = 0; i < opr_sz; i += 1) {
865 d[i] &= ~expand_pred_b(pg[H1(i)]);
869 void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
871 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
872 uint64_t *d = vd;
873 uint8_t *pg = vg;
874 for (i = 0; i < opr_sz; i += 1) {
875 d[i] &= ~expand_pred_h(pg[H1(i)]);
879 void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
881 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
882 uint64_t *d = vd;
883 uint8_t *pg = vg;
884 for (i = 0; i < opr_sz; i += 1) {
885 d[i] &= ~expand_pred_s(pg[H1(i)]);
889 void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
891 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
892 uint64_t *d = vd;
893 uint8_t *pg = vg;
894 for (i = 0; i < opr_sz; i += 1) {
895 if (pg[H1(i)] & 1) {
896 d[i] = 0;
/* Three-operand expander, immediate operand, controlled by a
 * predicate.  The immediate is carried in the simd data field.
 */
#define DO_ZPZI(NAME, TYPE, H, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{                                                              \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                 \
    TYPE imm = simd_data(desc);                                \
    while (i < opr_sz) {                                       \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));        \
        do {                                                   \
            if (pg & 1) {                                      \
                TYPE nn = *(TYPE *)(vn + H(i));                \
                *(TYPE *)(vd + H(i)) = OP(nn, imm);            \
            }                                                  \
            i += sizeof(TYPE);                                 \
            pg >>= sizeof(TYPE);                               \
        } while (i & 15);                                      \
    }                                                          \
}
/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZI_D(NAME, TYPE, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                 \
    TYPE *d = vd, *n = vn;                                     \
    TYPE imm = simd_data(desc);                                \
    uint8_t *pg = vg;                                          \
    for (i = 0; i < opr_sz; ++i) {                             \
        if (pg[H1(i)] & 1) {                                   \
            TYPE elt = n[i];                                   \
            d[i] = OP(elt, imm);                               \
        }                                                      \
    }                                                          \
}
/* Immediate shifts.  The immediate is pre-validated by the decoder,
 * so no width clamping is required here.
 */
#define DO_SHR(N, M)  (N >> M)
#define DO_SHL(N, M)  (N << M)

/* Arithmetic shift right for division.  This rounds negative numbers
 * toward zero as per signed division.  Therefore before shifting,
 * when N is negative, add 2**M-1.
 */
#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
944 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
945 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
946 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
947 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
949 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
950 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
951 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
952 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
954 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
955 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
956 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
957 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
959 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
960 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
961 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
962 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
964 #undef DO_SHR
965 #undef DO_SHL
966 #undef DO_ASRD
967 #undef DO_ZPZI
968 #undef DO_ZPZI_D
/* Fully general four-operand expander, controlled by a predicate.
 *
 * As with DO_ZPZI, the predicate is read one 16-bit word at a time,
 * and each predicate bit covers one byte of vector data.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP)                             \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,       \
                  void *vg, uint32_t desc)                      \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pred = *(uint16_t *)(vg + H1_2(i >> 3));       \
        do {                                                    \
            if (pred & 1) {                                     \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                TYPE mm = *(TYPE *)(vm + H(i));                 \
                TYPE aa = *(TYPE *)(va + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);          \
            }                                                   \
            i += sizeof(TYPE), pred >>= sizeof(TYPE);           \
        } while (i & 15);                                       \
    }                                                           \
}
/* Similarly, specialized for 64-bit operands.
 * One predicate byte per element; only its low bit is significant.
 */
#define DO_ZPZZZ_D(NAME, TYPE, OP)                              \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,       \
                  void *vg, uint32_t desc)                      \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *a = va, *n = vn, *m = vm;                    \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE aa = a[i], nn = n[i], mm = m[i];               \
            d[i] = OP(aa, nn, mm);                              \
        }                                                       \
    }                                                           \
}
/* Multiply-add and multiply-subtract; unsigned arithmetic wraps
 * modulo 2**width, which matches the instruction semantics.
 */
#define DO_MLA(A, N, M) (A + N * M)
#define DO_MLS(A, N, M) (A - N * M)

DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)

DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)

DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)

DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZPZZZ
#undef DO_ZPZZZ_D
1027 void HELPER(sve_index_b)(void *vd, uint32_t start,
1028 uint32_t incr, uint32_t desc)
1030 intptr_t i, opr_sz = simd_oprsz(desc);
1031 uint8_t *d = vd;
1032 for (i = 0; i < opr_sz; i += 1) {
1033 d[H1(i)] = start + i * incr;
1037 void HELPER(sve_index_h)(void *vd, uint32_t start,
1038 uint32_t incr, uint32_t desc)
1040 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1041 uint16_t *d = vd;
1042 for (i = 0; i < opr_sz; i += 1) {
1043 d[H2(i)] = start + i * incr;
1047 void HELPER(sve_index_s)(void *vd, uint32_t start,
1048 uint32_t incr, uint32_t desc)
1050 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1051 uint32_t *d = vd;
1052 for (i = 0; i < opr_sz; i += 1) {
1053 d[H4(i)] = start + i * incr;
1057 void HELPER(sve_index_d)(void *vd, uint64_t start,
1058 uint64_t incr, uint32_t desc)
1060 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1061 uint64_t *d = vd;
1062 for (i = 0; i < opr_sz; i += 1) {
1063 d[i] = start + i * incr;
1067 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1069 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1070 uint32_t sh = simd_data(desc);
1071 uint32_t *d = vd, *n = vn, *m = vm;
1072 for (i = 0; i < opr_sz; i += 1) {
1073 d[i] = n[i] + (m[i] << sh);
1077 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1079 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1080 uint64_t sh = simd_data(desc);
1081 uint64_t *d = vd, *n = vn, *m = vm;
1082 for (i = 0; i < opr_sz; i += 1) {
1083 d[i] = n[i] + (m[i] << sh);
1087 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1089 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1090 uint64_t sh = simd_data(desc);
1091 uint64_t *d = vd, *n = vn, *m = vm;
1092 for (i = 0; i < opr_sz; i += 1) {
1093 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1097 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1099 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1100 uint64_t sh = simd_data(desc);
1101 uint64_t *d = vd, *n = vn, *m = vm;
1102 for (i = 0; i < opr_sz; i += 1) {
1103 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1107 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1109 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1110 static const uint16_t coeff[] = {
1111 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1112 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1113 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1114 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1116 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1117 uint16_t *d = vd, *n = vn;
1119 for (i = 0; i < opr_sz; i++) {
1120 uint16_t nn = n[i];
1121 intptr_t idx = extract32(nn, 0, 5);
1122 uint16_t exp = extract32(nn, 5, 5);
1123 d[i] = coeff[idx] | (exp << 10);
1127 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1129 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1130 static const uint32_t coeff[] = {
1131 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1132 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1133 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1134 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1135 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1136 0x1ef532, 0x20b051, 0x227043, 0x243516,
1137 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1138 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1139 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1140 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1141 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1142 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1143 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1144 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1145 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1146 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1148 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1149 uint32_t *d = vd, *n = vn;
1151 for (i = 0; i < opr_sz; i++) {
1152 uint32_t nn = n[i];
1153 intptr_t idx = extract32(nn, 0, 6);
1154 uint32_t exp = extract32(nn, 6, 8);
1155 d[i] = coeff[idx] | (exp << 23);
1159 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1161 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1162 static const uint64_t coeff[] = {
1163 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1164 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1165 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1166 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1167 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1168 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1169 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1170 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1171 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1172 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1173 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1174 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1175 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1176 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1177 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1178 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1179 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1180 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1181 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1182 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1183 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1184 0xFA7C1819E90D8ull,
1186 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1187 uint64_t *d = vd, *n = vn;
1189 for (i = 0; i < opr_sz; i++) {
1190 uint64_t nn = n[i];
1191 intptr_t idx = extract32(nn, 0, 6);
1192 uint64_t exp = extract32(nn, 6, 11);
1193 d[i] = coeff[idx] | (exp << 52);
1197 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1199 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1200 uint16_t *d = vd, *n = vn, *m = vm;
1201 for (i = 0; i < opr_sz; i += 1) {
1202 uint16_t nn = n[i];
1203 uint16_t mm = m[i];
1204 if (mm & 1) {
1205 nn = float16_one;
1207 d[i] = nn ^ (mm & 2) << 14;
1211 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1213 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1214 uint32_t *d = vd, *n = vn, *m = vm;
1215 for (i = 0; i < opr_sz; i += 1) {
1216 uint32_t nn = n[i];
1217 uint32_t mm = m[i];
1218 if (mm & 1) {
1219 nn = float32_one;
1221 d[i] = nn ^ (mm & 2) << 30;
1225 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1227 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1228 uint64_t *d = vd, *n = vn, *m = vm;
1229 for (i = 0; i < opr_sz; i += 1) {
1230 uint64_t nn = n[i];
1231 uint64_t mm = m[i];
1232 if (mm & 1) {
1233 nn = float64_one;
1235 d[i] = nn ^ (mm & 2) << 62;