4 * Copyright (c) 2018 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
22 #include "exec/exec-all.h"
23 #include "exec/cpu_ldst.h"
24 #include "exec/helper-proto.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "fpu/softfloat.h"
29 /* Note that vector data is stored in host-endian 64-bit chunks,
30 so addressing units smaller than that needs a host-endian fixup. */
31 #ifdef HOST_WORDS_BIGENDIAN
32 #define H1(x) ((x) ^ 7)
33 #define H1_2(x) ((x) ^ 6)
34 #define H1_4(x) ((x) ^ 4)
35 #define H2(x) ((x) ^ 3)
36 #define H4(x) ((x) ^ 1)
45 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
47 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
48 * and bit 0 set if C is set. Compare the definitions of these variables
52 /* For no G bits set, NZCV = C. */
53 #define PREDTEST_INIT 1
55 /* This is an iterative function, called for each Pd and Pg word
58 static uint32_t iter_predtest_fwd(uint64_t d
, uint64_t g
, uint32_t flags
)
61 /* Compute N from first D & G.
62 Use bit 2 to signal first G bit seen. */
64 flags
|= ((d
& (g
& -g
)) != 0) << 31;
68 /* Accumulate Z from each D & G. */
69 flags
|= ((d
& g
) != 0) << 1;
71 /* Compute C from last !(D & G). Replace previous. */
72 flags
= deposit32(flags
, 0, 1, (d
& pow2floor(g
)) == 0);
77 /* The same for a single word predicate. */
78 uint32_t HELPER(sve_predtest1
)(uint64_t d
, uint64_t g
)
80 return iter_predtest_fwd(d
, g
, PREDTEST_INIT
);
83 /* The same for a multi-word predicate. */
84 uint32_t HELPER(sve_predtest
)(void *vd
, void *vg
, uint32_t words
)
86 uint32_t flags
= PREDTEST_INIT
;
87 uint64_t *d
= vd
, *g
= vg
;
91 flags
= iter_predtest_fwd(d
[i
], g
[i
], flags
);
92 } while (++i
< words
);
97 /* Expand active predicate bits to bytes, for byte elements.
98 * for (i = 0; i < 256; ++i) {
99 * unsigned long m = 0;
100 * for (j = 0; j < 8; j++) {
101 * if ((i >> j) & 1) {
102 * m |= 0xfful << (j << 3);
105 * printf("0x%016lx,\n", m);
108 static inline uint64_t expand_pred_b(uint8_t byte
)
110 static const uint64_t word
[256] = {
111 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
112 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
113 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
114 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
115 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
116 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
117 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
118 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
119 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
120 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
121 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
122 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
123 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
124 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
125 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
126 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
127 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
128 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
129 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
130 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
131 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
132 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
133 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
134 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
135 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
136 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
137 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
138 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
139 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
140 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
141 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
142 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
143 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
144 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
145 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
146 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
147 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
148 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
149 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
150 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
151 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
152 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
153 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
154 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
155 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
156 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
157 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
158 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
159 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
160 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
161 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
162 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
163 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
164 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
165 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
166 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
167 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
168 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
169 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
170 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
171 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
172 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
173 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
174 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
175 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
176 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
177 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
178 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
179 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
180 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
181 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
182 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
183 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
184 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
185 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
186 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
187 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
188 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
189 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
190 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
191 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
192 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
193 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
194 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
195 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
201 /* Similarly for half-word elements.
202 * for (i = 0; i < 256; ++i) {
203 * unsigned long m = 0;
207 * for (j = 0; j < 8; j += 2) {
208 * if ((i >> j) & 1) {
209 * m |= 0xfffful << (j << 3);
212 * printf("[0x%x] = 0x%016lx,\n", i, m);
215 static inline uint64_t expand_pred_h(uint8_t byte
)
217 static const uint64_t word
[] = {
218 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
219 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
220 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
221 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
222 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
223 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
224 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
225 [0x55] = 0xffffffffffffffff,
227 return word
[byte
& 0x55];
230 /* Similarly for single word elements. */
231 static inline uint64_t expand_pred_s(uint8_t byte
)
233 static const uint64_t word
[] = {
234 [0x01] = 0x00000000ffffffffull
,
235 [0x10] = 0xffffffff00000000ull
,
236 [0x11] = 0xffffffffffffffffull
,
238 return word
[byte
& 0x11];
241 /* Swap 16-bit words within a 32-bit word. */
242 static inline uint32_t hswap32(uint32_t h
)
247 /* Swap 16-bit words within a 64-bit word. */
248 static inline uint64_t hswap64(uint64_t h
)
250 uint64_t m
= 0x0000ffff0000ffffull
;
252 return ((h
& m
) << 16) | ((h
>> 16) & m
);
255 /* Swap 32-bit words within a 64-bit word. */
256 static inline uint64_t wswap64(uint64_t h
)
261 #define LOGICAL_PPPP(NAME, FUNC) \
262 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
264 uintptr_t opr_sz = simd_oprsz(desc); \
265 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
267 for (i = 0; i < opr_sz / 8; ++i) { \
268 d[i] = FUNC(n[i], m[i], g[i]); \
/*
 * Predicate-register logic, one 64-bit word at a time.
 * N and M are the source predicate words, G is the governing predicate
 * word.  Every operation except DO_SEL produces zero in bit positions
 * where G is clear; DO_SEL picks N where G is set and M where it is clear.
 */
#define DO_AND(N, M, G)   ((N) & (M) & (G))
#define DO_BIC(N, M, G)   ((N) & (G) & ~(M))
#define DO_EOR(N, M, G)   ((G) & ((N) ^ (M)))
#define DO_ORR(N, M, G)   ((G) & ((N) | (M)))
#define DO_ORN(N, M, G)   ((G) & (~(M) | (N)))
#define DO_NOR(N, M, G)   (~(N) & ~(M) & (G))
#define DO_NAND(N, M, G)  ((~(N) | ~(M)) & (G))
/* Bit-select: start from M and flip to N wherever G is set. */
#define DO_SEL(N, M, G)   ((M) ^ (((N) ^ (M)) & (G)))
281 LOGICAL_PPPP(sve_and_pppp
, DO_AND
)
282 LOGICAL_PPPP(sve_bic_pppp
, DO_BIC
)
283 LOGICAL_PPPP(sve_eor_pppp
, DO_EOR
)
284 LOGICAL_PPPP(sve_sel_pppp
, DO_SEL
)
285 LOGICAL_PPPP(sve_orr_pppp
, DO_ORR
)
286 LOGICAL_PPPP(sve_orn_pppp
, DO_ORN
)
287 LOGICAL_PPPP(sve_nor_pppp
, DO_NOR
)
288 LOGICAL_PPPP(sve_nand_pppp
, DO_NAND
)
300 /* Fully general three-operand expander, controlled by a predicate.
301 * This is complicated by the host-endian storage of the register file.
303 /* ??? I don't expect the compiler could ever vectorize this itself.
304 * With some tables we can convert bit masks to byte masks, and with
305 * extra care wrt byte/word ordering we could use gcc generic vectors
306 * and do 16 bytes at a time.
308 #define DO_ZPZZ(NAME, TYPE, H, OP) \
309 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
311 intptr_t i, opr_sz = simd_oprsz(desc); \
312 for (i = 0; i < opr_sz; ) { \
313 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
316 TYPE nn = *(TYPE *)(vn + H(i)); \
317 TYPE mm = *(TYPE *)(vm + H(i)); \
318 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
320 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
325 /* Similarly, specialized for 64-bit operands. */
326 #define DO_ZPZZ_D(NAME, TYPE, OP) \
327 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
329 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
330 TYPE *d = vd, *n = vn, *m = vm; \
332 for (i = 0; i < opr_sz; i += 1) { \
333 if (pg[H1(i)] & 1) { \
334 TYPE nn = n[i], mm = m[i]; \
/*
 * Two-operand element operations handed to the expander macros as OP.
 * Every argument is parenthesized so that an expression argument keeps
 * its own grouping.  DO_MAX, DO_MIN and DO_ABD evaluate their arguments
 * more than once, so operands must be free of side effects.
 */
#define DO_AND(N, M)  ((N) & (M))
#define DO_EOR(N, M)  ((N) ^ (M))
#define DO_ORR(N, M)  ((N) | (M))
#define DO_BIC(N, M)  ((N) & ~(M))
#define DO_ADD(N, M)  ((N) + (M))
#define DO_SUB(N, M)  ((N) - (M))
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  ((N) * (M))
/*
 * Integer division that avoids the two cases which are undefined
 * behaviour in C:
 *   - division by zero yields 0;
 *   - signed division by -1 is computed as a wrapping negation, so the
 *     most negative value divided by -1 yields the most negative value
 *     instead of trapping on the host.
 * The signedness test on M's type keeps unsigned division by the all-ones
 * value on the ordinary N / M path.  The negation is carried out in
 * uint64_t so that it wraps rather than overflows.
 * N and M are evaluated more than once; operands must be free of side
 * effects.
 */
#define DO_DIV(N, M)                                                    \
    ((M) == 0 ? 0                                                       \
     : ((__typeof(M))-1 < 0 && (M) == (__typeof(M))-1)                  \
       ? (__typeof(N))(0 - (uint64_t)(N))                               \
       : (N) / (M))
352 DO_ZPZZ(sve_and_zpzz_b
, uint8_t, H1
, DO_AND
)
353 DO_ZPZZ(sve_and_zpzz_h
, uint16_t, H1_2
, DO_AND
)
354 DO_ZPZZ(sve_and_zpzz_s
, uint32_t, H1_4
, DO_AND
)
355 DO_ZPZZ_D(sve_and_zpzz_d
, uint64_t, DO_AND
)
357 DO_ZPZZ(sve_orr_zpzz_b
, uint8_t, H1
, DO_ORR
)
358 DO_ZPZZ(sve_orr_zpzz_h
, uint16_t, H1_2
, DO_ORR
)
359 DO_ZPZZ(sve_orr_zpzz_s
, uint32_t, H1_4
, DO_ORR
)
360 DO_ZPZZ_D(sve_orr_zpzz_d
, uint64_t, DO_ORR
)
362 DO_ZPZZ(sve_eor_zpzz_b
, uint8_t, H1
, DO_EOR
)
363 DO_ZPZZ(sve_eor_zpzz_h
, uint16_t, H1_2
, DO_EOR
)
364 DO_ZPZZ(sve_eor_zpzz_s
, uint32_t, H1_4
, DO_EOR
)
365 DO_ZPZZ_D(sve_eor_zpzz_d
, uint64_t, DO_EOR
)
367 DO_ZPZZ(sve_bic_zpzz_b
, uint8_t, H1
, DO_BIC
)
368 DO_ZPZZ(sve_bic_zpzz_h
, uint16_t, H1_2
, DO_BIC
)
369 DO_ZPZZ(sve_bic_zpzz_s
, uint32_t, H1_4
, DO_BIC
)
370 DO_ZPZZ_D(sve_bic_zpzz_d
, uint64_t, DO_BIC
)
372 DO_ZPZZ(sve_add_zpzz_b
, uint8_t, H1
, DO_ADD
)
373 DO_ZPZZ(sve_add_zpzz_h
, uint16_t, H1_2
, DO_ADD
)
374 DO_ZPZZ(sve_add_zpzz_s
, uint32_t, H1_4
, DO_ADD
)
375 DO_ZPZZ_D(sve_add_zpzz_d
, uint64_t, DO_ADD
)
377 DO_ZPZZ(sve_sub_zpzz_b
, uint8_t, H1
, DO_SUB
)
378 DO_ZPZZ(sve_sub_zpzz_h
, uint16_t, H1_2
, DO_SUB
)
379 DO_ZPZZ(sve_sub_zpzz_s
, uint32_t, H1_4
, DO_SUB
)
380 DO_ZPZZ_D(sve_sub_zpzz_d
, uint64_t, DO_SUB
)
382 DO_ZPZZ(sve_smax_zpzz_b
, int8_t, H1
, DO_MAX
)
383 DO_ZPZZ(sve_smax_zpzz_h
, int16_t, H1_2
, DO_MAX
)
384 DO_ZPZZ(sve_smax_zpzz_s
, int32_t, H1_4
, DO_MAX
)
385 DO_ZPZZ_D(sve_smax_zpzz_d
, int64_t, DO_MAX
)
387 DO_ZPZZ(sve_umax_zpzz_b
, uint8_t, H1
, DO_MAX
)
388 DO_ZPZZ(sve_umax_zpzz_h
, uint16_t, H1_2
, DO_MAX
)
389 DO_ZPZZ(sve_umax_zpzz_s
, uint32_t, H1_4
, DO_MAX
)
390 DO_ZPZZ_D(sve_umax_zpzz_d
, uint64_t, DO_MAX
)
392 DO_ZPZZ(sve_smin_zpzz_b
, int8_t, H1
, DO_MIN
)
393 DO_ZPZZ(sve_smin_zpzz_h
, int16_t, H1_2
, DO_MIN
)
394 DO_ZPZZ(sve_smin_zpzz_s
, int32_t, H1_4
, DO_MIN
)
395 DO_ZPZZ_D(sve_smin_zpzz_d
, int64_t, DO_MIN
)
397 DO_ZPZZ(sve_umin_zpzz_b
, uint8_t, H1
, DO_MIN
)
398 DO_ZPZZ(sve_umin_zpzz_h
, uint16_t, H1_2
, DO_MIN
)
399 DO_ZPZZ(sve_umin_zpzz_s
, uint32_t, H1_4
, DO_MIN
)
400 DO_ZPZZ_D(sve_umin_zpzz_d
, uint64_t, DO_MIN
)
402 DO_ZPZZ(sve_sabd_zpzz_b
, int8_t, H1
, DO_ABD
)
403 DO_ZPZZ(sve_sabd_zpzz_h
, int16_t, H1_2
, DO_ABD
)
404 DO_ZPZZ(sve_sabd_zpzz_s
, int32_t, H1_4
, DO_ABD
)
405 DO_ZPZZ_D(sve_sabd_zpzz_d
, int64_t, DO_ABD
)
407 DO_ZPZZ(sve_uabd_zpzz_b
, uint8_t, H1
, DO_ABD
)
408 DO_ZPZZ(sve_uabd_zpzz_h
, uint16_t, H1_2
, DO_ABD
)
409 DO_ZPZZ(sve_uabd_zpzz_s
, uint32_t, H1_4
, DO_ABD
)
410 DO_ZPZZ_D(sve_uabd_zpzz_d
, uint64_t, DO_ABD
)
412 /* Because the computation type is at least twice as large as required,
413 these work for both signed and unsigned source types. */
414 static inline uint8_t do_mulh_b(int32_t n
, int32_t m
)
419 static inline uint16_t do_mulh_h(int32_t n
, int32_t m
)
421 return (n
* m
) >> 16;
424 static inline uint32_t do_mulh_s(int64_t n
, int64_t m
)
426 return (n
* m
) >> 32;
429 static inline uint64_t do_smulh_d(uint64_t n
, uint64_t m
)
432 muls64(&lo
, &hi
, n
, m
);
436 static inline uint64_t do_umulh_d(uint64_t n
, uint64_t m
)
439 mulu64(&lo
, &hi
, n
, m
);
443 DO_ZPZZ(sve_mul_zpzz_b
, uint8_t, H1
, DO_MUL
)
444 DO_ZPZZ(sve_mul_zpzz_h
, uint16_t, H1_2
, DO_MUL
)
445 DO_ZPZZ(sve_mul_zpzz_s
, uint32_t, H1_4
, DO_MUL
)
446 DO_ZPZZ_D(sve_mul_zpzz_d
, uint64_t, DO_MUL
)
448 DO_ZPZZ(sve_smulh_zpzz_b
, int8_t, H1
, do_mulh_b
)
449 DO_ZPZZ(sve_smulh_zpzz_h
, int16_t, H1_2
, do_mulh_h
)
450 DO_ZPZZ(sve_smulh_zpzz_s
, int32_t, H1_4
, do_mulh_s
)
451 DO_ZPZZ_D(sve_smulh_zpzz_d
, uint64_t, do_smulh_d
)
453 DO_ZPZZ(sve_umulh_zpzz_b
, uint8_t, H1
, do_mulh_b
)
454 DO_ZPZZ(sve_umulh_zpzz_h
, uint16_t, H1_2
, do_mulh_h
)
455 DO_ZPZZ(sve_umulh_zpzz_s
, uint32_t, H1_4
, do_mulh_s
)
456 DO_ZPZZ_D(sve_umulh_zpzz_d
, uint64_t, do_umulh_d
)
458 DO_ZPZZ(sve_sdiv_zpzz_s
, int32_t, H1_4
, DO_DIV
)
459 DO_ZPZZ_D(sve_sdiv_zpzz_d
, int64_t, DO_DIV
)
461 DO_ZPZZ(sve_udiv_zpzz_s
, uint32_t, H1_4
, DO_DIV
)
462 DO_ZPZZ_D(sve_udiv_zpzz_d
, uint64_t, DO_DIV
)
464 /* Note that all bits of the shift are significant
465 and not modulo the element size. */
/*
 * Shifts by a per-element amount M.  sizeof(N) * 8 is the bit width of N.
 *   DO_ASR clamps the shift count to width - 1.
 *   DO_LSR / DO_LSL yield 0 once the count reaches the width, which also
 *   keeps the host shift within the range C defines.
 * Arguments are parenthesized so expression operands keep their grouping.
 */
#define DO_ASR(N, M)  ((N) >> MIN(M, sizeof(N) * 8 - 1))
#define DO_LSR(N, M)  ((M) < sizeof(N) * 8 ? (N) >> (M) : 0)
#define DO_LSL(N, M)  ((M) < sizeof(N) * 8 ? (N) << (M) : 0)
470 DO_ZPZZ(sve_asr_zpzz_b
, int8_t, H1
, DO_ASR
)
471 DO_ZPZZ(sve_lsr_zpzz_b
, uint8_t, H1_2
, DO_LSR
)
472 DO_ZPZZ(sve_lsl_zpzz_b
, uint8_t, H1_4
, DO_LSL
)
474 DO_ZPZZ(sve_asr_zpzz_h
, int16_t, H1
, DO_ASR
)
475 DO_ZPZZ(sve_lsr_zpzz_h
, uint16_t, H1_2
, DO_LSR
)
476 DO_ZPZZ(sve_lsl_zpzz_h
, uint16_t, H1_4
, DO_LSL
)
478 DO_ZPZZ(sve_asr_zpzz_s
, int32_t, H1
, DO_ASR
)
479 DO_ZPZZ(sve_lsr_zpzz_s
, uint32_t, H1_2
, DO_LSR
)
480 DO_ZPZZ(sve_lsl_zpzz_s
, uint32_t, H1_4
, DO_LSL
)
482 DO_ZPZZ_D(sve_asr_zpzz_d
, int64_t, DO_ASR
)
483 DO_ZPZZ_D(sve_lsr_zpzz_d
, uint64_t, DO_LSR
)
484 DO_ZPZZ_D(sve_lsl_zpzz_d
, uint64_t, DO_LSL
)
489 /* Three-operand expander, controlled by a predicate, in which the
490 * third operand is "wide". That is, for D = N op M, the same 64-bit
491 * value of M is used with all of the narrower values of N.
493 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
494 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
496 intptr_t i, opr_sz = simd_oprsz(desc); \
497 for (i = 0; i < opr_sz; ) { \
498 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
499 TYPEW mm = *(TYPEW *)(vm + i); \
502 TYPE nn = *(TYPE *)(vn + H(i)); \
503 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
505 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
510 DO_ZPZW(sve_asr_zpzw_b
, int8_t, uint64_t, H1
, DO_ASR
)
511 DO_ZPZW(sve_lsr_zpzw_b
, uint8_t, uint64_t, H1
, DO_LSR
)
512 DO_ZPZW(sve_lsl_zpzw_b
, uint8_t, uint64_t, H1
, DO_LSL
)
514 DO_ZPZW(sve_asr_zpzw_h
, int16_t, uint64_t, H1_2
, DO_ASR
)
515 DO_ZPZW(sve_lsr_zpzw_h
, uint16_t, uint64_t, H1_2
, DO_LSR
)
516 DO_ZPZW(sve_lsl_zpzw_h
, uint16_t, uint64_t, H1_2
, DO_LSL
)
518 DO_ZPZW(sve_asr_zpzw_s
, int32_t, uint64_t, H1_4
, DO_ASR
)
519 DO_ZPZW(sve_lsr_zpzw_s
, uint32_t, uint64_t, H1_4
, DO_LSR
)
520 DO_ZPZW(sve_lsl_zpzw_s
, uint32_t, uint64_t, H1_4
, DO_LSL
)
524 /* Fully general two-operand expander, controlled by a predicate.
526 #define DO_ZPZ(NAME, TYPE, H, OP) \
527 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
529 intptr_t i, opr_sz = simd_oprsz(desc); \
530 for (i = 0; i < opr_sz; ) { \
531 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
534 TYPE nn = *(TYPE *)(vn + H(i)); \
535 *(TYPE *)(vd + H(i)) = OP(nn); \
537 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
542 /* Similarly, specialized for 64-bit operands. */
543 #define DO_ZPZ_D(NAME, TYPE, OP) \
544 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
546 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
547 TYPE *d = vd, *n = vn; \
549 for (i = 0; i < opr_sz; i += 1) { \
550 if (pg[H1(i)] & 1) { \
557 #define DO_CLS_B(N) (clrsb32(N) - 24)
558 #define DO_CLS_H(N) (clrsb32(N) - 16)
560 DO_ZPZ(sve_cls_b
, int8_t, H1
, DO_CLS_B
)
561 DO_ZPZ(sve_cls_h
, int16_t, H1_2
, DO_CLS_H
)
562 DO_ZPZ(sve_cls_s
, int32_t, H1_4
, clrsb32
)
563 DO_ZPZ_D(sve_cls_d
, int64_t, clrsb64
)
565 #define DO_CLZ_B(N) (clz32(N) - 24)
566 #define DO_CLZ_H(N) (clz32(N) - 16)
568 DO_ZPZ(sve_clz_b
, uint8_t, H1
, DO_CLZ_B
)
569 DO_ZPZ(sve_clz_h
, uint16_t, H1_2
, DO_CLZ_H
)
570 DO_ZPZ(sve_clz_s
, uint32_t, H1_4
, clz32
)
571 DO_ZPZ_D(sve_clz_d
, uint64_t, clz64
)
573 DO_ZPZ(sve_cnt_zpz_b
, uint8_t, H1
, ctpop8
)
574 DO_ZPZ(sve_cnt_zpz_h
, uint16_t, H1_2
, ctpop16
)
575 DO_ZPZ(sve_cnt_zpz_s
, uint32_t, H1_4
, ctpop32
)
576 DO_ZPZ_D(sve_cnt_zpz_d
, uint64_t, ctpop64
)
/* Logical NOT of an element: 1 when N is zero, otherwise 0. */
#define DO_CNOT(N)    ((N) == 0)
580 DO_ZPZ(sve_cnot_b
, uint8_t, H1
, DO_CNOT
)
581 DO_ZPZ(sve_cnot_h
, uint16_t, H1_2
, DO_CNOT
)
582 DO_ZPZ(sve_cnot_s
, uint32_t, H1_4
, DO_CNOT
)
583 DO_ZPZ_D(sve_cnot_d
, uint64_t, DO_CNOT
)
/*
 * Clear the most significant bit of N (the sign bit of the floating-point
 * bit pattern held in an unsigned element type).  The mask is all-ones of
 * N's type shifted right by one.
 */
#define DO_FABS(N)    ((N) & ((__typeof(N))-1 >> 1))
587 DO_ZPZ(sve_fabs_h
, uint16_t, H1_2
, DO_FABS
)
588 DO_ZPZ(sve_fabs_s
, uint32_t, H1_4
, DO_FABS
)
589 DO_ZPZ_D(sve_fabs_d
, uint64_t, DO_FABS
)
/*
 * Flip the most significant bit of N (the sign bit of the floating-point
 * bit pattern held in an unsigned element type).  For element types
 * narrower than int the expression also sets bits above the element
 * width; they are discarded when the result is stored back as N's type.
 */
#define DO_FNEG(N)    ((N) ^ ~((__typeof(N))-1 >> 1))
593 DO_ZPZ(sve_fneg_h
, uint16_t, H1_2
, DO_FNEG
)
594 DO_ZPZ(sve_fneg_s
, uint32_t, H1_4
, DO_FNEG
)
595 DO_ZPZ_D(sve_fneg_d
, uint64_t, DO_FNEG
)
/* Bitwise complement of an element. */
#define DO_NOT(N)    (~(N))
599 DO_ZPZ(sve_not_zpz_b
, uint8_t, H1
, DO_NOT
)
600 DO_ZPZ(sve_not_zpz_h
, uint16_t, H1_2
, DO_NOT
)
601 DO_ZPZ(sve_not_zpz_s
, uint32_t, H1_4
, DO_NOT
)
602 DO_ZPZ_D(sve_not_zpz_d
, uint64_t, DO_NOT
)
/*
 * Sign- and zero-extension from the low 8, 16 or 32 bits of N.
 * The cast truncates to the narrow type; storing the result into the
 * wider element type then performs the extension.  N is parenthesized so
 * the cast applies to the whole argument expression.
 */
#define DO_SXTB(N)    ((int8_t)(N))
#define DO_SXTH(N)    ((int16_t)(N))
#define DO_SXTS(N)    ((int32_t)(N))
#define DO_UXTB(N)    ((uint8_t)(N))
#define DO_UXTH(N)    ((uint16_t)(N))
#define DO_UXTS(N)    ((uint32_t)(N))
611 DO_ZPZ(sve_sxtb_h
, uint16_t, H1_2
, DO_SXTB
)
612 DO_ZPZ(sve_sxtb_s
, uint32_t, H1_4
, DO_SXTB
)
613 DO_ZPZ(sve_sxth_s
, uint32_t, H1_4
, DO_SXTH
)
614 DO_ZPZ_D(sve_sxtb_d
, uint64_t, DO_SXTB
)
615 DO_ZPZ_D(sve_sxth_d
, uint64_t, DO_SXTH
)
616 DO_ZPZ_D(sve_sxtw_d
, uint64_t, DO_SXTS
)
618 DO_ZPZ(sve_uxtb_h
, uint16_t, H1_2
, DO_UXTB
)
619 DO_ZPZ(sve_uxtb_s
, uint32_t, H1_4
, DO_UXTB
)
620 DO_ZPZ(sve_uxth_s
, uint32_t, H1_4
, DO_UXTH
)
621 DO_ZPZ_D(sve_uxtb_d
, uint64_t, DO_UXTB
)
622 DO_ZPZ_D(sve_uxth_d
, uint64_t, DO_UXTH
)
623 DO_ZPZ_D(sve_uxtw_d
, uint64_t, DO_UXTS
)
/*
 * Absolute value of a signed element.  N is evaluated more than once, so
 * the operand must be free of side effects.
 */
#define DO_ABS(N)    ((N) < 0 ? -(N) : (N))
627 DO_ZPZ(sve_abs_b
, int8_t, H1
, DO_ABS
)
628 DO_ZPZ(sve_abs_h
, int16_t, H1_2
, DO_ABS
)
629 DO_ZPZ(sve_abs_s
, int32_t, H1_4
, DO_ABS
)
630 DO_ZPZ_D(sve_abs_d
, int64_t, DO_ABS
)
/* Arithmetic negation of an element. */
#define DO_NEG(N)    (-(N))
634 DO_ZPZ(sve_neg_b
, uint8_t, H1
, DO_NEG
)
635 DO_ZPZ(sve_neg_h
, uint16_t, H1_2
, DO_NEG
)
636 DO_ZPZ(sve_neg_s
, uint32_t, H1_4
, DO_NEG
)
637 DO_ZPZ_D(sve_neg_d
, uint64_t, DO_NEG
)
639 DO_ZPZ(sve_revb_h
, uint16_t, H1_2
, bswap16
)
640 DO_ZPZ(sve_revb_s
, uint32_t, H1_4
, bswap32
)
641 DO_ZPZ_D(sve_revb_d
, uint64_t, bswap64
)
643 DO_ZPZ(sve_revh_s
, uint32_t, H1_4
, hswap32
)
644 DO_ZPZ_D(sve_revh_d
, uint64_t, hswap64
)
646 DO_ZPZ_D(sve_revw_d
, uint64_t, wswap64
)
648 DO_ZPZ(sve_rbit_b
, uint8_t, H1
, revbit8
)
649 DO_ZPZ(sve_rbit_h
, uint16_t, H1_2
, revbit16
)
650 DO_ZPZ(sve_rbit_s
, uint32_t, H1_4
, revbit32
)
651 DO_ZPZ_D(sve_rbit_d
, uint64_t, revbit64
)
653 /* Three-operand expander, unpredicated, in which the third operand is "wide".
655 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
656 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
658 intptr_t i, opr_sz = simd_oprsz(desc); \
659 for (i = 0; i < opr_sz; ) { \
660 TYPEW mm = *(TYPEW *)(vm + i); \
662 TYPE nn = *(TYPE *)(vn + H(i)); \
663 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
669 DO_ZZW(sve_asr_zzw_b
, int8_t, uint64_t, H1
, DO_ASR
)
670 DO_ZZW(sve_lsr_zzw_b
, uint8_t, uint64_t, H1
, DO_LSR
)
671 DO_ZZW(sve_lsl_zzw_b
, uint8_t, uint64_t, H1
, DO_LSL
)
673 DO_ZZW(sve_asr_zzw_h
, int16_t, uint64_t, H1_2
, DO_ASR
)
674 DO_ZZW(sve_lsr_zzw_h
, uint16_t, uint64_t, H1_2
, DO_LSR
)
675 DO_ZZW(sve_lsl_zzw_h
, uint16_t, uint64_t, H1_2
, DO_LSL
)
677 DO_ZZW(sve_asr_zzw_s
, int32_t, uint64_t, H1_4
, DO_ASR
)
678 DO_ZZW(sve_lsr_zzw_s
, uint32_t, uint64_t, H1_4
, DO_LSR
)
679 DO_ZZW(sve_lsl_zzw_s
, uint32_t, uint64_t, H1_4
, DO_LSL
)
695 /* Two-operand reduction expander, controlled by a predicate.
696 * The difference between TYPERED and TYPERET has to do with
697 * sign-extension. E.g. for SMAX, TYPERED must be signed,
698 * but TYPERET must be unsigned so that e.g. a 32-bit value
699 * is not sign-extended to the ABI uint64_t return type.
701 /* ??? If we were to vectorize this by hand the reduction ordering
702 * would change. For integer operands, this is perfectly fine.
704 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
705 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
707 intptr_t i, opr_sz = simd_oprsz(desc); \
708 TYPERED ret = INIT; \
709 for (i = 0; i < opr_sz; ) { \
710 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
713 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
716 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
719 return (TYPERET)ret; \
722 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
723 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
725 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
729 for (i = 0; i < opr_sz; i += 1) { \
730 if (pg[H1(i)] & 1) { \
738 DO_VPZ(sve_orv_b
, uint8_t, uint8_t, uint8_t, H1
, 0, DO_ORR
)
739 DO_VPZ(sve_orv_h
, uint16_t, uint16_t, uint16_t, H1_2
, 0, DO_ORR
)
740 DO_VPZ(sve_orv_s
, uint32_t, uint32_t, uint32_t, H1_4
, 0, DO_ORR
)
741 DO_VPZ_D(sve_orv_d
, uint64_t, uint64_t, 0, DO_ORR
)
743 DO_VPZ(sve_eorv_b
, uint8_t, uint8_t, uint8_t, H1
, 0, DO_EOR
)
744 DO_VPZ(sve_eorv_h
, uint16_t, uint16_t, uint16_t, H1_2
, 0, DO_EOR
)
745 DO_VPZ(sve_eorv_s
, uint32_t, uint32_t, uint32_t, H1_4
, 0, DO_EOR
)
746 DO_VPZ_D(sve_eorv_d
, uint64_t, uint64_t, 0, DO_EOR
)
748 DO_VPZ(sve_andv_b
, uint8_t, uint8_t, uint8_t, H1
, -1, DO_AND
)
749 DO_VPZ(sve_andv_h
, uint16_t, uint16_t, uint16_t, H1_2
, -1, DO_AND
)
750 DO_VPZ(sve_andv_s
, uint32_t, uint32_t, uint32_t, H1_4
, -1, DO_AND
)
751 DO_VPZ_D(sve_andv_d
, uint64_t, uint64_t, -1, DO_AND
)
753 DO_VPZ(sve_saddv_b
, int8_t, uint64_t, uint64_t, H1
, 0, DO_ADD
)
754 DO_VPZ(sve_saddv_h
, int16_t, uint64_t, uint64_t, H1_2
, 0, DO_ADD
)
755 DO_VPZ(sve_saddv_s
, int32_t, uint64_t, uint64_t, H1_4
, 0, DO_ADD
)
757 DO_VPZ(sve_uaddv_b
, uint8_t, uint64_t, uint64_t, H1
, 0, DO_ADD
)
758 DO_VPZ(sve_uaddv_h
, uint16_t, uint64_t, uint64_t, H1_2
, 0, DO_ADD
)
759 DO_VPZ(sve_uaddv_s
, uint32_t, uint64_t, uint64_t, H1_4
, 0, DO_ADD
)
760 DO_VPZ_D(sve_uaddv_d
, uint64_t, uint64_t, 0, DO_ADD
)
762 DO_VPZ(sve_smaxv_b
, int8_t, int8_t, uint8_t, H1
, INT8_MIN
, DO_MAX
)
763 DO_VPZ(sve_smaxv_h
, int16_t, int16_t, uint16_t, H1_2
, INT16_MIN
, DO_MAX
)
764 DO_VPZ(sve_smaxv_s
, int32_t, int32_t, uint32_t, H1_4
, INT32_MIN
, DO_MAX
)
765 DO_VPZ_D(sve_smaxv_d
, int64_t, int64_t, INT64_MIN
, DO_MAX
)
767 DO_VPZ(sve_umaxv_b
, uint8_t, uint8_t, uint8_t, H1
, 0, DO_MAX
)
768 DO_VPZ(sve_umaxv_h
, uint16_t, uint16_t, uint16_t, H1_2
, 0, DO_MAX
)
769 DO_VPZ(sve_umaxv_s
, uint32_t, uint32_t, uint32_t, H1_4
, 0, DO_MAX
)
770 DO_VPZ_D(sve_umaxv_d
, uint64_t, uint64_t, 0, DO_MAX
)
772 DO_VPZ(sve_sminv_b
, int8_t, int8_t, uint8_t, H1
, INT8_MAX
, DO_MIN
)
773 DO_VPZ(sve_sminv_h
, int16_t, int16_t, uint16_t, H1_2
, INT16_MAX
, DO_MIN
)
774 DO_VPZ(sve_sminv_s
, int32_t, int32_t, uint32_t, H1_4
, INT32_MAX
, DO_MIN
)
775 DO_VPZ_D(sve_sminv_d
, int64_t, int64_t, INT64_MAX
, DO_MIN
)
777 DO_VPZ(sve_uminv_b
, uint8_t, uint8_t, uint8_t, H1
, -1, DO_MIN
)
778 DO_VPZ(sve_uminv_h
, uint16_t, uint16_t, uint16_t, H1_2
, -1, DO_MIN
)
779 DO_VPZ(sve_uminv_s
, uint32_t, uint32_t, uint32_t, H1_4
, -1, DO_MIN
)
780 DO_VPZ_D(sve_uminv_d
, uint64_t, uint64_t, -1, DO_MIN
)
800 /* Similar to the ARM LastActiveElement pseudocode function, except the
801 result is multiplied by the element size. This includes the not found
802 indication; e.g. not found for esz=3 is -8. */
803 static intptr_t last_active_element(uint64_t *g
, intptr_t words
, intptr_t esz
)
805 uint64_t mask
= pred_esz_masks
[esz
];
809 uint64_t this_g
= g
[--i
] & mask
;
811 return i
* 64 + (63 - clz64(this_g
));
814 return (intptr_t)-1 << esz
;
817 uint32_t HELPER(sve_pfirst
)(void *vd
, void *vg
, uint32_t words
)
819 uint32_t flags
= PREDTEST_INIT
;
820 uint64_t *d
= vd
, *g
= vg
;
824 uint64_t this_d
= d
[i
];
825 uint64_t this_g
= g
[i
];
829 /* Set in D the first bit of G. */
830 this_d
|= this_g
& -this_g
;
833 flags
= iter_predtest_fwd(this_d
, this_g
, flags
);
835 } while (++i
< words
);
840 uint32_t HELPER(sve_pnext
)(void *vd
, void *vg
, uint32_t pred_desc
)
842 intptr_t words
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
);
843 intptr_t esz
= extract32(pred_desc
, SIMD_DATA_SHIFT
, 2);
844 uint32_t flags
= PREDTEST_INIT
;
845 uint64_t *d
= vd
, *g
= vg
, esz_mask
;
848 next
= last_active_element(vd
, words
, esz
) + (1 << esz
);
849 esz_mask
= pred_esz_masks
[esz
];
851 /* Similar to the pseudocode for pnext, but scaled by ESZ
852 so that we find the correct bit. */
853 if (next
< words
* 64) {
857 mask
= ~((1ull << (next
& 63)) - 1);
861 uint64_t this_g
= g
[next
/ 64] & esz_mask
& mask
;
863 next
= (next
& -64) + ctz64(this_g
);
868 } while (next
< words
* 64);
874 if (i
== next
/ 64) {
875 this_d
= 1ull << (next
& 63);
878 flags
= iter_predtest_fwd(this_d
, g
[i
] & esz_mask
, flags
);
879 } while (++i
< words
);
884 /* Store zero into every active element of Zd. We will use this for two
885 * and three-operand predicated instructions for which logic dictates a
886 * zero result. In particular, logical shift by element size, which is
887 * otherwise undefined on the host.
889 * For element sizes smaller than uint64_t, we use tables to expand
890 * the N bits of the controlling predicate to a byte mask, and clear
893 void HELPER(sve_clr_b
)(void *vd
, void *vg
, uint32_t desc
)
895 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
898 for (i
= 0; i
< opr_sz
; i
+= 1) {
899 d
[i
] &= ~expand_pred_b(pg
[H1(i
)]);
903 void HELPER(sve_clr_h
)(void *vd
, void *vg
, uint32_t desc
)
905 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
908 for (i
= 0; i
< opr_sz
; i
+= 1) {
909 d
[i
] &= ~expand_pred_h(pg
[H1(i
)]);
913 void HELPER(sve_clr_s
)(void *vd
, void *vg
, uint32_t desc
)
915 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
918 for (i
= 0; i
< opr_sz
; i
+= 1) {
919 d
[i
] &= ~expand_pred_s(pg
[H1(i
)]);
923 void HELPER(sve_clr_d
)(void *vd
, void *vg
, uint32_t desc
)
925 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
928 for (i
= 0; i
< opr_sz
; i
+= 1) {
935 /* Three-operand expander, immediate operand, controlled by a predicate.
937 #define DO_ZPZI(NAME, TYPE, H, OP) \
938 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
940 intptr_t i, opr_sz = simd_oprsz(desc); \
941 TYPE imm = simd_data(desc); \
942 for (i = 0; i < opr_sz; ) { \
943 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
946 TYPE nn = *(TYPE *)(vn + H(i)); \
947 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
949 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
954 /* Similarly, specialized for 64-bit operands. */
955 #define DO_ZPZI_D(NAME, TYPE, OP) \
956 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
958 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
959 TYPE *d = vd, *n = vn; \
960 TYPE imm = simd_data(desc); \
962 for (i = 0; i < opr_sz; i += 1) { \
963 if (pg[H1(i)] & 1) { \
965 d[i] = OP(nn, imm); \
/*
 * Shifts by an immediate M.  No range check is applied here: M must be
 * smaller than the bit width of N's (promoted) type, since a larger shift
 * count is undefined in C.
 */
#define DO_SHR(N, M)  ((N) >> (M))
#define DO_SHL(N, M)  ((N) << (M))

/* Arithmetic shift right for division.  This rounds negative numbers
   toward zero as per signed division.  Therefore before shifting,
   when N is negative, add 2**M-1.  N and M are evaluated more than
   once, so operands must be free of side effects.  */
#define DO_ASRD(N, M) \
    (((N) + ((N) < 0 ? ((__typeof(N))1 << (M)) - 1 : 0)) >> (M))
978 DO_ZPZI(sve_asr_zpzi_b
, int8_t, H1
, DO_SHR
)
979 DO_ZPZI(sve_asr_zpzi_h
, int16_t, H1_2
, DO_SHR
)
980 DO_ZPZI(sve_asr_zpzi_s
, int32_t, H1_4
, DO_SHR
)
981 DO_ZPZI_D(sve_asr_zpzi_d
, int64_t, DO_SHR
)
983 DO_ZPZI(sve_lsr_zpzi_b
, uint8_t, H1
, DO_SHR
)
984 DO_ZPZI(sve_lsr_zpzi_h
, uint16_t, H1_2
, DO_SHR
)
985 DO_ZPZI(sve_lsr_zpzi_s
, uint32_t, H1_4
, DO_SHR
)
986 DO_ZPZI_D(sve_lsr_zpzi_d
, uint64_t, DO_SHR
)
988 DO_ZPZI(sve_lsl_zpzi_b
, uint8_t, H1
, DO_SHL
)
989 DO_ZPZI(sve_lsl_zpzi_h
, uint16_t, H1_2
, DO_SHL
)
990 DO_ZPZI(sve_lsl_zpzi_s
, uint32_t, H1_4
, DO_SHL
)
991 DO_ZPZI_D(sve_lsl_zpzi_d
, uint64_t, DO_SHL
)
993 DO_ZPZI(sve_asrd_b
, int8_t, H1
, DO_ASRD
)
994 DO_ZPZI(sve_asrd_h
, int16_t, H1_2
, DO_ASRD
)
995 DO_ZPZI(sve_asrd_s
, int32_t, H1_4
, DO_ASRD
)
996 DO_ZPZI_D(sve_asrd_d
, int64_t, DO_ASRD
)
/* Fully general four-operand expander, controlled by a predicate.
 *
 * Expands to HELPER(NAME).  The operand is walked in 16-byte groups: the
 * 16 predicate bits for a group are loaded from vg, and bit 0 of the
 * current predicate value decides whether the element at byte offset i
 * is computed as OP(a, n, m) and stored to vd.  After each element the
 * offset advances by sizeof(TYPE) and the predicate is shifted by the
 * same amount.  Elements with a clear predicate bit are not written.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
                  void *vg, uint32_t desc)                    \
{                                                             \
    intptr_t i, opr_sz = simd_oprsz(desc);                    \
    for (i = 0; i < opr_sz; ) {                               \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));       \
        do {                                                  \
            if (pg & 1) {                                     \
                TYPE nn = *(TYPE *)(vn + H(i));               \
                TYPE mm = *(TYPE *)(vm + H(i));               \
                TYPE aa = *(TYPE *)(va + H(i));               \
                *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);        \
            }                                                 \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);           \
        } while (i & 15);                                     \
    }                                                         \
}
/* Similarly, specialized for 64-bit operands.
 *
 * For each of the simd_oprsz(desc) / 8 elements whose predicate byte
 * pg[H1(i)] has bit 0 set, d[i] = OP(a[i], n[i], m[i]).  Elements whose
 * predicate bit is clear are not written.
 */
#define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
                  void *vg, uint32_t desc)                    \
{                                                             \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                \
    TYPE *d = vd, *a = va, *n = vn, *m = vm;                  \
    uint8_t *pg = vg;                                         \
    for (i = 0; i < opr_sz; i += 1) {                         \
        if (pg[H1(i)] & 1) {                                  \
            TYPE aa = a[i], nn = n[i], mm = m[i];             \
            d[i] = OP(aa, nn, mm);                            \
        }                                                     \
    }                                                         \
}
/* Multiply-accumulate and multiply-subtract on (addend, n, m).
   Parameters are parenthesized so compound arguments keep their
   intended grouping.  */
#define DO_MLA(A, N, M)  ((A) + (N) * (M))
#define DO_MLS(A, N, M)  ((A) - (N) * (M))
1044 DO_ZPZZZ(sve_mla_b
, uint8_t, H1
, DO_MLA
)
1045 DO_ZPZZZ(sve_mls_b
, uint8_t, H1
, DO_MLS
)
1047 DO_ZPZZZ(sve_mla_h
, uint16_t, H1_2
, DO_MLA
)
1048 DO_ZPZZZ(sve_mls_h
, uint16_t, H1_2
, DO_MLS
)
1050 DO_ZPZZZ(sve_mla_s
, uint32_t, H1_4
, DO_MLA
)
1051 DO_ZPZZZ(sve_mls_s
, uint32_t, H1_4
, DO_MLS
)
1053 DO_ZPZZZ_D(sve_mla_d
, uint64_t, DO_MLA
)
1054 DO_ZPZZZ_D(sve_mls_d
, uint64_t, DO_MLS
)
1061 void HELPER(sve_index_b
)(void *vd
, uint32_t start
,
1062 uint32_t incr
, uint32_t desc
)
1064 intptr_t i
, opr_sz
= simd_oprsz(desc
);
1066 for (i
= 0; i
< opr_sz
; i
+= 1) {
1067 d
[H1(i
)] = start
+ i
* incr
;
1071 void HELPER(sve_index_h
)(void *vd
, uint32_t start
,
1072 uint32_t incr
, uint32_t desc
)
1074 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 2;
1076 for (i
= 0; i
< opr_sz
; i
+= 1) {
1077 d
[H2(i
)] = start
+ i
* incr
;
1081 void HELPER(sve_index_s
)(void *vd
, uint32_t start
,
1082 uint32_t incr
, uint32_t desc
)
1084 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 4;
1086 for (i
= 0; i
< opr_sz
; i
+= 1) {
1087 d
[H4(i
)] = start
+ i
* incr
;
1091 void HELPER(sve_index_d
)(void *vd
, uint64_t start
,
1092 uint64_t incr
, uint32_t desc
)
1094 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
1096 for (i
= 0; i
< opr_sz
; i
+= 1) {
1097 d
[i
] = start
+ i
* incr
;
1101 void HELPER(sve_adr_p32
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
1103 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 4;
1104 uint32_t sh
= simd_data(desc
);
1105 uint32_t *d
= vd
, *n
= vn
, *m
= vm
;
1106 for (i
= 0; i
< opr_sz
; i
+= 1) {
1107 d
[i
] = n
[i
] + (m
[i
] << sh
);
1111 void HELPER(sve_adr_p64
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
1113 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
1114 uint64_t sh
= simd_data(desc
);
1115 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
1116 for (i
= 0; i
< opr_sz
; i
+= 1) {
1117 d
[i
] = n
[i
] + (m
[i
] << sh
);
1121 void HELPER(sve_adr_s32
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
1123 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
1124 uint64_t sh
= simd_data(desc
);
1125 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
1126 for (i
= 0; i
< opr_sz
; i
+= 1) {
1127 d
[i
] = n
[i
] + ((uint64_t)(int32_t)m
[i
] << sh
);
1131 void HELPER(sve_adr_u32
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
1133 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
1134 uint64_t sh
= simd_data(desc
);
1135 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
1136 for (i
= 0; i
< opr_sz
; i
+= 1) {
1137 d
[i
] = n
[i
] + ((uint64_t)(uint32_t)m
[i
] << sh
);
1141 void HELPER(sve_fexpa_h
)(void *vd
, void *vn
, uint32_t desc
)
1143 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1144 static const uint16_t coeff
[] = {
1145 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1146 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1147 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1148 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1150 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 2;
1151 uint16_t *d
= vd
, *n
= vn
;
1153 for (i
= 0; i
< opr_sz
; i
++) {
1155 intptr_t idx
= extract32(nn
, 0, 5);
1156 uint16_t exp
= extract32(nn
, 5, 5);
1157 d
[i
] = coeff
[idx
] | (exp
<< 10);
1161 void HELPER(sve_fexpa_s
)(void *vd
, void *vn
, uint32_t desc
)
1163 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1164 static const uint32_t coeff
[] = {
1165 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1166 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1167 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1168 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1169 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1170 0x1ef532, 0x20b051, 0x227043, 0x243516,
1171 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1172 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1173 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1174 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1175 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1176 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1177 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1178 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1179 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1180 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1182 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 4;
1183 uint32_t *d
= vd
, *n
= vn
;
1185 for (i
= 0; i
< opr_sz
; i
++) {
1187 intptr_t idx
= extract32(nn
, 0, 6);
1188 uint32_t exp
= extract32(nn
, 6, 8);
1189 d
[i
] = coeff
[idx
] | (exp
<< 23);
1193 void HELPER(sve_fexpa_d
)(void *vd
, void *vn
, uint32_t desc
)
1195 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1196 static const uint64_t coeff
[] = {
1197 0x0000000000000ull
, 0x02C9A3E778061ull
, 0x059B0D3158574ull
,
1198 0x0874518759BC8ull
, 0x0B5586CF9890Full
, 0x0E3EC32D3D1A2ull
,
1199 0x11301D0125B51ull
, 0x1429AAEA92DE0ull
, 0x172B83C7D517Bull
,
1200 0x1A35BEB6FCB75ull
, 0x1D4873168B9AAull
, 0x2063B88628CD6ull
,
1201 0x2387A6E756238ull
, 0x26B4565E27CDDull
, 0x29E9DF51FDEE1ull
,
1202 0x2D285A6E4030Bull
, 0x306FE0A31B715ull
, 0x33C08B26416FFull
,
1203 0x371A7373AA9CBull
, 0x3A7DB34E59FF7ull
, 0x3DEA64C123422ull
,
1204 0x4160A21F72E2Aull
, 0x44E086061892Dull
, 0x486A2B5C13CD0ull
,
1205 0x4BFDAD5362A27ull
, 0x4F9B2769D2CA7ull
, 0x5342B569D4F82ull
,
1206 0x56F4736B527DAull
, 0x5AB07DD485429ull
, 0x5E76F15AD2148ull
,
1207 0x6247EB03A5585ull
, 0x6623882552225ull
, 0x6A09E667F3BCDull
,
1208 0x6DFB23C651A2Full
, 0x71F75E8EC5F74ull
, 0x75FEB564267C9ull
,
1209 0x7A11473EB0187ull
, 0x7E2F336CF4E62ull
, 0x82589994CCE13ull
,
1210 0x868D99B4492EDull
, 0x8ACE5422AA0DBull
, 0x8F1AE99157736ull
,
1211 0x93737B0CDC5E5ull
, 0x97D829FDE4E50ull
, 0x9C49182A3F090ull
,
1212 0xA0C667B5DE565ull
, 0xA5503B23E255Dull
, 0xA9E6B5579FDBFull
,
1213 0xAE89F995AD3ADull
, 0xB33A2B84F15FBull
, 0xB7F76F2FB5E47ull
,
1214 0xBCC1E904BC1D2ull
, 0xC199BDD85529Cull
, 0xC67F12E57D14Bull
,
1215 0xCB720DCEF9069ull
, 0xD072D4A07897Cull
, 0xD5818DCFBA487ull
,
1216 0xDA9E603DB3285ull
, 0xDFC97337B9B5Full
, 0xE502EE78B3FF6ull
,
1217 0xEA4AFA2A490DAull
, 0xEFA1BEE615A27ull
, 0xF50765B6E4540ull
,
1220 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
1221 uint64_t *d
= vd
, *n
= vn
;
1223 for (i
= 0; i
< opr_sz
; i
++) {
1225 intptr_t idx
= extract32(nn
, 0, 6);
1226 uint64_t exp
= extract32(nn
, 6, 11);
1227 d
[i
] = coeff
[idx
] | (exp
<< 52);
1231 void HELPER(sve_ftssel_h
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
1233 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 2;
1234 uint16_t *d
= vd
, *n
= vn
, *m
= vm
;
1235 for (i
= 0; i
< opr_sz
; i
+= 1) {
1241 d
[i
] = nn
^ (mm
& 2) << 14;
1245 void HELPER(sve_ftssel_s
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
1247 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 4;
1248 uint32_t *d
= vd
, *n
= vn
, *m
= vm
;
1249 for (i
= 0; i
< opr_sz
; i
+= 1) {
1255 d
[i
] = nn
^ (mm
& 2) << 30;
1259 void HELPER(sve_ftssel_d
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
1261 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
1262 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
1263 for (i
= 0; i
< opr_sz
; i
+= 1) {
1269 d
[i
] = nn
^ (mm
& 2) << 62;
1274 * Signed saturating addition with scalar operand.
1277 void HELPER(sve_sqaddi_b
)(void *d
, void *a
, int32_t b
, uint32_t desc
)
1279 intptr_t i
, oprsz
= simd_oprsz(desc
);
1281 for (i
= 0; i
< oprsz
; i
+= sizeof(int8_t)) {
1282 int r
= *(int8_t *)(a
+ i
) + b
;
1285 } else if (r
< INT8_MIN
) {
1288 *(int8_t *)(d
+ i
) = r
;
1292 void HELPER(sve_sqaddi_h
)(void *d
, void *a
, int32_t b
, uint32_t desc
)
1294 intptr_t i
, oprsz
= simd_oprsz(desc
);
1296 for (i
= 0; i
< oprsz
; i
+= sizeof(int16_t)) {
1297 int r
= *(int16_t *)(a
+ i
) + b
;
1298 if (r
> INT16_MAX
) {
1300 } else if (r
< INT16_MIN
) {
1303 *(int16_t *)(d
+ i
) = r
;
1307 void HELPER(sve_sqaddi_s
)(void *d
, void *a
, int64_t b
, uint32_t desc
)
1309 intptr_t i
, oprsz
= simd_oprsz(desc
);
1311 for (i
= 0; i
< oprsz
; i
+= sizeof(int32_t)) {
1312 int64_t r
= *(int32_t *)(a
+ i
) + b
;
1313 if (r
> INT32_MAX
) {
1315 } else if (r
< INT32_MIN
) {
1318 *(int32_t *)(d
+ i
) = r
;
1322 void HELPER(sve_sqaddi_d
)(void *d
, void *a
, int64_t b
, uint32_t desc
)
1324 intptr_t i
, oprsz
= simd_oprsz(desc
);
1326 for (i
= 0; i
< oprsz
; i
+= sizeof(int64_t)) {
1327 int64_t ai
= *(int64_t *)(a
+ i
);
1329 if (((r
^ ai
) & ~(ai
^ b
)) < 0) {
1330 /* Signed overflow. */
1331 r
= (r
< 0 ? INT64_MAX
: INT64_MIN
);
1333 *(int64_t *)(d
+ i
) = r
;
1338 * Unsigned saturating addition with scalar operand.
1341 void HELPER(sve_uqaddi_b
)(void *d
, void *a
, int32_t b
, uint32_t desc
)
1343 intptr_t i
, oprsz
= simd_oprsz(desc
);
1345 for (i
= 0; i
< oprsz
; i
+= sizeof(uint8_t)) {
1346 int r
= *(uint8_t *)(a
+ i
) + b
;
1347 if (r
> UINT8_MAX
) {
1352 *(uint8_t *)(d
+ i
) = r
;
1356 void HELPER(sve_uqaddi_h
)(void *d
, void *a
, int32_t b
, uint32_t desc
)
1358 intptr_t i
, oprsz
= simd_oprsz(desc
);
1360 for (i
= 0; i
< oprsz
; i
+= sizeof(uint16_t)) {
1361 int r
= *(uint16_t *)(a
+ i
) + b
;
1362 if (r
> UINT16_MAX
) {
1367 *(uint16_t *)(d
+ i
) = r
;
1371 void HELPER(sve_uqaddi_s
)(void *d
, void *a
, int64_t b
, uint32_t desc
)
1373 intptr_t i
, oprsz
= simd_oprsz(desc
);
1375 for (i
= 0; i
< oprsz
; i
+= sizeof(uint32_t)) {
1376 int64_t r
= *(uint32_t *)(a
+ i
) + b
;
1377 if (r
> UINT32_MAX
) {
1382 *(uint32_t *)(d
+ i
) = r
;
1386 void HELPER(sve_uqaddi_d
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
1388 intptr_t i
, oprsz
= simd_oprsz(desc
);
1390 for (i
= 0; i
< oprsz
; i
+= sizeof(uint64_t)) {
1391 uint64_t r
= *(uint64_t *)(a
+ i
) + b
;
1395 *(uint64_t *)(d
+ i
) = r
;
1399 void HELPER(sve_uqsubi_d
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
1401 intptr_t i
, oprsz
= simd_oprsz(desc
);
1403 for (i
= 0; i
< oprsz
; i
+= sizeof(uint64_t)) {
1404 uint64_t ai
= *(uint64_t *)(a
+ i
);
1405 *(uint64_t *)(d
+ i
) = (ai
< b
? 0 : ai
- b
);
1409 /* Two operand predicated copy immediate with merge. All valid immediates
1410 * can fit within 17 signed bits in the simd_data field.
1412 void HELPER(sve_cpy_m_b
)(void *vd
, void *vn
, void *vg
,
1413 uint64_t mm
, uint32_t desc
)
1415 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
1416 uint64_t *d
= vd
, *n
= vn
;
1419 mm
= dup_const(MO_8
, mm
);
1420 for (i
= 0; i
< opr_sz
; i
+= 1) {
1422 uint64_t pp
= expand_pred_b(pg
[H1(i
)]);
1423 d
[i
] = (mm
& pp
) | (nn
& ~pp
);
1427 void HELPER(sve_cpy_m_h
)(void *vd
, void *vn
, void *vg
,
1428 uint64_t mm
, uint32_t desc
)
1430 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
1431 uint64_t *d
= vd
, *n
= vn
;
1434 mm
= dup_const(MO_16
, mm
);
1435 for (i
= 0; i
< opr_sz
; i
+= 1) {
1437 uint64_t pp
= expand_pred_h(pg
[H1(i
)]);
1438 d
[i
] = (mm
& pp
) | (nn
& ~pp
);
1442 void HELPER(sve_cpy_m_s
)(void *vd
, void *vn
, void *vg
,
1443 uint64_t mm
, uint32_t desc
)
1445 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
1446 uint64_t *d
= vd
, *n
= vn
;
1449 mm
= dup_const(MO_32
, mm
);
1450 for (i
= 0; i
< opr_sz
; i
+= 1) {
1452 uint64_t pp
= expand_pred_s(pg
[H1(i
)]);
1453 d
[i
] = (mm
& pp
) | (nn
& ~pp
);
1457 void HELPER(sve_cpy_m_d
)(void *vd
, void *vn
, void *vg
,
1458 uint64_t mm
, uint32_t desc
)
1460 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
1461 uint64_t *d
= vd
, *n
= vn
;
1464 for (i
= 0; i
< opr_sz
; i
+= 1) {
1466 d
[i
] = (pg
[H1(i
)] & 1 ? mm
: nn
);
1470 void HELPER(sve_cpy_z_b
)(void *vd
, void *vg
, uint64_t val
, uint32_t desc
)
1472 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
1476 val
= dup_const(MO_8
, val
);
1477 for (i
= 0; i
< opr_sz
; i
+= 1) {
1478 d
[i
] = val
& expand_pred_b(pg
[H1(i
)]);
1482 void HELPER(sve_cpy_z_h
)(void *vd
, void *vg
, uint64_t val
, uint32_t desc
)
1484 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
1488 val
= dup_const(MO_16
, val
);
1489 for (i
= 0; i
< opr_sz
; i
+= 1) {
1490 d
[i
] = val
& expand_pred_h(pg
[H1(i
)]);
1494 void HELPER(sve_cpy_z_s
)(void *vd
, void *vg
, uint64_t val
, uint32_t desc
)
1496 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
1500 val
= dup_const(MO_32
, val
);
1501 for (i
= 0; i
< opr_sz
; i
+= 1) {
1502 d
[i
] = val
& expand_pred_s(pg
[H1(i
)]);
1506 void HELPER(sve_cpy_z_d
)(void *vd
, void *vg
, uint64_t val
, uint32_t desc
)
1508 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
1512 for (i
= 0; i
< opr_sz
; i
+= 1) {
1513 d
[i
] = (pg
[H1(i
)] & 1 ? val
: 0);
/* Big-endian hosts need to frob the byte indices.  If the copy
 * happens to be 8-byte aligned, then no frobbing necessary.
 *
 * Copies n bytes from vs to vd with memmove() overlap semantics.  The
 * combined low three bits of destination, source and length select the
 * widest unit (8, 4, 2 or 1 bytes) that all three are aligned to; the
 * narrower cases copy unit by unit through the H1_4/H1_2/H1 address
 * fixups, ascending when the destination does not start inside the
 * source range and descending otherwise.  When HOST_WORDS_BIGENDIAN is
 * not defined the plain memmove() path is always taken.
 */
static void swap_memmove(void *vd, void *vs, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t s = (uintptr_t)vs;
    uintptr_t o = (d | s | n) & 7;
    size_t i;

#ifndef HOST_WORDS_BIGENDIAN
    o = 0;
#endif
    switch (o) {
    case 0:
        memmove(vd, vs, n);
        break;

    case 4:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 4) {
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 4;
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        }
        break;

    case 2:
    case 6:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 2) {
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 2;
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        }
        break;

    default:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i++) {
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 1;
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        }
        break;
    }
}
1577 void HELPER(sve_ext
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
1579 intptr_t opr_sz
= simd_oprsz(desc
);
1580 size_t n_ofs
= simd_data(desc
);
1581 size_t n_siz
= opr_sz
- n_ofs
;
1584 swap_memmove(vd
, vn
+ n_ofs
, n_siz
);
1585 swap_memmove(vd
+ n_siz
, vm
, n_ofs
);
1586 } else if (vd
!= vn
) {
1587 swap_memmove(vd
+ n_siz
, vd
, n_ofs
);
1588 swap_memmove(vd
, vn
+ n_ofs
, n_siz
);
1590 /* vd == vn == vm. Need temp space. */
1592 swap_memmove(&tmp
, vm
, n_ofs
);
1593 swap_memmove(vd
, vd
+ n_ofs
, n_siz
);
1594 memcpy(vd
+ n_siz
, &tmp
, n_ofs
);
1598 #define DO_INSR(NAME, TYPE, H) \
1599 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1601 intptr_t opr_sz = simd_oprsz(desc); \
1602 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1603 *(TYPE *)(vd + H(0)) = val; \
1606 DO_INSR(sve_insr_b
, uint8_t, H1
)
1607 DO_INSR(sve_insr_h
, uint16_t, H1_2
)
1608 DO_INSR(sve_insr_s
, uint32_t, H1_4
)
1609 DO_INSR(sve_insr_d
, uint64_t, )
1613 void HELPER(sve_rev_b
)(void *vd
, void *vn
, uint32_t desc
)
1615 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
);
1616 for (i
= 0, j
= opr_sz
- 8; i
< opr_sz
/ 2; i
+= 8, j
-= 8) {
1617 uint64_t f
= *(uint64_t *)(vn
+ i
);
1618 uint64_t b
= *(uint64_t *)(vn
+ j
);
1619 *(uint64_t *)(vd
+ i
) = bswap64(b
);
1620 *(uint64_t *)(vd
+ j
) = bswap64(f
);
1624 void HELPER(sve_rev_h
)(void *vd
, void *vn
, uint32_t desc
)
1626 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
);
1627 for (i
= 0, j
= opr_sz
- 8; i
< opr_sz
/ 2; i
+= 8, j
-= 8) {
1628 uint64_t f
= *(uint64_t *)(vn
+ i
);
1629 uint64_t b
= *(uint64_t *)(vn
+ j
);
1630 *(uint64_t *)(vd
+ i
) = hswap64(b
);
1631 *(uint64_t *)(vd
+ j
) = hswap64(f
);
1635 void HELPER(sve_rev_s
)(void *vd
, void *vn
, uint32_t desc
)
1637 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
);
1638 for (i
= 0, j
= opr_sz
- 8; i
< opr_sz
/ 2; i
+= 8, j
-= 8) {
1639 uint64_t f
= *(uint64_t *)(vn
+ i
);
1640 uint64_t b
= *(uint64_t *)(vn
+ j
);
1641 *(uint64_t *)(vd
+ i
) = rol64(b
, 32);
1642 *(uint64_t *)(vd
+ j
) = rol64(f
, 32);
1646 void HELPER(sve_rev_d
)(void *vd
, void *vn
, uint32_t desc
)
1648 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
);
1649 for (i
= 0, j
= opr_sz
- 8; i
< opr_sz
/ 2; i
+= 8, j
-= 8) {
1650 uint64_t f
= *(uint64_t *)(vn
+ i
);
1651 uint64_t b
= *(uint64_t *)(vn
+ j
);
1652 *(uint64_t *)(vd
+ i
) = b
;
1653 *(uint64_t *)(vd
+ j
) = f
;
1657 #define DO_TBL(NAME, TYPE, H) \
1658 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1660 intptr_t i, opr_sz = simd_oprsz(desc); \
1661 uintptr_t elem = opr_sz / sizeof(TYPE); \
1662 TYPE *d = vd, *n = vn, *m = vm; \
1664 if (unlikely(vd == vn)) { \
1665 n = memcpy(&tmp, vn, opr_sz); \
1667 for (i = 0; i < elem; i++) { \
1669 d[H(i)] = j < elem ? n[H(j)] : 0; \
1673 DO_TBL(sve_tbl_b
, uint8_t, H1
)
1674 DO_TBL(sve_tbl_h
, uint16_t, H2
)
1675 DO_TBL(sve_tbl_s
, uint32_t, H4
)
1676 DO_TBL(sve_tbl_d
, uint64_t, )
1680 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1681 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1683 intptr_t i, opr_sz = simd_oprsz(desc); \
1687 if (unlikely(vn - vd < opr_sz)) { \
1688 n = memcpy(&tmp, n, opr_sz / 2); \
1690 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1691 d[HD(i)] = n[HS(i)]; \
1695 DO_UNPK(sve_sunpk_h
, int16_t, int8_t, H2
, H1
)
1696 DO_UNPK(sve_sunpk_s
, int32_t, int16_t, H4
, H2
)
1697 DO_UNPK(sve_sunpk_d
, int64_t, int32_t, , H4
)
1699 DO_UNPK(sve_uunpk_h
, uint16_t, uint8_t, H2
, H1
)
1700 DO_UNPK(sve_uunpk_s
, uint32_t, uint16_t, H4
, H2
)
1701 DO_UNPK(sve_uunpk_d
, uint64_t, uint32_t, , H4
)
/* Mask of bits included in the even numbered predicates of width esz.
 * We also use this for expand_bits/compress_bits, and so extend the
 * same pattern out to 16-bit units.
 */
static const uint64_t even_bit_esz_masks[5] = {
    0x5555555555555555ull,
    0x3333333333333333ull,
    0x0f0f0f0f0f0f0f0full,
    0x00ff00ff00ff00ffull,
    0x0000ffff0000ffffull,
};

/* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
 * For N==0, this corresponds to the operation that in qemu/bitops.h
 * we call half_shuffle64; this algorithm is from Hacker's Delight,
 * section 7-2 Shuffling Bits.
 *
 * Only the low 32 bits of x are used.  Starting with 16-bit units and
 * working down to 2**n-bit units, each step moves the upper half of
 * every group up by the unit width and masks off the duplicates.
 */
static uint64_t expand_bits(uint64_t x, int n)
{
    int i;

    x &= 0xffffffffu;
    for (i = 4; i >= n; i--) {
        int sh = 1 << i;
        x = ((x << sh) | x) & even_bit_esz_masks[i];
    }
    return x;
}

/* Compress units of 2**(N+1) bits to units of 2**N bits.
 * For N==0, this corresponds to the operation that in qemu/bitops.h
 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
 *
 * The even-numbered 2**n-bit units of x are kept and packed together;
 * the result occupies the low 32 bits.
 */
static uint64_t compress_bits(uint64_t x, int n)
{
    int i;

    for (i = n; i <= 4; i++) {
        int sh = 1 << i;
        x &= even_bit_esz_masks[i];
        x = (x >> sh) | x;
    }
    return x & 0xffffffffu;
}
1751 void HELPER(sve_zip_p
)(void *vd
, void *vn
, void *vm
, uint32_t pred_desc
)
1753 intptr_t oprsz
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
) + 2;
1754 int esz
= extract32(pred_desc
, SIMD_DATA_SHIFT
, 2);
1755 intptr_t high
= extract32(pred_desc
, SIMD_DATA_SHIFT
+ 2, 1);
1760 uint64_t nn
= *(uint64_t *)vn
;
1761 uint64_t mm
= *(uint64_t *)vm
;
1762 int half
= 4 * oprsz
;
1764 nn
= extract64(nn
, high
* half
, half
);
1765 mm
= extract64(mm
, high
* half
, half
);
1766 nn
= expand_bits(nn
, esz
);
1767 mm
= expand_bits(mm
, esz
);
1768 d
[0] = nn
+ (mm
<< (1 << esz
));
1770 ARMPredicateReg tmp_n
, tmp_m
;
1772 /* We produce output faster than we consume input.
1773 Therefore we must be mindful of possible overlap. */
1774 if ((vn
- vd
) < (uintptr_t)oprsz
) {
1775 vn
= memcpy(&tmp_n
, vn
, oprsz
);
1777 if ((vm
- vd
) < (uintptr_t)oprsz
) {
1778 vm
= memcpy(&tmp_m
, vm
, oprsz
);
1784 if ((high
& 3) == 0) {
1785 uint32_t *n
= vn
, *m
= vm
;
1788 for (i
= 0; i
< DIV_ROUND_UP(oprsz
, 8); i
++) {
1789 uint64_t nn
= n
[H4(high
+ i
)];
1790 uint64_t mm
= m
[H4(high
+ i
)];
1792 nn
= expand_bits(nn
, esz
);
1793 mm
= expand_bits(mm
, esz
);
1794 d
[i
] = nn
+ (mm
<< (1 << esz
));
1797 uint8_t *n
= vn
, *m
= vm
;
1800 for (i
= 0; i
< oprsz
/ 2; i
++) {
1801 uint16_t nn
= n
[H1(high
+ i
)];
1802 uint16_t mm
= m
[H1(high
+ i
)];
1804 nn
= expand_bits(nn
, esz
);
1805 mm
= expand_bits(mm
, esz
);
1806 d16
[H2(i
)] = nn
+ (mm
<< (1 << esz
));
1812 void HELPER(sve_uzp_p
)(void *vd
, void *vn
, void *vm
, uint32_t pred_desc
)
1814 intptr_t oprsz
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
) + 2;
1815 int esz
= extract32(pred_desc
, SIMD_DATA_SHIFT
, 2);
1816 int odd
= extract32(pred_desc
, SIMD_DATA_SHIFT
+ 2, 1) << esz
;
1817 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
1822 l
= compress_bits(n
[0] >> odd
, esz
);
1823 h
= compress_bits(m
[0] >> odd
, esz
);
1824 d
[0] = extract64(l
+ (h
<< (4 * oprsz
)), 0, 8 * oprsz
);
1826 ARMPredicateReg tmp_m
;
1827 intptr_t oprsz_16
= oprsz
/ 16;
1829 if ((vm
- vd
) < (uintptr_t)oprsz
) {
1830 m
= memcpy(&tmp_m
, vm
, oprsz
);
1833 for (i
= 0; i
< oprsz_16
; i
++) {
1836 l
= compress_bits(l
>> odd
, esz
);
1837 h
= compress_bits(h
>> odd
, esz
);
1838 d
[i
] = l
+ (h
<< 32);
1841 /* For VL which is not a power of 2, the results from M do not
1842 align nicely with the uint64_t for D. Put the aligned results
1843 from M into TMP_M and then copy it into place afterward. */
1845 d
[i
] = compress_bits(n
[2 * i
] >> odd
, esz
);
1847 for (i
= 0; i
< oprsz_16
; i
++) {
1850 l
= compress_bits(l
>> odd
, esz
);
1851 h
= compress_bits(h
>> odd
, esz
);
1852 tmp_m
.p
[i
] = l
+ (h
<< 32);
1854 tmp_m
.p
[i
] = compress_bits(m
[2 * i
] >> odd
, esz
);
1856 swap_memmove(vd
+ oprsz
/ 2, &tmp_m
, oprsz
/ 2);
1858 for (i
= 0; i
< oprsz_16
; i
++) {
1861 l
= compress_bits(l
>> odd
, esz
);
1862 h
= compress_bits(h
>> odd
, esz
);
1863 d
[oprsz_16
+ i
] = l
+ (h
<< 32);
1869 void HELPER(sve_trn_p
)(void *vd
, void *vn
, void *vm
, uint32_t pred_desc
)
1871 intptr_t oprsz
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
) + 2;
1872 uintptr_t esz
= extract32(pred_desc
, SIMD_DATA_SHIFT
, 2);
1873 bool odd
= extract32(pred_desc
, SIMD_DATA_SHIFT
+ 2, 1);
1874 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
1881 mask
= even_bit_esz_masks
[esz
];
1888 for (i
= 0; i
< DIV_ROUND_UP(oprsz
, 8); i
++) {
1889 uint64_t nn
= (n
[i
] & mask
) >> shr
;
1890 uint64_t mm
= (m
[i
] & mask
) << shl
;
1895 /* Reverse units of 2**N bits. */
1896 static uint64_t reverse_bits_64(uint64_t x
, int n
)
1901 for (i
= 2, sh
= 4; i
>= n
; i
--, sh
>>= 1) {
1902 uint64_t mask
= even_bit_esz_masks
[i
];
1903 x
= ((x
& mask
) << sh
) | ((x
>> sh
) & mask
);
/* Reverse units of 2**N bits within a single byte.
 *
 * Halves are swapped at successively finer granularity (nibbles, bit
 * pairs, single bits) down to units of 2**n bits.  For n == 3 the loop
 * does not run and x is returned unchanged.
 */
static uint8_t reverse_bits_8(uint8_t x, int n)
{
    static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
    int i, sh;

    for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
        x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
    }
    return x;
}
1919 void HELPER(sve_rev_p
)(void *vd
, void *vn
, uint32_t pred_desc
)
1921 intptr_t oprsz
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
) + 2;
1922 int esz
= extract32(pred_desc
, SIMD_DATA_SHIFT
, 2);
1923 intptr_t i
, oprsz_2
= oprsz
/ 2;
1926 uint64_t l
= *(uint64_t *)vn
;
1927 l
= reverse_bits_64(l
<< (64 - 8 * oprsz
), esz
);
1928 *(uint64_t *)vd
= l
;
1929 } else if ((oprsz
& 15) == 0) {
1930 for (i
= 0; i
< oprsz_2
; i
+= 8) {
1931 intptr_t ih
= oprsz
- 8 - i
;
1932 uint64_t l
= reverse_bits_64(*(uint64_t *)(vn
+ i
), esz
);
1933 uint64_t h
= reverse_bits_64(*(uint64_t *)(vn
+ ih
), esz
);
1934 *(uint64_t *)(vd
+ i
) = h
;
1935 *(uint64_t *)(vd
+ ih
) = l
;
1938 for (i
= 0; i
< oprsz_2
; i
+= 1) {
1939 intptr_t il
= H1(i
);
1940 intptr_t ih
= H1(oprsz
- 1 - i
);
1941 uint8_t l
= reverse_bits_8(*(uint8_t *)(vn
+ il
), esz
);
1942 uint8_t h
= reverse_bits_8(*(uint8_t *)(vn
+ ih
), esz
);
1943 *(uint8_t *)(vd
+ il
) = h
;
1944 *(uint8_t *)(vd
+ ih
) = l
;
1949 void HELPER(sve_punpk_p
)(void *vd
, void *vn
, uint32_t pred_desc
)
1951 intptr_t oprsz
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
) + 2;
1952 intptr_t high
= extract32(pred_desc
, SIMD_DATA_SHIFT
+ 2, 1);
1957 uint64_t nn
= *(uint64_t *)vn
;
1958 int half
= 4 * oprsz
;
1960 nn
= extract64(nn
, high
* half
, half
);
1961 nn
= expand_bits(nn
, 0);
1964 ARMPredicateReg tmp_n
;
1966 /* We produce output faster than we consume input.
1967 Therefore we must be mindful of possible overlap. */
1968 if ((vn
- vd
) < (uintptr_t)oprsz
) {
1969 vn
= memcpy(&tmp_n
, vn
, oprsz
);
1975 if ((high
& 3) == 0) {
1979 for (i
= 0; i
< DIV_ROUND_UP(oprsz
, 8); i
++) {
1980 uint64_t nn
= n
[H4(high
+ i
)];
1981 d
[i
] = expand_bits(nn
, 0);
1987 for (i
= 0; i
< oprsz
/ 2; i
++) {
1988 uint16_t nn
= n
[H1(high
+ i
)];
1989 d16
[H2(i
)] = expand_bits(nn
, 0);
1995 #define DO_ZIP(NAME, TYPE, H) \
1996 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1998 intptr_t oprsz = simd_oprsz(desc); \
1999 intptr_t i, oprsz_2 = oprsz / 2; \
2000 ARMVectorReg tmp_n, tmp_m; \
2001 /* We produce output faster than we consume input. \
2002 Therefore we must be mindful of possible overlap. */ \
2003 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2004 vn = memcpy(&tmp_n, vn, oprsz_2); \
2006 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2007 vm = memcpy(&tmp_m, vm, oprsz_2); \
2009 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2010 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2011 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2015 DO_ZIP(sve_zip_b
, uint8_t, H1
)
2016 DO_ZIP(sve_zip_h
, uint16_t, H1_2
)
2017 DO_ZIP(sve_zip_s
, uint32_t, H1_4
)
2018 DO_ZIP(sve_zip_d
, uint64_t, )
2020 #define DO_UZP(NAME, TYPE, H) \
2021 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2023 intptr_t oprsz = simd_oprsz(desc); \
2024 intptr_t oprsz_2 = oprsz / 2; \
2025 intptr_t odd_ofs = simd_data(desc); \
2027 ARMVectorReg tmp_m; \
2028 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2029 vm = memcpy(&tmp_m, vm, oprsz); \
2031 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2032 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2034 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2035 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2039 DO_UZP(sve_uzp_b
, uint8_t, H1
)
2040 DO_UZP(sve_uzp_h
, uint16_t, H1_2
)
2041 DO_UZP(sve_uzp_s
, uint32_t, H1_4
)
2042 DO_UZP(sve_uzp_d
, uint64_t, )
2044 #define DO_TRN(NAME, TYPE, H) \
2045 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2047 intptr_t oprsz = simd_oprsz(desc); \
2048 intptr_t odd_ofs = simd_data(desc); \
2050 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2051 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2052 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2053 *(TYPE *)(vd + H(i + 0)) = ae; \
2054 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2058 DO_TRN(sve_trn_b
, uint8_t, H1
)
2059 DO_TRN(sve_trn_h
, uint16_t, H1_2
)
2060 DO_TRN(sve_trn_s
, uint32_t, H1_4
)
2061 DO_TRN(sve_trn_d
, uint64_t, )
2067 void HELPER(sve_compact_s
)(void *vd
, void *vn
, void *vg
, uint32_t desc
)
2069 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
) / 4;
2070 uint32_t *d
= vd
, *n
= vn
;
2073 for (i
= j
= 0; i
< opr_sz
; i
++) {
2074 if (pg
[H1(i
/ 2)] & (i
& 1 ? 0x10 : 0x01)) {
2075 d
[H4(j
)] = n
[H4(i
)];
2079 for (; j
< opr_sz
; j
++) {
2084 void HELPER(sve_compact_d
)(void *vd
, void *vn
, void *vg
, uint32_t desc
)
2086 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
) / 8;
2087 uint64_t *d
= vd
, *n
= vn
;
2090 for (i
= j
= 0; i
< opr_sz
; i
++) {
2091 if (pg
[H1(i
)] & 1) {
2096 for (; j
< opr_sz
; j
++) {
2101 /* Similar to the ARM LastActiveElement pseudocode function, except the
2102 * result is multiplied by the element size. This includes the not found
2103 * indication; e.g. not found for esz=3 is -8.
2105 int32_t HELPER(sve_last_active_element
)(void *vg
, uint32_t pred_desc
)
2107 intptr_t oprsz
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
) + 2;
2108 intptr_t esz
= extract32(pred_desc
, SIMD_DATA_SHIFT
, 2);
2110 return last_active_element(vg
, DIV_ROUND_UP(oprsz
, 8), esz
);