target/arm: Provide and use H8 and H1_8 macros
target/arm/sve_helper.c (qemu/ar7.git, blob a373f8c573e1c2a9bae1e64685637e2583768e48)
1 /*
2 * ARM SVE Operations
4 * Copyright (c) 2018 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/cpu_ldst.h"
25 #include "exec/helper-proto.h"
26 #include "tcg/tcg-gvec-desc.h"
27 #include "fpu/softfloat.h"
28 #include "tcg/tcg.h"
29 #include "vec_internal.h"
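/*
 * The H*() offset macros used throughout this file (defined in cpu.h)
 * adjust byte offsets for the host-endian storage of the vector
 * registers.  A minimal sketch, assuming the upstream definitions:
 *   little-endian host: H1(x), H1_2(x), H1_4(x), H1_8(x) are all (x)
 *   big-endian host:    H1(x) = (x) ^ 7, H1_2(x) = (x) ^ 6,
 *                       H1_4(x) = (x) ^ 4, H1_8(x) = (x)
 * H1_8 and H8 are identity on every host; they exist so that accesses
 * to 64-bit elements follow the same pattern as the narrower sizes.
 */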
32 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
34 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
35 * and bit 0 set if C is set. Compare the definitions of these variables
36 * within CPUARMState.
39 /* For no G bits set, NZCV = C. */
40 #define PREDTEST_INIT 1
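/*
 * For example, a single-word predicate with only element 0 active and
 * true (d = 1, g = 1) gives iter_predtest_fwd(1, 1, PREDTEST_INIT) ==
 * 0x80000006: bit 31 set (N: the first active element is true), bit 1
 * set (Z is clear: some active element is true), bit 0 clear (C is
 * clear: the last active element is true).
 */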
42 /* This is an iterative function, called for each Pd and Pg word
43 * moving forward.
45 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
47 if (likely(g)) {
48 /* Compute N from first D & G.
49 Use bit 2 to signal first G bit seen. */
50 if (!(flags & 4)) {
51 flags |= ((d & (g & -g)) != 0) << 31;
52 flags |= 4;
55 /* Accumulate Z from each D & G. */
56 flags |= ((d & g) != 0) << 1;
58 /* Compute C from last !(D & G). Replace previous. */
59 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
61 return flags;
64 /* This is an iterative function, called for each Pd and Pg word
65 * moving backward.
67 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
69 if (likely(g)) {
70 /* Compute C from first (i.e. last) !(D & G).
71 Use bit 2 to signal first G bit seen. */
72 if (!(flags & 4)) {
73 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
74 flags |= (d & pow2floor(g)) == 0;
77 /* Accumulate Z from each D & G. */
78 flags |= ((d & g) != 0) << 1;
80 /* Compute N from last (i.e. first) D & G. Replace previous. */
81 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
83 return flags;
86 /* The same for a single word predicate. */
87 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
89 return iter_predtest_fwd(d, g, PREDTEST_INIT);
92 /* The same for a multi-word predicate. */
93 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
95 uint32_t flags = PREDTEST_INIT;
96 uint64_t *d = vd, *g = vg;
97 uintptr_t i = 0;
99 do {
100 flags = iter_predtest_fwd(d[i], g[i], flags);
101 } while (++i < words);
103 return flags;
106 /* Expand active predicate bits to bytes, for byte elements.
107 * for (i = 0; i < 256; ++i) {
108 * unsigned long m = 0;
109 * for (j = 0; j < 8; j++) {
110 * if ((i >> j) & 1) {
111 * m |= 0xfful << (j << 3);
114 * printf("0x%016lx,\n", m);
117 static inline uint64_t expand_pred_b(uint8_t byte)
119 static const uint64_t word[256] = {
120 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
121 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
122 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
123 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
124 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
125 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
126 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
127 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
128 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
129 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
130 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
131 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
132 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
133 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
134 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
135 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
136 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
137 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
138 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
139 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
140 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
141 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
142 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
143 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
144 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
145 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
146 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
147 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
148 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
149 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
150 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
151 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
152 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
153 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
154 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
155 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
156 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
157 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
158 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
159 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
160 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
161 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
162 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
163 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
164 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
165 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
166 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
167 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
168 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
169 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
170 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
171 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
172 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
173 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
174 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
175 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
176 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
177 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
178 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
179 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
180 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
181 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
182 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
183 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
184 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
185 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
186 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
187 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
188 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
189 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
190 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
191 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
192 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
193 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
194 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
195 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
196 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
197 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
198 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
199 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
200 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
201 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
202 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
203 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
204 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
205 0xffffffffffffffff,
207 return word[byte];
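/*
 * For example, expand_pred_b(0x05) == 0x0000000000ff00ff: predicate
 * bits 0 and 2 are set, so bytes 0 and 2 of the result are 0xff.
 */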
210 /* Similarly for half-word elements.
211 * for (i = 0; i < 256; ++i) {
212 * unsigned long m = 0;
213 * if (i & 0xaa) {
214 * continue;
216 * for (j = 0; j < 8; j += 2) {
217 * if ((i >> j) & 1) {
218 * m |= 0xfffful << (j << 3);
221 * printf("[0x%x] = 0x%016lx,\n", i, m);
224 static inline uint64_t expand_pred_h(uint8_t byte)
226 static const uint64_t word[] = {
227 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
228 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
229 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
230 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
231 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
232 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
233 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
234 [0x55] = 0xffffffffffffffff,
236 return word[byte & 0x55];
239 /* Similarly for single word elements. */
240 static inline uint64_t expand_pred_s(uint8_t byte)
242 static const uint64_t word[] = {
243 [0x01] = 0x00000000ffffffffull,
244 [0x10] = 0xffffffff00000000ull,
245 [0x11] = 0xffffffffffffffffull,
247 return word[byte & 0x11];
250 /* Swap 16-bit words within a 32-bit word. */
251 static inline uint32_t hswap32(uint32_t h)
253 return rol32(h, 16);
256 /* Swap 16-bit words within a 64-bit word. */
257 static inline uint64_t hswap64(uint64_t h)
259 uint64_t m = 0x0000ffff0000ffffull;
260 h = rol64(h, 32);
261 return ((h & m) << 16) | ((h >> 16) & m);
264 /* Swap 32-bit words within a 64-bit word. */
265 static inline uint64_t wswap64(uint64_t h)
267 return rol64(h, 32);
270 #define LOGICAL_PPPP(NAME, FUNC) \
271 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
273 uintptr_t opr_sz = simd_oprsz(desc); \
274 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
275 uintptr_t i; \
276 for (i = 0; i < opr_sz / 8; ++i) { \
277 d[i] = FUNC(n[i], m[i], g[i]); \
281 #define DO_AND(N, M, G) (((N) & (M)) & (G))
282 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
283 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
284 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
285 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
286 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
287 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
288 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
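/*
 * DO_SEL merges per bit: result bits come from N where G is set and
 * from M where G is clear.  For example,
 * DO_SEL(0xaaaa, 0x5555, 0xff00) == 0xaa55.
 */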
290 LOGICAL_PPPP(sve_and_pppp, DO_AND)
291 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
292 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
293 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
294 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
295 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
296 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
297 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
299 #undef DO_AND
300 #undef DO_BIC
301 #undef DO_EOR
302 #undef DO_ORR
303 #undef DO_ORN
304 #undef DO_NOR
305 #undef DO_NAND
306 #undef DO_SEL
307 #undef LOGICAL_PPPP
309 /* Fully general three-operand expander, controlled by a predicate.
310 * This is complicated by the host-endian storage of the register file.
312 /* ??? I don't expect the compiler could ever vectorize this itself.
313 * With some tables we can convert bit masks to byte masks, and with
314 * extra care wrt byte/word ordering we could use gcc generic vectors
315 * and do 16 bytes at a time.
317 #define DO_ZPZZ(NAME, TYPE, H, OP) \
318 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
320 intptr_t i, opr_sz = simd_oprsz(desc); \
321 for (i = 0; i < opr_sz; ) { \
322 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
323 do { \
324 if (pg & 1) { \
325 TYPE nn = *(TYPE *)(vn + H(i)); \
326 TYPE mm = *(TYPE *)(vm + H(i)); \
327 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
329 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
330 } while (i & 15); \
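/*
 * For example, DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD) defines a
 * helper that reads the governing predicate sixteen bits at a time (one
 * bit per vector byte) and, for each 2-byte element whose low predicate
 * bit is set, stores nn + mm at the host-endian-adjusted offset H1_2(i).
 * Inactive elements are left untouched.
 */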
334 /* Similarly, specialized for 64-bit operands. */
335 #define DO_ZPZZ_D(NAME, TYPE, OP) \
336 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
338 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
339 TYPE *d = vd, *n = vn, *m = vm; \
340 uint8_t *pg = vg; \
341 for (i = 0; i < opr_sz; i += 1) { \
342 if (pg[H1(i)] & 1) { \
343 TYPE nn = n[i], mm = m[i]; \
344 d[i] = OP(nn, mm); \
349 #define DO_AND(N, M) (N & M)
350 #define DO_EOR(N, M) (N ^ M)
351 #define DO_ORR(N, M) (N | M)
352 #define DO_BIC(N, M) (N & ~M)
353 #define DO_ADD(N, M) (N + M)
354 #define DO_SUB(N, M) (N - M)
355 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
356 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
357 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
358 #define DO_MUL(N, M) (N * M)
362 * We must avoid the C undefined behaviour cases: division by
363 * zero and signed division of INT_MIN by -1. Both of these
364 * have architecturally defined required results for Arm.
365 * We special case all signed divisions by -1 to avoid having
366 * to deduce the minimum integer for the type involved.
368 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
369 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
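/*
 * For example, DO_SDIV(n, 0) and DO_UDIV(n, 0) are 0, the Arm-defined
 * result for division by zero.  M == -1 is handled as a plain negation,
 * so INT_MIN / -1 (architecturally required to produce INT_MIN) never
 * reaches the C '/' operator.
 */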
371 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
372 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
373 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
374 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
376 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
377 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
378 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
379 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
381 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
382 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
383 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
384 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
386 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
387 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
388 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
389 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
391 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
392 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
393 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
394 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
396 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
397 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
398 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
399 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
401 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
402 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
403 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
404 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
406 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
407 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
408 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
409 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
411 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
412 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
413 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
414 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
416 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
417 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
418 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
419 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
421 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
422 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
423 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
424 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
426 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
427 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
428 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
429 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
431 /* Because the computation type is at least twice as large as required,
432 these work for both signed and unsigned source types. */
433 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
435 return (n * m) >> 8;
438 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
440 return (n * m) >> 16;
443 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
445 return (n * m) >> 32;
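/*
 * For example, with uint8_t inputs do_mulh_b(255, 255) == 254, the high
 * byte of the 16-bit product 0xfe01; with int8_t inputs
 * do_mulh_b(-1, -1) == 0, the high byte of +1.  The int32_t parameters
 * accommodate both sign conventions without overflow.
 */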
448 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
450 uint64_t lo, hi;
451 muls64(&lo, &hi, n, m);
452 return hi;
455 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
457 uint64_t lo, hi;
458 mulu64(&lo, &hi, n, m);
459 return hi;
462 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
463 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
464 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
465 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
467 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
468 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
469 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
470 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
472 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
473 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
474 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
475 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
477 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
478 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
480 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
481 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
483 /* Note that all bits of the shift amount are significant;
484 the shift count is not taken modulo the element size. */
485 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
486 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
487 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
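/*
 * For example, with an int8_t N, DO_ASR(-4, 200) clamps the shift count
 * to 7 and gives the sign-fill value -1, while with a uint8_t N both
 * DO_LSR(0x80, 8) and DO_LSL(0x80, 8) give 0: an out-of-range shift
 * produces zero rather than C undefined behaviour.
 */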
489 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
490 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
491 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
493 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
494 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
495 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
497 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
498 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
499 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
501 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
502 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
503 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
505 static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
507 int8_t n1 = n, n2 = n >> 8;
508 return m + n1 + n2;
511 static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
513 int16_t n1 = n, n2 = n >> 16;
514 return m + n1 + n2;
517 static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
519 int32_t n1 = n, n2 = n >> 32;
520 return m + n1 + n2;
523 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
524 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
525 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
527 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
529 uint8_t n1 = n, n2 = n >> 8;
530 return m + n1 + n2;
533 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
535 uint16_t n1 = n, n2 = n >> 16;
536 return m + n1 + n2;
539 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
541 uint32_t n1 = n, n2 = n >> 32;
542 return m + n1 + n2;
545 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
546 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
547 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
549 #define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
550 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
551 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
552 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)
554 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
555 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
556 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
557 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
559 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
560 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
561 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
562 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)
564 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
565 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
566 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
567 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
570 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
571 * We pass in a pointer to a dummy saturation field to trigger
572 * the saturating arithmetic but discard the information about
573 * whether it has occurred.
575 #define do_sqshl_b(n, m) \
576 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
577 #define do_sqshl_h(n, m) \
578 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
579 #define do_sqshl_s(n, m) \
580 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
581 #define do_sqshl_d(n, m) \
582 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
584 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
585 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
586 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
587 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
589 #define do_uqshl_b(n, m) \
590 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
591 #define do_uqshl_h(n, m) \
592 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
593 #define do_uqshl_s(n, m) \
594 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
595 #define do_uqshl_d(n, m) \
596 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
598 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
599 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
600 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
601 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
603 #define do_sqrshl_b(n, m) \
604 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
605 #define do_sqrshl_h(n, m) \
606 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
607 #define do_sqrshl_s(n, m) \
608 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
609 #define do_sqrshl_d(n, m) \
610 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
612 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
613 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
614 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
615 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
617 #undef do_sqrshl_d
619 #define do_uqrshl_b(n, m) \
620 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
621 #define do_uqrshl_h(n, m) \
622 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
623 #define do_uqrshl_s(n, m) \
624 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
625 #define do_uqrshl_d(n, m) \
626 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
628 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
629 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
630 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
631 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
633 #undef do_uqrshl_d
635 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
636 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))
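/*
 * For example, DO_HADD_BHS(-3, 2) == -1 (the int64_t intermediate makes
 * the halving add exact for the narrow element sizes), and
 * DO_HADD_D(3, 1) == 2: the 64-bit form cannot widen, so it adds the
 * shifted halves and re-inserts the carry of the two low bits.
 */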
638 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
639 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
640 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
641 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
643 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
644 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
645 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
646 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
648 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
649 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))
651 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
652 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
653 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
654 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
656 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
657 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
658 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
659 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
661 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
662 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))
664 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
665 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
666 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
667 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
669 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
670 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
671 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
672 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
674 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
676 return val >= max ? max : val <= min ? min : val;
679 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
680 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
681 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
683 static inline int64_t do_sqadd_d(int64_t n, int64_t m)
685 int64_t r = n + m;
686 if (((r ^ n) & ~(n ^ m)) < 0) {
687 /* Signed overflow. */
688 return r < 0 ? INT64_MAX : INT64_MIN;
690 return r;
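/*
 * The test ((r ^ n) & ~(n ^ m)) < 0 is the usual two's-complement check:
 * overflow is possible only when n and m have the same sign, and has
 * happened when the sum's sign differs from theirs.  For example,
 * do_sqadd_d(INT64_MAX, 1) saturates to INT64_MAX.
 */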
693 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
694 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
695 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
696 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
698 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
699 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
700 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
702 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
704 uint64_t r = n + m;
705 return r < n ? UINT64_MAX : r;
708 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
709 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
710 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
711 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
713 #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
714 #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
715 #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
717 static inline int64_t do_sqsub_d(int64_t n, int64_t m)
719 int64_t r = n - m;
720 if (((r ^ n) & (n ^ m)) < 0) {
721 /* Signed overflow. */
722 return r < 0 ? INT64_MAX : INT64_MIN;
724 return r;
727 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
728 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
729 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
730 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
732 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
733 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
734 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
736 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
738 return n > m ? n - m : 0;
741 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
742 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
743 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
744 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
746 #define DO_SUQADD_B(n, m) \
747 do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
748 #define DO_SUQADD_H(n, m) \
749 do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
750 #define DO_SUQADD_S(n, m) \
751 do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
753 static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
755 uint64_t r = n + m;
757 if (n < 0) {
758 /* Note that m - abs(n) cannot underflow. */
759 if (r > INT64_MAX) {
760 /* Result is either very large positive or negative. */
761 if (m > -n) {
762 /* m > abs(n), so r is a very large positive. */
763 return INT64_MAX;
765 /* Result is negative. */
767 } else {
768 /* Both inputs are positive: check for overflow. */
769 if (r < m || r > INT64_MAX) {
770 return INT64_MAX;
773 return r;
776 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
777 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
778 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
779 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
781 #define DO_USQADD_B(n, m) \
782 do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
783 #define DO_USQADD_H(n, m) \
784 do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
785 #define DO_USQADD_S(n, m) \
786 do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
788 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
790 uint64_t r = n + m;
792 if (m < 0) {
793 return n < -m ? 0 : r;
795 return r < n ? UINT64_MAX : r;
798 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
799 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
800 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
801 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
803 #undef DO_ZPZZ
804 #undef DO_ZPZZ_D
807 * Three operand expander, operating on element pairs.
808 * If the slot I is even, the elements come from VN {I, I+1}.
809 * If the slot I is odd, the elements come from VM {I-1, I}.
810 * Load all of the input elements in each pair before overwriting output.
812 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
813 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
815 intptr_t i, opr_sz = simd_oprsz(desc); \
816 for (i = 0; i < opr_sz; ) { \
817 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
818 do { \
819 TYPE n0 = *(TYPE *)(vn + H(i)); \
820 TYPE m0 = *(TYPE *)(vm + H(i)); \
821 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
822 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
823 if (pg & 1) { \
824 *(TYPE *)(vd + H(i)) = OP(n0, n1); \
826 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
827 if (pg & 1) { \
828 *(TYPE *)(vd + H(i)) = OP(m0, m1); \
830 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
831 } while (i & 15); \
835 /* Similarly, specialized for 64-bit operands. */
836 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
837 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
839 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
840 TYPE *d = vd, *n = vn, *m = vm; \
841 uint8_t *pg = vg; \
842 for (i = 0; i < opr_sz; i += 2) { \
843 TYPE n0 = n[i], n1 = n[i + 1]; \
844 TYPE m0 = m[i], m1 = m[i + 1]; \
845 if (pg[H1(i)] & 1) { \
846 d[i] = OP(n0, n1); \
848 if (pg[H1(i + 1)] & 1) { \
849 d[i + 1] = OP(m0, m1); \
854 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
855 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
856 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
857 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
859 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
860 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
861 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
862 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
864 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
865 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
866 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
867 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
869 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
870 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
871 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
872 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
874 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
875 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
876 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
877 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
879 #undef DO_ZPZZ_PAIR
880 #undef DO_ZPZZ_PAIR_D
882 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
883 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
884 void *status, uint32_t desc) \
886 intptr_t i, opr_sz = simd_oprsz(desc); \
887 for (i = 0; i < opr_sz; ) { \
888 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
889 do { \
890 TYPE n0 = *(TYPE *)(vn + H(i)); \
891 TYPE m0 = *(TYPE *)(vm + H(i)); \
892 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
893 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
894 if (pg & 1) { \
895 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
897 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
898 if (pg & 1) { \
899 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
901 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
902 } while (i & 15); \
906 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
907 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
908 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
910 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
911 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
912 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
914 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
915 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
916 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
918 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
919 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
920 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
922 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
923 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
924 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
926 #undef DO_ZPZZ_PAIR_FP
928 /* Three-operand expander, controlled by a predicate, in which the
929 * third operand is "wide". That is, for D = N op M, the same 64-bit
930 * value of M is used with all of the narrower values of N.
932 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
933 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
935 intptr_t i, opr_sz = simd_oprsz(desc); \
936 for (i = 0; i < opr_sz; ) { \
937 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
938 TYPEW mm = *(TYPEW *)(vm + i); \
939 do { \
940 if (pg & 1) { \
941 TYPE nn = *(TYPE *)(vn + H(i)); \
942 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
944 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
945 } while (i & 7); \
949 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
950 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
951 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
953 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
954 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
955 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
957 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
958 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
959 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
961 #undef DO_ZPZW
963 /* Fully general two-operand expander, controlled by a predicate.
965 #define DO_ZPZ(NAME, TYPE, H, OP) \
966 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
968 intptr_t i, opr_sz = simd_oprsz(desc); \
969 for (i = 0; i < opr_sz; ) { \
970 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
971 do { \
972 if (pg & 1) { \
973 TYPE nn = *(TYPE *)(vn + H(i)); \
974 *(TYPE *)(vd + H(i)) = OP(nn); \
976 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
977 } while (i & 15); \
981 /* Similarly, specialized for 64-bit operands. */
982 #define DO_ZPZ_D(NAME, TYPE, OP) \
983 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
985 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
986 TYPE *d = vd, *n = vn; \
987 uint8_t *pg = vg; \
988 for (i = 0; i < opr_sz; i += 1) { \
989 if (pg[H1(i)] & 1) { \
990 TYPE nn = n[i]; \
991 d[i] = OP(nn); \
996 #define DO_CLS_B(N) (clrsb32(N) - 24)
997 #define DO_CLS_H(N) (clrsb32(N) - 16)
999 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
1000 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
1001 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
1002 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
1004 #define DO_CLZ_B(N) (clz32(N) - 24)
1005 #define DO_CLZ_H(N) (clz32(N) - 16)
1007 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
1008 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
1009 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
1010 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
1012 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
1013 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
1014 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
1015 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
1017 #define DO_CNOT(N) (N == 0)
1019 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
1020 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
1021 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
1022 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
1024 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
1026 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
1027 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
1028 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
1030 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
1032 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
1033 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
1034 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
1036 #define DO_NOT(N) (~N)
1038 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
1039 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
1040 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
1041 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
1043 #define DO_SXTB(N) ((int8_t)N)
1044 #define DO_SXTH(N) ((int16_t)N)
1045 #define DO_SXTS(N) ((int32_t)N)
1046 #define DO_UXTB(N) ((uint8_t)N)
1047 #define DO_UXTH(N) ((uint16_t)N)
1048 #define DO_UXTS(N) ((uint32_t)N)
1050 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
1051 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
1052 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
1053 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
1054 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
1055 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
1057 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
1058 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
1059 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
1060 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
1061 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
1062 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
1064 #define DO_ABS(N) (N < 0 ? -N : N)
1066 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
1067 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
1068 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
1069 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
1071 #define DO_NEG(N) (-N)
1073 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
1074 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
1075 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
1076 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
1078 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
1079 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
1080 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
1082 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
1083 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
1085 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
1087 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
1088 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
1089 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
1090 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
1092 #define DO_SQABS(X) \
1093 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
1094 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
1096 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
1097 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
1098 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
1099 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
1101 #define DO_SQNEG(X) \
1102 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
1103 x_ == min_ ? -min_ - 1 : -x_; })
1105 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
1106 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
1107 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
1108 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
1110 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
1111 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
1113 /* Three-operand expander, unpredicated, in which the third operand is "wide".
1115 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
1116 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1118 intptr_t i, opr_sz = simd_oprsz(desc); \
1119 for (i = 0; i < opr_sz; ) { \
1120 TYPEW mm = *(TYPEW *)(vm + i); \
1121 do { \
1122 TYPE nn = *(TYPE *)(vn + H(i)); \
1123 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
1124 i += sizeof(TYPE); \
1125 } while (i & 7); \
1129 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
1130 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
1131 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
1133 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
1134 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
1135 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1137 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1138 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1139 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1141 #undef DO_ZZW
1143 #undef DO_CLS_B
1144 #undef DO_CLS_H
1145 #undef DO_CLZ_B
1146 #undef DO_CLZ_H
1147 #undef DO_CNOT
1148 #undef DO_FABS
1149 #undef DO_FNEG
1150 #undef DO_ABS
1151 #undef DO_NEG
1152 #undef DO_ZPZ
1153 #undef DO_ZPZ_D
1156 * Three-operand expander, unpredicated, in which the two inputs are
1157 * selected from the top or bottom half of the wide column.
1159 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1160 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1162 intptr_t i, opr_sz = simd_oprsz(desc); \
1163 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1164 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1165 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1166 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1167 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1168 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1172 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1173 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1174 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1176 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1177 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1178 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1180 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1181 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1182 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1184 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1185 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1186 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1188 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1189 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1190 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1192 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1193 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1194 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1196 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1197 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1198 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1200 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1201 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1202 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1204 /* Note that the multiply cannot overflow, but the doubling can. */
1205 static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1207 int16_t val = n * m;
1208 return DO_SQADD_H(val, val);
1211 static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1213 int32_t val = n * m;
1214 return DO_SQADD_S(val, val);
1217 static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1219 int64_t val = n * m;
1220 return do_sqadd_d(val, val);
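/*
 * For example, do_sqdmull_h(-128, -128) computes the exact product 16384
 * (the widened int8_t inputs cannot overflow int16_t) and then saturates
 * the doubling: DO_SQADD_H(16384, 16384) == INT16_MAX.
 */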
1223 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1224 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1225 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1227 #undef DO_ZZZ_TB
1229 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1230 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1232 intptr_t i, opr_sz = simd_oprsz(desc); \
1233 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1234 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1235 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
1236 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1237 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1241 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1242 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1243 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1245 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1246 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1247 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1249 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1250 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1251 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1253 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1254 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1255 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1257 #undef DO_ZZZ_WTB
1259 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
1260 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1262 intptr_t i, opr_sz = simd_oprsz(desc); \
1263 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1264 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1265 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1266 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
1267 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
1268 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
1272 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1273 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1274 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1275 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
1277 #undef DO_ZZZ_NTB
1279 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1280 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1282 intptr_t i, opr_sz = simd_oprsz(desc); \
1283 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
1284 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1285 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1286 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
1287 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1288 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
1292 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1293 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1294 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1296 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1297 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1298 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1300 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1301 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1302 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1304 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1305 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1306 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1308 #define DO_NMUL(N, M) -(N * M)
1310 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1311 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1312 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
1314 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1315 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1316 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
1318 #undef DO_ZZZW_ACC
1320 #define DO_XTNB(NAME, TYPE, OP) \
1321 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1323 intptr_t i, opr_sz = simd_oprsz(desc); \
1324 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1325 TYPE nn = *(TYPE *)(vn + i); \
1326 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
1327 *(TYPE *)(vd + i) = nn; \
1331 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
1332 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1334 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
1335 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1336 TYPE nn = *(TYPE *)(vn + i); \
1337 *(TYPEN *)(vd + i + odd) = OP(nn); \
1341 #define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
1342 #define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
1343 #define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)
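/*
 * For example, DO_SQXTN_H(300) == 127: each 16-bit source element is
 * saturated to the signed 8-bit range.  The "B" (bottom) form then masks
 * the value into the low half of the wide element, zeroing the top half,
 * while the "T" (top) form stores only the narrow result at the odd
 * offset and leaves the other half of the destination untouched.
 */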
1345 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1346 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1347 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1349 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1350 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1351 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1353 #define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
1354 #define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
1355 #define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)
1357 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1358 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1359 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1361 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1362 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1363 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1365 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1366 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1367 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1369 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1370 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1371 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1373 #undef DO_XTNB
1374 #undef DO_XTNT
1376 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1378 intptr_t i, opr_sz = simd_oprsz(desc);
1379 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1380 uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1381 uint32_t *a = va, *n = vn;
1382 uint64_t *d = vd, *m = vm;
1384 for (i = 0; i < opr_sz / 8; ++i) {
1385 uint32_t e1 = a[2 * i + H4(0)];
1386 uint32_t e2 = n[2 * i + sel] ^ inv;
1387 uint64_t c = extract64(m[i], 32, 1);
1388 /* Compute and store the entire 33-bit result at once. */
1389 d[i] = c + e1 + e2;
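/*
 * Since e1 and e2 are 32-bit and c is a single bit, the 64-bit sum holds
 * the 32-bit result in its low half and the carry-out in bit 32, which
 * is exactly where the next link of the carry chain reads its incoming
 * carry (extract64(m[i], 32, 1) above).
 */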
1393 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1395 intptr_t i, opr_sz = simd_oprsz(desc);
1396 int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1397 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1398 uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1400 for (i = 0; i < opr_sz / 8; i += 2) {
1401 Int128 e1 = int128_make64(a[i]);
1402 Int128 e2 = int128_make64(n[i + sel] ^ inv);
1403 Int128 c = int128_make64(m[i + 1] & 1);
1404 Int128 r = int128_add(int128_add(e1, e2), c);
1405 d[i + 0] = int128_getlo(r);
1406 d[i + 1] = int128_gethi(r);
1410 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1411 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1413 intptr_t i, opr_sz = simd_oprsz(desc); \
1414 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1415 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1416 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1417 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1418 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1419 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1420 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
1424 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1425 do_sqdmull_h, DO_SQADD_H)
1426 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1427 do_sqdmull_s, DO_SQADD_S)
1428 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1429 do_sqdmull_d, do_sqadd_d)
1431 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1432 do_sqdmull_h, DO_SQSUB_H)
1433 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1434 do_sqdmull_s, DO_SQSUB_S)
1435 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1436 do_sqdmull_d, do_sqsub_d)
1438 #undef DO_SQDMLAL
1440 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1441 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1443 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1444 int rot = simd_data(desc); \
1445 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1446 bool sub_r = rot == 1 || rot == 2; \
1447 bool sub_i = rot >= 2; \
1448 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1449 for (i = 0; i < opr_sz; i += 2) { \
1450 TYPE elt1_a = n[H(i + sel_a)]; \
1451 TYPE elt2_a = m[H(i + sel_a)]; \
1452 TYPE elt2_b = m[H(i + sel_b)]; \
1453 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \
1454 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \
1458 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
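/*
 * For example, rot == 1 gives sel_a == 1, sub_r == true, sub_i == false:
 * the product of the two selected (imaginary) inputs is subtracted from
 * the real accumulator lane and the imaginary-by-real product is added
 * to the imaginary lane, i.e. the 90-degree rotation of the complex
 * multiply-accumulate.
 */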
1460 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1461 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1462 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1463 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1465 #define DO_SQRDMLAH_B(N, M, A, S) \
1466 do_sqrdmlah_b(N, M, A, S, true)
1467 #define DO_SQRDMLAH_H(N, M, A, S) \
1468 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1469 #define DO_SQRDMLAH_S(N, M, A, S) \
1470 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1471 #define DO_SQRDMLAH_D(N, M, A, S) \
1472 do_sqrdmlah_d(N, M, A, S, true)
1474 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1475 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1476 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1477 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1479 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1480 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1482 intptr_t i, j, oprsz = simd_oprsz(desc); \
1483 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \
1484 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \
1485 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1486 bool sub_r = rot == 1 || rot == 2; \
1487 bool sub_i = rot >= 2; \
1488 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1489 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \
1490 TYPE elt2_a = m[H(i + idx + sel_a)]; \
1491 TYPE elt2_b = m[H(i + idx + sel_b)]; \
1492 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \
1493 TYPE elt1_a = n[H(i + j + sel_a)]; \
1494 d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \
1495 d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \
1500 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1501 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1503 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1504 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1506 #undef DO_CMLA
1507 #undef DO_CMLA_FUNC
1508 #undef DO_CMLA_IDX_FUNC
1509 #undef DO_SQRDMLAH_B
1510 #undef DO_SQRDMLAH_H
1511 #undef DO_SQRDMLAH_S
1512 #undef DO_SQRDMLAH_D
1514 /* Note N and M are 4 elements bundled into one unit. */
1515 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1516 int sel_a, int sel_b, int sub_i)
1518 for (int i = 0; i <= 1; i++) {
1519 int32_t elt1_r = (int8_t)(n >> (16 * i));
1520 int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1521 int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1522 int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1524 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1526 return a;
1529 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1530 int sel_a, int sel_b, int sub_i)
1532 for (int i = 0; i <= 1; i++) {
1533 int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1534 int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1535 int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1536 int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1538 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1540 return a;
1543 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1544 void *va, uint32_t desc)
1546 int opr_sz = simd_oprsz(desc);
1547 int rot = simd_data(desc);
1548 int sel_a = rot & 1;
1549 int sel_b = sel_a ^ 1;
1550 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1551 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1553 for (int e = 0; e < opr_sz / 4; e++) {
1554 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1558 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1559 void *va, uint32_t desc)
1561 int opr_sz = simd_oprsz(desc);
1562 int rot = simd_data(desc);
1563 int sel_a = rot & 1;
1564 int sel_b = sel_a ^ 1;
1565 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1566 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1568 for (int e = 0; e < opr_sz / 8; e++) {
1569 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1573 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1574 void *va, uint32_t desc)
1576 int opr_sz = simd_oprsz(desc);
1577 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1578 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1579 int sel_a = rot & 1;
1580 int sel_b = sel_a ^ 1;
1581 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1582 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1584 for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1585 uint32_t seg_m = m[seg + idx];
1586 for (int e = 0; e < 4; e++) {
1587 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1588 sel_a, sel_b, sub_i);
1593 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1594 void *va, uint32_t desc)
1596 int seg, opr_sz = simd_oprsz(desc);
1597 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1598 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1599 int sel_a = rot & 1;
1600 int sel_b = sel_a ^ 1;
1601 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1602 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1604 for (seg = 0; seg < opr_sz / 8; seg += 2) {
1605 uint64_t seg_m = m[seg + idx];
1606 for (int e = 0; e < 2; e++) {
1607 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1608 sel_a, sel_b, sub_i);
1613 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1614 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1616 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
1617 intptr_t i, j, idx = simd_data(desc); \
1618 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \
1619 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1620 TYPE mm = m[i]; \
1621 for (j = 0; j < segment; j++) { \
1622 d[i + j] = OP(n[i + j], mm, a[i + j]); \
1627 #define DO_SQRDMLAH_H(N, M, A) \
1628 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1629 #define DO_SQRDMLAH_S(N, M, A) \
1630 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1631 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1633 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1634 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1635 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1637 #define DO_SQRDMLSH_H(N, M, A) \
1638 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1639 #define DO_SQRDMLSH_S(N, M, A) \
1640 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1641 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1643 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1644 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1645 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1647 #undef DO_ZZXZ
1649 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1650 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1652 intptr_t i, j, oprsz = simd_oprsz(desc); \
1653 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1654 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1655 for (i = 0; i < oprsz; i += 16) { \
1656 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1657 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1658 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1659 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \
1660 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \
1665 #define DO_MLA(N, M, A) (A + N * M)
1667 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1668 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1669 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1670 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1672 #define DO_MLS(N, M, A) (A - N * M)
1674 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1675 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1676 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1677 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1679 #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M))
1680 #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M))
1682 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1683 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1685 #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M))
1686 #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M))
1688 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1689 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1691 #undef DO_MLA
1692 #undef DO_MLS
1693 #undef DO_ZZXW
1695 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1696 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1698 intptr_t i, j, oprsz = simd_oprsz(desc); \
1699 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1700 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1701 for (i = 0; i < oprsz; i += 16) { \
1702 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1703 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1704 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1705 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \
1710 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1711 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1713 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1714 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1716 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1717 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1719 #undef DO_ZZX
1721 #define DO_BITPERM(NAME, TYPE, OP) \
1722 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1724 intptr_t i, opr_sz = simd_oprsz(desc); \
1725 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1726 TYPE nn = *(TYPE *)(vn + i); \
1727 TYPE mm = *(TYPE *)(vm + i); \
1728 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
1732 static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1734 uint64_t res = 0;
1735 int db, rb = 0;
1737 for (db = 0; db < n; ++db) {
1738 if ((mask >> db) & 1) {
1739 res |= ((data >> db) & 1) << rb;
1740 ++rb;
1743 return res;
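/*
 * Worked example, 8-bit case: bitextract(0b10110110, 0b01010101, 8)
 * gathers the data bits at the set mask positions 0, 2, 4 and 6
 * (values 0, 1, 1, 0) down into the low bits, producing 0b0110.
 */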
1746 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1747 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1748 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1749 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1751 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1753 uint64_t res = 0;
1754 int rb, db = 0;
1756 for (rb = 0; rb < n; ++rb) {
1757 if ((mask >> rb) & 1) {
1758 res |= ((data >> db) & 1) << rb;
1759 ++db;
1762 return res;
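/*
 * Worked example, 8-bit case: bitdeposit(0b0110, 0b01010101, 8)
 * scatters the low data bits 0, 1, 1, 0 up into the set mask positions
 * 0, 2, 4 and 6, producing 0b00010100 -- i.e. it undoes the bitextract
 * example above, recovering data & mask.
 */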
1765 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1766 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1767 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1768 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1770 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1772 uint64_t resm = 0, resu = 0;
1773 int db, rbm = 0, rbu = 0;
1775 for (db = 0; db < n; ++db) {
1776 uint64_t val = (data >> db) & 1;
1777 if ((mask >> db) & 1) {
1778 resm |= val << rbm++;
1779 } else {
1780 resu |= val << rbu++;
1784 return resm | (resu << rbm);
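/*
 * Worked example, 8-bit case: bitgroup(0b10110110, 0b01010101, 8) packs
 * the bits at the set mask positions (0, 1, 1, 0) into the low half of
 * the result and the remaining bits (1, 0, 1, 1) above them, producing
 * 0b11010110.
 */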
1787 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1788 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1789 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1790 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1792 #undef DO_BITPERM
1794 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \
1795 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1797 intptr_t i, opr_sz = simd_oprsz(desc); \
1798 int sub_r = simd_data(desc); \
1799 if (sub_r) { \
1800 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1801 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1802 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1803 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1804 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1805 acc_r = ADD_OP(acc_r, el2_i); \
1806 acc_i = SUB_OP(acc_i, el2_r); \
1807 *(TYPE *)(vd + H(i)) = acc_r; \
1808 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1810 } else { \
1811 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1812 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1813 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1814 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1815 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1816 acc_r = SUB_OP(acc_r, el2_i); \
1817 acc_i = ADD_OP(acc_i, el2_r); \
1818 *(TYPE *)(vd + H(i)) = acc_r; \
1819 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1824 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1825 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1826 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1827 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1829 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1830 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1831 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1832 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1834 #undef DO_CADD
1836 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1837 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1839 intptr_t i, opr_sz = simd_oprsz(desc); \
1840 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
1841 int shift = simd_data(desc) >> 1; \
1842 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1843 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
1844 *(TYPEW *)(vd + HW(i)) = nn << shift; \
1848 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1849 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1850 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1852 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1853 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1854 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1856 #undef DO_ZZI_SHLL
1858 /* Two-operand reduction expander, controlled by a predicate.
1859 * The difference between TYPERED and TYPERET has to do with
1860 * sign-extension. E.g. for SMAX, TYPERED must be signed,
1861 * but TYPERET must be unsigned so that e.g. a 32-bit value
1862 * is not sign-extended to the ABI uint64_t return type.
1864 /* ??? If we were to vectorize this by hand the reduction ordering
1865 * would change. For integer operands, this is perfectly fine.
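/* For example, sve_smaxv_s below reduces with int32_t (TYPERED) but is
 * instantiated with uint32_t for TYPERET, so the 32-bit result is
 * zero-extended rather than sign-extended into the uint64_t that the
 * helper returns.
 */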
1867 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1868 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1870 intptr_t i, opr_sz = simd_oprsz(desc); \
1871 TYPERED ret = INIT; \
1872 for (i = 0; i < opr_sz; ) { \
1873 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1874 do { \
1875 if (pg & 1) { \
1876 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
1877 ret = OP(ret, nn); \
1879 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
1880 } while (i & 15); \
1882 return (TYPERET)ret; \
1885 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
1886 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1888 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1889 TYPEE *n = vn; \
1890 uint8_t *pg = vg; \
1891 TYPER ret = INIT; \
1892 for (i = 0; i < opr_sz; i += 1) { \
1893 if (pg[H1(i)] & 1) { \
1894 TYPEE nn = n[i]; \
1895 ret = OP(ret, nn); \
1898 return ret; \
1901 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1902 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1903 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1904 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1906 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1907 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1908 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1909 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1911 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1912 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1913 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1914 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1916 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1917 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1918 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1920 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1921 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1922 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1923 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1925 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1926 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1927 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1928 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1930 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1931 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1932 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1933 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1935 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1936 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1937 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1938 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1940 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1941 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1942 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1943 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1945 #undef DO_VPZ
1946 #undef DO_VPZ_D
1948 /* Two vector operand, one scalar operand, unpredicated. */
1949 #define DO_ZZI(NAME, TYPE, OP) \
1950 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
1952 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1953 TYPE s = s64, *d = vd, *n = vn; \
1954 for (i = 0; i < opr_sz; ++i) { \
1955 d[i] = OP(n[i], s); \
1959 #define DO_SUBR(X, Y) (Y - X)
1961 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1962 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1963 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1964 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1966 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1967 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1968 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1969 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1971 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1972 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1973 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1974 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1976 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1977 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1978 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1979 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1981 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1982 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1983 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1984 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1986 #undef DO_ZZI
1988 #undef DO_AND
1989 #undef DO_ORR
1990 #undef DO_EOR
1991 #undef DO_BIC
1992 #undef DO_ADD
1993 #undef DO_SUB
1994 #undef DO_MAX
1995 #undef DO_MIN
1996 #undef DO_ABD
1997 #undef DO_MUL
1998 #undef DO_DIV
1999 #undef DO_ASR
2000 #undef DO_LSR
2001 #undef DO_LSL
2002 #undef DO_SUBR
2004 /* Similar to the ARM LastActiveElement pseudocode function, except the
2005 result is multiplied by the element size. This includes the not found
2006 indication; e.g. not found for esz=3 is -8. */
2007 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
2009 uint64_t mask = pred_esz_masks[esz];
2010 intptr_t i = words;
2012 do {
2013 uint64_t this_g = g[--i] & mask;
2014 if (this_g) {
2015 return i * 64 + (63 - clz64(this_g));
2017 } while (i > 0);
2018 return (intptr_t)-1 << esz;
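/*
 * E.g. for esz=2 (32-bit elements), a last active element number of 3
 * yields 3 * 4 == 12, the index of its predicate bit; "not found"
 * yields -1 << 2 == -4.
 */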
2021 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
2023 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
2024 uint32_t flags = PREDTEST_INIT;
2025 uint64_t *d = vd, *g = vg;
2026 intptr_t i = 0;
2028 do {
2029 uint64_t this_d = d[i];
2030 uint64_t this_g = g[i];
2032 if (this_g) {
2033 if (!(flags & 4)) {
2034 /* Set in D the first bit of G. */
2035 this_d |= this_g & -this_g;
2036 d[i] = this_d;
2038 flags = iter_predtest_fwd(this_d, this_g, flags);
2040 } while (++i < words);
2042 return flags;
2045 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
2047 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
2048 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2049 uint32_t flags = PREDTEST_INIT;
2050 uint64_t *d = vd, *g = vg, esz_mask;
2051 intptr_t i, next;
2053 next = last_active_element(vd, words, esz) + (1 << esz);
2054 esz_mask = pred_esz_masks[esz];
2056 /* Similar to the pseudocode for pnext, but scaled by ESZ
2057 so that we find the correct bit. */
2058 if (next < words * 64) {
2059 uint64_t mask = -1;
2061 if (next & 63) {
2062 mask = ~((1ull << (next & 63)) - 1);
2063 next &= -64;
2065 do {
2066 uint64_t this_g = g[next / 64] & esz_mask & mask;
2067 if (this_g != 0) {
2068 next = (next & -64) + ctz64(this_g);
2069 break;
2071 next += 64;
2072 mask = -1;
2073 } while (next < words * 64);
2076 i = 0;
2077 do {
2078 uint64_t this_d = 0;
2079 if (i == next / 64) {
2080 this_d = 1ull << (next & 63);
2082 d[i] = this_d;
2083 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
2084 } while (++i < words);
2086 return flags;
2090 * Copy Zn into Zd, and store zero into inactive elements.
2091 * If inv, store zeros into the active elements.
2093 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
2095 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2096 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2097 uint64_t *d = vd, *n = vn;
2098 uint8_t *pg = vg;
2100 for (i = 0; i < opr_sz; i += 1) {
2101 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
2105 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
2107 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2108 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2109 uint64_t *d = vd, *n = vn;
2110 uint8_t *pg = vg;
2112 for (i = 0; i < opr_sz; i += 1) {
2113 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
2117 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
2119 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2120 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2121 uint64_t *d = vd, *n = vn;
2122 uint8_t *pg = vg;
2124 for (i = 0; i < opr_sz; i += 1) {
2125 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
2129 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
2131 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2132 uint64_t *d = vd, *n = vn;
2133 uint8_t *pg = vg;
2134 uint8_t inv = simd_data(desc);
2136 for (i = 0; i < opr_sz; i += 1) {
2137 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2141 /* Three-operand expander, immediate operand, controlled by a predicate.
2143 #define DO_ZPZI(NAME, TYPE, H, OP) \
2144 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2146 intptr_t i, opr_sz = simd_oprsz(desc); \
2147 TYPE imm = simd_data(desc); \
2148 for (i = 0; i < opr_sz; ) { \
2149 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2150 do { \
2151 if (pg & 1) { \
2152 TYPE nn = *(TYPE *)(vn + H(i)); \
2153 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
2155 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2156 } while (i & 15); \
2160 /* Similarly, specialized for 64-bit operands. */
2161 #define DO_ZPZI_D(NAME, TYPE, OP) \
2162 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2164 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2165 TYPE *d = vd, *n = vn; \
2166 TYPE imm = simd_data(desc); \
2167 uint8_t *pg = vg; \
2168 for (i = 0; i < opr_sz; i += 1) { \
2169 if (pg[H1(i)] & 1) { \
2170 TYPE nn = n[i]; \
2171 d[i] = OP(nn, imm); \
2176 #define DO_SHR(N, M) (N >> M)
2177 #define DO_SHL(N, M) (N << M)
2179 /* Arithmetic shift right for division. This rounds negative numbers
2180    toward zero as per signed division. Therefore, when N is negative,
2181    add 2**M-1 before shifting. */
2182 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
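/*
 * E.g. DO_ASRD(-7, 2) computes (-7 + 3) >> 2 == -1, matching -7 / 4
 * truncated toward zero, where a plain arithmetic shift would give -2.
 */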
2184 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
2186 if (likely(sh < 64)) {
2187 return (x >> sh) + ((x >> (sh - 1)) & 1);
2188 } else if (sh == 64) {
2189 return x >> 63;
2190 } else {
2191 return 0;
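/*
 * E.g. do_urshr(11, 2) == (11 >> 2) + ((11 >> 1) & 1) == 3, i.e. 11 / 4
 * rounded to nearest, with ties rounded up.
 */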
2195 static inline int64_t do_srshr(int64_t x, unsigned sh)
2197 if (likely(sh < 64)) {
2198 return (x >> sh) + ((x >> (sh - 1)) & 1);
2199 } else {
2200 /* Rounding the sign bit always produces 0. */
2201 return 0;
2205 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2206 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2207 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2208 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2210 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2211 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2212 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2213 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2215 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2216 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2217 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2218 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2220 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2221 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2222 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2223 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2225 /* SVE2 bitwise shift by immediate */
2226 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2227 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2228 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2229 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2231 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2232 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2233 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2234 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2236 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2237 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2238 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2239 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2241 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2242 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2243 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2244 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2246 #define do_suqrshl_b(n, m) \
2247 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2248 #define do_suqrshl_h(n, m) \
2249 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2250 #define do_suqrshl_s(n, m) \
2251 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2252 #define do_suqrshl_d(n, m) \
2253 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2255 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2256 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2257 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2258 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2260 #undef DO_ASRD
2261 #undef DO_ZPZI
2262 #undef DO_ZPZI_D
2264 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2265 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2267 intptr_t i, opr_sz = simd_oprsz(desc); \
2268 int shift = simd_data(desc); \
2269 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2270 TYPEW nn = *(TYPEW *)(vn + i); \
2271 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \
2275 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
2276 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2278 intptr_t i, opr_sz = simd_oprsz(desc); \
2279 int shift = simd_data(desc); \
2280 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2281 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2282 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \
2286 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2287 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2288 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2290 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2291 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2292 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
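/*
 * The ...NB (bottom) forms above write each narrowed result into the low
 * half of the wide element and zero the high half via the TYPEN cast;
 * the ...NT (top) forms write only the high half and leave the low half
 * untouched, so an NB/NT pair interleaves two narrowing operations into
 * one destination vector.
 */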
2294 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2295 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2296 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2298 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2299 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2300 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2302 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2303 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2304 #define DO_SQSHRUN_D(x, sh) \
2305 do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2307 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2308 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2309 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2311 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2312 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2313 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2315 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2316 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2317 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2319 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2320 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2321 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2323 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2324 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2325 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2327 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2328 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2329 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2331 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2332 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2333 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2335 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2336 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2337 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2339 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2340 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2341 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2343 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2344 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2345 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2347 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2348 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2349 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2351 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2352 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2353 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2355 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2356 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2357 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2359 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2360 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2361 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2363 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2364 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2365 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2367 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2368 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2369 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2371 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2372 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2373 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2375 #undef DO_SHRNB
2376 #undef DO_SHRNT
2378 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \
2379 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2381 intptr_t i, opr_sz = simd_oprsz(desc); \
2382 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2383 TYPEW nn = *(TYPEW *)(vn + i); \
2384 TYPEW mm = *(TYPEW *)(vm + i); \
2385 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \
2389 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \
2390 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2392 intptr_t i, opr_sz = simd_oprsz(desc); \
2393 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2394 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2395 TYPEW mm = *(TYPEW *)(vm + HW(i)); \
2396 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \
2400 #define DO_ADDHN(N, M, SH) ((N + M) >> SH)
2401 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2402 #define DO_SUBHN(N, M, SH) ((N - M) >> SH)
2403 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
2405 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2406 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2407 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2409 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2410 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2411 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2413 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2414 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2415 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2417 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2418 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2419 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2421 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2422 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2423 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2425 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2426 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2427 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2429 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2430 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2431 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2433 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2434 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2435 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2437 #undef DO_RSUBHN
2438 #undef DO_SUBHN
2439 #undef DO_RADDHN
2440 #undef DO_ADDHN
2442 #undef DO_BINOPNB
2444 /* Fully general four-operand expander, controlled by a predicate.
2446 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
2447 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2448 void *vg, uint32_t desc) \
2450 intptr_t i, opr_sz = simd_oprsz(desc); \
2451 for (i = 0; i < opr_sz; ) { \
2452 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2453 do { \
2454 if (pg & 1) { \
2455 TYPE nn = *(TYPE *)(vn + H(i)); \
2456 TYPE mm = *(TYPE *)(vm + H(i)); \
2457 TYPE aa = *(TYPE *)(va + H(i)); \
2458 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
2460 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2461 } while (i & 15); \
2465 /* Similarly, specialized for 64-bit operands. */
2466 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
2467 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2468 void *vg, uint32_t desc) \
2470 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2471 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
2472 uint8_t *pg = vg; \
2473 for (i = 0; i < opr_sz; i += 1) { \
2474 if (pg[H1(i)] & 1) { \
2475 TYPE aa = a[i], nn = n[i], mm = m[i]; \
2476 d[i] = OP(aa, nn, mm); \
2481 #define DO_MLA(A, N, M) (A + N * M)
2482 #define DO_MLS(A, N, M) (A - N * M)
2484 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2485 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2487 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2488 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2490 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2491 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2493 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2494 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2496 #undef DO_MLA
2497 #undef DO_MLS
2498 #undef DO_ZPZZZ
2499 #undef DO_ZPZZZ_D
2501 void HELPER(sve_index_b)(void *vd, uint32_t start,
2502 uint32_t incr, uint32_t desc)
2504 intptr_t i, opr_sz = simd_oprsz(desc);
2505 uint8_t *d = vd;
2506 for (i = 0; i < opr_sz; i += 1) {
2507 d[H1(i)] = start + i * incr;
2511 void HELPER(sve_index_h)(void *vd, uint32_t start,
2512 uint32_t incr, uint32_t desc)
2514 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2515 uint16_t *d = vd;
2516 for (i = 0; i < opr_sz; i += 1) {
2517 d[H2(i)] = start + i * incr;
2521 void HELPER(sve_index_s)(void *vd, uint32_t start,
2522 uint32_t incr, uint32_t desc)
2524 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2525 uint32_t *d = vd;
2526 for (i = 0; i < opr_sz; i += 1) {
2527 d[H4(i)] = start + i * incr;
2531 void HELPER(sve_index_d)(void *vd, uint64_t start,
2532 uint64_t incr, uint32_t desc)
2534 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2535 uint64_t *d = vd;
2536 for (i = 0; i < opr_sz; i += 1) {
2537 d[i] = start + i * incr;
2541 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2543 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2544 uint32_t sh = simd_data(desc);
2545 uint32_t *d = vd, *n = vn, *m = vm;
2546 for (i = 0; i < opr_sz; i += 1) {
2547 d[i] = n[i] + (m[i] << sh);
2551 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2553 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2554 uint64_t sh = simd_data(desc);
2555 uint64_t *d = vd, *n = vn, *m = vm;
2556 for (i = 0; i < opr_sz; i += 1) {
2557 d[i] = n[i] + (m[i] << sh);
2561 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2563 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2564 uint64_t sh = simd_data(desc);
2565 uint64_t *d = vd, *n = vn, *m = vm;
2566 for (i = 0; i < opr_sz; i += 1) {
2567 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2571 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2573 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2574 uint64_t sh = simd_data(desc);
2575 uint64_t *d = vd, *n = vn, *m = vm;
2576 for (i = 0; i < opr_sz; i += 1) {
2577 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2581 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2583 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2584 static const uint16_t coeff[] = {
2585 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2586 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2587 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2588 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2590 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2591 uint16_t *d = vd, *n = vn;
2593 for (i = 0; i < opr_sz; i++) {
2594 uint16_t nn = n[i];
2595 intptr_t idx = extract32(nn, 0, 5);
2596 uint16_t exp = extract32(nn, 5, 5);
2597 d[i] = coeff[idx] | (exp << 10);
2601 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2603 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2604 static const uint32_t coeff[] = {
2605 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2606 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2607 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2608 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2609 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2610 0x1ef532, 0x20b051, 0x227043, 0x243516,
2611 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2612 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2613 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2614 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2615 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2616 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2617 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2618 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2619 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2620 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2622 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2623 uint32_t *d = vd, *n = vn;
2625 for (i = 0; i < opr_sz; i++) {
2626 uint32_t nn = n[i];
2627 intptr_t idx = extract32(nn, 0, 6);
2628 uint32_t exp = extract32(nn, 6, 8);
2629 d[i] = coeff[idx] | (exp << 23);
2633 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2635 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2636 static const uint64_t coeff[] = {
2637 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2638 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2639 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2640 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2641 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2642 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2643 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2644 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2645 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2646 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2647 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2648 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2649 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2650 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2651 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2652 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2653 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2654 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2655 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2656 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2657 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2658 0xFA7C1819E90D8ull,
2660 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2661 uint64_t *d = vd, *n = vn;
2663 for (i = 0; i < opr_sz; i++) {
2664 uint64_t nn = n[i];
2665 intptr_t idx = extract32(nn, 0, 6);
2666 uint64_t exp = extract32(nn, 6, 11);
2667 d[i] = coeff[idx] | (exp << 52);
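/*
 * In each of the three FEXPA helpers the table holds the fraction field
 * of 2^(idx/32) (half precision) or 2^(idx/64) (single and double), and
 * the remaining input bits are installed directly as the biased exponent.
 * E.g. entry 32 of the table above, 0x6A09E667F3BCD, is the 52-bit
 * fraction of sqrt(2).
 */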
2671 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2673 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2674 uint16_t *d = vd, *n = vn, *m = vm;
2675 for (i = 0; i < opr_sz; i += 1) {
2676 uint16_t nn = n[i];
2677 uint16_t mm = m[i];
2678 if (mm & 1) {
2679 nn = float16_one;
2681 d[i] = nn ^ (mm & 2) << 14;
2685 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2687 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2688 uint32_t *d = vd, *n = vn, *m = vm;
2689 for (i = 0; i < opr_sz; i += 1) {
2690 uint32_t nn = n[i];
2691 uint32_t mm = m[i];
2692 if (mm & 1) {
2693 nn = float32_one;
2695 d[i] = nn ^ (mm & 2) << 30;
2699 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2701 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2702 uint64_t *d = vd, *n = vn, *m = vm;
2703 for (i = 0; i < opr_sz; i += 1) {
2704 uint64_t nn = n[i];
2705 uint64_t mm = m[i];
2706 if (mm & 1) {
2707 nn = float64_one;
2709 d[i] = nn ^ (mm & 2) << 62;
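/*
 * In the three FTSSEL helpers, bit 0 of the control element selects 1.0
 * in place of the input value, and bit 1 toggles the sign bit (shifted
 * up to the sign position of the respective format).
 */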
2714 * Signed saturating addition with scalar operand.
2717 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2719 intptr_t i, oprsz = simd_oprsz(desc);
2721 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2722 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2726 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2728 intptr_t i, oprsz = simd_oprsz(desc);
2730 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2731 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2735 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2737 intptr_t i, oprsz = simd_oprsz(desc);
2739 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2740 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2744 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2746 intptr_t i, oprsz = simd_oprsz(desc);
2748 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2749 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2754 * Unsigned saturating addition with scalar operand.
2757 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2759 intptr_t i, oprsz = simd_oprsz(desc);
2761 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2762 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2766 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2768 intptr_t i, oprsz = simd_oprsz(desc);
2770 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2771 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2775 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2777 intptr_t i, oprsz = simd_oprsz(desc);
2779 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2780 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2784 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2786 intptr_t i, oprsz = simd_oprsz(desc);
2788 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2789 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2793 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2795 intptr_t i, oprsz = simd_oprsz(desc);
2797 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2798 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2802 /* Two-operand predicated copy immediate with merge. All valid immediates
2803 * can fit within 17 signed bits in the simd_data field.
2805 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2806 uint64_t mm, uint32_t desc)
2808 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2809 uint64_t *d = vd, *n = vn;
2810 uint8_t *pg = vg;
2812 mm = dup_const(MO_8, mm);
2813 for (i = 0; i < opr_sz; i += 1) {
2814 uint64_t nn = n[i];
2815 uint64_t pp = expand_pred_b(pg[H1(i)]);
2816 d[i] = (mm & pp) | (nn & ~pp);
2820 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2821 uint64_t mm, uint32_t desc)
2823 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2824 uint64_t *d = vd, *n = vn;
2825 uint8_t *pg = vg;
2827 mm = dup_const(MO_16, mm);
2828 for (i = 0; i < opr_sz; i += 1) {
2829 uint64_t nn = n[i];
2830 uint64_t pp = expand_pred_h(pg[H1(i)]);
2831 d[i] = (mm & pp) | (nn & ~pp);
2835 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2836 uint64_t mm, uint32_t desc)
2838 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2839 uint64_t *d = vd, *n = vn;
2840 uint8_t *pg = vg;
2842 mm = dup_const(MO_32, mm);
2843 for (i = 0; i < opr_sz; i += 1) {
2844 uint64_t nn = n[i];
2845 uint64_t pp = expand_pred_s(pg[H1(i)]);
2846 d[i] = (mm & pp) | (nn & ~pp);
2850 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2851 uint64_t mm, uint32_t desc)
2853 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2854 uint64_t *d = vd, *n = vn;
2855 uint8_t *pg = vg;
2857 for (i = 0; i < opr_sz; i += 1) {
2858 uint64_t nn = n[i];
2859 d[i] = (pg[H1(i)] & 1 ? mm : nn);
2863 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2865 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2866 uint64_t *d = vd;
2867 uint8_t *pg = vg;
2869 val = dup_const(MO_8, val);
2870 for (i = 0; i < opr_sz; i += 1) {
2871 d[i] = val & expand_pred_b(pg[H1(i)]);
2875 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2877 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2878 uint64_t *d = vd;
2879 uint8_t *pg = vg;
2881 val = dup_const(MO_16, val);
2882 for (i = 0; i < opr_sz; i += 1) {
2883 d[i] = val & expand_pred_h(pg[H1(i)]);
2887 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2889 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2890 uint64_t *d = vd;
2891 uint8_t *pg = vg;
2893 val = dup_const(MO_32, val);
2894 for (i = 0; i < opr_sz; i += 1) {
2895 d[i] = val & expand_pred_s(pg[H1(i)]);
2899 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2901 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2902 uint64_t *d = vd;
2903 uint8_t *pg = vg;
2905 for (i = 0; i < opr_sz; i += 1) {
2906 d[i] = (pg[H1(i)] & 1 ? val : 0);
2910 /* Big-endian hosts need to frob the byte indices. If the copy
2911 * happens to be 8-byte aligned, then no frobbing is necessary.
2913 static void swap_memmove(void *vd, void *vs, size_t n)
2915 uintptr_t d = (uintptr_t)vd;
2916 uintptr_t s = (uintptr_t)vs;
2917 uintptr_t o = (d | s | n) & 7;
2918 size_t i;
2920 #ifndef HOST_WORDS_BIGENDIAN
2921 o = 0;
2922 #endif
2923 switch (o) {
2924 case 0:
2925 memmove(vd, vs, n);
2926 break;
2928 case 4:
2929 if (d < s || d >= s + n) {
2930 for (i = 0; i < n; i += 4) {
2931 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2933 } else {
2934 for (i = n; i > 0; ) {
2935 i -= 4;
2936 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2939 break;
2941 case 2:
2942 case 6:
2943 if (d < s || d >= s + n) {
2944 for (i = 0; i < n; i += 2) {
2945 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2947 } else {
2948 for (i = n; i > 0; ) {
2949 i -= 2;
2950 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2953 break;
2955 default:
2956 if (d < s || d >= s + n) {
2957 for (i = 0; i < n; i++) {
2958 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2960 } else {
2961 for (i = n; i > 0; ) {
2962 i -= 1;
2963 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2966 break;
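/*
 * The H1_4/H1_2/H1 adjustments above remap each 4-, 2- or 1-byte access
 * onto the byte offset that holds the same logical data when the vector
 * is stored as host-big-endian uint64_t units; 8-byte-aligned copies
 * need no adjustment, hence the plain memmove fast path.
 */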
2970 /* Similarly for memset of 0. */
2971 static void swap_memzero(void *vd, size_t n)
2973 uintptr_t d = (uintptr_t)vd;
2974 uintptr_t o = (d | n) & 7;
2975 size_t i;
2977 /* Usually, the first bit of a predicate is set, so N is 0. */
2978 if (likely(n == 0)) {
2979 return;
2982 #ifndef HOST_WORDS_BIGENDIAN
2983 o = 0;
2984 #endif
2985 switch (o) {
2986 case 0:
2987 memset(vd, 0, n);
2988 break;
2990 case 4:
2991 for (i = 0; i < n; i += 4) {
2992 *(uint32_t *)H1_4(d + i) = 0;
2994 break;
2996 case 2:
2997 case 6:
2998 for (i = 0; i < n; i += 2) {
2999 *(uint16_t *)H1_2(d + i) = 0;
3001 break;
3003 default:
3004 for (i = 0; i < n; i++) {
3005 *(uint8_t *)H1(d + i) = 0;
3007 break;
3011 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
3013 intptr_t opr_sz = simd_oprsz(desc);
3014 size_t n_ofs = simd_data(desc);
3015 size_t n_siz = opr_sz - n_ofs;
3017 if (vd != vm) {
3018 swap_memmove(vd, vn + n_ofs, n_siz);
3019 swap_memmove(vd + n_siz, vm, n_ofs);
3020 } else if (vd != vn) {
3021 swap_memmove(vd + n_siz, vd, n_ofs);
3022 swap_memmove(vd, vn + n_ofs, n_siz);
3023 } else {
3024 /* vd == vn == vm. Need temp space. */
3025 ARMVectorReg tmp;
3026 swap_memmove(&tmp, vm, n_ofs);
3027 swap_memmove(vd, vd + n_ofs, n_siz);
3028 memcpy(vd + n_siz, &tmp, n_ofs);
3032 #define DO_INSR(NAME, TYPE, H) \
3033 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
3035 intptr_t opr_sz = simd_oprsz(desc); \
3036 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
3037 *(TYPE *)(vd + H(0)) = val; \
3040 DO_INSR(sve_insr_b, uint8_t, H1)
3041 DO_INSR(sve_insr_h, uint16_t, H1_2)
3042 DO_INSR(sve_insr_s, uint32_t, H1_4)
3043 DO_INSR(sve_insr_d, uint64_t, H1_8)
3045 #undef DO_INSR
3047 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
3049 intptr_t i, j, opr_sz = simd_oprsz(desc);
3050 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
3051 uint64_t f = *(uint64_t *)(vn + i);
3052 uint64_t b = *(uint64_t *)(vn + j);
3053 *(uint64_t *)(vd + i) = bswap64(b);
3054 *(uint64_t *)(vd + j) = bswap64(f);
3058 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
3060 intptr_t i, j, opr_sz = simd_oprsz(desc);
3061 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
3062 uint64_t f = *(uint64_t *)(vn + i);
3063 uint64_t b = *(uint64_t *)(vn + j);
3064 *(uint64_t *)(vd + i) = hswap64(b);
3065 *(uint64_t *)(vd + j) = hswap64(f);
3069 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
3071 intptr_t i, j, opr_sz = simd_oprsz(desc);
3072 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
3073 uint64_t f = *(uint64_t *)(vn + i);
3074 uint64_t b = *(uint64_t *)(vn + j);
3075 *(uint64_t *)(vd + i) = rol64(b, 32);
3076 *(uint64_t *)(vd + j) = rol64(f, 32);
3080 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
3082 intptr_t i, j, opr_sz = simd_oprsz(desc);
3083 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
3084 uint64_t f = *(uint64_t *)(vn + i);
3085 uint64_t b = *(uint64_t *)(vn + j);
3086 *(uint64_t *)(vd + i) = b;
3087 *(uint64_t *)(vd + j) = f;
3091 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
3093 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
3094 bool is_tbx, tb_impl_fn *fn)
3096 ARMVectorReg scratch;
3097 uintptr_t oprsz = simd_oprsz(desc);
3099 if (unlikely(vd == vn)) {
3100 vn = memcpy(&scratch, vn, oprsz);
3103 fn(vd, vn, NULL, vm, oprsz, is_tbx);
3106 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
3107 uint32_t desc, bool is_tbx, tb_impl_fn *fn)
3109 ARMVectorReg scratch;
3110 uintptr_t oprsz = simd_oprsz(desc);
3112 if (unlikely(vd == vn0)) {
3113 vn0 = memcpy(&scratch, vn0, oprsz);
3114 if (vd == vn1) {
3115 vn1 = vn0;
3117 } else if (unlikely(vd == vn1)) {
3118 vn1 = memcpy(&scratch, vn1, oprsz);
3121 fn(vd, vn0, vn1, vm, oprsz, is_tbx);
3124 #define DO_TB(SUFF, TYPE, H) \
3125 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \
3126 void *vm, uintptr_t oprsz, bool is_tbx) \
3128 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \
3129 uintptr_t i, nelem = oprsz / sizeof(TYPE); \
3130 for (i = 0; i < nelem; ++i) { \
3131 TYPE index = indexes[H1(i)], val = 0; \
3132 if (index < nelem) { \
3133 val = tbl0[H(index)]; \
3134 } else { \
3135 index -= nelem; \
3136 if (tbl1 && index < nelem) { \
3137 val = tbl1[H(index)]; \
3138 } else if (is_tbx) { \
3139 continue; \
3142 d[H(i)] = val; \
3145 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3147 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \
3149 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \
3150 void *vm, uint32_t desc) \
3152 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \
3154 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3156 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \
3159 DO_TB(b, uint8_t, H1)
3160 DO_TB(h, uint16_t, H2)
3161 DO_TB(s, uint32_t, H4)
3162 DO_TB(d, uint64_t, H8)
3164 #undef DO_TB
3166 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3167 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
3169 intptr_t i, opr_sz = simd_oprsz(desc); \
3170 TYPED *d = vd; \
3171 TYPES *n = vn; \
3172 ARMVectorReg tmp; \
3173 if (unlikely(vn - vd < opr_sz)) { \
3174 n = memcpy(&tmp, n, opr_sz / 2); \
3176 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
3177 d[HD(i)] = n[HS(i)]; \
3181 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3182 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3183 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3185 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3186 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3187 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3189 #undef DO_UNPK
3191 /* Mask of bits included in the even-numbered predicates of width esz.
3192 * We also use this for expand_bits/compress_bits, and so extend the
3193 * same pattern out to 16-bit units.
3195 static const uint64_t even_bit_esz_masks[5] = {
3196 0x5555555555555555ull,
3197 0x3333333333333333ull,
3198 0x0f0f0f0f0f0f0f0full,
3199 0x00ff00ff00ff00ffull,
3200 0x0000ffff0000ffffull,
3203 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3204 * For N==0, this corresponds to the operation that in qemu/bitops.h
3205 * we call half_shuffle64; this algorithm is from Hacker's Delight,
3206 * section 7-2 Shuffling Bits.
3208 static uint64_t expand_bits(uint64_t x, int n)
3210 int i;
3212 x &= 0xffffffffu;
3213 for (i = 4; i >= n; i--) {
3214 int sh = 1 << i;
3215 x = ((x << sh) | x) & even_bit_esz_masks[i];
3217 return x;
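/*
 * E.g. expand_bits(0b1011, 0) moves bit i of the input to bit 2*i of the
 * result, giving 0b01000101.
 */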
3220 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3221 * For N==0, this corresponds to the operation that in qemu/bitops.h
3222 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3223 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3225 static uint64_t compress_bits(uint64_t x, int n)
3227 int i;
3229 for (i = n; i <= 4; i++) {
3230 int sh = 1 << i;
3231 x &= even_bit_esz_masks[i];
3232 x = (x >> sh) | x;
3234 return x & 0xffffffffu;
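/*
 * This is the inverse: compress_bits(0b01000101, 0) gathers the
 * even-numbered bits back down, recovering 0b1011.
 */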
3237 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3239 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3240 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3241 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3242 int esize = 1 << esz;
3243 uint64_t *d = vd;
3244 intptr_t i;
3246 if (oprsz <= 8) {
3247 uint64_t nn = *(uint64_t *)vn;
3248 uint64_t mm = *(uint64_t *)vm;
3249 int half = 4 * oprsz;
3251 nn = extract64(nn, high * half, half);
3252 mm = extract64(mm, high * half, half);
3253 nn = expand_bits(nn, esz);
3254 mm = expand_bits(mm, esz);
3255 d[0] = nn | (mm << esize);
3256 } else {
3257 ARMPredicateReg tmp;
3259 /* We produce output faster than we consume input.
3260 Therefore we must be mindful of possible overlap. */
3261 if (vd == vn) {
3262 vn = memcpy(&tmp, vn, oprsz);
3263 if (vd == vm) {
3264 vm = vn;
3266 } else if (vd == vm) {
3267 vm = memcpy(&tmp, vm, oprsz);
3269 if (high) {
3270 high = oprsz >> 1;
3273 if ((oprsz & 7) == 0) {
3274 uint32_t *n = vn, *m = vm;
3275 high >>= 2;
3277 for (i = 0; i < oprsz / 8; i++) {
3278 uint64_t nn = n[H4(high + i)];
3279 uint64_t mm = m[H4(high + i)];
3281 nn = expand_bits(nn, esz);
3282 mm = expand_bits(mm, esz);
3283 d[i] = nn | (mm << esize);
3285 } else {
3286 uint8_t *n = vn, *m = vm;
3287 uint16_t *d16 = vd;
3289 for (i = 0; i < oprsz / 2; i++) {
3290 uint16_t nn = n[H1(high + i)];
3291 uint16_t mm = m[H1(high + i)];
3293 nn = expand_bits(nn, esz);
3294 mm = expand_bits(mm, esz);
3295 d16[H2(i)] = nn | (mm << esize);
3301 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3303 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3304 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3305 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3306 uint64_t *d = vd, *n = vn, *m = vm;
3307 uint64_t l, h;
3308 intptr_t i;
3310 if (oprsz <= 8) {
3311 l = compress_bits(n[0] >> odd, esz);
3312 h = compress_bits(m[0] >> odd, esz);
3313 d[0] = l | (h << (4 * oprsz));
3314 } else {
3315 ARMPredicateReg tmp_m;
3316 intptr_t oprsz_16 = oprsz / 16;
3318 if ((vm - vd) < (uintptr_t)oprsz) {
3319 m = memcpy(&tmp_m, vm, oprsz);
3322 for (i = 0; i < oprsz_16; i++) {
3323 l = n[2 * i + 0];
3324 h = n[2 * i + 1];
3325 l = compress_bits(l >> odd, esz);
3326 h = compress_bits(h >> odd, esz);
3327 d[i] = l | (h << 32);
3331 * For a VL that is not a multiple of 512, the results from M do not
3332 * align nicely with the uint64_t for D. Put the aligned results
3333 * from M into TMP_M and then copy it into place afterward.
3335 if (oprsz & 15) {
3336 int final_shift = (oprsz & 15) * 2;
3338 l = n[2 * i + 0];
3339 h = n[2 * i + 1];
3340 l = compress_bits(l >> odd, esz);
3341 h = compress_bits(h >> odd, esz);
3342 d[i] = l | (h << final_shift);
3344 for (i = 0; i < oprsz_16; i++) {
3345 l = m[2 * i + 0];
3346 h = m[2 * i + 1];
3347 l = compress_bits(l >> odd, esz);
3348 h = compress_bits(h >> odd, esz);
3349 tmp_m.p[i] = l | (h << 32);
3351 l = m[2 * i + 0];
3352 h = m[2 * i + 1];
3353 l = compress_bits(l >> odd, esz);
3354 h = compress_bits(h >> odd, esz);
3355 tmp_m.p[i] = l | (h << final_shift);
3357 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3358 } else {
3359 for (i = 0; i < oprsz_16; i++) {
3360 l = m[2 * i + 0];
3361 h = m[2 * i + 1];
3362 l = compress_bits(l >> odd, esz);
3363 h = compress_bits(h >> odd, esz);
3364 d[oprsz_16 + i] = l | (h << 32);
3370 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3372 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3373 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3374 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3375 uint64_t *d = vd, *n = vn, *m = vm;
3376 uint64_t mask;
3377 int shr, shl;
3378 intptr_t i;
3380 shl = 1 << esz;
3381 shr = 0;
3382 mask = even_bit_esz_masks[esz];
3383 if (odd) {
3384 mask <<= shl;
3385 shr = shl;
3386 shl = 0;
3389 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3390 uint64_t nn = (n[i] & mask) >> shr;
3391 uint64_t mm = (m[i] & mask) << shl;
3392 d[i] = nn + mm;
3396 /* Reverse units of 2**N bits. */
3397 static uint64_t reverse_bits_64(uint64_t x, int n)
3399 int i, sh;
3401 x = bswap64(x);
3402 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3403 uint64_t mask = even_bit_esz_masks[i];
3404 x = ((x & mask) << sh) | ((x >> sh) & mask);
3406 return x;
3409 static uint8_t reverse_bits_8(uint8_t x, int n)
3411 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3412 int i, sh;
3414 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3415 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3417 return x;
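/*
 * Illustrative sketch, not used by the helpers: a unit-at-a-time reference
 * for reverse_bits_64 above, which reverses the order of the 2**n-bit
 * units of a 64-bit word via bswap64 plus progressively narrower swaps.
 * The name reverse_units_ref is hypothetical.
 */
static inline uint64_t reverse_units_ref(uint64_t x, int n)
{
    uint64_t d = 0;
    int esize = 1 << n;
    int units = 64 / esize;
    int i;

    for (i = 0; i < units; i++) {
        d |= extract64(x, i * esize, esize) << ((units - 1 - i) * esize);
    }
    return d;
}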
3420 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3422 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3423 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3424 intptr_t i, oprsz_2 = oprsz / 2;
3426 if (oprsz <= 8) {
3427 uint64_t l = *(uint64_t *)vn;
3428 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3429 *(uint64_t *)vd = l;
3430 } else if ((oprsz & 15) == 0) {
3431 for (i = 0; i < oprsz_2; i += 8) {
3432 intptr_t ih = oprsz - 8 - i;
3433 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3434 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3435 *(uint64_t *)(vd + i) = h;
3436 *(uint64_t *)(vd + ih) = l;
3438 } else {
3439 for (i = 0; i < oprsz_2; i += 1) {
3440 intptr_t il = H1(i);
3441 intptr_t ih = H1(oprsz - 1 - i);
3442 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3443 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3444 *(uint8_t *)(vd + il) = h;
3445 *(uint8_t *)(vd + ih) = l;
3450 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3452 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3453 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3454 uint64_t *d = vd;
3455 intptr_t i;
3457 if (oprsz <= 8) {
3458 uint64_t nn = *(uint64_t *)vn;
3459 int half = 4 * oprsz;
3461 nn = extract64(nn, high * half, half);
3462 nn = expand_bits(nn, 0);
3463 d[0] = nn;
3464 } else {
3465 ARMPredicateReg tmp_n;
3467 /* We produce output faster than we consume input.
3468 Therefore we must be mindful of possible overlap. */
3469 if ((vn - vd) < (uintptr_t)oprsz) {
3470 vn = memcpy(&tmp_n, vn, oprsz);
3472 if (high) {
3473 high = oprsz >> 1;
3476 if ((oprsz & 7) == 0) {
3477 uint32_t *n = vn;
3478 high >>= 2;
3480 for (i = 0; i < oprsz / 8; i++) {
3481 uint64_t nn = n[H4(high + i)];
3482 d[i] = expand_bits(nn, 0);
3484 } else {
3485 uint16_t *d16 = vd;
3486 uint8_t *n = vn;
3488 for (i = 0; i < oprsz / 2; i++) {
3489 uint16_t nn = n[H1(high + i)];
3490 d16[H2(i)] = expand_bits(nn, 0);
3496 #define DO_ZIP(NAME, TYPE, H) \
3497 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3499 intptr_t oprsz = simd_oprsz(desc); \
3500 intptr_t i, oprsz_2 = oprsz / 2; \
3501 ARMVectorReg tmp_n, tmp_m; \
3502 /* We produce output faster than we consume input. \
3503 Therefore we must be mindful of possible overlap. */ \
3504 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
3505 vn = memcpy(&tmp_n, vn, oprsz_2); \
3507 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3508 vm = memcpy(&tmp_m, vm, oprsz_2); \
3510 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3511 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
3512 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
3514 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3515 memset(vd + oprsz - 16, 0, 16); \
3519 DO_ZIP(sve_zip_b, uint8_t, H1)
3520 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3521 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3522 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3523 DO_ZIP(sve2_zip_q, Int128, )
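/*
 * For example, sve_zip_b with oprsz == 16 yields
 * d = { n[0], m[0], n[1], m[1], ..., n[7], m[7] }: each helper interleaves
 * the low oprsz/2 bytes of its two sources element by element.
 */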
3525 #define DO_UZP(NAME, TYPE, H) \
3526 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3528 intptr_t oprsz = simd_oprsz(desc); \
3529 intptr_t odd_ofs = simd_data(desc); \
3530 intptr_t i, p; \
3531 ARMVectorReg tmp_m; \
3532 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3533 vm = memcpy(&tmp_m, vm, oprsz); \
3535 i = 0, p = odd_ofs; \
3536 do { \
3537 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \
3538 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
3539 } while (p < oprsz); \
3540 p -= oprsz; \
3541 do { \
3542 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \
3543 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
3544 } while (p < oprsz); \
3545 tcg_debug_assert(i == oprsz); \
3548 DO_UZP(sve_uzp_b, uint8_t, H1)
3549 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3550 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3551 DO_UZP(sve_uzp_d, uint64_t, H1_8)
3552 DO_UZP(sve2_uzp_q, Int128, )
3554 #define DO_TRN(NAME, TYPE, H) \
3555 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3557 intptr_t oprsz = simd_oprsz(desc); \
3558 intptr_t odd_ofs = simd_data(desc); \
3559 intptr_t i; \
3560 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
3561 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
3562 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
3563 *(TYPE *)(vd + H(i + 0)) = ae; \
3564 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
3566 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3567 memset(vd + oprsz - 16, 0, 16); \
3571 DO_TRN(sve_trn_b, uint8_t, H1)
3572 DO_TRN(sve_trn_h, uint16_t, H1_2)
3573 DO_TRN(sve_trn_s, uint32_t, H1_4)
3574 DO_TRN(sve_trn_d, uint64_t, H1_8)
3575 DO_TRN(sve2_trn_q, Int128, )
3577 #undef DO_ZIP
3578 #undef DO_UZP
3579 #undef DO_TRN
3581 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3583 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3584 uint32_t *d = vd, *n = vn;
3585 uint8_t *pg = vg;
3587 for (i = j = 0; i < opr_sz; i++) {
3588 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3589 d[H4(j)] = n[H4(i)];
3590 j++;
3593 for (; j < opr_sz; j++) {
3594 d[H4(j)] = 0;
3598 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3600 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3601 uint64_t *d = vd, *n = vn;
3602 uint8_t *pg = vg;
3604 for (i = j = 0; i < opr_sz; i++) {
3605 if (pg[H1(i)] & 1) {
3606 d[j] = n[i];
3607 j++;
3610 for (; j < opr_sz; j++) {
3611 d[j] = 0;
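/*
 * For example, with four word elements and only elements 1 and 3 active in
 * the governing predicate, sve_compact_s produces d = { n[1], n[3], 0, 0 }:
 * active elements are packed down towards element 0 and the rest of the
 * destination is zeroed.
 */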
3615 /* Similar to the ARM LastActiveElement pseudocode function, except the
3616 * result is multiplied by the element size. This includes the not found
3617 * indication; e.g. not found for esz=3 is -8.
3619 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3621 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3622 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3624 return last_active_element(vg, words, esz);
3627 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3629 intptr_t opr_sz = simd_oprsz(desc) / 8;
3630 int esz = simd_data(desc);
3631 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3632 intptr_t i, first_i, last_i;
3633 ARMVectorReg tmp;
3635 first_i = last_i = 0;
3636 first_g = last_g = 0;
3638 /* Find the extent of the active elements within VG. */
3639 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3640 pg = *(uint64_t *)(vg + i) & mask;
3641 if (pg) {
3642 if (last_g == 0) {
3643 last_g = pg;
3644 last_i = i;
3646 first_g = pg;
3647 first_i = i;
3651 len = 0;
3652 if (first_g != 0) {
3653 first_i = first_i * 8 + ctz64(first_g);
3654 last_i = last_i * 8 + 63 - clz64(last_g);
3655 len = last_i - first_i + (1 << esz);
3656 if (vd == vm) {
3657 vm = memcpy(&tmp, vm, opr_sz * 8);
3659 swap_memmove(vd, vn + first_i, len);
3661 swap_memmove(vd + len, vm, opr_sz * 8 - len);
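/*
 * In other words, SPLICE copies the contiguous span of VN running from the
 * first through the last active element (LEN bytes) to the bottom of VD,
 * then fills the remainder of VD from the start of VM.
 */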
3664 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3665 void *vg, uint32_t desc)
3667 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3668 uint64_t *d = vd, *n = vn, *m = vm;
3669 uint8_t *pg = vg;
3671 for (i = 0; i < opr_sz; i += 1) {
3672 uint64_t nn = n[i], mm = m[i];
3673 uint64_t pp = expand_pred_b(pg[H1(i)]);
3674 d[i] = (nn & pp) | (mm & ~pp);
3678 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3679 void *vg, uint32_t desc)
3681 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3682 uint64_t *d = vd, *n = vn, *m = vm;
3683 uint8_t *pg = vg;
3685 for (i = 0; i < opr_sz; i += 1) {
3686 uint64_t nn = n[i], mm = m[i];
3687 uint64_t pp = expand_pred_h(pg[H1(i)]);
3688 d[i] = (nn & pp) | (mm & ~pp);
3692 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3693 void *vg, uint32_t desc)
3695 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3696 uint64_t *d = vd, *n = vn, *m = vm;
3697 uint8_t *pg = vg;
3699 for (i = 0; i < opr_sz; i += 1) {
3700 uint64_t nn = n[i], mm = m[i];
3701 uint64_t pp = expand_pred_s(pg[H1(i)]);
3702 d[i] = (nn & pp) | (mm & ~pp);
3706 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3707 void *vg, uint32_t desc)
3709 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3710 uint64_t *d = vd, *n = vn, *m = vm;
3711 uint8_t *pg = vg;
3713 for (i = 0; i < opr_sz; i += 1) {
3714 uint64_t nn = n[i], mm = m[i];
3715 d[i] = (pg[H1(i)] & 1 ? nn : mm);
3719 /* Two-operand comparison controlled by a predicate.
3720 * ??? It is tempting to expand this inline
3721 * with x86 instructions, e.g.
3723 * vcmpeqw zm, zn, %ymm0
3724 * vpmovmskb %ymm0, %eax
3725 * and $0x5555, %eax
3726 * and pg, %eax
3728 * or even aarch64, e.g.
3730 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3731 * cmeq v0.8h, zn, zm
3732 * and v0.8h, v0.8h, mask
3733 * addv h0, v0.8h
3734 * and v0.8b, pg
3736 * However, coming up with an abstraction that allows vector inputs and
3737 * a scalar output, and also handles the byte-ordering of sub-uint64_t
3738 * scalar outputs, is tricky.
3740 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
3741 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3743 intptr_t opr_sz = simd_oprsz(desc); \
3744 uint32_t flags = PREDTEST_INIT; \
3745 intptr_t i = opr_sz; \
3746 do { \
3747 uint64_t out = 0, pg; \
3748 do { \
3749 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3750 TYPE nn = *(TYPE *)(vn + H(i)); \
3751 TYPE mm = *(TYPE *)(vm + H(i)); \
3752 out |= nn OP mm; \
3753 } while (i & 63); \
3754 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3755 out &= pg; \
3756 *(uint64_t *)(vd + (i >> 3)) = out; \
3757 flags = iter_predtest_bwd(out, pg, flags); \
3758 } while (i > 0); \
3759 return flags; \
3762 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3763 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3764 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3765 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3766 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3767 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3768 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3769 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
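/*
 * Illustrative sketch (little-endian view, ignoring the H() adjustment) of
 * what one iteration of the outer loop above computes for the 32-bit EQ
 * case: a 64-byte data chunk maps to one 64-bit predicate word, each
 * element's result is a single bit at the position of the element's first
 * byte, and MASK (0x1111... for words) keeps exactly those bits.  The name
 * cmpeq_ppzz_s_word_ref is hypothetical and the function is not used.
 */
static inline uint64_t cmpeq_ppzz_s_word_ref(const uint32_t *n,
                                             const uint32_t *m, uint64_t pg)
{
    uint64_t out = 0;
    int e;

    for (e = 0; e < 16; e++) {          /* 16 word elements per 64 bytes */
        out |= (uint64_t)(n[e] == m[e]) << (e * 4);
    }
    return out & pg & 0x1111111111111111ull;
}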
3771 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
3772 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3773 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3774 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3776 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
3777 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3778 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3779 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3781 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
3782 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3783 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3784 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3786 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
3787 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3788 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3789 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3791 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
3792 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3793 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3794 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3796 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
3797 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3798 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3799 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3801 #undef DO_CMP_PPZZ_B
3802 #undef DO_CMP_PPZZ_H
3803 #undef DO_CMP_PPZZ_S
3804 #undef DO_CMP_PPZZ_D
3805 #undef DO_CMP_PPZZ
3807 /* Similar, but the second source is "wide". */
3808 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
3809 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3811 intptr_t opr_sz = simd_oprsz(desc); \
3812 uint32_t flags = PREDTEST_INIT; \
3813 intptr_t i = opr_sz; \
3814 do { \
3815 uint64_t out = 0, pg; \
3816 do { \
3817 TYPEW mm = *(TYPEW *)(vm + i - 8); \
3818 do { \
3819 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3820 TYPE nn = *(TYPE *)(vn + H(i)); \
3821 out |= nn OP mm; \
3822 } while (i & 7); \
3823 } while (i & 63); \
3824 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3825 out &= pg; \
3826 *(uint64_t *)(vd + (i >> 3)) = out; \
3827 flags = iter_predtest_bwd(out, pg, flags); \
3828 } while (i > 0); \
3829 return flags; \
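/*
 * In the wide form each 64-bit element of VM is compared against every
 * narrow element of VN within the same 8-byte group, which is what the
 * extra (i & 7) inner loop above implements.
 */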
3832 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3833 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
3834 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3835 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3836 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3837 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3839 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
3840 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3841 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3843 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
3844 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3845 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3847 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
3848 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
3849 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
3851 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
3852 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
3853 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
3855 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
3856 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3857 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3859 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
3860 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3861 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3863 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
3864 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
3865 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
3867 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
3868 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
3869 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
3871 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
3872 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3873 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3875 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
3876 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3877 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3879 #undef DO_CMP_PPZW_B
3880 #undef DO_CMP_PPZW_H
3881 #undef DO_CMP_PPZW_S
3882 #undef DO_CMP_PPZW
3884 /* Similar, but the second source is immediate. */
3885 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
3886 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
3888 intptr_t opr_sz = simd_oprsz(desc); \
3889 uint32_t flags = PREDTEST_INIT; \
3890 TYPE mm = simd_data(desc); \
3891 intptr_t i = opr_sz; \
3892 do { \
3893 uint64_t out = 0, pg; \
3894 do { \
3895 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3896 TYPE nn = *(TYPE *)(vn + H(i)); \
3897 out |= nn OP mm; \
3898 } while (i & 63); \
3899 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3900 out &= pg; \
3901 *(uint64_t *)(vd + (i >> 3)) = out; \
3902 flags = iter_predtest_bwd(out, pg, flags); \
3903 } while (i > 0); \
3904 return flags; \
3907 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3908 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3909 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3910 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3911 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3912 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3913 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3914 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3916 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
3917 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3918 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3919 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3921 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
3922 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3923 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3924 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3926 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
3927 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3928 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3929 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3931 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
3932 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3933 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3934 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3936 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
3937 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3938 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3939 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3941 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
3942 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3943 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3944 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3946 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
3947 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3948 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3949 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3951 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
3952 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3953 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3954 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3956 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
3957 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3958 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3959 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3961 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
3962 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3963 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3964 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3966 #undef DO_CMP_PPZI_B
3967 #undef DO_CMP_PPZI_H
3968 #undef DO_CMP_PPZI_S
3969 #undef DO_CMP_PPZI_D
3970 #undef DO_CMP_PPZI
3972 /* Similar to the ARM LastActive pseudocode function. */
3973 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3975 intptr_t i;
3977 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3978 uint64_t pg = *(uint64_t *)(vg + i);
3979 if (pg) {
3980 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3983 return 0;
3986 /* Compute a mask into RETB that is true for all G, up to and including
3987 * (if after) or excluding (if !after) the first G & N.
3988 * Return true if BRK found.
3990 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3991 bool brk, bool after)
3993 uint64_t b;
3995 if (brk) {
3996 b = 0;
3997 } else if ((g & n) == 0) {
3998 /* For all G, no N are set; break not found. */
3999 b = g;
4000 } else {
4001 /* Break somewhere in N. Locate it. */
4002 b = g & n; /* guard true, pred true */
4003 b = b & -b; /* first such */
4004 if (after) {
4005 b = b | (b - 1); /* break after same */
4006 } else {
4007 b = b - 1; /* break before same */
4009 brk = true;
4012 *retb = b;
4013 return brk;
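/*
 * Worked example: with g = 0xff and n = 0x10, g & n = 0x10 and b & -b
 * isolates that first active bit.  "Break after" keeps it and everything
 * below it, b | (b - 1) = 0x1f; "break before" keeps only the bits below
 * it, b - 1 = 0x0f.  Once a break has been found, every later word yields
 * b = 0, which is how the break propagates across the predicate.
 */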
4016 /* Compute a zeroing BRK. */
4017 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
4018 intptr_t oprsz, bool after)
4020 bool brk = false;
4021 intptr_t i;
4023 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
4024 uint64_t this_b, this_g = g[i];
4026 brk = compute_brk(&this_b, n[i], this_g, brk, after);
4027 d[i] = this_b & this_g;
4031 /* Likewise, but also compute flags. */
4032 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
4033 intptr_t oprsz, bool after)
4035 uint32_t flags = PREDTEST_INIT;
4036 bool brk = false;
4037 intptr_t i;
4039 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
4040 uint64_t this_b, this_d, this_g = g[i];
4042 brk = compute_brk(&this_b, n[i], this_g, brk, after);
4043 d[i] = this_d = this_b & this_g;
4044 flags = iter_predtest_fwd(this_d, this_g, flags);
4046 return flags;
4049 /* Compute a merging BRK. */
4050 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
4051 intptr_t oprsz, bool after)
4053 bool brk = false;
4054 intptr_t i;
4056 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
4057 uint64_t this_b, this_g = g[i];
4059 brk = compute_brk(&this_b, n[i], this_g, brk, after);
4060 d[i] = (this_b & this_g) | (d[i] & ~this_g);
4064 /* Likewise, but also compute flags. */
4065 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
4066 intptr_t oprsz, bool after)
4068 uint32_t flags = PREDTEST_INIT;
4069 bool brk = false;
4070 intptr_t i;
4072 for (i = 0; i < oprsz / 8; ++i) {
4073 uint64_t this_b, this_d = d[i], this_g = g[i];
4075 brk = compute_brk(&this_b, n[i], this_g, brk, after);
4076 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
4077 flags = iter_predtest_fwd(this_d, this_g, flags);
4079 return flags;
4082 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
4084 /* It is quicker to zero the whole predicate than loop on OPRSZ.
4085 * The compiler should turn this into 4 64-bit integer stores.
4087 memset(d, 0, sizeof(ARMPredicateReg));
4088 return PREDTEST_INIT;
4091 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
4092 uint32_t pred_desc)
4094 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4095 if (last_active_pred(vn, vg, oprsz)) {
4096 compute_brk_z(vd, vm, vg, oprsz, true);
4097 } else {
4098 do_zero(vd, oprsz);
4102 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
4103 uint32_t pred_desc)
4105 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4106 if (last_active_pred(vn, vg, oprsz)) {
4107 return compute_brks_z(vd, vm, vg, oprsz, true);
4108 } else {
4109 return do_zero(vd, oprsz);
4113 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
4114 uint32_t pred_desc)
4116 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4117 if (last_active_pred(vn, vg, oprsz)) {
4118 compute_brk_z(vd, vm, vg, oprsz, false);
4119 } else {
4120 do_zero(vd, oprsz);
4124 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4125 uint32_t pred_desc)
4127 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4128 if (last_active_pred(vn, vg, oprsz)) {
4129 return compute_brks_z(vd, vm, vg, oprsz, false);
4130 } else {
4131 return do_zero(vd, oprsz);
4135 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4137 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4138 compute_brk_z(vd, vn, vg, oprsz, true);
4141 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4143 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4144 return compute_brks_z(vd, vn, vg, oprsz, true);
4147 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4149 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4150 compute_brk_z(vd, vn, vg, oprsz, false);
4153 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4155 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4156 return compute_brks_z(vd, vn, vg, oprsz, false);
4159 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4161 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4162 compute_brk_m(vd, vn, vg, oprsz, true);
4165 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4167 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4168 return compute_brks_m(vd, vn, vg, oprsz, true);
4171 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4173 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4174 compute_brk_m(vd, vn, vg, oprsz, false);
4177 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4179 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4180 return compute_brks_m(vd, vn, vg, oprsz, false);
4183 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4185 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4186 if (!last_active_pred(vn, vg, oprsz)) {
4187 do_zero(vd, oprsz);
4191 /* As if PredTest(Ones(PL), D, esz). */
4192 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4193 uint64_t esz_mask)
4195 uint32_t flags = PREDTEST_INIT;
4196 intptr_t i;
4198 for (i = 0; i < oprsz / 8; i++) {
4199 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4201 if (oprsz & 7) {
4202 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4203 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4205 return flags;
4208 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4210 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4211 if (last_active_pred(vn, vg, oprsz)) {
4212 return predtest_ones(vd, oprsz, -1);
4213 } else {
4214 return do_zero(vd, oprsz);
4218 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4220 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4221 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4222 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4223 intptr_t i;
4225 for (i = 0; i < words; ++i) {
4226 uint64_t t = n[i] & g[i] & mask;
4227 sum += ctpop64(t);
4229 return sum;
4232 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4234 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4235 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4236 uint64_t esz_mask = pred_esz_masks[esz];
4237 ARMPredicateReg *d = vd;
4238 uint32_t flags;
4239 intptr_t i;
4241 /* Begin with a zero predicate register. */
4242 flags = do_zero(d, oprsz);
4243 if (count == 0) {
4244 return flags;
4247 /* Set all of the requested bits. */
4248 for (i = 0; i < count / 64; ++i) {
4249 d->p[i] = esz_mask;
4251 if (count & 63) {
4252 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4255 return predtest_ones(d, oprsz, esz_mask);
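/*
 * COUNT is in predicate bits, i.e. bytes of vector, as the masking above
 * implies.  For example, a WHILE producing 5 active halfword elements
 * passes count == 10, giving d->p[0] = 0x3ff & 0x5555... = 0x155:
 * predicate bits 0, 2, 4, 6 and 8 set, one per active element.
 */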
4258 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4260 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4261 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4262 uint64_t esz_mask = pred_esz_masks[esz];
4263 ARMPredicateReg *d = vd;
4264 intptr_t i, invcount, oprbits;
4265 uint64_t bits;
4267 if (count == 0) {
4268 return do_zero(d, oprsz);
4271 oprbits = oprsz * 8;
4272 tcg_debug_assert(count <= oprbits);
4274 bits = esz_mask;
4275 if (oprbits & 63) {
4276 bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4279 invcount = oprbits - count;
4280 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4281 d->p[i] = bits;
4282 bits = esz_mask;
4285 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4287 while (--i >= 0) {
4288 d->p[i] = 0;
4291 return predtest_ones(d, oprsz, esz_mask);
4294 /* Recursive reduction using a binary function;
4295 * cf. the ARM ARM function ReducePredicated.
4297 * While it would be possible to write this without the DATA temporary,
4298 * it is much simpler to process the predicate register this way.
4299 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4300 * little to gain with a more complex non-recursive form.
4302 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
4303 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4305 if (n == 1) { \
4306 return *data; \
4307 } else { \
4308 uintptr_t half = n / 2; \
4309 TYPE lo = NAME##_reduce(data, status, half); \
4310 TYPE hi = NAME##_reduce(data + half, status, half); \
4311 return TYPE##_##FUNC(lo, hi, status); \
4314 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
4316 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
4317 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
4318 for (i = 0; i < oprsz; ) { \
4319 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4320 do { \
4321 TYPE nn = *(TYPE *)(vn + H(i)); \
4322 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
4323 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
4324 } while (i & 15); \
4326 for (; i < maxsz; i += sizeof(TYPE)) { \
4327 *(TYPE *)((void *)data + i) = IDENT; \
4329 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
4332 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
4333 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
4334 DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)
4336 /* Identity is floatN_default_nan, without the function call. */
4337 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
4338 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
4339 DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)
4341 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
4342 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
4343 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)
4345 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
4346 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
4347 DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)
4349 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
4350 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
4351 DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))
4353 #undef DO_REDUCE
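/*
 * The recursion above reduces pairwise, so for eight elements the result
 * is ((d0 op d1) op (d2 op d3)) op ((d4 op d5) op (d6 op d7)) rather than
 * a strict left-to-right accumulation; inactive lanes and the tail up to
 * MAXSZ are filled with IDENT so that they do not disturb the result.
 */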
4355 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4356 void *status, uint32_t desc)
4358 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4359 float16 result = nn;
4361 do {
4362 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4363 do {
4364 if (pg & 1) {
4365 float16 mm = *(float16 *)(vm + H1_2(i));
4366 result = float16_add(result, mm, status);
4368 i += sizeof(float16), pg >>= sizeof(float16);
4369 } while (i & 15);
4370 } while (i < opr_sz);
4372 return result;
4375 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4376 void *status, uint32_t desc)
4378 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4379 float32 result = nn;
4381 do {
4382 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4383 do {
4384 if (pg & 1) {
4385 float32 mm = *(float32 *)(vm + H1_2(i));
4386 result = float32_add(result, mm, status);
4388 i += sizeof(float32), pg >>= sizeof(float32);
4389 } while (i & 15);
4390 } while (i < opr_sz);
4392 return result;
4395 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4396 void *status, uint32_t desc)
4398 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4399 uint64_t *m = vm;
4400 uint8_t *pg = vg;
4402 for (i = 0; i < opr_sz; i++) {
4403 if (pg[H1(i)] & 1) {
4404 nn = float64_add(nn, m[i], status);
4408 return nn;
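/*
 * Unlike the FADDV tree reduction above, the FADDA helpers accumulate
 * strictly in element order starting from the scalar NN, so the rounding
 * of each intermediate sum is visible in the result.
 */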
4411 /* Fully general three-operand expander, controlled by a predicate,
4412 * with the extra float_status parameter.
4414 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
4415 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4416 void *status, uint32_t desc) \
4418 intptr_t i = simd_oprsz(desc); \
4419 uint64_t *g = vg; \
4420 do { \
4421 uint64_t pg = g[(i - 1) >> 6]; \
4422 do { \
4423 i -= sizeof(TYPE); \
4424 if (likely((pg >> (i & 63)) & 1)) { \
4425 TYPE nn = *(TYPE *)(vn + H(i)); \
4426 TYPE mm = *(TYPE *)(vm + H(i)); \
4427 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4429 } while (i & 63); \
4430 } while (i != 0); \
4433 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4434 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4435 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
4437 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4438 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4439 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
4441 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4442 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4443 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
4445 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4446 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4447 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
4449 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4450 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4451 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
4453 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4454 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4455 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
4457 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4458 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4459 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
4461 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4462 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4463 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4465 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4467 return float16_abs(float16_sub(a, b, s));
4470 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4472 return float32_abs(float32_sub(a, b, s));
4475 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4477 return float64_abs(float64_sub(a, b, s));
4480 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4481 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4482 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4484 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4486 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4487 return float64_scalbn(a, b_int, s);
4490 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4491 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4492 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
4494 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4495 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4496 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
4498 #undef DO_ZPZZ_FP
4500 /* Three-operand expander, with one scalar operand, controlled by
4501 * a predicate, with the extra float_status parameter.
4503 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4504 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
4505 void *status, uint32_t desc) \
4507 intptr_t i = simd_oprsz(desc); \
4508 uint64_t *g = vg; \
4509 TYPE mm = scalar; \
4510 do { \
4511 uint64_t pg = g[(i - 1) >> 6]; \
4512 do { \
4513 i -= sizeof(TYPE); \
4514 if (likely((pg >> (i & 63)) & 1)) { \
4515 TYPE nn = *(TYPE *)(vn + H(i)); \
4516 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4518 } while (i & 63); \
4519 } while (i != 0); \
4522 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4523 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4524 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
4526 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4527 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4528 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
4530 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4531 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4532 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4534 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4536 return float16_sub(b, a, s);
4539 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4541 return float32_sub(b, a, s);
4544 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4546 return float64_sub(b, a, s);
4549 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4550 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4551 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
4553 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4554 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4555 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
4557 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4558 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4559 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
4561 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4562 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4563 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
4565 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4566 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4567 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4569 /* Fully general two-operand expander, controlled by a predicate,
4570 * with the extra float_status parameter.
4572 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
4573 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
4575 intptr_t i = simd_oprsz(desc); \
4576 uint64_t *g = vg; \
4577 do { \
4578 uint64_t pg = g[(i - 1) >> 6]; \
4579 do { \
4580 i -= sizeof(TYPE); \
4581 if (likely((pg >> (i & 63)) & 1)) { \
4582 TYPE nn = *(TYPE *)(vn + H(i)); \
4583 *(TYPE *)(vd + H(i)) = OP(nn, status); \
4585 } while (i & 63); \
4586 } while (i != 0); \
4589 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
4590 * FZ16. When converting from fp16, this affects flushing input denormals;
4591 * when converting to fp16, this affects flushing output denormals.
4593 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4595 bool save = get_flush_inputs_to_zero(fpst);
4596 float32 ret;
4598 set_flush_inputs_to_zero(false, fpst);
4599 ret = float16_to_float32(f, true, fpst);
4600 set_flush_inputs_to_zero(save, fpst);
4601 return ret;
4604 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4606 bool save = get_flush_inputs_to_zero(fpst);
4607 float64 ret;
4609 set_flush_inputs_to_zero(false, fpst);
4610 ret = float16_to_float64(f, true, fpst);
4611 set_flush_inputs_to_zero(save, fpst);
4612 return ret;
4615 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4617 bool save = get_flush_to_zero(fpst);
4618 float16 ret;
4620 set_flush_to_zero(false, fpst);
4621 ret = float32_to_float16(f, true, fpst);
4622 set_flush_to_zero(save, fpst);
4623 return ret;
4626 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4628 bool save = get_flush_to_zero(fpst);
4629 float16 ret;
4631 set_flush_to_zero(false, fpst);
4632 ret = float64_to_float16(f, true, fpst);
4633 set_flush_to_zero(save, fpst);
4634 return ret;
4637 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4639 if (float16_is_any_nan(f)) {
4640 float_raise(float_flag_invalid, s);
4641 return 0;
4643 return float16_to_int16_round_to_zero(f, s);
4646 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4648 if (float16_is_any_nan(f)) {
4649 float_raise(float_flag_invalid, s);
4650 return 0;
4652 return float16_to_int64_round_to_zero(f, s);
4655 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4657 if (float32_is_any_nan(f)) {
4658 float_raise(float_flag_invalid, s);
4659 return 0;
4661 return float32_to_int64_round_to_zero(f, s);
4664 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4666 if (float64_is_any_nan(f)) {
4667 float_raise(float_flag_invalid, s);
4668 return 0;
4670 return float64_to_int64_round_to_zero(f, s);
4673 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4675 if (float16_is_any_nan(f)) {
4676 float_raise(float_flag_invalid, s);
4677 return 0;
4679 return float16_to_uint16_round_to_zero(f, s);
4682 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4684 if (float16_is_any_nan(f)) {
4685 float_raise(float_flag_invalid, s);
4686 return 0;
4688 return float16_to_uint64_round_to_zero(f, s);
4691 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4693 if (float32_is_any_nan(f)) {
4694 float_raise(float_flag_invalid, s);
4695 return 0;
4697 return float32_to_uint64_round_to_zero(f, s);
4700 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4702 if (float64_is_any_nan(f)) {
4703 float_raise(float_flag_invalid, s);
4704 return 0;
4706 return float64_to_uint64_round_to_zero(f, s);
4709 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4710 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4711 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16)
4712 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4713 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4714 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4715 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4717 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4718 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4719 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4720 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4721 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4722 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4723 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4725 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4726 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4727 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4728 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4729 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4730 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4731 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4733 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4734 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4735 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4737 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4738 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4739 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4741 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4742 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4743 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4745 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4746 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4747 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
4749 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4750 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4751 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4752 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
4753 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
4754 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
4755 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
4757 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4758 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4759 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4760 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
4761 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
4762 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
4763 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4765 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
4767 /* Extract frac to the top of the uint32_t. */
4768 uint32_t frac = (uint32_t)a << (16 + 6);
4769 int16_t exp = extract32(a, 10, 5);
4771 if (unlikely(exp == 0)) {
4772 if (frac != 0) {
4773 if (!get_flush_inputs_to_zero(s)) {
4774 /* denormal: bias - fractional_zeros */
4775 return -15 - clz32(frac);
4777 /* flush to zero */
4778 float_raise(float_flag_input_denormal, s);
4780 } else if (unlikely(exp == 0x1f)) {
4781 if (frac == 0) {
4782 return INT16_MAX; /* infinity */
4784 } else {
4785 /* normal: exp - bias */
4786 return exp - 15;
4788 /* nan or zero */
4789 float_raise(float_flag_invalid, s);
4790 return INT16_MIN;
4793 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
4795 /* Extract frac to the top of the uint32_t. */
4796 uint32_t frac = a << 9;
4797 int32_t exp = extract32(a, 23, 8);
4799 if (unlikely(exp == 0)) {
4800 if (frac != 0) {
4801 if (!get_flush_inputs_to_zero(s)) {
4802 /* denormal: bias - fractional_zeros */
4803 return -127 - clz32(frac);
4805 /* flush to zero */
4806 float_raise(float_flag_input_denormal, s);
4808 } else if (unlikely(exp == 0xff)) {
4809 if (frac == 0) {
4810 return INT32_MAX; /* infinity */
4812 } else {
4813 /* normal: exp - bias */
4814 return exp - 127;
4816 /* nan or zero */
4817 float_raise(float_flag_invalid, s);
4818 return INT32_MIN;
4821 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
4823 /* Extract frac to the top of the uint64_t. */
4824 uint64_t frac = a << 12;
4825 int64_t exp = extract64(a, 52, 11);
4827 if (unlikely(exp == 0)) {
4828 if (frac != 0) {
4829 if (!get_flush_inputs_to_zero(s)) {
4830 /* denormal: bias - fractional_zeros */
4831 return -1023 - clz64(frac);
4833 /* flush to zero */
4834 float_raise(float_flag_input_denormal, s);
4836 } else if (unlikely(exp == 0x7ff)) {
4837 if (frac == 0) {
4838 return INT64_MAX; /* infinity */
4840 } else {
4841 /* normal: exp - bias */
4842 return exp - 1023;
4844 /* nan or zero */
4845 float_raise(float_flag_invalid, s);
4846 return INT64_MIN;
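/*
 * Worked example for the denormal path above: for float16, FRAC holds the
 * 10 fraction bits left-justified in a uint32_t, so -15 - clz32(frac) is
 * -15 when the top fraction bit is set (values in [2**-15, 2**-14)) and
 * falls to -24 for the smallest denormal (frac == 1, value 2**-24),
 * matching floor(log2(x)).
 */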
4849 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
4850 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
4851 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
4853 #undef DO_ZPZ_FP
4855 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4856 float_status *status, uint32_t desc,
4857 uint16_t neg1, uint16_t neg3)
4859 intptr_t i = simd_oprsz(desc);
4860 uint64_t *g = vg;
4862 do {
4863 uint64_t pg = g[(i - 1) >> 6];
4864 do {
4865 i -= 2;
4866 if (likely((pg >> (i & 63)) & 1)) {
4867 float16 e1, e2, e3, r;
4869 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4870 e2 = *(uint16_t *)(vm + H1_2(i));
4871 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4872 r = float16_muladd(e1, e2, e3, 0, status);
4873 *(uint16_t *)(vd + H1_2(i)) = r;
4875 } while (i & 63);
4876 } while (i != 0);
4879 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4880 void *vg, void *status, uint32_t desc)
4882 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
4885 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4886 void *vg, void *status, uint32_t desc)
4888 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
4891 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4892 void *vg, void *status, uint32_t desc)
4894 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
4897 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4898 void *vg, void *status, uint32_t desc)
4900 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
4903 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4904 float_status *status, uint32_t desc,
4905 uint32_t neg1, uint32_t neg3)
4907 intptr_t i = simd_oprsz(desc);
4908 uint64_t *g = vg;
4910 do {
4911 uint64_t pg = g[(i - 1) >> 6];
4912 do {
4913 i -= 4;
4914 if (likely((pg >> (i & 63)) & 1)) {
4915 float32 e1, e2, e3, r;
4917 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4918 e2 = *(uint32_t *)(vm + H1_4(i));
4919 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4920 r = float32_muladd(e1, e2, e3, 0, status);
4921 *(uint32_t *)(vd + H1_4(i)) = r;
4923 } while (i & 63);
4924 } while (i != 0);
4927 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4928 void *vg, void *status, uint32_t desc)
4930 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
4933 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4934 void *vg, void *status, uint32_t desc)
4936 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
4939 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4940 void *vg, void *status, uint32_t desc)
4942 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
4945 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4946 void *vg, void *status, uint32_t desc)
4948 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
4951 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4952 float_status *status, uint32_t desc,
4953 uint64_t neg1, uint64_t neg3)
4955 intptr_t i = simd_oprsz(desc);
4956 uint64_t *g = vg;
4958 do {
4959 uint64_t pg = g[(i - 1) >> 6];
4960 do {
4961 i -= 8;
4962 if (likely((pg >> (i & 63)) & 1)) {
4963 float64 e1, e2, e3, r;
4965 e1 = *(uint64_t *)(vn + i) ^ neg1;
4966 e2 = *(uint64_t *)(vm + i);
4967 e3 = *(uint64_t *)(va + i) ^ neg3;
4968 r = float64_muladd(e1, e2, e3, 0, status);
4969 *(uint64_t *)(vd + i) = r;
4971 } while (i & 63);
4972 } while (i != 0);
4975 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4976 void *vg, void *status, uint32_t desc)
4978 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
4981 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4982 void *vg, void *status, uint32_t desc)
4984 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
4987 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4988 void *vg, void *status, uint32_t desc)
4990 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
4993 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4994 void *vg, void *status, uint32_t desc)
4996 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
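/*
 * NEG1 and NEG3 are sign-bit XOR masks applied to the multiplicand N and
 * the addend A respectively, so all four flavours fall out of one routine
 * per element size: FMLA computes a + n*m, FMLS a + (-n)*m,
 * FNMLA (-a) + (-n)*m, and FNMLS (-a) + n*m.
 */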
4999 /* Two-operand floating-point comparison controlled by a predicate.
5000 * Unlike the integer version, we are not allowed to optimistically
5001 * compare operands, since the comparison may have side effects wrt
5002 * the FPSR.
5004 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
5005 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
5006 void *status, uint32_t desc) \
5008 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
5009 uint64_t *d = vd, *g = vg; \
5010 do { \
5011 uint64_t out = 0, pg = g[j]; \
5012 do { \
5013 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
5014 if (likely((pg >> (i & 63)) & 1)) { \
5015 TYPE nn = *(TYPE *)(vn + H(i)); \
5016 TYPE mm = *(TYPE *)(vm + H(i)); \
5017 out |= OP(TYPE, nn, mm, status); \
5019 } while (i & 63); \
5020 d[j--] = out; \
5021 } while (i > 0); \
5024 #define DO_FPCMP_PPZZ_H(NAME, OP) \
5025 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
5026 #define DO_FPCMP_PPZZ_S(NAME, OP) \
5027 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
5028 #define DO_FPCMP_PPZZ_D(NAME, OP) \
5029 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
5031 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
5032 DO_FPCMP_PPZZ_H(NAME, OP) \
5033 DO_FPCMP_PPZZ_S(NAME, OP) \
5034 DO_FPCMP_PPZZ_D(NAME, OP)
5036 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
5037 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
5038 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
5039 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
5040 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
5041 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
5042 #define DO_FCMUO(TYPE, X, Y, ST) \
5043 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
5044 #define DO_FACGE(TYPE, X, Y, ST) \
5045 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
5046 #define DO_FACGT(TYPE, X, Y, ST) \
5047 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
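/*
 * The ordered comparisons (GE, GT, LE, LT and the absolute-value FAC*
 * forms) use TYPE##_compare, the signaling compare, so any NaN operand
 * raises Invalid; EQ, NE and UO use TYPE##_compare_quiet, which signals
 * only for signaling NaNs, matching the usual IEEE distinction.
 */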
5049 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
5050 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
5051 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
5052 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
5053 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
5054 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
5055 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
5057 #undef DO_FPCMP_PPZZ_ALL
5058 #undef DO_FPCMP_PPZZ_D
5059 #undef DO_FPCMP_PPZZ_S
5060 #undef DO_FPCMP_PPZZ_H
5061 #undef DO_FPCMP_PPZZ
5063 /* One-operand floating-point comparison against zero, controlled
5064 * by a predicate.
5066 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
5067 void HELPER(NAME)(void *vd, void *vn, void *vg, \
5068 void *status, uint32_t desc) \
5070 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
5071 uint64_t *d = vd, *g = vg; \
5072 do { \
5073 uint64_t out = 0, pg = g[j]; \
5074 do { \
5075 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
5076 if ((pg >> (i & 63)) & 1) { \
5077 TYPE nn = *(TYPE *)(vn + H(i)); \
5078 out |= OP(TYPE, nn, 0, status); \
5080 } while (i & 63); \
5081 d[j--] = out; \
5082 } while (i > 0); \
5085 #define DO_FPCMP_PPZ0_H(NAME, OP) \
5086 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
5087 #define DO_FPCMP_PPZ0_S(NAME, OP) \
5088 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
5089 #define DO_FPCMP_PPZ0_D(NAME, OP) \
5090 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
5092 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
5093 DO_FPCMP_PPZ0_H(NAME, OP) \
5094 DO_FPCMP_PPZ0_S(NAME, OP) \
5095 DO_FPCMP_PPZ0_D(NAME, OP)
5097 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
5098 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
5099 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
5100 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
5101 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
5102 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
5104 /* FP Trig Multiply-Add. */
5106 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5108 static const float16 coeff[16] = {
5109 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5110 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5112 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
5113 intptr_t x = simd_data(desc);
5114 float16 *d = vd, *n = vn, *m = vm;
5115 for (i = 0; i < opr_sz; i++) {
5116 float16 mm = m[i];
5117 intptr_t xx = x;
5118 if (float16_is_neg(mm)) {
5119 mm = float16_abs(mm);
5120 xx += 8;
5122 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
5126 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5128 static const float32 coeff[16] = {
5129 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5130 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5131 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5132 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5134 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5135 intptr_t x = simd_data(desc);
5136 float32 *d = vd, *n = vn, *m = vm;
5137 for (i = 0; i < opr_sz; i++) {
5138 float32 mm = m[i];
5139 intptr_t xx = x;
5140 if (float32_is_neg(mm)) {
5141 mm = float32_abs(mm);
5142 xx += 8;
5144 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
5148 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5150 static const float64 coeff[16] = {
5151 0x3ff0000000000000ull, 0xbfc5555555555543ull,
5152 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5153 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5154 0x3de5d8408868552full, 0x0000000000000000ull,
5155 0x3ff0000000000000ull, 0xbfe0000000000000ull,
5156 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5157 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5158 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5160 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5161 intptr_t x = simd_data(desc);
5162 float64 *d = vd, *n = vn, *m = vm;
5163 for (i = 0; i < opr_sz; i++) {
5164 float64 mm = m[i];
5165 intptr_t xx = x;
5166 if (float64_is_neg(mm)) {
5167 mm = float64_abs(mm);
5168 xx += 8;
5170 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
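/*
 * In the three FTMAD helpers above, a negative multiplicand M is replaced
 * by its absolute value and 8 is added to the coefficient index, selecting
 * the second half of the table; the immediate X supplies the base index
 * within each half.
 */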
5175 * FP Complex Add
5178 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5179 void *vs, uint32_t desc)
5181 intptr_t j, i = simd_oprsz(desc);
5182 uint64_t *g = vg;
5183 float16 neg_imag = float16_set_sign(0, simd_data(desc));
5184 float16 neg_real = float16_chs(neg_imag);
5186 do {
5187 uint64_t pg = g[(i - 1) >> 6];
5188 do {
5189 float16 e0, e1, e2, e3;
5191 /* I holds the real index; J holds the imag index. */
5192 j = i - sizeof(float16);
5193 i -= 2 * sizeof(float16);
5195 e0 = *(float16 *)(vn + H1_2(i));
5196 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
5197 e2 = *(float16 *)(vn + H1_2(j));
5198 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
5200 if (likely((pg >> (i & 63)) & 1)) {
5201 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
5203 if (likely((pg >> (j & 63)) & 1)) {
5204 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
5206 } while (i & 63);
5207 } while (i != 0);
5210 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5211 void *vs, uint32_t desc)
5213 intptr_t j, i = simd_oprsz(desc);
5214 uint64_t *g = vg;
5215 float32 neg_imag = float32_set_sign(0, simd_data(desc));
5216 float32 neg_real = float32_chs(neg_imag);
5218 do {
5219 uint64_t pg = g[(i - 1) >> 6];
5220 do {
5221 float32 e0, e1, e2, e3;
5223 /* I holds the real index; J holds the imag index. */
5224 j = i - sizeof(float32);
5225 i -= 2 * sizeof(float32);
5227 e0 = *(float32 *)(vn + H1_2(i));
5228 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
5229 e2 = *(float32 *)(vn + H1_2(j));
5230 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
5232 if (likely((pg >> (i & 63)) & 1)) {
5233 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
5235 if (likely((pg >> (j & 63)) & 1)) {
5236 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
5238 } while (i & 63);
5239 } while (i != 0);
5242 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5243 void *vs, uint32_t desc)
5245 intptr_t j, i = simd_oprsz(desc);
5246 uint64_t *g = vg;
5247 float64 neg_imag = float64_set_sign(0, simd_data(desc));
5248 float64 neg_real = float64_chs(neg_imag);
5250 do {
5251 uint64_t pg = g[(i - 1) >> 6];
5252 do {
5253 float64 e0, e1, e2, e3;
5255 /* I holds the real index; J holds the imag index. */
5256 j = i - sizeof(float64);
5257 i -= 2 * sizeof(float64);
5259 e0 = *(float64 *)(vn + H1_2(i));
5260 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
5261 e2 = *(float64 *)(vn + H1_2(j));
5262 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
5264 if (likely((pg >> (i & 63)) & 1)) {
5265 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
5267 if (likely((pg >> (j & 63)) & 1)) {
5268 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
5270 } while (i & 63);
5271 } while (i != 0);
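/*
 * In the three FCADD helpers above each pair of adjacent elements is a
 * (real, imag) value.  Writing out the adds, the effect per pair is
 *
 *     d_real = n_real + (m_imag ^ neg_real)
 *     d_imag = n_imag + (m_real ^ neg_imag)
 *
 * so simd_data(desc) == 0 yields the 90 degree form
 *     d = (n_real - m_imag, n_imag + m_real)
 * and simd_data(desc) == 1 the 270 degree form
 *     d = (n_real + m_imag, n_imag - m_real)
 * (assuming the translator passes 0 for FCADD #90 and 1 for #270).
 */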
5275 * FP Complex Multiply
5278 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5279 void *vg, void *status, uint32_t desc)
5281 intptr_t j, i = simd_oprsz(desc);
5282 unsigned rot = simd_data(desc);
5283 bool flip = rot & 1;
5284 float16 neg_imag, neg_real;
5285 uint64_t *g = vg;
5287 neg_imag = float16_set_sign(0, (rot & 2) != 0);
5288 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
5290 do {
5291 uint64_t pg = g[(i - 1) >> 6];
5292 do {
5293 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5295 /* I holds the real index; J holds the imag index. */
5296 j = i - sizeof(float16);
5297 i -= 2 * sizeof(float16);
5299 nr = *(float16 *)(vn + H1_2(i));
5300 ni = *(float16 *)(vn + H1_2(j));
5301 mr = *(float16 *)(vm + H1_2(i));
5302 mi = *(float16 *)(vm + H1_2(j));
5304 e2 = (flip ? ni : nr);
5305 e1 = (flip ? mi : mr) ^ neg_real;
5306 e4 = e2;
5307 e3 = (flip ? mr : mi) ^ neg_imag;
5309 if (likely((pg >> (i & 63)) & 1)) {
5310 d = *(float16 *)(va + H1_2(i));
5311 d = float16_muladd(e2, e1, d, 0, status);
5312 *(float16 *)(vd + H1_2(i)) = d;
5314 if (likely((pg >> (j & 63)) & 1)) {
5315 d = *(float16 *)(va + H1_2(j));
5316 d = float16_muladd(e4, e3, d, 0, status);
5317 *(float16 *)(vd + H1_2(j)) = d;
5319 } while (i & 63);
5320 } while (i != 0);
5323 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5324 void *vg, void *status, uint32_t desc)
5326 intptr_t j, i = simd_oprsz(desc);
5327 unsigned rot = simd_data(desc);
5328 bool flip = rot & 1;
5329 float32 neg_imag, neg_real;
5330 uint64_t *g = vg;
5332 neg_imag = float32_set_sign(0, (rot & 2) != 0);
5333 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
5335 do {
5336 uint64_t pg = g[(i - 1) >> 6];
5337 do {
5338 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5340 /* I holds the real index; J holds the imag index. */
5341 j = i - sizeof(float32);
5342 i -= 2 * sizeof(float32);
5344 nr = *(float32 *)(vn + H1_2(i));
5345 ni = *(float32 *)(vn + H1_2(j));
5346 mr = *(float32 *)(vm + H1_2(i));
5347 mi = *(float32 *)(vm + H1_2(j));
5349 e2 = (flip ? ni : nr);
5350 e1 = (flip ? mi : mr) ^ neg_real;
5351 e4 = e2;
5352 e3 = (flip ? mr : mi) ^ neg_imag;
5354 if (likely((pg >> (i & 63)) & 1)) {
5355 d = *(float32 *)(va + H1_2(i));
5356 d = float32_muladd(e2, e1, d, 0, status);
5357 *(float32 *)(vd + H1_2(i)) = d;
5359 if (likely((pg >> (j & 63)) & 1)) {
5360 d = *(float32 *)(va + H1_2(j));
5361 d = float32_muladd(e4, e3, d, 0, status);
5362 *(float32 *)(vd + H1_2(j)) = d;
5364 } while (i & 63);
5365 } while (i != 0);
5368 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5369 void *vg, void *status, uint32_t desc)
5371 intptr_t j, i = simd_oprsz(desc);
5372 unsigned rot = simd_data(desc);
5373 bool flip = rot & 1;
5374 float64 neg_imag, neg_real;
5375 uint64_t *g = vg;
5377 neg_imag = float64_set_sign(0, (rot & 2) != 0);
5378 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
5380 do {
5381 uint64_t pg = g[(i - 1) >> 6];
5382 do {
5383 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5385 /* I holds the real index; J holds the imag index. */
5386 j = i - sizeof(float64);
5387 i -= 2 * sizeof(float64);
5389 nr = *(float64 *)(vn + H1_2(i));
5390 ni = *(float64 *)(vn + H1_2(j));
5391 mr = *(float64 *)(vm + H1_2(i));
5392 mi = *(float64 *)(vm + H1_2(j));
5394 e2 = (flip ? ni : nr);
5395 e1 = (flip ? mi : mr) ^ neg_real;
5396 e4 = e2;
5397 e3 = (flip ? mr : mi) ^ neg_imag;
5399 if (likely((pg >> (i & 63)) & 1)) {
5400 d = *(float64 *)(va + H1_2(i));
5401 d = float64_muladd(e2, e1, d, 0, status);
5402 *(float64 *)(vd + H1_2(i)) = d;
5404 if (likely((pg >> (j & 63)) & 1)) {
5405 d = *(float64 *)(va + H1_2(j));
5406 d = float64_muladd(e4, e3, d, 0, status);
5407 *(float64 *)(vd + H1_2(j)) = d;
5409 } while (i & 63);
5410 } while (i != 0);
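/*
 * For the FCMLA helpers above, taking rot as the usual 0/90/180/270
 * rotation index, the per-pair accumulation works out to
 *
 *     rot 0:   d_real += n_real * m_real;   d_imag += n_real * m_imag
 *     rot 1:   d_real -= n_imag * m_imag;   d_imag += n_imag * m_real
 *     rot 2:   d_real -= n_real * m_real;   d_imag -= n_real * m_imag
 *     rot 3:   d_real += n_imag * m_imag;   d_imag -= n_imag * m_real
 *
 * flip selects which half of N feeds both products, and neg_real /
 * neg_imag fold the sign of the rotation into the M operands.
 */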
5414 * Load contiguous data, protected by a governing predicate.
5418 * Load one element into @vd + @reg_off from @host.
5419 * The controlling predicate is known to be true.
5421 typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);
5424 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
5425 * The controlling predicate is known to be true.
5427 typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
5428 target_ulong vaddr, uintptr_t retaddr);
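/*
 * The host flavour is the fast path: once a page has been probed and a
 * usable host pointer obtained, it simply moves bytes between the vector
 * register and host memory.  The tlb flavour is the slow path: it goes
 * through cpu_ldub_data_ra and the other cpu_ld/st accessors used below,
 * so it can reach MMIO, trigger watchpoints, and raise guest faults that
 * are unwound via @retaddr.
 */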
5431 * Generate the above primitives.
5434 #define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
5435 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
5437 TYPEM val = HOST(host); \
5438 *(TYPEE *)(vd + H(reg_off)) = val; \
5441 #define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
5442 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
5443 { HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }
5445 #define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
5446 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
5447 target_ulong addr, uintptr_t ra) \
5449 *(TYPEE *)(vd + H(reg_off)) = \
5450 (TYPEM)TLB(env, useronly_clean_ptr(addr), ra); \
5453 #define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
5454 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
5455 target_ulong addr, uintptr_t ra) \
5457 TLB(env, useronly_clean_ptr(addr), \
5458 (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \
5461 #define DO_LD_PRIM_1(NAME, H, TE, TM) \
5462 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
5463 DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
5465 DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
5466 DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
5467 DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
5468 DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
5469 DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
5470 DO_LD_PRIM_1(ld1bdu, H1_8, uint64_t, uint8_t)
5471 DO_LD_PRIM_1(ld1bds, H1_8, uint64_t, int8_t)
5473 #define DO_ST_PRIM_1(NAME, H, TE, TM) \
5474 DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \
5475 DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
5477 DO_ST_PRIM_1(bb, H1, uint8_t, uint8_t)
5478 DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
5479 DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
5480 DO_ST_PRIM_1(bd, H1_8, uint64_t, uint8_t)
5482 #define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
5483 DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \
5484 DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \
5485 DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
5486 DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
5488 #define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
5489 DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p) \
5490 DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p) \
5491 DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
5492 DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
5494 DO_LD_PRIM_2(hh, H1_2, uint16_t, uint16_t, lduw)
5495 DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
5496 DO_LD_PRIM_2(hss, H1_4, uint32_t, int16_t, lduw)
5497 DO_LD_PRIM_2(hdu, H1_8, uint64_t, uint16_t, lduw)
5498 DO_LD_PRIM_2(hds, H1_8, uint64_t, int16_t, lduw)
5500 DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
5501 DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
5502 DO_ST_PRIM_2(hd, H1_8, uint64_t, uint16_t, stw)
5504 DO_LD_PRIM_2(ss, H1_4, uint32_t, uint32_t, ldl)
5505 DO_LD_PRIM_2(sdu, H1_8, uint64_t, uint32_t, ldl)
5506 DO_LD_PRIM_2(sds, H1_8, uint64_t, int32_t, ldl)
5508 DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
5509 DO_ST_PRIM_2(sd, H1_8, uint64_t, uint32_t, stl)
5511 DO_LD_PRIM_2(dd, H1_8, uint64_t, uint64_t, ldq)
5512 DO_ST_PRIM_2(dd, H1_8, uint64_t, uint64_t, stq)
5514 #undef DO_LD_TLB
5515 #undef DO_ST_TLB
5516 #undef DO_LD_HOST
5517 #undef DO_LD_PRIM_1
5518 #undef DO_ST_PRIM_1
5519 #undef DO_LD_PRIM_2
5520 #undef DO_ST_PRIM_2
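/*
 * For reference, one expansion of the generators above:
 * DO_LD_PRIM_1(ld1bdu, H1_8, uint64_t, uint8_t) produces approximately
 *
 *     static void sve_ld1bdu_host(void *vd, intptr_t reg_off, void *host)
 *     {
 *         uint8_t val = ldub_p(host);
 *         *(uint64_t *)(vd + H1_8(reg_off)) = val;
 *     }
 *     static void sve_ld1bdu_tlb(CPUARMState *env, void *vd,
 *                                intptr_t reg_off, target_ulong addr,
 *                                uintptr_t ra)
 *     {
 *         *(uint64_t *)(vd + H1_8(reg_off)) =
 *             (uint8_t)cpu_ldub_data_ra(env, useronly_clean_ptr(addr), ra);
 *     }
 *
 * i.e. a byte is loaded, zero-extended, and stored into a 64-bit element
 * at the host-endian-adjusted offset H1_8(reg_off).
 */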
5523 * Skip through a sequence of inactive elements in the guarding predicate @vg,
5524 * beginning at @reg_off bounded by @reg_max. Return the offset of the active
5525 * element >= @reg_off, or @reg_max if there were no active elements at all.
5527 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5528 intptr_t reg_max, int esz)
5530 uint64_t pg_mask = pred_esz_masks[esz];
5531 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5533 /* In normal usage, the first element is active. */
5534 if (likely(pg & 1)) {
5535 return reg_off;
5538 if (pg == 0) {
5539 reg_off &= -64;
5540 do {
5541 reg_off += 64;
5542 if (unlikely(reg_off >= reg_max)) {
5543 /* The entire predicate was false. */
5544 return reg_max;
5546 pg = vg[reg_off >> 6] & pg_mask;
5547 } while (pg == 0);
5549 reg_off += ctz64(pg);
5551 /* We should never see an out of range predicate bit set. */
5552 tcg_debug_assert(reg_off < reg_max);
5553 return reg_off;
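/*
 * Example: with esz == MO_32, pred_esz_masks[esz] is 0x1111111111111111,
 * keeping one predicate bit per 4-byte element.  If vg[0] == 0x10 and
 * reg_off == 0, then pg == 0x10 after masking: bit 0 is clear but pg is
 * nonzero, so the function returns ctz64(0x10) == 4, the byte offset of
 * the first active element.
 */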
5557 * Resolve the guest virtual address to info->host and info->flags.
5558 * If @nofault, return false if the page is invalid, otherwise
5559 * exit via page fault exception.
5562 typedef struct {
5563 void *host;
5564 int flags;
5565 MemTxAttrs attrs;
5566 } SVEHostPage;
5568 static bool sve_probe_page(SVEHostPage *info, bool nofault,
5569 CPUARMState *env, target_ulong addr,
5570 int mem_off, MMUAccessType access_type,
5571 int mmu_idx, uintptr_t retaddr)
5573 int flags;
5575 addr += mem_off;
5578 * User-only currently always issues with TBI. See the comment
5579 * above useronly_clean_ptr. Usually we clean this top byte away
5580 * during translation, but we can't do that for e.g. vector + imm
5581 * addressing modes.
5583 * We currently always enable TBI for user-only, and do not provide
5584 * a way to turn it off. So clean the pointer unconditionally here,
5585 * rather than look it up here, or pass it down from above.
5587 addr = useronly_clean_ptr(addr);
5589 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
5590 &info->host, retaddr);
5591 info->flags = flags;
5593 if (flags & TLB_INVALID_MASK) {
5594 g_assert(nofault);
5595 return false;
5598 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5599 info->host -= mem_off;
5601 #ifdef CONFIG_USER_ONLY
5602 memset(&info->attrs, 0, sizeof(info->attrs));
5603 #else
5605 * Find the iotlbentry for addr and return the transaction attributes.
5606 * This *must* be present in the TLB because we just found the mapping.
5609 uintptr_t index = tlb_index(env, mmu_idx, addr);
5611 # ifdef CONFIG_DEBUG_TCG
5612 CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
5613 target_ulong comparator = (access_type == MMU_DATA_LOAD
5614 ? entry->addr_read
5615 : tlb_addr_write(entry));
5616 g_assert(tlb_hit(comparator, addr));
5617 # endif
5619 CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
5620 info->attrs = iotlbentry->attrs;
5622 #endif
5624 return true;
5629 * Analyse contiguous data, protected by a governing predicate.
5632 typedef enum {
5633 FAULT_NO,
5634 FAULT_FIRST,
5635 FAULT_ALL,
5636 } SVEContFault;
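/*
 * FAULT_ALL is the ordinary LD1/ST1 behaviour: every active element must
 * be accessible, otherwise take an exception.  FAULT_FIRST is the
 * first-fault (LDFF1) behaviour: only the first active element may
 * fault; later failures merely truncate FFR.  FAULT_NO is the no-fault
 * (LDNF1) behaviour: no element is allowed to fault at all.
 */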
5638 typedef struct {
5640 * First and last element wholly contained within the two pages.
5641 * mem_off_first[0] and reg_off_first[0] are always set >= 0.
5642 * reg_off_last[0] may be < 0 if the first element crosses pages.
5643 * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
5644 * are set >= 0 only if there are complete elements on a second page.
5646 * The reg_off_* offsets are relative to the internal vector register.
5647 * The mem_off_first offset is relative to the memory address; the
5648 * two offsets are different when a load operation extends, a store
5649 * operation truncates, or for multi-register operations.
5651 int16_t mem_off_first[2];
5652 int16_t reg_off_first[2];
5653 int16_t reg_off_last[2];
5656 * One element that is misaligned and spans both pages,
5657 * or -1 if there is no such active element.
5659 int16_t mem_off_split;
5660 int16_t reg_off_split;
5663 * The byte offset at which the entire operation crosses a page boundary.
5664 * Set >= 0 if and only if the entire operation spans two pages.
5666 int16_t page_split;
5668 /* TLB data for the two pages. */
5669 SVEHostPage page[2];
5670 } SVEContLdSt;
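/*
 * Worked example: an LD1D over a 32-byte vector (four 8-byte elements,
 * all active) whose base address lies 12 bytes below a page boundary.
 * The elements sit at memory offsets 0, 8, 16 and 24, and the element at
 * offset 8 straddles the boundary, so sve_cont_ldst_elements() below
 * fills in
 *     reg_off_first = { 0, 16 }    mem_off_first = { 0, 16 }
 *     reg_off_last  = { 0, 24 }
 *     reg_off_split = mem_off_split = 8
 *     page_split    = 12
 * and the element at offset 0 is handled from page[0], the split element
 * via the slow path, and offsets 16/24 from page[1].
 */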
5673 * Find first active element on each page, and a loose bound for the
5674 * final element on each page. Identify any single element that spans
5675 * the page boundary. Return true if there are any active elements.
5677 static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
5678 uint64_t *vg, intptr_t reg_max,
5679 int esz, int msize)
5681 const int esize = 1 << esz;
5682 const uint64_t pg_mask = pred_esz_masks[esz];
5683 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5684 intptr_t mem_off_last, mem_off_split;
5685 intptr_t page_split, elt_split;
5686 intptr_t i;
5688 /* Set all of the element indices to -1, and the TLB data to 0. */
5689 memset(info, -1, offsetof(SVEContLdSt, page));
5690 memset(info->page, 0, sizeof(info->page));
5692 /* Gross scan over the entire predicate to find bounds. */
5693 i = 0;
5694 do {
5695 uint64_t pg = vg[i] & pg_mask;
5696 if (pg) {
5697 reg_off_last = i * 64 + 63 - clz64(pg);
5698 if (reg_off_first < 0) {
5699 reg_off_first = i * 64 + ctz64(pg);
5702 } while (++i * 64 < reg_max);
5704 if (unlikely(reg_off_first < 0)) {
5705 /* No active elements, no pages touched. */
5706 return false;
5708 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5710 info->reg_off_first[0] = reg_off_first;
5711 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5712 mem_off_last = (reg_off_last >> esz) * msize;
5714 page_split = -(addr | TARGET_PAGE_MASK);
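/*
 * -(addr | TARGET_PAGE_MASK) is the number of bytes from addr up to the
 * end of its page: e.g. with 4K pages and an address ending in 0xff4,
 * the OR sets every bit above bit 11, and negating gives 0xc == 12.
 */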
5715 if (likely(mem_off_last + msize <= page_split)) {
5716 /* The entire operation fits within a single page. */
5717 info->reg_off_last[0] = reg_off_last;
5718 return true;
5721 info->page_split = page_split;
5722 elt_split = page_split / msize;
5723 reg_off_split = elt_split << esz;
5724 mem_off_split = elt_split * msize;
5727 * This is the last full element on the first page, but it is not
5728 * necessarily active. If there is no full element, i.e. the first
5729 * active element is the one that's split, this value remains -1.
5730 * It is useful as an iteration bound.
5732 if (elt_split != 0) {
5733 info->reg_off_last[0] = reg_off_split - esize;
5736 /* Determine if an unaligned element spans the pages. */
5737 if (page_split % msize != 0) {
5738 /* It is helpful to know if the split element is active. */
5739 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5740 info->reg_off_split = reg_off_split;
5741 info->mem_off_split = mem_off_split;
5743 if (reg_off_split == reg_off_last) {
5744 /* The page crossing element is last. */
5745 return true;
5748 reg_off_split += esize;
5749 mem_off_split += msize;
5753 * We do want the first active element on the second page, because
5754 * this may affect the address reported in an exception.
5756 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5757 tcg_debug_assert(reg_off_split <= reg_off_last);
5758 info->reg_off_first[1] = reg_off_split;
5759 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5760 info->reg_off_last[1] = reg_off_last;
5761 return true;
5765 * Resolve the guest virtual addresses to info->page[].
5766 * Control the generation of page faults with @fault. Return false if
5767 * there is no work to do, which can only happen with @fault == FAULT_NO.
5769 static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5770 CPUARMState *env, target_ulong addr,
5771 MMUAccessType access_type, uintptr_t retaddr)
5773 int mmu_idx = cpu_mmu_index(env, false);
5774 int mem_off = info->mem_off_first[0];
5775 bool nofault = fault == FAULT_NO;
5776 bool have_work = true;
5778 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5779 access_type, mmu_idx, retaddr)) {
5780 /* No work to be done. */
5781 return false;
5784 if (likely(info->page_split < 0)) {
5785 /* The entire operation was on the one page. */
5786 return true;
5790 * If the second page is invalid, then we want the fault address to be
5791 * the first byte on that page which is accessed.
5793 if (info->mem_off_split >= 0) {
5795 * There is an element split across the pages. The fault address
5796 * should be the first byte of the second page.
5798 mem_off = info->page_split;
5800 * If the split element is also the first active element
5801 * of the vector, then: For first-fault we should continue
5802 * to generate faults for the second page. For no-fault,
5803 * we have work only if the second page is valid.
5805 if (info->mem_off_first[0] < info->mem_off_split) {
5806 nofault = FAULT_FIRST;
5807 have_work = false;
5809 } else {
5811 * There is no element split across the pages. The fault address
5812 * should be the first active element on the second page.
5814 mem_off = info->mem_off_first[1];
5816 * There must have been one active element on the first page,
5817 * so we're out of first-fault territory.
5819 nofault = fault != FAULT_ALL;
5822 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5823 access_type, mmu_idx, retaddr);
5824 return have_work;
5827 static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5828 uint64_t *vg, target_ulong addr,
5829 int esize, int msize, int wp_access,
5830 uintptr_t retaddr)
5832 #ifndef CONFIG_USER_ONLY
5833 intptr_t mem_off, reg_off, reg_last;
5834 int flags0 = info->page[0].flags;
5835 int flags1 = info->page[1].flags;
5837 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5838 return;
5841 /* Indicate that watchpoints are handled. */
5842 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5843 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5845 if (flags0 & TLB_WATCHPOINT) {
5846 mem_off = info->mem_off_first[0];
5847 reg_off = info->reg_off_first[0];
5848 reg_last = info->reg_off_last[0];
5850 while (reg_off <= reg_last) {
5851 uint64_t pg = vg[reg_off >> 6];
5852 do {
5853 if ((pg >> (reg_off & 63)) & 1) {
5854 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5855 msize, info->page[0].attrs,
5856 wp_access, retaddr);
5858 reg_off += esize;
5859 mem_off += msize;
5860 } while (reg_off <= reg_last && (reg_off & 63));
5864 mem_off = info->mem_off_split;
5865 if (mem_off >= 0) {
5866 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5867 info->page[0].attrs, wp_access, retaddr);
5870 mem_off = info->mem_off_first[1];
5871 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5872 reg_off = info->reg_off_first[1];
5873 reg_last = info->reg_off_last[1];
5875 do {
5876 uint64_t pg = vg[reg_off >> 6];
5877 do {
5878 if ((pg >> (reg_off & 63)) & 1) {
5879 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5880 msize, info->page[1].attrs,
5881 wp_access, retaddr);
5883 reg_off += esize;
5884 mem_off += msize;
5885 } while (reg_off & 63);
5886 } while (reg_off <= reg_last);
5888 #endif
5891 static void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5892 uint64_t *vg, target_ulong addr, int esize,
5893 int msize, uint32_t mtedesc, uintptr_t ra)
5895 intptr_t mem_off, reg_off, reg_last;
5897 /* Process the page only if MemAttr == Tagged. */
5898 if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
5899 mem_off = info->mem_off_first[0];
5900 reg_off = info->reg_off_first[0];
5901 reg_last = info->reg_off_split;
5902 if (reg_last < 0) {
5903 reg_last = info->reg_off_last[0];
5906 do {
5907 uint64_t pg = vg[reg_off >> 6];
5908 do {
5909 if ((pg >> (reg_off & 63)) & 1) {
5910 mte_check(env, mtedesc, addr + mem_off, ra);
5912 reg_off += esize;
5913 mem_off += msize;
5914 } while (reg_off <= reg_last && (reg_off & 63));
5915 } while (reg_off <= reg_last);
5918 mem_off = info->mem_off_first[1];
5919 if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
5920 reg_off = info->reg_off_first[1];
5921 reg_last = info->reg_off_last[1];
5923 do {
5924 uint64_t pg = vg[reg_off >> 6];
5925 do {
5926 if ((pg >> (reg_off & 63)) & 1) {
5927 mte_check(env, mtedesc, addr + mem_off, ra);
5929 reg_off += esize;
5930 mem_off += msize;
5931 } while (reg_off & 63);
5932 } while (reg_off <= reg_last);
5937 * Common helper for all contiguous 1,2,3,4-register predicated loads.
5939 static inline QEMU_ALWAYS_INLINE
5940 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5941 uint32_t desc, const uintptr_t retaddr,
5942 const int esz, const int msz, const int N, uint32_t mtedesc,
5943 sve_ldst1_host_fn *host_fn,
5944 sve_ldst1_tlb_fn *tlb_fn)
5946 const unsigned rd = simd_data(desc);
5947 const intptr_t reg_max = simd_oprsz(desc);
5948 intptr_t reg_off, reg_last, mem_off;
5949 SVEContLdSt info;
5950 void *host;
5951 int flags, i;
5953 /* Find the active elements. */
5954 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5955 /* The entire predicate was false; no load occurs. */
5956 for (i = 0; i < N; ++i) {
5957 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5959 return;
5962 /* Probe the page(s). Exit with exception for any invalid page. */
5963 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5965 /* Handle watchpoints for all active elements. */
5966 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5967 BP_MEM_READ, retaddr);
5970 * Handle mte checks for all active elements.
5971 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5973 if (mtedesc) {
5974 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5975 mtedesc, retaddr);
5978 flags = info.page[0].flags | info.page[1].flags;
5979 if (unlikely(flags != 0)) {
5980 #ifdef CONFIG_USER_ONLY
5981 g_assert_not_reached();
5982 #else
5984 * At least one page includes MMIO.
5985 * Any bus operation can fail with cpu_transaction_failed,
5986 * which for ARM will raise SyncExternal. Perform the load
5987 * into scratch memory to preserve register state until the end.
5989 ARMVectorReg scratch[4] = { };
5991 mem_off = info.mem_off_first[0];
5992 reg_off = info.reg_off_first[0];
5993 reg_last = info.reg_off_last[1];
5994 if (reg_last < 0) {
5995 reg_last = info.reg_off_split;
5996 if (reg_last < 0) {
5997 reg_last = info.reg_off_last[0];
6001 do {
6002 uint64_t pg = vg[reg_off >> 6];
6003 do {
6004 if ((pg >> (reg_off & 63)) & 1) {
6005 for (i = 0; i < N; ++i) {
6006 tlb_fn(env, &scratch[i], reg_off,
6007 addr + mem_off + (i << msz), retaddr);
6010 reg_off += 1 << esz;
6011 mem_off += N << msz;
6012 } while (reg_off & 63);
6013 } while (reg_off <= reg_last);
6015 for (i = 0; i < N; ++i) {
6016 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
6018 return;
6019 #endif
6022 /* The entire operation is in RAM, on valid pages. */
6024 for (i = 0; i < N; ++i) {
6025 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
6028 mem_off = info.mem_off_first[0];
6029 reg_off = info.reg_off_first[0];
6030 reg_last = info.reg_off_last[0];
6031 host = info.page[0].host;
6033 while (reg_off <= reg_last) {
6034 uint64_t pg = vg[reg_off >> 6];
6035 do {
6036 if ((pg >> (reg_off & 63)) & 1) {
6037 for (i = 0; i < N; ++i) {
6038 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6039 host + mem_off + (i << msz));
6042 reg_off += 1 << esz;
6043 mem_off += N << msz;
6044 } while (reg_off <= reg_last && (reg_off & 63));
6048 * Use the slow path to manage the cross-page misalignment.
6049 * But we know this is RAM and cannot trap.
6051 mem_off = info.mem_off_split;
6052 if (unlikely(mem_off >= 0)) {
6053 reg_off = info.reg_off_split;
6054 for (i = 0; i < N; ++i) {
6055 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6056 addr + mem_off + (i << msz), retaddr);
6060 mem_off = info.mem_off_first[1];
6061 if (unlikely(mem_off >= 0)) {
6062 reg_off = info.reg_off_first[1];
6063 reg_last = info.reg_off_last[1];
6064 host = info.page[1].host;
6066 do {
6067 uint64_t pg = vg[reg_off >> 6];
6068 do {
6069 if ((pg >> (reg_off & 63)) & 1) {
6070 for (i = 0; i < N; ++i) {
6071 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6072 host + mem_off + (i << msz));
6075 reg_off += 1 << esz;
6076 mem_off += N << msz;
6077 } while (reg_off & 63);
6078 } while (reg_off <= reg_last);
6082 static inline QEMU_ALWAYS_INLINE
6083 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6084 uint32_t desc, const uintptr_t ra,
6085 const int esz, const int msz, const int N,
6086 sve_ldst1_host_fn *host_fn,
6087 sve_ldst1_tlb_fn *tlb_fn)
6089 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6090 int bit55 = extract64(addr, 55, 1);
6092 /* Remove mtedesc from the normal sve descriptor. */
6093 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6095 /* Perform gross MTE suppression early. */
6096 if (!tbi_check(desc, bit55) ||
6097 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6098 mtedesc = 0;
6101 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
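/*
 * The "gross MTE suppression" above: if TBI is disabled for this
 * address' bit-55 regime, tag checking cannot apply at all; if TCMA is
 * set and the logical tag is one of the unchecked values, every element
 * of the access passes trivially.  In either case mtedesc is cleared so
 * that sve_ldN_r() skips the per-element mte_check() calls, preserving
 * the invariant that !mtedesc implies !mte_active.
 */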
6104 #define DO_LD1_1(NAME, ESZ) \
6105 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
6106 target_ulong addr, uint32_t desc) \
6108 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
6109 sve_##NAME##_host, sve_##NAME##_tlb); \
6111 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
6112 target_ulong addr, uint32_t desc) \
6114 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
6115 sve_##NAME##_host, sve_##NAME##_tlb); \
6118 #define DO_LD1_2(NAME, ESZ, MSZ) \
6119 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
6120 target_ulong addr, uint32_t desc) \
6122 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
6123 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
6125 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
6126 target_ulong addr, uint32_t desc) \
6128 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
6129 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
6131 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6132 target_ulong addr, uint32_t desc) \
6134 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
6135 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
6137 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6138 target_ulong addr, uint32_t desc) \
6140 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
6141 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
6144 DO_LD1_1(ld1bb, MO_8)
6145 DO_LD1_1(ld1bhu, MO_16)
6146 DO_LD1_1(ld1bhs, MO_16)
6147 DO_LD1_1(ld1bsu, MO_32)
6148 DO_LD1_1(ld1bss, MO_32)
6149 DO_LD1_1(ld1bdu, MO_64)
6150 DO_LD1_1(ld1bds, MO_64)
6152 DO_LD1_2(ld1hh, MO_16, MO_16)
6153 DO_LD1_2(ld1hsu, MO_32, MO_16)
6154 DO_LD1_2(ld1hss, MO_32, MO_16)
6155 DO_LD1_2(ld1hdu, MO_64, MO_16)
6156 DO_LD1_2(ld1hds, MO_64, MO_16)
6158 DO_LD1_2(ld1ss, MO_32, MO_32)
6159 DO_LD1_2(ld1sdu, MO_64, MO_32)
6160 DO_LD1_2(ld1sds, MO_64, MO_32)
6162 DO_LD1_2(ld1dd, MO_64, MO_64)
6164 #undef DO_LD1_1
6165 #undef DO_LD1_2
6167 #define DO_LDN_1(N) \
6168 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
6169 target_ulong addr, uint32_t desc) \
6171 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
6172 sve_ld1bb_host, sve_ld1bb_tlb); \
6174 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
6175 target_ulong addr, uint32_t desc) \
6177 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
6178 sve_ld1bb_host, sve_ld1bb_tlb); \
6181 #define DO_LDN_2(N, SUFF, ESZ) \
6182 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
6183 target_ulong addr, uint32_t desc) \
6185 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
6186 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
6188 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
6189 target_ulong addr, uint32_t desc) \
6191 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
6192 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
6194 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
6195 target_ulong addr, uint32_t desc) \
6197 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
6198 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
6200 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
6201 target_ulong addr, uint32_t desc) \
6203 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
6204 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
6207 DO_LDN_1(2)
6208 DO_LDN_1(3)
6209 DO_LDN_1(4)
6211 DO_LDN_2(2, hh, MO_16)
6212 DO_LDN_2(3, hh, MO_16)
6213 DO_LDN_2(4, hh, MO_16)
6215 DO_LDN_2(2, ss, MO_32)
6216 DO_LDN_2(3, ss, MO_32)
6217 DO_LDN_2(4, ss, MO_32)
6219 DO_LDN_2(2, dd, MO_64)
6220 DO_LDN_2(3, dd, MO_64)
6221 DO_LDN_2(4, dd, MO_64)
6223 #undef DO_LDN_1
6224 #undef DO_LDN_2
6227 * Load contiguous data, first-fault and no-fault.
6229 * For user-only, one could argue that we should hold the mmap_lock during
6230 * the operation so that there is no race between page_check_range and the
6231 * load operation. However, unmapping pages out from under a running thread
6232 * is extraordinarily unlikely. This theoretical race condition also affects
6233 * linux-user/ in its get_user/put_user macros.
6235 * TODO: Construct some helpers, written in assembly, that interact with
6236 * handle_cpu_signal to produce memory ops which can properly report errors
6237 * without racing.
6240 /* Fault on byte I. All bits in FFR from I are cleared. The vector
6241 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
6242 * option, which leaves subsequent data unchanged.
6244 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
6246 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
6248 if (i & 63) {
6249 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
6250 i = ROUND_UP(i, 64);
6252 for (; i < oprsz; i += 64) {
6253 ffr[i / 64] = 0;
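/*
 * Example: record_fault(env, 5, oprsz) keeps only bits 0..4 of the first
 * FFR word (MAKE_64BIT_MASK(0, 5) == 0x1f), rounds i up to 64, and then
 * clears every subsequent word, so FFR marks just the elements that were
 * loaded before the faulting byte.
 */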
6258 * Common helper for all contiguous no-fault and first-fault loads.
6260 static inline QEMU_ALWAYS_INLINE
6261 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
6262 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
6263 const int esz, const int msz, const SVEContFault fault,
6264 sve_ldst1_host_fn *host_fn,
6265 sve_ldst1_tlb_fn *tlb_fn)
6267 const unsigned rd = simd_data(desc);
6268 void *vd = &env->vfp.zregs[rd];
6269 const intptr_t reg_max = simd_oprsz(desc);
6270 intptr_t reg_off, mem_off, reg_last;
6271 SVEContLdSt info;
6272 int flags;
6273 void *host;
6275 /* Find the active elements. */
6276 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
6277 /* The entire predicate was false; no load occurs. */
6278 memset(vd, 0, reg_max);
6279 return;
6281 reg_off = info.reg_off_first[0];
6283 /* Probe the page(s). */
6284 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
6285 /* Fault on first element. */
6286 tcg_debug_assert(fault == FAULT_NO);
6287 memset(vd, 0, reg_max);
6288 goto do_fault;
6291 mem_off = info.mem_off_first[0];
6292 flags = info.page[0].flags;
6295 * Disable MTE checking if the Tagged bit is not set. Since TBI must
6296 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6298 if (!arm_tlb_mte_tagged(&info.page[0].attrs)) {
6299 mtedesc = 0;
6302 if (fault == FAULT_FIRST) {
6303 /* Trapping mte check for the first-fault element. */
6304 if (mtedesc) {
6305 mte_check(env, mtedesc, addr + mem_off, retaddr);
6309 * Special handling of the first active element,
6310 * if it crosses a page boundary or is MMIO.
6312 bool is_split = mem_off == info.mem_off_split;
6313 if (unlikely(flags != 0) || unlikely(is_split)) {
6315 * Use the slow path for cross-page handling.
6316 * Might trap for MMIO or watchpoints.
6318 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6320 /* After any fault, zero the other elements. */
6321 swap_memzero(vd, reg_off);
6322 reg_off += 1 << esz;
6323 mem_off += 1 << msz;
6324 swap_memzero(vd + reg_off, reg_max - reg_off);
6326 if (is_split) {
6327 goto second_page;
6329 } else {
6330 memset(vd, 0, reg_max);
6332 } else {
6333 memset(vd, 0, reg_max);
6334 if (unlikely(mem_off == info.mem_off_split)) {
6335 /* The first active element crosses a page boundary. */
6336 flags |= info.page[1].flags;
6337 if (unlikely(flags & TLB_MMIO)) {
6338 /* Some page is MMIO, see below. */
6339 goto do_fault;
6341 if (unlikely(flags & TLB_WATCHPOINT) &&
6342 (cpu_watchpoint_address_matches
6343 (env_cpu(env), addr + mem_off, 1 << msz)
6344 & BP_MEM_READ)) {
6345 /* Watchpoint hit, see below. */
6346 goto do_fault;
6348 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6349 goto do_fault;
6352 * Use the slow path for cross-page handling.
6353 * This is RAM, without a watchpoint, and will not trap.
6355 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6356 goto second_page;
6361 * From this point on, all memory operations are MemSingleNF.
6363 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6364 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6366 * Unfortunately we do not have access to the memory attributes from the
6367 * PTE to tell Device memory from Normal memory. So we make a mostly
6368 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6369 * This gives the right answer for the common cases of "Normal memory,
6370 * backed by host RAM" and "Device memory, backed by MMIO".
6371 * The architecture allows us to suppress an NF load and return
6372 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6373 * case of "Normal memory, backed by MMIO" is permitted. The case we
6374 * get wrong is "Device memory, backed by host RAM", for which we
6375 * should return (UNKNOWN, FAULT) but do not.
6377 * Similarly, CPU_BP breakpoints would raise exceptions, and so
6378 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
6379 * architectural breakpoints the same.
6381 if (unlikely(flags & TLB_MMIO)) {
6382 goto do_fault;
6385 reg_last = info.reg_off_last[0];
6386 host = info.page[0].host;
6388 do {
6389 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6390 do {
6391 if ((pg >> (reg_off & 63)) & 1) {
6392 if (unlikely(flags & TLB_WATCHPOINT) &&
6393 (cpu_watchpoint_address_matches
6394 (env_cpu(env), addr + mem_off, 1 << msz)
6395 & BP_MEM_READ)) {
6396 goto do_fault;
6398 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6399 goto do_fault;
6401 host_fn(vd, reg_off, host + mem_off);
6403 reg_off += 1 << esz;
6404 mem_off += 1 << msz;
6405 } while (reg_off <= reg_last && (reg_off & 63));
6406 } while (reg_off <= reg_last);
6409 * MemSingleNF is allowed to fail for any reason. We have special
6410 * code above to handle the first element crossing a page boundary.
6411 * As an implementation choice, decline to handle a cross-page element
6412 * in any other position.
6414 reg_off = info.reg_off_split;
6415 if (reg_off >= 0) {
6416 goto do_fault;
6419 second_page:
6420 reg_off = info.reg_off_first[1];
6421 if (likely(reg_off < 0)) {
6422 /* No active elements on the second page. All done. */
6423 return;
6427 * MemSingleNF is allowed to fail for any reason. As an implementation
6428 * choice, decline to handle elements on the second page. This should
6429 * be low frequency as the guest walks through memory -- the next
6430 * iteration of the guest's loop should be aligned on the page boundary,
6431 * and then all following iterations will stay aligned.
6434 do_fault:
6435 record_fault(env, reg_off, reg_max);
6438 static inline QEMU_ALWAYS_INLINE
6439 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6440 uint32_t desc, const uintptr_t retaddr,
6441 const int esz, const int msz, const SVEContFault fault,
6442 sve_ldst1_host_fn *host_fn,
6443 sve_ldst1_tlb_fn *tlb_fn)
6445 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6446 int bit55 = extract64(addr, 55, 1);
6448 /* Remove mtedesc from the normal sve descriptor. */
6449 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6451 /* Perform gross MTE suppression early. */
6452 if (!tbi_check(desc, bit55) ||
6453 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6454 mtedesc = 0;
6457 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6458 esz, msz, fault, host_fn, tlb_fn);
6461 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
6462 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
6463 target_ulong addr, uint32_t desc) \
6465 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6466 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6468 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
6469 target_ulong addr, uint32_t desc) \
6471 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6472 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6474 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
6475 target_ulong addr, uint32_t desc) \
6477 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6478 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6480 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
6481 target_ulong addr, uint32_t desc) \
6483 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6484 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6487 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
6488 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
6489 target_ulong addr, uint32_t desc) \
6491 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6492 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6494 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
6495 target_ulong addr, uint32_t desc) \
6497 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6498 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6500 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
6501 target_ulong addr, uint32_t desc) \
6503 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6504 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6506 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
6507 target_ulong addr, uint32_t desc) \
6509 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6510 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6512 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6513 target_ulong addr, uint32_t desc) \
6515 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6516 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6518 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6519 target_ulong addr, uint32_t desc) \
6521 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6522 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6524 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6525 target_ulong addr, uint32_t desc) \
6527 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6528 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6530 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6531 target_ulong addr, uint32_t desc) \
6533 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6534 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6537 DO_LDFF1_LDNF1_1(bb, MO_8)
6538 DO_LDFF1_LDNF1_1(bhu, MO_16)
6539 DO_LDFF1_LDNF1_1(bhs, MO_16)
6540 DO_LDFF1_LDNF1_1(bsu, MO_32)
6541 DO_LDFF1_LDNF1_1(bss, MO_32)
6542 DO_LDFF1_LDNF1_1(bdu, MO_64)
6543 DO_LDFF1_LDNF1_1(bds, MO_64)
6545 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
6546 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6547 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6548 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6549 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6551 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
6552 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6553 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6555 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
6557 #undef DO_LDFF1_LDNF1_1
6558 #undef DO_LDFF1_LDNF1_2
6561 * Common helper for all contiguous 1,2,3,4-register predicated stores.
6564 static inline QEMU_ALWAYS_INLINE
6565 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6566 uint32_t desc, const uintptr_t retaddr,
6567 const int esz, const int msz, const int N, uint32_t mtedesc,
6568 sve_ldst1_host_fn *host_fn,
6569 sve_ldst1_tlb_fn *tlb_fn)
6571 const unsigned rd = simd_data(desc);
6572 const intptr_t reg_max = simd_oprsz(desc);
6573 intptr_t reg_off, reg_last, mem_off;
6574 SVEContLdSt info;
6575 void *host;
6576 int i, flags;
6578 /* Find the active elements. */
6579 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6580 /* The entire predicate was false; no store occurs. */
6581 return;
6584 /* Probe the page(s). Exit with exception for any invalid page. */
6585 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6587 /* Handle watchpoints for all active elements. */
6588 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6589 BP_MEM_WRITE, retaddr);
6592 * Handle mte checks for all active elements.
6593 * Since TBI must be set for MTE, !mtedesc => !mte_active.
6595 if (mtedesc) {
6596 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6597 mtedesc, retaddr);
6600 flags = info.page[0].flags | info.page[1].flags;
6601 if (unlikely(flags != 0)) {
6602 #ifdef CONFIG_USER_ONLY
6603 g_assert_not_reached();
6604 #else
6606 * At least one page includes MMIO.
6607 * Any bus operation can fail with cpu_transaction_failed,
6608 * which for ARM will raise SyncExternal. We cannot avoid
6609 * this fault and will leave with the store incomplete.
6611 mem_off = info.mem_off_first[0];
6612 reg_off = info.reg_off_first[0];
6613 reg_last = info.reg_off_last[1];
6614 if (reg_last < 0) {
6615 reg_last = info.reg_off_split;
6616 if (reg_last < 0) {
6617 reg_last = info.reg_off_last[0];
6621 do {
6622 uint64_t pg = vg[reg_off >> 6];
6623 do {
6624 if ((pg >> (reg_off & 63)) & 1) {
6625 for (i = 0; i < N; ++i) {
6626 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6627 addr + mem_off + (i << msz), retaddr);
6630 reg_off += 1 << esz;
6631 mem_off += N << msz;
6632 } while (reg_off & 63);
6633 } while (reg_off <= reg_last);
6634 return;
6635 #endif
6638 mem_off = info.mem_off_first[0];
6639 reg_off = info.reg_off_first[0];
6640 reg_last = info.reg_off_last[0];
6641 host = info.page[0].host;
6643 while (reg_off <= reg_last) {
6644 uint64_t pg = vg[reg_off >> 6];
6645 do {
6646 if ((pg >> (reg_off & 63)) & 1) {
6647 for (i = 0; i < N; ++i) {
6648 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6649 host + mem_off + (i << msz));
6652 reg_off += 1 << esz;
6653 mem_off += N << msz;
6654 } while (reg_off <= reg_last && (reg_off & 63));
6658 * Use the slow path to manage the cross-page misalignment.
6659 * But we know this is RAM and cannot trap.
6661 mem_off = info.mem_off_split;
6662 if (unlikely(mem_off >= 0)) {
6663 reg_off = info.reg_off_split;
6664 for (i = 0; i < N; ++i) {
6665 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6666 addr + mem_off + (i << msz), retaddr);
6670 mem_off = info.mem_off_first[1];
6671 if (unlikely(mem_off >= 0)) {
6672 reg_off = info.reg_off_first[1];
6673 reg_last = info.reg_off_last[1];
6674 host = info.page[1].host;
6676 do {
6677 uint64_t pg = vg[reg_off >> 6];
6678 do {
6679 if ((pg >> (reg_off & 63)) & 1) {
6680 for (i = 0; i < N; ++i) {
6681 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6682 host + mem_off + (i << msz));
6685 reg_off += 1 << esz;
6686 mem_off += N << msz;
6687 } while (reg_off & 63);
6688 } while (reg_off <= reg_last);
6692 static inline QEMU_ALWAYS_INLINE
6693 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6694 uint32_t desc, const uintptr_t ra,
6695 const int esz, const int msz, const int N,
6696 sve_ldst1_host_fn *host_fn,
6697 sve_ldst1_tlb_fn *tlb_fn)
6699 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6700 int bit55 = extract64(addr, 55, 1);
6702 /* Remove mtedesc from the normal sve descriptor. */
6703 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6705 /* Perform gross MTE suppression early. */
6706 if (!tbi_check(desc, bit55) ||
6707 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6708 mtedesc = 0;
6711 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6714 #define DO_STN_1(N, NAME, ESZ) \
6715 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
6716 target_ulong addr, uint32_t desc) \
6718 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
6719 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6721 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
6722 target_ulong addr, uint32_t desc) \
6724 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
6725 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6728 #define DO_STN_2(N, NAME, ESZ, MSZ) \
6729 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
6730 target_ulong addr, uint32_t desc) \
6732 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6733 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6735 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
6736 target_ulong addr, uint32_t desc) \
6738 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6739 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6741 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6742 target_ulong addr, uint32_t desc) \
6744 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6745 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6747 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6748 target_ulong addr, uint32_t desc) \
6750 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6751 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6754 DO_STN_1(1, bb, MO_8)
6755 DO_STN_1(1, bh, MO_16)
6756 DO_STN_1(1, bs, MO_32)
6757 DO_STN_1(1, bd, MO_64)
6758 DO_STN_1(2, bb, MO_8)
6759 DO_STN_1(3, bb, MO_8)
6760 DO_STN_1(4, bb, MO_8)
6762 DO_STN_2(1, hh, MO_16, MO_16)
6763 DO_STN_2(1, hs, MO_32, MO_16)
6764 DO_STN_2(1, hd, MO_64, MO_16)
6765 DO_STN_2(2, hh, MO_16, MO_16)
6766 DO_STN_2(3, hh, MO_16, MO_16)
6767 DO_STN_2(4, hh, MO_16, MO_16)
6769 DO_STN_2(1, ss, MO_32, MO_32)
6770 DO_STN_2(1, sd, MO_64, MO_32)
6771 DO_STN_2(2, ss, MO_32, MO_32)
6772 DO_STN_2(3, ss, MO_32, MO_32)
6773 DO_STN_2(4, ss, MO_32, MO_32)
6775 DO_STN_2(1, dd, MO_64, MO_64)
6776 DO_STN_2(2, dd, MO_64, MO_64)
6777 DO_STN_2(3, dd, MO_64, MO_64)
6778 DO_STN_2(4, dd, MO_64, MO_64)
6780 #undef DO_STN_1
6781 #undef DO_STN_2
6784 * Loads with a vector index.
6788 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6790 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6792 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6794 return *(uint32_t *)(reg + H1_4(reg_ofs));
6797 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6799 return *(int32_t *)(reg + H1_4(reg_ofs));
6802 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6804 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6807 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6809 return (int32_t)*(uint64_t *)(reg + reg_ofs);
6812 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6814 return *(uint64_t *)(reg + reg_ofs);
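/*
 * Naming of the offset extractors: "zsu" reads an unsigned 32-bit index,
 * "zss" a sign-extended 32-bit index, and "zd" a full 64-bit index; the
 * _s/_d suffix gives the element size of the index vector itself.  For
 * example off_zss_d() reads a 64-bit element but uses only its low 32
 * bits, sign-extended, as the offset.
 */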
6817 static inline QEMU_ALWAYS_INLINE
6818 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6819 target_ulong base, uint32_t desc, uintptr_t retaddr,
6820 uint32_t mtedesc, int esize, int msize,
6821 zreg_off_fn *off_fn,
6822 sve_ldst1_host_fn *host_fn,
6823 sve_ldst1_tlb_fn *tlb_fn)
6825 const int mmu_idx = cpu_mmu_index(env, false);
6826 const intptr_t reg_max = simd_oprsz(desc);
6827 const int scale = simd_data(desc);
6828 ARMVectorReg scratch;
6829 intptr_t reg_off;
6830 SVEHostPage info, info2;
6832 memset(&scratch, 0, reg_max);
6833 reg_off = 0;
6834 do {
6835 uint64_t pg = vg[reg_off >> 6];
6836 do {
6837 if (likely(pg & 1)) {
6838 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6839 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6841 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6842 mmu_idx, retaddr);
6844 if (likely(in_page >= msize)) {
6845 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6846 cpu_check_watchpoint(env_cpu(env), addr, msize,
6847 info.attrs, BP_MEM_READ, retaddr);
6849 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
6850 mte_check(env, mtedesc, addr, retaddr);
6852 host_fn(&scratch, reg_off, info.host);
6853 } else {
6854 /* Element crosses the page boundary. */
6855 sve_probe_page(&info2, false, env, addr + in_page, 0,
6856 MMU_DATA_LOAD, mmu_idx, retaddr);
6857 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6858 cpu_check_watchpoint(env_cpu(env), addr,
6859 msize, info.attrs,
6860 BP_MEM_READ, retaddr);
6862 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
6863 mte_check(env, mtedesc, addr, retaddr);
6865 tlb_fn(env, &scratch, reg_off, addr, retaddr);
6868 reg_off += esize;
6869 pg >>= esize;
6870 } while (reg_off & 63);
6871 } while (reg_off < reg_max);
6873 /* Wait until all exceptions have been raised to write back. */
6874 memcpy(vd, &scratch, reg_max);
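/*
 * Accumulating into the local scratch register and copying to vd only at
 * the end keeps the architectural destination unchanged if any element
 * faults part way through, and also makes it safe for the destination to
 * be the same vector register as the index operand @vm.
 */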
6877 static inline QEMU_ALWAYS_INLINE
6878 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6879 target_ulong base, uint32_t desc, uintptr_t retaddr,
6880 int esize, int msize, zreg_off_fn *off_fn,
6881 sve_ldst1_host_fn *host_fn,
6882 sve_ldst1_tlb_fn *tlb_fn)
6884 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6885 /* Remove mtedesc from the normal sve descriptor. */
6886 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6889 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6890 * offset base entirely over the address space hole to change the
6891 * pointer tag, or change the bit55 selector. So we could here
6892 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6894 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6895 esize, msize, off_fn, host_fn, tlb_fn);
6898 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6899 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6900 void *vm, target_ulong base, uint32_t desc) \
6902 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
6903 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6905 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6906 void *vm, target_ulong base, uint32_t desc) \
6908 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
6909 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6912 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6913 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6914 void *vm, target_ulong base, uint32_t desc) \
6916 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
6917 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6919 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6920 void *vm, target_ulong base, uint32_t desc) \
6922 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
6923 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6926 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6927 DO_LD1_ZPZ_S(bsu, zss, MO_8)
6928 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6929 DO_LD1_ZPZ_D(bdu, zss, MO_8)
6930 DO_LD1_ZPZ_D(bdu, zd, MO_8)
6932 DO_LD1_ZPZ_S(bss, zsu, MO_8)
6933 DO_LD1_ZPZ_S(bss, zss, MO_8)
6934 DO_LD1_ZPZ_D(bds, zsu, MO_8)
6935 DO_LD1_ZPZ_D(bds, zss, MO_8)
6936 DO_LD1_ZPZ_D(bds, zd, MO_8)
6938 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6939 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6940 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6941 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6942 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6944 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6945 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6946 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6947 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6948 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6950 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6951 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6952 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6953 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6954 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6956 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6957 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6958 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6959 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6960 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6962 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6963 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6964 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6965 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6966 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6968 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6969 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6970 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6971 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6972 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6974 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6975 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6976 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6978 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6979 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6980 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6982 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6983 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6984 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6986 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6987 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6988 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6990 #undef DO_LD1_ZPZ_S
6991 #undef DO_LD1_ZPZ_D
6993 /* First fault loads with a vector index. */
6996 * Common helpers for all gather first-faulting loads.
6999 static inline QEMU_ALWAYS_INLINE
7000 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7001 target_ulong base, uint32_t desc, uintptr_t retaddr,
7002 uint32_t mtedesc, const int esz, const int msz,
7003 zreg_off_fn *off_fn,
7004 sve_ldst1_host_fn *host_fn,
7005 sve_ldst1_tlb_fn *tlb_fn)
7007 const int mmu_idx = cpu_mmu_index(env, false);
7008 const intptr_t reg_max = simd_oprsz(desc);
7009 const int scale = simd_data(desc);
7010 const int esize = 1 << esz;
7011 const int msize = 1 << msz;
7012 intptr_t reg_off;
7013 SVEHostPage info;
7014 target_ulong addr, in_page;
7016 /* Skip to the first true predicate. */
7017 reg_off = find_next_active(vg, 0, reg_max, esz);
7018 if (unlikely(reg_off >= reg_max)) {
7019 /* The entire predicate was false; no load occurs. */
7020 memset(vd, 0, reg_max);
7021 return;
7025 * Probe the first element, allowing faults.
7027 addr = base + (off_fn(vm, reg_off) << scale);
7028 if (mtedesc) {
7029 mte_check(env, mtedesc, addr, retaddr);
7031 tlb_fn(env, vd, reg_off, addr, retaddr);
7033 /* After any fault, zero the other elements. */
7034 swap_memzero(vd, reg_off);
7035 reg_off += esize;
7036 swap_memzero(vd + reg_off, reg_max - reg_off);
7039 * Probe the remaining elements, not allowing faults.
7041 while (reg_off < reg_max) {
7042 uint64_t pg = vg[reg_off >> 6];
7043 do {
7044 if (likely((pg >> (reg_off & 63)) & 1)) {
7045 addr = base + (off_fn(vm, reg_off) << scale);
7046 in_page = -(addr | TARGET_PAGE_MASK);
7048 if (unlikely(in_page < msize)) {
7049 /* Stop if the element crosses a page boundary. */
7050 goto fault;
7053 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
7054 mmu_idx, retaddr);
7055 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
7056 goto fault;
7058 if (unlikely(info.flags & TLB_WATCHPOINT) &&
7059 (cpu_watchpoint_address_matches
7060 (env_cpu(env), addr, msize) & BP_MEM_READ)) {
7061 goto fault;
7063 if (mtedesc &&
7064 arm_tlb_mte_tagged(&info.attrs) &&
7065 !mte_probe(env, mtedesc, addr)) {
7066 goto fault;
7069 host_fn(vd, reg_off, info.host);
7071 reg_off += esize;
7072 } while (reg_off & 63);
7074 return;
7076 fault:
7077 record_fault(env, reg_off, reg_max);
7080 static inline QEMU_ALWAYS_INLINE
7081 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7082 target_ulong base, uint32_t desc, uintptr_t retaddr,
7083 const int esz, const int msz,
7084 zreg_off_fn *off_fn,
7085 sve_ldst1_host_fn *host_fn,
7086 sve_ldst1_tlb_fn *tlb_fn)
7088 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7089 /* Remove mtedesc from the normal sve descriptor. */
7090 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7093 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7094 * offset base entirely over the address space hole to change the
7095 * pointer tag, or change the bit55 selector. So we could here
7096 * examine TBI + TCMA like we do for sve_ldN_r_mte().
7098 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7099 esz, msz, off_fn, host_fn, tlb_fn);
7102 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
7103 void HELPER(sve_ldff##MEM##_##OFS) \
7104 (CPUARMState *env, void *vd, void *vg, \
7105 void *vm, target_ulong base, uint32_t desc) \
7107 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
7108 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7110 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
7111 (CPUARMState *env, void *vd, void *vg, \
7112 void *vm, target_ulong base, uint32_t desc) \
7114 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
7115 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7118 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
7119 void HELPER(sve_ldff##MEM##_##OFS) \
7120 (CPUARMState *env, void *vd, void *vg, \
7121 void *vm, target_ulong base, uint32_t desc) \
7123 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
7124 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7126 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
7127 (CPUARMState *env, void *vd, void *vg, \
7128 void *vm, target_ulong base, uint32_t desc) \
7130 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
7131 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7134 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
7135 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
7136 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
7137 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
7138 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
7140 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
7141 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
7142 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
7143 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
7144 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
7146 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
7147 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
7148 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
7149 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
7150 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
7152 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
7153 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
7154 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
7155 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
7156 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
7158 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
7159 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
7160 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
7161 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
7162 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
7164 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
7165 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
7166 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
7167 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
7168 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
7170 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
7171 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
7172 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
7173 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
7174 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
7176 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
7177 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
7178 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
7179 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
7180 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
7182 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
7183 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
7184 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
7186 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
7187 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
7188 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
7190 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
7191 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
7192 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
7194 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
7195 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
7196 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
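/*
 * For reference, DO_LDFF1_ZPZ_S(bsu, zsu, MO_8) above expands to roughly
 * the following pair of helpers (modulo formatting):
 *
 *   void HELPER(sve_ldffbsu_zsu)(CPUARMState *env, void *vd, void *vg,
 *                                void *vm, target_ulong base, uint32_t desc)
 *   {
 *       sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MO_8,
 *                   off_zsu_s, sve_ld1bsu_host, sve_ld1bsu_tlb);
 *   }
 *
 *   void HELPER(sve_ldffbsu_zsu_mte)(CPUARMState *env, void *vd, void *vg,
 *                                    void *vm, target_ulong base,
 *                                    uint32_t desc)
 *   {
 *       sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MO_8,
 *                       off_zsu_s, sve_ld1bsu_host, sve_ld1bsu_tlb);
 *   }
 */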
7198 /* Stores with a vector index. */
7200 static inline QEMU_ALWAYS_INLINE
7201 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7202 target_ulong base, uint32_t desc, uintptr_t retaddr,
7203 uint32_t mtedesc, int esize, int msize,
7204 zreg_off_fn *off_fn,
7205 sve_ldst1_host_fn *host_fn,
7206 sve_ldst1_tlb_fn *tlb_fn)
7208 const int mmu_idx = cpu_mmu_index(env, false);
7209 const intptr_t reg_max = simd_oprsz(desc);
7210 const int scale = simd_data(desc);
7211 void *host[ARM_MAX_VQ * 4];
7212 intptr_t reg_off, i;
7213 SVEHostPage info, info2;
7215 /*
7216 * Probe all of the elements for host addresses and flags.
7217 */
7218 i = reg_off = 0;
7219 do {
7220 uint64_t pg = vg[reg_off >> 6];
7221 do {
7222 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7223 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
7225 host[i] = NULL;
7226 if (likely((pg >> (reg_off & 63)) & 1)) {
7227 if (likely(in_page >= msize)) {
7228 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
7229 mmu_idx, retaddr);
7230 host[i] = info.host;
7231 } else {
7232 /*
7233 * Element crosses the page boundary.
7234 * Probe both pages, but do not record the host address,
7235 * so that we use the slow path.
7236 */
7237 sve_probe_page(&info, false, env, addr, 0,
7238 MMU_DATA_STORE, mmu_idx, retaddr);
7239 sve_probe_page(&info2, false, env, addr + in_page, 0,
7240 MMU_DATA_STORE, mmu_idx, retaddr);
7241 info.flags |= info2.flags;
7244 if (unlikely(info.flags & TLB_WATCHPOINT)) {
7245 cpu_check_watchpoint(env_cpu(env), addr, msize,
7246 info.attrs, BP_MEM_WRITE, retaddr);
7249 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
7250 mte_check(env, mtedesc, addr, retaddr);
7253 i += 1;
7254 reg_off += esize;
7255 } while (reg_off & 63);
7256 } while (reg_off < reg_max);
7258 /*
7259 * Now that we have recognized all exceptions except SyncExternal
7260 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
7261 *
7262 * Note for the common case of an element in RAM, not crossing a page
7263 * boundary, we have stored the host address in host[]. This doubles
7264 * as a first-level check against the predicate, since only enabled
7265 * elements have non-null host addresses.
7266 */
7267 i = reg_off = 0;
7268 do {
7269 void *h = host[i];
7270 if (likely(h != NULL)) {
7271 host_fn(vd, reg_off, h);
7272 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
7273 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7274 tlb_fn(env, vd, reg_off, addr, retaddr);
7276 i += 1;
7277 reg_off += esize;
7278 } while (reg_off < reg_max);
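/*
 * A minimal sketch of the probe-then-commit structure of sve_st1_z()
 * above, for documentation only.  "index" and "mem_len" are hypothetical
 * stand-ins for the scaled vector offsets and the probe step; the point
 * is that every active element is validated before any element is
 * written, so a faulting scatter store never leaves memory partially
 * updated.
 */
static G_GNUC_UNUSED bool sve_st_gather_model(uint64_t *mem, size_t mem_len,
                                              const uint64_t *src,
                                              const size_t *index,
                                              const bool *pred, int nelem)
{
    /* Pass 1: probe every active element, reporting a fault up front. */
    for (int i = 0; i < nelem; i++) {
        if (pred[i] && index[i] >= mem_len) {
            return false;       /* the real code raises the exception here */
        }
    }
    /* Pass 2: all addresses are known good, perform the stores. */
    for (int i = 0; i < nelem; i++) {
        if (pred[i]) {
            mem[index[i]] = src[i];
        }
    }
    return true;
}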
7281 static inline QEMU_ALWAYS_INLINE
7282 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7283 target_ulong base, uint32_t desc, uintptr_t retaddr,
7284 int esize, int msize, zreg_off_fn *off_fn,
7285 sve_ldst1_host_fn *host_fn,
7286 sve_ldst1_tlb_fn *tlb_fn)
7288 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7289 /* Remove mtedesc from the normal sve descriptor. */
7290 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7292 /*
7293 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7294 * offset base entirely over the address space hole to change the
7295 * pointer tag, or change the bit55 selector. So we could here
7296 * examine TBI + TCMA like we do for sve_ldN_r_mte().
7297 */
7298 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7299 esize, msize, off_fn, host_fn, tlb_fn);
7302 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
7303 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7304 void *vm, target_ulong base, uint32_t desc) \
7306 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
7307 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7309 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7310 void *vm, target_ulong base, uint32_t desc) \
7312 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
7313 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7316 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
7317 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7318 void *vm, target_ulong base, uint32_t desc) \
7320 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
7321 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7323 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7324 void *vm, target_ulong base, uint32_t desc) \
7326 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
7327 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7330 DO_ST1_ZPZ_S(bs, zsu, MO_8)
7331 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7332 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7333 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7334 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7336 DO_ST1_ZPZ_S(bs, zss, MO_8)
7337 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7338 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7339 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7340 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7342 DO_ST1_ZPZ_D(bd, zsu, MO_8)
7343 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7344 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7345 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7346 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7347 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7348 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7350 DO_ST1_ZPZ_D(bd, zss, MO_8)
7351 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7352 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7353 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7354 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7355 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7356 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7358 DO_ST1_ZPZ_D(bd, zd, MO_8)
7359 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7360 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7361 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7362 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7363 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7364 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7366 #undef DO_ST1_ZPZ_S
7367 #undef DO_ST1_ZPZ_D
7369 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7371 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7372 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7374 for (i = 0; i < opr_sz; ++i) {
7375 d[i] = n[i] ^ m[i] ^ k[i];
7379 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7381 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7382 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7384 for (i = 0; i < opr_sz; ++i) {
7385 d[i] = n[i] ^ (m[i] & ~k[i]);
7389 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7391 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7392 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7394 for (i = 0; i < opr_sz; ++i) {
7395 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7399 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7401 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7402 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7404 for (i = 0; i < opr_sz; ++i) {
7405 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7409 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7411 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7412 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7414 for (i = 0; i < opr_sz; ++i) {
7415 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
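/*
 * An illustrative identity for the BSL family above (function name is
 * ours, not part of the instruction set): by De Morgan, NBSL is exactly
 * BSL applied to the complemented data operands, which is why the
 * helpers differ only in where the inversions sit.
 */
static G_GNUC_UNUSED bool sve2_nbsl_is_bsl_of_inverses(uint64_t n, uint64_t m,
                                                       uint64_t k)
{
    uint64_t bsl_inv = (~n & k) | (~m & ~k);    /* BSL of ~n, ~m */
    uint64_t nbsl = ~((n & k) | (m & ~k));      /* NBSL as computed above */

    return bsl_inv == nbsl;                     /* always true */
}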
7419 /*
7420 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
7421 * See hasless(v,1) from
7422 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
7423 */
7424 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
7426 int bits = 8 << esz;
7427 uint64_t ones = dup_const(esz, 1);
7428 uint64_t signs = ones << (bits - 1);
7429 uint64_t cmp0, cmp1;
7431 cmp1 = dup_const(esz, n);
7432 cmp0 = cmp1 ^ m0;
7433 cmp1 = cmp1 ^ m1;
7434 cmp0 = (cmp0 - ones) & ~cmp0;
7435 cmp1 = (cmp1 - ones) & ~cmp1;
7436 return (cmp0 | cmp1) & signs;
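/*
 * Illustrative sketch of the zero-in-word trick used by do_match2():
 * XORing the searched byte into a word turns matching lanes into 0x00,
 * and (x - 0x01..01) & ~x & 0x80..80 is nonzero iff some byte of x is
 * zero.  The function name is ours; the real code applies the same
 * steps per element size via dup_const().
 */
static G_GNUC_UNUSED bool word_contains_byte(uint64_t word, uint8_t c)
{
    uint64_t ones = 0x0101010101010101ull;
    uint64_t signs = 0x8080808080808080ull;
    uint64_t x = word ^ (ones * c);     /* matching bytes become 0x00 */

    return ((x - ones) & ~x & signs) != 0;
}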
7439 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7440 uint32_t desc, int esz, bool nmatch)
7442 uint16_t esz_mask = pred_esz_masks[esz];
7443 intptr_t opr_sz = simd_oprsz(desc);
7444 uint32_t flags = PREDTEST_INIT;
7445 intptr_t i, j, k;
7447 for (i = 0; i < opr_sz; i += 16) {
7448 uint64_t m0 = *(uint64_t *)(vm + i);
7449 uint64_t m1 = *(uint64_t *)(vm + i + 8);
7450 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7451 uint16_t out = 0;
7453 for (j = 0; j < 16; j += 8) {
7454 uint64_t n = *(uint64_t *)(vn + i + j);
7456 for (k = 0; k < 8; k += 1 << esz) {
7457 if (pg & (1 << (j + k))) {
7458 bool o = do_match2(n >> (k * 8), m0, m1, esz);
7459 out |= (o ^ nmatch) << (j + k);
7463 *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7464 flags = iter_predtest_fwd(out, pg, flags);
7466 return flags;
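/*
 * Reference model for the predicate produced by do_match() above
 * (byte elements, illustrative only): an active element of the first
 * source sets its result bit when its value occurs anywhere within the
 * corresponding 16-byte segment of the second source (inverted for
 * NMATCH).  The function name and parameters are ours.
 */
static G_GNUC_UNUSED bool match_model_b(const uint8_t *n, const uint8_t *m,
                                        int seg, int elem)
{
    const uint8_t *mseg = m + seg * 16;
    uint8_t nn = n[seg * 16 + elem];

    for (int k = 0; k < 16; k++) {
        if (mseg[k] == nn) {
            return true;
        }
    }
    return false;
}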
7469 #define DO_PPZZ_MATCH(NAME, ESZ, INV) \
7470 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
7472 return do_match(vd, vn, vm, vg, desc, ESZ, INV); \
7475 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
7476 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
7478 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
7479 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
7481 #undef DO_PPZZ_MATCH
7483 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
7484 uint32_t desc)
7486 ARMVectorReg scratch;
7487 intptr_t i, j;
7488 intptr_t opr_sz = simd_oprsz(desc);
7489 uint32_t *d = vd, *n = vn, *m = vm;
7490 uint8_t *pg = vg;
7492 if (d == n) {
7493 n = memcpy(&scratch, n, opr_sz);
7494 if (d == m) {
7495 m = n;
7497 } else if (d == m) {
7498 m = memcpy(&scratch, m, opr_sz);
7501 for (i = 0; i < opr_sz; i += 4) {
7502 uint64_t count = 0;
7503 uint8_t pred;
7505 pred = pg[H1(i >> 3)] >> (i & 7);
7506 if (pred & 1) {
7507 uint32_t nn = n[H4(i >> 2)];
7509 for (j = 0; j <= i; j += 4) {
7510 pred = pg[H1(j >> 3)] >> (j & 7);
7511 if ((pred & 1) && nn == m[H4(j >> 2)]) {
7512 ++count;
7516 d[H4(i >> 2)] = count;
7520 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
7521 uint32_t desc)
7523 ARMVectorReg scratch;
7524 intptr_t i, j;
7525 intptr_t opr_sz = simd_oprsz(desc);
7526 uint64_t *d = vd, *n = vn, *m = vm;
7527 uint8_t *pg = vg;
7529 if (d == n) {
7530 n = memcpy(&scratch, n, opr_sz);
7531 if (d == m) {
7532 m = n;
7534 } else if (d == m) {
7535 m = memcpy(&scratch, m, opr_sz);
7538 for (i = 0; i < opr_sz / 8; ++i) {
7539 uint64_t count = 0;
7540 if (pg[H1(i)] & 1) {
7541 uint64_t nn = n[i];
7542 for (j = 0; j <= i; ++j) {
7543 if ((pg[H1(j)] & 1) && nn == m[j]) {
7544 ++count;
7548 d[i] = count;
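/*
 * Worked example for the HISTCNT helpers above (illustrative only):
 * with both sources equal to {1, 2, 1, 1} and an all-true predicate,
 * element i counts the occurrences of its own value among elements
 * 0..i, giving {1, 1, 2, 3}.
 */
static G_GNUC_UNUSED bool sve2_histcnt_worked_example(void)
{
    static const uint64_t src[4] = { 1, 2, 1, 1 };
    static const uint64_t expect[4] = { 1, 1, 2, 3 };

    for (int i = 0; i < 4; i++) {
        uint64_t count = 0;
        for (int j = 0; j <= i; j++) {
            count += (src[j] == src[i]);
        }
        if (count != expect[i]) {
            return false;
        }
    }
    return true;
}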
7552 /*
7553 * Returns the number of bytes in m0 and m1 that match n.
7554 * Unlike do_match2 we don't just need true/false, we need an exact count.
7555 * This requires two extra logical operations.
7556 */
7557 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
7559 const uint64_t mask = dup_const(MO_8, 0x7f);
7560 uint64_t cmp0, cmp1;
7562 cmp1 = dup_const(MO_8, n);
7563 cmp0 = cmp1 ^ m0;
7564 cmp1 = cmp1 ^ m1;
7566 /*
7567 * 1: clear msb of each byte to avoid carry to next byte (& mask)
7568 * 2: carry in to msb if byte != 0 (+ mask)
7569 * 3: set msb if cmp has msb set (| cmp)
7570 * 4: set ~msb to ignore them (| mask)
7571 * We now have 0xff for byte != 0 or 0x7f for byte == 0.
7572 * 5: invert, resulting in 0x80 if and only if byte == 0.
7573 */
7574 cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
7575 cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
7577 /*
7578 * Combine the two compares in a way that the bits do
7579 * not overlap, and so preserves the count of set bits.
7580 * If the host has an efficient instruction for ctpop,
7581 * then ctpop(x) + ctpop(y) has the same number of
7582 * operations as ctpop(x | (y >> 1)). If the host does
7583 * not have an efficient ctpop, then we only want to
7584 * use it once.
7585 */
7586 return ctpop64(cmp0 | (cmp1 >> 1));
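/*
 * Sketch of why the single ctpop in do_histseg_cnt() is exact
 * (illustrative only): cmp0 and cmp1 can only have bits at the byte
 * msb positions, so shifting cmp1 right by one moves its bits onto
 * positions disjoint from cmp0 and one population count covers both.
 */
static G_GNUC_UNUSED bool histseg_single_ctpop_is_exact(uint64_t cmp0,
                                                        uint64_t cmp1)
{
    const uint64_t msbs = 0x8080808080808080ull;

    /* Restrict to the byte-msb bits, as steps 1-5 above guarantee. */
    cmp0 &= msbs;
    cmp1 &= msbs;

    return ctpop64(cmp0 | (cmp1 >> 1)) == ctpop64(cmp0) + ctpop64(cmp1);
}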
7589 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
7591 intptr_t i, j;
7592 intptr_t opr_sz = simd_oprsz(desc);
7594 for (i = 0; i < opr_sz; i += 16) {
7595 uint64_t n0 = *(uint64_t *)(vn + i);
7596 uint64_t m0 = *(uint64_t *)(vm + i);
7597 uint64_t n1 = *(uint64_t *)(vn + i + 8);
7598 uint64_t m1 = *(uint64_t *)(vm + i + 8);
7599 uint64_t out0 = 0;
7600 uint64_t out1 = 0;
7602 for (j = 0; j < 64; j += 8) {
7603 uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
7604 uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
7605 out0 |= cnt0 << j;
7606 out1 |= cnt1 << j;
7609 *(uint64_t *)(vd + i) = out0;
7610 *(uint64_t *)(vd + i + 8) = out1;
7614 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7616 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7617 int shr = simd_data(desc);
7618 int shl = 8 - shr;
7619 uint64_t mask = dup_const(MO_8, 0xff >> shr);
7620 uint64_t *d = vd, *n = vn, *m = vm;
7622 for (i = 0; i < opr_sz; ++i) {
7623 uint64_t t = n[i] ^ m[i];
7624 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7628 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7630 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7631 int shr = simd_data(desc);
7632 int shl = 16 - shr;
7633 uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7634 uint64_t *d = vd, *n = vn, *m = vm;
7636 for (i = 0; i < opr_sz; ++i) {
7637 uint64_t t = n[i] ^ m[i];
7638 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7642 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7644 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7645 int shr = simd_data(desc);
7646 uint32_t *d = vd, *n = vn, *m = vm;
7648 for (i = 0; i < opr_sz; ++i) {
7649 d[i] = ror32(n[i] ^ m[i], shr);
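/*
 * Sketch of the vectorized rotate used by the byte and halfword XAR
 * helpers above (illustrative only, byte case, assuming 1 <= shr <= 8):
 * shifting the whole 64-bit word both ways and masking off the bits
 * that crossed a lane boundary is the same as rotating each byte of
 * n ^ m independently, which is what XAR requires.
 */
static G_GNUC_UNUSED bool xar_bytewise_ror_matches(uint64_t t, int shr)
{
    int shl = 8 - shr;
    uint64_t mask = dup_const(MO_8, 0xff >> shr);
    uint64_t vec = ((t >> shr) & mask) | ((t << shl) & ~mask);
    uint64_t ref = 0;

    for (int i = 0; i < 64; i += 8) {
        uint8_t b = t >> i;
        /* Rotate this byte on its own and place it back in its lane. */
        ref |= (uint64_t)(uint8_t)((b >> shr) | (b << shl)) << i;
    }
    return vec == ref;
}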
7653 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
7654 void *status, uint32_t desc)
7656 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
7658 for (s = 0; s < opr_sz; ++s) {
7659 float32 *n = vn + s * sizeof(float32) * 4;
7660 float32 *m = vm + s * sizeof(float32) * 4;
7661 float32 *a = va + s * sizeof(float32) * 4;
7662 float32 *d = vd + s * sizeof(float32) * 4;
7663 float32 n00 = n[H4(0)], n01 = n[H4(1)];
7664 float32 n10 = n[H4(2)], n11 = n[H4(3)];
7665 float32 m00 = m[H4(0)], m01 = m[H4(1)];
7666 float32 m10 = m[H4(2)], m11 = m[H4(3)];
7667 float32 p0, p1;
7669 /* i = 0, j = 0 */
7670 p0 = float32_mul(n00, m00, status);
7671 p1 = float32_mul(n01, m01, status);
7672 d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
7674 /* i = 0, j = 1 */
7675 p0 = float32_mul(n00, m10, status);
7676 p1 = float32_mul(n01, m11, status);
7677 d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
7679 /* i = 1, j = 0 */
7680 p0 = float32_mul(n10, m00, status);
7681 p1 = float32_mul(n11, m01, status);
7682 d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
7684 /* i = 1, j = 1 */
7685 p0 = float32_mul(n10, m10, status);
7686 p1 = float32_mul(n11, m11, status);
7687 d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
7691 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
7692 void *status, uint32_t desc)
7694 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
7696 for (s = 0; s < opr_sz; ++s) {
7697 float64 *n = vn + s * sizeof(float64) * 4;
7698 float64 *m = vm + s * sizeof(float64) * 4;
7699 float64 *a = va + s * sizeof(float64) * 4;
7700 float64 *d = vd + s * sizeof(float64) * 4;
7701 float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
7702 float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
7703 float64 p0, p1;
7705 /* i = 0, j = 0 */
7706 p0 = float64_mul(n00, m00, status);
7707 p1 = float64_mul(n01, m01, status);
7708 d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
7710 /* i = 0, j = 1 */
7711 p0 = float64_mul(n00, m10, status);
7712 p1 = float64_mul(n01, m11, status);
7713 d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
7715 /* i = 1, j = 0 */
7716 p0 = float64_mul(n10, m00, status);
7717 p1 = float64_mul(n11, m01, status);
7718 d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
7720 /* i = 1, j = 1 */
7721 p0 = float64_mul(n10, m10, status);
7722 p1 = float64_mul(n11, m11, status);
7723 d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
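/*
 * Plain-C reference for the FMMLA helpers above (illustrative only):
 * each vector segment holds 2x2 row-major matrices and the operation is
 * D = A + N * M^T.  Host double arithmetic replaces softfloat here, so
 * rounding and NaN behaviour is not authoritative.
 */
static G_GNUC_UNUSED void fmmla_2x2_model(double d[2][2],
                                          const double n[2][2],
                                          const double m[2][2],
                                          const double a[2][2])
{
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            /* Row i of N dotted with row j of M, i.e. column j of M^T. */
            d[i][j] = a[i][j] + n[i][0] * m[j][0] + n[i][1] * m[j][1];
        }
    }
}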
7727 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
7728 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
7730 intptr_t i = simd_oprsz(desc); \
7731 uint64_t *g = vg; \
7732 do { \
7733 uint64_t pg = g[(i - 1) >> 6]; \
7734 do { \
7735 i -= sizeof(TYPEW); \
7736 if (likely((pg >> (i & 63)) & 1)) { \
7737 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
7738 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \
7740 } while (i & 63); \
7741 } while (i != 0); \
7744 DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
7745 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
7746 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
7748 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \
7749 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
7751 intptr_t i = simd_oprsz(desc); \
7752 uint64_t *g = vg; \
7753 do { \
7754 uint64_t pg = g[(i - 1) >> 6]; \
7755 do { \
7756 i -= sizeof(TYPEW); \
7757 if (likely((pg >> (i & 63)) & 1)) { \
7758 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \
7759 *(TYPEW *)(vd + HW(i)) = OP(nn, status); \
7761 } while (i & 63); \
7762 } while (i != 0); \
7765 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
7766 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
7768 #undef DO_FCVTLT
7769 #undef DO_FCVTNT
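/*
 * Reference model of the element layout produced by DO_FCVTNT above
 * (illustrative only, little-endian layout, double-to-single case, with
 * the float conversion replaced by a plain truncation): each active
 * wide element is narrowed into the high half of its slot, i.e. the
 * odd-numbered narrow elements, leaving the even-numbered ones intact.
 * DO_FCVTLT is the inverse, widening from those odd-numbered elements.
 */
static G_GNUC_UNUSED void fcvtnt_layout_model(uint32_t *d, const uint64_t *n,
                                              const bool *pred, int nwide)
{
    for (int i = 0; i < nwide; i++) {
        if (pred[i]) {
            d[2 * i + 1] = (uint32_t)n[i];  /* stand-in for float64_to_float32 */
        }
    }
}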