target/arm: Implement SVE2 complex integer dot product
[qemu.git] / target/arm/sve_helper.c
blob f9c20612602e466ae5191f369a2b607eadad5323
1 /*
2 * ARM SVE Operations
4 * Copyright (c) 2018 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/cpu_ldst.h"
25 #include "exec/helper-proto.h"
26 #include "tcg/tcg-gvec-desc.h"
27 #include "fpu/softfloat.h"
28 #include "tcg/tcg.h"
29 #include "vec_internal.h"
32 /* Note that vector data is stored in host-endian 64-bit chunks,
33 so addressing units smaller than that needs a host-endian fixup. */
34 #ifdef HOST_WORDS_BIGENDIAN
35 #define H1(x) ((x) ^ 7)
36 #define H1_2(x) ((x) ^ 6)
37 #define H1_4(x) ((x) ^ 4)
38 #define H2(x) ((x) ^ 3)
39 #define H4(x) ((x) ^ 1)
40 #else
41 #define H1(x) (x)
42 #define H1_2(x) (x)
43 #define H1_4(x) (x)
44 #define H2(x) (x)
45 #define H4(x) (x)
46 #endif
48 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
50 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
51 * and bit 0 set if C is set. Compare the definitions of these variables
52 * within CPUARMState.
55 /* For no G bits set, NZCV = C. */
56 #define PREDTEST_INIT 1
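/*
 * Worked example: with G = 0x15 (predicate bits 0, 2 and 4 set) and
 * D = 0x04, the lowest active bit of D is clear so N = 0, D & G is
 * non-zero so Z is clear (bit 1 of the result set), and the highest
 * active bit of D is clear so C is set (bit 0 of the result set).
 */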
58 /* This is an iterative function, called for each Pd and Pg word
59 * moving forward.
61 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
63 if (likely(g)) {
64 /* Compute N from first D & G.
65 Use bit 2 to signal first G bit seen. */
66 if (!(flags & 4)) {
67 flags |= ((d & (g & -g)) != 0) << 31;
68 flags |= 4;
71 /* Accumulate Z from each D & G. */
72 flags |= ((d & g) != 0) << 1;
74 /* Compute C from last !(D & G). Replace previous. */
75 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
77 return flags;
80 /* This is an iterative function, called for each Pd and Pg word
81 * moving backward.
83 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
85 if (likely(g)) {
86 /* Compute C from first (i.e. last) !(D & G).
87 Use bit 2 to signal first G bit seen. */
88 if (!(flags & 4)) {
89 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
90 flags |= (d & pow2floor(g)) == 0;
93 /* Accumulate Z from each D & G. */
94 flags |= ((d & g) != 0) << 1;
96 /* Compute N from last (i.e. first) D & G. Replace previous. */
97 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
99 return flags;
102 /* The same for a single word predicate. */
103 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
105 return iter_predtest_fwd(d, g, PREDTEST_INIT);
108 /* The same for a multi-word predicate. */
109 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
111 uint32_t flags = PREDTEST_INIT;
112 uint64_t *d = vd, *g = vg;
113 uintptr_t i = 0;
115 do {
116 flags = iter_predtest_fwd(d[i], g[i], flags);
117 } while (++i < words);
119 return flags;
122 /* Expand active predicate bits to bytes, for byte elements.
123 * for (i = 0; i < 256; ++i) {
124 * unsigned long m = 0;
125 * for (j = 0; j < 8; j++) {
126 * if ((i >> j) & 1) {
127 * m |= 0xfful << (j << 3);
130 * printf("0x%016lx,\n", m);
133 static inline uint64_t expand_pred_b(uint8_t byte)
135 static const uint64_t word[256] = {
136 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
137 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
138 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
139 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
140 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
141 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
142 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
143 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
144 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
145 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
146 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
147 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
148 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
149 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
150 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
151 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
152 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
153 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
154 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
155 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
156 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
157 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
158 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
159 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
160 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
161 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
162 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
163 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
164 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
165 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
166 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
167 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
168 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
169 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
170 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
171 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
172 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
173 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
174 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
175 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
176 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
177 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
178 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
179 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
180 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
181 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
182 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
183 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
184 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
185 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
186 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
187 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
188 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
189 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
190 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
191 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
192 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
193 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
194 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
195 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
196 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
197 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
198 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
199 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
200 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
201 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
202 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
203 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
204 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
205 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
206 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
207 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
208 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
209 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
210 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
211 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
212 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
213 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
214 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
215 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
216 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
217 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
218 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
219 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
220 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
221 0xffffffffffffffff,
223 return word[byte];
226 /* Similarly for half-word elements.
227 * for (i = 0; i < 256; ++i) {
228 * unsigned long m = 0;
229 * if (i & 0xaa) {
230 * continue;
232 * for (j = 0; j < 8; j += 2) {
233 * if ((i >> j) & 1) {
234 * m |= 0xfffful << (j << 3);
237 * printf("[0x%x] = 0x%016lx,\n", i, m);
240 static inline uint64_t expand_pred_h(uint8_t byte)
242 static const uint64_t word[] = {
243 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
244 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
245 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
246 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
247 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
248 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
249 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
250 [0x55] = 0xffffffffffffffff,
252 return word[byte & 0x55];
255 /* Similarly for single word elements. */
256 static inline uint64_t expand_pred_s(uint8_t byte)
258 static const uint64_t word[] = {
259 [0x01] = 0x00000000ffffffffull,
260 [0x10] = 0xffffffff00000000ull,
261 [0x11] = 0xffffffffffffffffull,
263 return word[byte & 0x11];
266 /* Swap 16-bit words within a 32-bit word. */
267 static inline uint32_t hswap32(uint32_t h)
269 return rol32(h, 16);
272 /* Swap 16-bit words within a 64-bit word. */
273 static inline uint64_t hswap64(uint64_t h)
275 uint64_t m = 0x0000ffff0000ffffull;
276 h = rol64(h, 32);
277 return ((h & m) << 16) | ((h >> 16) & m);
280 /* Swap 32-bit words within a 64-bit word. */
281 static inline uint64_t wswap64(uint64_t h)
283 return rol64(h, 32);
286 #define LOGICAL_PPPP(NAME, FUNC) \
287 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
289 uintptr_t opr_sz = simd_oprsz(desc); \
290 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
291 uintptr_t i; \
292 for (i = 0; i < opr_sz / 8; ++i) { \
293 d[i] = FUNC(n[i], m[i], g[i]); \
297 #define DO_AND(N, M, G) (((N) & (M)) & (G))
298 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
299 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
300 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
301 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
302 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
303 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
304 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
306 LOGICAL_PPPP(sve_and_pppp, DO_AND)
307 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
308 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
309 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
310 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
311 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
312 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
313 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
315 #undef DO_AND
316 #undef DO_BIC
317 #undef DO_EOR
318 #undef DO_ORR
319 #undef DO_ORN
320 #undef DO_NOR
321 #undef DO_NAND
322 #undef DO_SEL
323 #undef LOGICAL_PPPP
325 /* Fully general three-operand expander, controlled by a predicate.
326 * This is complicated by the host-endian storage of the register file.
328 /* ??? I don't expect the compiler could ever vectorize this itself.
329 * With some tables we can convert bit masks to byte masks, and with
330 * extra care wrt byte/word ordering we could use gcc generic vectors
331 * and do 16 bytes at a time.
333 #define DO_ZPZZ(NAME, TYPE, H, OP) \
334 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
336 intptr_t i, opr_sz = simd_oprsz(desc); \
337 for (i = 0; i < opr_sz; ) { \
338 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
339 do { \
340 if (pg & 1) { \
341 TYPE nn = *(TYPE *)(vn + H(i)); \
342 TYPE mm = *(TYPE *)(vm + H(i)); \
343 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
345 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
346 } while (i & 15); \
350 /* Similarly, specialized for 64-bit operands. */
351 #define DO_ZPZZ_D(NAME, TYPE, OP) \
352 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
354 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
355 TYPE *d = vd, *n = vn, *m = vm; \
356 uint8_t *pg = vg; \
357 for (i = 0; i < opr_sz; i += 1) { \
358 if (pg[H1(i)] & 1) { \
359 TYPE nn = n[i], mm = m[i]; \
360 d[i] = OP(nn, mm); \
365 #define DO_AND(N, M) (N & M)
366 #define DO_EOR(N, M) (N ^ M)
367 #define DO_ORR(N, M) (N | M)
368 #define DO_BIC(N, M) (N & ~M)
369 #define DO_ADD(N, M) (N + M)
370 #define DO_SUB(N, M) (N - M)
371 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
372 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
373 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
374 #define DO_MUL(N, M) (N * M)
378 * We must avoid the C undefined behaviour cases: division by
379 * zero and signed division of INT_MIN by -1. Both of these
380 * have architecturally defined required results for Arm.
381 * We special case all signed divisions by -1 to avoid having
382 * to deduce the minimum integer for the type involved.
384 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
385 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
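/*
 * For example, the architected result of any division by zero is 0, and
 * SDIV(INT_MIN, -1) must produce INT_MIN; the M == -1 special case
 * returns -N, which wraps back to INT_MIN without needing to know the
 * minimum value for the type.
 */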
387 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
388 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
389 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
390 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
392 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
393 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
394 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
395 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
397 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
398 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
399 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
400 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
402 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
403 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
404 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
405 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
407 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
408 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
409 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
410 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
412 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
413 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
414 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
415 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
417 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
418 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
419 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
420 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
422 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
423 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
424 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
425 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
427 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
428 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
429 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
430 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
432 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
433 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
434 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
435 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
437 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
438 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
439 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
440 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
442 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
443 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
444 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
445 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
447 /* Because the computation type is at least twice as large as required,
448 these work for both signed and unsigned source types. */
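/*
 * For example, with byte operands 0xff and 0x02: sve_smulh_zpzz_b passes
 * -1 and 2, so do_mulh_b returns (-2) >> 8 = 0xff, while sve_umulh_zpzz_b
 * passes 255 and 2 and gets 510 >> 8 = 0x01.
 */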
449 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
451 return (n * m) >> 8;
454 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
456 return (n * m) >> 16;
459 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
461 return (n * m) >> 32;
464 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
466 uint64_t lo, hi;
467 muls64(&lo, &hi, n, m);
468 return hi;
471 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
473 uint64_t lo, hi;
474 mulu64(&lo, &hi, n, m);
475 return hi;
478 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
479 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
480 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
481 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
483 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
484 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
485 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
486 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
488 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
489 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
490 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
491 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
493 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
494 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
496 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
497 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
499 /* Note that all bits of the shift are significant
500 and not modulo the element size. */
501 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
502 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
503 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
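/*
 * For example, for byte elements a shift count of 8 or more yields 0 for
 * LSR and LSL, while ASR clamps the count to 7 and so replicates the
 * sign bit, yielding 0 or -1.
 */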
505 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
506 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
507 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
509 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
510 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
511 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
513 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
514 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
515 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
517 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
518 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
519 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
521 static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
523 int8_t n1 = n, n2 = n >> 8;
524 return m + n1 + n2;
527 static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
529 int16_t n1 = n, n2 = n >> 16;
530 return m + n1 + n2;
533 static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
535 int32_t n1 = n, n2 = n >> 32;
536 return m + n1 + n2;
539 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
540 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
541 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
543 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
545 uint8_t n1 = n, n2 = n >> 8;
546 return m + n1 + n2;
549 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
551 uint16_t n1 = n, n2 = n >> 16;
552 return m + n1 + n2;
555 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
557 uint32_t n1 = n, n2 = n >> 32;
558 return m + n1 + n2;
561 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
562 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
563 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
565 #define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
566 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
567 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
568 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)
570 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
571 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
572 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
573 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
575 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
576 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
577 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
578 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)
580 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
581 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
582 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
583 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
586 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
587 * We pass in a pointer to a dummy saturation field to trigger
588 * the saturating arithmetic but discard the information about
589 * whether it has occurred.
591 #define do_sqshl_b(n, m) \
592 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
593 #define do_sqshl_h(n, m) \
594 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
595 #define do_sqshl_s(n, m) \
596 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
597 #define do_sqshl_d(n, m) \
598 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
600 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1, do_sqshl_b)
601 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
602 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
603 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
605 #define do_uqshl_b(n, m) \
606 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
607 #define do_uqshl_h(n, m) \
608 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
609 #define do_uqshl_s(n, m) \
610 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
611 #define do_uqshl_d(n, m) \
612 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
614 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1, do_uqshl_b)
615 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
616 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
617 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
619 #define do_sqrshl_b(n, m) \
620 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
621 #define do_sqrshl_h(n, m) \
622 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
623 #define do_sqrshl_s(n, m) \
624 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
625 #define do_sqrshl_d(n, m) \
626 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
628 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1, do_sqrshl_b)
629 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
630 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
631 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
633 #undef do_sqrshl_d
635 #define do_uqrshl_b(n, m) \
636 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
637 #define do_uqrshl_h(n, m) \
638 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
639 #define do_uqrshl_s(n, m) \
640 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
641 #define do_uqrshl_d(n, m) \
642 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
644 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1, do_uqrshl_b)
645 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
646 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
647 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
649 #undef do_uqrshl_d
651 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
652 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))
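/*
 * The 64-bit form cannot use a wider intermediate, so the halving add is
 * computed as (n >> 1) + (m >> 1), plus one when both discarded low bits
 * are set.
 */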
654 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
655 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
656 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
657 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
659 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
660 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
661 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
662 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
664 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
665 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))
667 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
668 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
669 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
670 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
672 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
673 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
674 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
675 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
677 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
678 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))
680 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
681 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
682 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
683 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
685 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
686 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
687 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
688 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
690 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
692 return val >= max ? max : val <= min ? min : val;
695 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
696 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
697 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
699 static inline int64_t do_sqadd_d(int64_t n, int64_t m)
701 int64_t r = n + m;
702 if (((r ^ n) & ~(n ^ m)) < 0) {
703 /* Signed overflow. */
704 return r < 0 ? INT64_MAX : INT64_MIN;
706 return r;
709 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
710 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
711 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
712 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
714 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
715 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
716 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
718 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
720 uint64_t r = n + m;
721 return r < n ? UINT64_MAX : r;
724 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
725 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
726 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
727 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
729 #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
730 #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
731 #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
733 static inline int64_t do_sqsub_d(int64_t n, int64_t m)
735 int64_t r = n - m;
736 if (((r ^ n) & (n ^ m)) < 0) {
737 /* Signed overflow. */
738 return r < 0 ? INT64_MAX : INT64_MIN;
740 return r;
743 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
744 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
745 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
746 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
748 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
749 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
750 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
752 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
754 return n > m ? n - m : 0;
757 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
758 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
759 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
760 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
762 #define DO_SUQADD_B(n, m) \
763 do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
764 #define DO_SUQADD_H(n, m) \
765 do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
766 #define DO_SUQADD_S(n, m) \
767 do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
769 static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
771 uint64_t r = n + m;
773 if (n < 0) {
774 /* Note that m - abs(n) cannot underflow. */
775 if (r > INT64_MAX) {
776 /* Result is either very large positive or negative. */
777 if (m > -n) {
778 /* m > abs(n), so r is a very large positive. */
779 return INT64_MAX;
781 /* Result is negative. */
783 } else {
784 /* Both inputs are positive: check for overflow. */
785 if (r < m || r > INT64_MAX) {
786 return INT64_MAX;
789 return r;
792 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
793 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
794 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
795 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
797 #define DO_USQADD_B(n, m) \
798 do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
799 #define DO_USQADD_H(n, m) \
800 do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
801 #define DO_USQADD_S(n, m) \
802 do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
804 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
806 uint64_t r = n + m;
808 if (m < 0) {
809 return n < -m ? 0 : r;
811 return r < n ? UINT64_MAX : r;
814 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
815 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
816 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
817 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
819 #undef DO_ZPZZ
820 #undef DO_ZPZZ_D
823 * Three operand expander, operating on element pairs.
824 * If the slot I is even, the elements are from VN {I, I+1}.
825 * If the slot I is odd, the elements are from VM {I-1, I}.
826 * Load all of the input elements in each pair before overwriting output.
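/*
 * For example, with byte elements and all elements active:
 * d[0] = OP(n[0], n[1]), d[1] = OP(m[0], m[1]),
 * d[2] = OP(n[2], n[3]), d[3] = OP(m[2], m[3]), and so on.
 */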
828 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
829 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
831 intptr_t i, opr_sz = simd_oprsz(desc); \
832 for (i = 0; i < opr_sz; ) { \
833 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
834 do { \
835 TYPE n0 = *(TYPE *)(vn + H(i)); \
836 TYPE m0 = *(TYPE *)(vm + H(i)); \
837 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
838 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
839 if (pg & 1) { \
840 *(TYPE *)(vd + H(i)) = OP(n0, n1); \
842 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
843 if (pg & 1) { \
844 *(TYPE *)(vd + H(i)) = OP(m0, m1); \
846 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
847 } while (i & 15); \
851 /* Similarly, specialized for 64-bit operands. */
852 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
853 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
855 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
856 TYPE *d = vd, *n = vn, *m = vm; \
857 uint8_t *pg = vg; \
858 for (i = 0; i < opr_sz; i += 2) { \
859 TYPE n0 = n[i], n1 = n[i + 1]; \
860 TYPE m0 = m[i], m1 = m[i + 1]; \
861 if (pg[H1(i)] & 1) { \
862 d[i] = OP(n0, n1); \
864 if (pg[H1(i + 1)] & 1) { \
865 d[i + 1] = OP(m0, m1); \
870 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
871 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
872 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
873 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
875 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
876 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
877 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
878 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
880 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
881 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
882 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
883 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
885 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
886 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
887 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
888 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
890 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
891 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
892 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
893 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
895 #undef DO_ZPZZ_PAIR
896 #undef DO_ZPZZ_PAIR_D
898 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
899 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
900 void *status, uint32_t desc) \
902 intptr_t i, opr_sz = simd_oprsz(desc); \
903 for (i = 0; i < opr_sz; ) { \
904 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
905 do { \
906 TYPE n0 = *(TYPE *)(vn + H(i)); \
907 TYPE m0 = *(TYPE *)(vm + H(i)); \
908 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
909 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
910 if (pg & 1) { \
911 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
913 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
914 if (pg & 1) { \
915 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
917 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
918 } while (i & 15); \
922 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
923 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
924 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, , float64_add)
926 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
927 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
928 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, , float64_maxnum)
930 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
931 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
932 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, , float64_minnum)
934 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
935 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
936 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, , float64_max)
938 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
939 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
940 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, , float64_min)
942 #undef DO_ZPZZ_PAIR_FP
944 /* Three-operand expander, controlled by a predicate, in which the
945 * third operand is "wide". That is, for D = N op M, the same 64-bit
946 * value of M is used with all of the narrower values of N.
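/*
 * For example, in sve_lsr_zpzw_b all eight byte elements within a given
 * 64-bit column of N are shifted by the same 64-bit count taken from
 * that column of M.
 */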
948 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
949 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
951 intptr_t i, opr_sz = simd_oprsz(desc); \
952 for (i = 0; i < opr_sz; ) { \
953 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
954 TYPEW mm = *(TYPEW *)(vm + i); \
955 do { \
956 if (pg & 1) { \
957 TYPE nn = *(TYPE *)(vn + H(i)); \
958 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
960 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
961 } while (i & 7); \
965 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
966 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
967 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
969 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
970 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
971 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
973 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
974 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
975 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
977 #undef DO_ZPZW
979 /* Fully general two-operand expander, controlled by a predicate.
981 #define DO_ZPZ(NAME, TYPE, H, OP) \
982 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
984 intptr_t i, opr_sz = simd_oprsz(desc); \
985 for (i = 0; i < opr_sz; ) { \
986 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
987 do { \
988 if (pg & 1) { \
989 TYPE nn = *(TYPE *)(vn + H(i)); \
990 *(TYPE *)(vd + H(i)) = OP(nn); \
992 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
993 } while (i & 15); \
997 /* Similarly, specialized for 64-bit operands. */
998 #define DO_ZPZ_D(NAME, TYPE, OP) \
999 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1001 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1002 TYPE *d = vd, *n = vn; \
1003 uint8_t *pg = vg; \
1004 for (i = 0; i < opr_sz; i += 1) { \
1005 if (pg[H1(i)] & 1) { \
1006 TYPE nn = n[i]; \
1007 d[i] = OP(nn); \
1012 #define DO_CLS_B(N) (clrsb32(N) - 24)
1013 #define DO_CLS_H(N) (clrsb32(N) - 16)
1015 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
1016 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
1017 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
1018 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
1020 #define DO_CLZ_B(N) (clz32(N) - 24)
1021 #define DO_CLZ_H(N) (clz32(N) - 16)
1023 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
1024 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
1025 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
1026 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
1028 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
1029 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
1030 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
1031 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
1033 #define DO_CNOT(N) (N == 0)
1035 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
1036 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
1037 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
1038 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
1040 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
1042 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
1043 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
1044 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
1046 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
1048 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
1049 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
1050 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
1052 #define DO_NOT(N) (~N)
1054 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
1055 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
1056 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
1057 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
1059 #define DO_SXTB(N) ((int8_t)N)
1060 #define DO_SXTH(N) ((int16_t)N)
1061 #define DO_SXTS(N) ((int32_t)N)
1062 #define DO_UXTB(N) ((uint8_t)N)
1063 #define DO_UXTH(N) ((uint16_t)N)
1064 #define DO_UXTS(N) ((uint32_t)N)
1066 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
1067 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
1068 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
1069 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
1070 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
1071 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
1073 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
1074 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
1075 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
1076 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
1077 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
1078 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
1080 #define DO_ABS(N) (N < 0 ? -N : N)
1082 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
1083 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
1084 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
1085 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
1087 #define DO_NEG(N) (-N)
1089 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
1090 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
1091 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
1092 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
1094 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
1095 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
1096 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
1098 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
1099 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
1101 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
1103 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
1104 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
1105 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
1106 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
1108 #define DO_SQABS(X) \
1109 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
1110 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
1112 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
1113 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
1114 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
1115 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
1117 #define DO_SQNEG(X) \
1118 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
1119 x_ == min_ ? -min_ - 1 : -x_; })
1121 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
1122 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
1123 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
1124 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
1126 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
1127 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
1129 /* Three-operand expander, unpredicated, in which the third operand is "wide".
1131 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
1132 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1134 intptr_t i, opr_sz = simd_oprsz(desc); \
1135 for (i = 0; i < opr_sz; ) { \
1136 TYPEW mm = *(TYPEW *)(vm + i); \
1137 do { \
1138 TYPE nn = *(TYPE *)(vn + H(i)); \
1139 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
1140 i += sizeof(TYPE); \
1141 } while (i & 7); \
1145 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
1146 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
1147 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
1149 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
1150 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
1151 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1153 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1154 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1155 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1157 #undef DO_ZZW
1159 #undef DO_CLS_B
1160 #undef DO_CLS_H
1161 #undef DO_CLZ_B
1162 #undef DO_CLZ_H
1163 #undef DO_CNOT
1164 #undef DO_FABS
1165 #undef DO_FNEG
1166 #undef DO_ABS
1167 #undef DO_NEG
1168 #undef DO_ZPZ
1169 #undef DO_ZPZ_D
1172 * Three-operand expander, unpredicated, in which the two inputs are
1173 * selected from the top or bottom half of the wide column.
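/*
 * For example, sve2_saddl_h with sel1 = sel2 = 0 (the 'B' form) widens
 * and adds the even-numbered byte elements of the two inputs, while
 * sel1 = sel2 = 1 (the 'T' form) uses the odd-numbered elements.
 */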
1175 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1176 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1178 intptr_t i, opr_sz = simd_oprsz(desc); \
1179 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1180 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1181 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1182 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1183 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1184 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1188 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1189 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1190 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, , H1_4, DO_ADD)
1192 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1193 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1194 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, , H1_4, DO_SUB)
1196 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1197 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1198 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, , H1_4, DO_ABD)
1200 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1201 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1202 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, , H1_4, DO_ADD)
1204 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1205 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1206 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, , H1_4, DO_SUB)
1208 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1209 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1210 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, , H1_4, DO_ABD)
1212 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1213 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1214 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, , H1_4, DO_MUL)
1216 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1217 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1218 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, , H1_4, DO_MUL)
1220 /* Note that the multiply cannot overflow, but the doubling can. */
1221 static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1223 int16_t val = n * m;
1224 return DO_SQADD_H(val, val);
1227 static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1229 int32_t val = n * m;
1230 return DO_SQADD_S(val, val);
1233 static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1235 int64_t val = n * m;
1236 return do_sqadd_d(val, val);
1239 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1240 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1241 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, , H1_4, do_sqdmull_d)
1243 #undef DO_ZZZ_TB
1245 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1246 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1248 intptr_t i, opr_sz = simd_oprsz(desc); \
1249 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1250 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1251 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
1252 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1253 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1257 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1258 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1259 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, , H1_4, DO_ADD)
1261 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1262 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1263 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, , H1_4, DO_SUB)
1265 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1266 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1267 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, , H1_4, DO_ADD)
1269 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1270 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1271 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, , H1_4, DO_SUB)
1273 #undef DO_ZZZ_WTB
1275 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
1276 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1278 intptr_t i, opr_sz = simd_oprsz(desc); \
1279 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1280 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1281 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1282 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
1283 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
1284 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
1288 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1289 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1290 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1291 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, , DO_EOR)
1293 #undef DO_ZZZ_NTB
1295 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1296 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1298 intptr_t i, opr_sz = simd_oprsz(desc); \
1299 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
1300 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1301 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1302 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
1303 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1304 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
1308 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1309 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1310 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, , H1_4, DO_ABD)
1312 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1313 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1314 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, , H1_4, DO_ABD)
1316 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1317 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1318 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, , H1_4, DO_MUL)
1320 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1321 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1322 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, , H1_4, DO_MUL)
1324 #define DO_NMUL(N, M) -(N * M)
1326 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1327 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1328 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, , H1_4, DO_NMUL)
1330 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1331 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1332 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, , H1_4, DO_NMUL)
1334 #undef DO_ZZZW_ACC
1336 #define DO_XTNB(NAME, TYPE, OP) \
1337 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1339 intptr_t i, opr_sz = simd_oprsz(desc); \
1340 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1341 TYPE nn = *(TYPE *)(vn + i); \
1342 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
1343 *(TYPE *)(vd + i) = nn; \
1347 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
1348 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1350 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
1351 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1352 TYPE nn = *(TYPE *)(vn + i); \
1353 *(TYPEN *)(vd + i + odd) = OP(nn); \
1357 #define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
1358 #define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
1359 #define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)
1361 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1362 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1363 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1365 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1366 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1367 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1369 #define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
1370 #define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
1371 #define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)
1373 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1374 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1375 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1377 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1378 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1379 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1381 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1382 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1383 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1385 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1386 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1387 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1389 #undef DO_XTNB
1390 #undef DO_XTNT
1392 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1394 intptr_t i, opr_sz = simd_oprsz(desc);
1395 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1396 uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1397 uint32_t *a = va, *n = vn;
1398 uint64_t *d = vd, *m = vm;
1400 for (i = 0; i < opr_sz / 8; ++i) {
1401 uint32_t e1 = a[2 * i + H4(0)];
1402 uint32_t e2 = n[2 * i + sel] ^ inv;
1403 uint64_t c = extract64(m[i], 32, 1);
1404 /* Compute and store the entire 33-bit result at once. */
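/*
 * Bits [31:0] of each 64-bit lane receive the 32-bit sum and bit 32
 * receives the carry out; the carry in is taken from the same bit
 * position of the M operand (the extract64 above).
 */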
1405 d[i] = c + e1 + e2;
1409 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1411 intptr_t i, opr_sz = simd_oprsz(desc);
1412 int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1413 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1414 uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1416 for (i = 0; i < opr_sz / 8; i += 2) {
1417 Int128 e1 = int128_make64(a[i]);
1418 Int128 e2 = int128_make64(n[i + sel] ^ inv);
1419 Int128 c = int128_make64(m[i + 1] & 1);
1420 Int128 r = int128_add(int128_add(e1, e2), c);
1421 d[i + 0] = int128_getlo(r);
1422 d[i + 1] = int128_gethi(r);
1426 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1427 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1429 intptr_t i, opr_sz = simd_oprsz(desc); \
1430 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1431 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1432 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1433 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1434 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1435 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1436 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
1440 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1441 do_sqdmull_h, DO_SQADD_H)
1442 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1443 do_sqdmull_s, DO_SQADD_S)
1444 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, , H1_4,
1445 do_sqdmull_d, do_sqadd_d)
1447 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1448 do_sqdmull_h, DO_SQSUB_H)
1449 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1450 do_sqdmull_s, DO_SQSUB_S)
1451 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, , H1_4,
1452 do_sqdmull_d, do_sqsub_d)
1454 #undef DO_SQDMLAL
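/*
 * For the complex multiply-add expanders below, the two-bit rotation
 * decodes via sel_a/sub_r/sub_i to (shown for the plain CMLA case):
 *   rot 0:  d_r = a_r + n_r * m_r,  d_i = a_i + n_r * m_i
 *   rot 1:  d_r = a_r - n_i * m_i,  d_i = a_i + n_i * m_r
 *   rot 2:  d_r = a_r - n_r * m_r,  d_i = a_i - n_r * m_i
 *   rot 3:  d_r = a_r + n_i * m_i,  d_i = a_i - n_i * m_r
 * The sqrdcmlah forms apply the same element selection and negation
 * via do_sqrdmlah_*.
 */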
1456 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1457 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1459 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1460 int rot = simd_data(desc); \
1461 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1462 bool sub_r = rot == 1 || rot == 2; \
1463 bool sub_i = rot >= 2; \
1464 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1465 for (i = 0; i < opr_sz; i += 2) { \
1466 TYPE elt1_a = n[H(i + sel_a)]; \
1467 TYPE elt2_a = m[H(i + sel_a)]; \
1468 TYPE elt2_b = m[H(i + sel_b)]; \
1469 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \
1470 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \
1474 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
1476 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1477 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1478 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1479 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, , DO_CMLA)
1481 #define DO_SQRDMLAH_B(N, M, A, S) \
1482 do_sqrdmlah_b(N, M, A, S, true)
1483 #define DO_SQRDMLAH_H(N, M, A, S) \
1484 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1485 #define DO_SQRDMLAH_S(N, M, A, S) \
1486 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1487 #define DO_SQRDMLAH_D(N, M, A, S) \
1488 do_sqrdmlah_d(N, M, A, S, true)
1490 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1491 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1492 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1493 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, , DO_SQRDMLAH_D)
1495 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1496 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1498 intptr_t i, j, oprsz = simd_oprsz(desc); \
1499 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \
1500 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \
1501 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1502 bool sub_r = rot == 1 || rot == 2; \
1503 bool sub_i = rot >= 2; \
1504 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1505 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \
1506 TYPE elt2_a = m[H(i + idx + sel_a)]; \
1507 TYPE elt2_b = m[H(i + idx + sel_b)]; \
1508 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \
1509 TYPE elt1_a = n[H(i + j + sel_a)]; \
1510 d[H(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \
1511 d[H(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \
1516 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1517 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1519 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1520 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1522 #undef DO_CMLA
1523 #undef DO_CMLA_FUNC
1524 #undef DO_CMLA_IDX_FUNC
1525 #undef DO_SQRDMLAH_B
1526 #undef DO_SQRDMLAH_H
1527 #undef DO_SQRDMLAH_S
1528 #undef DO_SQRDMLAH_D
1530 /* Note N and M are 4 elements bundled into one unit. */
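/*
 * Each unit holds two {real, imag} pairs.  For each pair the dot product
 * accumulates elt1_r * elt2_a + elt1_i * elt2_b * sub_i, so the rotation
 * chooses which half of M supplies each product (sel_a/sel_b) and whether
 * the imaginary product is subtracted (sub_i == -1 for rot values 0 and 3,
 * i.e. rotations of 0 and 270 degrees).
 */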
1531 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1532 int sel_a, int sel_b, int sub_i)
1534 for (int i = 0; i <= 1; i++) {
1535 int32_t elt1_r = (int8_t)(n >> (16 * i));
1536 int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1537 int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1538 int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1540 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1542 return a;
1545 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1546 int sel_a, int sel_b, int sub_i)
1548 for (int i = 0; i <= 1; i++) {
1549 int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1550 int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1551 int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1552 int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1554 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1556 return a;
1559 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1560 void *va, uint32_t desc)
1562 int opr_sz = simd_oprsz(desc);
1563 int rot = simd_data(desc);
1564 int sel_a = rot & 1;
1565 int sel_b = sel_a ^ 1;
1566 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1567 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1569 for (int e = 0; e < opr_sz / 4; e++) {
1570 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1574 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1575 void *va, uint32_t desc)
1577 int opr_sz = simd_oprsz(desc);
1578 int rot = simd_data(desc);
1579 int sel_a = rot & 1;
1580 int sel_b = sel_a ^ 1;
1581 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1582 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1584 for (int e = 0; e < opr_sz / 8; e++) {
1585 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1589 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1590 void *va, uint32_t desc)
1592 int opr_sz = simd_oprsz(desc);
1593 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1594 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1595 int sel_a = rot & 1;
1596 int sel_b = sel_a ^ 1;
1597 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1598 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1600 for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1601 uint32_t seg_m = m[seg + idx];
1602 for (int e = 0; e < 4; e++) {
1603 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1604 sel_a, sel_b, sub_i);
1609 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1610 void *va, uint32_t desc)
1612 int seg, opr_sz = simd_oprsz(desc);
1613 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1614 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1615 int sel_a = rot & 1;
1616 int sel_b = sel_a ^ 1;
1617 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1618 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1620 for (seg = 0; seg < opr_sz / 8; seg += 2) {
1621 uint64_t seg_m = m[seg + idx];
1622 for (int e = 0; e < 2; e++) {
1623 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1624 sel_a, sel_b, sub_i);
1629 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1630 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1632 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
1633 intptr_t i, j, idx = simd_data(desc); \
1634 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \
1635 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1636 TYPE mm = m[i]; \
1637 for (j = 0; j < segment; j++) { \
1638 d[i + j] = OP(n[i + j], mm, a[i + j]); \
1643 #define DO_SQRDMLAH_H(N, M, A) \
1644 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1645 #define DO_SQRDMLAH_S(N, M, A) \
1646 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1647 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1649 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1650 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1651 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, , DO_SQRDMLAH_D)
1653 #define DO_SQRDMLSH_H(N, M, A) \
1654 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1655 #define DO_SQRDMLSH_S(N, M, A) \
1656 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1657 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1659 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1660 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1661 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, , DO_SQRDMLSH_D)
1663 #undef DO_ZZXZ
1665 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1666 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1668 intptr_t i, j, oprsz = simd_oprsz(desc); \
1669 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1670 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1671 for (i = 0; i < oprsz; i += 16) { \
1672 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1673 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1674 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1675 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \
1676 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \
1681 #define DO_MLA(N, M, A) (A + N * M)
1683 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1684 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, , H1_4, DO_MLA)
1685 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1686 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, , H1_4, DO_MLA)
1688 #define DO_MLS(N, M, A) (A - N * M)
1690 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1691 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, , H1_4, DO_MLS)
1692 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1693 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, , H1_4, DO_MLS)
1695 #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M))
1696 #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M))
1698 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1699 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, , H1_4, DO_SQDMLAL_D)
1701 #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M))
1702 #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M))
1704 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1705 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, , H1_4, DO_SQDMLSL_D)
1707 #undef DO_MLA
1708 #undef DO_MLS
1709 #undef DO_ZZXW
1711 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1712 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1714 intptr_t i, j, oprsz = simd_oprsz(desc); \
1715 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1716 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1717 for (i = 0; i < oprsz; i += 16) { \
1718 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1719 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1720 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1721 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \
1726 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1727 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, , H1_4, do_sqdmull_d)
1729 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1730 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, , H1_4, DO_MUL)
1732 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1733 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, , H1_4, DO_MUL)
1735 #undef DO_ZZX
1737 #define DO_BITPERM(NAME, TYPE, OP) \
1738 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1740 intptr_t i, opr_sz = simd_oprsz(desc); \
1741 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1742 TYPE nn = *(TYPE *)(vn + i); \
1743 TYPE mm = *(TYPE *)(vm + i); \
1744 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
1748 static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1750 uint64_t res = 0;
1751 int db, rb = 0;
1753 for (db = 0; db < n; ++db) {
1754 if ((mask >> db) & 1) {
1755 res |= ((data >> db) & 1) << rb;
1756 ++rb;
1759 return res;
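/* Worked example (illustrative): bitextract(0xb5, 0x33, 8) gathers the
 * data bits under mask bits 0, 1, 4, 5 -- values 1, 0, 1, 1 -- and packs
 * them at the bottom, giving 0xd.
 */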
1762 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1763 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1764 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1765 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1767 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1769 uint64_t res = 0;
1770 int rb, db = 0;
1772 for (rb = 0; rb < n; ++rb) {
1773 if ((mask >> rb) & 1) {
1774 res |= ((data >> db) & 1) << rb;
1775 ++db;
1778 return res;
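/* Worked example (illustrative): bitdeposit(0xd, 0x33, 8) scatters the
 * low bits of 0xd -- 1, 0, 1, 1 -- into mask positions 0, 1, 4, 5,
 * giving 0x31; for the bits under the mask this inverts bitextract.
 */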
1781 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1782 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1783 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1784 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1786 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1788 uint64_t resm = 0, resu = 0;
1789 int db, rbm = 0, rbu = 0;
1791 for (db = 0; db < n; ++db) {
1792 uint64_t val = (data >> db) & 1;
1793 if ((mask >> db) & 1) {
1794 resm |= val << rbm++;
1795 } else {
1796 resu |= val << rbu++;
1800 return resm | (resu << rbm);
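/* Worked example (illustrative): bitgroup(0xb5, 0x33, 8) packs the masked
 * bits (0xd) at the bottom and the unmasked bits (0x9) above them,
 * giving 0x9d.
 */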
1803 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1804 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1805 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1806 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1808 #undef DO_BITPERM
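/* Complex integer add: when sub_r is set, each (real, imag) pair from the
 * second operand is effectively rotated by 270 degrees before the add
 * (acc_r += el2_i, acc_i -= el2_r); when clear, the rotation is 90
 * degrees (acc_r -= el2_i, acc_i += el2_r).
 */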
1810 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \
1811 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1813 intptr_t i, opr_sz = simd_oprsz(desc); \
1814 int sub_r = simd_data(desc); \
1815 if (sub_r) { \
1816 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1817 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1818 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1819 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1820 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1821 acc_r = ADD_OP(acc_r, el2_i); \
1822 acc_i = SUB_OP(acc_i, el2_r); \
1823 *(TYPE *)(vd + H(i)) = acc_r; \
1824 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1826 } else { \
1827 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1828 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1829 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1830 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1831 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1832 acc_r = SUB_OP(acc_r, el2_i); \
1833 acc_i = ADD_OP(acc_i, el2_r); \
1834 *(TYPE *)(vd + H(i)) = acc_r; \
1835 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1840 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1841 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1842 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1843 DO_CADD(sve2_cadd_d, int64_t, , DO_ADD, DO_SUB)
1845 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1846 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1847 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1848 DO_CADD(sve2_sqcadd_d, int64_t, , do_sqadd_d, do_sqsub_d)
1850 #undef DO_CADD
1852 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1853 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1855 intptr_t i, opr_sz = simd_oprsz(desc); \
1856 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
1857 int shift = simd_data(desc) >> 1; \
1858 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1859 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
1860 *(TYPEW *)(vd + HW(i)) = nn << shift; \
1864 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1865 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1866 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, , H1_4)
1868 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1869 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1870 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, , H1_4)
1872 #undef DO_ZZI_SHLL
1874 /* Two-operand reduction expander, controlled by a predicate.
1875 * The difference between TYPERED and TYPERET has to do with
1876 * sign-extension. E.g. for SMAX, TYPERED must be signed,
1877 * but TYPERET must be unsigned so that e.g. a 32-bit value
1878 * is not sign-extended to the ABI uint64_t return type.
1880 /* ??? If we were to vectorize this by hand the reduction ordering
1881 * would change. For integer operands, this is perfectly fine.
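/* E.g. sve_smaxv_b with an all-false predicate leaves the accumulator at
 * INT8_MIN; returning it through uint8_t gives 0x80 rather than the
 * sign-extended 0xffffffffffffff80.
 */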
1883 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1884 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1886 intptr_t i, opr_sz = simd_oprsz(desc); \
1887 TYPERED ret = INIT; \
1888 for (i = 0; i < opr_sz; ) { \
1889 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1890 do { \
1891 if (pg & 1) { \
1892 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
1893 ret = OP(ret, nn); \
1895 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
1896 } while (i & 15); \
1898 return (TYPERET)ret; \
1901 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
1902 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1904 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1905 TYPEE *n = vn; \
1906 uint8_t *pg = vg; \
1907 TYPER ret = INIT; \
1908 for (i = 0; i < opr_sz; i += 1) { \
1909 if (pg[H1(i)] & 1) { \
1910 TYPEE nn = n[i]; \
1911 ret = OP(ret, nn); \
1914 return ret; \
1917 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1918 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1919 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1920 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1922 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1923 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1924 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1925 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1927 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1928 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1929 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1930 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1932 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1933 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1934 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1936 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1937 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1938 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1939 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1941 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1942 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1943 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1944 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1946 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1947 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1948 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1949 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1951 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1952 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1953 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1954 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1956 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1957 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1958 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1959 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1961 #undef DO_VPZ
1962 #undef DO_VPZ_D
1964 /* Two vector operand, one scalar operand, unpredicated. */
1965 #define DO_ZZI(NAME, TYPE, OP) \
1966 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
1968 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1969 TYPE s = s64, *d = vd, *n = vn; \
1970 for (i = 0; i < opr_sz; ++i) { \
1971 d[i] = OP(n[i], s); \
1975 #define DO_SUBR(X, Y) (Y - X)
1977 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1978 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1979 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1980 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1982 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1983 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1984 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1985 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1987 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1988 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1989 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1990 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1992 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1993 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1994 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1995 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1997 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1998 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1999 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
2000 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
2002 #undef DO_ZZI
2004 #undef DO_AND
2005 #undef DO_ORR
2006 #undef DO_EOR
2007 #undef DO_BIC
2008 #undef DO_ADD
2009 #undef DO_SUB
2010 #undef DO_MAX
2011 #undef DO_MIN
2012 #undef DO_ABD
2013 #undef DO_MUL
2014 #undef DO_DIV
2015 #undef DO_ASR
2016 #undef DO_LSR
2017 #undef DO_LSL
2018 #undef DO_SUBR
2020 /* Similar to the ARM LastActiveElement pseudocode function, except the
2021 result is multiplied by the element size. This includes the not found
2022 indication; e.g. not found for esz=3 is -8. */
2023 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
2025 uint64_t mask = pred_esz_masks[esz];
2026 intptr_t i = words;
2028 do {
2029 uint64_t this_g = g[--i] & mask;
2030 if (this_g) {
2031 return i * 64 + (63 - clz64(this_g));
2033 } while (i > 0);
2034 return (intptr_t)-1 << esz;
2037 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
2039 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
2040 uint32_t flags = PREDTEST_INIT;
2041 uint64_t *d = vd, *g = vg;
2042 intptr_t i = 0;
2044 do {
2045 uint64_t this_d = d[i];
2046 uint64_t this_g = g[i];
2048 if (this_g) {
2049 if (!(flags & 4)) {
2050 /* Set in D the first bit of G. */
2051 this_d |= this_g & -this_g;
2052 d[i] = this_d;
2054 flags = iter_predtest_fwd(this_d, this_g, flags);
2056 } while (++i < words);
2058 return flags;
2061 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
2063 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
2064 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2065 uint32_t flags = PREDTEST_INIT;
2066 uint64_t *d = vd, *g = vg, esz_mask;
2067 intptr_t i, next;
2069 next = last_active_element(vd, words, esz) + (1 << esz);
2070 esz_mask = pred_esz_masks[esz];
2072 /* Similar to the pseudocode for pnext, but scaled by ESZ
2073 so that we find the correct bit. */
2074 if (next < words * 64) {
2075 uint64_t mask = -1;
2077 if (next & 63) {
2078 mask = ~((1ull << (next & 63)) - 1);
2079 next &= -64;
2081 do {
2082 uint64_t this_g = g[next / 64] & esz_mask & mask;
2083 if (this_g != 0) {
2084 next = (next & -64) + ctz64(this_g);
2085 break;
2087 next += 64;
2088 mask = -1;
2089 } while (next < words * 64);
2092 i = 0;
2093 do {
2094 uint64_t this_d = 0;
2095 if (i == next / 64) {
2096 this_d = 1ull << (next & 63);
2098 d[i] = this_d;
2099 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
2100 } while (++i < words);
2102 return flags;
2106 * Copy Zn into Zd, and store zero into inactive elements.
2107 * If inv, store zeros into the active elements.
2109 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
2111 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2112 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2113 uint64_t *d = vd, *n = vn;
2114 uint8_t *pg = vg;
2116 for (i = 0; i < opr_sz; i += 1) {
2117 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
2121 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
2123 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2124 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2125 uint64_t *d = vd, *n = vn;
2126 uint8_t *pg = vg;
2128 for (i = 0; i < opr_sz; i += 1) {
2129 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
2133 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
2135 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2136 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2137 uint64_t *d = vd, *n = vn;
2138 uint8_t *pg = vg;
2140 for (i = 0; i < opr_sz; i += 1) {
2141 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
2145 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
2147 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2148 uint64_t *d = vd, *n = vn;
2149 uint8_t *pg = vg;
2150 uint8_t inv = simd_data(desc);
2152 for (i = 0; i < opr_sz; i += 1) {
2153 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2157 /* Three-operand expander, immediate operand, controlled by a predicate.
2159 #define DO_ZPZI(NAME, TYPE, H, OP) \
2160 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2162 intptr_t i, opr_sz = simd_oprsz(desc); \
2163 TYPE imm = simd_data(desc); \
2164 for (i = 0; i < opr_sz; ) { \
2165 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2166 do { \
2167 if (pg & 1) { \
2168 TYPE nn = *(TYPE *)(vn + H(i)); \
2169 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
2171 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2172 } while (i & 15); \
2176 /* Similarly, specialized for 64-bit operands. */
2177 #define DO_ZPZI_D(NAME, TYPE, OP) \
2178 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2180 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2181 TYPE *d = vd, *n = vn; \
2182 TYPE imm = simd_data(desc); \
2183 uint8_t *pg = vg; \
2184 for (i = 0; i < opr_sz; i += 1) { \
2185 if (pg[H1(i)] & 1) { \
2186 TYPE nn = n[i]; \
2187 d[i] = OP(nn, imm); \
2192 #define DO_SHR(N, M) (N >> M)
2193 #define DO_SHL(N, M) (N << M)
2195 /* Arithmetic shift right for division. This rounds negative numbers
2196 toward zero as per signed division. Therefore before shifting,
2197 when N is negative, add 2**M-1. */
2198 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
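/* E.g. for N = -5, M = 2: (-5 + 3) >> 2 = -1, matching -5 / 4 truncated
 * toward zero, whereas a plain arithmetic shift would give -2.
 */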
2200 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
2202 if (likely(sh < 64)) {
2203 return (x >> sh) + ((x >> (sh - 1)) & 1);
2204 } else if (sh == 64) {
2205 return x >> 63;
2206 } else {
2207 return 0;
2211 static inline int64_t do_srshr(int64_t x, unsigned sh)
2213 if (likely(sh < 64)) {
2214 return (x >> sh) + ((x >> (sh - 1)) & 1);
2215 } else {
2216 /* Rounding the sign bit always produces 0. */
2217 return 0;
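/* Both helpers are equivalent to adding 2**(sh-1) before the shift, e.g.
 * do_urshr(0x18, 4) = (0x18 >> 4) + ((0x18 >> 3) & 1) = 2, i.e. 24/16
 * rounded to nearest with the half-way case rounded up.
 */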
2221 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2222 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2223 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2224 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2226 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2227 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2228 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2229 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2231 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2232 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2233 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2234 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2236 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2237 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2238 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2239 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2241 #undef DO_ASRD
2242 #undef DO_ZPZI
2243 #undef DO_ZPZI_D
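/* Narrowing expanders: DO_SHRNB writes the narrowed result to the even
 * (bottom) part of each wide element, zeroing the odd part; DO_SHRNT
 * writes only the odd (top) part and leaves the bottom part of the
 * destination unchanged.
 */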
2245 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2246 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2248 intptr_t i, opr_sz = simd_oprsz(desc); \
2249 int shift = simd_data(desc); \
2250 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2251 TYPEW nn = *(TYPEW *)(vn + i); \
2252 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \
2256 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
2257 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2259 intptr_t i, opr_sz = simd_oprsz(desc); \
2260 int shift = simd_data(desc); \
2261 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2262 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2263 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \
2267 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2268 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2269 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2271 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2272 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2273 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, , H1_4, DO_SHR)
2275 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2276 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2277 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2279 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2280 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2281 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, , H1_4, do_urshr)
2283 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2284 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2285 #define DO_SQSHRUN_D(x, sh) \
2286 do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2288 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2289 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2290 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2292 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2293 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2294 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, , H1_4, DO_SQSHRUN_D)
2296 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2297 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2298 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2300 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2301 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2302 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2304 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2305 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2306 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, , H1_4, DO_SQRSHRUN_D)
2308 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2309 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2310 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2312 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2313 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2314 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2316 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2317 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2318 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, , H1_4, DO_SQSHRN_D)
2320 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2321 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2322 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2324 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2325 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2326 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2328 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2329 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2330 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, , H1_4, DO_SQRSHRN_D)
2332 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2333 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2334 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2336 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2337 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2338 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2340 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2341 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2342 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, , H1_4, DO_UQSHRN_D)
2344 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2345 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2346 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2348 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2349 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2350 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2352 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2353 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2354 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, , H1_4, DO_UQRSHRN_D)
2356 #undef DO_SHRNB
2357 #undef DO_SHRNT
2359 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \
2360 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2362 intptr_t i, opr_sz = simd_oprsz(desc); \
2363 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2364 TYPEW nn = *(TYPEW *)(vn + i); \
2365 TYPEW mm = *(TYPEW *)(vm + i); \
2366 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \
2370 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \
2371 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2373 intptr_t i, opr_sz = simd_oprsz(desc); \
2374 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2375 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2376 TYPEW mm = *(TYPEW *)(vm + HW(i)); \
2377 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \
2381 #define DO_ADDHN(N, M, SH) ((N + M) >> SH)
2382 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2383 #define DO_SUBHN(N, M, SH) ((N - M) >> SH)
2384 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
2386 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2387 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2388 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2390 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2391 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2392 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, , H1_4, DO_ADDHN)
2394 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2395 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2396 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2398 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2399 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2400 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, , H1_4, DO_RADDHN)
2402 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2403 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2404 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2406 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2407 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2408 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, , H1_4, DO_SUBHN)
2410 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2411 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2412 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2414 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2415 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2416 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, , H1_4, DO_RSUBHN)
2418 #undef DO_RSUBHN
2419 #undef DO_SUBHN
2420 #undef DO_RADDHN
2421 #undef DO_ADDHN
2423 #undef DO_BINOPNB
2425 /* Fully general four-operand expander, controlled by a predicate.
2427 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
2428 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2429 void *vg, uint32_t desc) \
2431 intptr_t i, opr_sz = simd_oprsz(desc); \
2432 for (i = 0; i < opr_sz; ) { \
2433 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2434 do { \
2435 if (pg & 1) { \
2436 TYPE nn = *(TYPE *)(vn + H(i)); \
2437 TYPE mm = *(TYPE *)(vm + H(i)); \
2438 TYPE aa = *(TYPE *)(va + H(i)); \
2439 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
2441 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2442 } while (i & 15); \
2446 /* Similarly, specialized for 64-bit operands. */
2447 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
2448 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2449 void *vg, uint32_t desc) \
2451 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2452 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
2453 uint8_t *pg = vg; \
2454 for (i = 0; i < opr_sz; i += 1) { \
2455 if (pg[H1(i)] & 1) { \
2456 TYPE aa = a[i], nn = n[i], mm = m[i]; \
2457 d[i] = OP(aa, nn, mm); \
2462 #define DO_MLA(A, N, M) (A + N * M)
2463 #define DO_MLS(A, N, M) (A - N * M)
2465 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2466 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2468 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2469 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2471 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2472 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2474 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2475 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2477 #undef DO_MLA
2478 #undef DO_MLS
2479 #undef DO_ZPZZZ
2480 #undef DO_ZPZZZ_D
2482 void HELPER(sve_index_b)(void *vd, uint32_t start,
2483 uint32_t incr, uint32_t desc)
2485 intptr_t i, opr_sz = simd_oprsz(desc);
2486 uint8_t *d = vd;
2487 for (i = 0; i < opr_sz; i += 1) {
2488 d[H1(i)] = start + i * incr;
2492 void HELPER(sve_index_h)(void *vd, uint32_t start,
2493 uint32_t incr, uint32_t desc)
2495 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2496 uint16_t *d = vd;
2497 for (i = 0; i < opr_sz; i += 1) {
2498 d[H2(i)] = start + i * incr;
2502 void HELPER(sve_index_s)(void *vd, uint32_t start,
2503 uint32_t incr, uint32_t desc)
2505 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2506 uint32_t *d = vd;
2507 for (i = 0; i < opr_sz; i += 1) {
2508 d[H4(i)] = start + i * incr;
2512 void HELPER(sve_index_d)(void *vd, uint64_t start,
2513 uint64_t incr, uint32_t desc)
2515 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2516 uint64_t *d = vd;
2517 for (i = 0; i < opr_sz; i += 1) {
2518 d[i] = start + i * incr;
2522 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2524 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2525 uint32_t sh = simd_data(desc);
2526 uint32_t *d = vd, *n = vn, *m = vm;
2527 for (i = 0; i < opr_sz; i += 1) {
2528 d[i] = n[i] + (m[i] << sh);
2532 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2534 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2535 uint64_t sh = simd_data(desc);
2536 uint64_t *d = vd, *n = vn, *m = vm;
2537 for (i = 0; i < opr_sz; i += 1) {
2538 d[i] = n[i] + (m[i] << sh);
2542 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2544 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2545 uint64_t sh = simd_data(desc);
2546 uint64_t *d = vd, *n = vn, *m = vm;
2547 for (i = 0; i < opr_sz; i += 1) {
2548 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2552 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2554 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2555 uint64_t sh = simd_data(desc);
2556 uint64_t *d = vd, *n = vn, *m = vm;
2557 for (i = 0; i < opr_sz; i += 1) {
2558 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2562 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2564 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2565 static const uint16_t coeff[] = {
2566 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2567 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2568 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2569 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2571 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2572 uint16_t *d = vd, *n = vn;
2574 for (i = 0; i < opr_sz; i++) {
2575 uint16_t nn = n[i];
2576 intptr_t idx = extract32(nn, 0, 5);
2577 uint16_t exp = extract32(nn, 5, 5);
2578 d[i] = coeff[idx] | (exp << 10);
2582 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2584 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2585 static const uint32_t coeff[] = {
2586 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2587 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2588 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2589 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2590 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2591 0x1ef532, 0x20b051, 0x227043, 0x243516,
2592 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2593 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2594 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2595 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2596 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2597 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2598 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2599 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2600 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2601 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2603 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2604 uint32_t *d = vd, *n = vn;
2606 for (i = 0; i < opr_sz; i++) {
2607 uint32_t nn = n[i];
2608 intptr_t idx = extract32(nn, 0, 6);
2609 uint32_t exp = extract32(nn, 6, 8);
2610 d[i] = coeff[idx] | (exp << 23);
2614 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2616 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2617 static const uint64_t coeff[] = {
2618 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2619 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2620 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2621 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2622 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2623 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2624 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2625 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2626 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2627 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2628 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2629 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2630 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2631 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2632 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2633 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2634 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2635 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2636 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2637 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2638 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2639 0xFA7C1819E90D8ull,
2641 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2642 uint64_t *d = vd, *n = vn;
2644 for (i = 0; i < opr_sz; i++) {
2645 uint64_t nn = n[i];
2646 intptr_t idx = extract32(nn, 0, 6);
2647 uint64_t exp = extract32(nn, 6, 11);
2648 d[i] = coeff[idx] | (exp << 52);
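/* FTSSEL: bit 0 of the second operand selects the constant 1.0 in place
 * of the first operand, and bit 1 is XORed into the sign bit of the
 * result.
 */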
2652 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2654 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2655 uint16_t *d = vd, *n = vn, *m = vm;
2656 for (i = 0; i < opr_sz; i += 1) {
2657 uint16_t nn = n[i];
2658 uint16_t mm = m[i];
2659 if (mm & 1) {
2660 nn = float16_one;
2662 d[i] = nn ^ (mm & 2) << 14;
2666 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2668 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2669 uint32_t *d = vd, *n = vn, *m = vm;
2670 for (i = 0; i < opr_sz; i += 1) {
2671 uint32_t nn = n[i];
2672 uint32_t mm = m[i];
2673 if (mm & 1) {
2674 nn = float32_one;
2676 d[i] = nn ^ (mm & 2) << 30;
2680 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2682 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2683 uint64_t *d = vd, *n = vn, *m = vm;
2684 for (i = 0; i < opr_sz; i += 1) {
2685 uint64_t nn = n[i];
2686 uint64_t mm = m[i];
2687 if (mm & 1) {
2688 nn = float64_one;
2690 d[i] = nn ^ (mm & 2) << 62;
2695 * Signed saturating addition with scalar operand.
2698 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2700 intptr_t i, oprsz = simd_oprsz(desc);
2702 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2703 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2707 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2709 intptr_t i, oprsz = simd_oprsz(desc);
2711 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2712 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2716 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2718 intptr_t i, oprsz = simd_oprsz(desc);
2720 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2721 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2725 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2727 intptr_t i, oprsz = simd_oprsz(desc);
2729 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2730 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2735 * Unsigned saturating addition with scalar operand.
2738 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2740 intptr_t i, oprsz = simd_oprsz(desc);
2742 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2743 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2747 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2749 intptr_t i, oprsz = simd_oprsz(desc);
2751 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2752 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2756 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2758 intptr_t i, oprsz = simd_oprsz(desc);
2760 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2761 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2765 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2767 intptr_t i, oprsz = simd_oprsz(desc);
2769 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2770 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2774 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2776 intptr_t i, oprsz = simd_oprsz(desc);
2778 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2779 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2783 /* Two operand predicated copy immediate with merge. All valid immediates
2784 * can fit within 17 signed bits in the simd_data field.
2786 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2787 uint64_t mm, uint32_t desc)
2789 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2790 uint64_t *d = vd, *n = vn;
2791 uint8_t *pg = vg;
2793 mm = dup_const(MO_8, mm);
2794 for (i = 0; i < opr_sz; i += 1) {
2795 uint64_t nn = n[i];
2796 uint64_t pp = expand_pred_b(pg[H1(i)]);
2797 d[i] = (mm & pp) | (nn & ~pp);
2801 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2802 uint64_t mm, uint32_t desc)
2804 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2805 uint64_t *d = vd, *n = vn;
2806 uint8_t *pg = vg;
2808 mm = dup_const(MO_16, mm);
2809 for (i = 0; i < opr_sz; i += 1) {
2810 uint64_t nn = n[i];
2811 uint64_t pp = expand_pred_h(pg[H1(i)]);
2812 d[i] = (mm & pp) | (nn & ~pp);
2816 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2817 uint64_t mm, uint32_t desc)
2819 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2820 uint64_t *d = vd, *n = vn;
2821 uint8_t *pg = vg;
2823 mm = dup_const(MO_32, mm);
2824 for (i = 0; i < opr_sz; i += 1) {
2825 uint64_t nn = n[i];
2826 uint64_t pp = expand_pred_s(pg[H1(i)]);
2827 d[i] = (mm & pp) | (nn & ~pp);
2831 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2832 uint64_t mm, uint32_t desc)
2834 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2835 uint64_t *d = vd, *n = vn;
2836 uint8_t *pg = vg;
2838 for (i = 0; i < opr_sz; i += 1) {
2839 uint64_t nn = n[i];
2840 d[i] = (pg[H1(i)] & 1 ? mm : nn);
2844 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2846 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2847 uint64_t *d = vd;
2848 uint8_t *pg = vg;
2850 val = dup_const(MO_8, val);
2851 for (i = 0; i < opr_sz; i += 1) {
2852 d[i] = val & expand_pred_b(pg[H1(i)]);
2856 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2858 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2859 uint64_t *d = vd;
2860 uint8_t *pg = vg;
2862 val = dup_const(MO_16, val);
2863 for (i = 0; i < opr_sz; i += 1) {
2864 d[i] = val & expand_pred_h(pg[H1(i)]);
2868 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2870 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2871 uint64_t *d = vd;
2872 uint8_t *pg = vg;
2874 val = dup_const(MO_32, val);
2875 for (i = 0; i < opr_sz; i += 1) {
2876 d[i] = val & expand_pred_s(pg[H1(i)]);
2880 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2882 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2883 uint64_t *d = vd;
2884 uint8_t *pg = vg;
2886 for (i = 0; i < opr_sz; i += 1) {
2887 d[i] = (pg[H1(i)] & 1 ? val : 0);
2891 /* Big-endian hosts need to frob the byte indices. If the copy
2892 * happens to be 8-byte aligned, then no frobbing is necessary.
2894 static void swap_memmove(void *vd, void *vs, size_t n)
2896 uintptr_t d = (uintptr_t)vd;
2897 uintptr_t s = (uintptr_t)vs;
2898 uintptr_t o = (d | s | n) & 7;
2899 size_t i;
2901 #ifndef HOST_WORDS_BIGENDIAN
2902 o = 0;
2903 #endif
2904 switch (o) {
2905 case 0:
2906 memmove(vd, vs, n);
2907 break;
2909 case 4:
2910 if (d < s || d >= s + n) {
2911 for (i = 0; i < n; i += 4) {
2912 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2914 } else {
2915 for (i = n; i > 0; ) {
2916 i -= 4;
2917 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2920 break;
2922 case 2:
2923 case 6:
2924 if (d < s || d >= s + n) {
2925 for (i = 0; i < n; i += 2) {
2926 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2928 } else {
2929 for (i = n; i > 0; ) {
2930 i -= 2;
2931 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2934 break;
2936 default:
2937 if (d < s || d >= s + n) {
2938 for (i = 0; i < n; i++) {
2939 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2941 } else {
2942 for (i = n; i > 0; ) {
2943 i -= 1;
2944 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2947 break;
2951 /* Similarly for memset of 0. */
2952 static void swap_memzero(void *vd, size_t n)
2954 uintptr_t d = (uintptr_t)vd;
2955 uintptr_t o = (d | n) & 7;
2956 size_t i;
2958 /* Usually, the first bit of a predicate is set, so N is 0. */
2959 if (likely(n == 0)) {
2960 return;
2963 #ifndef HOST_WORDS_BIGENDIAN
2964 o = 0;
2965 #endif
2966 switch (o) {
2967 case 0:
2968 memset(vd, 0, n);
2969 break;
2971 case 4:
2972 for (i = 0; i < n; i += 4) {
2973 *(uint32_t *)H1_4(d + i) = 0;
2975 break;
2977 case 2:
2978 case 6:
2979 for (i = 0; i < n; i += 2) {
2980 *(uint16_t *)H1_2(d + i) = 0;
2982 break;
2984 default:
2985 for (i = 0; i < n; i++) {
2986 *(uint8_t *)H1(d + i) = 0;
2988 break;
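/* EXT: the result is the concatenation of Zn[n_ofs..opr_sz) followed by
 * Zm[0..n_ofs), with care taken below for the overlapping register cases.
 */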
2992 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2994 intptr_t opr_sz = simd_oprsz(desc);
2995 size_t n_ofs = simd_data(desc);
2996 size_t n_siz = opr_sz - n_ofs;
2998 if (vd != vm) {
2999 swap_memmove(vd, vn + n_ofs, n_siz);
3000 swap_memmove(vd + n_siz, vm, n_ofs);
3001 } else if (vd != vn) {
3002 swap_memmove(vd + n_siz, vd, n_ofs);
3003 swap_memmove(vd, vn + n_ofs, n_siz);
3004 } else {
3005 /* vd == vn == vm. Need temp space. */
3006 ARMVectorReg tmp;
3007 swap_memmove(&tmp, vm, n_ofs);
3008 swap_memmove(vd, vd + n_ofs, n_siz);
3009 memcpy(vd + n_siz, &tmp, n_ofs);
3013 #define DO_INSR(NAME, TYPE, H) \
3014 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
3016 intptr_t opr_sz = simd_oprsz(desc); \
3017 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
3018 *(TYPE *)(vd + H(0)) = val; \
3021 DO_INSR(sve_insr_b, uint8_t, H1)
3022 DO_INSR(sve_insr_h, uint16_t, H1_2)
3023 DO_INSR(sve_insr_s, uint32_t, H1_4)
3024 DO_INSR(sve_insr_d, uint64_t, )
3026 #undef DO_INSR
3028 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
3030 intptr_t i, j, opr_sz = simd_oprsz(desc);
3031 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
3032 uint64_t f = *(uint64_t *)(vn + i);
3033 uint64_t b = *(uint64_t *)(vn + j);
3034 *(uint64_t *)(vd + i) = bswap64(b);
3035 *(uint64_t *)(vd + j) = bswap64(f);
3039 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
3041 intptr_t i, j, opr_sz = simd_oprsz(desc);
3042 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
3043 uint64_t f = *(uint64_t *)(vn + i);
3044 uint64_t b = *(uint64_t *)(vn + j);
3045 *(uint64_t *)(vd + i) = hswap64(b);
3046 *(uint64_t *)(vd + j) = hswap64(f);
3050 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
3052 intptr_t i, j, opr_sz = simd_oprsz(desc);
3053 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
3054 uint64_t f = *(uint64_t *)(vn + i);
3055 uint64_t b = *(uint64_t *)(vn + j);
3056 *(uint64_t *)(vd + i) = rol64(b, 32);
3057 *(uint64_t *)(vd + j) = rol64(f, 32);
3061 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
3063 intptr_t i, j, opr_sz = simd_oprsz(desc);
3064 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
3065 uint64_t f = *(uint64_t *)(vn + i);
3066 uint64_t b = *(uint64_t *)(vn + j);
3067 *(uint64_t *)(vd + i) = b;
3068 *(uint64_t *)(vd + j) = f;
3072 #define DO_TBL(NAME, TYPE, H) \
3073 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3075 intptr_t i, opr_sz = simd_oprsz(desc); \
3076 uintptr_t elem = opr_sz / sizeof(TYPE); \
3077 TYPE *d = vd, *n = vn, *m = vm; \
3078 ARMVectorReg tmp; \
3079 if (unlikely(vd == vn)) { \
3080 n = memcpy(&tmp, vn, opr_sz); \
3082 for (i = 0; i < elem; i++) { \
3083 TYPE j = m[H(i)]; \
3084 d[H(i)] = j < elem ? n[H(j)] : 0; \
3088 DO_TBL(sve_tbl_b, uint8_t, H1)
3089 DO_TBL(sve_tbl_h, uint16_t, H2)
3090 DO_TBL(sve_tbl_s, uint32_t, H4)
3091 DO_TBL(sve_tbl_d, uint64_t, )
3093 #undef DO_TBL
3095 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3096 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
3098 intptr_t i, opr_sz = simd_oprsz(desc); \
3099 TYPED *d = vd; \
3100 TYPES *n = vn; \
3101 ARMVectorReg tmp; \
3102 if (unlikely(vn - vd < opr_sz)) { \
3103 n = memcpy(&tmp, n, opr_sz / 2); \
3105 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
3106 d[HD(i)] = n[HS(i)]; \
3110 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3111 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3112 DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
3114 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3115 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3116 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
3118 #undef DO_UNPK
3120 /* Mask of bits included in the even numbered predicates of width esz.
3121 * We also use this for expand_bits/compress_bits, and so extend the
3122 * same pattern out to 16-bit units.
3124 static const uint64_t even_bit_esz_masks[5] = {
3125 0x5555555555555555ull,
3126 0x3333333333333333ull,
3127 0x0f0f0f0f0f0f0f0full,
3128 0x00ff00ff00ff00ffull,
3129 0x0000ffff0000ffffull,
3132 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3133 * For N==0, this corresponds to the operation that in qemu/bitops.h
3134 * we call half_shuffle64; this algorithm is from Hacker's Delight,
3135 * section 7-2 Shuffling Bits.
3137 static uint64_t expand_bits(uint64_t x, int n)
3139 int i;
3141 x &= 0xffffffffu;
3142 for (i = 4; i >= n; i--) {
3143 int sh = 1 << i;
3144 x = ((x << sh) | x) & even_bit_esz_masks[i];
3146 return x;
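/* Worked example (illustrative): expand_bits(0xb, 0) interleaves zeros
 * between the bits of 0b1011, giving 0x45 (0b01000101).
 */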
3149 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3150 * For N==0, this corresponds to the operation that in qemu/bitops.h
3151 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3152 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3154 static uint64_t compress_bits(uint64_t x, int n)
3156 int i;
3158 for (i = n; i <= 4; i++) {
3159 int sh = 1 << i;
3160 x &= even_bit_esz_masks[i];
3161 x = (x >> sh) | x;
3163 return x & 0xffffffffu;
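/* Worked example (illustrative): compress_bits(0x45, 0) drops the
 * interleaved zero bits again, giving back 0xb.
 */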
3166 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3168 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3169 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3170 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3171 int esize = 1 << esz;
3172 uint64_t *d = vd;
3173 intptr_t i;
3175 if (oprsz <= 8) {
3176 uint64_t nn = *(uint64_t *)vn;
3177 uint64_t mm = *(uint64_t *)vm;
3178 int half = 4 * oprsz;
3180 nn = extract64(nn, high * half, half);
3181 mm = extract64(mm, high * half, half);
3182 nn = expand_bits(nn, esz);
3183 mm = expand_bits(mm, esz);
3184 d[0] = nn | (mm << esize);
3185 } else {
3186 ARMPredicateReg tmp;
3188 /* We produce output faster than we consume input.
3189 Therefore we must be mindful of possible overlap. */
3190 if (vd == vn) {
3191 vn = memcpy(&tmp, vn, oprsz);
3192 if (vd == vm) {
3193 vm = vn;
3195 } else if (vd == vm) {
3196 vm = memcpy(&tmp, vm, oprsz);
3198 if (high) {
3199 high = oprsz >> 1;
3202 if ((oprsz & 7) == 0) {
3203 uint32_t *n = vn, *m = vm;
3204 high >>= 2;
3206 for (i = 0; i < oprsz / 8; i++) {
3207 uint64_t nn = n[H4(high + i)];
3208 uint64_t mm = m[H4(high + i)];
3210 nn = expand_bits(nn, esz);
3211 mm = expand_bits(mm, esz);
3212 d[i] = nn | (mm << esize);
3214 } else {
3215 uint8_t *n = vn, *m = vm;
3216 uint16_t *d16 = vd;
3218 for (i = 0; i < oprsz / 2; i++) {
3219 uint16_t nn = n[H1(high + i)];
3220 uint16_t mm = m[H1(high + i)];
3222 nn = expand_bits(nn, esz);
3223 mm = expand_bits(mm, esz);
3224 d16[H2(i)] = nn | (mm << esize);
3230 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3232 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3233 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3234 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3235 uint64_t *d = vd, *n = vn, *m = vm;
3236 uint64_t l, h;
3237 intptr_t i;
3239 if (oprsz <= 8) {
3240 l = compress_bits(n[0] >> odd, esz);
3241 h = compress_bits(m[0] >> odd, esz);
3242 d[0] = l | (h << (4 * oprsz));
3243 } else {
3244 ARMPredicateReg tmp_m;
3245 intptr_t oprsz_16 = oprsz / 16;
3247 if ((vm - vd) < (uintptr_t)oprsz) {
3248 m = memcpy(&tmp_m, vm, oprsz);
3251 for (i = 0; i < oprsz_16; i++) {
3252 l = n[2 * i + 0];
3253 h = n[2 * i + 1];
3254 l = compress_bits(l >> odd, esz);
3255 h = compress_bits(h >> odd, esz);
3256 d[i] = l | (h << 32);
3260 * For VL which is not a multiple of 512, the results from M do not
3261 * align nicely with the uint64_t for D. Put the aligned results
3262 * from M into TMP_M and then copy it into place afterward.
3264 if (oprsz & 15) {
3265 int final_shift = (oprsz & 15) * 2;
3267 l = n[2 * i + 0];
3268 h = n[2 * i + 1];
3269 l = compress_bits(l >> odd, esz);
3270 h = compress_bits(h >> odd, esz);
3271 d[i] = l | (h << final_shift);
3273 for (i = 0; i < oprsz_16; i++) {
3274 l = m[2 * i + 0];
3275 h = m[2 * i + 1];
3276 l = compress_bits(l >> odd, esz);
3277 h = compress_bits(h >> odd, esz);
3278 tmp_m.p[i] = l | (h << 32);
3280 l = m[2 * i + 0];
3281 h = m[2 * i + 1];
3282 l = compress_bits(l >> odd, esz);
3283 h = compress_bits(h >> odd, esz);
3284 tmp_m.p[i] = l | (h << final_shift);
3286 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3287 } else {
3288 for (i = 0; i < oprsz_16; i++) {
3289 l = m[2 * i + 0];
3290 h = m[2 * i + 1];
3291 l = compress_bits(l >> odd, esz);
3292 h = compress_bits(h >> odd, esz);
3293 d[oprsz_16 + i] = l | (h << 32);
3299 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3301 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3302 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3303 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3304 uint64_t *d = vd, *n = vn, *m = vm;
3305 uint64_t mask;
3306 int shr, shl;
3307 intptr_t i;
3309 shl = 1 << esz;
3310 shr = 0;
3311 mask = even_bit_esz_masks[esz];
3312 if (odd) {
3313 mask <<= shl;
3314 shr = shl;
3315 shl = 0;
3318 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3319 uint64_t nn = (n[i] & mask) >> shr;
3320 uint64_t mm = (m[i] & mask) << shl;
3321 d[i] = nn + mm;
3325 /* Reverse units of 2**N bits. */
3326 static uint64_t reverse_bits_64(uint64_t x, int n)
3328 int i, sh;
3330 x = bswap64(x);
3331 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3332 uint64_t mask = even_bit_esz_masks[i];
3333 x = ((x & mask) << sh) | ((x >> sh) & mask);
3335 return x;
3338 static uint8_t reverse_bits_8(uint8_t x, int n)
3340 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3341 int i, sh;
3343 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3344 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3346 return x;
3349 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3351 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3352 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3353 intptr_t i, oprsz_2 = oprsz / 2;
3355 if (oprsz <= 8) {
3356 uint64_t l = *(uint64_t *)vn;
3357 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3358 *(uint64_t *)vd = l;
3359 } else if ((oprsz & 15) == 0) {
3360 for (i = 0; i < oprsz_2; i += 8) {
3361 intptr_t ih = oprsz - 8 - i;
3362 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3363 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3364 *(uint64_t *)(vd + i) = h;
3365 *(uint64_t *)(vd + ih) = l;
3367 } else {
3368 for (i = 0; i < oprsz_2; i += 1) {
3369 intptr_t il = H1(i);
3370 intptr_t ih = H1(oprsz - 1 - i);
3371 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3372 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3373 *(uint8_t *)(vd + il) = h;
3374 *(uint8_t *)(vd + ih) = l;
3379 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3381 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3382 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3383 uint64_t *d = vd;
3384 intptr_t i;
3386 if (oprsz <= 8) {
3387 uint64_t nn = *(uint64_t *)vn;
3388 int half = 4 * oprsz;
3390 nn = extract64(nn, high * half, half);
3391 nn = expand_bits(nn, 0);
3392 d[0] = nn;
3393 } else {
3394 ARMPredicateReg tmp_n;
3396 /* We produce output faster than we consume input.
3397 Therefore we must be mindful of possible overlap. */
3398 if ((vn - vd) < (uintptr_t)oprsz) {
3399 vn = memcpy(&tmp_n, vn, oprsz);
3401 if (high) {
3402 high = oprsz >> 1;
3405 if ((oprsz & 7) == 0) {
3406 uint32_t *n = vn;
3407 high >>= 2;
3409 for (i = 0; i < oprsz / 8; i++) {
3410 uint64_t nn = n[H4(high + i)];
3411 d[i] = expand_bits(nn, 0);
3413 } else {
3414 uint16_t *d16 = vd;
3415 uint8_t *n = vn;
3417 for (i = 0; i < oprsz / 2; i++) {
3418 uint16_t nn = n[H1(high + i)];
3419 d16[H2(i)] = expand_bits(nn, 0);
3425 #define DO_ZIP(NAME, TYPE, H) \
3426 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3428 intptr_t oprsz = simd_oprsz(desc); \
3429 intptr_t i, oprsz_2 = oprsz / 2; \
3430 ARMVectorReg tmp_n, tmp_m; \
3431 /* We produce output faster than we consume input. \
3432 Therefore we must be mindful of possible overlap. */ \
3433 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
3434 vn = memcpy(&tmp_n, vn, oprsz_2); \
3436 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3437 vm = memcpy(&tmp_m, vm, oprsz_2); \
3439 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3440 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
3441 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
3445 DO_ZIP(sve_zip_b, uint8_t, H1)
3446 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3447 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3448 DO_ZIP(sve_zip_d, uint64_t, )
3450 #define DO_UZP(NAME, TYPE, H) \
3451 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3453 intptr_t oprsz = simd_oprsz(desc); \
3454 intptr_t oprsz_2 = oprsz / 2; \
3455 intptr_t odd_ofs = simd_data(desc); \
3456 intptr_t i; \
3457 ARMVectorReg tmp_m; \
3458 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3459 vm = memcpy(&tmp_m, vm, oprsz); \
3461 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3462 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
3464 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3465 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
3469 DO_UZP(sve_uzp_b, uint8_t, H1)
3470 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3471 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3472 DO_UZP(sve_uzp_d, uint64_t, )
3474 #define DO_TRN(NAME, TYPE, H) \
3475 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3477 intptr_t oprsz = simd_oprsz(desc); \
3478 intptr_t odd_ofs = simd_data(desc); \
3479 intptr_t i; \
3480 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
3481 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
3482 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
3483 *(TYPE *)(vd + H(i + 0)) = ae; \
3484 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
3488 DO_TRN(sve_trn_b, uint8_t, H1)
3489 DO_TRN(sve_trn_h, uint16_t, H1_2)
3490 DO_TRN(sve_trn_s, uint32_t, H1_4)
3491 DO_TRN(sve_trn_d, uint64_t, )
3493 #undef DO_ZIP
3494 #undef DO_UZP
3495 #undef DO_TRN
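/*
 * Illustrative sketch (not an upstream helper): the effect of the
 * DO_ZIP expansion for byte elements, ignoring the H() host-endian
 * fixups and the overlap handling above.  The low halves of N and M
 * are interleaved pairwise into D.
 */
static inline void zip_bytes_reference(uint8_t *d, const uint8_t *n,
                                       const uint8_t *m, intptr_t oprsz)
{
    intptr_t i;

    for (i = 0; i < oprsz / 2; i++) {
        d[2 * i + 0] = n[i];  /* even result elements from the first source */
        d[2 * i + 1] = m[i];  /* odd result elements from the second source */
    }
}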
3497 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3499 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3500 uint32_t *d = vd, *n = vn;
3501 uint8_t *pg = vg;
3503 for (i = j = 0; i < opr_sz; i++) {
3504 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3505 d[H4(j)] = n[H4(i)];
3506 j++;
3509 for (; j < opr_sz; j++) {
3510 d[H4(j)] = 0;
3514 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3516 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3517 uint64_t *d = vd, *n = vn;
3518 uint8_t *pg = vg;
3520 for (i = j = 0; i < opr_sz; i++) {
3521 if (pg[H1(i)] & 1) {
3522 d[j] = n[i];
3523 j++;
3526 for (; j < opr_sz; j++) {
3527 d[j] = 0;
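/*
 * Worked example (illustrative only): COMPACT packs the active
 * elements toward element 0 and zero-fills the remainder, e.g. with
 * n = { 10, 20, 30, 40 } and predicate { 0, 1, 1, 0 } the result is
 * { 20, 30, 0, 0 }.
 */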
3531 /* Similar to the ARM LastActiveElement pseudocode function, except the
3532 * result is multiplied by the element size. This includes the not found
3533 * indication; e.g. not found for esz=3 is -8.
3535 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3537 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3538 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3540 return last_active_element(vg, words, esz);
3543 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3545 intptr_t opr_sz = simd_oprsz(desc) / 8;
3546 int esz = simd_data(desc);
3547 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3548 intptr_t i, first_i, last_i;
3549 ARMVectorReg tmp;
3551 first_i = last_i = 0;
3552 first_g = last_g = 0;
3554 /* Find the extent of the active elements within VG. */
3555 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3556 pg = *(uint64_t *)(vg + i) & mask;
3557 if (pg) {
3558 if (last_g == 0) {
3559 last_g = pg;
3560 last_i = i;
3562 first_g = pg;
3563 first_i = i;
3567 len = 0;
3568 if (first_g != 0) {
3569 first_i = first_i * 8 + ctz64(first_g);
3570 last_i = last_i * 8 + 63 - clz64(last_g);
3571 len = last_i - first_i + (1 << esz);
3572 if (vd == vm) {
3573 vm = memcpy(&tmp, vm, opr_sz * 8);
3575 swap_memmove(vd, vn + first_i, len);
3577 swap_memmove(vd + len, vm, opr_sz * 8 - len);
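/*
 * Illustrative note (not upstream documentation): SPLICE copies the
 * segment of Zn running from the first through the last active
 * element (LEN bytes) to the bottom of Zd, then tops Zd up with the
 * leading elements of Zm.  With no active elements LEN is 0 and the
 * result is simply a copy of Zm.
 */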
3580 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3581 void *vg, uint32_t desc)
3583 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3584 uint64_t *d = vd, *n = vn, *m = vm;
3585 uint8_t *pg = vg;
3587 for (i = 0; i < opr_sz; i += 1) {
3588 uint64_t nn = n[i], mm = m[i];
3589 uint64_t pp = expand_pred_b(pg[H1(i)]);
3590 d[i] = (nn & pp) | (mm & ~pp);
3594 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3595 void *vg, uint32_t desc)
3597 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3598 uint64_t *d = vd, *n = vn, *m = vm;
3599 uint8_t *pg = vg;
3601 for (i = 0; i < opr_sz; i += 1) {
3602 uint64_t nn = n[i], mm = m[i];
3603 uint64_t pp = expand_pred_h(pg[H1(i)]);
3604 d[i] = (nn & pp) | (mm & ~pp);
3608 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3609 void *vg, uint32_t desc)
3611 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3612 uint64_t *d = vd, *n = vn, *m = vm;
3613 uint8_t *pg = vg;
3615 for (i = 0; i < opr_sz; i += 1) {
3616 uint64_t nn = n[i], mm = m[i];
3617 uint64_t pp = expand_pred_s(pg[H1(i)]);
3618 d[i] = (nn & pp) | (mm & ~pp);
3622 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3623 void *vg, uint32_t desc)
3625 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3626 uint64_t *d = vd, *n = vn, *m = vm;
3627 uint8_t *pg = vg;
3629 for (i = 0; i < opr_sz; i += 1) {
3630 uint64_t nn = n[i], mm = m[i];
3631 d[i] = (pg[H1(i)] & 1 ? nn : mm);
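/*
 * Worked example (illustrative only): expand_pred_b() turns the eight
 * predicate bits governing one 64-bit chunk of byte elements into a
 * byte-select mask, e.g. pg = 0x05 -> 0x0000000000ff00ff, so the
 * bitwise select above takes n for active bytes and m otherwise.
 * The _h and _s variants do the same at 16-bit and 32-bit
 * granularity, while the _d variant needs only the low bit of each
 * predicate byte.
 */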
3635 /* Two operand comparison controlled by a predicate.
3636 * ??? It is very tempting to expand this inline
3637 * with x86 instructions, e.g.
3639 * vcmpeqw zm, zn, %ymm0
3640 * vpmovmskb %ymm0, %eax
3641 * and $0x5555, %eax
3642 * and pg, %eax
3644 * or even aarch64, e.g.
3646 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3647 * cmeq v0.8h, zn, zm
3648 * and v0.8h, v0.8h, mask
3649 * addv h0, v0.8h
3650 * and v0.8b, pg
3652 * However, coming up with an abstraction that allows vector inputs and
3653 * a scalar output, and also handles the byte-ordering of sub-uint64_t
3654 * scalar outputs, is tricky.
3656 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
3657 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3659 intptr_t opr_sz = simd_oprsz(desc); \
3660 uint32_t flags = PREDTEST_INIT; \
3661 intptr_t i = opr_sz; \
3662 do { \
3663 uint64_t out = 0, pg; \
3664 do { \
3665 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3666 TYPE nn = *(TYPE *)(vn + H(i)); \
3667 TYPE mm = *(TYPE *)(vm + H(i)); \
3668 out |= nn OP mm; \
3669 } while (i & 63); \
3670 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3671 out &= pg; \
3672 *(uint64_t *)(vd + (i >> 3)) = out; \
3673 flags = iter_predtest_bwd(out, pg, flags); \
3674 } while (i > 0); \
3675 return flags; \
3678 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3679 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3680 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3681 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3682 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3683 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3684 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3685 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
3687 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
3688 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3689 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3690 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3692 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
3693 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3694 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3695 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3697 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
3698 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3699 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3700 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3702 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
3703 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3704 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3705 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3707 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
3708 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3709 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3710 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3712 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
3713 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3714 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3715 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3717 #undef DO_CMP_PPZZ_B
3718 #undef DO_CMP_PPZZ_H
3719 #undef DO_CMP_PPZZ_S
3720 #undef DO_CMP_PPZZ_D
3721 #undef DO_CMP_PPZZ
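/*
 * Illustrative sketch (not an upstream helper): how one 64-bit word
 * of predicate output is assembled by DO_CMP_PPZZ for 32-bit
 * elements.  Each element owns 4 predicate bits (one per byte of the
 * element) and only the least significant bit of each group is used,
 * which is exactly what the 0x1111... MASK keeps from the governing
 * predicate.
 */
static inline uint64_t cmpeq_s_block_reference(const uint32_t *n,
                                               const uint32_t *m,
                                               uint64_t pg)
{
    uint64_t out = 0;
    int e;

    /* 16 32-bit elements per 64-byte block of vector data. */
    for (e = 0; e < 16; e++) {
        out |= (uint64_t)(n[e] == m[e]) << (e * 4);
    }
    return out & pg & 0x1111111111111111ull;
}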
3723 /* Similar, but the second source is "wide". */
3724 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
3725 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3727 intptr_t opr_sz = simd_oprsz(desc); \
3728 uint32_t flags = PREDTEST_INIT; \
3729 intptr_t i = opr_sz; \
3730 do { \
3731 uint64_t out = 0, pg; \
3732 do { \
3733 TYPEW mm = *(TYPEW *)(vm + i - 8); \
3734 do { \
3735 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3736 TYPE nn = *(TYPE *)(vn + H(i)); \
3737 out |= nn OP mm; \
3738 } while (i & 7); \
3739 } while (i & 63); \
3740 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3741 out &= pg; \
3742 *(uint64_t *)(vd + (i >> 3)) = out; \
3743 flags = iter_predtest_bwd(out, pg, flags); \
3744 } while (i > 0); \
3745 return flags; \
3748 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3749 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
3750 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3751 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3752 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3753 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3755 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
3756 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3757 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3759 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
3760 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3761 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3763 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
3764 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
3765 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
3767 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
3768 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
3769 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
3771 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
3772 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3773 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3775 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
3776 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3777 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3779 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
3780 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
3781 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
3783 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
3784 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
3785 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
3787 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
3788 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3789 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3791 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
3792 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3793 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3795 #undef DO_CMP_PPZW_B
3796 #undef DO_CMP_PPZW_H
3797 #undef DO_CMP_PPZW_S
3798 #undef DO_CMP_PPZW
3800 /* Similar, but the second source is immediate. */
3801 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
3802 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
3804 intptr_t opr_sz = simd_oprsz(desc); \
3805 uint32_t flags = PREDTEST_INIT; \
3806 TYPE mm = simd_data(desc); \
3807 intptr_t i = opr_sz; \
3808 do { \
3809 uint64_t out = 0, pg; \
3810 do { \
3811 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3812 TYPE nn = *(TYPE *)(vn + H(i)); \
3813 out |= nn OP mm; \
3814 } while (i & 63); \
3815 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3816 out &= pg; \
3817 *(uint64_t *)(vd + (i >> 3)) = out; \
3818 flags = iter_predtest_bwd(out, pg, flags); \
3819 } while (i > 0); \
3820 return flags; \
3823 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3824 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3825 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3826 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3827 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3828 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3829 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3830 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
3832 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
3833 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3834 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3835 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3837 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
3838 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3839 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3840 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3842 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
3843 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3844 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3845 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3847 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
3848 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3849 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3850 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3852 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
3853 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3854 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3855 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3857 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
3858 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3859 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3860 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3862 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
3863 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3864 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3865 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3867 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
3868 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3869 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3870 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3872 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
3873 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3874 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3875 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3877 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
3878 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3879 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3880 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3882 #undef DO_CMP_PPZI_B
3883 #undef DO_CMP_PPZI_H
3884 #undef DO_CMP_PPZI_S
3885 #undef DO_CMP_PPZI_D
3886 #undef DO_CMP_PPZI
3888 /* Similar to the ARM LastActive pseudocode function. */
3889 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3891 intptr_t i;
3893 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3894 uint64_t pg = *(uint64_t *)(vg + i);
3895 if (pg) {
3896 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3899 return 0;
3902 /* Compute a mask into RETB that is true for all G, up to and including
3903 * (if after) or excluding (if !after) the first G & N.
3904 * Return true if a break was found.
3906 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3907 bool brk, bool after)
3909 uint64_t b;
3911 if (brk) {
3912 b = 0;
3913 } else if ((g & n) == 0) {
3914 /* For all G, no N are set; break not found. */
3915 b = g;
3916 } else {
3917 /* Break somewhere in N. Locate it. */
3918 b = g & n; /* guard true, pred true */
3919 b = b & -b; /* first such */
3920 if (after) {
3921 b = b | (b - 1); /* break after same */
3922 } else {
3923 b = b - 1; /* break before same */
3925 brk = true;
3928 *retb = b;
3929 return brk;
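/*
 * Worked example (illustrative only): with g = 0b1111, n = 0b0100 and
 * no break pending,
 *   g & n               = 0b0100   first true pred under the guard
 *   b & -b              = 0b0100   isolate that element
 *   after:  b | (b - 1) = 0b0111   keep up to and including it
 *   !after: b - 1       = 0b0011   keep strictly before it
 * and subsequent words see brk == true and produce an all-false mask.
 */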
3932 /* Compute a zeroing BRK. */
3933 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3934 intptr_t oprsz, bool after)
3936 bool brk = false;
3937 intptr_t i;
3939 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3940 uint64_t this_b, this_g = g[i];
3942 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3943 d[i] = this_b & this_g;
3947 /* Likewise, but also compute flags. */
3948 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3949 intptr_t oprsz, bool after)
3951 uint32_t flags = PREDTEST_INIT;
3952 bool brk = false;
3953 intptr_t i;
3955 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3956 uint64_t this_b, this_d, this_g = g[i];
3958 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3959 d[i] = this_d = this_b & this_g;
3960 flags = iter_predtest_fwd(this_d, this_g, flags);
3962 return flags;
3965 /* Compute a merging BRK. */
3966 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3967 intptr_t oprsz, bool after)
3969 bool brk = false;
3970 intptr_t i;
3972 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3973 uint64_t this_b, this_g = g[i];
3975 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3976 d[i] = (this_b & this_g) | (d[i] & ~this_g);
3980 /* Likewise, but also compute flags. */
3981 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3982 intptr_t oprsz, bool after)
3984 uint32_t flags = PREDTEST_INIT;
3985 bool brk = false;
3986 intptr_t i;
3988 for (i = 0; i < oprsz / 8; ++i) {
3989 uint64_t this_b, this_d = d[i], this_g = g[i];
3991 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3992 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3993 flags = iter_predtest_fwd(this_d, this_g, flags);
3995 return flags;
3998 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
4000 /* It is quicker to zero the whole predicate than to loop over OPRSZ.
4001 * The compiler should turn this into 4 64-bit integer stores.
4003 memset(d, 0, sizeof(ARMPredicateReg));
4004 return PREDTEST_INIT;
4007 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
4008 uint32_t pred_desc)
4010 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4011 if (last_active_pred(vn, vg, oprsz)) {
4012 compute_brk_z(vd, vm, vg, oprsz, true);
4013 } else {
4014 do_zero(vd, oprsz);
4018 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
4019 uint32_t pred_desc)
4021 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4022 if (last_active_pred(vn, vg, oprsz)) {
4023 return compute_brks_z(vd, vm, vg, oprsz, true);
4024 } else {
4025 return do_zero(vd, oprsz);
4029 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
4030 uint32_t pred_desc)
4032 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4033 if (last_active_pred(vn, vg, oprsz)) {
4034 compute_brk_z(vd, vm, vg, oprsz, false);
4035 } else {
4036 do_zero(vd, oprsz);
4040 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4041 uint32_t pred_desc)
4043 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4044 if (last_active_pred(vn, vg, oprsz)) {
4045 return compute_brks_z(vd, vm, vg, oprsz, false);
4046 } else {
4047 return do_zero(vd, oprsz);
4051 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4053 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4054 compute_brk_z(vd, vn, vg, oprsz, true);
4057 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4059 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4060 return compute_brks_z(vd, vn, vg, oprsz, true);
4063 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4065 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4066 compute_brk_z(vd, vn, vg, oprsz, false);
4069 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4071 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4072 return compute_brks_z(vd, vn, vg, oprsz, false);
4075 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4077 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4078 compute_brk_m(vd, vn, vg, oprsz, true);
4081 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4083 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4084 return compute_brks_m(vd, vn, vg, oprsz, true);
4087 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4089 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4090 compute_brk_m(vd, vn, vg, oprsz, false);
4093 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4095 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4096 return compute_brks_m(vd, vn, vg, oprsz, false);
4099 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4101 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4102 if (!last_active_pred(vn, vg, oprsz)) {
4103 do_zero(vd, oprsz);
4107 /* As if PredTest(Ones(PL), D, esz). */
4108 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4109 uint64_t esz_mask)
4111 uint32_t flags = PREDTEST_INIT;
4112 intptr_t i;
4114 for (i = 0; i < oprsz / 8; i++) {
4115 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4117 if (oprsz & 7) {
4118 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4119 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4121 return flags;
4124 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4126 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4127 if (last_active_pred(vn, vg, oprsz)) {
4128 return predtest_ones(vd, oprsz, -1);
4129 } else {
4130 return do_zero(vd, oprsz);
4134 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4136 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4137 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4138 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4139 intptr_t i;
4141 for (i = 0; i < words; ++i) {
4142 uint64_t t = n[i] & g[i] & mask;
4143 sum += ctpop64(t);
4145 return sum;
4148 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4150 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4151 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4152 uint64_t esz_mask = pred_esz_masks[esz];
4153 ARMPredicateReg *d = vd;
4154 uint32_t flags;
4155 intptr_t i;
4157 /* Begin with a zero predicate register. */
4158 flags = do_zero(d, oprsz);
4159 if (count == 0) {
4160 return flags;
4163 /* Set all of the requested bits. */
4164 for (i = 0; i < count / 64; ++i) {
4165 d->p[i] = esz_mask;
4167 if (count & 63) {
4168 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4171 return predtest_ones(d, oprsz, esz_mask);
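/*
 * Illustrative note (not upstream documentation): COUNT is measured
 * in predicate bits, i.e. elements already scaled by the element
 * size.  For example count = 12 with esz = MO_32 sets bits 0, 4 and 8
 * after masking with pred_esz_masks[2], i.e. three active 32-bit
 * elements, and the final PredTest supplies the NZCV result for the
 * WHILE instruction.
 */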
4174 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4176 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4177 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4178 uint64_t esz_mask = pred_esz_masks[esz];
4179 ARMPredicateReg *d = vd;
4180 intptr_t i, invcount, oprbits;
4181 uint64_t bits;
4183 if (count == 0) {
4184 return do_zero(d, oprsz);
4187 oprbits = oprsz * 8;
4188 tcg_debug_assert(count <= oprbits);
4190 bits = esz_mask;
4191 if (oprbits & 63) {
4192 bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4195 invcount = oprbits - count;
4196 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4197 d->p[i] = bits;
4198 bits = esz_mask;
4201 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4203 while (--i >= 0) {
4204 d->p[i] = 0;
4207 return predtest_ones(d, oprsz, esz_mask);
4210 /* Recursive reduction on a function;
4211 * Cf. the ARM ARM function ReducePredicated.
4213 * While it would be possible to write this without the DATA temporary,
4214 * it is much simpler to process the predicate register this way.
4215 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4216 * little to gain with a more complex non-recursive form.
4218 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
4219 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4221 if (n == 1) { \
4222 return *data; \
4223 } else { \
4224 uintptr_t half = n / 2; \
4225 TYPE lo = NAME##_reduce(data, status, half); \
4226 TYPE hi = NAME##_reduce(data + half, status, half); \
4227 return TYPE##_##FUNC(lo, hi, status); \
4230 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
4232 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
4233 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
4234 for (i = 0; i < oprsz; ) { \
4235 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4236 do { \
4237 TYPE nn = *(TYPE *)(vn + H(i)); \
4238 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
4239 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
4240 } while (i & 15); \
4242 for (; i < maxsz; i += sizeof(TYPE)) { \
4243 *(TYPE *)((void *)data + i) = IDENT; \
4245 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
4248 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
4249 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
4250 DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
4252 /* Identity is floatN_default_nan, without the function call. */
4253 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
4254 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
4255 DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
4257 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
4258 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
4259 DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
4261 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
4262 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
4263 DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
4265 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
4266 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
4267 DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
4269 #undef DO_REDUCE
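/*
 * Illustrative sketch (not an upstream helper): the recursion above
 * reduces as a balanced tree rather than a linear chain, so for four
 * elements the order is (d0 op d1) op (d2 op d3).  Inactive and
 * trailing elements have already been replaced by IDENT and so do not
 * disturb the result.
 */
static inline float32 faddv_s_reduce4_reference(const float32 *data,
                                                float_status *status)
{
    float32 lo = float32_add(data[0], data[1], status);
    float32 hi = float32_add(data[2], data[3], status);
    return float32_add(lo, hi, status);
}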
4271 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4272 void *status, uint32_t desc)
4274 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4275 float16 result = nn;
4277 do {
4278 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4279 do {
4280 if (pg & 1) {
4281 float16 mm = *(float16 *)(vm + H1_2(i));
4282 result = float16_add(result, mm, status);
4284 i += sizeof(float16), pg >>= sizeof(float16);
4285 } while (i & 15);
4286 } while (i < opr_sz);
4288 return result;
4291 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4292 void *status, uint32_t desc)
4294 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4295 float32 result = nn;
4297 do {
4298 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4299 do {
4300 if (pg & 1) {
4301 float32 mm = *(float32 *)(vm + H1_2(i));
4302 result = float32_add(result, mm, status);
4304 i += sizeof(float32), pg >>= sizeof(float32);
4305 } while (i & 15);
4306 } while (i < opr_sz);
4308 return result;
4311 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4312 void *status, uint32_t desc)
4314 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4315 uint64_t *m = vm;
4316 uint8_t *pg = vg;
4318 for (i = 0; i < opr_sz; i++) {
4319 if (pg[H1(i)] & 1) {
4320 nn = float64_add(nn, m[i], status);
4324 return nn;
4327 /* Fully general three-operand expander, controlled by a predicate,
4328 * with the extra float_status parameter.
4330 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
4331 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4332 void *status, uint32_t desc) \
4334 intptr_t i = simd_oprsz(desc); \
4335 uint64_t *g = vg; \
4336 do { \
4337 uint64_t pg = g[(i - 1) >> 6]; \
4338 do { \
4339 i -= sizeof(TYPE); \
4340 if (likely((pg >> (i & 63)) & 1)) { \
4341 TYPE nn = *(TYPE *)(vn + H(i)); \
4342 TYPE mm = *(TYPE *)(vm + H(i)); \
4343 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4345 } while (i & 63); \
4346 } while (i != 0); \
4349 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4350 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4351 DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
4353 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4354 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4355 DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
4357 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4358 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4359 DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
4361 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4362 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4363 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
4365 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4366 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4367 DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
4369 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4370 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4371 DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
4373 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4374 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4375 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
4377 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4378 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4379 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
4381 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4383 return float16_abs(float16_sub(a, b, s));
4386 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4388 return float32_abs(float32_sub(a, b, s));
4391 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4393 return float64_abs(float64_sub(a, b, s));
4396 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4397 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4398 DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
4400 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4402 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4403 return float64_scalbn(a, b_int, s);
4406 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4407 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4408 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
4410 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4411 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4412 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
4414 #undef DO_ZPZZ_FP
4416 /* Three-operand expander, with one scalar operand, controlled by
4417 * a predicate, with the extra float_status parameter.
4419 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4420 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
4421 void *status, uint32_t desc) \
4423 intptr_t i = simd_oprsz(desc); \
4424 uint64_t *g = vg; \
4425 TYPE mm = scalar; \
4426 do { \
4427 uint64_t pg = g[(i - 1) >> 6]; \
4428 do { \
4429 i -= sizeof(TYPE); \
4430 if (likely((pg >> (i & 63)) & 1)) { \
4431 TYPE nn = *(TYPE *)(vn + H(i)); \
4432 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4434 } while (i & 63); \
4435 } while (i != 0); \
4438 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4439 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4440 DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
4442 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4443 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4444 DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
4446 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4447 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4448 DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
4450 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4452 return float16_sub(b, a, s);
4455 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4457 return float32_sub(b, a, s);
4460 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4462 return float64_sub(b, a, s);
4465 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4466 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4467 DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
4469 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4470 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4471 DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
4473 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4474 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4475 DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
4477 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4478 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4479 DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
4481 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4482 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4483 DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
4485 /* Fully general two-operand expander, controlled by a predicate,
4486 * with the extra float_status parameter.
4488 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
4489 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
4491 intptr_t i = simd_oprsz(desc); \
4492 uint64_t *g = vg; \
4493 do { \
4494 uint64_t pg = g[(i - 1) >> 6]; \
4495 do { \
4496 i -= sizeof(TYPE); \
4497 if (likely((pg >> (i & 63)) & 1)) { \
4498 TYPE nn = *(TYPE *)(vn + H(i)); \
4499 *(TYPE *)(vd + H(i)) = OP(nn, status); \
4501 } while (i & 63); \
4502 } while (i != 0); \
4505 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
4506 * FZ16. When converting from fp16, this affects flushing input denormals;
4507 * when converting to fp16, this affects flushing output denormals.
4509 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4511 bool save = get_flush_inputs_to_zero(fpst);
4512 float32 ret;
4514 set_flush_inputs_to_zero(false, fpst);
4515 ret = float16_to_float32(f, true, fpst);
4516 set_flush_inputs_to_zero(save, fpst);
4517 return ret;
4520 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4522 bool save = get_flush_inputs_to_zero(fpst);
4523 float64 ret;
4525 set_flush_inputs_to_zero(false, fpst);
4526 ret = float16_to_float64(f, true, fpst);
4527 set_flush_inputs_to_zero(save, fpst);
4528 return ret;
4531 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4533 bool save = get_flush_to_zero(fpst);
4534 float16 ret;
4536 set_flush_to_zero(false, fpst);
4537 ret = float32_to_float16(f, true, fpst);
4538 set_flush_to_zero(save, fpst);
4539 return ret;
4542 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4544 bool save = get_flush_to_zero(fpst);
4545 float16 ret;
4547 set_flush_to_zero(false, fpst);
4548 ret = float64_to_float16(f, true, fpst);
4549 set_flush_to_zero(save, fpst);
4550 return ret;
4553 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4555 if (float16_is_any_nan(f)) {
4556 float_raise(float_flag_invalid, s);
4557 return 0;
4559 return float16_to_int16_round_to_zero(f, s);
4562 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4564 if (float16_is_any_nan(f)) {
4565 float_raise(float_flag_invalid, s);
4566 return 0;
4568 return float16_to_int64_round_to_zero(f, s);
4571 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4573 if (float32_is_any_nan(f)) {
4574 float_raise(float_flag_invalid, s);
4575 return 0;
4577 return float32_to_int64_round_to_zero(f, s);
4580 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4582 if (float64_is_any_nan(f)) {
4583 float_raise(float_flag_invalid, s);
4584 return 0;
4586 return float64_to_int64_round_to_zero(f, s);
4589 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4591 if (float16_is_any_nan(f)) {
4592 float_raise(float_flag_invalid, s);
4593 return 0;
4595 return float16_to_uint16_round_to_zero(f, s);
4598 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4600 if (float16_is_any_nan(f)) {
4601 float_raise(float_flag_invalid, s);
4602 return 0;
4604 return float16_to_uint64_round_to_zero(f, s);
4607 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4609 if (float32_is_any_nan(f)) {
4610 float_raise(float_flag_invalid, s);
4611 return 0;
4613 return float32_to_uint64_round_to_zero(f, s);
4616 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4618 if (float64_is_any_nan(f)) {
4619 float_raise(float_flag_invalid, s);
4620 return 0;
4622 return float64_to_uint64_round_to_zero(f, s);
4625 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4626 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4627 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
4628 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
4629 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
4630 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
4632 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4633 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4634 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4635 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
4636 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
4637 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
4638 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
4640 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4641 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4642 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4643 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
4644 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
4645 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
4646 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
4648 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4649 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4650 DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
4652 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4653 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4654 DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
4656 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4657 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4658 DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
4660 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4661 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4662 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
4664 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4665 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4666 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4667 DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
4668 DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
4669 DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
4670 DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
4672 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4673 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4674 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4675 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
4676 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
4677 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
4678 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
4680 #undef DO_ZPZ_FP
4682 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4683 float_status *status, uint32_t desc,
4684 uint16_t neg1, uint16_t neg3)
4686 intptr_t i = simd_oprsz(desc);
4687 uint64_t *g = vg;
4689 do {
4690 uint64_t pg = g[(i - 1) >> 6];
4691 do {
4692 i -= 2;
4693 if (likely((pg >> (i & 63)) & 1)) {
4694 float16 e1, e2, e3, r;
4696 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4697 e2 = *(uint16_t *)(vm + H1_2(i));
4698 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4699 r = float16_muladd(e1, e2, e3, 0, status);
4700 *(uint16_t *)(vd + H1_2(i)) = r;
4702 } while (i & 63);
4703 } while (i != 0);
4706 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4707 void *vg, void *status, uint32_t desc)
4709 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
4712 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4713 void *vg, void *status, uint32_t desc)
4715 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
4718 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4719 void *vg, void *status, uint32_t desc)
4721 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
4724 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4725 void *vg, void *status, uint32_t desc)
4727 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
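/*
 * Illustrative summary (derived from the code above): NEG1 and NEG3
 * simply XOR the sign bit of the first multiplicand and of the
 * addend, so the four fused operations share one loop:
 *   FMLA:    a +  n * m   (neg1 = 0,      neg3 = 0)
 *   FMLS:    a + -n * m   (neg1 = 0x8000, neg3 = 0)
 *   FNMLA:  -a + -n * m   (neg1 = 0x8000, neg3 = 0x8000)
 *   FNMLS:  -a +  n * m   (neg1 = 0,      neg3 = 0x8000)
 * The _s and _d variants below do the same with the 32-bit and 64-bit
 * sign bits.
 */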
4730 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4731 float_status *status, uint32_t desc,
4732 uint32_t neg1, uint32_t neg3)
4734 intptr_t i = simd_oprsz(desc);
4735 uint64_t *g = vg;
4737 do {
4738 uint64_t pg = g[(i - 1) >> 6];
4739 do {
4740 i -= 4;
4741 if (likely((pg >> (i & 63)) & 1)) {
4742 float32 e1, e2, e3, r;
4744 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4745 e2 = *(uint32_t *)(vm + H1_4(i));
4746 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4747 r = float32_muladd(e1, e2, e3, 0, status);
4748 *(uint32_t *)(vd + H1_4(i)) = r;
4750 } while (i & 63);
4751 } while (i != 0);
4754 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4755 void *vg, void *status, uint32_t desc)
4757 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
4760 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4761 void *vg, void *status, uint32_t desc)
4763 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
4766 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4767 void *vg, void *status, uint32_t desc)
4769 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
4772 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4773 void *vg, void *status, uint32_t desc)
4775 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
4778 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4779 float_status *status, uint32_t desc,
4780 uint64_t neg1, uint64_t neg3)
4782 intptr_t i = simd_oprsz(desc);
4783 uint64_t *g = vg;
4785 do {
4786 uint64_t pg = g[(i - 1) >> 6];
4787 do {
4788 i -= 8;
4789 if (likely((pg >> (i & 63)) & 1)) {
4790 float64 e1, e2, e3, r;
4792 e1 = *(uint64_t *)(vn + i) ^ neg1;
4793 e2 = *(uint64_t *)(vm + i);
4794 e3 = *(uint64_t *)(va + i) ^ neg3;
4795 r = float64_muladd(e1, e2, e3, 0, status);
4796 *(uint64_t *)(vd + i) = r;
4798 } while (i & 63);
4799 } while (i != 0);
4802 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4803 void *vg, void *status, uint32_t desc)
4805 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
4808 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4809 void *vg, void *status, uint32_t desc)
4811 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
4814 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4815 void *vg, void *status, uint32_t desc)
4817 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
4820 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4821 void *vg, void *status, uint32_t desc)
4823 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
4826 /* Two operand floating-point comparison controlled by a predicate.
4827 * Unlike the integer version, we are not allowed to optimistically
4828 * compare operands, since the comparison may have side effects on
4829 * the FPSR.
4831 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
4832 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4833 void *status, uint32_t desc) \
4835 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4836 uint64_t *d = vd, *g = vg; \
4837 do { \
4838 uint64_t out = 0, pg = g[j]; \
4839 do { \
4840 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4841 if (likely((pg >> (i & 63)) & 1)) { \
4842 TYPE nn = *(TYPE *)(vn + H(i)); \
4843 TYPE mm = *(TYPE *)(vm + H(i)); \
4844 out |= OP(TYPE, nn, mm, status); \
4846 } while (i & 63); \
4847 d[j--] = out; \
4848 } while (i > 0); \
4851 #define DO_FPCMP_PPZZ_H(NAME, OP) \
4852 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
4853 #define DO_FPCMP_PPZZ_S(NAME, OP) \
4854 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
4855 #define DO_FPCMP_PPZZ_D(NAME, OP) \
4856 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
4858 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
4859 DO_FPCMP_PPZZ_H(NAME, OP) \
4860 DO_FPCMP_PPZZ_S(NAME, OP) \
4861 DO_FPCMP_PPZZ_D(NAME, OP)
4863 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
4864 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
4865 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
4866 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
4867 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
4868 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
4869 #define DO_FCMUO(TYPE, X, Y, ST) \
4870 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
4871 #define DO_FACGE(TYPE, X, Y, ST) \
4872 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
4873 #define DO_FACGT(TYPE, X, Y, ST) \
4874 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
4876 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
4877 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
4878 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
4879 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
4880 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
4881 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
4882 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
4884 #undef DO_FPCMP_PPZZ_ALL
4885 #undef DO_FPCMP_PPZZ_D
4886 #undef DO_FPCMP_PPZZ_S
4887 #undef DO_FPCMP_PPZZ_H
4888 #undef DO_FPCMP_PPZZ
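/*
 * Illustrative note (not upstream documentation): the ordered
 * predicates (FCMGE, FCMGT, FACGE, FACGT) use the signalling
 * float*_compare, which raises Invalid Operation for any NaN input,
 * while FCMEQ, FCMNE and FCMUO use float*_compare_quiet, which only
 * signals for signalling NaNs.  This is why the loops above may not
 * compare speculatively: a comparison performed for an inactive
 * element could set cumulative exception flags that the architecture
 * requires to be left alone.
 */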
4890 /* One operand floating-point comparison against zero, controlled
4891 * by a predicate.
4893 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
4894 void HELPER(NAME)(void *vd, void *vn, void *vg, \
4895 void *status, uint32_t desc) \
4897 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4898 uint64_t *d = vd, *g = vg; \
4899 do { \
4900 uint64_t out = 0, pg = g[j]; \
4901 do { \
4902 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4903 if ((pg >> (i & 63)) & 1) { \
4904 TYPE nn = *(TYPE *)(vn + H(i)); \
4905 out |= OP(TYPE, nn, 0, status); \
4907 } while (i & 63); \
4908 d[j--] = out; \
4909 } while (i > 0); \
4912 #define DO_FPCMP_PPZ0_H(NAME, OP) \
4913 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
4914 #define DO_FPCMP_PPZ0_S(NAME, OP) \
4915 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
4916 #define DO_FPCMP_PPZ0_D(NAME, OP) \
4917 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
4919 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
4920 DO_FPCMP_PPZ0_H(NAME, OP) \
4921 DO_FPCMP_PPZ0_S(NAME, OP) \
4922 DO_FPCMP_PPZ0_D(NAME, OP)
4924 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
4925 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
4926 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
4927 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
4928 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
4929 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4931 /* FP Trig Multiply-Add. */
4933 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4935 static const float16 coeff[16] = {
4936 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4937 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4939 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
4940 intptr_t x = simd_data(desc);
4941 float16 *d = vd, *n = vn, *m = vm;
4942 for (i = 0; i < opr_sz; i++) {
4943 float16 mm = m[i];
4944 intptr_t xx = x;
4945 if (float16_is_neg(mm)) {
4946 mm = float16_abs(mm);
4947 xx += 8;
4949 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
4953 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4955 static const float32 coeff[16] = {
4956 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
4957 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
4958 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
4959 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
4961 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
4962 intptr_t x = simd_data(desc);
4963 float32 *d = vd, *n = vn, *m = vm;
4964 for (i = 0; i < opr_sz; i++) {
4965 float32 mm = m[i];
4966 intptr_t xx = x;
4967 if (float32_is_neg(mm)) {
4968 mm = float32_abs(mm);
4969 xx += 8;
4971 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
4975 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4977 static const float64 coeff[16] = {
4978 0x3ff0000000000000ull, 0xbfc5555555555543ull,
4979 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
4980 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
4981 0x3de5d8408868552full, 0x0000000000000000ull,
4982 0x3ff0000000000000ull, 0xbfe0000000000000ull,
4983 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
4984 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
4985 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
4987 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
4988 intptr_t x = simd_data(desc);
4989 float64 *d = vd, *n = vn, *m = vm;
4990 for (i = 0; i < opr_sz; i++) {
4991 float64 mm = m[i];
4992 intptr_t xx = x;
4993 if (float64_is_neg(mm)) {
4994 mm = float64_abs(mm);
4995 xx += 8;
4997 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
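/*
 * Illustrative note (not upstream documentation): the tables above
 * hold per-precision polynomial coefficients approximating the series
 * for sin(x)/x at indices 0-7 (1, -1/6, 1/120, ...) and for cos(x) at
 * indices 8-15 (1, -1/2, 1/24, ...).  Guest code typically chains
 * FTMAD with the immediate counting down, Horner-style:
 *   r = r * m + coeff[7];  ...  r = r * m + coeff[0];
 * with m holding +/- x*x, whose sign selects the sin or cos half of
 * the table as implemented above.
 */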
5002 /* FP Complex Add */
5005 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5006 void *vs, uint32_t desc)
5008 intptr_t j, i = simd_oprsz(desc);
5009 uint64_t *g = vg;
5010 float16 neg_imag = float16_set_sign(0, simd_data(desc));
5011 float16 neg_real = float16_chs(neg_imag);
5013 do {
5014 uint64_t pg = g[(i - 1) >> 6];
5015 do {
5016 float16 e0, e1, e2, e3;
5018 /* I holds the real index; J holds the imag index. */
5019 j = i - sizeof(float16);
5020 i -= 2 * sizeof(float16);
5022 e0 = *(float16 *)(vn + H1_2(i));
5023 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
5024 e2 = *(float16 *)(vn + H1_2(j));
5025 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
5027 if (likely((pg >> (i & 63)) & 1)) {
5028 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
5030 if (likely((pg >> (j & 63)) & 1)) {
5031 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
5033 } while (i & 63);
5034 } while (i != 0);
5037 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5038 void *vs, uint32_t desc)
5040 intptr_t j, i = simd_oprsz(desc);
5041 uint64_t *g = vg;
5042 float32 neg_imag = float32_set_sign(0, simd_data(desc));
5043 float32 neg_real = float32_chs(neg_imag);
5045 do {
5046 uint64_t pg = g[(i - 1) >> 6];
5047 do {
5048 float32 e0, e1, e2, e3;
5050 /* I holds the real index; J holds the imag index. */
5051 j = i - sizeof(float32);
5052 i -= 2 * sizeof(float32);
5054 e0 = *(float32 *)(vn + H1_2(i));
5055 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
5056 e2 = *(float32 *)(vn + H1_2(j));
5057 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
5059 if (likely((pg >> (i & 63)) & 1)) {
5060 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
5062 if (likely((pg >> (j & 63)) & 1)) {
5063 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
5065 } while (i & 63);
5066 } while (i != 0);
5069 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5070 void *vs, uint32_t desc)
5072 intptr_t j, i = simd_oprsz(desc);
5073 uint64_t *g = vg;
5074 float64 neg_imag = float64_set_sign(0, simd_data(desc));
5075 float64 neg_real = float64_chs(neg_imag);
5077 do {
5078 uint64_t pg = g[(i - 1) >> 6];
5079 do {
5080 float64 e0, e1, e2, e3;
5082 /* I holds the real index; J holds the imag index. */
5083 j = i - sizeof(float64);
5084 i -= 2 * sizeof(float64);
5086 e0 = *(float64 *)(vn + H1_2(i));
5087 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
5088 e2 = *(float64 *)(vn + H1_2(j));
5089 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
5091 if (likely((pg >> (i & 63)) & 1)) {
5092 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
5094 if (likely((pg >> (j & 63)) & 1)) {
5095 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
5097 } while (i & 63);
5098 } while (i != 0);
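/*
 * Worked example (illustrative only), with elements taken as
 * (real, imag) pairs and rot = simd_data(desc):
 *   rot == 0 (FCADD #90):   d = (n.re - m.im, n.im + m.re) = n + i*m
 *   rot == 1 (FCADD #270):  d = (n.re + m.im, n.im - m.re) = n - i*m
 * The neg_real/neg_imag constants pre-flip the sign bit of the
 * appropriate half of m before the predicated additions.
 */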
5102 /* FP Complex Multiply */
5105 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5106 void *vg, void *status, uint32_t desc)
5108 intptr_t j, i = simd_oprsz(desc);
5109 unsigned rot = simd_data(desc);
5110 bool flip = rot & 1;
5111 float16 neg_imag, neg_real;
5112 uint64_t *g = vg;
5114 neg_imag = float16_set_sign(0, (rot & 2) != 0);
5115 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
5117 do {
5118 uint64_t pg = g[(i - 1) >> 6];
5119 do {
5120 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5122 /* I holds the real index; J holds the imag index. */
5123 j = i - sizeof(float16);
5124 i -= 2 * sizeof(float16);
5126 nr = *(float16 *)(vn + H1_2(i));
5127 ni = *(float16 *)(vn + H1_2(j));
5128 mr = *(float16 *)(vm + H1_2(i));
5129 mi = *(float16 *)(vm + H1_2(j));
5131 e2 = (flip ? ni : nr);
5132 e1 = (flip ? mi : mr) ^ neg_real;
5133 e4 = e2;
5134 e3 = (flip ? mr : mi) ^ neg_imag;
5136 if (likely((pg >> (i & 63)) & 1)) {
5137 d = *(float16 *)(va + H1_2(i));
5138 d = float16_muladd(e2, e1, d, 0, status);
5139 *(float16 *)(vd + H1_2(i)) = d;
5141 if (likely((pg >> (j & 63)) & 1)) {
5142 d = *(float16 *)(va + H1_2(j));
5143 d = float16_muladd(e4, e3, d, 0, status);
5144 *(float16 *)(vd + H1_2(j)) = d;
5146 } while (i & 63);
5147 } while (i != 0);
5150 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5151 void *vg, void *status, uint32_t desc)
5153 intptr_t j, i = simd_oprsz(desc);
5154 unsigned rot = simd_data(desc);
5155 bool flip = rot & 1;
5156 float32 neg_imag, neg_real;
5157 uint64_t *g = vg;
5159 neg_imag = float32_set_sign(0, (rot & 2) != 0);
5160 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
5162 do {
5163 uint64_t pg = g[(i - 1) >> 6];
5164 do {
5165 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5167 /* I holds the real index; J holds the imag index. */
5168 j = i - sizeof(float32);
5169 i -= 2 * sizeof(float32);
5171 nr = *(float32 *)(vn + H1_2(i));
5172 ni = *(float32 *)(vn + H1_2(j));
5173 mr = *(float32 *)(vm + H1_2(i));
5174 mi = *(float32 *)(vm + H1_2(j));
5176 e2 = (flip ? ni : nr);
5177 e1 = (flip ? mi : mr) ^ neg_real;
5178 e4 = e2;
5179 e3 = (flip ? mr : mi) ^ neg_imag;
5181 if (likely((pg >> (i & 63)) & 1)) {
5182 d = *(float32 *)(va + H1_2(i));
5183 d = float32_muladd(e2, e1, d, 0, status);
5184 *(float32 *)(vd + H1_2(i)) = d;
5186 if (likely((pg >> (j & 63)) & 1)) {
5187 d = *(float32 *)(va + H1_2(j));
5188 d = float32_muladd(e4, e3, d, 0, status);
5189 *(float32 *)(vd + H1_2(j)) = d;
5191 } while (i & 63);
5192 } while (i != 0);
5195 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5196 void *vg, void *status, uint32_t desc)
5198 intptr_t j, i = simd_oprsz(desc);
5199 unsigned rot = simd_data(desc);
5200 bool flip = rot & 1;
5201 float64 neg_imag, neg_real;
5202 uint64_t *g = vg;
5204 neg_imag = float64_set_sign(0, (rot & 2) != 0);
5205 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
5207 do {
5208 uint64_t pg = g[(i - 1) >> 6];
5209 do {
5210 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5212 /* I holds the real index; J holds the imag index. */
5213 j = i - sizeof(float64);
5214 i -= 2 * sizeof(float64);
5216 nr = *(float64 *)(vn + H1_2(i));
5217 ni = *(float64 *)(vn + H1_2(j));
5218 mr = *(float64 *)(vm + H1_2(i));
5219 mi = *(float64 *)(vm + H1_2(j));
5221 e2 = (flip ? ni : nr);
5222 e1 = (flip ? mi : mr) ^ neg_real;
5223 e4 = e2;
5224 e3 = (flip ? mr : mi) ^ neg_imag;
5226 if (likely((pg >> (i & 63)) & 1)) {
5227 d = *(float64 *)(va + H1_2(i));
5228 d = float64_muladd(e2, e1, d, 0, status);
5229 *(float64 *)(vd + H1_2(i)) = d;
5231 if (likely((pg >> (j & 63)) & 1)) {
5232 d = *(float64 *)(va + H1_2(j));
5233 d = float64_muladd(e4, e3, d, 0, status);
5234 *(float64 *)(vd + H1_2(j)) = d;
5236 } while (i & 63);
5237 } while (i != 0);
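/*
 * Rotation summary for the helpers above (derived from the code);
 * rot = simd_data(desc) and elements are (real, imag) pairs:
 *   rot 0 (#0):    d.re += n.re * m.re;   d.im += n.re * m.im
 *   rot 1 (#90):   d.re -= n.im * m.im;   d.im += n.im * m.re
 *   rot 2 (#180):  d.re -= n.re * m.re;   d.im -= n.re * m.im
 *   rot 3 (#270):  d.re += n.im * m.im;   d.im -= n.im * m.re
 * FLIP selects which half of n feeds both products, and the two sign
 * constants fold the negations into the multiplicand taken from m.
 */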
5241 /* Load contiguous data, protected by a governing predicate. */
5245 /* Load one element into @vd + @reg_off from @host.
5246 * The controlling predicate is known to be true. */
5248 typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);
5251 /* Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
5252 * The controlling predicate is known to be true. */
5254 typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
5255 target_ulong vaddr, uintptr_t retaddr);
5258 /* Generate the above primitives. */
5261 #define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
5262 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
5264 TYPEM val = HOST(host); \
5265 *(TYPEE *)(vd + H(reg_off)) = val; \
5268 #define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
5269 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
5270 { HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }
5272 #define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
5273 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
5274 target_ulong addr, uintptr_t ra) \
5276 *(TYPEE *)(vd + H(reg_off)) = \
5277 (TYPEM)TLB(env, useronly_clean_ptr(addr), ra); \
5280 #define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
5281 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
5282 target_ulong addr, uintptr_t ra) \
5284 TLB(env, useronly_clean_ptr(addr), \
5285 (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \
5288 #define DO_LD_PRIM_1(NAME, H, TE, TM) \
5289 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
5290 DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
5292 DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
5293 DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
5294 DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
5295 DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
5296 DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
5297 DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t)
5298 DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t)
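/*
 * As an illustration, DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t) above
 * expands to a pair of helpers roughly like:
 *
 *   static void sve_ld1bhu_host(void *vd, intptr_t reg_off, void *host)
 *   {
 *       uint8_t val = ldub_p(host);
 *       *(uint16_t *)(vd + H1_2(reg_off)) = val;
 *   }
 *   static void sve_ld1bhu_tlb(CPUARMState *env, void *vd, intptr_t reg_off,
 *                              target_ulong addr, uintptr_t ra)
 *   {
 *       *(uint16_t *)(vd + H1_2(reg_off)) =
 *           (uint8_t)cpu_ldub_data_ra(env, useronly_clean_ptr(addr), ra);
 *   }
 *
 * i.e. one byte of memory zero-extended into a 16-bit vector element.
 */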
5300 #define DO_ST_PRIM_1(NAME, H, TE, TM) \
5301 DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \
5302 DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
5304 DO_ST_PRIM_1(bb, H1, uint8_t, uint8_t)
5305 DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
5306 DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
5307 DO_ST_PRIM_1(bd, , uint64_t, uint8_t)
5309 #define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
5310 DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \
5311 DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \
5312 DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
5313 DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
5315 #define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
5316 DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p) \
5317 DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p) \
5318 DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
5319 DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
5321 DO_LD_PRIM_2(hh, H1_2, uint16_t, uint16_t, lduw)
5322 DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
5323 DO_LD_PRIM_2(hss, H1_4, uint32_t, int16_t, lduw)
5324 DO_LD_PRIM_2(hdu, , uint64_t, uint16_t, lduw)
5325 DO_LD_PRIM_2(hds, , uint64_t, int16_t, lduw)
5327 DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
5328 DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
5329 DO_ST_PRIM_2(hd, , uint64_t, uint16_t, stw)
5331 DO_LD_PRIM_2(ss, H1_4, uint32_t, uint32_t, ldl)
5332 DO_LD_PRIM_2(sdu, , uint64_t, uint32_t, ldl)
5333 DO_LD_PRIM_2(sds, , uint64_t, int32_t, ldl)
5335 DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
5336 DO_ST_PRIM_2(sd, , uint64_t, uint32_t, stl)
5338 DO_LD_PRIM_2(dd, , uint64_t, uint64_t, ldq)
5339 DO_ST_PRIM_2(dd, , uint64_t, uint64_t, stq)
5341 #undef DO_LD_TLB
5342 #undef DO_ST_TLB
5343 #undef DO_LD_HOST
5344 #undef DO_LD_PRIM_1
5345 #undef DO_ST_PRIM_1
5346 #undef DO_LD_PRIM_2
5347 #undef DO_ST_PRIM_2
5350 * Skip through a sequence of inactive elements in the guarding predicate @vg,
5351 * beginning at @reg_off bounded by @reg_max. Return the offset of the active
5352 * element >= @reg_off, or @reg_max if there were no active elements at all.
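 *
 * Illustrative example: with esz == MO_32 there is one predicate bit per
 * 4 bytes, so pred_esz_masks[esz] == 0x1111111111111111ull.  Starting at
 * reg_off == 0 with vg[0] == 0x1000, the only active element is the one
 * at byte offset 12, which is what is returned.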
5354 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5355 intptr_t reg_max, int esz)
5357 uint64_t pg_mask = pred_esz_masks[esz];
5358 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5360 /* In normal usage, the first element is active. */
5361 if (likely(pg & 1)) {
5362 return reg_off;
5365 if (pg == 0) {
5366 reg_off &= -64;
5367 do {
5368 reg_off += 64;
5369 if (unlikely(reg_off >= reg_max)) {
5370 /* The entire predicate was false. */
5371 return reg_max;
5373 pg = vg[reg_off >> 6] & pg_mask;
5374 } while (pg == 0);
5376 reg_off += ctz64(pg);
5378 /* We should never see an out of range predicate bit set. */
5379 tcg_debug_assert(reg_off < reg_max);
5380 return reg_off;
5384 * Resolve the guest virtual address to info->host and info->flags.
5385 * If @nofault, return false if the page is invalid, otherwise
5386 * exit via page fault exception.
5389 typedef struct {
5390 void *host;
5391 int flags;
5392 MemTxAttrs attrs;
5393 } SVEHostPage;
5395 static bool sve_probe_page(SVEHostPage *info, bool nofault,
5396 CPUARMState *env, target_ulong addr,
5397 int mem_off, MMUAccessType access_type,
5398 int mmu_idx, uintptr_t retaddr)
5400 int flags;
5402 addr += mem_off;
5405 * User-only currently always issues with TBI. See the comment
5406 * above useronly_clean_ptr. Usually we clean this top byte away
5407 * during translation, but we can't do that for e.g. vector + imm
5408 * addressing modes.
5410 * We currently always enable TBI for user-only, and do not provide
5411 * a way to turn it off. So clean the pointer unconditionally here,
5412 * rather than look it up here, or pass it down from above.
5414 addr = useronly_clean_ptr(addr);
5416 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
5417 &info->host, retaddr);
5418 info->flags = flags;
5420 if (flags & TLB_INVALID_MASK) {
5421 g_assert(nofault);
5422 return false;
5425 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5426 info->host -= mem_off;
5428 #ifdef CONFIG_USER_ONLY
5429 memset(&info->attrs, 0, sizeof(info->attrs));
5430 #else
5432 * Find the iotlbentry for addr and return the transaction attributes.
5433 * This *must* be present in the TLB because we just found the mapping.
5436 uintptr_t index = tlb_index(env, mmu_idx, addr);
5438 # ifdef CONFIG_DEBUG_TCG
5439 CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
5440 target_ulong comparator = (access_type == MMU_DATA_LOAD
5441 ? entry->addr_read
5442 : tlb_addr_write(entry));
5443 g_assert(tlb_hit(comparator, addr));
5444 # endif
5446 CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
5447 info->attrs = iotlbentry->attrs;
5449 #endif
5451 return true;
5456 * Analyse contiguous data, protected by a governing predicate.
5459 typedef enum {
5460 FAULT_NO,
5461 FAULT_FIRST,
5462 FAULT_ALL,
5463 } SVEContFault;
5465 typedef struct {
5467 * First and last element wholly contained within the two pages.
5468 * mem_off_first[0] and reg_off_first[0] are always set >= 0.
5469 * reg_off_last[0] may be < 0 if the first element crosses pages.
5470 * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
5471 * are set >= 0 only if there are complete elements on a second page.
5473 * The reg_off_* offsets are relative to the internal vector register.
5474 * The mem_off_first offset is relative to the memory address; the
5475 * two offsets are different when a load operation extends, a store
5476 * operation truncates, or for multi-register operations.
5478 int16_t mem_off_first[2];
5479 int16_t reg_off_first[2];
5480 int16_t reg_off_last[2];
5483 * One element that is misaligned and spans both pages,
5484 * or -1 if there is no such active element.
5486 int16_t mem_off_split;
5487 int16_t reg_off_split;
5490 * The byte offset at which the entire operation crosses a page boundary.
5491 * Set >= 0 if and only if the entire operation spans two pages.
5493 int16_t page_split;
5495 /* TLB data for the two pages. */
5496 SVEHostPage page[2];
5497 } SVEContLdSt;
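/*
 * Illustrative example: a 16-byte operation on 4-byte elements with all
 * predicate bits active, starting 8 bytes before a page boundary.
 * sve_cont_ldst_elements() below then produces:
 *     reg_off_first = { 0, 8 },  reg_off_last = { 4, 12 },
 *     mem_off_first = { 0, 8 },  mem_off_split = reg_off_split = -1,
 *     page_split = 8
 * Elements 0-1 lie wholly on the first page and elements 2-3 on the
 * second; nothing straddles the boundary because the 8-byte split point
 * is a multiple of the 4-byte access size.
 */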
5500 * Find first active element on each page, and a loose bound for the
5501 * final element on each page. Identify any single element that spans
5502 * the page boundary. Return true if there are any active elements.
5504 static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
5505 uint64_t *vg, intptr_t reg_max,
5506 int esz, int msize)
5508 const int esize = 1 << esz;
5509 const uint64_t pg_mask = pred_esz_masks[esz];
5510 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5511 intptr_t mem_off_last, mem_off_split;
5512 intptr_t page_split, elt_split;
5513 intptr_t i;
5515 /* Set all of the element indices to -1, and the TLB data to 0. */
5516 memset(info, -1, offsetof(SVEContLdSt, page));
5517 memset(info->page, 0, sizeof(info->page));
5519 /* Gross scan over the entire predicate to find bounds. */
5520 i = 0;
5521 do {
5522 uint64_t pg = vg[i] & pg_mask;
5523 if (pg) {
5524 reg_off_last = i * 64 + 63 - clz64(pg);
5525 if (reg_off_first < 0) {
5526 reg_off_first = i * 64 + ctz64(pg);
5529 } while (++i * 64 < reg_max);
5531 if (unlikely(reg_off_first < 0)) {
5532 /* No active elements, no pages touched. */
5533 return false;
5535 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5537 info->reg_off_first[0] = reg_off_first;
5538 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5539 mem_off_last = (reg_off_last >> esz) * msize;
5541 page_split = -(addr | TARGET_PAGE_MASK);
5542 if (likely(mem_off_last + msize <= page_split)) {
5543 /* The entire operation fits within a single page. */
5544 info->reg_off_last[0] = reg_off_last;
5545 return true;
5548 info->page_split = page_split;
5549 elt_split = page_split / msize;
5550 reg_off_split = elt_split << esz;
5551 mem_off_split = elt_split * msize;
5554 * This is the last full element on the first page, but it is not
5555 * necessarily active. If there is no full element, i.e. the first
5556 * active element is the one that's split, this value remains -1.
5557 * It is useful as an iteration bound.
5559 if (elt_split != 0) {
5560 info->reg_off_last[0] = reg_off_split - esize;
5563 /* Determine if an unaligned element spans the pages. */
5564 if (page_split % msize != 0) {
5565 /* It is helpful to know if the split element is active. */
5566 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5567 info->reg_off_split = reg_off_split;
5568 info->mem_off_split = mem_off_split;
5570 if (reg_off_split == reg_off_last) {
5571 /* The page crossing element is last. */
5572 return true;
5575 reg_off_split += esize;
5576 mem_off_split += msize;
5580 * We do want the first active element on the second page, because
5581 * this may affect the address reported in an exception.
5583 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5584 tcg_debug_assert(reg_off_split <= reg_off_last);
5585 info->reg_off_first[1] = reg_off_split;
5586 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5587 info->reg_off_last[1] = reg_off_last;
5588 return true;
5592 * Resolve the guest virtual addresses to info->page[].
5593 * Control the generation of page faults with @fault. Return false if
5594 * there is no work to do, which can only happen with @fault == FAULT_NO.
5596 static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5597 CPUARMState *env, target_ulong addr,
5598 MMUAccessType access_type, uintptr_t retaddr)
5600 int mmu_idx = cpu_mmu_index(env, false);
5601 int mem_off = info->mem_off_first[0];
5602 bool nofault = fault == FAULT_NO;
5603 bool have_work = true;
5605 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5606 access_type, mmu_idx, retaddr)) {
5607 /* No work to be done. */
5608 return false;
5611 if (likely(info->page_split < 0)) {
5612 /* The entire operation was on the one page. */
5613 return true;
5617 * If the second page is invalid, then we want the fault address to be
5618 * the first byte on that page which is accessed.
5620 if (info->mem_off_split >= 0) {
5622 * There is an element split across the pages. The fault address
5623 * should be the first byte of the second page.
5625 mem_off = info->page_split;
5627 * If the split element is also the first active element
5628 * of the vector, then: For first-fault we should continue
5629 * to generate faults for the second page. For no-fault,
5630 * we have work only if the second page is valid.
5632 if (info->mem_off_first[0] < info->mem_off_split) {
5633 nofault = FAULT_FIRST;
5634 have_work = false;
5636 } else {
5638 * There is no element split across the pages. The fault address
5639 * should be the first active element on the second page.
5641 mem_off = info->mem_off_first[1];
5643 * There must have been one active element on the first page,
5644 * so we're out of first-fault territory.
5646 nofault = fault != FAULT_ALL;
5649 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5650 access_type, mmu_idx, retaddr);
5651 return have_work;
5654 static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5655 uint64_t *vg, target_ulong addr,
5656 int esize, int msize, int wp_access,
5657 uintptr_t retaddr)
5659 #ifndef CONFIG_USER_ONLY
5660 intptr_t mem_off, reg_off, reg_last;
5661 int flags0 = info->page[0].flags;
5662 int flags1 = info->page[1].flags;
5664 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5665 return;
5668 /* Indicate that watchpoints are handled. */
5669 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5670 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5672 if (flags0 & TLB_WATCHPOINT) {
5673 mem_off = info->mem_off_first[0];
5674 reg_off = info->reg_off_first[0];
5675 reg_last = info->reg_off_last[0];
5677 while (reg_off <= reg_last) {
5678 uint64_t pg = vg[reg_off >> 6];
5679 do {
5680 if ((pg >> (reg_off & 63)) & 1) {
5681 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5682 msize, info->page[0].attrs,
5683 wp_access, retaddr);
5685 reg_off += esize;
5686 mem_off += msize;
5687 } while (reg_off <= reg_last && (reg_off & 63));
5691 mem_off = info->mem_off_split;
5692 if (mem_off >= 0) {
5693 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5694 info->page[0].attrs, wp_access, retaddr);
5697 mem_off = info->mem_off_first[1];
5698 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5699 reg_off = info->reg_off_first[1];
5700 reg_last = info->reg_off_last[1];
5702 do {
5703 uint64_t pg = vg[reg_off >> 6];
5704 do {
5705 if ((pg >> (reg_off & 63)) & 1) {
5706 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5707 msize, info->page[1].attrs,
5708 wp_access, retaddr);
5710 reg_off += esize;
5711 mem_off += msize;
5712 } while (reg_off & 63);
5713 } while (reg_off <= reg_last);
5715 #endif
5718 static void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5719 uint64_t *vg, target_ulong addr, int esize,
5720 int msize, uint32_t mtedesc, uintptr_t ra)
5722 intptr_t mem_off, reg_off, reg_last;
5724 /* Process the page only if MemAttr == Tagged. */
5725 if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
5726 mem_off = info->mem_off_first[0];
5727 reg_off = info->reg_off_first[0];
5728 reg_last = info->reg_off_split;
5729 if (reg_last < 0) {
5730 reg_last = info->reg_off_last[0];
5733 do {
5734 uint64_t pg = vg[reg_off >> 6];
5735 do {
5736 if ((pg >> (reg_off & 63)) & 1) {
5737 mte_check(env, mtedesc, addr, ra);
5739 reg_off += esize;
5740 mem_off += msize;
5741 } while (reg_off <= reg_last && (reg_off & 63));
5742 } while (reg_off <= reg_last);
5745 mem_off = info->mem_off_first[1];
5746 if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
5747 reg_off = info->reg_off_first[1];
5748 reg_last = info->reg_off_last[1];
5750 do {
5751 uint64_t pg = vg[reg_off >> 6];
5752 do {
5753 if ((pg >> (reg_off & 63)) & 1) {
5754 mte_check(env, mtedesc, addr, ra);
5756 reg_off += esize;
5757 mem_off += msize;
5758 } while (reg_off & 63);
5759 } while (reg_off <= reg_last);
5764 * Common helper for all contiguous 1,2,3,4-register predicated loads.
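 *
 * The flow is, roughly: find the active elements and how they map onto
 * one or two pages, probe those pages (raising a fault now for any
 * invalid page), notify watchpoints and perform any MTE checks, then do
 * the loads themselves -- via the host fast path where possible, falling
 * back to tlb_fn for MMIO pages or for an element split across the
 * page boundary.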
5766 static inline QEMU_ALWAYS_INLINE
5767 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5768 uint32_t desc, const uintptr_t retaddr,
5769 const int esz, const int msz, const int N, uint32_t mtedesc,
5770 sve_ldst1_host_fn *host_fn,
5771 sve_ldst1_tlb_fn *tlb_fn)
5773 const unsigned rd = simd_data(desc);
5774 const intptr_t reg_max = simd_oprsz(desc);
5775 intptr_t reg_off, reg_last, mem_off;
5776 SVEContLdSt info;
5777 void *host;
5778 int flags, i;
5780 /* Find the active elements. */
5781 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5782 /* The entire predicate was false; no load occurs. */
5783 for (i = 0; i < N; ++i) {
5784 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5786 return;
5789 /* Probe the page(s). Exit with exception for any invalid page. */
5790 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5792 /* Handle watchpoints for all active elements. */
5793 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5794 BP_MEM_READ, retaddr);
5797 * Handle mte checks for all active elements.
5798 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5800 if (mtedesc) {
5801 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5802 mtedesc, retaddr);
5805 flags = info.page[0].flags | info.page[1].flags;
5806 if (unlikely(flags != 0)) {
5807 #ifdef CONFIG_USER_ONLY
5808 g_assert_not_reached();
5809 #else
5811 * At least one page includes MMIO.
5812 * Any bus operation can fail with cpu_transaction_failed,
5813 * which for ARM will raise SyncExternal. Perform the load
5814 * into scratch memory to preserve register state until the end.
5816 ARMVectorReg scratch[4] = { };
5818 mem_off = info.mem_off_first[0];
5819 reg_off = info.reg_off_first[0];
5820 reg_last = info.reg_off_last[1];
5821 if (reg_last < 0) {
5822 reg_last = info.reg_off_split;
5823 if (reg_last < 0) {
5824 reg_last = info.reg_off_last[0];
5828 do {
5829 uint64_t pg = vg[reg_off >> 6];
5830 do {
5831 if ((pg >> (reg_off & 63)) & 1) {
5832 for (i = 0; i < N; ++i) {
5833 tlb_fn(env, &scratch[i], reg_off,
5834 addr + mem_off + (i << msz), retaddr);
5837 reg_off += 1 << esz;
5838 mem_off += N << msz;
5839 } while (reg_off & 63);
5840 } while (reg_off <= reg_last);
5842 for (i = 0; i < N; ++i) {
5843 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5845 return;
5846 #endif
5849 /* The entire operation is in RAM, on valid pages. */
5851 for (i = 0; i < N; ++i) {
5852 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5855 mem_off = info.mem_off_first[0];
5856 reg_off = info.reg_off_first[0];
5857 reg_last = info.reg_off_last[0];
5858 host = info.page[0].host;
5860 while (reg_off <= reg_last) {
5861 uint64_t pg = vg[reg_off >> 6];
5862 do {
5863 if ((pg >> (reg_off & 63)) & 1) {
5864 for (i = 0; i < N; ++i) {
5865 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5866 host + mem_off + (i << msz));
5869 reg_off += 1 << esz;
5870 mem_off += N << msz;
5871 } while (reg_off <= reg_last && (reg_off & 63));
5875 * Use the slow path to manage the cross-page misalignment.
5876 * But we know this is RAM and cannot trap.
5878 mem_off = info.mem_off_split;
5879 if (unlikely(mem_off >= 0)) {
5880 reg_off = info.reg_off_split;
5881 for (i = 0; i < N; ++i) {
5882 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5883 addr + mem_off + (i << msz), retaddr);
5887 mem_off = info.mem_off_first[1];
5888 if (unlikely(mem_off >= 0)) {
5889 reg_off = info.reg_off_first[1];
5890 reg_last = info.reg_off_last[1];
5891 host = info.page[1].host;
5893 do {
5894 uint64_t pg = vg[reg_off >> 6];
5895 do {
5896 if ((pg >> (reg_off & 63)) & 1) {
5897 for (i = 0; i < N; ++i) {
5898 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5899 host + mem_off + (i << msz));
5902 reg_off += 1 << esz;
5903 mem_off += N << msz;
5904 } while (reg_off & 63);
5905 } while (reg_off <= reg_last);
5909 static inline QEMU_ALWAYS_INLINE
5910 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5911 uint32_t desc, const uintptr_t ra,
5912 const int esz, const int msz, const int N,
5913 sve_ldst1_host_fn *host_fn,
5914 sve_ldst1_tlb_fn *tlb_fn)
5916 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5917 int bit55 = extract64(addr, 55, 1);
5919 /* Remove mtedesc from the normal sve descriptor. */
5920 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5922 /* Perform gross MTE suppression early. */
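    /*
     * Roughly: if TBI is disabled for this address (selected by bit 55),
     * or TCMA makes accesses with this allocation tag unchecked, then no
     * tag check can fire, so drop mtedesc and skip the MTE checks below.
     */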
5923 if (!tbi_check(desc, bit55) ||
5924 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5925 mtedesc = 0;
5928 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
5931 #define DO_LD1_1(NAME, ESZ) \
5932 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
5933 target_ulong addr, uint32_t desc) \
5935 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
5936 sve_##NAME##_host, sve_##NAME##_tlb); \
5938 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
5939 target_ulong addr, uint32_t desc) \
5941 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
5942 sve_##NAME##_host, sve_##NAME##_tlb); \
5945 #define DO_LD1_2(NAME, ESZ, MSZ) \
5946 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
5947 target_ulong addr, uint32_t desc) \
5949 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
5950 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
5952 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
5953 target_ulong addr, uint32_t desc) \
5955 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
5956 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
5958 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
5959 target_ulong addr, uint32_t desc) \
5961 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
5962 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
5964 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
5965 target_ulong addr, uint32_t desc) \
5967 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
5968 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
5971 DO_LD1_1(ld1bb, MO_8)
5972 DO_LD1_1(ld1bhu, MO_16)
5973 DO_LD1_1(ld1bhs, MO_16)
5974 DO_LD1_1(ld1bsu, MO_32)
5975 DO_LD1_1(ld1bss, MO_32)
5976 DO_LD1_1(ld1bdu, MO_64)
5977 DO_LD1_1(ld1bds, MO_64)
5979 DO_LD1_2(ld1hh, MO_16, MO_16)
5980 DO_LD1_2(ld1hsu, MO_32, MO_16)
5981 DO_LD1_2(ld1hss, MO_32, MO_16)
5982 DO_LD1_2(ld1hdu, MO_64, MO_16)
5983 DO_LD1_2(ld1hds, MO_64, MO_16)
5985 DO_LD1_2(ld1ss, MO_32, MO_32)
5986 DO_LD1_2(ld1sdu, MO_64, MO_32)
5987 DO_LD1_2(ld1sds, MO_64, MO_32)
5989 DO_LD1_2(ld1dd, MO_64, MO_64)
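/*
 * Naming convention for the helpers generated above: "ld1" + memory size
 * (b/h/s/d = 8/16/32/64 bits) + element size, with a trailing "u" or "s"
 * for zero- vs sign-extension when the two sizes differ; "_le"/"_be"
 * select the memory endianness and "_mte" the tag-checked entry point.
 * For example, sve_ld1hsu_le_r loads little-endian 16-bit values and
 * zero-extends them into 32-bit elements.
 */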
5991 #undef DO_LD1_1
5992 #undef DO_LD1_2
5994 #define DO_LDN_1(N) \
5995 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
5996 target_ulong addr, uint32_t desc) \
5998 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
5999 sve_ld1bb_host, sve_ld1bb_tlb); \
6001 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
6002 target_ulong addr, uint32_t desc) \
6004 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
6005 sve_ld1bb_host, sve_ld1bb_tlb); \
6008 #define DO_LDN_2(N, SUFF, ESZ) \
6009 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
6010 target_ulong addr, uint32_t desc) \
6012 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
6013 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
6015 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
6016 target_ulong addr, uint32_t desc) \
6018 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
6019 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
6021 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
6022 target_ulong addr, uint32_t desc) \
6024 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
6025 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
6027 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
6028 target_ulong addr, uint32_t desc) \
6030 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
6031 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
6034 DO_LDN_1(2)
6035 DO_LDN_1(3)
6036 DO_LDN_1(4)
6038 DO_LDN_2(2, hh, MO_16)
6039 DO_LDN_2(3, hh, MO_16)
6040 DO_LDN_2(4, hh, MO_16)
6042 DO_LDN_2(2, ss, MO_32)
6043 DO_LDN_2(3, ss, MO_32)
6044 DO_LDN_2(4, ss, MO_32)
6046 DO_LDN_2(2, dd, MO_64)
6047 DO_LDN_2(3, dd, MO_64)
6048 DO_LDN_2(4, dd, MO_64)
6050 #undef DO_LDN_1
6051 #undef DO_LDN_2
6054 * Load contiguous data, first-fault and no-fault.
6056 * For user-only, one could argue that we should hold the mmap_lock during
6057 * the operation so that there is no race between page_check_range and the
6058 * load operation. However, unmapping pages out from under a running thread
6059 * is extraordinarily unlikely. This theoretical race condition also affects
6060 * linux-user/ in its get_user/put_user macros.
6062 * TODO: Construct some helpers, written in assembly, that interact with
6063 * handle_cpu_signal to produce memory ops which can properly report errors
6064 * without racing.
6067 /* Fault on byte I. All bits in FFR from I are cleared. The vector
6068 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
6069 * option, which leaves subsequent data unchanged.
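 *
 * For example, record_fault(env, 70, 256) keeps FFR bits 0-69: it masks
 * ffr[1] down to its low 6 bits and clears ffr[2] and ffr[3].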
6071 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
6073 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
6075 if (i & 63) {
6076 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
6077 i = ROUND_UP(i, 64);
6079 for (; i < oprsz; i += 64) {
6080 ffr[i / 64] = 0;
6085 * Common helper for all contiguous no-fault and first-fault loads.
6087 static inline QEMU_ALWAYS_INLINE
6088 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
6089 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
6090 const int esz, const int msz, const SVEContFault fault,
6091 sve_ldst1_host_fn *host_fn,
6092 sve_ldst1_tlb_fn *tlb_fn)
6094 const unsigned rd = simd_data(desc);
6095 void *vd = &env->vfp.zregs[rd];
6096 const intptr_t reg_max = simd_oprsz(desc);
6097 intptr_t reg_off, mem_off, reg_last;
6098 SVEContLdSt info;
6099 int flags;
6100 void *host;
6102 /* Find the active elements. */
6103 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
6104 /* The entire predicate was false; no load occurs. */
6105 memset(vd, 0, reg_max);
6106 return;
6108 reg_off = info.reg_off_first[0];
6110 /* Probe the page(s). */
6111 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
6112 /* Fault on first element. */
6113 tcg_debug_assert(fault == FAULT_NO);
6114 memset(vd, 0, reg_max);
6115 goto do_fault;
6118 mem_off = info.mem_off_first[0];
6119 flags = info.page[0].flags;
6122 * Disable MTE checking if the Tagged bit is not set. Since TBI must
6123 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6125 if (!arm_tlb_mte_tagged(&info.page[0].attrs)) {
6126 mtedesc = 0;
6129 if (fault == FAULT_FIRST) {
6130 /* Trapping mte check for the first-fault element. */
6131 if (mtedesc) {
6132 mte_check(env, mtedesc, addr + mem_off, retaddr);
6136 * Special handling of the first active element,
6137 * if it crosses a page boundary or is MMIO.
6139 bool is_split = mem_off == info.mem_off_split;
6140 if (unlikely(flags != 0) || unlikely(is_split)) {
6142 * Use the slow path for cross-page handling.
6143 * Might trap for MMIO or watchpoints.
6145 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6147 /* After any fault, zero the other elements. */
6148 swap_memzero(vd, reg_off);
6149 reg_off += 1 << esz;
6150 mem_off += 1 << msz;
6151 swap_memzero(vd + reg_off, reg_max - reg_off);
6153 if (is_split) {
6154 goto second_page;
6156 } else {
6157 memset(vd, 0, reg_max);
6159 } else {
6160 memset(vd, 0, reg_max);
6161 if (unlikely(mem_off == info.mem_off_split)) {
6162 /* The first active element crosses a page boundary. */
6163 flags |= info.page[1].flags;
6164 if (unlikely(flags & TLB_MMIO)) {
6165 /* Some page is MMIO, see below. */
6166 goto do_fault;
6168 if (unlikely(flags & TLB_WATCHPOINT) &&
6169 (cpu_watchpoint_address_matches
6170 (env_cpu(env), addr + mem_off, 1 << msz)
6171 & BP_MEM_READ)) {
6172 /* Watchpoint hit, see below. */
6173 goto do_fault;
6175 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6176 goto do_fault;
6179 * Use the slow path for cross-page handling.
6180 * This is RAM, without a watchpoint, and will not trap.
6182 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6183 goto second_page;
6188 * From this point on, all memory operations are MemSingleNF.
6190 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6191 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6193 * Unfortunately we do not have access to the memory attributes from the
6194 * PTE to tell Device memory from Normal memory. So we make a mostly
6195 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6196 * This gives the right answer for the common cases of "Normal memory,
6197 * backed by host RAM" and "Device memory, backed by MMIO".
6198 * The architecture allows us to suppress an NF load and return
6199 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6200 * case of "Normal memory, backed by MMIO" is permitted. The case we
6201 * get wrong is "Device memory, backed by host RAM", for which we
6202 * should return (UNKNOWN, FAULT) but do not.
6204 * Similarly, CPU_BP breakpoints would raise exceptions, and so
6205 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
6206 * architectural breakpoints the same.
6208 if (unlikely(flags & TLB_MMIO)) {
6209 goto do_fault;
6212 reg_last = info.reg_off_last[0];
6213 host = info.page[0].host;
6215 do {
6216 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6217 do {
6218 if ((pg >> (reg_off & 63)) & 1) {
6219 if (unlikely(flags & TLB_WATCHPOINT) &&
6220 (cpu_watchpoint_address_matches
6221 (env_cpu(env), addr + mem_off, 1 << msz)
6222 & BP_MEM_READ)) {
6223 goto do_fault;
6225 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6226 goto do_fault;
6228 host_fn(vd, reg_off, host + mem_off);
6230 reg_off += 1 << esz;
6231 mem_off += 1 << msz;
6232 } while (reg_off <= reg_last && (reg_off & 63));
6233 } while (reg_off <= reg_last);
6236 * MemSingleNF is allowed to fail for any reason. We have special
6237 * code above to handle the first element crossing a page boundary.
6238 * As an implementation choice, decline to handle a cross-page element
6239 * in any other position.
6241 reg_off = info.reg_off_split;
6242 if (reg_off >= 0) {
6243 goto do_fault;
6246 second_page:
6247 reg_off = info.reg_off_first[1];
6248 if (likely(reg_off < 0)) {
6249 /* No active elements on the second page. All done. */
6250 return;
6254 * MemSingleNF is allowed to fail for any reason. As an implementation
6255 * choice, decline to handle elements on the second page. This should
6256 * be low frequency as the guest walks through memory -- the next
6257 * iteration of the guest's loop should be aligned on the page boundary,
6258 * and then all following iterations will stay aligned.
6261 do_fault:
6262 record_fault(env, reg_off, reg_max);
6265 static inline QEMU_ALWAYS_INLINE
6266 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6267 uint32_t desc, const uintptr_t retaddr,
6268 const int esz, const int msz, const SVEContFault fault,
6269 sve_ldst1_host_fn *host_fn,
6270 sve_ldst1_tlb_fn *tlb_fn)
6272 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6273 int bit55 = extract64(addr, 55, 1);
6275 /* Remove mtedesc from the normal sve descriptor. */
6276 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6278 /* Perform gross MTE suppression early. */
6279 if (!tbi_check(desc, bit55) ||
6280 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6281 mtedesc = 0;
6284 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6285 esz, msz, fault, host_fn, tlb_fn);
6288 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
6289 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
6290 target_ulong addr, uint32_t desc) \
6292 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6293 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6295 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
6296 target_ulong addr, uint32_t desc) \
6298 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6299 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6301 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
6302 target_ulong addr, uint32_t desc) \
6304 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6305 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6307 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
6308 target_ulong addr, uint32_t desc) \
6310 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6311 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6314 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
6315 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
6316 target_ulong addr, uint32_t desc) \
6318 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6319 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6321 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
6322 target_ulong addr, uint32_t desc) \
6324 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6325 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6327 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
6328 target_ulong addr, uint32_t desc) \
6330 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6331 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6333 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
6334 target_ulong addr, uint32_t desc) \
6336 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6337 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6339 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6340 target_ulong addr, uint32_t desc) \
6342 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6343 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6345 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6346 target_ulong addr, uint32_t desc) \
6348 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6349 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6351 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6352 target_ulong addr, uint32_t desc) \
6354 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6355 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6357 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6358 target_ulong addr, uint32_t desc) \
6360 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6361 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6364 DO_LDFF1_LDNF1_1(bb, MO_8)
6365 DO_LDFF1_LDNF1_1(bhu, MO_16)
6366 DO_LDFF1_LDNF1_1(bhs, MO_16)
6367 DO_LDFF1_LDNF1_1(bsu, MO_32)
6368 DO_LDFF1_LDNF1_1(bss, MO_32)
6369 DO_LDFF1_LDNF1_1(bdu, MO_64)
6370 DO_LDFF1_LDNF1_1(bds, MO_64)
6372 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
6373 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6374 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6375 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6376 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6378 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
6379 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6380 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6382 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
6384 #undef DO_LDFF1_LDNF1_1
6385 #undef DO_LDFF1_LDNF1_2
6388 * Common helper for all contiguous 1,2,3,4-register predicated stores.
6391 static inline QEMU_ALWAYS_INLINE
6392 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6393 uint32_t desc, const uintptr_t retaddr,
6394 const int esz, const int msz, const int N, uint32_t mtedesc,
6395 sve_ldst1_host_fn *host_fn,
6396 sve_ldst1_tlb_fn *tlb_fn)
6398 const unsigned rd = simd_data(desc);
6399 const intptr_t reg_max = simd_oprsz(desc);
6400 intptr_t reg_off, reg_last, mem_off;
6401 SVEContLdSt info;
6402 void *host;
6403 int i, flags;
6405 /* Find the active elements. */
6406 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6407 /* The entire predicate was false; no store occurs. */
6408 return;
6411 /* Probe the page(s). Exit with exception for any invalid page. */
6412 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6414 /* Handle watchpoints for all active elements. */
6415 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6416 BP_MEM_WRITE, retaddr);
6419 * Handle mte checks for all active elements.
6420 * Since TBI must be set for MTE, !mtedesc => !mte_active.
6422 if (mtedesc) {
6423 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6424 mtedesc, retaddr);
6427 flags = info.page[0].flags | info.page[1].flags;
6428 if (unlikely(flags != 0)) {
6429 #ifdef CONFIG_USER_ONLY
6430 g_assert_not_reached();
6431 #else
6433 * At least one page includes MMIO.
6434 * Any bus operation can fail with cpu_transaction_failed,
6435 * which for ARM will raise SyncExternal. We cannot avoid
6436 * this fault and will leave with the store incomplete.
6438 mem_off = info.mem_off_first[0];
6439 reg_off = info.reg_off_first[0];
6440 reg_last = info.reg_off_last[1];
6441 if (reg_last < 0) {
6442 reg_last = info.reg_off_split;
6443 if (reg_last < 0) {
6444 reg_last = info.reg_off_last[0];
6448 do {
6449 uint64_t pg = vg[reg_off >> 6];
6450 do {
6451 if ((pg >> (reg_off & 63)) & 1) {
6452 for (i = 0; i < N; ++i) {
6453 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6454 addr + mem_off + (i << msz), retaddr);
6457 reg_off += 1 << esz;
6458 mem_off += N << msz;
6459 } while (reg_off & 63);
6460 } while (reg_off <= reg_last);
6461 return;
6462 #endif
6465 mem_off = info.mem_off_first[0];
6466 reg_off = info.reg_off_first[0];
6467 reg_last = info.reg_off_last[0];
6468 host = info.page[0].host;
6470 while (reg_off <= reg_last) {
6471 uint64_t pg = vg[reg_off >> 6];
6472 do {
6473 if ((pg >> (reg_off & 63)) & 1) {
6474 for (i = 0; i < N; ++i) {
6475 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6476 host + mem_off + (i << msz));
6479 reg_off += 1 << esz;
6480 mem_off += N << msz;
6481 } while (reg_off <= reg_last && (reg_off & 63));
6485 * Use the slow path to manage the cross-page misalignment.
6486 * But we know this is RAM and cannot trap.
6488 mem_off = info.mem_off_split;
6489 if (unlikely(mem_off >= 0)) {
6490 reg_off = info.reg_off_split;
6491 for (i = 0; i < N; ++i) {
6492 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6493 addr + mem_off + (i << msz), retaddr);
6497 mem_off = info.mem_off_first[1];
6498 if (unlikely(mem_off >= 0)) {
6499 reg_off = info.reg_off_first[1];
6500 reg_last = info.reg_off_last[1];
6501 host = info.page[1].host;
6503 do {
6504 uint64_t pg = vg[reg_off >> 6];
6505 do {
6506 if ((pg >> (reg_off & 63)) & 1) {
6507 for (i = 0; i < N; ++i) {
6508 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6509 host + mem_off + (i << msz));
6512 reg_off += 1 << esz;
6513 mem_off += N << msz;
6514 } while (reg_off & 63);
6515 } while (reg_off <= reg_last);
6519 static inline QEMU_ALWAYS_INLINE
6520 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6521 uint32_t desc, const uintptr_t ra,
6522 const int esz, const int msz, const int N,
6523 sve_ldst1_host_fn *host_fn,
6524 sve_ldst1_tlb_fn *tlb_fn)
6526 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6527 int bit55 = extract64(addr, 55, 1);
6529 /* Remove mtedesc from the normal sve descriptor. */
6530 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6532 /* Perform gross MTE suppression early. */
6533 if (!tbi_check(desc, bit55) ||
6534 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6535 mtedesc = 0;
6538 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6541 #define DO_STN_1(N, NAME, ESZ) \
6542 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
6543 target_ulong addr, uint32_t desc) \
6545 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
6546 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6548 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
6549 target_ulong addr, uint32_t desc) \
6551 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
6552 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6555 #define DO_STN_2(N, NAME, ESZ, MSZ) \
6556 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
6557 target_ulong addr, uint32_t desc) \
6559 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6560 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6562 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
6563 target_ulong addr, uint32_t desc) \
6565 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6566 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6568 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6569 target_ulong addr, uint32_t desc) \
6571 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6572 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6574 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6575 target_ulong addr, uint32_t desc) \
6577 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6578 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6581 DO_STN_1(1, bb, MO_8)
6582 DO_STN_1(1, bh, MO_16)
6583 DO_STN_1(1, bs, MO_32)
6584 DO_STN_1(1, bd, MO_64)
6585 DO_STN_1(2, bb, MO_8)
6586 DO_STN_1(3, bb, MO_8)
6587 DO_STN_1(4, bb, MO_8)
6589 DO_STN_2(1, hh, MO_16, MO_16)
6590 DO_STN_2(1, hs, MO_32, MO_16)
6591 DO_STN_2(1, hd, MO_64, MO_16)
6592 DO_STN_2(2, hh, MO_16, MO_16)
6593 DO_STN_2(3, hh, MO_16, MO_16)
6594 DO_STN_2(4, hh, MO_16, MO_16)
6596 DO_STN_2(1, ss, MO_32, MO_32)
6597 DO_STN_2(1, sd, MO_64, MO_32)
6598 DO_STN_2(2, ss, MO_32, MO_32)
6599 DO_STN_2(3, ss, MO_32, MO_32)
6600 DO_STN_2(4, ss, MO_32, MO_32)
6602 DO_STN_2(1, dd, MO_64, MO_64)
6603 DO_STN_2(2, dd, MO_64, MO_64)
6604 DO_STN_2(3, dd, MO_64, MO_64)
6605 DO_STN_2(4, dd, MO_64, MO_64)
6607 #undef DO_STN_1
6608 #undef DO_STN_2
6611 * Loads with a vector index.
6615 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6617 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6619 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6621 return *(uint32_t *)(reg + H1_4(reg_ofs));
6624 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6626 return *(int32_t *)(reg + H1_4(reg_ofs));
6629 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6631 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6634 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6636 return (int32_t)*(uint64_t *)(reg + reg_ofs);
6639 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6641 return *(uint64_t *)(reg + reg_ofs);
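/*
 * The off_* helpers above decode the index vector: "zsu"/"zss" read each
 * index as an unsigned/signed 32-bit offset, "zd" as a full 64-bit
 * offset, while the "_s"/"_d" suffix matches the 32- or 64-bit element
 * size of the data vector being accessed.
 */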
6644 static inline QEMU_ALWAYS_INLINE
6645 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6646 target_ulong base, uint32_t desc, uintptr_t retaddr,
6647 uint32_t mtedesc, int esize, int msize,
6648 zreg_off_fn *off_fn,
6649 sve_ldst1_host_fn *host_fn,
6650 sve_ldst1_tlb_fn *tlb_fn)
6652 const int mmu_idx = cpu_mmu_index(env, false);
6653 const intptr_t reg_max = simd_oprsz(desc);
6654 const int scale = simd_data(desc);
6655 ARMVectorReg scratch;
6656 intptr_t reg_off;
6657 SVEHostPage info, info2;
6659 memset(&scratch, 0, reg_max);
6660 reg_off = 0;
6661 do {
6662 uint64_t pg = vg[reg_off >> 6];
6663 do {
6664 if (likely(pg & 1)) {
6665 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6666 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6668 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6669 mmu_idx, retaddr);
6671 if (likely(in_page >= msize)) {
6672 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6673 cpu_check_watchpoint(env_cpu(env), addr, msize,
6674 info.attrs, BP_MEM_READ, retaddr);
6676 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
6677 mte_check(env, mtedesc, addr, retaddr);
6679 host_fn(&scratch, reg_off, info.host);
6680 } else {
6681 /* Element crosses the page boundary. */
6682 sve_probe_page(&info2, false, env, addr + in_page, 0,
6683 MMU_DATA_LOAD, mmu_idx, retaddr);
6684 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6685 cpu_check_watchpoint(env_cpu(env), addr,
6686 msize, info.attrs,
6687 BP_MEM_READ, retaddr);
6689 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
6690 mte_check(env, mtedesc, addr, retaddr);
6692 tlb_fn(env, &scratch, reg_off, addr, retaddr);
6695 reg_off += esize;
6696 pg >>= esize;
6697 } while (reg_off & 63);
6698 } while (reg_off < reg_max);
6700 /* Wait until all exceptions have been raised to write back. */
6701 memcpy(vd, &scratch, reg_max);
6704 static inline QEMU_ALWAYS_INLINE
6705 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6706 target_ulong base, uint32_t desc, uintptr_t retaddr,
6707 int esize, int msize, zreg_off_fn *off_fn,
6708 sve_ldst1_host_fn *host_fn,
6709 sve_ldst1_tlb_fn *tlb_fn)
6711 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6712 /* Remove mtedesc from the normal sve descriptor. */
6713 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6716 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6717 * offset base entirely over the address space hole to change the
6718 * pointer tag, or change the bit55 selector. So we could here
6719 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6721 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6722 esize, msize, off_fn, host_fn, tlb_fn);
6725 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6726 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6727 void *vm, target_ulong base, uint32_t desc) \
6729 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
6730 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6732 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6733 void *vm, target_ulong base, uint32_t desc) \
6735 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
6736 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6739 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6740 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6741 void *vm, target_ulong base, uint32_t desc) \
6743 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
6744 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6746 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6747 void *vm, target_ulong base, uint32_t desc) \
6749 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
6750 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6753 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6754 DO_LD1_ZPZ_S(bsu, zss, MO_8)
6755 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6756 DO_LD1_ZPZ_D(bdu, zss, MO_8)
6757 DO_LD1_ZPZ_D(bdu, zd, MO_8)
6759 DO_LD1_ZPZ_S(bss, zsu, MO_8)
6760 DO_LD1_ZPZ_S(bss, zss, MO_8)
6761 DO_LD1_ZPZ_D(bds, zsu, MO_8)
6762 DO_LD1_ZPZ_D(bds, zss, MO_8)
6763 DO_LD1_ZPZ_D(bds, zd, MO_8)
6765 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6766 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6767 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6768 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6769 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6771 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6772 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6773 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6774 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6775 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6777 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6778 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6779 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6780 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6781 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6783 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6784 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6785 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6786 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6787 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6789 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6790 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6791 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6792 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6793 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6795 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6796 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6797 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6798 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6799 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6801 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6802 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6803 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6805 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6806 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6807 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6809 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6810 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6811 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6813 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6814 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6815 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6817 #undef DO_LD1_ZPZ_S
6818 #undef DO_LD1_ZPZ_D
6820 /* First fault loads with a vector index. */
6823 * Common helpers for all gather first-faulting loads.
6826 static inline QEMU_ALWAYS_INLINE
6827 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6828 target_ulong base, uint32_t desc, uintptr_t retaddr,
6829 uint32_t mtedesc, const int esz, const int msz,
6830 zreg_off_fn *off_fn,
6831 sve_ldst1_host_fn *host_fn,
6832 sve_ldst1_tlb_fn *tlb_fn)
6834 const int mmu_idx = cpu_mmu_index(env, false);
6835 const intptr_t reg_max = simd_oprsz(desc);
6836 const int scale = simd_data(desc);
6837 const int esize = 1 << esz;
6838 const int msize = 1 << msz;
6839 intptr_t reg_off;
6840 SVEHostPage info;
6841 target_ulong addr, in_page;
6843 /* Skip to the first true predicate. */
6844 reg_off = find_next_active(vg, 0, reg_max, esz);
6845 if (unlikely(reg_off >= reg_max)) {
6846 /* The entire predicate was false; no load occurs. */
6847 memset(vd, 0, reg_max);
6848 return;
6852 * Probe the first element, allowing faults.
6854 addr = base + (off_fn(vm, reg_off) << scale);
6855 if (mtedesc) {
6856 mte_check(env, mtedesc, addr, retaddr);
6858 tlb_fn(env, vd, reg_off, addr, retaddr);
6860 /* After any fault, zero the other elements. */
6861 swap_memzero(vd, reg_off);
6862 reg_off += esize;
6863 swap_memzero(vd + reg_off, reg_max - reg_off);
6866 * Probe the remaining elements, not allowing faults.
6868 while (reg_off < reg_max) {
6869 uint64_t pg = vg[reg_off >> 6];
6870 do {
6871 if (likely((pg >> (reg_off & 63)) & 1)) {
6872 addr = base + (off_fn(vm, reg_off) << scale);
6873 in_page = -(addr | TARGET_PAGE_MASK);
6875 if (unlikely(in_page < msize)) {
6876 /* Stop if the element crosses a page boundary. */
6877 goto fault;
6880 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6881 mmu_idx, retaddr);
6882 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6883 goto fault;
6885 if (unlikely(info.flags & TLB_WATCHPOINT) &&
6886 (cpu_watchpoint_address_matches
6887 (env_cpu(env), addr, msize) & BP_MEM_READ)) {
6888 goto fault;
6890 if (mtedesc &&
6891 arm_tlb_mte_tagged(&info.attrs) &&
6892 !mte_probe(env, mtedesc, addr)) {
6893 goto fault;
6896 host_fn(vd, reg_off, info.host);
6898 reg_off += esize;
6899 } while (reg_off & 63);
6901 return;
6903 fault:
6904 record_fault(env, reg_off, reg_max);
6907 static inline QEMU_ALWAYS_INLINE
6908 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6909 target_ulong base, uint32_t desc, uintptr_t retaddr,
6910 const int esz, const int msz,
6911 zreg_off_fn *off_fn,
6912 sve_ldst1_host_fn *host_fn,
6913 sve_ldst1_tlb_fn *tlb_fn)
6915 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6916 /* Remove mtedesc from the normal sve descriptor. */
6917 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6920 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6921 * offset base entirely over the address space hole to change the
6922 * pointer tag, or change the bit55 selector. So we could here
6923 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6925 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6926 esz, msz, off_fn, host_fn, tlb_fn);
6929 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
6930 void HELPER(sve_ldff##MEM##_##OFS) \
6931 (CPUARMState *env, void *vd, void *vg, \
6932 void *vm, target_ulong base, uint32_t desc) \
6934 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
6935 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6937 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6938 (CPUARMState *env, void *vd, void *vg, \
6939 void *vm, target_ulong base, uint32_t desc) \
6941 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
6942 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6945 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
6946 void HELPER(sve_ldff##MEM##_##OFS) \
6947 (CPUARMState *env, void *vd, void *vg, \
6948 void *vm, target_ulong base, uint32_t desc) \
6950 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
6951 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6953 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6954 (CPUARMState *env, void *vd, void *vg, \
6955 void *vm, target_ulong base, uint32_t desc) \
6957 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
6958 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6961 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
6962 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
6963 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
6964 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
6965 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
6967 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
6968 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
6969 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
6970 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
6971 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
6973 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
6974 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
6975 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
6976 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
6977 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
6979 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
6980 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
6981 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
6982 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
6983 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
6985 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
6986 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
6987 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
6988 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
6989 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
6991 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
6992 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
6993 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
6994 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
6995 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
6997 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
6998 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
6999 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
7000 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
7001 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
7003 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
7004 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
7005 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
7006 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
7007 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
7009 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
7010 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
7011 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
7013 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
7014 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
7015 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
7017 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
7018 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
7019 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
7021 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
7022 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
7023 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
7025 /* Stores with a vector index. */
7027 static inline QEMU_ALWAYS_INLINE
7028 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7029 target_ulong base, uint32_t desc, uintptr_t retaddr,
7030 uint32_t mtedesc, int esize, int msize,
7031 zreg_off_fn *off_fn,
7032 sve_ldst1_host_fn *host_fn,
7033 sve_ldst1_tlb_fn *tlb_fn)
7035 const int mmu_idx = cpu_mmu_index(env, false);
7036 const intptr_t reg_max = simd_oprsz(desc);
7037 const int scale = simd_data(desc);
7038 void *host[ARM_MAX_VQ * 4];
7039 intptr_t reg_off, i;
7040 SVEHostPage info, info2;
7043 * Probe all of the elements for host addresses and flags.
7045 i = reg_off = 0;
7046 do {
7047 uint64_t pg = vg[reg_off >> 6];
7048 do {
7049 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7050 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
7052 host[i] = NULL;
7053 if (likely((pg >> (reg_off & 63)) & 1)) {
7054 if (likely(in_page >= msize)) {
7055 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
7056 mmu_idx, retaddr);
7057 host[i] = info.host;
7058 } else {
7060 * Element crosses the page boundary.
7061 * Probe both pages, but do not record the host address,
7062 * so that we use the slow path.
7064 sve_probe_page(&info, false, env, addr, 0,
7065 MMU_DATA_STORE, mmu_idx, retaddr);
7066 sve_probe_page(&info2, false, env, addr + in_page, 0,
7067 MMU_DATA_STORE, mmu_idx, retaddr);
7068 info.flags |= info2.flags;
7071 if (unlikely(info.flags & TLB_WATCHPOINT)) {
7072 cpu_check_watchpoint(env_cpu(env), addr, msize,
7073 info.attrs, BP_MEM_WRITE, retaddr);
7076 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
7077 mte_check(env, mtedesc, addr, retaddr);
7080 i += 1;
7081 reg_off += esize;
7082 } while (reg_off & 63);
7083 } while (reg_off < reg_max);
7086 * Now that we have recognized all exceptions except SyncExternal
7087 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
7089 * Note for the common case of an element in RAM, not crossing a page
7090 * boundary, we have stored the host address in host[]. This doubles
7091 * as a first-level check against the predicate, since only enabled
7092 * elements have non-null host addresses.
7094 i = reg_off = 0;
7095 do {
7096 void *h = host[i];
7097 if (likely(h != NULL)) {
7098 host_fn(vd, reg_off, h);
7099 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
7100 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7101 tlb_fn(env, vd, reg_off, addr, retaddr);
7103 i += 1;
7104 reg_off += esize;
7105 } while (reg_off < reg_max);
static inline QEMU_ALWAYS_INLINE
void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                   target_ulong base, uint32_t desc, uintptr_t retaddr,
                   int esize, int msize, zreg_off_fn *off_fn,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
     * offset base entirely over the address space hole to change the
     * pointer tag, or change the bit55 selector.  So we could here
     * examine TBI + TCMA like we do for sve_ldN_r_mte().
     */
    sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
              esize, msize, off_fn, host_fn, tlb_fn);
}

#define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
              off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
                  off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}

#define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
              off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
                  off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}

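/*
 * For orientation only: each DO_ST1_ZPZ_* invocation below expands to a
 * pair of helpers, one without and one with an MTE descriptor.  For
 * example, DO_ST1_ZPZ_D(dd_le, zd, MO_64) yields sve_stdd_le_zd and
 * sve_stdd_le_zd_mte, storing little-endian doublewords (msize == 8)
 * from doubleword elements (esize == 8) addressed via 64-bit offsets.
 */
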
DO_ST1_ZPZ_S(bs, zsu, MO_8)
DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
DO_ST1_ZPZ_S(ss_be, zsu, MO_32)

DO_ST1_ZPZ_S(bs, zss, MO_8)
DO_ST1_ZPZ_S(hs_le, zss, MO_16)
DO_ST1_ZPZ_S(hs_be, zss, MO_16)
DO_ST1_ZPZ_S(ss_le, zss, MO_32)
DO_ST1_ZPZ_S(ss_be, zss, MO_32)

DO_ST1_ZPZ_D(bd, zsu, MO_8)
DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
DO_ST1_ZPZ_D(dd_be, zsu, MO_64)

DO_ST1_ZPZ_D(bd, zss, MO_8)
DO_ST1_ZPZ_D(hd_le, zss, MO_16)
DO_ST1_ZPZ_D(hd_be, zss, MO_16)
DO_ST1_ZPZ_D(sd_le, zss, MO_32)
DO_ST1_ZPZ_D(sd_be, zss, MO_32)
DO_ST1_ZPZ_D(dd_le, zss, MO_64)
DO_ST1_ZPZ_D(dd_be, zss, MO_64)

DO_ST1_ZPZ_D(bd, zd, MO_8)
DO_ST1_ZPZ_D(hd_le, zd, MO_16)
DO_ST1_ZPZ_D(hd_be, zd, MO_16)
DO_ST1_ZPZ_D(sd_le, zd, MO_32)
DO_ST1_ZPZ_D(sd_be, zd, MO_32)
DO_ST1_ZPZ_D(dd_le, zd, MO_64)
DO_ST1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_ST1_ZPZ_S
#undef DO_ST1_ZPZ_D

void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = n[i] ^ m[i] ^ k[i];
    }
}

void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = n[i] ^ (m[i] & ~k[i]);
    }
}

void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
    }
}

void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
    }
}

void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
    }
}

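/*
 * Informal summary of the bitwise helpers above, per result bit:
 * EOR3 is a three-way XOR; BCAX is "bit clear and XOR", n ^ (m AND NOT k);
 * BSL1N and BSL2N select from N (where K is set) versus M (where K is
 * clear) with the first or second source inverted; NBSL is the inverted
 * result of that plain select.  Because the operations are purely
 * bitwise, all element sizes share the same 64-bit loops.
 */
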
/*
 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
 * See hasless(v,1) from
 *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    int bits = 8 << esz;
    uint64_t ones = dup_const(esz, 1);
    uint64_t signs = ones << (bits - 1);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(esz, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;
    cmp0 = (cmp0 - ones) & ~cmp0;
    cmp1 = (cmp1 - ones) & ~cmp1;
    return (cmp0 | cmp1) & signs;
}

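/*
 * As a rough worked example with byte elements: a lane of m0 that equals
 * the target byte XORs to 0x00, and (0x00 - 0x01) & ~0x00 = 0xff has the
 * sign bit set; a non-matching lane such as 0x13 gives
 * (0x13 - 0x01) & ~0x13 = 0x00, sign bit clear.  Borrows between lanes
 * only arise when some lane is already zero, so they cannot change the
 * boolean result.
 */
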
static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
                                uint32_t desc, int esz, bool nmatch)
{
    uint16_t esz_mask = pred_esz_masks[esz];
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t flags = PREDTEST_INIT;
    intptr_t i, j, k;

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
        uint16_t out = 0;

        for (j = 0; j < 16; j += 8) {
            uint64_t n = *(uint64_t *)(vn + i + j);

            for (k = 0; k < 8; k += 1 << esz) {
                if (pg & (1 << (j + k))) {
                    bool o = do_match2(n >> (k * 8), m0, m1, esz);
                    out |= (o ^ nmatch) << (j + k);
                }
            }
        }
        *(uint16_t *)(vd + H1_2(i >> 3)) = out;
        flags = iter_predtest_fwd(out, pg, flags);
    }
    return flags;
}

#define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                             \
    return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
}

DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)

DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)

#undef DO_PPZZ_MATCH

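/*
 * HISTCNT, sketched informally: for each active element of Zn, count the
 * active elements of Zm at the same or a lower index that hold an equal
 * value; inactive destination elements are written as zero.  The two
 * helpers below implement this with a simple quadratic scan.
 */
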
void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz; i += 4) {
        uint64_t count = 0;
        uint8_t pred;

        pred = pg[H1(i >> 3)] >> (i & 7);
        if (pred & 1) {
            uint32_t nn = n[H4(i >> 2)];

            for (j = 0; j <= i; j += 4) {
                pred = pg[H1(j >> 3)] >> (j & 7);
                if ((pred & 1) && nn == m[H4(j >> 2)]) {
                    ++count;
                }
            }
        }
        d[H4(i >> 2)] = count;
    }
}

void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t count = 0;
        if (pg[H1(i)] & 1) {
            uint64_t nn = n[i];
            for (j = 0; j <= i; ++j) {
                if ((pg[H1(j)] & 1) && nn == m[j]) {
                    ++count;
                }
            }
        }
        d[i] = count;
    }
}

/*
 * Returns the number of bytes in m0 and m1 that match n.
 * Unlike do_match2 we don't just need true/false, we need an exact count.
 * This requires two extra logical operations.
 */
static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
{
    const uint64_t mask = dup_const(MO_8, 0x7f);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(MO_8, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;

    /*
     * 1: clear msb of each byte to avoid carry to next byte (& mask)
     * 2: carry in to msb if byte != 0 (+ mask)
     * 3: set msb if cmp has msb set (| cmp)
     * 4: set ~msb to ignore them (| mask)
     * We now have 0xff for byte != 0 or 0x7f for byte == 0.
     * 5: invert, resulting in 0x80 if and only if byte == 0.
     */
    cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
    cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);

    /*
     * Combine the two compares in a way that the bits do
     * not overlap, and so preserves the count of set bits.
     * If the host has an efficient instruction for ctpop,
     * then ctpop(x) + ctpop(y) has the same number of
     * operations as ctpop(x | (y >> 1)).  If the host does
     * not have an efficient ctpop, then we only want to
     * use it once.
     */
    return ctpop64(cmp0 | (cmp1 >> 1));
}

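/*
 * As a rough worked example: at this point each byte of cmp0 and cmp1 is
 * either 0x80 (that byte matched n) or 0x00.  Shifting cmp1 right by one
 * moves its markers to bit 6 of each byte, so cmp0 | (cmp1 >> 1) keeps
 * the two sets of markers in distinct bit positions and a single
 * ctpop64() returns the combined match count (at most 16).
 */
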
void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t n0 = *(uint64_t *)(vn + i);
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t n1 = *(uint64_t *)(vn + i + 8);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint64_t out0 = 0;
        uint64_t out1 = 0;

        for (j = 0; j < 64; j += 8) {
            uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
            uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
            out0 |= cnt0 << j;
            out1 |= cnt1 << j;
        }

        *(uint64_t *)(vd + i) = out0;
        *(uint64_t *)(vd + i + 8) = out1;
    }
}

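/*
 * XAR, per element: XOR the two sources, then rotate the result right by
 * the immediate carried in simd_data(desc).  The byte and halfword forms
 * below rotate within 64-bit lanes by masking, while the word form can
 * use ror32() directly.
 */
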
void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 8 - shr;
    uint64_t mask = dup_const(MO_8, 0xff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 16 - shr;
    uint64_t mask = dup_const(MO_16, 0xffff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    int shr = simd_data(desc);
    uint32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ror32(n[i] ^ m[i], shr);
    }
}

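/*
 * FMMLA, sketched per 128-bit (single precision) or 256-bit (double
 * precision) segment: viewing the four elements of each operand as a
 * row-major 2x2 matrix, the result is D = A + N * M^T, i.e. the eight
 * multiplies and eight additions spelled out below.
 */
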
void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float32 *n = vn + s * sizeof(float32) * 4;
        float32 *m = vm + s * sizeof(float32) * 4;
        float32 *a = va + s * sizeof(float32) * 4;
        float32 *d = vd + s * sizeof(float32) * 4;
        float32 n00 = n[H4(0)], n01 = n[H4(1)];
        float32 n10 = n[H4(2)], n11 = n[H4(3)];
        float32 m00 = m[H4(0)], m01 = m[H4(1)];
        float32 m10 = m[H4(2)], m11 = m[H4(3)];
        float32 p0, p1;

        /* i = 0, j = 0 */
        p0 = float32_mul(n00, m00, status);
        p1 = float32_mul(n01, m01, status);
        d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float32_mul(n00, m10, status);
        p1 = float32_mul(n01, m11, status);
        d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float32_mul(n10, m00, status);
        p1 = float32_mul(n11, m01, status);
        d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float32_mul(n10, m10, status);
        p1 = float32_mul(n11, m11, status);
        d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
    }
}

void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float64 *n = vn + s * sizeof(float64) * 4;
        float64 *m = vm + s * sizeof(float64) * 4;
        float64 *a = va + s * sizeof(float64) * 4;
        float64 *d = vd + s * sizeof(float64) * 4;
        float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
        float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
        float64 p0, p1;

        /* i = 0, j = 0 */
        p0 = float64_mul(n00, m00, status);
        p1 = float64_mul(n01, m01, status);
        d[0] = float64_add(a[0], float64_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float64_mul(n00, m10, status);
        p1 = float64_mul(n01, m11, status);
        d[1] = float64_add(a[1], float64_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float64_mul(n10, m00, status);
        p1 = float64_mul(n11, m01, status);
        d[2] = float64_add(a[2], float64_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float64_mul(n10, m10, status);
        p1 = float64_mul(n11, m11, status);
        d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
    }
}