/*
 * Copyright (c) 2018 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
#include "qemu/osdep.h"
#include "internals.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that need a host-endian fixup.  */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x)   ((x) ^ 7)
#define H1_2(x) ((x) ^ 6)
#define H1_4(x) ((x) ^ 4)
#define H2(x)   ((x) ^ 3)
#define H4(x)   ((x) ^ 1)
#else
#define H1(x)   (x)
#define H1_2(x) (x)
#define H1_4(x) (x)
#define H2(x)   (x)
#define H4(x)   (x)
#endif
/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set.  Compare the definitions of these variables
 * within CPUARMState.
 */

/* For no G bits set, NZCV = C.  */
#define PREDTEST_INIT  1
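
/*
 * Worked example of the flag encoding (illustrative note, not part of the
 * original source): for a single predicate word with G = 0b0110 and
 * D = 0b0100, the first active element (bit 1 of G) is clear in D, so N
 * stays 0; at least one active element is set in D, so Z is clear and
 * bit 1 of the return value is set; the last active element (bit 2 of G)
 * is set in D, so C is 0.  The forward iterator below therefore returns
 * 0x2 for this input.
 */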
/* This is an iterative function, called for each Pd and Pg word
 * moving forward.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from last !(D & G).  Replace previous.  */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}
/* This is an iterative function, called for each Pd and Pg word
 * moving backward.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from first (i.e last) !(D & G).
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from last (i.e first) D & G.  Replace previous.  */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}
/* The same for a single word predicate.  */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}
/* The same for a multi-word predicate.  */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    intptr_t i = 0;

    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}
/* Expand active predicate bits to bytes, for byte elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
static inline uint64_t expand_pred_b(uint8_t byte)
{
    static const uint64_t word[256] = {
        0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
        0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
        0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
        0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
        0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
        0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
        0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
        0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
        0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
        0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
        0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
        0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
        0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
        0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
        0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
        0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
        0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
        0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
        0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
        0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
        0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
        0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
        0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
        0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
        0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
        0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
        0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
        0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
        0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
        0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
        0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
        0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
        0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
        0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
        0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
        0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
        0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
        0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
        0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
        0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
        0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
        0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
        0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
        0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
        0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
        0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
        0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
        0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
        0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
        0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
        0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
        0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
        0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
        0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
        0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
        0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
        0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
        0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
        0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
        0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
        0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
        0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
        0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
        0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
        0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
        0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
        0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
        0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
        0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
        0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
        0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
        0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
        0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
        0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
        0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
        0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
        0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
        0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
        0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
        0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
        0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
        0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
        0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
        0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
        0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
        0xffffffffffffffff,
    };
    return word[byte];
}
/* Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
static inline uint64_t expand_pred_h(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
        [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
        [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
        [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
        [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
        [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
        [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
        [0x55] = 0xffffffffffffffff,
    };
    return word[byte & 0x55];
}
/* Similarly for single word elements.  */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x00000000ffffffffull,
        [0x10] = 0xffffffff00000000ull,
        [0x11] = 0xffffffffffffffffull,
    };
    return word[byte & 0x11];
}
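
/*
 * Usage sketch (illustrative note, not part of the original file): one
 * predicate byte covers 8 bytes of vector data, one predicate bit per
 * byte.  For example expand_pred_b(0x05) == 0x0000000000ff00ff, which
 * selects vector bytes 0 and 2 of the corresponding 64-bit chunk; ANDing
 * a data word with that mask keeps the active byte elements and zeroes
 * the rest.  For half-word and word elements only every second or fourth
 * predicate bit is significant, hence the "& 0x55" and "& 0x11" above.
 */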
/* Swap 16-bit words within a 32-bit word.  */
static inline uint32_t hswap32(uint32_t h)
{
    return rol32(h, 16);
}

/* Swap 16-bit words within a 64-bit word.  */
static inline uint64_t hswap64(uint64_t h)
{
    uint64_t m = 0x0000ffff0000ffffull;
    h = rol64(h, 32);
    return ((h & m) << 16) | ((h >> 16) & m);
}

/* Swap 32-bit words within a 64-bit word.  */
static inline uint64_t wswap64(uint64_t h)
{
    return rol64(h, 32);
}
#define LOGICAL_PPPP(NAME, FUNC) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    uintptr_t opr_sz = simd_oprsz(desc);                                 \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                         \
    uintptr_t i;                                                         \
    for (i = 0; i < opr_sz / 8; ++i) {                                   \
        d[i] = FUNC(n[i], m[i], g[i]);                                   \
    }                                                                    \
}

#define DO_AND(N, M, G)  (((N) & (M)) & (G))
#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP
/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 */

/* ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    intptr_t i, opr_sz = simd_oprsz(desc);                               \
    for (i = 0; i < opr_sz; ) {                                          \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                  \
        do {                                                             \
            if (pg & 1) {                                                \
                TYPE nn = *(TYPE *)(vn + H(i));                          \
                TYPE mm = *(TYPE *)(vm + H(i));                          \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                       \
            }                                                            \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                      \
        } while (i & 15);                                                \
    }                                                                    \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                           \
    TYPE *d = vd, *n = vn, *m = vm;                                      \
    uint8_t *pg = vg;                                                    \
    for (i = 0; i < opr_sz; i += 1) {                                    \
        if (pg[H1(i)] & 1) {                                             \
            TYPE nn = n[i], mm = m[i];                                   \
            d[i] = OP(nn, mm);                                           \
        }                                                                \
    }                                                                    \
}
#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)

/*
 * We must avoid the C undefined behaviour cases: division by
 * zero and signed division of INT_MIN by -1. Both of these
 * have architecturally defined required results for Arm.
 * We special case all signed divisions by -1 to avoid having
 * to deduce the minimum integer for the type involved.
 */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
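
/*
 * Worked examples for the division macros above (illustrative, not part
 * of the original file): DO_SDIV(5, 0) == 0 and DO_UDIV(5, 0) == 0,
 * matching the Arm-defined result for division by zero.  DO_SDIV(n, -1)
 * is computed as -n rather than n / -1, so the INT_MIN / -1 case never
 * reaches the C division operator; the architected result is the
 * wrapped negation of INT_MIN.
 */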
DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
/* Because the computation type is at least twice as large as required,
   these work for both signed and unsigned source types.  */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    return (n * m) >> 8;
}

static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    return (n * m) >> 16;
}

static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    return (n * m) >> 32;
}

static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}
DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
/* Note that all bits of the shift are significant
   and not modulo the element size.  */
#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
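
/*
 * Illustrative behaviour of the shift macros (not from the original
 * source): a shift count of at least the element width would be
 * undefined in C, so DO_LSR and DO_LSL return 0 for it, while DO_ASR
 * clamps the count to width-1 so that the result is the sign bit
 * replicated across the element.  E.g. for uint8_t, DO_LSR(0x80, 9)
 * == 0; for int8_t, DO_ASR(-128, 9) == -1.
 */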
DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)

DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
/* Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide".  That is, for D = N op M, the same 64-bit
 * value of M is used with all of the narrower values of N.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    intptr_t i, opr_sz = simd_oprsz(desc);                               \
    for (i = 0; i < opr_sz; ) {                                          \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                      \
        TYPEW mm = *(TYPEW *)(vm + i);                                   \
        do {                                                             \
            if (pg & 1) {                                                \
                TYPE nn = *(TYPE *)(vn + H(i));                          \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                       \
            }                                                            \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                      \
        } while (i & 7);                                                 \
    }                                                                    \
}
DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
/* Fully general two-operand expander, controlled by a predicate.
 */
#define DO_ZPZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    for (i = 0; i < opr_sz; ) {                                \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));        \
        do {                                                   \
            if (pg & 1) {                                      \
                TYPE nn = *(TYPE *)(vn + H(i));                \
                *(TYPE *)(vd + H(i)) = OP(nn);                 \
            }                                                  \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);            \
        } while (i & 15);                                      \
    }                                                          \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                 \
    TYPE *d = vd, *n = vn;                                     \
    uint8_t *pg = vg;                                          \
    for (i = 0; i < opr_sz; i += 1) {                          \
        if (pg[H1(i)] & 1) {                                   \
            TYPE nn = n[i];                                    \
            d[i] = OP(nn);                                     \
        }                                                      \
    }                                                          \
}
#define DO_CLS_B(N)   (clrsb32(N) - 24)
#define DO_CLS_H(N)   (clrsb32(N) - 16)

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

#define DO_CLZ_B(N)   (clz32(N) - 24)
#define DO_CLZ_H(N)   (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)

#define DO_CNOT(N)    (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

#define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

#define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

#define DO_NOT(N)    (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

#define DO_SXTB(N)    ((int8_t)N)
#define DO_SXTH(N)    ((int16_t)N)
#define DO_SXTS(N)    ((int32_t)N)
#define DO_UXTB(N)    ((uint8_t)N)
#define DO_UXTH(N)    ((uint16_t)N)
#define DO_UXTS(N)    ((uint32_t)N)

DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)

#define DO_ABS(N)    (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

#define DO_NEG(N)    (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)

DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)

DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)

DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
/* Three-operand expander, unpredicated, in which the third operand is "wide".
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    for (i = 0; i < opr_sz; ) {                                \
        TYPEW mm = *(TYPEW *)(vm + i);                         \
        do {                                                   \
            TYPE nn = *(TYPE *)(vn + H(i));                    \
            *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
            i += sizeof(TYPE);                                 \
        } while (i & 7);                                       \
    }                                                          \
}
DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
/* Two-operand reduction expander, controlled by a predicate.
 * The difference between TYPERED and TYPERET has to do with
 * sign-extension.  E.g. for SMAX, TYPERED must be signed,
 * but TYPERET must be unsigned so that e.g. a 32-bit value
 * is not sign-extended to the ABI uint64_t return type.
 */
/* ??? If we were to vectorize this by hand the reduction ordering
 * would change.  For integer operands, this is perfectly fine.
 */
#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
{                                                          \
    intptr_t i, opr_sz = simd_oprsz(desc);                 \
    TYPERED ret = INIT;                                    \
    for (i = 0; i < opr_sz; ) {                            \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
        do {                                               \
            if (pg & 1) {                                  \
                TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
                ret = OP(ret, nn);                         \
            }                                              \
            i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
        } while (i & 15);                                  \
    }                                                      \
    return (TYPERET)ret;                                   \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
{                                                          \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
    TYPEE *n = vn;                                         \
    uint8_t *pg = vg;                                      \
    TYPER ret = INIT;                                      \
    for (i = 0; i < opr_sz; i += 1) {                      \
        if (pg[H1(i)] & 1) {                               \
            TYPEE nn = n[i];                               \
            ret = OP(ret, nn);                             \
        }                                                  \
    }                                                      \
    return ret;                                            \
}
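
/*
 * Illustrative note on TYPERED vs TYPERET (not from the original file):
 * in sve_smaxv_s the reduction runs on int32_t so that DO_MAX compares
 * signed values, but the result is cast to uint32_t before being widened
 * to the uint64_t ABI return type.  A result of -1 thus comes back as
 * 0x00000000ffffffff instead of being sign-extended to
 * 0xffffffffffffffff.
 */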
DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)

DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)

DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)

DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)

DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)

DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)

DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)

DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)

DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
/* Two vector operand, one scalar operand, unpredicated.  */
#define DO_ZZI(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
{                                                                  \
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);          \
    TYPE s = s64, *d = vd, *n = vn;                                \
    for (i = 0; i < opr_sz; ++i) {                                 \
        d[i] = OP(n[i], s);                                        \
    }                                                              \
}

#define DO_SUBR(X, Y)   (Y - X)
DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)

DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)

DO_ZZI(sve_smini_b, int8_t, DO_MIN)
DO_ZZI(sve_smini_h, int16_t, DO_MIN)
DO_ZZI(sve_smini_s, int32_t, DO_MIN)
DO_ZZI(sve_smini_d, int64_t, DO_MIN)

DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)

DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
/* Similar to the ARM LastActiveElement pseudocode function, except the
   result is multiplied by the element size.  This includes the not found
   indication; e.g. not found for esz=3 is -8.  */
static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
{
    uint64_t mask = pred_esz_masks[esz];
    intptr_t i = words;

    do {
        uint64_t this_g = g[--i] & mask;
        if (this_g) {
            return i * 64 + (63 - clz64(this_g));
        }
    } while (i > 0);
    return (intptr_t)-1 << esz;
}
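
/*
 * Illustrative example (not from the original source): with esz == 2
 * (word elements) the active bits of a predicate sit at positions
 * 0, 4, 8, ...; if the last active element has element index 3 the
 * function returns 12 (3 << 2), and if no element is active it returns
 * -4.  Dividing by the element size recovers the plain
 * LastActiveElement result.
 */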
uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    intptr_t i = 0;

    do {
        uint64_t this_d = d[i];
        uint64_t this_g = g[i];

        if (this_g) {
            if (!(flags & 4)) {
                /* Set in D the first bit of G.  */
                this_d |= this_g & -this_g;
                d[i] = this_d;
            }
            flags = iter_predtest_fwd(this_d, this_g, flags);
        }
    } while (++i < words);

    return flags;
}
uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg, esz_mask;
    intptr_t i, next;

    next = last_active_element(vd, words, esz) + (1 << esz);
    esz_mask = pred_esz_masks[esz];

    /* Similar to the pseudocode for pnext, but scaled by ESZ
       so that we find the correct bit.  */
    if (next < words * 64) {
        uint64_t mask = -1;

        if (next & 63) {
            mask = ~((1ull << (next & 63)) - 1);
            next &= -64;
        }
        do {
            uint64_t this_g = g[next / 64] & esz_mask & mask;
            if (this_g != 0) {
                next = (next & -64) + ctz64(this_g);
                break;
            }
            next += 64;
            mask = -1;
        } while (next < words * 64);
    }

    i = 0;
    do {
        uint64_t this_d = 0;
        if (i == next / 64) {
            this_d = 1ull << (next & 63);
        }
        d[i] = this_d;
        flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
    } while (++i < words);

    return flags;
}
/* Store zero into every active element of Zd.  We will use this for two
 * and three-operand predicated instructions for which logic dictates a
 * zero result.  In particular, logical shift by element size, which is
 * otherwise undefined on the host.
 *
 * For element sizes smaller than uint64_t, we use tables to expand
 * the N bits of the controlling predicate to a byte mask, and clear
 * those bytes.
 */
void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] &= ~expand_pred_b(pg[H1(i)]);
    }
}

void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] &= ~expand_pred_h(pg[H1(i)]);
    }
}

void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] &= ~expand_pred_s(pg[H1(i)]);
    }
}

void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        if (pg[H1(i)] & 1) {
            d[i] = 0;
        }
    }
}
/* Copy Zn into Zd, and store zero into inactive elements.  */
void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & expand_pred_b(pg[H1(i)]);
    }
}

void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & expand_pred_h(pg[H1(i)]);
    }
}

void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & expand_pred_s(pg[H1(i)]);
    }
}

void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & -(uint64_t)(pg[H1(i)] & 1);
    }
}
/* Three-operand expander, immediate operand, controlled by a predicate.
 */
#define DO_ZPZI(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    TYPE imm = simd_data(desc);                                \
    for (i = 0; i < opr_sz; ) {                                \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));        \
        do {                                                   \
            if (pg & 1) {                                      \
                TYPE nn = *(TYPE *)(vn + H(i));                \
                *(TYPE *)(vd + H(i)) = OP(nn, imm);            \
            }                                                  \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);            \
        } while (i & 15);                                      \
    }                                                          \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZI_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                 \
    TYPE *d = vd, *n = vn;                                     \
    TYPE imm = simd_data(desc);                                \
    uint8_t *pg = vg;                                          \
    for (i = 0; i < opr_sz; i += 1) {                          \
        if (pg[H1(i)] & 1) {                                   \
            TYPE nn = n[i];                                    \
            d[i] = OP(nn, imm);                                \
        }                                                      \
    }                                                          \
}
#define DO_SHR(N, M)  (N >> M)
#define DO_SHL(N, M)  (N << M)

/* Arithmetic shift right for division.  This rounds negative numbers
   toward zero as per signed division.  Therefore before shifting,
   when N is negative, add 2**M-1.  */
#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
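
/*
 * Worked example (illustrative, not part of the original file):
 * DO_ASRD(-7, 2) first adds 2**2 - 1 = 3, giving -4, then shifts to
 * obtain -1, which is -7 / 4 rounded toward zero.  A plain arithmetic
 * shift would give -2 (rounding toward minus infinity).  Non-negative
 * operands are unaffected by the bias: DO_ASRD(7, 2) == 1.
 */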
DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)

DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)

DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)

DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
/* Fully general four-operand expander, controlled by a predicate.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
                  void *vg, uint32_t desc)                 \
{                                                          \
    intptr_t i, opr_sz = simd_oprsz(desc);                 \
    for (i = 0; i < opr_sz; ) {                            \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
        do {                                               \
            if (pg & 1) {                                  \
                TYPE nn = *(TYPE *)(vn + H(i));            \
                TYPE mm = *(TYPE *)(vm + H(i));            \
                TYPE aa = *(TYPE *)(va + H(i));            \
                *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);     \
            }                                              \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);        \
        } while (i & 15);                                  \
    }                                                      \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
                  void *vg, uint32_t desc)                 \
{                                                          \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
    TYPE *d = vd, *a = va, *n = vn, *m = vm;               \
    uint8_t *pg = vg;                                      \
    for (i = 0; i < opr_sz; i += 1) {                      \
        if (pg[H1(i)] & 1) {                               \
            TYPE aa = a[i], nn = n[i], mm = m[i];          \
            d[i] = OP(aa, nn, mm);                         \
        }                                                  \
    }                                                      \
}
#define DO_MLA(A, N, M)  (A + N * M)
#define DO_MLS(A, N, M)  (A - N * M)

DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)

DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)

DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)

DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
void HELPER(sve_index_b)(void *vd, uint32_t start,
                         uint32_t incr, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd;
    for (i = 0; i < opr_sz; i += 1) {
        d[H1(i)] = start + i * incr;
    }
}

void HELPER(sve_index_h)(void *vd, uint32_t start,
                         uint32_t incr, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd;
    for (i = 0; i < opr_sz; i += 1) {
        d[H2(i)] = start + i * incr;
    }
}

void HELPER(sve_index_s)(void *vd, uint32_t start,
                         uint32_t incr, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd;
    for (i = 0; i < opr_sz; i += 1) {
        d[H4(i)] = start + i * incr;
    }
}

void HELPER(sve_index_d)(void *vd, uint64_t start,
                         uint64_t incr, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = start + i * incr;
    }
}
void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t sh = simd_data(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + (m[i] << sh);
    }
}

void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t sh = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + (m[i] << sh);
    }
}

void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t sh = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
    }
}

void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t sh = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
    }
}
void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
{
    /* These constants are cut-and-paste directly from the ARM pseudocode. */
    static const uint16_t coeff[] = {
        0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
        0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
        0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
        0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint16_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 5);
        uint16_t exp = extract32(nn, 5, 5);
        d[i] = coeff[idx] | (exp << 10);
    }
}
void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
{
    /* These constants are cut-and-paste directly from the ARM pseudocode. */
    static const uint32_t coeff[] = {
        0x000000, 0x0164d2, 0x02cd87, 0x043a29,
        0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
        0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
        0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
        0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
        0x1ef532, 0x20b051, 0x227043, 0x243516,
        0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
        0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
        0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
        0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
        0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
        0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
        0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
        0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
        0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
        0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint32_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);
        uint32_t exp = extract32(nn, 6, 8);
        d[i] = coeff[idx] | (exp << 23);
    }
}
void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
{
    /* These constants are cut-and-paste directly from the ARM pseudocode. */
    static const uint64_t coeff[] = {
        0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
        0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
        0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
        0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
        0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
        0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
        0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
        0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
        0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
        0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
        0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
        0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
        0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
        0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
        0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
        0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
        0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
        0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
        0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
        0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
        0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
        0xFA7C1819E90D8ull,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint64_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);
        uint64_t exp = extract32(nn, 6, 11);
        d[i] = coeff[idx] | (exp << 52);
    }
}
void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        uint16_t nn = n[i];
        uint16_t mm = m[i];
        if (mm & 1) {
            nn = float16_one;
        }
        d[i] = nn ^ (mm & 2) << 14;
    }
}

void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        uint32_t nn = n[i];
        uint32_t mm = m[i];
        if (mm & 1) {
            nn = float32_one;
        }
        d[i] = nn ^ (mm & 2) << 30;
    }
}

void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t mm = m[i];
        if (mm & 1) {
            nn = float64_one;
        }
        d[i] = nn ^ (mm & 2) << 62;
    }
}
/*
 * Signed saturating addition with scalar operand.
 */

void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
        int r = *(int8_t *)(a + i) + b;
        if (r > INT8_MAX) {
            r = INT8_MAX;
        } else if (r < INT8_MIN) {
            r = INT8_MIN;
        }
        *(int8_t *)(d + i) = r;
    }
}

void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
        int r = *(int16_t *)(a + i) + b;
        if (r > INT16_MAX) {
            r = INT16_MAX;
        } else if (r < INT16_MIN) {
            r = INT16_MIN;
        }
        *(int16_t *)(d + i) = r;
    }
}

void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
        int64_t r = *(int32_t *)(a + i) + b;
        if (r > INT32_MAX) {
            r = INT32_MAX;
        } else if (r < INT32_MIN) {
            r = INT32_MIN;
        }
        *(int32_t *)(d + i) = r;
    }
}

void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
        int64_t ai = *(int64_t *)(a + i);
        int64_t r = ai + b;
        if (((r ^ ai) & ~(ai ^ b)) < 0) {
            /* Signed overflow.  */
            r = (r < 0 ? INT64_MAX : INT64_MIN);
        }
        *(int64_t *)(d + i) = r;
    }
}
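
/*
 * Note on the 64-bit overflow test above (illustrative, not from the
 * original source): (r ^ ai) has its sign bit set when the result and
 * the addend ai differ in sign, and ~(ai ^ b) has its sign bit set when
 * ai and b share a sign.  Signed addition can only overflow when both
 * operands share a sign and the result's sign differs, so the AND of
 * the two is negative exactly in the overflow case; r < 0 then tells
 * whether the true sum was positive (saturate to INT64_MAX) or negative
 * (saturate to INT64_MIN).
 */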
/*
 * Unsigned saturating addition with scalar operand.
 */

void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        int r = *(uint8_t *)(a + i) + b;
        if (r > UINT8_MAX) {
            r = UINT8_MAX;
        } else if (r < 0) {
            r = 0;
        }
        *(uint8_t *)(d + i) = r;
    }
}

void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
        int r = *(uint16_t *)(a + i) + b;
        if (r > UINT16_MAX) {
            r = UINT16_MAX;
        } else if (r < 0) {
            r = 0;
        }
        *(uint16_t *)(d + i) = r;
    }
}

void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
        int64_t r = *(uint32_t *)(a + i) + b;
        if (r > UINT32_MAX) {
            r = UINT32_MAX;
        } else if (r < 0) {
            r = 0;
        }
        *(uint32_t *)(d + i) = r;
    }
}

void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        uint64_t r = *(uint64_t *)(a + i) + b;
        if (r < b) {
            r = UINT64_MAX;
        }
        *(uint64_t *)(d + i) = r;
    }
}

void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        uint64_t ai = *(uint64_t *)(a + i);
        *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
    }
}
/* Two operand predicated copy immediate with merge.  All valid immediates
 * can fit within 17 signed bits in the simd_data field.
 */
void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_8, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t pp = expand_pred_b(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}

void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_16, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t pp = expand_pred_h(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}

void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_32, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t pp = expand_pred_s(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}

void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        d[i] = (pg[H1(i)] & 1 ? mm : nn);
    }
}

void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_8, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_b(pg[H1(i)]);
    }
}

void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_16, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_h(pg[H1(i)]);
    }
}

void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_32, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_s(pg[H1(i)]);
    }
}

void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = (pg[H1(i)] & 1 ? val : 0);
    }
}
/* Big-endian hosts need to frob the byte indices.  If the copy
 * happens to be 8-byte aligned, then no frobbing necessary.
 */
static void swap_memmove(void *vd, void *vs, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t s = (uintptr_t)vs;
    uintptr_t o = (d | s | n) & 7;
    size_t i;

#ifndef HOST_WORDS_BIGENDIAN
    o = 0;
#endif
    switch (o) {
    case 0:
        memmove(vd, vs, n);
        break;

    case 4:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 4) {
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 4;
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        }
        break;

    case 2:
    case 6:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 2) {
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 2;
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        }
        break;

    default:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i++) {
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 1;
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        }
        break;
    }
}
/* Similarly for memset of 0.  */
static void swap_memzero(void *vd, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t o = (d | n) & 7;
    size_t i;

    /* Usually, the first bit of a predicate is set, so N is 0.  */
    if (likely(n == 0)) {
        return;
    }

#ifndef HOST_WORDS_BIGENDIAN
    o = 0;
#endif
    switch (o) {
    case 0:
        memset(vd, 0, n);
        break;

    case 4:
        for (i = 0; i < n; i += 4) {
            *(uint32_t *)H1_4(d + i) = 0;
        }
        break;

    case 2:
    case 6:
        for (i = 0; i < n; i += 2) {
            *(uint16_t *)H1_2(d + i) = 0;
        }
        break;

    default:
        for (i = 0; i < n; i++) {
            *(uint8_t *)H1(d + i) = 0;
        }
        break;
    }
}
void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc);
    size_t n_ofs = simd_data(desc);
    size_t n_siz = opr_sz - n_ofs;

    if (vd != vm) {
        swap_memmove(vd, vn + n_ofs, n_siz);
        swap_memmove(vd + n_siz, vm, n_ofs);
    } else if (vd != vn) {
        swap_memmove(vd + n_siz, vd, n_ofs);
        swap_memmove(vd, vn + n_ofs, n_siz);
    } else {
        /* vd == vn == vm.  Need temp space.  */
        ARMVectorReg tmp;
        swap_memmove(&tmp, vm, n_ofs);
        swap_memmove(vd, vd + n_ofs, n_siz);
        memcpy(vd + n_siz, &tmp, n_ofs);
    }
}
#define DO_INSR(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
{                                                                  \
    intptr_t opr_sz = simd_oprsz(desc);                            \
    swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
    *(TYPE *)(vd + H(0)) = val;                                    \
}

DO_INSR(sve_insr_b, uint8_t, H1)
DO_INSR(sve_insr_h, uint16_t, H1_2)
DO_INSR(sve_insr_s, uint32_t, H1_4)
DO_INSR(sve_insr_d, uint64_t, )
void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = bswap64(b);
        *(uint64_t *)(vd + j) = bswap64(f);
    }
}

void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = hswap64(b);
        *(uint64_t *)(vd + j) = hswap64(f);
    }
}

void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = rol64(b, 32);
        *(uint64_t *)(vd + j) = rol64(f, 32);
    }
}

void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = b;
        *(uint64_t *)(vd + j) = f;
    }
}
#define DO_TBL(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    uintptr_t elem = opr_sz / sizeof(TYPE);                    \
    TYPE *d = vd, *n = vn, *m = vm;                            \
    ARMVectorReg tmp;                                          \
    if (unlikely(vd == vn)) {                                  \
        n = memcpy(&tmp, vn, opr_sz);                          \
    }                                                          \
    for (i = 0; i < elem; i++) {                               \
        TYPE j = m[H(i)];                                      \
        d[H(i)] = j < elem ? n[H(j)] : 0;                      \
    }                                                          \
}

DO_TBL(sve_tbl_b, uint8_t, H1)
DO_TBL(sve_tbl_h, uint16_t, H2)
DO_TBL(sve_tbl_s, uint32_t, H4)
DO_TBL(sve_tbl_d, uint64_t, )
#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{                                                    \
    intptr_t i, opr_sz = simd_oprsz(desc);           \
    TYPED *d = vd;                                   \
    TYPES *n = vn;                                   \
    ARMVectorReg tmp;                                \
    if (unlikely(vn - vd < opr_sz)) {                \
        n = memcpy(&tmp, n, opr_sz / 2);             \
    }                                                \
    for (i = 0; i < opr_sz / sizeof(TYPED); i++) {   \
        d[HD(i)] = n[HS(i)];                         \
    }                                                \
}

DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)

DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
/* Mask of bits included in the even numbered predicates of width esz.
 * We also use this for expand_bits/compress_bits, and so extend the
 * same pattern out to 16-bit units.
 */
static const uint64_t even_bit_esz_masks[5] = {
    0x5555555555555555ull,
    0x3333333333333333ull,
    0x0f0f0f0f0f0f0f0full,
    0x00ff00ff00ff00ffull,
    0x0000ffff0000ffffull,
};
/* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
 * For N==0, this corresponds to the operation that in qemu/bitops.h
 * we call half_shuffle64; this algorithm is from Hacker's Delight,
 * section 7-2 Shuffling Bits.
 */
static uint64_t expand_bits(uint64_t x, int n)
{
    int i;

    x &= 0xffffffffu;
    for (i = 4; i >= n; i--) {
        int sh = 1 << i;
        x = ((x << sh) | x) & even_bit_esz_masks[i];
    }
    return x;
}
/* Compress units of 2**(N+1) bits to units of 2**N bits.
 * For N==0, this corresponds to the operation that in qemu/bitops.h
 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
 */
static uint64_t compress_bits(uint64_t x, int n)
{
    int i;

    for (i = n; i <= 4; i++) {
        int sh = 1 << i;
        x &= even_bit_esz_masks[i];
        x = (x >> sh) | x;
    }
    return x & 0xffffffffu;
}
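
/*
 * Illustrative example (not from the original file): expand_bits with
 * n == 0 interleaves zeros between adjacent bits, so 0b1011 becomes
 * 0b1000101; compress_bits with n == 0 is the inverse and drops the
 * odd-numbered bit positions again.  For larger n the same shuffle
 * operates on units of 2**n bits, which is how predicate bits for wider
 * elements are spread out and gathered by the zip/uzp helpers below.
 */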
void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        uint64_t nn = *(uint64_t *)vn;
        uint64_t mm = *(uint64_t *)vm;
        int half = 4 * oprsz;

        nn = extract64(nn, high * half, half);
        mm = extract64(mm, high * half, half);
        nn = expand_bits(nn, esz);
        mm = expand_bits(mm, esz);
        d[0] = nn + (mm << (1 << esz));
    } else {
        ARMPredicateReg tmp_n, tmp_m;

        /* We produce output faster than we consume input.
           Therefore we must be mindful of possible overlap.  */
        if ((vn - vd) < (uintptr_t)oprsz) {
            vn = memcpy(&tmp_n, vn, oprsz);
        }
        if ((vm - vd) < (uintptr_t)oprsz) {
            vm = memcpy(&tmp_m, vm, oprsz);
        }
        if (high) {
            high = oprsz >> 1;
        }

        if ((high & 3) == 0) {
            uint32_t *n = vn, *m = vm;
            high >>= 2;

            for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
                uint64_t nn = n[H4(high + i)];
                uint64_t mm = m[H4(high + i)];

                nn = expand_bits(nn, esz);
                mm = expand_bits(mm, esz);
                d[i] = nn + (mm << (1 << esz));
            }
        } else {
            uint8_t *n = vn, *m = vm;
            uint16_t *d16 = vd;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                uint16_t mm = m[H1(high + i)];

                nn = expand_bits(nn, esz);
                mm = expand_bits(mm, esz);
                d16[H2(i)] = nn + (mm << (1 << esz));
            }
        }
    }
}
void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t l, h;
    intptr_t i;

    if (oprsz <= 8) {
        l = compress_bits(n[0] >> odd, esz);
        h = compress_bits(m[0] >> odd, esz);
        d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
    } else {
        ARMPredicateReg tmp_m;
        intptr_t oprsz_16 = oprsz / 16;

        if ((vm - vd) < (uintptr_t)oprsz) {
            m = memcpy(&tmp_m, vm, oprsz);
        }

        for (i = 0; i < oprsz_16; i++) {
            l = n[2 * i + 0];
            h = n[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            d[i] = l + (h << 32);
        }

        /* For VL which is not a power of 2, the results from M do not
           align nicely with the uint64_t for D.  Put the aligned results
           from M into TMP_M and then copy it into place afterward.  */
        if (oprsz & 15) {
            d[i] = compress_bits(n[2 * i] >> odd, esz);

            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                tmp_m.p[i] = l + (h << 32);
            }
            tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);

            swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
        } else {
            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                d[oprsz_16 + i] = l + (h << 32);
            }
        }
    }
}
void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t mask;
    int shr, shl;
    intptr_t i;

    shl = 1 << esz;
    shr = 0;
    mask = even_bit_esz_masks[esz];
    if (odd) {
        mask <<= shl;
        shr = shl;
        shl = 0;
    }

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
        uint64_t nn = (n[i] & mask) >> shr;
        uint64_t mm = (m[i] & mask) << shl;
        d[i] = nn + mm;
    }
}
/* Reverse units of 2**N bits. */
static uint64_t reverse_bits_64(uint64_t x, int n)
{
    int i, sh;

    x = bswap64(x);
    for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
        uint64_t mask = even_bit_esz_masks[i];
        x = ((x & mask) << sh) | ((x >> sh) & mask);
    }
    return x;
}
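
/*
 * Illustrative note (assuming the reconstruction above): the byte swap
 * reverses the order of the bytes, and the loop then reverses within each
 * byte down to units of 2**n bits.  For n == 0 every bit is reversed,
 * e.g. reverse_bits_64(1, 0) == 1ull << 63.
 */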
static uint8_t reverse_bits_8(uint8_t x, int n)
{
    static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
    int i, sh;

    for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
        x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
    }
    return x;
}
void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    intptr_t i, oprsz_2 = oprsz / 2;

    if (oprsz <= 8) {
        uint64_t l = *(uint64_t *)vn;
        l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
        *(uint64_t *)vd = l;
    } else if ((oprsz & 15) == 0) {
        for (i = 0; i < oprsz_2; i += 8) {
            intptr_t ih = oprsz - 8 - i;
            uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
            uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
            *(uint64_t *)(vd + i) = h;
            *(uint64_t *)(vd + ih) = l;
        }
    } else {
        for (i = 0; i < oprsz_2; i += 1) {
            intptr_t il = H1(i);
            intptr_t ih = H1(oprsz - 1 - i);
            uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
            uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
            *(uint8_t *)(vd + il) = h;
            *(uint8_t *)(vd + ih) = l;
        }
    }
}
void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        uint64_t nn = *(uint64_t *)vn;
        int half = 4 * oprsz;

        nn = extract64(nn, high * half, half);
        nn = expand_bits(nn, 0);
        d[0] = nn;
    } else {
        ARMPredicateReg tmp_n;

        /* We produce output faster than we consume input.
           Therefore we must be mindful of possible overlap. */
        if ((vn - vd) < (uintptr_t)oprsz) {
            vn = memcpy(&tmp_n, vn, oprsz);
        }
        if (high) {
            high = oprsz >> 1;
        }

        if ((high & 3) == 0) {
            uint32_t *n = vn;
            high >>= 2;

            for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
                uint64_t nn = n[H4(high + i)];
                d[i] = expand_bits(nn, 0);
            }
        } else {
            uint16_t *d16 = vd;
            uint8_t *n = vn;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                d16[H2(i)] = expand_bits(nn, 0);
            }
        }
    }
}
#define DO_ZIP(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
{                                                                    \
    intptr_t oprsz = simd_oprsz(desc);                               \
    intptr_t i, oprsz_2 = oprsz / 2;                                 \
    ARMVectorReg tmp_n, tmp_m;                                       \
    /* We produce output faster than we consume input.               \
       Therefore we must be mindful of possible overlap. */          \
    if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
        vn = memcpy(&tmp_n, vn, oprsz_2);                            \
    }                                                                \
    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
        vm = memcpy(&tmp_m, vm, oprsz_2);                            \
    }                                                                \
    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
        *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i));         \
        *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
    }                                                                \
}

DO_ZIP(sve_zip_b, uint8_t, H1)
DO_ZIP(sve_zip_h, uint16_t, H1_2)
DO_ZIP(sve_zip_s, uint32_t, H1_4)
DO_ZIP(sve_zip_d, uint64_t, )
#define DO_UZP(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
{                                                                    \
    intptr_t oprsz = simd_oprsz(desc);                               \
    intptr_t oprsz_2 = oprsz / 2;                                    \
    intptr_t odd_ofs = simd_data(desc);                              \
    intptr_t i;                                                      \
    ARMVectorReg tmp_m;                                              \
    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
        vm = memcpy(&tmp_m, vm, oprsz);                              \
    }                                                                \
    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
        *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs));   \
    }                                                                \
    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
        *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
    }                                                                \
}

DO_UZP(sve_uzp_b, uint8_t, H1)
DO_UZP(sve_uzp_h, uint16_t, H1_2)
DO_UZP(sve_uzp_s, uint32_t, H1_4)
DO_UZP(sve_uzp_d, uint64_t, )
#define DO_TRN(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
{                                                                    \
    intptr_t oprsz = simd_oprsz(desc);                               \
    intptr_t odd_ofs = simd_data(desc);                              \
    intptr_t i;                                                      \
    for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                  \
        TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                    \
        TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                    \
        *(TYPE *)(vd + H(i + 0)) = ae;                               \
        *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                    \
    }                                                                \
}

DO_TRN(sve_trn_b, uint8_t, H1)
DO_TRN(sve_trn_h, uint16_t, H1_2)
DO_TRN(sve_trn_s, uint32_t, H1_4)
DO_TRN(sve_trn_d, uint64_t, )
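
/*
 * Illustrative summary (added for clarity, not from the original source)
 * of the three permutes above on byte elements, with vn = {a0,a1,a2,...}
 * and vm = {b0,b1,b2,...}:
 *
 *     DO_ZIP:              d = {a0,b0,a1,b1, ...}           (low halves)
 *     DO_UZP, odd_ofs 0:   d = {a0,a2,a4, ..., b0,b2,b4, ...}
 *     DO_TRN, odd_ofs 0:   d = {a0,b0,a2,b2,a4,b4, ...}
 */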
void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = j = 0; i < opr_sz; i++) {
        if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
            d[H4(j)] = n[H4(i)];
            j++;
        }
    }
    for (; j < opr_sz; j++) {
        d[H4(j)] = 0;
    }
}
void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = j = 0; i < opr_sz; i++) {
        if (pg[H1(i)] & 1) {
            d[j] = n[i];
            j++;
        }
    }
    for (; j < opr_sz; j++) {
        d[j] = 0;
    }
}
/* Similar to the ARM LastActiveElement pseudocode function, except the
 * result is multiplied by the element size.  This includes the not found
 * indication; e.g. not found for esz=3 is -8.
 */
int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);

    return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
}
void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc) / 8;
    int esz = simd_data(desc);
    uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
    intptr_t i, first_i, last_i;
    ARMVectorReg tmp;

    first_i = last_i = 0;
    first_g = last_g = 0;

    /* Find the extent of the active elements within VG. */
    for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
        pg = *(uint64_t *)(vg + i) & mask;
        if (pg) {
            if (last_g == 0) {
                last_g = pg;
                last_i = i;
            }
            first_g = pg;
            first_i = i;
        }
    }

    len = 0;
    if (first_g != 0) {
        first_i = first_i * 8 + ctz64(first_g);
        last_i = last_i * 8 + 63 - clz64(last_g);
        len = last_i - first_i + (1 << esz);
        if (vm == vd) {
            vm = memcpy(&tmp, vm, opr_sz * 8);
        }
        swap_memmove(vd, vn + first_i, len);
    }
    swap_memmove(vd + len, vm, opr_sz * 8 - len);
}
void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i], mm = m[i];
        uint64_t pp = expand_pred_b(pg[H1(i)]);
        d[i] = (nn & pp) | (mm & ~pp);
    }
}

void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i], mm = m[i];
        uint64_t pp = expand_pred_h(pg[H1(i)]);
        d[i] = (nn & pp) | (mm & ~pp);
    }
}

void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i], mm = m[i];
        uint64_t pp = expand_pred_s(pg[H1(i)]);
        d[i] = (nn & pp) | (mm & ~pp);
    }
}

void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i], mm = m[i];
        d[i] = (pg[H1(i)] & 1 ? nn : mm);
    }
}
/* Two operand comparison controlled by a predicate.
 * ??? It is very tempting to want to be able to expand this inline
 * with x86 instructions, e.g.
 *
 *    vcmpeqw    zm, zn, %ymm0
 *    vpmovmskb  %ymm0, %eax
 *
 * or even aarch64, e.g.
 *
 *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
 *    cmeq       v0.8h, zn, zm
 *    and        v0.8h, v0.8h, mask
 *
 * However, coming up with an abstraction that allows vector inputs and
 * a scalar output, and also handles the byte-ordering of sub-uint64_t
 * scalar outputs, is tricky.
 */
#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                            \
    intptr_t opr_sz = simd_oprsz(desc);                                      \
    uint32_t flags = PREDTEST_INIT;                                          \
    intptr_t i = opr_sz;                                                     \
    do {                                                                     \
        uint64_t out = 0, pg;                                                \
        do {                                                                 \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
            TYPE nn = *(TYPE *)(vn + H(i));                                  \
            TYPE mm = *(TYPE *)(vm + H(i));                                  \
            out |= nn OP mm;                                                 \
        } while (i & 63);                                                    \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
        out &= pg;                                                           \
        *(uint64_t *)(vd + (i >> 3)) = out;                                  \
        flags = iter_predtest_bwd(out, pg, flags);                           \
    } while (i > 0);                                                         \
    return flags;                                                            \
}

#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP,   H1, 0xffffffffffffffffull)
#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP,     , 0x0101010101010101ull)
DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)

DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)

DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)

DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)

DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)

DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)

#undef DO_CMP_PPZZ_B
#undef DO_CMP_PPZZ_H
#undef DO_CMP_PPZZ_S
#undef DO_CMP_PPZZ_D
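
/*
 * Note (descriptive, added for clarity, based on the macro above): each
 * true comparison contributes one bit to OUT at the bit index equal to the
 * byte offset of its element, which is why the per-size MASK values are
 * 0xff.. (B), 0x55.. (H), 0x11.. (S) and 0x01.. (D): one predicate bit per
 * element, at the element's first byte, with the remaining bits cleared.
 */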
2449 /* Similar, but the second source is "wide". */
2450 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2451 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2453 intptr_t opr_sz = simd_oprsz(desc); \
2454 uint32_t flags = PREDTEST_INIT; \
2455 intptr_t i = opr_sz; \
2457 uint64_t out = 0, pg; \
2459 TYPEW mm = *(TYPEW *)(vm + i - 8); \
2461 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2462 TYPE nn = *(TYPE *)(vn + H(i)); \
2466 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2468 *(uint64_t *)(vd + (i >> 3)) = out; \
2469 flags = iter_predtest_bwd(out, pg, flags); \
2474 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2475 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
2476 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2477 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2478 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2479 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2481 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b
, int8_t, uint64_t, ==)
2482 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h
, int16_t, uint64_t, ==)
2483 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s
, int32_t, uint64_t, ==)
2485 DO_CMP_PPZW_B(sve_cmpne_ppzw_b
, int8_t, uint64_t, !=)
2486 DO_CMP_PPZW_H(sve_cmpne_ppzw_h
, int16_t, uint64_t, !=)
2487 DO_CMP_PPZW_S(sve_cmpne_ppzw_s
, int32_t, uint64_t, !=)
2489 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b
, int8_t, int64_t, >)
2490 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h
, int16_t, int64_t, >)
2491 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s
, int32_t, int64_t, >)
2493 DO_CMP_PPZW_B(sve_cmpge_ppzw_b
, int8_t, int64_t, >=)
2494 DO_CMP_PPZW_H(sve_cmpge_ppzw_h
, int16_t, int64_t, >=)
2495 DO_CMP_PPZW_S(sve_cmpge_ppzw_s
, int32_t, int64_t, >=)
2497 DO_CMP_PPZW_B(sve_cmphi_ppzw_b
, uint8_t, uint64_t, >)
2498 DO_CMP_PPZW_H(sve_cmphi_ppzw_h
, uint16_t, uint64_t, >)
2499 DO_CMP_PPZW_S(sve_cmphi_ppzw_s
, uint32_t, uint64_t, >)
2501 DO_CMP_PPZW_B(sve_cmphs_ppzw_b
, uint8_t, uint64_t, >=)
2502 DO_CMP_PPZW_H(sve_cmphs_ppzw_h
, uint16_t, uint64_t, >=)
2503 DO_CMP_PPZW_S(sve_cmphs_ppzw_s
, uint32_t, uint64_t, >=)
2505 DO_CMP_PPZW_B(sve_cmplt_ppzw_b
, int8_t, int64_t, <)
2506 DO_CMP_PPZW_H(sve_cmplt_ppzw_h
, int16_t, int64_t, <)
2507 DO_CMP_PPZW_S(sve_cmplt_ppzw_s
, int32_t, int64_t, <)
2509 DO_CMP_PPZW_B(sve_cmple_ppzw_b
, int8_t, int64_t, <=)
2510 DO_CMP_PPZW_H(sve_cmple_ppzw_h
, int16_t, int64_t, <=)
2511 DO_CMP_PPZW_S(sve_cmple_ppzw_s
, int32_t, int64_t, <=)
2513 DO_CMP_PPZW_B(sve_cmplo_ppzw_b
, uint8_t, uint64_t, <)
2514 DO_CMP_PPZW_H(sve_cmplo_ppzw_h
, uint16_t, uint64_t, <)
2515 DO_CMP_PPZW_S(sve_cmplo_ppzw_s
, uint32_t, uint64_t, <)
2517 DO_CMP_PPZW_B(sve_cmpls_ppzw_b
, uint8_t, uint64_t, <=)
2518 DO_CMP_PPZW_H(sve_cmpls_ppzw_h
, uint16_t, uint64_t, <=)
2519 DO_CMP_PPZW_S(sve_cmpls_ppzw_s
, uint32_t, uint64_t, <=)
2521 #undef DO_CMP_PPZW_B
2522 #undef DO_CMP_PPZW_H
2523 #undef DO_CMP_PPZW_S
2526 /* Similar, but the second source is immediate. */
2527 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
2528 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2530 intptr_t opr_sz = simd_oprsz(desc); \
2531 uint32_t flags = PREDTEST_INIT; \
2532 TYPE mm = simd_data(desc); \
2533 intptr_t i = opr_sz; \
2535 uint64_t out = 0, pg; \
2537 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2538 TYPE nn = *(TYPE *)(vn + H(i)); \
2541 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2543 *(uint64_t *)(vd + (i >> 3)) = out; \
2544 flags = iter_predtest_bwd(out, pg, flags); \
2549 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2550 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2551 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2552 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2553 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2554 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2555 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2556 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
2558 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b
, uint8_t, ==)
2559 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h
, uint16_t, ==)
2560 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s
, uint32_t, ==)
2561 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d
, uint64_t, ==)
2563 DO_CMP_PPZI_B(sve_cmpne_ppzi_b
, uint8_t, !=)
2564 DO_CMP_PPZI_H(sve_cmpne_ppzi_h
, uint16_t, !=)
2565 DO_CMP_PPZI_S(sve_cmpne_ppzi_s
, uint32_t, !=)
2566 DO_CMP_PPZI_D(sve_cmpne_ppzi_d
, uint64_t, !=)
2568 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b
, int8_t, >)
2569 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h
, int16_t, >)
2570 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s
, int32_t, >)
2571 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d
, int64_t, >)
2573 DO_CMP_PPZI_B(sve_cmpge_ppzi_b
, int8_t, >=)
2574 DO_CMP_PPZI_H(sve_cmpge_ppzi_h
, int16_t, >=)
2575 DO_CMP_PPZI_S(sve_cmpge_ppzi_s
, int32_t, >=)
2576 DO_CMP_PPZI_D(sve_cmpge_ppzi_d
, int64_t, >=)
2578 DO_CMP_PPZI_B(sve_cmphi_ppzi_b
, uint8_t, >)
2579 DO_CMP_PPZI_H(sve_cmphi_ppzi_h
, uint16_t, >)
2580 DO_CMP_PPZI_S(sve_cmphi_ppzi_s
, uint32_t, >)
2581 DO_CMP_PPZI_D(sve_cmphi_ppzi_d
, uint64_t, >)
2583 DO_CMP_PPZI_B(sve_cmphs_ppzi_b
, uint8_t, >=)
2584 DO_CMP_PPZI_H(sve_cmphs_ppzi_h
, uint16_t, >=)
2585 DO_CMP_PPZI_S(sve_cmphs_ppzi_s
, uint32_t, >=)
2586 DO_CMP_PPZI_D(sve_cmphs_ppzi_d
, uint64_t, >=)
2588 DO_CMP_PPZI_B(sve_cmplt_ppzi_b
, int8_t, <)
2589 DO_CMP_PPZI_H(sve_cmplt_ppzi_h
, int16_t, <)
2590 DO_CMP_PPZI_S(sve_cmplt_ppzi_s
, int32_t, <)
2591 DO_CMP_PPZI_D(sve_cmplt_ppzi_d
, int64_t, <)
2593 DO_CMP_PPZI_B(sve_cmple_ppzi_b
, int8_t, <=)
2594 DO_CMP_PPZI_H(sve_cmple_ppzi_h
, int16_t, <=)
2595 DO_CMP_PPZI_S(sve_cmple_ppzi_s
, int32_t, <=)
2596 DO_CMP_PPZI_D(sve_cmple_ppzi_d
, int64_t, <=)
2598 DO_CMP_PPZI_B(sve_cmplo_ppzi_b
, uint8_t, <)
2599 DO_CMP_PPZI_H(sve_cmplo_ppzi_h
, uint16_t, <)
2600 DO_CMP_PPZI_S(sve_cmplo_ppzi_s
, uint32_t, <)
2601 DO_CMP_PPZI_D(sve_cmplo_ppzi_d
, uint64_t, <)
2603 DO_CMP_PPZI_B(sve_cmpls_ppzi_b
, uint8_t, <=)
2604 DO_CMP_PPZI_H(sve_cmpls_ppzi_h
, uint16_t, <=)
2605 DO_CMP_PPZI_S(sve_cmpls_ppzi_s
, uint32_t, <=)
2606 DO_CMP_PPZI_D(sve_cmpls_ppzi_d
, uint64_t, <=)
2608 #undef DO_CMP_PPZI_B
2609 #undef DO_CMP_PPZI_H
2610 #undef DO_CMP_PPZI_S
2611 #undef DO_CMP_PPZI_D
/* Similar to the ARM LastActive pseudocode function. */
static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
{
    intptr_t i;

    for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
        uint64_t pg = *(uint64_t *)(vg + i);
        if (pg) {
            return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
        }
    }
    return 0;
}
/* Compute a mask into RETB that is true for all G, up to and including
 * (if after) or excluding (if !after) the first G & N.
 * Return true if BRK found.
 */
static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
                        bool brk, bool after)
{
    uint64_t b;

    if (brk) {
        b = 0;
    } else if ((g & n) == 0) {
        /* For all G, no N are set; break not found. */
        b = g;
    } else {
        /* Break somewhere in N.  Locate it. */
        b = g & n;            /* guard true, pred true */
        b = b & -b;           /* first such */
        if (after) {
            b = b | (b - 1);  /* break after same */
        } else {
            b = b - 1;        /* break before same */
        }
        brk = true;
    }

    *retb = b;
    return brk;
}
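
/*
 * Worked example (illustrative only, assuming the reconstruction above):
 * with g == 0b1111, n == 0b0100 and no prior break, the first active N
 * is at bit 2, so
 *     after == true  -> *retb == 0b0111   (break after that element)
 *     after == false -> *retb == 0b0011   (break before that element)
 */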
2658 /* Compute a zeroing BRK. */
2659 static void compute_brk_z(uint64_t *d
, uint64_t *n
, uint64_t *g
,
2660 intptr_t oprsz
, bool after
)
2665 for (i
= 0; i
< DIV_ROUND_UP(oprsz
, 8); ++i
) {
2666 uint64_t this_b
, this_g
= g
[i
];
2668 brk
= compute_brk(&this_b
, n
[i
], this_g
, brk
, after
);
2669 d
[i
] = this_b
& this_g
;
2673 /* Likewise, but also compute flags. */
2674 static uint32_t compute_brks_z(uint64_t *d
, uint64_t *n
, uint64_t *g
,
2675 intptr_t oprsz
, bool after
)
2677 uint32_t flags
= PREDTEST_INIT
;
2681 for (i
= 0; i
< DIV_ROUND_UP(oprsz
, 8); ++i
) {
2682 uint64_t this_b
, this_d
, this_g
= g
[i
];
2684 brk
= compute_brk(&this_b
, n
[i
], this_g
, brk
, after
);
2685 d
[i
] = this_d
= this_b
& this_g
;
2686 flags
= iter_predtest_fwd(this_d
, this_g
, flags
);
2691 /* Compute a merging BRK. */
2692 static void compute_brk_m(uint64_t *d
, uint64_t *n
, uint64_t *g
,
2693 intptr_t oprsz
, bool after
)
2698 for (i
= 0; i
< DIV_ROUND_UP(oprsz
, 8); ++i
) {
2699 uint64_t this_b
, this_g
= g
[i
];
2701 brk
= compute_brk(&this_b
, n
[i
], this_g
, brk
, after
);
2702 d
[i
] = (this_b
& this_g
) | (d
[i
] & ~this_g
);
2706 /* Likewise, but also compute flags. */
2707 static uint32_t compute_brks_m(uint64_t *d
, uint64_t *n
, uint64_t *g
,
2708 intptr_t oprsz
, bool after
)
2710 uint32_t flags
= PREDTEST_INIT
;
2714 for (i
= 0; i
< oprsz
/ 8; ++i
) {
2715 uint64_t this_b
, this_d
= d
[i
], this_g
= g
[i
];
2717 brk
= compute_brk(&this_b
, n
[i
], this_g
, brk
, after
);
2718 d
[i
] = this_d
= (this_b
& this_g
) | (this_d
& ~this_g
);
2719 flags
= iter_predtest_fwd(this_d
, this_g
, flags
);
static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
{
    /* It is quicker to zero the whole predicate than loop on OPRSZ.
     * The compiler should turn this into 4 64-bit integer stores.
     */
    memset(d, 0, sizeof(ARMPredicateReg));
    return PREDTEST_INIT;
}
2733 void HELPER(sve_brkpa
)(void *vd
, void *vn
, void *vm
, void *vg
,
2736 intptr_t oprsz
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
) + 2;
2737 if (last_active_pred(vn
, vg
, oprsz
)) {
2738 compute_brk_z(vd
, vm
, vg
, oprsz
, true);
2744 uint32_t HELPER(sve_brkpas
)(void *vd
, void *vn
, void *vm
, void *vg
,
2747 intptr_t oprsz
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
) + 2;
2748 if (last_active_pred(vn
, vg
, oprsz
)) {
2749 return compute_brks_z(vd
, vm
, vg
, oprsz
, true);
2751 return do_zero(vd
, oprsz
);
2755 void HELPER(sve_brkpb
)(void *vd
, void *vn
, void *vm
, void *vg
,
2758 intptr_t oprsz
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
) + 2;
2759 if (last_active_pred(vn
, vg
, oprsz
)) {
2760 compute_brk_z(vd
, vm
, vg
, oprsz
, false);
2766 uint32_t HELPER(sve_brkpbs
)(void *vd
, void *vn
, void *vm
, void *vg
,
2769 intptr_t oprsz
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
) + 2;
2770 if (last_active_pred(vn
, vg
, oprsz
)) {
2771 return compute_brks_z(vd
, vm
, vg
, oprsz
, false);
2773 return do_zero(vd
, oprsz
);
2777 void HELPER(sve_brka_z
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
2779 intptr_t oprsz
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
) + 2;
2780 compute_brk_z(vd
, vn
, vg
, oprsz
, true);
2783 uint32_t HELPER(sve_brkas_z
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
2785 intptr_t oprsz
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
) + 2;
2786 return compute_brks_z(vd
, vn
, vg
, oprsz
, true);
2789 void HELPER(sve_brkb_z
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
2791 intptr_t oprsz
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
) + 2;
2792 compute_brk_z(vd
, vn
, vg
, oprsz
, false);
2795 uint32_t HELPER(sve_brkbs_z
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
2797 intptr_t oprsz
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
) + 2;
2798 return compute_brks_z(vd
, vn
, vg
, oprsz
, false);
2801 void HELPER(sve_brka_m
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
2803 intptr_t oprsz
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
) + 2;
2804 compute_brk_m(vd
, vn
, vg
, oprsz
, true);
2807 uint32_t HELPER(sve_brkas_m
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
2809 intptr_t oprsz
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
) + 2;
2810 return compute_brks_m(vd
, vn
, vg
, oprsz
, true);
2813 void HELPER(sve_brkb_m
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
2815 intptr_t oprsz
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
) + 2;
2816 compute_brk_m(vd
, vn
, vg
, oprsz
, false);
2819 uint32_t HELPER(sve_brkbs_m
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
2821 intptr_t oprsz
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
) + 2;
2822 return compute_brks_m(vd
, vn
, vg
, oprsz
, false);
2825 void HELPER(sve_brkn
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
2827 intptr_t oprsz
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
) + 2;
2829 if (!last_active_pred(vn
, vg
, oprsz
)) {
2834 /* As if PredTest(Ones(PL), D, esz). */
2835 static uint32_t predtest_ones(ARMPredicateReg
*d
, intptr_t oprsz
,
2838 uint32_t flags
= PREDTEST_INIT
;
2841 for (i
= 0; i
< oprsz
/ 8; i
++) {
2842 flags
= iter_predtest_fwd(d
->p
[i
], esz_mask
, flags
);
2845 uint64_t mask
= ~(-1ULL << (8 * (oprsz
& 7)));
2846 flags
= iter_predtest_fwd(d
->p
[i
], esz_mask
& mask
, flags
);
2851 uint32_t HELPER(sve_brkns
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
2853 intptr_t oprsz
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
) + 2;
2855 if (last_active_pred(vn
, vg
, oprsz
)) {
2856 return predtest_ones(vd
, oprsz
, -1);
2858 return do_zero(vd
, oprsz
);
2862 uint64_t HELPER(sve_cntp
)(void *vn
, void *vg
, uint32_t pred_desc
)
2864 intptr_t oprsz
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
) + 2;
2865 intptr_t esz
= extract32(pred_desc
, SIMD_DATA_SHIFT
, 2);
2866 uint64_t *n
= vn
, *g
= vg
, sum
= 0, mask
= pred_esz_masks
[esz
];
2869 for (i
= 0; i
< DIV_ROUND_UP(oprsz
, 8); ++i
) {
2870 uint64_t t
= n
[i
] & g
[i
] & mask
;
2876 uint32_t HELPER(sve_while
)(void *vd
, uint32_t count
, uint32_t pred_desc
)
2878 uintptr_t oprsz
= extract32(pred_desc
, 0, SIMD_OPRSZ_BITS
) + 2;
2879 intptr_t esz
= extract32(pred_desc
, SIMD_DATA_SHIFT
, 2);
2880 uint64_t esz_mask
= pred_esz_masks
[esz
];
2881 ARMPredicateReg
*d
= vd
;
2885 /* Begin with a zero predicate register. */
2886 flags
= do_zero(d
, oprsz
);
2891 /* Set all of the requested bits. */
2892 for (i
= 0; i
< count
/ 64; ++i
) {
2896 d
->p
[i
] = MAKE_64BIT_MASK(0, count
& 63) & esz_mask
;
2899 return predtest_ones(d
, oprsz
, esz_mask
);
/* Recursive reduction on a function;
 * C.f. the ARM ARM function ReducePredicated.
 *
 * While it would be possible to write this without the DATA temporary,
 * it is much simpler to process the predicate register this way.
 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
 * little to gain with a more complex non-recursive form.
 */
#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                             \
static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
{                                                                         \
    if (n == 1) {                                                         \
        return *data;                                                     \
    } else {                                                              \
        uintptr_t half = n / 2;                                           \
        TYPE lo = NAME##_reduce(data, status, half);                      \
        TYPE hi = NAME##_reduce(data + half, status, half);               \
        return TYPE##_##FUNC(lo, hi, status);                             \
    }                                                                     \
}                                                                         \
uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)        \
{                                                                         \
    uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc);      \
    TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                       \
    for (i = 0; i < oprsz; ) {                                            \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                   \
        do {                                                              \
            TYPE nn = *(TYPE *)(vn + H(i));                               \
            *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);          \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 15);                                                 \
    }                                                                     \
    for (; i < maxsz; i += sizeof(TYPE)) {                                \
        *(TYPE *)((void *)data + i) = IDENT;                              \
    }                                                                     \
    return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));                 \
}
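
/*
 * Illustrative note (not from the original source): for eight elements
 * d0..d7 the recursion above evaluates
 *     ((d0 F d1) F (d2 F d3)) F ((d4 F d5) F (d6 F d7))
 * with F the FUNC operation; inactive and trailing lanes are filled with
 * IDENT first so that they cannot affect the result.
 */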
2940 DO_REDUCE(sve_faddv_h
, float16
, H1_2
, add
, float16_zero
)
2941 DO_REDUCE(sve_faddv_s
, float32
, H1_4
, add
, float32_zero
)
2942 DO_REDUCE(sve_faddv_d
, float64
, , add
, float64_zero
)
2944 /* Identity is floatN_default_nan, without the function call. */
2945 DO_REDUCE(sve_fminnmv_h
, float16
, H1_2
, minnum
, 0x7E00)
2946 DO_REDUCE(sve_fminnmv_s
, float32
, H1_4
, minnum
, 0x7FC00000)
2947 DO_REDUCE(sve_fminnmv_d
, float64
, , minnum
, 0x7FF8000000000000ULL
)
2949 DO_REDUCE(sve_fmaxnmv_h
, float16
, H1_2
, maxnum
, 0x7E00)
2950 DO_REDUCE(sve_fmaxnmv_s
, float32
, H1_4
, maxnum
, 0x7FC00000)
2951 DO_REDUCE(sve_fmaxnmv_d
, float64
, , maxnum
, 0x7FF8000000000000ULL
)
2953 DO_REDUCE(sve_fminv_h
, float16
, H1_2
, min
, float16_infinity
)
2954 DO_REDUCE(sve_fminv_s
, float32
, H1_4
, min
, float32_infinity
)
2955 DO_REDUCE(sve_fminv_d
, float64
, , min
, float64_infinity
)
2957 DO_REDUCE(sve_fmaxv_h
, float16
, H1_2
, max
, float16_chs(float16_infinity
))
2958 DO_REDUCE(sve_fmaxv_s
, float32
, H1_4
, max
, float32_chs(float32_infinity
))
2959 DO_REDUCE(sve_fmaxv_d
, float64
, , max
, float64_chs(float64_infinity
))
2963 uint64_t HELPER(sve_fadda_h
)(uint64_t nn
, void *vm
, void *vg
,
2964 void *status
, uint32_t desc
)
2966 intptr_t i
= 0, opr_sz
= simd_oprsz(desc
);
2967 float16 result
= nn
;
2970 uint16_t pg
= *(uint16_t *)(vg
+ H1_2(i
>> 3));
2973 float16 mm
= *(float16
*)(vm
+ H1_2(i
));
2974 result
= float16_add(result
, mm
, status
);
2976 i
+= sizeof(float16
), pg
>>= sizeof(float16
);
2978 } while (i
< opr_sz
);
2983 uint64_t HELPER(sve_fadda_s
)(uint64_t nn
, void *vm
, void *vg
,
2984 void *status
, uint32_t desc
)
2986 intptr_t i
= 0, opr_sz
= simd_oprsz(desc
);
2987 float32 result
= nn
;
2990 uint16_t pg
= *(uint16_t *)(vg
+ H1_2(i
>> 3));
2993 float32 mm
= *(float32
*)(vm
+ H1_2(i
));
2994 result
= float32_add(result
, mm
, status
);
2996 i
+= sizeof(float32
), pg
>>= sizeof(float32
);
2998 } while (i
< opr_sz
);
3003 uint64_t HELPER(sve_fadda_d
)(uint64_t nn
, void *vm
, void *vg
,
3004 void *status
, uint32_t desc
)
3006 intptr_t i
= 0, opr_sz
= simd_oprsz(desc
) / 8;
3010 for (i
= 0; i
< opr_sz
; i
++) {
3011 if (pg
[H1(i
)] & 1) {
3012 nn
= float64_add(nn
, m
[i
], status
);
3019 /* Fully general three-operand expander, controlled by a predicate,
3020 * With the extra float_status parameter.
3022 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
3023 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3024 void *status, uint32_t desc) \
3026 intptr_t i = simd_oprsz(desc); \
3029 uint64_t pg = g[(i - 1) >> 6]; \
3031 i -= sizeof(TYPE); \
3032 if (likely((pg >> (i & 63)) & 1)) { \
3033 TYPE nn = *(TYPE *)(vn + H(i)); \
3034 TYPE mm = *(TYPE *)(vm + H(i)); \
3035 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
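
/*
 * Note (descriptive, added for clarity): the expander above walks the
 * vector from the top down, reloading one 64-bit predicate word from
 * g[(i - 1) >> 6] for each group of elements, and skips the operation
 * entirely for inactive elements so that no spurious FP exceptions are
 * raised on their behalf.
 */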
3041 DO_ZPZZ_FP(sve_fadd_h
, uint16_t, H1_2
, float16_add
)
3042 DO_ZPZZ_FP(sve_fadd_s
, uint32_t, H1_4
, float32_add
)
3043 DO_ZPZZ_FP(sve_fadd_d
, uint64_t, , float64_add
)
3045 DO_ZPZZ_FP(sve_fsub_h
, uint16_t, H1_2
, float16_sub
)
3046 DO_ZPZZ_FP(sve_fsub_s
, uint32_t, H1_4
, float32_sub
)
3047 DO_ZPZZ_FP(sve_fsub_d
, uint64_t, , float64_sub
)
3049 DO_ZPZZ_FP(sve_fmul_h
, uint16_t, H1_2
, float16_mul
)
3050 DO_ZPZZ_FP(sve_fmul_s
, uint32_t, H1_4
, float32_mul
)
3051 DO_ZPZZ_FP(sve_fmul_d
, uint64_t, , float64_mul
)
3053 DO_ZPZZ_FP(sve_fdiv_h
, uint16_t, H1_2
, float16_div
)
3054 DO_ZPZZ_FP(sve_fdiv_s
, uint32_t, H1_4
, float32_div
)
3055 DO_ZPZZ_FP(sve_fdiv_d
, uint64_t, , float64_div
)
3057 DO_ZPZZ_FP(sve_fmin_h
, uint16_t, H1_2
, float16_min
)
3058 DO_ZPZZ_FP(sve_fmin_s
, uint32_t, H1_4
, float32_min
)
3059 DO_ZPZZ_FP(sve_fmin_d
, uint64_t, , float64_min
)
3061 DO_ZPZZ_FP(sve_fmax_h
, uint16_t, H1_2
, float16_max
)
3062 DO_ZPZZ_FP(sve_fmax_s
, uint32_t, H1_4
, float32_max
)
3063 DO_ZPZZ_FP(sve_fmax_d
, uint64_t, , float64_max
)
3065 DO_ZPZZ_FP(sve_fminnum_h
, uint16_t, H1_2
, float16_minnum
)
3066 DO_ZPZZ_FP(sve_fminnum_s
, uint32_t, H1_4
, float32_minnum
)
3067 DO_ZPZZ_FP(sve_fminnum_d
, uint64_t, , float64_minnum
)
3069 DO_ZPZZ_FP(sve_fmaxnum_h
, uint16_t, H1_2
, float16_maxnum
)
3070 DO_ZPZZ_FP(sve_fmaxnum_s
, uint32_t, H1_4
, float32_maxnum
)
3071 DO_ZPZZ_FP(sve_fmaxnum_d
, uint64_t, , float64_maxnum
)
3073 static inline float16
abd_h(float16 a
, float16 b
, float_status
*s
)
3075 return float16_abs(float16_sub(a
, b
, s
));
3078 static inline float32
abd_s(float32 a
, float32 b
, float_status
*s
)
3080 return float32_abs(float32_sub(a
, b
, s
));
3083 static inline float64
abd_d(float64 a
, float64 b
, float_status
*s
)
3085 return float64_abs(float64_sub(a
, b
, s
));
3088 DO_ZPZZ_FP(sve_fabd_h
, uint16_t, H1_2
, abd_h
)
3089 DO_ZPZZ_FP(sve_fabd_s
, uint32_t, H1_4
, abd_s
)
3090 DO_ZPZZ_FP(sve_fabd_d
, uint64_t, , abd_d
)
3092 static inline float64
scalbn_d(float64 a
, int64_t b
, float_status
*s
)
3094 int b_int
= MIN(MAX(b
, INT_MIN
), INT_MAX
);
3095 return float64_scalbn(a
, b_int
, s
);
3098 DO_ZPZZ_FP(sve_fscalbn_h
, int16_t, H1_2
, float16_scalbn
)
3099 DO_ZPZZ_FP(sve_fscalbn_s
, int32_t, H1_4
, float32_scalbn
)
3100 DO_ZPZZ_FP(sve_fscalbn_d
, int64_t, , scalbn_d
)
3102 DO_ZPZZ_FP(sve_fmulx_h
, uint16_t, H1_2
, helper_advsimd_mulxh
)
3103 DO_ZPZZ_FP(sve_fmulx_s
, uint32_t, H1_4
, helper_vfp_mulxs
)
3104 DO_ZPZZ_FP(sve_fmulx_d
, uint64_t, , helper_vfp_mulxd
)
3108 /* Three-operand expander, with one scalar operand, controlled by
3109 * a predicate, with the extra float_status parameter.
3111 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3112 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
3113 void *status, uint32_t desc) \
3115 intptr_t i = simd_oprsz(desc); \
3119 uint64_t pg = g[(i - 1) >> 6]; \
3121 i -= sizeof(TYPE); \
3122 if (likely((pg >> (i & 63)) & 1)) { \
3123 TYPE nn = *(TYPE *)(vn + H(i)); \
3124 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3130 DO_ZPZS_FP(sve_fadds_h
, float16
, H1_2
, float16_add
)
3131 DO_ZPZS_FP(sve_fadds_s
, float32
, H1_4
, float32_add
)
3132 DO_ZPZS_FP(sve_fadds_d
, float64
, , float64_add
)
3134 DO_ZPZS_FP(sve_fsubs_h
, float16
, H1_2
, float16_sub
)
3135 DO_ZPZS_FP(sve_fsubs_s
, float32
, H1_4
, float32_sub
)
3136 DO_ZPZS_FP(sve_fsubs_d
, float64
, , float64_sub
)
3138 DO_ZPZS_FP(sve_fmuls_h
, float16
, H1_2
, float16_mul
)
3139 DO_ZPZS_FP(sve_fmuls_s
, float32
, H1_4
, float32_mul
)
3140 DO_ZPZS_FP(sve_fmuls_d
, float64
, , float64_mul
)
3142 static inline float16
subr_h(float16 a
, float16 b
, float_status
*s
)
3144 return float16_sub(b
, a
, s
);
3147 static inline float32
subr_s(float32 a
, float32 b
, float_status
*s
)
3149 return float32_sub(b
, a
, s
);
3152 static inline float64
subr_d(float64 a
, float64 b
, float_status
*s
)
3154 return float64_sub(b
, a
, s
);
3157 DO_ZPZS_FP(sve_fsubrs_h
, float16
, H1_2
, subr_h
)
3158 DO_ZPZS_FP(sve_fsubrs_s
, float32
, H1_4
, subr_s
)
3159 DO_ZPZS_FP(sve_fsubrs_d
, float64
, , subr_d
)
3161 DO_ZPZS_FP(sve_fmaxnms_h
, float16
, H1_2
, float16_maxnum
)
3162 DO_ZPZS_FP(sve_fmaxnms_s
, float32
, H1_4
, float32_maxnum
)
3163 DO_ZPZS_FP(sve_fmaxnms_d
, float64
, , float64_maxnum
)
3165 DO_ZPZS_FP(sve_fminnms_h
, float16
, H1_2
, float16_minnum
)
3166 DO_ZPZS_FP(sve_fminnms_s
, float32
, H1_4
, float32_minnum
)
3167 DO_ZPZS_FP(sve_fminnms_d
, float64
, , float64_minnum
)
3169 DO_ZPZS_FP(sve_fmaxs_h
, float16
, H1_2
, float16_max
)
3170 DO_ZPZS_FP(sve_fmaxs_s
, float32
, H1_4
, float32_max
)
3171 DO_ZPZS_FP(sve_fmaxs_d
, float64
, , float64_max
)
3173 DO_ZPZS_FP(sve_fmins_h
, float16
, H1_2
, float16_min
)
3174 DO_ZPZS_FP(sve_fmins_s
, float32
, H1_4
, float32_min
)
3175 DO_ZPZS_FP(sve_fmins_d
, float64
, , float64_min
)
3177 /* Fully general two-operand expander, controlled by a predicate,
3178 * With the extra float_status parameter.
3180 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
3181 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3183 intptr_t i = simd_oprsz(desc); \
3186 uint64_t pg = g[(i - 1) >> 6]; \
3188 i -= sizeof(TYPE); \
3189 if (likely((pg >> (i & 63)) & 1)) { \
3190 TYPE nn = *(TYPE *)(vn + H(i)); \
3191 *(TYPE *)(vd + H(i)) = OP(nn, status); \
3197 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
3198 * FZ16. When converting from fp16, this affects flushing input denormals;
3199 * when converting to fp16, this affects flushing output denormals.
3201 static inline float32
sve_f16_to_f32(float16 f
, float_status
*fpst
)
3203 flag save
= get_flush_inputs_to_zero(fpst
);
3206 set_flush_inputs_to_zero(false, fpst
);
3207 ret
= float16_to_float32(f
, true, fpst
);
3208 set_flush_inputs_to_zero(save
, fpst
);
3212 static inline float64
sve_f16_to_f64(float16 f
, float_status
*fpst
)
3214 flag save
= get_flush_inputs_to_zero(fpst
);
3217 set_flush_inputs_to_zero(false, fpst
);
3218 ret
= float16_to_float64(f
, true, fpst
);
3219 set_flush_inputs_to_zero(save
, fpst
);
3223 static inline float16
sve_f32_to_f16(float32 f
, float_status
*fpst
)
3225 flag save
= get_flush_to_zero(fpst
);
3228 set_flush_to_zero(false, fpst
);
3229 ret
= float32_to_float16(f
, true, fpst
);
3230 set_flush_to_zero(save
, fpst
);
3234 static inline float16
sve_f64_to_f16(float64 f
, float_status
*fpst
)
3236 flag save
= get_flush_to_zero(fpst
);
3239 set_flush_to_zero(false, fpst
);
3240 ret
= float64_to_float16(f
, true, fpst
);
3241 set_flush_to_zero(save
, fpst
);
3245 static inline int16_t vfp_float16_to_int16_rtz(float16 f
, float_status
*s
)
3247 if (float16_is_any_nan(f
)) {
3248 float_raise(float_flag_invalid
, s
);
3251 return float16_to_int16_round_to_zero(f
, s
);
3254 static inline int64_t vfp_float16_to_int64_rtz(float16 f
, float_status
*s
)
3256 if (float16_is_any_nan(f
)) {
3257 float_raise(float_flag_invalid
, s
);
3260 return float16_to_int64_round_to_zero(f
, s
);
3263 static inline int64_t vfp_float32_to_int64_rtz(float32 f
, float_status
*s
)
3265 if (float32_is_any_nan(f
)) {
3266 float_raise(float_flag_invalid
, s
);
3269 return float32_to_int64_round_to_zero(f
, s
);
3272 static inline int64_t vfp_float64_to_int64_rtz(float64 f
, float_status
*s
)
3274 if (float64_is_any_nan(f
)) {
3275 float_raise(float_flag_invalid
, s
);
3278 return float64_to_int64_round_to_zero(f
, s
);
3281 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f
, float_status
*s
)
3283 if (float16_is_any_nan(f
)) {
3284 float_raise(float_flag_invalid
, s
);
3287 return float16_to_uint16_round_to_zero(f
, s
);
3290 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f
, float_status
*s
)
3292 if (float16_is_any_nan(f
)) {
3293 float_raise(float_flag_invalid
, s
);
3296 return float16_to_uint64_round_to_zero(f
, s
);
3299 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f
, float_status
*s
)
3301 if (float32_is_any_nan(f
)) {
3302 float_raise(float_flag_invalid
, s
);
3305 return float32_to_uint64_round_to_zero(f
, s
);
3308 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f
, float_status
*s
)
3310 if (float64_is_any_nan(f
)) {
3311 float_raise(float_flag_invalid
, s
);
3314 return float64_to_uint64_round_to_zero(f
, s
);
3317 DO_ZPZ_FP(sve_fcvt_sh
, uint32_t, H1_4
, sve_f32_to_f16
)
3318 DO_ZPZ_FP(sve_fcvt_hs
, uint32_t, H1_4
, sve_f16_to_f32
)
3319 DO_ZPZ_FP(sve_fcvt_dh
, uint64_t, , sve_f64_to_f16
)
3320 DO_ZPZ_FP(sve_fcvt_hd
, uint64_t, , sve_f16_to_f64
)
3321 DO_ZPZ_FP(sve_fcvt_ds
, uint64_t, , float64_to_float32
)
3322 DO_ZPZ_FP(sve_fcvt_sd
, uint64_t, , float32_to_float64
)
3324 DO_ZPZ_FP(sve_fcvtzs_hh
, uint16_t, H1_2
, vfp_float16_to_int16_rtz
)
3325 DO_ZPZ_FP(sve_fcvtzs_hs
, uint32_t, H1_4
, helper_vfp_tosizh
)
3326 DO_ZPZ_FP(sve_fcvtzs_ss
, uint32_t, H1_4
, helper_vfp_tosizs
)
3327 DO_ZPZ_FP(sve_fcvtzs_hd
, uint64_t, , vfp_float16_to_int64_rtz
)
3328 DO_ZPZ_FP(sve_fcvtzs_sd
, uint64_t, , vfp_float32_to_int64_rtz
)
3329 DO_ZPZ_FP(sve_fcvtzs_ds
, uint64_t, , helper_vfp_tosizd
)
3330 DO_ZPZ_FP(sve_fcvtzs_dd
, uint64_t, , vfp_float64_to_int64_rtz
)
3332 DO_ZPZ_FP(sve_fcvtzu_hh
, uint16_t, H1_2
, vfp_float16_to_uint16_rtz
)
3333 DO_ZPZ_FP(sve_fcvtzu_hs
, uint32_t, H1_4
, helper_vfp_touizh
)
3334 DO_ZPZ_FP(sve_fcvtzu_ss
, uint32_t, H1_4
, helper_vfp_touizs
)
3335 DO_ZPZ_FP(sve_fcvtzu_hd
, uint64_t, , vfp_float16_to_uint64_rtz
)
3336 DO_ZPZ_FP(sve_fcvtzu_sd
, uint64_t, , vfp_float32_to_uint64_rtz
)
3337 DO_ZPZ_FP(sve_fcvtzu_ds
, uint64_t, , helper_vfp_touizd
)
3338 DO_ZPZ_FP(sve_fcvtzu_dd
, uint64_t, , vfp_float64_to_uint64_rtz
)
3340 DO_ZPZ_FP(sve_frint_h
, uint16_t, H1_2
, helper_advsimd_rinth
)
3341 DO_ZPZ_FP(sve_frint_s
, uint32_t, H1_4
, helper_rints
)
3342 DO_ZPZ_FP(sve_frint_d
, uint64_t, , helper_rintd
)
3344 DO_ZPZ_FP(sve_frintx_h
, uint16_t, H1_2
, float16_round_to_int
)
3345 DO_ZPZ_FP(sve_frintx_s
, uint32_t, H1_4
, float32_round_to_int
)
3346 DO_ZPZ_FP(sve_frintx_d
, uint64_t, , float64_round_to_int
)
3348 DO_ZPZ_FP(sve_frecpx_h
, uint16_t, H1_2
, helper_frecpx_f16
)
3349 DO_ZPZ_FP(sve_frecpx_s
, uint32_t, H1_4
, helper_frecpx_f32
)
3350 DO_ZPZ_FP(sve_frecpx_d
, uint64_t, , helper_frecpx_f64
)
3352 DO_ZPZ_FP(sve_fsqrt_h
, uint16_t, H1_2
, float16_sqrt
)
3353 DO_ZPZ_FP(sve_fsqrt_s
, uint32_t, H1_4
, float32_sqrt
)
3354 DO_ZPZ_FP(sve_fsqrt_d
, uint64_t, , float64_sqrt
)
3356 DO_ZPZ_FP(sve_scvt_hh
, uint16_t, H1_2
, int16_to_float16
)
3357 DO_ZPZ_FP(sve_scvt_sh
, uint32_t, H1_4
, int32_to_float16
)
3358 DO_ZPZ_FP(sve_scvt_ss
, uint32_t, H1_4
, int32_to_float32
)
3359 DO_ZPZ_FP(sve_scvt_sd
, uint64_t, , int32_to_float64
)
3360 DO_ZPZ_FP(sve_scvt_dh
, uint64_t, , int64_to_float16
)
3361 DO_ZPZ_FP(sve_scvt_ds
, uint64_t, , int64_to_float32
)
3362 DO_ZPZ_FP(sve_scvt_dd
, uint64_t, , int64_to_float64
)
3364 DO_ZPZ_FP(sve_ucvt_hh
, uint16_t, H1_2
, uint16_to_float16
)
3365 DO_ZPZ_FP(sve_ucvt_sh
, uint32_t, H1_4
, uint32_to_float16
)
3366 DO_ZPZ_FP(sve_ucvt_ss
, uint32_t, H1_4
, uint32_to_float32
)
3367 DO_ZPZ_FP(sve_ucvt_sd
, uint64_t, , uint32_to_float64
)
3368 DO_ZPZ_FP(sve_ucvt_dh
, uint64_t, , uint64_to_float16
)
3369 DO_ZPZ_FP(sve_ucvt_ds
, uint64_t, , uint64_to_float32
)
3370 DO_ZPZ_FP(sve_ucvt_dd
, uint64_t, , uint64_to_float64
)
3374 /* 4-operand predicated multiply-add. This requires 7 operands to pass
3375 * "properly", so we need to encode some of the registers into DESC.
3377 QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT
+ 20 > 32);
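
/*
 * Note (descriptive, added for clarity): the fused multiply-add helpers
 * below recover the four z-register numbers rd, rn, rm, ra from
 * consecutive 5-bit fields of DESC starting at SIMD_DATA_SHIFT; the
 * build-time assertion above checks that those 20 bits still fit in the
 * 32-bit descriptor.
 */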
3379 static void do_fmla_zpzzz_h(CPUARMState
*env
, void *vg
, uint32_t desc
,
3380 uint16_t neg1
, uint16_t neg3
)
3382 intptr_t i
= simd_oprsz(desc
);
3383 unsigned rd
= extract32(desc
, SIMD_DATA_SHIFT
, 5);
3384 unsigned rn
= extract32(desc
, SIMD_DATA_SHIFT
+ 5, 5);
3385 unsigned rm
= extract32(desc
, SIMD_DATA_SHIFT
+ 10, 5);
3386 unsigned ra
= extract32(desc
, SIMD_DATA_SHIFT
+ 15, 5);
3387 void *vd
= &env
->vfp
.zregs
[rd
];
3388 void *vn
= &env
->vfp
.zregs
[rn
];
3389 void *vm
= &env
->vfp
.zregs
[rm
];
3390 void *va
= &env
->vfp
.zregs
[ra
];
3394 uint64_t pg
= g
[(i
- 1) >> 6];
3397 if (likely((pg
>> (i
& 63)) & 1)) {
3398 float16 e1
, e2
, e3
, r
;
3400 e1
= *(uint16_t *)(vn
+ H1_2(i
)) ^ neg1
;
3401 e2
= *(uint16_t *)(vm
+ H1_2(i
));
3402 e3
= *(uint16_t *)(va
+ H1_2(i
)) ^ neg3
;
3403 r
= float16_muladd(e1
, e2
, e3
, 0, &env
->vfp
.fp_status_f16
);
3404 *(uint16_t *)(vd
+ H1_2(i
)) = r
;
3410 void HELPER(sve_fmla_zpzzz_h
)(CPUARMState
*env
, void *vg
, uint32_t desc
)
3412 do_fmla_zpzzz_h(env
, vg
, desc
, 0, 0);
3415 void HELPER(sve_fmls_zpzzz_h
)(CPUARMState
*env
, void *vg
, uint32_t desc
)
3417 do_fmla_zpzzz_h(env
, vg
, desc
, 0x8000, 0);
3420 void HELPER(sve_fnmla_zpzzz_h
)(CPUARMState
*env
, void *vg
, uint32_t desc
)
3422 do_fmla_zpzzz_h(env
, vg
, desc
, 0x8000, 0x8000);
3425 void HELPER(sve_fnmls_zpzzz_h
)(CPUARMState
*env
, void *vg
, uint32_t desc
)
3427 do_fmla_zpzzz_h(env
, vg
, desc
, 0, 0x8000);
3430 static void do_fmla_zpzzz_s(CPUARMState
*env
, void *vg
, uint32_t desc
,
3431 uint32_t neg1
, uint32_t neg3
)
3433 intptr_t i
= simd_oprsz(desc
);
3434 unsigned rd
= extract32(desc
, SIMD_DATA_SHIFT
, 5);
3435 unsigned rn
= extract32(desc
, SIMD_DATA_SHIFT
+ 5, 5);
3436 unsigned rm
= extract32(desc
, SIMD_DATA_SHIFT
+ 10, 5);
3437 unsigned ra
= extract32(desc
, SIMD_DATA_SHIFT
+ 15, 5);
3438 void *vd
= &env
->vfp
.zregs
[rd
];
3439 void *vn
= &env
->vfp
.zregs
[rn
];
3440 void *vm
= &env
->vfp
.zregs
[rm
];
3441 void *va
= &env
->vfp
.zregs
[ra
];
3445 uint64_t pg
= g
[(i
- 1) >> 6];
3448 if (likely((pg
>> (i
& 63)) & 1)) {
3449 float32 e1
, e2
, e3
, r
;
3451 e1
= *(uint32_t *)(vn
+ H1_4(i
)) ^ neg1
;
3452 e2
= *(uint32_t *)(vm
+ H1_4(i
));
3453 e3
= *(uint32_t *)(va
+ H1_4(i
)) ^ neg3
;
3454 r
= float32_muladd(e1
, e2
, e3
, 0, &env
->vfp
.fp_status
);
3455 *(uint32_t *)(vd
+ H1_4(i
)) = r
;
3461 void HELPER(sve_fmla_zpzzz_s
)(CPUARMState
*env
, void *vg
, uint32_t desc
)
3463 do_fmla_zpzzz_s(env
, vg
, desc
, 0, 0);
3466 void HELPER(sve_fmls_zpzzz_s
)(CPUARMState
*env
, void *vg
, uint32_t desc
)
3468 do_fmla_zpzzz_s(env
, vg
, desc
, 0x80000000, 0);
3471 void HELPER(sve_fnmla_zpzzz_s
)(CPUARMState
*env
, void *vg
, uint32_t desc
)
3473 do_fmla_zpzzz_s(env
, vg
, desc
, 0x80000000, 0x80000000);
3476 void HELPER(sve_fnmls_zpzzz_s
)(CPUARMState
*env
, void *vg
, uint32_t desc
)
3478 do_fmla_zpzzz_s(env
, vg
, desc
, 0, 0x80000000);
3481 static void do_fmla_zpzzz_d(CPUARMState
*env
, void *vg
, uint32_t desc
,
3482 uint64_t neg1
, uint64_t neg3
)
3484 intptr_t i
= simd_oprsz(desc
);
3485 unsigned rd
= extract32(desc
, SIMD_DATA_SHIFT
, 5);
3486 unsigned rn
= extract32(desc
, SIMD_DATA_SHIFT
+ 5, 5);
3487 unsigned rm
= extract32(desc
, SIMD_DATA_SHIFT
+ 10, 5);
3488 unsigned ra
= extract32(desc
, SIMD_DATA_SHIFT
+ 15, 5);
3489 void *vd
= &env
->vfp
.zregs
[rd
];
3490 void *vn
= &env
->vfp
.zregs
[rn
];
3491 void *vm
= &env
->vfp
.zregs
[rm
];
3492 void *va
= &env
->vfp
.zregs
[ra
];
3496 uint64_t pg
= g
[(i
- 1) >> 6];
3499 if (likely((pg
>> (i
& 63)) & 1)) {
3500 float64 e1
, e2
, e3
, r
;
3502 e1
= *(uint64_t *)(vn
+ i
) ^ neg1
;
3503 e2
= *(uint64_t *)(vm
+ i
);
3504 e3
= *(uint64_t *)(va
+ i
) ^ neg3
;
3505 r
= float64_muladd(e1
, e2
, e3
, 0, &env
->vfp
.fp_status
);
3506 *(uint64_t *)(vd
+ i
) = r
;
3512 void HELPER(sve_fmla_zpzzz_d
)(CPUARMState
*env
, void *vg
, uint32_t desc
)
3514 do_fmla_zpzzz_d(env
, vg
, desc
, 0, 0);
3517 void HELPER(sve_fmls_zpzzz_d
)(CPUARMState
*env
, void *vg
, uint32_t desc
)
3519 do_fmla_zpzzz_d(env
, vg
, desc
, INT64_MIN
, 0);
3522 void HELPER(sve_fnmla_zpzzz_d
)(CPUARMState
*env
, void *vg
, uint32_t desc
)
3524 do_fmla_zpzzz_d(env
, vg
, desc
, INT64_MIN
, INT64_MIN
);
3527 void HELPER(sve_fnmls_zpzzz_d
)(CPUARMState
*env
, void *vg
, uint32_t desc
)
3529 do_fmla_zpzzz_d(env
, vg
, desc
, 0, INT64_MIN
);
3532 /* Two operand floating-point comparison controlled by a predicate.
3533 * Unlike the integer version, we are not allowed to optimistically
3534 * compare operands, since the comparison may have side effects wrt
3537 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
3538 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3539 void *status, uint32_t desc) \
3541 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3542 uint64_t *d = vd, *g = vg; \
3544 uint64_t out = 0, pg = g[j]; \
3546 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3547 if (likely((pg >> (i & 63)) & 1)) { \
3548 TYPE nn = *(TYPE *)(vn + H(i)); \
3549 TYPE mm = *(TYPE *)(vm + H(i)); \
3550 out |= OP(TYPE, nn, mm, status); \
3557 #define DO_FPCMP_PPZZ_H(NAME, OP) \
3558 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
3559 #define DO_FPCMP_PPZZ_S(NAME, OP) \
3560 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
3561 #define DO_FPCMP_PPZZ_D(NAME, OP) \
3562 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
3564 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
3565 DO_FPCMP_PPZZ_H(NAME, OP) \
3566 DO_FPCMP_PPZZ_S(NAME, OP) \
3567 DO_FPCMP_PPZZ_D(NAME, OP)
3569 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
3570 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
3571 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
3572 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
3573 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
3574 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
3575 #define DO_FCMUO(TYPE, X, Y, ST) \
3576 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
3577 #define DO_FACGE(TYPE, X, Y, ST) \
3578 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
3579 #define DO_FACGT(TYPE, X, Y, ST) \
3580 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
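
/*
 * Note (descriptive, added for clarity): the GE/GT forms above are written
 * as reversed comparisons (compare(Y, X) <= 0, < 0) on the signaling
 * compare, EQ/NE/UO use the quiet compare, and FACGE/FACGT compare the
 * absolute values of the operands.
 */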
3582 DO_FPCMP_PPZZ_ALL(sve_fcmge
, DO_FCMGE
)
3583 DO_FPCMP_PPZZ_ALL(sve_fcmgt
, DO_FCMGT
)
3584 DO_FPCMP_PPZZ_ALL(sve_fcmeq
, DO_FCMEQ
)
3585 DO_FPCMP_PPZZ_ALL(sve_fcmne
, DO_FCMNE
)
3586 DO_FPCMP_PPZZ_ALL(sve_fcmuo
, DO_FCMUO
)
3587 DO_FPCMP_PPZZ_ALL(sve_facge
, DO_FACGE
)
3588 DO_FPCMP_PPZZ_ALL(sve_facgt
, DO_FACGT
)
3590 #undef DO_FPCMP_PPZZ_ALL
3591 #undef DO_FPCMP_PPZZ_D
3592 #undef DO_FPCMP_PPZZ_S
3593 #undef DO_FPCMP_PPZZ_H
3594 #undef DO_FPCMP_PPZZ
3596 /* One operand floating-point comparison against zero, controlled
3599 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
3600 void HELPER(NAME)(void *vd, void *vn, void *vg, \
3601 void *status, uint32_t desc) \
3603 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3604 uint64_t *d = vd, *g = vg; \
3606 uint64_t out = 0, pg = g[j]; \
3608 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3609 if ((pg >> (i & 63)) & 1) { \
3610 TYPE nn = *(TYPE *)(vn + H(i)); \
3611 out |= OP(TYPE, nn, 0, status); \
3618 #define DO_FPCMP_PPZ0_H(NAME, OP) \
3619 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
3620 #define DO_FPCMP_PPZ0_S(NAME, OP) \
3621 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
3622 #define DO_FPCMP_PPZ0_D(NAME, OP) \
3623 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
3625 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
3626 DO_FPCMP_PPZ0_H(NAME, OP) \
3627 DO_FPCMP_PPZ0_S(NAME, OP) \
3628 DO_FPCMP_PPZ0_D(NAME, OP)
3630 DO_FPCMP_PPZ0_ALL(sve_fcmge0
, DO_FCMGE
)
3631 DO_FPCMP_PPZ0_ALL(sve_fcmgt0
, DO_FCMGT
)
3632 DO_FPCMP_PPZ0_ALL(sve_fcmle0
, DO_FCMLE
)
3633 DO_FPCMP_PPZ0_ALL(sve_fcmlt0
, DO_FCMLT
)
3634 DO_FPCMP_PPZ0_ALL(sve_fcmeq0
, DO_FCMEQ
)
3635 DO_FPCMP_PPZ0_ALL(sve_fcmne0
, DO_FCMNE
)
3637 /* FP Trig Multiply-Add. */
3639 void HELPER(sve_ftmad_h
)(void *vd
, void *vn
, void *vm
, void *vs
, uint32_t desc
)
3641 static const float16 coeff
[16] = {
3642 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3643 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3645 intptr_t i
, opr_sz
= simd_oprsz(desc
) / sizeof(float16
);
3646 intptr_t x
= simd_data(desc
);
3647 float16
*d
= vd
, *n
= vn
, *m
= vm
;
3648 for (i
= 0; i
< opr_sz
; i
++) {
3651 if (float16_is_neg(mm
)) {
3652 mm
= float16_abs(mm
);
3655 d
[i
] = float16_muladd(n
[i
], mm
, coeff
[xx
], 0, vs
);
3659 void HELPER(sve_ftmad_s
)(void *vd
, void *vn
, void *vm
, void *vs
, uint32_t desc
)
3661 static const float32 coeff
[16] = {
3662 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3663 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3664 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3665 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3667 intptr_t i
, opr_sz
= simd_oprsz(desc
) / sizeof(float32
);
3668 intptr_t x
= simd_data(desc
);
3669 float32
*d
= vd
, *n
= vn
, *m
= vm
;
3670 for (i
= 0; i
< opr_sz
; i
++) {
3673 if (float32_is_neg(mm
)) {
3674 mm
= float32_abs(mm
);
3677 d
[i
] = float32_muladd(n
[i
], mm
, coeff
[xx
], 0, vs
);
3681 void HELPER(sve_ftmad_d
)(void *vd
, void *vn
, void *vm
, void *vs
, uint32_t desc
)
3683 static const float64 coeff
[16] = {
3684 0x3ff0000000000000ull
, 0xbfc5555555555543ull
,
3685 0x3f8111111110f30cull
, 0xbf2a01a019b92fc6ull
,
3686 0x3ec71de351f3d22bull
, 0xbe5ae5e2b60f7b91ull
,
3687 0x3de5d8408868552full
, 0x0000000000000000ull
,
3688 0x3ff0000000000000ull
, 0xbfe0000000000000ull
,
3689 0x3fa5555555555536ull
, 0xbf56c16c16c13a0bull
,
3690 0x3efa01a019b1e8d8ull
, 0xbe927e4f7282f468ull
,
3691 0x3e21ee96d2641b13ull
, 0xbda8f76380fbb401ull
,
3693 intptr_t i
, opr_sz
= simd_oprsz(desc
) / sizeof(float64
);
3694 intptr_t x
= simd_data(desc
);
3695 float64
*d
= vd
, *n
= vn
, *m
= vm
;
3696 for (i
= 0; i
< opr_sz
; i
++) {
3699 if (float64_is_neg(mm
)) {
3700 mm
= float64_abs(mm
);
3703 d
[i
] = float64_muladd(n
[i
], mm
, coeff
[xx
], 0, vs
);
3711 void HELPER(sve_fcadd_h
)(void *vd
, void *vn
, void *vm
, void *vg
,
3712 void *vs
, uint32_t desc
)
3714 intptr_t j
, i
= simd_oprsz(desc
);
3716 float16 neg_imag
= float16_set_sign(0, simd_data(desc
));
3717 float16 neg_real
= float16_chs(neg_imag
);
3720 uint64_t pg
= g
[(i
- 1) >> 6];
3722 float16 e0
, e1
, e2
, e3
;
3724 /* I holds the real index; J holds the imag index. */
3725 j
= i
- sizeof(float16
);
3726 i
-= 2 * sizeof(float16
);
3728 e0
= *(float16
*)(vn
+ H1_2(i
));
3729 e1
= *(float16
*)(vm
+ H1_2(j
)) ^ neg_real
;
3730 e2
= *(float16
*)(vn
+ H1_2(j
));
3731 e3
= *(float16
*)(vm
+ H1_2(i
)) ^ neg_imag
;
3733 if (likely((pg
>> (i
& 63)) & 1)) {
3734 *(float16
*)(vd
+ H1_2(i
)) = float16_add(e0
, e1
, vs
);
3736 if (likely((pg
>> (j
& 63)) & 1)) {
3737 *(float16
*)(vd
+ H1_2(j
)) = float16_add(e2
, e3
, vs
);
3743 void HELPER(sve_fcadd_s
)(void *vd
, void *vn
, void *vm
, void *vg
,
3744 void *vs
, uint32_t desc
)
3746 intptr_t j
, i
= simd_oprsz(desc
);
3748 float32 neg_imag
= float32_set_sign(0, simd_data(desc
));
3749 float32 neg_real
= float32_chs(neg_imag
);
3752 uint64_t pg
= g
[(i
- 1) >> 6];
3754 float32 e0
, e1
, e2
, e3
;
3756 /* I holds the real index; J holds the imag index. */
3757 j
= i
- sizeof(float32
);
3758 i
-= 2 * sizeof(float32
);
3760 e0
= *(float32
*)(vn
+ H1_2(i
));
3761 e1
= *(float32
*)(vm
+ H1_2(j
)) ^ neg_real
;
3762 e2
= *(float32
*)(vn
+ H1_2(j
));
3763 e3
= *(float32
*)(vm
+ H1_2(i
)) ^ neg_imag
;
3765 if (likely((pg
>> (i
& 63)) & 1)) {
3766 *(float32
*)(vd
+ H1_2(i
)) = float32_add(e0
, e1
, vs
);
3768 if (likely((pg
>> (j
& 63)) & 1)) {
3769 *(float32
*)(vd
+ H1_2(j
)) = float32_add(e2
, e3
, vs
);
3775 void HELPER(sve_fcadd_d
)(void *vd
, void *vn
, void *vm
, void *vg
,
3776 void *vs
, uint32_t desc
)
3778 intptr_t j
, i
= simd_oprsz(desc
);
3780 float64 neg_imag
= float64_set_sign(0, simd_data(desc
));
3781 float64 neg_real
= float64_chs(neg_imag
);
3784 uint64_t pg
= g
[(i
- 1) >> 6];
3786 float64 e0
, e1
, e2
, e3
;
3788 /* I holds the real index; J holds the imag index. */
3789 j
= i
- sizeof(float64
);
3790 i
-= 2 * sizeof(float64
);
3792 e0
= *(float64
*)(vn
+ H1_2(i
));
3793 e1
= *(float64
*)(vm
+ H1_2(j
)) ^ neg_real
;
3794 e2
= *(float64
*)(vn
+ H1_2(j
));
3795 e3
= *(float64
*)(vm
+ H1_2(i
)) ^ neg_imag
;
3797 if (likely((pg
>> (i
& 63)) & 1)) {
3798 *(float64
*)(vd
+ H1_2(i
)) = float64_add(e0
, e1
, vs
);
3800 if (likely((pg
>> (j
& 63)) & 1)) {
3801 *(float64
*)(vd
+ H1_2(j
)) = float64_add(e2
, e3
, vs
);
3808 * FP Complex Multiply
3811 QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT
+ 22 > 32);
3813 void HELPER(sve_fcmla_zpzzz_h
)(CPUARMState
*env
, void *vg
, uint32_t desc
)
3815 intptr_t j
, i
= simd_oprsz(desc
);
    unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
    unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
    unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
    unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
    unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
    bool flip = rot & 1;
    float16 neg_imag, neg_real;
    void *vd = &env->vfp.zregs[rd];
    void *vn = &env->vfp.zregs[rn];
    void *vm = &env->vfp.zregs[rm];
    void *va = &env->vfp.zregs[ra];
    uint64_t *g = vg;

    neg_imag = float16_set_sign(0, (rot & 2) != 0);
    neg_real = float16_set_sign(0, rot == 1 || rot == 2);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float16 e1, e2, e3, e4, nr, ni, mr, mi, d;

            /* I holds the real index; J holds the imag index.  */
            j = i - sizeof(float16);
            i -= 2 * sizeof(float16);

            nr = *(float16 *)(vn + H1_2(i));
            ni = *(float16 *)(vn + H1_2(j));
            mr = *(float16 *)(vm + H1_2(i));
            mi = *(float16 *)(vm + H1_2(j));

            e2 = (flip ? ni : nr);
            e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                d = *(float16 *)(va + H1_2(i));
                d = float16_muladd(e2, e1, d, 0, &env->vfp.fp_status_f16);
                *(float16 *)(vd + H1_2(i)) = d;
            }
            if (likely((pg >> (j & 63)) & 1)) {
                d = *(float16 *)(va + H1_2(j));
                d = float16_muladd(e4, e3, d, 0, &env->vfp.fp_status_f16);
                *(float16 *)(vd + H1_2(j)) = d;
            }
        } while (i & 63);
    } while (i != 0);
}

void HELPER(sve_fcmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
    unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
    unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
    unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
    unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
    bool flip = rot & 1;
    float32 neg_imag, neg_real;
    void *vd = &env->vfp.zregs[rd];
    void *vn = &env->vfp.zregs[rn];
    void *vm = &env->vfp.zregs[rm];
    void *va = &env->vfp.zregs[ra];
    uint64_t *g = vg;

    neg_imag = float32_set_sign(0, (rot & 2) != 0);
    neg_real = float32_set_sign(0, rot == 1 || rot == 2);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float32 e1, e2, e3, e4, nr, ni, mr, mi, d;

            /* I holds the real index; J holds the imag index.  */
            j = i - sizeof(float32);
            i -= 2 * sizeof(float32);

            nr = *(float32 *)(vn + H1_2(i));
            ni = *(float32 *)(vn + H1_2(j));
            mr = *(float32 *)(vm + H1_2(i));
            mi = *(float32 *)(vm + H1_2(j));

            e2 = (flip ? ni : nr);
            e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                d = *(float32 *)(va + H1_2(i));
                d = float32_muladd(e2, e1, d, 0, &env->vfp.fp_status);
                *(float32 *)(vd + H1_2(i)) = d;
            }
            if (likely((pg >> (j & 63)) & 1)) {
                d = *(float32 *)(va + H1_2(j));
                d = float32_muladd(e4, e3, d, 0, &env->vfp.fp_status);
                *(float32 *)(vd + H1_2(j)) = d;
            }
        } while (i & 63);
    } while (i != 0);
}

void HELPER(sve_fcmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
    unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
    unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
    unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
    unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
    bool flip = rot & 1;
    float64 neg_imag, neg_real;
    void *vd = &env->vfp.zregs[rd];
    void *vn = &env->vfp.zregs[rn];
    void *vm = &env->vfp.zregs[rm];
    void *va = &env->vfp.zregs[ra];
    uint64_t *g = vg;

    neg_imag = float64_set_sign(0, (rot & 2) != 0);
    neg_real = float64_set_sign(0, rot == 1 || rot == 2);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float64 e1, e2, e3, e4, nr, ni, mr, mi, d;

            /* I holds the real index; J holds the imag index.  */
            j = i - sizeof(float64);
            i -= 2 * sizeof(float64);

            nr = *(float64 *)(vn + H1_2(i));
            ni = *(float64 *)(vn + H1_2(j));
            mr = *(float64 *)(vm + H1_2(i));
            mi = *(float64 *)(vm + H1_2(j));

            e2 = (flip ? ni : nr);
            e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                d = *(float64 *)(va + H1_2(i));
                d = float64_muladd(e2, e1, d, 0, &env->vfp.fp_status);
                *(float64 *)(vd + H1_2(i)) = d;
            }
            if (likely((pg >> (j & 63)) & 1)) {
                d = *(float64 *)(va + H1_2(j));
                d = float64_muladd(e4, e3, d, 0, &env->vfp.fp_status);
                *(float64 *)(vd + H1_2(j)) = d;
            }
        } while (i & 63);
    } while (i != 0);
}
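/*
 * Illustrative note on the rotation handling above, derived from the decode:
 * FLIP (rot & 1) chooses whether the real or imaginary lane of Zn/Zm feeds
 * each product, NEG_REAL carries a sign bit for rot == 1 or rot == 2, and
 * NEG_IMAG for rot == 2 or rot == 3.  For example, rot == 1 gives
 * flip == true with the sign applied only to the e1 factor.
 */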
/*
 * Load contiguous data, protected by a governing predicate.
 */

/*
 * Load elements into @vd, controlled by @vg, from @host + @mem_ofs.
 * Memory is valid through @host + @mem_max.  The register element
 * indices are inferred from @mem_ofs, as modified by the types for
 * which the helper is built.  Return the @mem_ofs of the first element
 * not loaded (which is @mem_max if they are all loaded).
 *
 * For softmmu, we have fully validated the guest page.  For user-only,
 * we cannot fully validate without taking the mmap lock, but since we
 * know the access is within one host page, if any access is valid they
 * all must be valid.  However, when @vg is all false, it may be that
 * no access is valid.
 */
typedef intptr_t sve_ld1_host_fn(void *vd, void *vg, void *host,
                                 intptr_t mem_ofs, intptr_t mem_max);

/*
 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
 * The controlling predicate is known to be true.
 */
typedef void sve_ld1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
                            target_ulong vaddr, TCGMemOpIdx oi, uintptr_t ra);
typedef sve_ld1_tlb_fn sve_st1_tlb_fn;
/*
 * Generate the above primitives.
 */

#define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
static intptr_t sve_##NAME##_host(void *vd, void *vg, void *host,           \
                                  intptr_t mem_off, const intptr_t mem_max) \
{                                                                           \
    intptr_t reg_off = mem_off * (sizeof(TYPEE) / sizeof(TYPEM));           \
    uint64_t *pg = vg;                                                      \
    while (mem_off + sizeof(TYPEM) <= mem_max) {                            \
        TYPEM val = 0;                                                      \
        if (likely((pg[reg_off >> 6] >> (reg_off & 63)) & 1)) {             \
            val = HOST(host + mem_off);                                     \
        }                                                                   \
        *(TYPEE *)(vd + H(reg_off)) = val;                                  \
        mem_off += sizeof(TYPEM), reg_off += sizeof(TYPEE);                 \
    }                                                                       \
    return mem_off;                                                         \
}

#ifdef CONFIG_SOFTMMU
#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, HOST, MOEND, TLB) \
static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off,    \
                             target_ulong addr, TCGMemOpIdx oi, uintptr_t ra) \
{                                                                             \
    TYPEM val = TLB(env, addr, oi, ra);                                       \
    *(TYPEE *)(vd + H(reg_off)) = val;                                        \
}
#else
#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, HOST, MOEND, TLB) \
static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off,    \
                             target_ulong addr, TCGMemOpIdx oi, uintptr_t ra) \
{                                                                             \
    TYPEM val = HOST(g2h(addr));                                              \
    *(TYPEE *)(vd + H(reg_off)) = val;                                        \
}
#endif

#define DO_LD_PRIM_1(NAME, H, TE, TM)                   \
    DO_LD_HOST(NAME, H, TE, TM, ldub_p)                 \
    DO_LD_TLB(NAME, H, TE, TM, ldub_p, 0, helper_ret_ldub_mmu)
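/*
 * For illustration: DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t) below
 * roughly expands to a pair of primitives, sve_ld1bhu_host() and
 * sve_ld1bhu_tlb(), each of which loads one byte from memory (ldub_p or
 * helper_ret_ldub_mmu) and zero-extends it into a 16-bit register element
 * addressed via the H1_2 host-endian fixup.
 */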
DO_LD_PRIM_1(ld1bb,  H1,   uint8_t,  uint8_t)
DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t,  int8_t)
DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
DO_LD_PRIM_1(ld1bss, H1_4, uint32_t,  int8_t)
DO_LD_PRIM_1(ld1bdu,     , uint64_t, uint8_t)
DO_LD_PRIM_1(ld1bds,     , uint64_t,  int8_t)

#define DO_LD_PRIM_2(NAME, end, MOEND, H, TE, TM, PH, PT)  \
    DO_LD_HOST(NAME##_##end, H, TE, TM, PH##_##end##_p)    \
    DO_LD_TLB(NAME##_##end, H, TE, TM, PH##_##end##_p,     \
              MOEND, helper_##end##_##PT##_mmu)

DO_LD_PRIM_2(ld1hh,  le, MO_LE, H1_2, uint16_t, uint16_t, lduw, lduw)
DO_LD_PRIM_2(ld1hsu, le, MO_LE, H1_4, uint32_t, uint16_t, lduw, lduw)
DO_LD_PRIM_2(ld1hss, le, MO_LE, H1_4, uint32_t,  int16_t, lduw, lduw)
DO_LD_PRIM_2(ld1hdu, le, MO_LE,     , uint64_t, uint16_t, lduw, lduw)
DO_LD_PRIM_2(ld1hds, le, MO_LE,     , uint64_t,  int16_t, lduw, lduw)

DO_LD_PRIM_2(ld1ss,  le, MO_LE, H1_4, uint32_t, uint32_t, ldl, ldul)
DO_LD_PRIM_2(ld1sdu, le, MO_LE,     , uint64_t, uint32_t, ldl, ldul)
DO_LD_PRIM_2(ld1sds, le, MO_LE,     , uint64_t,  int32_t, ldl, ldul)

DO_LD_PRIM_2(ld1dd,  le, MO_LE,     , uint64_t, uint64_t, ldq, ldq)

DO_LD_PRIM_2(ld1hh,  be, MO_BE, H1_2, uint16_t, uint16_t, lduw, lduw)
DO_LD_PRIM_2(ld1hsu, be, MO_BE, H1_4, uint32_t, uint16_t, lduw, lduw)
DO_LD_PRIM_2(ld1hss, be, MO_BE, H1_4, uint32_t,  int16_t, lduw, lduw)
DO_LD_PRIM_2(ld1hdu, be, MO_BE,     , uint64_t, uint16_t, lduw, lduw)
DO_LD_PRIM_2(ld1hds, be, MO_BE,     , uint64_t,  int16_t, lduw, lduw)

DO_LD_PRIM_2(ld1ss,  be, MO_BE, H1_4, uint32_t, uint32_t, ldl, ldul)
DO_LD_PRIM_2(ld1sdu, be, MO_BE,     , uint64_t, uint32_t, ldl, ldul)
DO_LD_PRIM_2(ld1sds, be, MO_BE,     , uint64_t,  int32_t, ldl, ldul)

DO_LD_PRIM_2(ld1dd,  be, MO_BE,     , uint64_t, uint64_t, ldq, ldq)
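/*
 * Naming convention used above: ld1<mem><reg><sign>, where the first letter
 * gives the memory element size (b/h/s/d), the second the register element
 * size, and a trailing 'u' or 's' marks zero- vs sign-extension; a _le or
 * _be suffix gives the memory endianness of the access.
 */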
/*
 * Skip through a sequence of inactive elements in the guarding predicate @vg,
 * beginning at @reg_off bounded by @reg_max.  Return the offset of the active
 * element >= @reg_off, or @reg_max if there were no active elements at all.
 */
static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
                                 intptr_t reg_max, int esz)
{
    uint64_t pg_mask = pred_esz_masks[esz];
    uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);

    /* In normal usage, the first element is active.  */
    if (likely(pg & 1)) {
        return reg_off;
    }

    if (pg == 0) {
        reg_off &= -64;
        do {
            reg_off += 64;
            if (unlikely(reg_off >= reg_max)) {
                /* The entire predicate was false.  */
                return reg_max;
            }
            pg = vg[reg_off >> 6] & pg_mask;
        } while (pg == 0);
    }
    reg_off += ctz64(pg);

    /* We should never see an out of range predicate bit set.  */
    tcg_debug_assert(reg_off < reg_max);
    return reg_off;
}
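/*
 * Worked example, derived from the code above: with esz == 2 (32-bit
 * elements) pred_esz_masks[esz] keeps every fourth predicate bit.  If
 * vg[0] == 0x10 and reg_off == 0, the first test fails (bit 0 clear),
 * pg is non-zero, and ctz64(0x10) == 4, so the function returns byte
 * offset 4, i.e. the second 32-bit element.
 */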
/*
 * Return the maximum offset <= @mem_max which is still within the page
 * referenced by @base + @mem_off.
 */
static intptr_t max_for_page(target_ulong base, intptr_t mem_off,
                             intptr_t mem_max)
{
    target_ulong addr = base + mem_off;
    intptr_t split = -(intptr_t)(addr | TARGET_PAGE_MASK);
    return MIN(split, mem_max - mem_off) + mem_off;
}

static inline void set_helper_retaddr(uintptr_t ra)
{
#ifdef CONFIG_USER_ONLY
    helper_retaddr = ra;
#endif
}

/*
 * The result of tlb_vaddr_to_host for user-only is just g2h(x),
 * which is always non-null.  Elide the useless test.
 */
static inline bool test_host_page(void *host)
{
#ifdef CONFIG_USER_ONLY
    return true;
#else
    return likely(host != NULL);
#endif
}
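/*
 * Example, assuming 4 KiB target pages: if @base + @mem_off ends in 0xff0,
 * then -(intptr_t)(addr | TARGET_PAGE_MASK) == 16, the bytes remaining on
 * the current page; with @mem_max - @mem_off == 64 the function returns
 * @mem_off + 16, so the caller splits the access at the page boundary.
 */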
/*
 * Common helper for all contiguous one-register predicated loads.
 */
static void sve_ld1_r(CPUARMState *env, void *vg, const target_ulong addr,
                      uint32_t desc, const uintptr_t retaddr,
                      const int esz, const int msz,
                      sve_ld1_host_fn *host_fn,
                      sve_ld1_tlb_fn *tlb_fn)
{
    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
    const int mmu_idx = get_mmuidx(oi);
    const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
    void *vd = &env->vfp.zregs[rd];
    const int diffsz = esz - msz;
    const intptr_t reg_max = simd_oprsz(desc);
    const intptr_t mem_max = reg_max >> diffsz;
    ARMVectorReg scratch;
    void *host;
    intptr_t split, reg_off, mem_off;

    /* Find the first active element.  */
    reg_off = find_next_active(vg, 0, reg_max, esz);
    if (unlikely(reg_off == reg_max)) {
        /* The entire predicate was false; no load occurs.  */
        memset(vd, 0, reg_max);
        return;
    }
    mem_off = reg_off >> diffsz;
    set_helper_retaddr(retaddr);

    /*
     * If the (remaining) load is entirely within a single page, then:
     * For softmmu, if the tlb hits, then no faults will occur;
     * For user-only, either the first load will fault or none will.
     * We can thus perform the load directly to the destination and
     * Vd will be unmodified on any exception path.
     */
    split = max_for_page(addr, mem_off, mem_max);
    if (likely(split == mem_max)) {
        host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
        if (test_host_page(host)) {
            mem_off = host_fn(vd, vg, host - mem_off, mem_off, mem_max);
            tcg_debug_assert(mem_off == mem_max);
            set_helper_retaddr(0);
            /* After having taken any fault, zero leading inactive elements. */
            swap_memzero(vd, reg_off);
            return;
        }
    }

    /*
     * Perform the predicated read into a temporary, thus ensuring
     * if the load of the last element faults, Vd is not modified.
     */
#ifdef CONFIG_USER_ONLY
    swap_memzero(&scratch, reg_off);
    host_fn(&scratch, vg, g2h(addr), mem_off, mem_max);
#else
    memset(&scratch, 0, reg_max);
    goto start;
    while (1) {
        reg_off = find_next_active(vg, reg_off, reg_max, esz);
        if (reg_off >= reg_max) {
            break;
        }
        mem_off = reg_off >> diffsz;
        split = max_for_page(addr, mem_off, mem_max);

    start:
        if (split - mem_off >= (1 << msz)) {
            /* At least one whole element on this page.  */
            host = tlb_vaddr_to_host(env, addr + mem_off,
                                     MMU_DATA_LOAD, mmu_idx);
            if (host) {
                mem_off = host_fn(&scratch, vg, host - mem_off,
                                  mem_off, split);
                reg_off = mem_off << diffsz;
                continue;
            }
        }

        /*
         * Perform one normal read.  This may fault, longjmping out to the
         * main loop in order to raise an exception.  It may succeed, and
         * as a side-effect load the TLB entry for the next round.  Finally,
         * in the extremely unlikely case we're performing this operation
         * on I/O memory, it may succeed but not bring in the TLB entry.
         * But even then we have still made forward progress.
         */
        tlb_fn(env, &scratch, reg_off, addr + mem_off, oi, retaddr);
        reg_off += 1 << esz;
    }
#endif

    set_helper_retaddr(0);
    memcpy(vd, &scratch, reg_max);
}
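/*
 * A note on the host_fn calling convention used above: the host pointer is
 * passed pre-biased by -mem_off, so that "host + mem_off" inside the
 * primitive addresses the current element and a single mem_off cursor can
 * walk both the guest memory window and (after scaling) the register file.
 */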
#define DO_LD1_1(NAME, ESZ) \
void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,        \
                            target_ulong addr, uint32_t desc)  \
{                                                              \
    sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, 0,            \
              sve_##NAME##_host, sve_##NAME##_tlb);            \
}

#define DO_LD1_2(NAME, ESZ, MSZ) \
void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,        \
                               target_ulong addr, uint32_t desc)  \
{                                                                 \
    sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ,             \
              sve_##NAME##_le_host, sve_##NAME##_le_tlb);         \
}                                                                 \
void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,        \
                               target_ulong addr, uint32_t desc)  \
{                                                                 \
    sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ,             \
              sve_##NAME##_be_host, sve_##NAME##_be_tlb);         \
}

DO_LD1_1(ld1bb,  0)
DO_LD1_1(ld1bhu, 1)
DO_LD1_1(ld1bhs, 1)
DO_LD1_1(ld1bsu, 2)
DO_LD1_1(ld1bss, 2)
DO_LD1_1(ld1bdu, 3)
DO_LD1_1(ld1bds, 3)

DO_LD1_2(ld1hh,  1, 1)
DO_LD1_2(ld1hsu, 2, 1)
DO_LD1_2(ld1hss, 2, 1)
DO_LD1_2(ld1hdu, 3, 1)
DO_LD1_2(ld1hds, 3, 1)

DO_LD1_2(ld1ss,  2, 2)
DO_LD1_2(ld1sdu, 3, 2)
DO_LD1_2(ld1sds, 3, 2)

DO_LD1_2(ld1dd,  3, 3)
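/*
 * ESZ and MSZ above are log2 sizes: e.g. DO_LD1_2(ld1hsu, 2, 1) builds the
 * helpers that load 2-byte (MSZ = 1) memory elements into 4-byte (ESZ = 2)
 * register elements, in both little- and big-endian forms.
 */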
/*
 * Common helpers for all contiguous 2,3,4-register predicated loads.
 */
static void sve_ld2_r(CPUARMState *env, void *vg, target_ulong addr,
                      uint32_t desc, int size, uintptr_t ra,
                      sve_ld1_tlb_fn *tlb_fn)
{
    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
    const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
    intptr_t i, oprsz = simd_oprsz(desc);
    ARMVectorReg scratch[2] = { };

    set_helper_retaddr(ra);
    for (i = 0; i < oprsz; ) {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                tlb_fn(env, &scratch[0], i, addr, oi, ra);
                tlb_fn(env, &scratch[1], i, addr + size, oi, ra);
            }
            i += size, pg >>= size;
            addr += 2 * size;
        } while (i & 15);
    }
    set_helper_retaddr(0);

    /* Wait until all exceptions have been raised to write back.  */
    memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
    memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
}

static void sve_ld3_r(CPUARMState *env, void *vg, target_ulong addr,
                      uint32_t desc, int size, uintptr_t ra,
                      sve_ld1_tlb_fn *tlb_fn)
{
    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
    const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
    intptr_t i, oprsz = simd_oprsz(desc);
    ARMVectorReg scratch[3] = { };

    set_helper_retaddr(ra);
    for (i = 0; i < oprsz; ) {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                tlb_fn(env, &scratch[0], i, addr, oi, ra);
                tlb_fn(env, &scratch[1], i, addr + size, oi, ra);
                tlb_fn(env, &scratch[2], i, addr + 2 * size, oi, ra);
            }
            i += size, pg >>= size;
            addr += 3 * size;
        } while (i & 15);
    }
    set_helper_retaddr(0);

    /* Wait until all exceptions have been raised to write back.  */
    memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
    memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
    memcpy(&env->vfp.zregs[(rd + 2) & 31], &scratch[2], oprsz);
}

static void sve_ld4_r(CPUARMState *env, void *vg, target_ulong addr,
                      uint32_t desc, int size, uintptr_t ra,
                      sve_ld1_tlb_fn *tlb_fn)
{
    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
    const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
    intptr_t i, oprsz = simd_oprsz(desc);
    ARMVectorReg scratch[4] = { };

    set_helper_retaddr(ra);
    for (i = 0; i < oprsz; ) {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                tlb_fn(env, &scratch[0], i, addr, oi, ra);
                tlb_fn(env, &scratch[1], i, addr + size, oi, ra);
                tlb_fn(env, &scratch[2], i, addr + 2 * size, oi, ra);
                tlb_fn(env, &scratch[3], i, addr + 3 * size, oi, ra);
            }
            i += size, pg >>= size;
            addr += 4 * size;
        } while (i & 15);
    }
    set_helper_retaddr(0);

    /* Wait until all exceptions have been raised to write back.  */
    memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
    memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
    memcpy(&env->vfp.zregs[(rd + 2) & 31], &scratch[2], oprsz);
    memcpy(&env->vfp.zregs[(rd + 3) & 31], &scratch[3], oprsz);
}

#define DO_LDN_1(N) \
void QEMU_FLATTEN HELPER(sve_ld##N##bb_r)                           \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc)  \
{                                                                   \
    sve_ld##N##_r(env, vg, addr, desc, 1, GETPC(), sve_ld1bb_tlb);  \
}

#define DO_LDN_2(N, SUFF, SIZE) \
void QEMU_FLATTEN HELPER(sve_ld##N##SUFF##_le_r)                    \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc)  \
{                                                                   \
    sve_ld##N##_r(env, vg, addr, desc, SIZE, GETPC(),               \
                  sve_ld1##SUFF##_le_tlb);                          \
}                                                                   \
void QEMU_FLATTEN HELPER(sve_ld##N##SUFF##_be_r)                    \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc)  \
{                                                                   \
    sve_ld##N##_r(env, vg, addr, desc, SIZE, GETPC(),               \
                  sve_ld1##SUFF##_be_tlb);                          \
}

DO_LDN_1(2)
DO_LDN_1(3)
DO_LDN_1(4)

DO_LDN_2(2, hh, 2)
DO_LDN_2(3, hh, 2)
DO_LDN_2(4, hh, 2)

DO_LDN_2(2, ss, 4)
DO_LDN_2(3, ss, 4)
DO_LDN_2(4, ss, 4)

DO_LDN_2(2, dd, 8)
DO_LDN_2(3, dd, 8)
DO_LDN_2(4, dd, 8)
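/*
 * Predicate stepping in the helpers above: SVE predicates hold one bit per
 * byte of vector, so each 16-bit chunk of @vg covers 16 bytes of register.
 * "i += size, pg >>= size" advances one element (of @size bytes) at a time,
 * and the "while (i & 15)" inner loop consumes exactly one such chunk before
 * the next uint16_t of the predicate is fetched.
 */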
/*
 * Load contiguous data, first-fault and no-fault.
 *
 * For user-only, one could argue that we should hold the mmap_lock during
 * the operation so that there is no race between page_check_range and the
 * load operation.  However, unmapping pages out from under a running thread
 * is extraordinarily unlikely.  This theoretical race condition also affects
 * linux-user/ in its get_user/put_user macros.
 *
 * TODO: Construct some helpers, written in assembly, that interact with
 * handle_cpu_signal to produce memory ops which can properly report errors
 * without racing.
 */

/* Fault on byte I.  All bits in FFR from I are cleared.  The vector
 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
 * option, which leaves subsequent data unchanged.
 */
static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
{
    uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;

    if (i & 63) {
        ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
        i = ROUND_UP(i, 64);
    }
    for (; i < oprsz; i += 64) {
        ffr[i / 64] = 0;
    }
}
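/*
 * Example: record_fault(env, 70, 256) keeps FFR bits 64..69 of word 1,
 * clears bits 70..127 via the MAKE_64BIT_MASK(0, 6) step, rounds i up to
 * 128, and then zeroes FFR words 2 and 3 -- everything from the faulting
 * byte onward is now inactive.
 */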
/*
 * Common helper for all contiguous first-fault loads.
 */
static void sve_ldff1_r(CPUARMState *env, void *vg, const target_ulong addr,
                        uint32_t desc, const uintptr_t retaddr,
                        const int esz, const int msz,
                        sve_ld1_host_fn *host_fn,
                        sve_ld1_tlb_fn *tlb_fn)
{
    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
    const int mmu_idx = get_mmuidx(oi);
    const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
    void *vd = &env->vfp.zregs[rd];
    const int diffsz = esz - msz;
    const intptr_t reg_max = simd_oprsz(desc);
    const intptr_t mem_max = reg_max >> diffsz;
    intptr_t split, reg_off, mem_off;
    void *host;

    /* Skip to the first active element.  */
    reg_off = find_next_active(vg, 0, reg_max, esz);
    if (unlikely(reg_off == reg_max)) {
        /* The entire predicate was false; no load occurs.  */
        memset(vd, 0, reg_max);
        return;
    }
    mem_off = reg_off >> diffsz;
    set_helper_retaddr(retaddr);

    /*
     * If the (remaining) load is entirely within a single page, then:
     * For softmmu, if the tlb hits, then no faults will occur;
     * For user-only, either the first load will fault or none will.
     * We can thus perform the load directly to the destination and
     * Vd will be unmodified on any exception path.
     */
    split = max_for_page(addr, mem_off, mem_max);
    if (likely(split == mem_max)) {
        host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
        if (test_host_page(host)) {
            mem_off = host_fn(vd, vg, host - mem_off, mem_off, mem_max);
            tcg_debug_assert(mem_off == mem_max);
            set_helper_retaddr(0);
            /* After any fault, zero any leading inactive elements.  */
            swap_memzero(vd, reg_off);
            return;
        }
    }

#ifdef CONFIG_USER_ONLY
    /*
     * The page(s) containing this first element at ADDR+MEM_OFF must
     * be valid.  Considering that this first element may be misaligned
     * and cross a page boundary itself, take the rest of the page from
     * the last byte of the element.
     */
    split = max_for_page(addr, mem_off + (1 << msz) - 1, mem_max);
    mem_off = host_fn(vd, vg, g2h(addr), mem_off, split);

    /* After any fault, zero any leading inactive elements.  */
    swap_memzero(vd, reg_off);
    reg_off = mem_off << diffsz;
#else
    /*
     * Perform one normal read, which will fault or not.
     * But it is likely to bring the page into the tlb.
     */
    tlb_fn(env, vd, reg_off, addr + mem_off, oi, retaddr);

    /* After any fault, zero any leading predicated false elts.  */
    swap_memzero(vd, reg_off);
    mem_off += 1 << msz;
    reg_off += 1 << esz;

    /* Try again to read the balance of the page.  */
    split = max_for_page(addr, mem_off - 1, mem_max);
    if (split >= (1 << msz)) {
        host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
        if (host) {
            mem_off = host_fn(vd, vg, host - mem_off, mem_off, split);
            reg_off = mem_off << diffsz;
        }
    }
#endif

    set_helper_retaddr(0);
    record_fault(env, reg_off, reg_max);
}
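/*
 * First-fault contract implemented above: only the first active element is
 * read with a faulting primitive; the remaining elements are read only as
 * far as they can be reached without faulting, and record_fault() clears
 * FFR from the first element not loaded onward.
 */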
/*
 * Common helper for all contiguous no-fault loads.
 */
static void sve_ldnf1_r(CPUARMState *env, void *vg, const target_ulong addr,
                        uint32_t desc, const int esz, const int msz,
                        sve_ld1_host_fn *host_fn)
{
    const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
    void *vd = &env->vfp.zregs[rd];
    const int diffsz = esz - msz;
    const intptr_t reg_max = simd_oprsz(desc);
    const intptr_t mem_max = reg_max >> diffsz;
    const int mmu_idx = cpu_mmu_index(env, false);
    intptr_t split, reg_off, mem_off;
    void *host;

#ifdef CONFIG_USER_ONLY
    host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD, mmu_idx);
    if (likely(page_check_range(addr, mem_max, PAGE_READ) == 0)) {
        /* The entire operation is valid and will not fault.  */
        host_fn(vd, vg, host, 0, mem_max);
        return;
    }
#endif

    /* There will be no fault, so we may modify in advance.  */
    memset(vd, 0, reg_max);

    /* Skip to the first active element.  */
    reg_off = find_next_active(vg, 0, reg_max, esz);
    if (unlikely(reg_off == reg_max)) {
        /* The entire predicate was false; no load occurs.  */
        return;
    }
    mem_off = reg_off >> diffsz;

#ifdef CONFIG_USER_ONLY
    if (page_check_range(addr + mem_off, 1 << msz, PAGE_READ) == 0) {
        /* At least one load is valid; take the rest of the page.  */
        split = max_for_page(addr, mem_off + (1 << msz) - 1, mem_max);
        mem_off = host_fn(vd, vg, host, mem_off, split);
        reg_off = mem_off << diffsz;
    }
#else
    /*
     * If the address is not in the TLB, we have no way to bring the
     * entry into the TLB without also risking a fault.  Note that
     * the corollary is that we never load from an address not in RAM.
     *
     * This last is out of spec, in a weird corner case.
     * Per the MemNF/MemSingleNF pseudocode, a NF load from Device memory
     * must not actually hit the bus -- it returns UNKNOWN data instead.
     * But if you map non-RAM with Normal memory attributes and do a NF
     * load then it should access the bus.  (Nobody ought actually do this
     * in the real world, obviously.)
     *
     * Then there are the annoying special cases with watchpoints...
     * TODO: Add a form of non-faulting loads using cc->tlb_fill(probe=true).
     */
    host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
    split = max_for_page(addr, mem_off, mem_max);
    if (host && split >= (1 << msz)) {
        mem_off = host_fn(vd, vg, host - mem_off, mem_off, split);
        reg_off = mem_off << diffsz;
    }
#endif

    record_fault(env, reg_off, reg_max);
}
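/*
 * Unlike the first-fault form, the no-fault helper above never calls a
 * faulting primitive at all: it only reads through pages already known to
 * be accessible (page_check_range for user-only, an existing TLB entry for
 * softmmu) and reports everything else via record_fault().
 */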
#define DO_LDFF1_LDNF1_1(PART, ESZ) \
void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, 0,                   \
                sve_ld1##PART##_host, sve_ld1##PART##_tlb);             \
}                                                                       \
void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_ldnf1_r(env, vg, addr, desc, ESZ, 0, sve_ld1##PART##_host);     \
}

#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,          \
                                    target_ulong addr, uint32_t desc)    \
{                                                                        \
    sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ,                  \
                sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);        \
}                                                                        \
void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,          \
                                    target_ulong addr, uint32_t desc)    \
{                                                                        \
    sve_ldnf1_r(env, vg, addr, desc, ESZ, MSZ, sve_ld1##PART##_le_host); \
}                                                                        \
void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,          \
                                    target_ulong addr, uint32_t desc)    \
{                                                                        \
    sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ,                  \
                sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);        \
}                                                                        \
void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,          \
                                    target_ulong addr, uint32_t desc)    \
{                                                                        \
    sve_ldnf1_r(env, vg, addr, desc, ESZ, MSZ, sve_ld1##PART##_be_host); \
}

DO_LDFF1_LDNF1_1(bb,  0)
DO_LDFF1_LDNF1_1(bhu, 1)
DO_LDFF1_LDNF1_1(bhs, 1)
DO_LDFF1_LDNF1_1(bsu, 2)
DO_LDFF1_LDNF1_1(bss, 2)
DO_LDFF1_LDNF1_1(bdu, 3)
DO_LDFF1_LDNF1_1(bds, 3)

DO_LDFF1_LDNF1_2(hh,  1, 1)
DO_LDFF1_LDNF1_2(hsu, 2, 1)
DO_LDFF1_LDNF1_2(hss, 2, 1)
DO_LDFF1_LDNF1_2(hdu, 3, 1)
DO_LDFF1_LDNF1_2(hds, 3, 1)

DO_LDFF1_LDNF1_2(ss,  2, 2)
DO_LDFF1_LDNF1_2(sdu, 3, 2)
DO_LDFF1_LDNF1_2(sds, 3, 2)

DO_LDFF1_LDNF1_2(dd,  3, 3)

#undef DO_LDFF1_LDNF1_1
#undef DO_LDFF1_LDNF1_2
/*
 * Store contiguous data, protected by a governing predicate.
 */

#ifdef CONFIG_SOFTMMU
#define DO_ST_TLB(NAME, H, TYPEM, HOST, MOEND, TLB) \
static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off,    \
                             target_ulong addr, TCGMemOpIdx oi, uintptr_t ra) \
{                                                                             \
    TLB(env, addr, *(TYPEM *)(vd + H(reg_off)), oi, ra);                      \
}
#else
#define DO_ST_TLB(NAME, H, TYPEM, HOST, MOEND, TLB) \
static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off,    \
                             target_ulong addr, TCGMemOpIdx oi, uintptr_t ra) \
{                                                                             \
    HOST(g2h(addr), *(TYPEM *)(vd + H(reg_off)));                             \
}
#endif

DO_ST_TLB(st1bb,   H1,  uint8_t, stb_p, 0, helper_ret_stb_mmu)
DO_ST_TLB(st1bh, H1_2, uint16_t, stb_p, 0, helper_ret_stb_mmu)
DO_ST_TLB(st1bs, H1_4, uint32_t, stb_p, 0, helper_ret_stb_mmu)
DO_ST_TLB(st1bd,     , uint64_t, stb_p, 0, helper_ret_stb_mmu)

DO_ST_TLB(st1hh_le, H1_2, uint16_t, stw_le_p, MO_LE, helper_le_stw_mmu)
DO_ST_TLB(st1hs_le, H1_4, uint32_t, stw_le_p, MO_LE, helper_le_stw_mmu)
DO_ST_TLB(st1hd_le,     , uint64_t, stw_le_p, MO_LE, helper_le_stw_mmu)

DO_ST_TLB(st1ss_le, H1_4, uint32_t, stl_le_p, MO_LE, helper_le_stl_mmu)
DO_ST_TLB(st1sd_le,     , uint64_t, stl_le_p, MO_LE, helper_le_stl_mmu)

DO_ST_TLB(st1dd_le,     , uint64_t, stq_le_p, MO_LE, helper_le_stq_mmu)

DO_ST_TLB(st1hh_be, H1_2, uint16_t, stw_be_p, MO_BE, helper_be_stw_mmu)
DO_ST_TLB(st1hs_be, H1_4, uint32_t, stw_be_p, MO_BE, helper_be_stw_mmu)
DO_ST_TLB(st1hd_be,     , uint64_t, stw_be_p, MO_BE, helper_be_stw_mmu)

DO_ST_TLB(st1ss_be, H1_4, uint32_t, stl_be_p, MO_BE, helper_be_stl_mmu)
DO_ST_TLB(st1sd_be,     , uint64_t, stl_be_p, MO_BE, helper_be_stl_mmu)

DO_ST_TLB(st1dd_be,     , uint64_t, stq_be_p, MO_BE, helper_be_stq_mmu)
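/*
 * As with the loads, the st1<mem><reg> names encode a truncating store when
 * the sizes differ: e.g. sve_st1bh_tlb reads a 16-bit register element
 * (uint16_t via H1_2) and writes only its low byte to memory (stb_p or
 * helper_ret_stb_mmu).
 */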
/*
 * Common helpers for all contiguous 1,2,3,4-register predicated stores.
 */
static void sve_st1_r(CPUARMState *env, void *vg, target_ulong addr,
                      uint32_t desc, const uintptr_t ra,
                      const int esize, const int msize,
                      sve_st1_tlb_fn *tlb_fn)
{
    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
    const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
    intptr_t i, oprsz = simd_oprsz(desc);
    void *vd = &env->vfp.zregs[rd];

    set_helper_retaddr(ra);
    for (i = 0; i < oprsz; ) {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                tlb_fn(env, vd, i, addr, oi, ra);
            }
            i += esize, pg >>= esize;
            addr += msize;
        } while (i & 15);
    }
    set_helper_retaddr(0);
}

static void sve_st2_r(CPUARMState *env, void *vg, target_ulong addr,
                      uint32_t desc, const uintptr_t ra,
                      const int esize, const int msize,
                      sve_st1_tlb_fn *tlb_fn)
{
    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
    const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
    intptr_t i, oprsz = simd_oprsz(desc);
    void *d1 = &env->vfp.zregs[rd];
    void *d2 = &env->vfp.zregs[(rd + 1) & 31];

    set_helper_retaddr(ra);
    for (i = 0; i < oprsz; ) {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                tlb_fn(env, d1, i, addr, oi, ra);
                tlb_fn(env, d2, i, addr + msize, oi, ra);
            }
            i += esize, pg >>= esize;
            addr += 2 * msize;
        } while (i & 15);
    }
    set_helper_retaddr(0);
}

static void sve_st3_r(CPUARMState *env, void *vg, target_ulong addr,
                      uint32_t desc, const uintptr_t ra,
                      const int esize, const int msize,
                      sve_st1_tlb_fn *tlb_fn)
{
    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
    const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
    intptr_t i, oprsz = simd_oprsz(desc);
    void *d1 = &env->vfp.zregs[rd];
    void *d2 = &env->vfp.zregs[(rd + 1) & 31];
    void *d3 = &env->vfp.zregs[(rd + 2) & 31];

    set_helper_retaddr(ra);
    for (i = 0; i < oprsz; ) {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                tlb_fn(env, d1, i, addr, oi, ra);
                tlb_fn(env, d2, i, addr + msize, oi, ra);
                tlb_fn(env, d3, i, addr + 2 * msize, oi, ra);
            }
            i += esize, pg >>= esize;
            addr += 3 * msize;
        } while (i & 15);
    }
    set_helper_retaddr(0);
}

static void sve_st4_r(CPUARMState *env, void *vg, target_ulong addr,
                      uint32_t desc, const uintptr_t ra,
                      const int esize, const int msize,
                      sve_st1_tlb_fn *tlb_fn)
{
    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
    const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
    intptr_t i, oprsz = simd_oprsz(desc);
    void *d1 = &env->vfp.zregs[rd];
    void *d2 = &env->vfp.zregs[(rd + 1) & 31];
    void *d3 = &env->vfp.zregs[(rd + 2) & 31];
    void *d4 = &env->vfp.zregs[(rd + 3) & 31];

    set_helper_retaddr(ra);
    for (i = 0; i < oprsz; ) {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                tlb_fn(env, d1, i, addr, oi, ra);
                tlb_fn(env, d2, i, addr + msize, oi, ra);
                tlb_fn(env, d3, i, addr + 2 * msize, oi, ra);
                tlb_fn(env, d4, i, addr + 3 * msize, oi, ra);
            }
            i += esize, pg >>= esize;
            addr += 4 * msize;
        } while (i & 15);
    }
    set_helper_retaddr(0);
}
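/*
 * In the store helpers above, @esize steps the register file and the
 * predicate while @msize steps the memory address, so an instantiation with
 * esize > msize performs the truncating "store the low part of each wider
 * element" forms, touching @msize bytes of memory per @esize bytes of
 * register.
 */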
#define DO_STN_1(N, NAME, ESIZE) \
void QEMU_FLATTEN HELPER(sve_st##N##NAME##_r)                       \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc)  \
{                                                                   \
    sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, 1,           \
                  sve_st1##NAME##_tlb);                             \
}

#define DO_STN_2(N, NAME, ESIZE, MSIZE) \
void QEMU_FLATTEN HELPER(sve_st##N##NAME##_le_r)                    \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc)  \
{                                                                   \
    sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, MSIZE,       \
                  sve_st1##NAME##_le_tlb);                          \
}                                                                   \
void QEMU_FLATTEN HELPER(sve_st##N##NAME##_be_r)                    \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc)  \
{                                                                   \
    sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, MSIZE,       \
                  sve_st1##NAME##_be_tlb);                          \
}

DO_STN_1(1, bb, 1)
DO_STN_1(1, bh, 2)
DO_STN_1(1, bs, 4)
DO_STN_1(1, bd, 8)
DO_STN_1(2, bb, 1)
DO_STN_1(3, bb, 1)
DO_STN_1(4, bb, 1)

DO_STN_2(1, hh, 2, 2)
DO_STN_2(1, hs, 4, 2)
DO_STN_2(1, hd, 8, 2)
DO_STN_2(2, hh, 2, 2)
DO_STN_2(3, hh, 2, 2)
DO_STN_2(4, hh, 2, 2)

DO_STN_2(1, ss, 4, 4)
DO_STN_2(1, sd, 8, 4)
DO_STN_2(2, ss, 4, 4)
DO_STN_2(3, ss, 4, 4)
DO_STN_2(4, ss, 4, 4)

DO_STN_2(1, dd, 8, 8)
DO_STN_2(2, dd, 8, 8)
DO_STN_2(3, dd, 8, 8)
DO_STN_2(4, dd, 8, 8)
/*
 * Loads with a vector index.
 */

/*
 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
 */
typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);

static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
{
    return *(uint32_t *)(reg + H1_4(reg_ofs));
}

static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
{
    return *(int32_t *)(reg + H1_4(reg_ofs));
}

static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
{
    return (uint32_t)*(uint64_t *)(reg + reg_ofs);
}

static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
{
    return (int32_t)*(uint64_t *)(reg + reg_ofs);
}

static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
{
    return *(uint64_t *)(reg + reg_ofs);
}

static void sve_ld1_zs(CPUARMState *env, void *vd, void *vg, void *vm,
                       target_ulong base, uint32_t desc, uintptr_t ra,
                       zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn)
{
    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
    const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
    intptr_t i, oprsz = simd_oprsz(desc);
    ARMVectorReg scratch = { };

    set_helper_retaddr(ra);
    for (i = 0; i < oprsz; ) {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (likely(pg & 1)) {
                target_ulong off = off_fn(vm, i);
                tlb_fn(env, &scratch, i, base + (off << scale), oi, ra);
            }
            i += 4, pg >>= 4;
        } while (i & 15);
    }
    set_helper_retaddr(0);

    /* Wait until all exceptions have been raised to write back.  */
    memcpy(vd, &scratch, oprsz);
}

static void sve_ld1_zd(CPUARMState *env, void *vd, void *vg, void *vm,
                       target_ulong base, uint32_t desc, uintptr_t ra,
                       zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn)
{
    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
    const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
    intptr_t i, oprsz = simd_oprsz(desc) / 8;
    ARMVectorReg scratch = { };

    set_helper_retaddr(ra);
    for (i = 0; i < oprsz; i++) {
        uint8_t pg = *(uint8_t *)(vg + H1(i));
        if (likely(pg & 1)) {
            target_ulong off = off_fn(vm, i * 8);
            tlb_fn(env, &scratch, i * 8, base + (off << scale), oi, ra);
        }
    }
    set_helper_retaddr(0);

    /* Wait until all exceptions have been raised to write back.  */
    memcpy(vd, &scratch, oprsz * 8);
}
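/*
 * The off_* accessors encode the index-vector form: off_zsu_s and off_zss_s
 * read a 32-bit offset element (zero- or sign-extended into target_ulong),
 * off_zsu_d and off_zss_d read a 64-bit element but use only its low 32
 * bits, and off_zd_d uses the full 64-bit offset.  The caller then forms
 * the address as base + (off << scale).
 */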
#define DO_LD1_ZPZ_S(MEM, OFS) \
void QEMU_FLATTEN HELPER(sve_ld##MEM##_##OFS)                           \
    (CPUARMState *env, void *vd, void *vg, void *vm,                    \
     target_ulong base, uint32_t desc)                                  \
{                                                                       \
    sve_ld1_zs(env, vd, vg, vm, base, desc, GETPC(),                    \
               off_##OFS##_s, sve_ld1##MEM##_tlb);                      \
}

#define DO_LD1_ZPZ_D(MEM, OFS) \
void QEMU_FLATTEN HELPER(sve_ld##MEM##_##OFS)                           \
    (CPUARMState *env, void *vd, void *vg, void *vm,                    \
     target_ulong base, uint32_t desc)                                  \
{                                                                       \
    sve_ld1_zd(env, vd, vg, vm, base, desc, GETPC(),                    \
               off_##OFS##_d, sve_ld1##MEM##_tlb);                      \
}

DO_LD1_ZPZ_S(bsu, zsu)
DO_LD1_ZPZ_S(bsu, zss)
DO_LD1_ZPZ_D(bdu, zsu)
DO_LD1_ZPZ_D(bdu, zss)
DO_LD1_ZPZ_D(bdu, zd)

DO_LD1_ZPZ_S(bss, zsu)
DO_LD1_ZPZ_S(bss, zss)
DO_LD1_ZPZ_D(bds, zsu)
DO_LD1_ZPZ_D(bds, zss)
DO_LD1_ZPZ_D(bds, zd)

DO_LD1_ZPZ_S(hsu_le, zsu)
DO_LD1_ZPZ_S(hsu_le, zss)
DO_LD1_ZPZ_D(hdu_le, zsu)
DO_LD1_ZPZ_D(hdu_le, zss)
DO_LD1_ZPZ_D(hdu_le, zd)

DO_LD1_ZPZ_S(hsu_be, zsu)
DO_LD1_ZPZ_S(hsu_be, zss)
DO_LD1_ZPZ_D(hdu_be, zsu)
DO_LD1_ZPZ_D(hdu_be, zss)
DO_LD1_ZPZ_D(hdu_be, zd)

DO_LD1_ZPZ_S(hss_le, zsu)
DO_LD1_ZPZ_S(hss_le, zss)
DO_LD1_ZPZ_D(hds_le, zsu)
DO_LD1_ZPZ_D(hds_le, zss)
DO_LD1_ZPZ_D(hds_le, zd)

DO_LD1_ZPZ_S(hss_be, zsu)
DO_LD1_ZPZ_S(hss_be, zss)
DO_LD1_ZPZ_D(hds_be, zsu)
DO_LD1_ZPZ_D(hds_be, zss)
DO_LD1_ZPZ_D(hds_be, zd)

DO_LD1_ZPZ_S(ss_le, zsu)
DO_LD1_ZPZ_S(ss_le, zss)
DO_LD1_ZPZ_D(sdu_le, zsu)
DO_LD1_ZPZ_D(sdu_le, zss)
DO_LD1_ZPZ_D(sdu_le, zd)

DO_LD1_ZPZ_S(ss_be, zsu)
DO_LD1_ZPZ_S(ss_be, zss)
DO_LD1_ZPZ_D(sdu_be, zsu)
DO_LD1_ZPZ_D(sdu_be, zss)
DO_LD1_ZPZ_D(sdu_be, zd)

DO_LD1_ZPZ_D(sds_le, zsu)
DO_LD1_ZPZ_D(sds_le, zss)
DO_LD1_ZPZ_D(sds_le, zd)

DO_LD1_ZPZ_D(sds_be, zsu)
DO_LD1_ZPZ_D(sds_be, zss)
DO_LD1_ZPZ_D(sds_be, zd)

DO_LD1_ZPZ_D(dd_le, zsu)
DO_LD1_ZPZ_D(dd_le, zss)
DO_LD1_ZPZ_D(dd_le, zd)

DO_LD1_ZPZ_D(dd_be, zsu)
DO_LD1_ZPZ_D(dd_be, zss)
DO_LD1_ZPZ_D(dd_be, zd)
/* First fault loads with a vector index.  */

/* Load one element into VD+REG_OFF from (ENV,VADDR) without faulting.
 * The controlling predicate is known to be true.  Return true if the
 * load was successful.
 */
typedef bool sve_ld1_nf_fn(CPUARMState *env, void *vd, intptr_t reg_off,
                           target_ulong vaddr, int mmu_idx);

#ifdef CONFIG_SOFTMMU
#define DO_LD_NF(NAME, H, TYPEE, TYPEM, HOST) \
static bool sve_ld##NAME##_nf(CPUARMState *env, void *vd, intptr_t reg_off,  \
                              target_ulong addr, int mmu_idx)                \
{                                                                            \
    target_ulong next_page = -(addr | TARGET_PAGE_MASK);                     \
    if (likely(next_page - addr >= sizeof(TYPEM))) {                         \
        void *host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD, mmu_idx);   \
        if (likely(host)) {                                                  \
            TYPEM val = HOST(host);                                          \
            *(TYPEE *)(vd + H(reg_off)) = val;                               \
            return true;                                                     \
        }                                                                    \
    }                                                                        \
    return false;                                                            \
}
#else
#define DO_LD_NF(NAME, H, TYPEE, TYPEM, HOST) \
static bool sve_ld##NAME##_nf(CPUARMState *env, void *vd, intptr_t reg_off,  \
                              target_ulong addr, int mmu_idx)                \
{                                                                            \
    if (likely(page_check_range(addr, sizeof(TYPEM), PAGE_READ))) {          \
        TYPEM val = HOST(g2h(addr));                                         \
        *(TYPEE *)(vd + H(reg_off)) = val;                                   \
        return true;                                                         \
    }                                                                        \
    return false;                                                            \
}
#endif

DO_LD_NF(bsu, H1_4, uint32_t, uint8_t, ldub_p)
DO_LD_NF(bss, H1_4, uint32_t,  int8_t, ldsb_p)
DO_LD_NF(bdu,     , uint64_t, uint8_t, ldub_p)
DO_LD_NF(bds,     , uint64_t,  int8_t, ldsb_p)

DO_LD_NF(hsu_le, H1_4, uint32_t, uint16_t, lduw_le_p)
DO_LD_NF(hss_le, H1_4, uint32_t,  int16_t, ldsw_le_p)
DO_LD_NF(hsu_be, H1_4, uint32_t, uint16_t, lduw_be_p)
DO_LD_NF(hss_be, H1_4, uint32_t,  int16_t, ldsw_be_p)
DO_LD_NF(hdu_le,     , uint64_t, uint16_t, lduw_le_p)
DO_LD_NF(hds_le,     , uint64_t,  int16_t, ldsw_le_p)
DO_LD_NF(hdu_be,     , uint64_t, uint16_t, lduw_be_p)
DO_LD_NF(hds_be,     , uint64_t,  int16_t, ldsw_be_p)

DO_LD_NF(ss_le,  H1_4, uint32_t, uint32_t, ldl_le_p)
DO_LD_NF(ss_be,  H1_4, uint32_t, uint32_t, ldl_be_p)
DO_LD_NF(sdu_le,     , uint64_t, uint32_t, ldl_le_p)
DO_LD_NF(sds_le,     , uint64_t,  int32_t, ldl_le_p)
DO_LD_NF(sdu_be,     , uint64_t, uint32_t, ldl_be_p)
DO_LD_NF(sds_be,     , uint64_t,  int32_t, ldl_be_p)

DO_LD_NF(dd_le,      , uint64_t, uint64_t, ldq_le_p)
DO_LD_NF(dd_be,      , uint64_t, uint64_t, ldq_be_p)
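/*
 * In the softmmu variant above, a non-faulting access is attempted only
 * when tlb_vaddr_to_host() can already resolve the address (after guarding
 * the element against the page boundary); on failure the primitive returns
 * false so the gather loop below records a fault rather than raising one.
 */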
/*
 * Common helper for all gather first-faulting loads.
 */
static inline void sve_ldff1_zs(CPUARMState *env, void *vd, void *vg, void *vm,
                                target_ulong base, uint32_t desc, uintptr_t ra,
                                zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn,
                                sve_ld1_nf_fn *nonfault_fn)
{
    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
    const int mmu_idx = get_mmuidx(oi);
    const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
    intptr_t reg_off, reg_max = simd_oprsz(desc);
    target_ulong addr;

    /* Skip to the first true predicate.  */
    reg_off = find_next_active(vg, 0, reg_max, MO_32);
    if (likely(reg_off < reg_max)) {
        /* Perform one normal read, which will fault or not.  */
        set_helper_retaddr(ra);
        addr = off_fn(vm, reg_off);
        addr = base + (addr << scale);
        tlb_fn(env, vd, reg_off, addr, oi, ra);

        /* The rest of the reads will be non-faulting.  */
        set_helper_retaddr(0);
    }

    /* After any fault, zero the leading predicated false elements.  */
    swap_memzero(vd, reg_off);

    while (likely((reg_off += 4) < reg_max)) {
        uint64_t pg = *(uint64_t *)(vg + (reg_off >> 6) * 8);
        if (likely((pg >> (reg_off & 63)) & 1)) {
            addr = off_fn(vm, reg_off);
            addr = base + (addr << scale);
            if (!nonfault_fn(env, vd, reg_off, addr, mmu_idx)) {
                record_fault(env, reg_off, reg_max);
                break;
            }
        } else {
            *(uint32_t *)(vd + H1_4(reg_off)) = 0;
        }
    }
}

static inline void sve_ldff1_zd(CPUARMState *env, void *vd, void *vg, void *vm,
                                target_ulong base, uint32_t desc, uintptr_t ra,
                                zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn,
                                sve_ld1_nf_fn *nonfault_fn)
{
    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
    const int mmu_idx = get_mmuidx(oi);
    const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
    intptr_t reg_off, reg_max = simd_oprsz(desc);
    target_ulong addr;

    /* Skip to the first true predicate.  */
    reg_off = find_next_active(vg, 0, reg_max, MO_64);
    if (likely(reg_off < reg_max)) {
        /* Perform one normal read, which will fault or not.  */
        set_helper_retaddr(ra);
        addr = off_fn(vm, reg_off);
        addr = base + (addr << scale);
        tlb_fn(env, vd, reg_off, addr, oi, ra);

        /* The rest of the reads will be non-faulting.  */
        set_helper_retaddr(0);
    }

    /* After any fault, zero the leading predicated false elements.  */
    swap_memzero(vd, reg_off);

    while (likely((reg_off += 8) < reg_max)) {
        uint8_t pg = *(uint8_t *)(vg + H1(reg_off >> 3));
        if (likely(pg & 1)) {
            addr = off_fn(vm, reg_off);
            addr = base + (addr << scale);
            if (!nonfault_fn(env, vd, reg_off, addr, mmu_idx)) {
                record_fault(env, reg_off, reg_max);
                break;
            }
        } else {
            *(uint64_t *)(vd + reg_off) = 0;
        }
    }
}
#define DO_LDFF1_ZPZ_S(MEM, OFS) \
void HELPER(sve_ldff##MEM##_##OFS)                                      \
    (CPUARMState *env, void *vd, void *vg, void *vm,                    \
     target_ulong base, uint32_t desc)                                  \
{                                                                       \
    sve_ldff1_zs(env, vd, vg, vm, base, desc, GETPC(),                  \
                 off_##OFS##_s, sve_ld1##MEM##_tlb, sve_ld##MEM##_nf);  \
}

#define DO_LDFF1_ZPZ_D(MEM, OFS) \
void HELPER(sve_ldff##MEM##_##OFS)                                      \
    (CPUARMState *env, void *vd, void *vg, void *vm,                    \
     target_ulong base, uint32_t desc)                                  \
{                                                                       \
    sve_ldff1_zd(env, vd, vg, vm, base, desc, GETPC(),                  \
                 off_##OFS##_d, sve_ld1##MEM##_tlb, sve_ld##MEM##_nf);  \
}

DO_LDFF1_ZPZ_S(bsu, zsu)
DO_LDFF1_ZPZ_S(bsu, zss)
DO_LDFF1_ZPZ_D(bdu, zsu)
DO_LDFF1_ZPZ_D(bdu, zss)
DO_LDFF1_ZPZ_D(bdu, zd)

DO_LDFF1_ZPZ_S(bss, zsu)
DO_LDFF1_ZPZ_S(bss, zss)
DO_LDFF1_ZPZ_D(bds, zsu)
DO_LDFF1_ZPZ_D(bds, zss)
DO_LDFF1_ZPZ_D(bds, zd)

DO_LDFF1_ZPZ_S(hsu_le, zsu)
DO_LDFF1_ZPZ_S(hsu_le, zss)
DO_LDFF1_ZPZ_D(hdu_le, zsu)
DO_LDFF1_ZPZ_D(hdu_le, zss)
DO_LDFF1_ZPZ_D(hdu_le, zd)

DO_LDFF1_ZPZ_S(hsu_be, zsu)
DO_LDFF1_ZPZ_S(hsu_be, zss)
DO_LDFF1_ZPZ_D(hdu_be, zsu)
DO_LDFF1_ZPZ_D(hdu_be, zss)
DO_LDFF1_ZPZ_D(hdu_be, zd)

DO_LDFF1_ZPZ_S(hss_le, zsu)
DO_LDFF1_ZPZ_S(hss_le, zss)
DO_LDFF1_ZPZ_D(hds_le, zsu)
DO_LDFF1_ZPZ_D(hds_le, zss)
DO_LDFF1_ZPZ_D(hds_le, zd)

DO_LDFF1_ZPZ_S(hss_be, zsu)
DO_LDFF1_ZPZ_S(hss_be, zss)
DO_LDFF1_ZPZ_D(hds_be, zsu)
DO_LDFF1_ZPZ_D(hds_be, zss)
DO_LDFF1_ZPZ_D(hds_be, zd)

DO_LDFF1_ZPZ_S(ss_le, zsu)
DO_LDFF1_ZPZ_S(ss_le, zss)
DO_LDFF1_ZPZ_D(sdu_le, zsu)
DO_LDFF1_ZPZ_D(sdu_le, zss)
DO_LDFF1_ZPZ_D(sdu_le, zd)

DO_LDFF1_ZPZ_S(ss_be, zsu)
DO_LDFF1_ZPZ_S(ss_be, zss)
DO_LDFF1_ZPZ_D(sdu_be, zsu)
DO_LDFF1_ZPZ_D(sdu_be, zss)
DO_LDFF1_ZPZ_D(sdu_be, zd)

DO_LDFF1_ZPZ_D(sds_le, zsu)
DO_LDFF1_ZPZ_D(sds_le, zss)
DO_LDFF1_ZPZ_D(sds_le, zd)

DO_LDFF1_ZPZ_D(sds_be, zsu)
DO_LDFF1_ZPZ_D(sds_be, zss)
DO_LDFF1_ZPZ_D(sds_be, zd)

DO_LDFF1_ZPZ_D(dd_le, zsu)
DO_LDFF1_ZPZ_D(dd_le, zss)
DO_LDFF1_ZPZ_D(dd_le, zd)

DO_LDFF1_ZPZ_D(dd_be, zsu)
DO_LDFF1_ZPZ_D(dd_be, zss)
DO_LDFF1_ZPZ_D(dd_be, zd)
/* Stores with a vector index.  */

static void sve_st1_zs(CPUARMState *env, void *vd, void *vg, void *vm,
                       target_ulong base, uint32_t desc, uintptr_t ra,
                       zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn)
{
    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
    const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
    intptr_t i, oprsz = simd_oprsz(desc);

    set_helper_retaddr(ra);
    for (i = 0; i < oprsz; ) {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (likely(pg & 1)) {
                target_ulong off = off_fn(vm, i);
                tlb_fn(env, vd, i, base + (off << scale), oi, ra);
            }
            i += 4, pg >>= 4;
        } while (i & 15);
    }
    set_helper_retaddr(0);
}

static void sve_st1_zd(CPUARMState *env, void *vd, void *vg, void *vm,
                       target_ulong base, uint32_t desc, uintptr_t ra,
                       zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn)
{
    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
    const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
    intptr_t i, oprsz = simd_oprsz(desc) / 8;

    set_helper_retaddr(ra);
    for (i = 0; i < oprsz; i++) {
        uint8_t pg = *(uint8_t *)(vg + H1(i));
        if (likely(pg & 1)) {
            target_ulong off = off_fn(vm, i * 8);
            tlb_fn(env, vd, i * 8, base + (off << scale), oi, ra);
        }
    }
    set_helper_retaddr(0);
}
#define DO_ST1_ZPZ_S(MEM, OFS) \
void QEMU_FLATTEN HELPER(sve_st##MEM##_##OFS)                           \
    (CPUARMState *env, void *vd, void *vg, void *vm,                    \
     target_ulong base, uint32_t desc)                                  \
{                                                                       \
    sve_st1_zs(env, vd, vg, vm, base, desc, GETPC(),                    \
               off_##OFS##_s, sve_st1##MEM##_tlb);                      \
}

#define DO_ST1_ZPZ_D(MEM, OFS) \
void QEMU_FLATTEN HELPER(sve_st##MEM##_##OFS)                           \
    (CPUARMState *env, void *vd, void *vg, void *vm,                    \
     target_ulong base, uint32_t desc)                                  \
{                                                                       \
    sve_st1_zd(env, vd, vg, vm, base, desc, GETPC(),                    \
               off_##OFS##_d, sve_st1##MEM##_tlb);                      \
}

DO_ST1_ZPZ_S(bs, zsu)
DO_ST1_ZPZ_S(hs_le, zsu)
DO_ST1_ZPZ_S(hs_be, zsu)
DO_ST1_ZPZ_S(ss_le, zsu)
DO_ST1_ZPZ_S(ss_be, zsu)

DO_ST1_ZPZ_S(bs, zss)
DO_ST1_ZPZ_S(hs_le, zss)
DO_ST1_ZPZ_S(hs_be, zss)
DO_ST1_ZPZ_S(ss_le, zss)
DO_ST1_ZPZ_S(ss_be, zss)

DO_ST1_ZPZ_D(bd, zsu)
DO_ST1_ZPZ_D(hd_le, zsu)
DO_ST1_ZPZ_D(hd_be, zsu)
DO_ST1_ZPZ_D(sd_le, zsu)
DO_ST1_ZPZ_D(sd_be, zsu)
DO_ST1_ZPZ_D(dd_le, zsu)
DO_ST1_ZPZ_D(dd_be, zsu)

DO_ST1_ZPZ_D(bd, zss)
DO_ST1_ZPZ_D(hd_le, zss)
DO_ST1_ZPZ_D(hd_be, zss)
DO_ST1_ZPZ_D(sd_le, zss)
DO_ST1_ZPZ_D(sd_be, zss)
DO_ST1_ZPZ_D(dd_le, zss)
DO_ST1_ZPZ_D(dd_be, zss)

DO_ST1_ZPZ_D(bd, zd)
DO_ST1_ZPZ_D(hd_le, zd)
DO_ST1_ZPZ_D(hd_be, zd)
DO_ST1_ZPZ_D(sd_le, zd)
DO_ST1_ZPZ_D(sd_be, zd)
DO_ST1_ZPZ_D(dd_le, zd)
DO_ST1_ZPZ_D(dd_be, zd)