4 * Copyright (c) 2022 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
22 #include "internals.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "exec/helper-proto.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/exec-all.h"
27 #include "qemu/int128.h"
28 #include "fpu/softfloat.h"
29 #include "vec_internal.h"
30 #include "sve_ldst_internal.h"
33 void arm_reset_sve_state(CPUARMState
*env
)
35 memset(env
->vfp
.zregs
, 0, sizeof(env
->vfp
.zregs
));
36 /* Recall that FFR is stored as pregs[16]. */
37 memset(env
->vfp
.pregs
, 0, sizeof(env
->vfp
.pregs
));
38 vfp_set_fpcr(env
, 0x0800009f);
41 void helper_set_pstate_sm(CPUARMState
*env
, uint32_t i
)
43 if (i
== FIELD_EX64(env
->svcr
, SVCR
, SM
)) {
46 env
->svcr
^= R_SVCR_SM_MASK
;
47 arm_reset_sve_state(env
);
50 void helper_set_pstate_za(CPUARMState
*env
, uint32_t i
)
52 if (i
== FIELD_EX64(env
->svcr
, SVCR
, ZA
)) {
55 env
->svcr
^= R_SVCR_ZA_MASK
;
60 * SetPSTATE_ZA zeros on enable and disable. We can zero this only
61 * on enable: while disabled, the storage is inaccessible and the
62 * value does not matter. We're not saving the storage in vmstate
63 * when disabled either.
66 memset(env
->zarray
, 0, sizeof(env
->zarray
));
70 void helper_sme_zero(CPUARMState
*env
, uint32_t imm
, uint32_t svl
)
75 * Special case clearing the entire ZA space.
76 * This falls into the CONSTRAINED UNPREDICTABLE zeroing of any
77 * parts of the ZA storage outside of SVL.
80 memset(env
->zarray
, 0, sizeof(env
->zarray
));
85 * Recall that ZAnH.D[m] is spread across ZA[n+8*m],
86 * so each row is discontiguous within ZA[].
88 for (i
= 0; i
< svl
; i
++) {
89 if (imm
& (1 << (i
% 8))) {
90 memset(&env
->zarray
[i
], 0, svl
);
/*
 * Treat the ZA storage as an array of elements of some type T.  The
 * array index of element number I of a vertical tile slice is given by
 * this macro whatever sizeof(T) is.  Tiles are interleaved: with
 * N-byte elements, consecutive tile rows are N storage rows apart.
 * Dividing the byte offset by N (to get an array index) cancels the
 * multiplication by N (to get from the in-tile row to the storage row).
 */
#define tile_vslice_index(i) ((i) * sizeof(ARMVectorReg))
/*
 * Byte-offset flavour of the above.  Provided BYTEOFF is a multiple of
 * the element size, the element BYTEOFF bytes along a vertical tile
 * slice sits this many bytes into the ZA storage, independent of the
 * element size, again because the tiles are interleaved.
 *
 * Example with 1-byte elements: every storage row holds one byte of
 * the slice, so byte 8 is in storage row 8, at 8 * row-size bytes.
 * Example with 8-byte elements: every storage row holds 8 bytes of the
 * slice, but eight tiles are interleaved, so byte 8 is in tile row 1,
 * which is storage row 8 -- the same 8 * row-size bytes.  Other
 * element sizes work out the same way.
 */
#define tile_vslice_offset(byteoff) ((byteoff) * sizeof(ARMVectorReg))
129 * Move Zreg vector to ZArray column.
131 #define DO_MOVA_C(NAME, TYPE, H) \
132 void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc) \
134 int i, oprsz = simd_oprsz(desc); \
135 for (i = 0; i < oprsz; ) { \
136 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
139 *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \
142 pg >>= sizeof(TYPE); \
147 DO_MOVA_C(sme_mova_cz_b
, uint8_t, H1
)
148 DO_MOVA_C(sme_mova_cz_h
, uint16_t, H1_2
)
149 DO_MOVA_C(sme_mova_cz_s
, uint32_t, H1_4
)
151 void HELPER(sme_mova_cz_d
)(void *za
, void *vn
, void *vg
, uint32_t desc
)
153 int i
, oprsz
= simd_oprsz(desc
) / 8;
158 for (i
= 0; i
< oprsz
; i
++) {
160 a
[tile_vslice_index(i
)] = n
[i
];
165 void HELPER(sme_mova_cz_q
)(void *za
, void *vn
, void *vg
, uint32_t desc
)
167 int i
, oprsz
= simd_oprsz(desc
) / 16;
173 * Int128 is used here simply to copy 16 bytes, and to simplify
174 * the address arithmetic.
176 for (i
= 0; i
< oprsz
; i
++) {
178 a
[tile_vslice_index(i
)] = n
[i
];
186 * Move ZArray column to Zreg vector.
188 #define DO_MOVA_Z(NAME, TYPE, H) \
189 void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc) \
191 int i, oprsz = simd_oprsz(desc); \
192 for (i = 0; i < oprsz; ) { \
193 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
196 *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \
199 pg >>= sizeof(TYPE); \
204 DO_MOVA_Z(sme_mova_zc_b
, uint8_t, H1
)
205 DO_MOVA_Z(sme_mova_zc_h
, uint16_t, H1_2
)
206 DO_MOVA_Z(sme_mova_zc_s
, uint32_t, H1_4
)
208 void HELPER(sme_mova_zc_d
)(void *vd
, void *za
, void *vg
, uint32_t desc
)
210 int i
, oprsz
= simd_oprsz(desc
) / 8;
215 for (i
= 0; i
< oprsz
; i
++) {
217 d
[i
] = a
[tile_vslice_index(i
)];
222 void HELPER(sme_mova_zc_q
)(void *vd
, void *za
, void *vg
, uint32_t desc
)
224 int i
, oprsz
= simd_oprsz(desc
) / 16;
230 * Int128 is used here simply to copy 16 bytes, and to simplify
231 * the address arithmetic.
233 for (i
= 0; i
< oprsz
; i
++, za
+= sizeof(ARMVectorReg
)) {
235 d
[i
] = a
[tile_vslice_index(i
)];
/*
 * Signature shared by the routines that zero LEN bytes' worth of
 * elements of a tile slice, starting OFF bytes into the slice.
 */
typedef void ClearFn(void *ptr, size_t off, size_t len);

/* Horizontal slices are contiguous: zero LEN bytes at PTR + OFF. */
static void clear_horizontal(void *ptr, size_t off, size_t len)
{
    uint8_t *start = (uint8_t *)ptr + off;

    memset(start, 0, len);
}
/* Zero LEN byte elements of a vertical slice, starting at element OFF. */
static void clear_vertical_b(void *vptr, size_t off, size_t len)
{
    size_t n;

    for (n = 0; n != len; n++) {
        uint8_t *elt = vptr + tile_vslice_offset(off + n);

        *elt = 0;
    }
}
/*
 * Zero 16-bit elements of a vertical slice.  OFF and LEN are byte
 * counts along the slice; one element is cleared per 2 bytes of LEN.
 */
static void clear_vertical_h(void *vptr, size_t off, size_t len)
{
    size_t n = 0;

    while (n < len) {
        uint16_t *elt = vptr + tile_vslice_offset(off + n);

        *elt = 0;
        n += 2;
    }
}
/*
 * Zero 32-bit elements of a vertical slice.  OFF and LEN are byte
 * counts along the slice; one element is cleared per 4 bytes of LEN.
 */
static void clear_vertical_s(void *vptr, size_t off, size_t len)
{
    size_t n = 0;

    while (n < len) {
        uint32_t *elt = vptr + tile_vslice_offset(off + n);

        *elt = 0;
        n += 4;
    }
}
/*
 * Zero 64-bit elements of a vertical slice.  OFF and LEN are byte
 * counts along the slice; one element is cleared per 8 bytes of LEN.
 */
static void clear_vertical_d(void *vptr, size_t off, size_t len)
{
    size_t n = 0;

    while (n < len) {
        uint64_t *elt = vptr + tile_vslice_offset(off + n);

        *elt = 0;
        n += 8;
    }
}
/*
 * Zero 128-bit elements of a vertical slice.  OFF and LEN are byte
 * counts along the slice; 16 bytes are cleared per 16 bytes of LEN.
 */
static void clear_vertical_q(void *vptr, size_t off, size_t len)
{
    size_t n = 0;

    while (n < len) {
        void *elt = vptr + tile_vslice_offset(off + n);

        memset(elt, 0, 16);
        n += 16;
    }
}
/*
 * Signature shared by the routines that copy LEN bytes' worth of
 * elements from a flat array SRC into a tile slice DST.
 */
typedef void CopyFn(void *dst, const void *src, size_t len);

/* Horizontal slices are contiguous, so this is a straight byte copy. */
static void copy_horizontal(void *dst, const void *src, size_t len)
{
    (void)memcpy(dst, src, len);
}
/* Scatter LEN bytes from the flat array VSRC down a vertical slice. */
static void copy_vertical_b(void *vdst, const void *vsrc, size_t len)
{
    const uint8_t *in = vsrc;
    uint8_t *out = vdst;
    size_t n = 0;

    while (n < len) {
        out[tile_vslice_index(n)] = in[n];
        n++;
    }
}
/*
 * Scatter 16-bit elements from the flat array VSRC down a vertical
 * slice.  LEN is in bytes, so LEN / 2 elements are copied.
 */
static void copy_vertical_h(void *vdst, const void *vsrc, size_t len)
{
    const uint16_t *in = vsrc;
    uint16_t *out = vdst;
    const size_t count = len / 2;
    size_t n = 0;

    while (n < count) {
        out[tile_vslice_index(n)] = in[n];
        n++;
    }
}
/*
 * Scatter 32-bit elements from the flat array VSRC down a vertical
 * slice.  LEN is in bytes, so LEN / 4 elements are copied.
 */
static void copy_vertical_s(void *vdst, const void *vsrc, size_t len)
{
    const uint32_t *in = vsrc;
    uint32_t *out = vdst;
    const size_t count = len / 4;
    size_t n = 0;

    while (n < count) {
        out[tile_vslice_index(n)] = in[n];
        n++;
    }
}
/*
 * Scatter 64-bit elements from the flat array VSRC down a vertical
 * slice.  LEN is in bytes, so LEN / 8 elements are copied.
 */
static void copy_vertical_d(void *vdst, const void *vsrc, size_t len)
{
    const uint64_t *in = vsrc;
    uint64_t *out = vdst;
    const size_t count = len / 8;
    size_t n = 0;

    while (n < count) {
        out[tile_vslice_index(n)] = in[n];
        n++;
    }
}
/*
 * Scatter 128-bit elements from the flat array VSRC down a vertical
 * slice.  LEN is in bytes; 16 bytes are copied per step.
 */
static void copy_vertical_q(void *vdst, const void *vsrc, size_t len)
{
    size_t n = 0;

    while (n < len) {
        void *out = vdst + tile_vslice_offset(n);

        memcpy(out, vsrc + n, 16);
        n += 16;
    }
}
/*
 * Host and TLB primitives for vertical tile slice addressing.
 *
 * DO_LD generates a pair of loaders: one reading from a host pointer
 * via HOST, one reading from guest address ADDR via TLB.  Both place
 * the loaded value at byte offset OFF of a vertical slice of ZA.
 */
#define DO_LD(NAME, TYPE, HOST, TLB)                                        \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
{                                                                           \
    TYPE *slot = za + tile_vslice_offset(off);                              \
    TYPE loaded = HOST(host);                                               \
    *slot = loaded;                                                         \
}                                                                           \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
                        intptr_t off, target_ulong addr, uintptr_t ra)      \
{                                                                           \
    TYPE *slot = za + tile_vslice_offset(off);                              \
    TYPE loaded = TLB(env, useronly_clean_ptr(addr), ra);                   \
    *slot = loaded;                                                         \
}
/*
 * DO_ST generates a pair of storers: one writing to a host pointer via
 * HOST, one writing to guest address ADDR via TLB.  Both take the
 * value from byte offset OFF of a vertical slice of ZA.
 */
#define DO_ST(NAME, TYPE, HOST, TLB)                                        \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
{                                                                           \
    const TYPE *slot = za + tile_vslice_offset(off);                        \
    TYPE stored = *slot;                                                    \
    HOST(host, stored);                                                     \
}                                                                           \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
                        intptr_t off, target_ulong addr, uintptr_t ra)      \
{                                                                           \
    const TYPE *slot = za + tile_vslice_offset(off);                        \
    TYPE stored = *slot;                                                    \
    TLB(env, useronly_clean_ptr(addr), stored, ra);                         \
}
/*
 * ARMVectorReg elements are kept as host-endian 64-bit units.  For a
 * 128-bit quantity the order given by the Elem[] pseudocode amounts to
 * keeping the two 64-bit halves in little-endian order, so a
 * big-endian load swaps the halves as it stores them.
 *
 * HNAME is the horizontal (byte offset) primitive; VNAME##_v is the
 * vertical one, which scales OFF and defers to HNAME.
 */
#define DO_LDQ(HNAME, VNAME, BE, HOST, TLB) \
static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
{                                                                           \
    uint64_t first = HOST(host), second = HOST(host + 8);                   \
    uint64_t *half = za + off;                                              \
    half[BE] = first;                                                       \
    half[!BE] = second;                                                     \
}                                                                           \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
{                                                                           \
    HNAME##_host(za, tile_vslice_offset(off), host);                        \
}                                                                           \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    uint64_t first = TLB(env, useronly_clean_ptr(addr), ra);                \
    uint64_t second = TLB(env, useronly_clean_ptr(addr + 8), ra);           \
    uint64_t *half = za + off;                                              \
    half[BE] = first;                                                       \
    half[!BE] = second;                                                     \
}                                                                           \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
                                 target_ulong addr, uintptr_t ra)           \
{                                                                           \
    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
}
/*
 * 128-bit store primitives, the counterpart of DO_LDQ.
 *
 * The two 64-bit halves of the element at ZA + OFF are written to
 * memory as two consecutive 64-bit stores: the half at index BE goes
 * to the lower address, the half at index !BE to the address 8 bytes
 * above it.
 *
 * HNAME is the horizontal (byte offset) primitive; VNAME##_v is the
 * vertical one, which scales OFF and defers to HNAME.
 */
#define DO_STQ(HNAME, VNAME, BE, HOST, TLB) \
static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
{                                                                           \
    uint64_t *ptr = za + off;                                               \
    HOST(host, ptr[BE]);                                                    \
    /* Second half lives 8 bytes on, as in the TLB path and in DO_LDQ. */   \
    HOST(host + 8, ptr[!BE]);                                               \
}                                                                           \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
{                                                                           \
    HNAME##_host(za, tile_vslice_offset(off), host);                        \
}                                                                           \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    uint64_t *ptr = za + off;                                               \
    TLB(env, useronly_clean_ptr(addr), ptr[BE], ra);                        \
    TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra);                   \
}                                                                           \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
                                 target_ulong addr, uintptr_t ra)           \
{                                                                           \
    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
}
434 DO_LD(ld1b
, uint8_t, ldub_p
, cpu_ldub_data_ra
)
435 DO_LD(ld1h_be
, uint16_t, lduw_be_p
, cpu_lduw_be_data_ra
)
436 DO_LD(ld1h_le
, uint16_t, lduw_le_p
, cpu_lduw_le_data_ra
)
437 DO_LD(ld1s_be
, uint32_t, ldl_be_p
, cpu_ldl_be_data_ra
)
438 DO_LD(ld1s_le
, uint32_t, ldl_le_p
, cpu_ldl_le_data_ra
)
439 DO_LD(ld1d_be
, uint64_t, ldq_be_p
, cpu_ldq_be_data_ra
)
440 DO_LD(ld1d_le
, uint64_t, ldq_le_p
, cpu_ldq_le_data_ra
)
442 DO_LDQ(sve_ld1qq_be
, sme_ld1q_be
, 1, ldq_be_p
, cpu_ldq_be_data_ra
)
443 DO_LDQ(sve_ld1qq_le
, sme_ld1q_le
, 0, ldq_le_p
, cpu_ldq_le_data_ra
)
445 DO_ST(st1b
, uint8_t, stb_p
, cpu_stb_data_ra
)
446 DO_ST(st1h_be
, uint16_t, stw_be_p
, cpu_stw_be_data_ra
)
447 DO_ST(st1h_le
, uint16_t, stw_le_p
, cpu_stw_le_data_ra
)
448 DO_ST(st1s_be
, uint32_t, stl_be_p
, cpu_stl_be_data_ra
)
449 DO_ST(st1s_le
, uint32_t, stl_le_p
, cpu_stl_le_data_ra
)
450 DO_ST(st1d_be
, uint64_t, stq_be_p
, cpu_stq_be_data_ra
)
451 DO_ST(st1d_le
, uint64_t, stq_le_p
, cpu_stq_le_data_ra
)
453 DO_STQ(sve_st1qq_be
, sme_st1q_be
, 1, stq_be_p
, cpu_stq_be_data_ra
)
454 DO_STQ(sve_st1qq_le
, sme_st1q_le
, 0, stq_le_p
, cpu_stq_le_data_ra
)
462 * Common helper for all contiguous predicated loads.
465 static inline QEMU_ALWAYS_INLINE
466 void sme_ld1(CPUARMState
*env
, void *za
, uint64_t *vg
,
467 const target_ulong addr
, uint32_t desc
, const uintptr_t ra
,
468 const int esz
, uint32_t mtedesc
, bool vertical
,
469 sve_ldst1_host_fn
*host_fn
,
470 sve_ldst1_tlb_fn
*tlb_fn
,
474 const intptr_t reg_max
= simd_oprsz(desc
);
475 const intptr_t esize
= 1 << esz
;
476 intptr_t reg_off
, reg_last
;
481 /* Find the active elements. */
482 if (!sve_cont_ldst_elements(&info
, addr
, vg
, reg_max
, esz
, esize
)) {
483 /* The entire predicate was false; no load occurs. */
484 clr_fn(za
, 0, reg_max
);
488 /* Probe the page(s). Exit with exception for any invalid page. */
489 sve_cont_ldst_pages(&info
, FAULT_ALL
, env
, addr
, MMU_DATA_LOAD
, ra
);
491 /* Handle watchpoints for all active elements. */
492 sve_cont_ldst_watchpoints(&info
, env
, vg
, addr
, esize
, esize
,
496 * Handle mte checks for all active elements.
497 * Since TBI must be set for MTE, !mtedesc => !mte_active.
500 sve_cont_ldst_mte_check(&info
, env
, vg
, addr
, esize
, esize
,
504 flags
= info
.page
[0].flags
| info
.page
[1].flags
;
505 if (unlikely(flags
!= 0)) {
506 #ifdef CONFIG_USER_ONLY
507 g_assert_not_reached();
510 * At least one page includes MMIO.
511 * Any bus operation can fail with cpu_transaction_failed,
512 * which for ARM will raise SyncExternal. Perform the load
513 * into scratch memory to preserve register state until the end.
515 ARMVectorReg scratch
= { };
517 reg_off
= info
.reg_off_first
[0];
518 reg_last
= info
.reg_off_last
[1];
520 reg_last
= info
.reg_off_split
;
522 reg_last
= info
.reg_off_last
[0];
527 uint64_t pg
= vg
[reg_off
>> 6];
529 if ((pg
>> (reg_off
& 63)) & 1) {
530 tlb_fn(env
, &scratch
, reg_off
, addr
+ reg_off
, ra
);
533 } while (reg_off
& 63);
534 } while (reg_off
<= reg_last
);
536 cpy_fn(za
, &scratch
, reg_max
);
541 /* The entire operation is in RAM, on valid pages. */
543 reg_off
= info
.reg_off_first
[0];
544 reg_last
= info
.reg_off_last
[0];
545 host
= info
.page
[0].host
;
548 memset(za
, 0, reg_max
);
549 } else if (reg_off
) {
550 clr_fn(za
, 0, reg_off
);
553 while (reg_off
<= reg_last
) {
554 uint64_t pg
= vg
[reg_off
>> 6];
556 if ((pg
>> (reg_off
& 63)) & 1) {
557 host_fn(za
, reg_off
, host
+ reg_off
);
558 } else if (vertical
) {
559 clr_fn(za
, reg_off
, esize
);
562 } while (reg_off
<= reg_last
&& (reg_off
& 63));
566 * Use the slow path to manage the cross-page misalignment.
567 * But we know this is RAM and cannot trap.
569 reg_off
= info
.reg_off_split
;
570 if (unlikely(reg_off
>= 0)) {
571 tlb_fn(env
, za
, reg_off
, addr
+ reg_off
, ra
);
574 reg_off
= info
.reg_off_first
[1];
575 if (unlikely(reg_off
>= 0)) {
576 reg_last
= info
.reg_off_last
[1];
577 host
= info
.page
[1].host
;
580 uint64_t pg
= vg
[reg_off
>> 6];
582 if ((pg
>> (reg_off
& 63)) & 1) {
583 host_fn(za
, reg_off
, host
+ reg_off
);
584 } else if (vertical
) {
585 clr_fn(za
, reg_off
, esize
);
588 } while (reg_off
& 63);
589 } while (reg_off
<= reg_last
);
593 static inline QEMU_ALWAYS_INLINE
594 void sme_ld1_mte(CPUARMState
*env
, void *za
, uint64_t *vg
,
595 target_ulong addr
, uint32_t desc
, uintptr_t ra
,
596 const int esz
, bool vertical
,
597 sve_ldst1_host_fn
*host_fn
,
598 sve_ldst1_tlb_fn
*tlb_fn
,
602 uint32_t mtedesc
= desc
>> (SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
603 int bit55
= extract64(addr
, 55, 1);
605 /* Remove mtedesc from the normal sve descriptor. */
606 desc
= extract32(desc
, 0, SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
608 /* Perform gross MTE suppression early. */
609 if (!tbi_check(desc
, bit55
) ||
610 tcma_check(desc
, bit55
, allocation_tag_from_addr(addr
))) {
614 sme_ld1(env
, za
, vg
, addr
, desc
, ra
, esz
, mtedesc
, vertical
,
615 host_fn
, tlb_fn
, clr_fn
, cpy_fn
);
618 #define DO_LD(L, END, ESZ) \
619 void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg, \
620 target_ulong addr, uint32_t desc) \
622 sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \
623 sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \
624 clear_horizontal, copy_horizontal); \
626 void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg, \
627 target_ulong addr, uint32_t desc) \
629 sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \
630 sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \
631 clear_vertical_##L, copy_vertical_##L); \
633 void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
634 target_ulong addr, uint32_t desc) \
636 sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \
637 sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \
638 clear_horizontal, copy_horizontal); \
640 void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
641 target_ulong addr, uint32_t desc) \
643 sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \
644 sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \
645 clear_vertical_##L, copy_vertical_##L); \
655 DO_LD(q
, _be
, MO_128
)
656 DO_LD(q
, _le
, MO_128
)
661 * Common helper for all contiguous predicated stores.
664 static inline QEMU_ALWAYS_INLINE
665 void sme_st1(CPUARMState
*env
, void *za
, uint64_t *vg
,
666 const target_ulong addr
, uint32_t desc
, const uintptr_t ra
,
667 const int esz
, uint32_t mtedesc
, bool vertical
,
668 sve_ldst1_host_fn
*host_fn
,
669 sve_ldst1_tlb_fn
*tlb_fn
)
671 const intptr_t reg_max
= simd_oprsz(desc
);
672 const intptr_t esize
= 1 << esz
;
673 intptr_t reg_off
, reg_last
;
678 /* Find the active elements. */
679 if (!sve_cont_ldst_elements(&info
, addr
, vg
, reg_max
, esz
, esize
)) {
680 /* The entire predicate was false; no store occurs. */
684 /* Probe the page(s). Exit with exception for any invalid page. */
685 sve_cont_ldst_pages(&info
, FAULT_ALL
, env
, addr
, MMU_DATA_STORE
, ra
);
687 /* Handle watchpoints for all active elements. */
688 sve_cont_ldst_watchpoints(&info
, env
, vg
, addr
, esize
, esize
,
692 * Handle mte checks for all active elements.
693 * Since TBI must be set for MTE, !mtedesc => !mte_active.
696 sve_cont_ldst_mte_check(&info
, env
, vg
, addr
, esize
, esize
,
700 flags
= info
.page
[0].flags
| info
.page
[1].flags
;
701 if (unlikely(flags
!= 0)) {
702 #ifdef CONFIG_USER_ONLY
703 g_assert_not_reached();
706 * At least one page includes MMIO.
707 * Any bus operation can fail with cpu_transaction_failed,
708 * which for ARM will raise SyncExternal. We cannot avoid
709 * this fault and will leave with the store incomplete.
711 reg_off
= info
.reg_off_first
[0];
712 reg_last
= info
.reg_off_last
[1];
714 reg_last
= info
.reg_off_split
;
716 reg_last
= info
.reg_off_last
[0];
721 uint64_t pg
= vg
[reg_off
>> 6];
723 if ((pg
>> (reg_off
& 63)) & 1) {
724 tlb_fn(env
, za
, reg_off
, addr
+ reg_off
, ra
);
727 } while (reg_off
& 63);
728 } while (reg_off
<= reg_last
);
733 reg_off
= info
.reg_off_first
[0];
734 reg_last
= info
.reg_off_last
[0];
735 host
= info
.page
[0].host
;
737 while (reg_off
<= reg_last
) {
738 uint64_t pg
= vg
[reg_off
>> 6];
740 if ((pg
>> (reg_off
& 63)) & 1) {
741 host_fn(za
, reg_off
, host
+ reg_off
);
744 } while (reg_off
<= reg_last
&& (reg_off
& 63));
748 * Use the slow path to manage the cross-page misalignment.
749 * But we know this is RAM and cannot trap.
751 reg_off
= info
.reg_off_split
;
752 if (unlikely(reg_off
>= 0)) {
753 tlb_fn(env
, za
, reg_off
, addr
+ reg_off
, ra
);
756 reg_off
= info
.reg_off_first
[1];
757 if (unlikely(reg_off
>= 0)) {
758 reg_last
= info
.reg_off_last
[1];
759 host
= info
.page
[1].host
;
762 uint64_t pg
= vg
[reg_off
>> 6];
764 if ((pg
>> (reg_off
& 63)) & 1) {
765 host_fn(za
, reg_off
, host
+ reg_off
);
768 } while (reg_off
& 63);
769 } while (reg_off
<= reg_last
);
773 static inline QEMU_ALWAYS_INLINE
774 void sme_st1_mte(CPUARMState
*env
, void *za
, uint64_t *vg
, target_ulong addr
,
775 uint32_t desc
, uintptr_t ra
, int esz
, bool vertical
,
776 sve_ldst1_host_fn
*host_fn
,
777 sve_ldst1_tlb_fn
*tlb_fn
)
779 uint32_t mtedesc
= desc
>> (SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
780 int bit55
= extract64(addr
, 55, 1);
782 /* Remove mtedesc from the normal sve descriptor. */
783 desc
= extract32(desc
, 0, SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
785 /* Perform gross MTE suppression early. */
786 if (!tbi_check(desc
, bit55
) ||
787 tcma_check(desc
, bit55
, allocation_tag_from_addr(addr
))) {
791 sme_st1(env
, za
, vg
, addr
, desc
, ra
, esz
, mtedesc
,
792 vertical
, host_fn
, tlb_fn
);
795 #define DO_ST(L, END, ESZ) \
796 void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg, \
797 target_ulong addr, uint32_t desc) \
799 sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \
800 sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \
802 void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg, \
803 target_ulong addr, uint32_t desc) \
805 sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \
806 sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \
808 void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
809 target_ulong addr, uint32_t desc) \
811 sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \
812 sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \
814 void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
815 target_ulong addr, uint32_t desc) \
817 sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \
818 sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \
828 DO_ST(q
, _be
, MO_128
)
829 DO_ST(q
, _le
, MO_128
)
833 void HELPER(sme_addha_s
)(void *vzda
, void *vzn
, void *vpn
,
834 void *vpm
, uint32_t desc
)
836 intptr_t row
, col
, oprsz
= simd_oprsz(desc
) / 4;
837 uint64_t *pn
= vpn
, *pm
= vpm
;
838 uint32_t *zda
= vzda
, *zn
= vzn
;
840 for (row
= 0; row
< oprsz
; ) {
841 uint64_t pa
= pn
[row
>> 4];
844 for (col
= 0; col
< oprsz
; ) {
845 uint64_t pb
= pm
[col
>> 4];
848 zda
[tile_vslice_index(row
) + H4(col
)] += zn
[H4(col
)];
851 } while (++col
& 15);
855 } while (++row
& 15);
859 void HELPER(sme_addha_d
)(void *vzda
, void *vzn
, void *vpn
,
860 void *vpm
, uint32_t desc
)
862 intptr_t row
, col
, oprsz
= simd_oprsz(desc
) / 8;
863 uint8_t *pn
= vpn
, *pm
= vpm
;
864 uint64_t *zda
= vzda
, *zn
= vzn
;
866 for (row
= 0; row
< oprsz
; ++row
) {
867 if (pn
[H1(row
)] & 1) {
868 for (col
= 0; col
< oprsz
; ++col
) {
869 if (pm
[H1(col
)] & 1) {
870 zda
[tile_vslice_index(row
) + col
] += zn
[col
];
877 void HELPER(sme_addva_s
)(void *vzda
, void *vzn
, void *vpn
,
878 void *vpm
, uint32_t desc
)
880 intptr_t row
, col
, oprsz
= simd_oprsz(desc
) / 4;
881 uint64_t *pn
= vpn
, *pm
= vpm
;
882 uint32_t *zda
= vzda
, *zn
= vzn
;
884 for (row
= 0; row
< oprsz
; ) {
885 uint64_t pa
= pn
[row
>> 4];
888 uint32_t zn_row
= zn
[H4(row
)];
889 for (col
= 0; col
< oprsz
; ) {
890 uint64_t pb
= pm
[col
>> 4];
893 zda
[tile_vslice_index(row
) + H4(col
)] += zn_row
;
896 } while (++col
& 15);
900 } while (++row
& 15);
904 void HELPER(sme_addva_d
)(void *vzda
, void *vzn
, void *vpn
,
905 void *vpm
, uint32_t desc
)
907 intptr_t row
, col
, oprsz
= simd_oprsz(desc
) / 8;
908 uint8_t *pn
= vpn
, *pm
= vpm
;
909 uint64_t *zda
= vzda
, *zn
= vzn
;
911 for (row
= 0; row
< oprsz
; ++row
) {
912 if (pn
[H1(row
)] & 1) {
913 uint64_t zn_row
= zn
[row
];
914 for (col
= 0; col
< oprsz
; ++col
) {
915 if (pm
[H1(col
)] & 1) {
916 zda
[tile_vslice_index(row
) + col
] += zn_row
;
923 void HELPER(sme_fmopa_s
)(void *vza
, void *vzn
, void *vzm
, void *vpn
,
924 void *vpm
, void *vst
, uint32_t desc
)
926 intptr_t row
, col
, oprsz
= simd_maxsz(desc
);
927 uint32_t neg
= simd_data(desc
) << 31;
928 uint16_t *pn
= vpn
, *pm
= vpm
;
932 * Make a copy of float_status because this operation does not
933 * update the cumulative fp exception status. It also produces
936 fpst
= *(float_status
*)vst
;
937 set_default_nan_mode(true, &fpst
);
939 for (row
= 0; row
< oprsz
; ) {
940 uint16_t pa
= pn
[H2(row
>> 4)];
943 void *vza_row
= vza
+ tile_vslice_offset(row
);
944 uint32_t n
= *(uint32_t *)(vzn
+ H1_4(row
)) ^ neg
;
946 for (col
= 0; col
< oprsz
; ) {
947 uint16_t pb
= pm
[H2(col
>> 4)];
950 uint32_t *a
= vza_row
+ H1_4(col
);
951 uint32_t *m
= vzm
+ H1_4(col
);
952 *a
= float32_muladd(n
, *m
, *a
, 0, vst
);
965 void HELPER(sme_fmopa_d
)(void *vza
, void *vzn
, void *vzm
, void *vpn
,
966 void *vpm
, void *vst
, uint32_t desc
)
968 intptr_t row
, col
, oprsz
= simd_oprsz(desc
) / 8;
969 uint64_t neg
= (uint64_t)simd_data(desc
) << 63;
970 uint64_t *za
= vza
, *zn
= vzn
, *zm
= vzm
;
971 uint8_t *pn
= vpn
, *pm
= vpm
;
972 float_status fpst
= *(float_status
*)vst
;
974 set_default_nan_mode(true, &fpst
);
976 for (row
= 0; row
< oprsz
; ++row
) {
977 if (pn
[H1(row
)] & 1) {
978 uint64_t *za_row
= &za
[tile_vslice_index(row
)];
979 uint64_t n
= zn
[row
] ^ neg
;
981 for (col
= 0; col
< oprsz
; ++col
) {
982 if (pm
[H1(col
)] & 1) {
983 uint64_t *a
= &za_row
[col
];
984 *a
= float64_muladd(n
, zm
[col
], *a
, 0, &fpst
);
/*
 * Adjust a pair of 16-bit values packed in PAIR: apply the sign-flip
 * mask NEG, then zero whichever half has its controlling predicate bit
 * clear (bit 0 of PG governs the low half, bit 2 the high half).
 *
 * The pseudocode zeroes first and negates afterwards, conditionally.
 * Negating unconditionally and zeroing afterwards is simpler.
 */
static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg)
{
    uint32_t keep = 0;

    if (pg & 1) {
        keep |= 0x0000ffffu;
    }
    if (pg & 4) {
        keep |= 0xffff0000u;
    }
    return (pair ^ neg) & keep;
}
1011 static float32
f16_dotadd(float32 sum
, uint32_t e1
, uint32_t e2
,
1012 float_status
*s_std
, float_status
*s_odd
)
1014 float64 e1r
= float16_to_float64(e1
& 0xffff, true, s_std
);
1015 float64 e1c
= float16_to_float64(e1
>> 16, true, s_std
);
1016 float64 e2r
= float16_to_float64(e2
& 0xffff, true, s_std
);
1017 float64 e2c
= float16_to_float64(e2
>> 16, true, s_std
);
1022 * The ARM pseudocode function FPDot performs both multiplies
1023 * and the add with a single rounding operation. Emulate this
1024 * by performing the first multiply in round-to-odd, then doing
1025 * the second multiply as fused multiply-add, and rounding to
1026 * float32 all in one step.
1028 t64
= float64_mul(e1r
, e2r
, s_odd
);
1029 t64
= float64r32_muladd(e1c
, e2c
, t64
, 0, s_std
);
1031 /* This conversion is exact, because we've already rounded. */
1032 t32
= float64_to_float32(t64
, s_std
);
1034 /* The final accumulation step is not fused. */
1035 return float32_add(sum
, t32
, s_std
);
1038 void HELPER(sme_fmopa_h
)(void *vza
, void *vzn
, void *vzm
, void *vpn
,
1039 void *vpm
, void *vst
, uint32_t desc
)
1041 intptr_t row
, col
, oprsz
= simd_maxsz(desc
);
1042 uint32_t neg
= simd_data(desc
) * 0x80008000u
;
1043 uint16_t *pn
= vpn
, *pm
= vpm
;
1044 float_status fpst_odd
, fpst_std
;
1047 * Make a copy of float_status because this operation does not
1048 * update the cumulative fp exception status. It also produces
1049 * default nans. Make a second copy with round-to-odd -- see above.
1051 fpst_std
= *(float_status
*)vst
;
1052 set_default_nan_mode(true, &fpst_std
);
1053 fpst_odd
= fpst_std
;
1054 set_float_rounding_mode(float_round_to_odd
, &fpst_odd
);
1056 for (row
= 0; row
< oprsz
; ) {
1057 uint16_t prow
= pn
[H2(row
>> 4)];
1059 void *vza_row
= vza
+ tile_vslice_offset(row
);
1060 uint32_t n
= *(uint32_t *)(vzn
+ H1_4(row
));
1062 n
= f16mop_adj_pair(n
, prow
, neg
);
1064 for (col
= 0; col
< oprsz
; ) {
1065 uint16_t pcol
= pm
[H2(col
>> 4)];
1067 if (prow
& pcol
& 0b0101) {
1068 uint32_t *a
= vza_row
+ H1_4(col
);
1069 uint32_t m
= *(uint32_t *)(vzm
+ H1_4(col
));
1071 m
= f16mop_adj_pair(m
, pcol
, 0);
1072 *a
= f16_dotadd(*a
, n
, m
, &fpst_std
, &fpst_odd
);
1085 void HELPER(sme_bfmopa
)(void *vza
, void *vzn
, void *vzm
, void *vpn
,
1086 void *vpm
, uint32_t desc
)
1088 intptr_t row
, col
, oprsz
= simd_maxsz(desc
);
1089 uint32_t neg
= simd_data(desc
) * 0x80008000u
;
1090 uint16_t *pn
= vpn
, *pm
= vpm
;
1092 for (row
= 0; row
< oprsz
; ) {
1093 uint16_t prow
= pn
[H2(row
>> 4)];
1095 void *vza_row
= vza
+ tile_vslice_offset(row
);
1096 uint32_t n
= *(uint32_t *)(vzn
+ H1_4(row
));
1098 n
= f16mop_adj_pair(n
, prow
, neg
);
1100 for (col
= 0; col
< oprsz
; ) {
1101 uint16_t pcol
= pm
[H2(col
>> 4)];
1103 if (prow
& pcol
& 0b0101) {
1104 uint32_t *a
= vza_row
+ H1_4(col
);
1105 uint32_t m
= *(uint32_t *)(vzm
+ H1_4(col
));
1107 m
= f16mop_adj_pair(m
, pcol
, 0);
1108 *a
= bfdotadd(*a
, n
, m
);
/*
 * Per-element integer outer-product step: takes the 64-bit ZN and ZM
 * slots, the current 64-bit accumulator, a predicate byte and the
 * negate flag, and returns the new accumulator value.
 */
typedef uint64_t IMOPFn(uint64_t, uint64_t, uint64_t, uint8_t, bool);

/*
 * Common driver for the integer outer-product helpers.  Walks every
 * (row, col) pair of 64-bit slots and updates ZA[row][col] through FN,
 * handing it the AND of the row and column predicate bytes.
 */
static inline void do_imopa(uint64_t *za, uint64_t *zn, uint64_t *zm,
                            uint8_t *pn, uint8_t *pm,
                            uint32_t desc, IMOPFn *fn)
{
    const intptr_t nelem = simd_oprsz(desc) / 8;
    const bool neg = simd_data(desc);
    intptr_t row, col;

    for (row = 0; row < nelem; ++row) {
        const uint8_t pa = pn[H1(row)];
        const uint64_t n = zn[row];
        uint64_t *za_row = &za[tile_vslice_index(row)];

        for (col = 0; col < nelem; ++col) {
            const uint8_t pb = pm[H1(col)];

            za_row[col] = fn(n, zm[col], za_row[col], pa & pb, neg);
        }
    }
}
/*
 * Define an IMOPFn for 8-bit inputs and 32-bit accumulators.  The
 * 64-bit slot holds two accumulators; each receives (or, with NEG,
 * has subtracted) the sum of four NTYPE x MTYPE byte products.
 */
#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{                                                                           \
    uint32_t lo = 0, hi = 0;                                                \
    /* Apply P to N as a mask, making the inactive elements 0. */           \
    n &= expand_pred_b(p);                                                  \
    lo += (NTYPE)(n >> 0) * (MTYPE)(m >> 0);                                \
    lo += (NTYPE)(n >> 8) * (MTYPE)(m >> 8);                                \
    lo += (NTYPE)(n >> 16) * (MTYPE)(m >> 16);                              \
    lo += (NTYPE)(n >> 24) * (MTYPE)(m >> 24);                              \
    hi += (NTYPE)(n >> 32) * (MTYPE)(m >> 32);                              \
    hi += (NTYPE)(n >> 40) * (MTYPE)(m >> 40);                              \
    hi += (NTYPE)(n >> 48) * (MTYPE)(m >> 48);                              \
    hi += (NTYPE)(n >> 56) * (MTYPE)(m >> 56);                              \
    if (neg) {                                                              \
        lo = (uint32_t)a - lo;                                              \
        hi = (uint32_t)(a >> 32) - hi;                                      \
    } else {                                                                \
        lo = (uint32_t)a + lo;                                              \
        hi = (uint32_t)(a >> 32) + hi;                                      \
    }                                                                       \
    return ((uint64_t)hi << 32) | lo;                                       \
}
/*
 * Define an IMOPFn for 16-bit inputs and a 64-bit accumulator: the
 * accumulator receives (or, with NEG, has subtracted) the sum of four
 * NTYPE x MTYPE halfword products.
 *
 * Each product is formed in int64_t.  Without the widening cast both
 * operands would be promoted to int, so uint16_t x uint16_t could
 * overflow int (undefined behaviour) and the wrapped result would then
 * be sign-extended into the 64-bit sum.
 */
#define DEF_IMOP_64(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{                                                                           \
    uint64_t sum = 0;                                                       \
    /* Apply P to N as a mask, making the inactive elements 0. */           \
    n &= expand_pred_h(p);                                                  \
    sum += (int64_t)(NTYPE)(n >> 0) * (MTYPE)(m >> 0);                      \
    sum += (int64_t)(NTYPE)(n >> 16) * (MTYPE)(m >> 16);                    \
    sum += (int64_t)(NTYPE)(n >> 32) * (MTYPE)(m >> 32);                    \
    sum += (int64_t)(NTYPE)(n >> 48) * (MTYPE)(m >> 48);                    \
    return neg ? a - sum : a + sum;                                         \
}
1179 DEF_IMOP_32(smopa_s
, int8_t, int8_t)
1180 DEF_IMOP_32(umopa_s
, uint8_t, uint8_t)
1181 DEF_IMOP_32(sumopa_s
, int8_t, uint8_t)
1182 DEF_IMOP_32(usmopa_s
, uint8_t, int8_t)
1184 DEF_IMOP_64(smopa_d
, int16_t, int16_t)
1185 DEF_IMOP_64(umopa_d
, uint16_t, uint16_t)
1186 DEF_IMOP_64(sumopa_d
, int16_t, uint16_t)
1187 DEF_IMOP_64(usmopa_d
, uint16_t, int16_t)
/*
 * Define the TCG helper sme_<NAME>, which runs do_imopa() with the
 * per-element function NAME.
 */
#define DEF_IMOPH(NAME) \
void HELPER(sme_##NAME)(void *vza, void *vzn, void *vzm, void *vpn,      \
                        void *vpm, uint32_t desc)                         \
{                                                                         \
    do_imopa(vza, vzn, vzm, vpn, vpm, desc, NAME);                        \
}