target/arm/sme_helper.c

   1 /*
   2  * ARM SME Operations
   3  *
   4  * Copyright (c) 2022 Linaro, Ltd.
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19
  20 #include "qemu/osdep.h"
  21 #include "cpu.h"
  22 #include "internals.h"
  23 #include "tcg/tcg-gvec-desc.h"
  24 #include "exec/helper-proto.h"
  25 #include "exec/cpu_ldst.h"
  26 #include "exec/exec-all.h"
  27 #include "qemu/int128.h"
  28 #include "fpu/softfloat.h"
  29 #include "vec_internal.h"
  30 #include "sve_ldst_internal.h"
  31
  32 /* ResetSVEState */
  33 void arm_reset_sve_state(CPUARMState *env)
  34 {
  35     memset(env->vfp.zregs, 0, sizeof(env->vfp.zregs));
  36     /* Recall that FFR is stored as pregs[16]. */
  37     memset(env->vfp.pregs, 0, sizeof(env->vfp.pregs));
  38     vfp_set_fpcr(env, 0x0800009f);
  39 }
  40
  41 void helper_set_pstate_sm(CPUARMState *env, uint32_t i)
  42 {
  43     if (i == FIELD_EX64(env->svcr, SVCR, SM)) {
  44         return;
  45     }
  46     env->svcr ^= R_SVCR_SM_MASK;
  47     arm_reset_sve_state(env);
  48 }
  49
  50 void helper_set_pstate_za(CPUARMState *env, uint32_t i)
  51 {
  52     if (i == FIELD_EX64(env->svcr, SVCR, ZA)) {
  53         return;
  54     }
  55     env->svcr ^= R_SVCR_ZA_MASK;
  56
  57     /*
  58      * ResetSMEState.
  59      *
  60      * SetPSTATE_ZA zeros on enable and disable.  We can zero this only
  61      * on enable: while disabled, the storage is inaccessible and the
  62      * value does not matter.  We're not saving the storage in vmstate
  63      * when disabled either.
  64      */
  65     if (i) {
  66         memset(env->zarray, 0, sizeof(env->zarray));
  67     }
  68 }
  69
  70 void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl)
  71 {
  72     uint32_t i;
  73
  74     /*
  75      * Special case clearing the entire ZA space.
  76      * This falls into the CONSTRAINED UNPREDICTABLE zeroing of any
  77      * parts of the ZA storage outside of SVL.
  78      */
  79     if (imm == 0xff) {
  80         memset(env->zarray, 0, sizeof(env->zarray));
  81         return;
  82     }
  83
  84     /*
  85      * Recall that ZAnH.D[m] is spread across ZA[n+8*m],
  86      * so each row is discontiguous within ZA[].
  87      */
  88     for (i = 0; i < svl; i++) {
  89         if (imm & (1 << (i % 8))) {
  90             memset(&env->zarray[i], 0, svl);
  91         }
  92     }
  93 }
  94
  95
/*
 * tile_vslice_index(i): array index of element @i of a vertical slice.
 *
 * When considering the ZA storage as an array of elements of
 * type T, the index within that array of the Nth element of
 * a vertical slice of a tile can be calculated like this,
 * regardless of the size of type T. This is because the tiles
 * are interleaved, so if type T is size N bytes then row 1 of
 * the tile is N rows away from row 0. The division by N to
 * convert a byte offset into an array index and the multiplication
 * by N to convert from vslice-index-within-the-tile to
 * the index within the ZA storage cancel out.
 *
 * The result is meant to be used as a subscript on a pointer to T
 * that addresses element 0 of the slice.
 */
#define tile_vslice_index(i) ((i) * sizeof(ARMVectorReg))
 108
/*
 * tile_vslice_offset(byteoff): byte offset within the ZA storage of
 * the vertical-slice element that is @byteoff bytes into the slice.
 *
 * When doing byte arithmetic on the ZA storage, the element
 * byteoff bytes away in a tile vertical slice is always this
 * many bytes away in the ZA storage, regardless of the
 * size of the tile element, assuming that byteoff is a multiple
 * of the element size. Again this is because of the interleaving
 * of the tiles. For instance if we have 1 byte per element then
 * each row of the ZA storage has one byte of the vslice data,
 * and (counting from 0) byte 8 goes in row 8 of the storage
 * at offset (8 * row-size-in-bytes).
 * If we have 8 bytes per element then each row of the ZA storage
 * has 8 bytes of the data, but there are 8 interleaved tiles and
 * so byte 8 of the data goes into row 1 of the tile,
 * which is again row 8 of the storage, so the offset is still
 * (8 * row-size-in-bytes). Similarly for other element sizes.
 *
 * The result is meant to be added to a byte (void *) pointer that
 * addresses element 0 of the slice.
 */
#define tile_vslice_offset(byteoff) ((byteoff) * sizeof(ARMVectorReg))
 126
 127
 128 /*
 129  * Move Zreg vector to ZArray column.
 130  */
 131 #define DO_MOVA_C(NAME, TYPE, H)                                        \
 132 void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc)          \
 133 {                                                                       \
 134     int i, oprsz = simd_oprsz(desc);                                    \
 135     for (i = 0; i < oprsz; ) {                                          \
 136         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
 137         do {                                                            \
 138             if (pg & 1) {                                               \
 139                 *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \
 140             }                                                           \
 141             i += sizeof(TYPE);                                          \
 142             pg >>= sizeof(TYPE);                                        \
 143         } while (i & 15);                                               \
 144     }                                                                   \
 145 }
 146
 147 DO_MOVA_C(sme_mova_cz_b, uint8_t, H1)
 148 DO_MOVA_C(sme_mova_cz_h, uint16_t, H1_2)
 149 DO_MOVA_C(sme_mova_cz_s, uint32_t, H1_4)
 150
 151 void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc)
 152 {
 153     int i, oprsz = simd_oprsz(desc) / 8;
 154     uint8_t *pg = vg;
 155     uint64_t *n = vn;
 156     uint64_t *a = za;
 157
 158     for (i = 0; i < oprsz; i++) {
 159         if (pg[H1(i)] & 1) {
 160             a[tile_vslice_index(i)] = n[i];
 161         }
 162     }
 163 }
 164
 165 void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc)
 166 {
 167     int i, oprsz = simd_oprsz(desc) / 16;
 168     uint16_t *pg = vg;
 169     Int128 *n = vn;
 170     Int128 *a = za;
 171
 172     /*
 173      * Int128 is used here simply to copy 16 bytes, and to simplify
 174      * the address arithmetic.
 175      */
 176     for (i = 0; i < oprsz; i++) {
 177         if (pg[H2(i)] & 1) {
 178             a[tile_vslice_index(i)] = n[i];
 179         }
 180     }
 181 }
 182
 183 #undef DO_MOVA_C
 184
 185 /*
 186  * Move ZArray column to Zreg vector.
 187  */
 188 #define DO_MOVA_Z(NAME, TYPE, H)                                        \
 189 void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc)          \
 190 {                                                                       \
 191     int i, oprsz = simd_oprsz(desc);                                    \
 192     for (i = 0; i < oprsz; ) {                                          \
 193         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
 194         do {                                                            \
 195             if (pg & 1) {                                               \
 196                 *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \
 197             }                                                           \
 198             i += sizeof(TYPE);                                          \
 199             pg >>= sizeof(TYPE);                                        \
 200         } while (i & 15);                                               \
 201     }                                                                   \
 202 }
 203
 204 DO_MOVA_Z(sme_mova_zc_b, uint8_t, H1)
 205 DO_MOVA_Z(sme_mova_zc_h, uint16_t, H1_2)
 206 DO_MOVA_Z(sme_mova_zc_s, uint32_t, H1_4)
 207
 208 void HELPER(sme_mova_zc_d)(void *vd, void *za, void *vg, uint32_t desc)
 209 {
 210     int i, oprsz = simd_oprsz(desc) / 8;
 211     uint8_t *pg = vg;
 212     uint64_t *d = vd;
 213     uint64_t *a = za;
 214
 215     for (i = 0; i < oprsz; i++) {
 216         if (pg[H1(i)] & 1) {
 217             d[i] = a[tile_vslice_index(i)];
 218         }
 219     }
 220 }
 221
 222 void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc)
 223 {
 224     int i, oprsz = simd_oprsz(desc) / 16;
 225     uint16_t *pg = vg;
 226     Int128 *d = vd;
 227     Int128 *a = za;
 228
 229     /*
 230      * Int128 is used here simply to copy 16 bytes, and to simplify
 231      * the address arithmetic.
 232      */
 233     for (i = 0; i < oprsz; i++, za += sizeof(ARMVectorReg)) {
 234         if (pg[H2(i)] & 1) {
 235             d[i] = a[tile_vslice_index(i)];
 236         }
 237     }
 238 }
 239
 240 #undef DO_MOVA_Z
 241
 242 /*
 243  * Clear elements in a tile slice comprising len bytes.
 244  */
 245
 246 typedef void ClearFn(void *ptr, size_t off, size_t len);
 247
 248 static void clear_horizontal(void *ptr, size_t off, size_t len)
 249 {
 250     memset(ptr + off, 0, len);
 251 }
 252
 253 static void clear_vertical_b(void *vptr, size_t off, size_t len)
 254 {
 255     for (size_t i = 0; i < len; ++i) {
 256         *(uint8_t *)(vptr + tile_vslice_offset(i + off)) = 0;
 257     }
 258 }
 259
 260 static void clear_vertical_h(void *vptr, size_t off, size_t len)
 261 {
 262     for (size_t i = 0; i < len; i += 2) {
 263         *(uint16_t *)(vptr + tile_vslice_offset(i + off)) = 0;
 264     }
 265 }
 266
 267 static void clear_vertical_s(void *vptr, size_t off, size_t len)
 268 {
 269     for (size_t i = 0; i < len; i += 4) {
 270         *(uint32_t *)(vptr + tile_vslice_offset(i + off)) = 0;
 271     }
 272 }
 273
 274 static void clear_vertical_d(void *vptr, size_t off, size_t len)
 275 {
 276     for (size_t i = 0; i < len; i += 8) {
 277         *(uint64_t *)(vptr + tile_vslice_offset(i + off)) = 0;
 278     }
 279 }
 280
 281 static void clear_vertical_q(void *vptr, size_t off, size_t len)
 282 {
 283     for (size_t i = 0; i < len; i += 16) {
 284         memset(vptr + tile_vslice_offset(i + off), 0, 16);
 285     }
 286 }
 287
 288 /*
 289  * Copy elements from an array into a tile slice comprising len bytes.
 290  */
 291
 292 typedef void CopyFn(void *dst, const void *src, size_t len);
 293
 294 static void copy_horizontal(void *dst, const void *src, size_t len)
 295 {
 296     memcpy(dst, src, len);
 297 }
 298
 299 static void copy_vertical_b(void *vdst, const void *vsrc, size_t len)
 300 {
 301     const uint8_t *src = vsrc;
 302     uint8_t *dst = vdst;
 303     size_t i;
 304
 305     for (i = 0; i < len; ++i) {
 306         dst[tile_vslice_index(i)] = src[i];
 307     }
 308 }
 309
 310 static void copy_vertical_h(void *vdst, const void *vsrc, size_t len)
 311 {
 312     const uint16_t *src = vsrc;
 313     uint16_t *dst = vdst;
 314     size_t i;
 315
 316     for (i = 0; i < len / 2; ++i) {
 317         dst[tile_vslice_index(i)] = src[i];
 318     }
 319 }
 320
 321 static void copy_vertical_s(void *vdst, const void *vsrc, size_t len)
 322 {
 323     const uint32_t *src = vsrc;
 324     uint32_t *dst = vdst;
 325     size_t i;
 326
 327     for (i = 0; i < len / 4; ++i) {
 328         dst[tile_vslice_index(i)] = src[i];
 329     }
 330 }
 331
 332 static void copy_vertical_d(void *vdst, const void *vsrc, size_t len)
 333 {
 334     const uint64_t *src = vsrc;
 335     uint64_t *dst = vdst;
 336     size_t i;
 337
 338     for (i = 0; i < len / 8; ++i) {
 339         dst[tile_vslice_index(i)] = src[i];
 340     }
 341 }
 342
 343 static void copy_vertical_q(void *vdst, const void *vsrc, size_t len)
 344 {
 345     for (size_t i = 0; i < len; i += 16) {
 346         memcpy(vdst + tile_vslice_offset(i), vsrc + i, 16);
 347     }
 348 }
 349
 350 /*
 351  * Host and TLB primitives for vertical tile slice addressing.
 352  */
 353
 354 #define DO_LD(NAME, TYPE, HOST, TLB)                                        \
 355 static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
 356 {                                                                           \
 357     TYPE val = HOST(host);                                                  \
 358     *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
 359 }                                                                           \
 360 static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
 361                         intptr_t off, target_ulong addr, uintptr_t ra)      \
 362 {                                                                           \
 363     TYPE val = TLB(env, useronly_clean_ptr(addr), ra);                      \
 364     *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
 365 }
 366
 367 #define DO_ST(NAME, TYPE, HOST, TLB)                                        \
 368 static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
 369 {                                                                           \
 370     TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
 371     HOST(host, val);                                                        \
 372 }                                                                           \
 373 static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
 374                         intptr_t off, target_ulong addr, uintptr_t ra)      \
 375 {                                                                           \
 376     TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
 377     TLB(env, useronly_clean_ptr(addr), val, ra);                            \
 378 }
 379
 380 /*
 381  * The ARMVectorReg elements are stored in host-endian 64-bit units.
 382  * For 128-bit quantities, the sequence defined by the Elem[] pseudocode
 383  * corresponds to storing the two 64-bit pieces in little-endian order.
 384  */
 385 #define DO_LDQ(HNAME, VNAME, BE, HOST, TLB)                                 \
 386 static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
 387 {                                                                           \
 388     uint64_t val0 = HOST(host), val1 = HOST(host + 8);                      \
 389     uint64_t *ptr = za + off;                                               \
 390     ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
 391 }                                                                           \
 392 static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
 393 {                                                                           \
 394     HNAME##_host(za, tile_vslice_offset(off), host);                        \
 395 }                                                                           \
 396 static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
 397                                target_ulong addr, uintptr_t ra)             \
 398 {                                                                           \
 399     uint64_t val0 = TLB(env, useronly_clean_ptr(addr), ra);                 \
 400     uint64_t val1 = TLB(env, useronly_clean_ptr(addr + 8), ra);             \
 401     uint64_t *ptr = za + off;                                               \
 402     ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
 403 }                                                                           \
 404 static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
 405                                target_ulong addr, uintptr_t ra)             \
 406 {                                                                           \
 407     HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
 408 }
 409
 410 #define DO_STQ(HNAME, VNAME, BE, HOST, TLB)                                 \
 411 static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
 412 {                                                                           \
 413     uint64_t *ptr = za + off;                                               \
 414     HOST(host, ptr[BE]);                                                    \
 415     HOST(host + 1, ptr[!BE]);                                               \
 416 }                                                                           \
 417 static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
 418 {                                                                           \
 419     HNAME##_host(za, tile_vslice_offset(off), host);                        \
 420 }                                                                           \
 421 static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
 422                                target_ulong addr, uintptr_t ra)             \
 423 {                                                                           \
 424     uint64_t *ptr = za + off;                                               \
 425     TLB(env, useronly_clean_ptr(addr), ptr[BE], ra);                        \
 426     TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra);                   \
 427 }                                                                           \
 428 static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
 429                                target_ulong addr, uintptr_t ra)             \
 430 {                                                                           \
 431     HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
 432 }
 433
/*
 * Vertical-slice load primitives sme_ld1*_v_host / sme_ld1*_v_tlb,
 * one pair per element size (b/h/s/d) and byte order.
 */
DO_LD(ld1b, uint8_t, ldub_p, cpu_ldub_data_ra)
DO_LD(ld1h_be, uint16_t, lduw_be_p, cpu_lduw_be_data_ra)
DO_LD(ld1h_le, uint16_t, lduw_le_p, cpu_lduw_le_data_ra)
DO_LD(ld1s_be, uint32_t, ldl_be_p, cpu_ldl_be_data_ra)
DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra)
DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra)
DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra)

/*
 * 128-bit loads: these define both the plain sve_ld1qq_* primitives
 * and the vertical-slice sme_ld1q_*_v_* wrappers around them.
 */
DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra)
DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra)

/*
 * Vertical-slice store primitives sme_st1*_v_host / sme_st1*_v_tlb,
 * one pair per element size (b/h/s/d) and byte order.
 */
DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra)
DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra)
DO_ST(st1h_le, uint16_t, stw_le_p, cpu_stw_le_data_ra)
DO_ST(st1s_be, uint32_t, stl_be_p, cpu_stl_be_data_ra)
DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra)
DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra)
DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra)

/*
 * 128-bit stores: these define both the plain sve_st1qq_* primitives
 * and the vertical-slice sme_st1q_*_v_* wrappers around them.
 */
DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra)
DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra)

/* The generator names are reused below for the helper entry points. */
#undef DO_LD
#undef DO_ST
#undef DO_LDQ
#undef DO_STQ
 460
 461 /*
 462  * Common helper for all contiguous predicated loads.
 463  */
 464
 465 static inline QEMU_ALWAYS_INLINE
 466 void sme_ld1(CPUARMState *env, void *za, uint64_t *vg,
 467              const target_ulong addr, uint32_t desc, const uintptr_t ra,
 468              const int esz, uint32_t mtedesc, bool vertical,
 469              sve_ldst1_host_fn *host_fn,
 470              sve_ldst1_tlb_fn *tlb_fn,
 471              ClearFn *clr_fn,
 472              CopyFn *cpy_fn)
 473 {
 474     const intptr_t reg_max = simd_oprsz(desc);
 475     const intptr_t esize = 1 << esz;
 476     intptr_t reg_off, reg_last;
 477     SVEContLdSt info;
 478     void *host;
 479     int flags;
 480
 481     /* Find the active elements.  */
 482     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
 483         /* The entire predicate was false; no load occurs.  */
 484         clr_fn(za, 0, reg_max);
 485         return;
 486     }
 487
 488     /* Probe the page(s).  Exit with exception for any invalid page. */
 489     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra);
 490
 491     /* Handle watchpoints for all active elements. */
 492     sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
 493                               BP_MEM_READ, ra);
 494
 495     /*
 496      * Handle mte checks for all active elements.
 497      * Since TBI must be set for MTE, !mtedesc => !mte_active.
 498      */
 499     if (mtedesc) {
 500         sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
 501                                 mtedesc, ra);
 502     }
 503
 504     flags = info.page[0].flags | info.page[1].flags;
 505     if (unlikely(flags != 0)) {
 506 #ifdef CONFIG_USER_ONLY
 507         g_assert_not_reached();
 508 #else
 509         /*
 510          * At least one page includes MMIO.
 511          * Any bus operation can fail with cpu_transaction_failed,
 512          * which for ARM will raise SyncExternal.  Perform the load
 513          * into scratch memory to preserve register state until the end.
 514          */
 515         ARMVectorReg scratch = { };
 516
 517         reg_off = info.reg_off_first[0];
 518         reg_last = info.reg_off_last[1];
 519         if (reg_last < 0) {
 520             reg_last = info.reg_off_split;
 521             if (reg_last < 0) {
 522                 reg_last = info.reg_off_last[0];
 523             }
 524         }
 525
 526         do {
 527             uint64_t pg = vg[reg_off >> 6];
 528             do {
 529                 if ((pg >> (reg_off & 63)) & 1) {
 530                     tlb_fn(env, &scratch, reg_off, addr + reg_off, ra);
 531                 }
 532                 reg_off += esize;
 533             } while (reg_off & 63);
 534         } while (reg_off <= reg_last);
 535
 536         cpy_fn(za, &scratch, reg_max);
 537         return;
 538 #endif
 539     }
 540
 541     /* The entire operation is in RAM, on valid pages. */
 542
 543     reg_off = info.reg_off_first[0];
 544     reg_last = info.reg_off_last[0];
 545     host = info.page[0].host;
 546
 547     if (!vertical) {
 548         memset(za, 0, reg_max);
 549     } else if (reg_off) {
 550         clr_fn(za, 0, reg_off);
 551     }
 552
 553     while (reg_off <= reg_last) {
 554         uint64_t pg = vg[reg_off >> 6];
 555         do {
 556             if ((pg >> (reg_off & 63)) & 1) {
 557                 host_fn(za, reg_off, host + reg_off);
 558             } else if (vertical) {
 559                 clr_fn(za, reg_off, esize);
 560             }
 561             reg_off += esize;
 562         } while (reg_off <= reg_last && (reg_off & 63));
 563     }
 564
 565     /*
 566      * Use the slow path to manage the cross-page misalignment.
 567      * But we know this is RAM and cannot trap.
 568      */
 569     reg_off = info.reg_off_split;
 570     if (unlikely(reg_off >= 0)) {
 571         tlb_fn(env, za, reg_off, addr + reg_off, ra);
 572     }
 573
 574     reg_off = info.reg_off_first[1];
 575     if (unlikely(reg_off >= 0)) {
 576         reg_last = info.reg_off_last[1];
 577         host = info.page[1].host;
 578
 579         do {
 580             uint64_t pg = vg[reg_off >> 6];
 581             do {
 582                 if ((pg >> (reg_off & 63)) & 1) {
 583                     host_fn(za, reg_off, host + reg_off);
 584                 } else if (vertical) {
 585                     clr_fn(za, reg_off, esize);
 586                 }
 587                 reg_off += esize;
 588             } while (reg_off & 63);
 589         } while (reg_off <= reg_last);
 590     }
 591 }
 592
 593 static inline QEMU_ALWAYS_INLINE
 594 void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg,
 595                  target_ulong addr, uint32_t desc, uintptr_t ra,
 596                  const int esz, bool vertical,
 597                  sve_ldst1_host_fn *host_fn,
 598                  sve_ldst1_tlb_fn *tlb_fn,
 599                  ClearFn *clr_fn,
 600                  CopyFn *cpy_fn)
 601 {
 602     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 603     int bit55 = extract64(addr, 55, 1);
 604
 605     /* Remove mtedesc from the normal sve descriptor. */
 606     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 607
 608     /* Perform gross MTE suppression early. */
 609     if (!tbi_check(desc, bit55) ||
 610         tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
 611         mtedesc = 0;
 612     }
 613
 614     sme_ld1(env, za, vg, addr, desc, ra, esz, mtedesc, vertical,
 615             host_fn, tlb_fn, clr_fn, cpy_fn);
 616 }
 617
 618 #define DO_LD(L, END, ESZ)                                                 \
 619 void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg,     \
 620                                  target_ulong addr, uint32_t desc)         \
 621 {                                                                          \
 622     sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,               \
 623             sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,           \
 624             clear_horizontal, copy_horizontal);                            \
 625 }                                                                          \
 626 void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg,     \
 627                                  target_ulong addr, uint32_t desc)         \
 628 {                                                                          \
 629     sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                \
 630             sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,             \
 631             clear_vertical_##L, copy_vertical_##L);                        \
 632 }                                                                          \
 633 void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
 634                                      target_ulong addr, uint32_t desc)     \
 635 {                                                                          \
 636     sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,              \
 637                 sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,       \
 638                 clear_horizontal, copy_horizontal);                        \
 639 }                                                                          \
 640 void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
 641                                      target_ulong addr, uint32_t desc)     \
 642 {                                                                          \
 643     sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,               \
 644                 sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,         \
 645                 clear_vertical_##L, copy_vertical_##L);                    \
 646 }
 647
 648 DO_LD(b, , MO_8)
 649 DO_LD(h, _be, MO_16)
 650 DO_LD(h, _le, MO_16)
 651 DO_LD(s, _be, MO_32)
 652 DO_LD(s, _le, MO_32)
 653 DO_LD(d, _be, MO_64)
 654 DO_LD(d, _le, MO_64)
 655 DO_LD(q, _be, MO_128)
 656 DO_LD(q, _le, MO_128)
 657
 658 #undef DO_LD
 659
 660 /*
 661  * Common helper for all contiguous predicated stores.
 662  */
 663
 664 static inline QEMU_ALWAYS_INLINE
 665 void sme_st1(CPUARMState *env, void *za, uint64_t *vg,
 666              const target_ulong addr, uint32_t desc, const uintptr_t ra,
 667              const int esz, uint32_t mtedesc, bool vertical,
 668              sve_ldst1_host_fn *host_fn,
 669              sve_ldst1_tlb_fn *tlb_fn)
 670 {
 671     const intptr_t reg_max = simd_oprsz(desc);
 672     const intptr_t esize = 1 << esz;
 673     intptr_t reg_off, reg_last;
 674     SVEContLdSt info;
 675     void *host;
 676     int flags;
 677
 678     /* Find the active elements.  */
 679     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
 680         /* The entire predicate was false; no store occurs.  */
 681         return;
 682     }
 683
 684     /* Probe the page(s).  Exit with exception for any invalid page. */
 685     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra);
 686
 687     /* Handle watchpoints for all active elements. */
 688     sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
 689                               BP_MEM_WRITE, ra);
 690
 691     /*
 692      * Handle mte checks for all active elements.
 693      * Since TBI must be set for MTE, !mtedesc => !mte_active.
 694      */
 695     if (mtedesc) {
 696         sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
 697                                 mtedesc, ra);
 698     }
 699
 700     flags = info.page[0].flags | info.page[1].flags;
 701     if (unlikely(flags != 0)) {
 702 #ifdef CONFIG_USER_ONLY
 703         g_assert_not_reached();
 704 #else
 705         /*
 706          * At least one page includes MMIO.
 707          * Any bus operation can fail with cpu_transaction_failed,
 708          * which for ARM will raise SyncExternal.  We cannot avoid
 709          * this fault and will leave with the store incomplete.
 710          */
 711         reg_off = info.reg_off_first[0];
 712         reg_last = info.reg_off_last[1];
 713         if (reg_last < 0) {
 714             reg_last = info.reg_off_split;
 715             if (reg_last < 0) {
 716                 reg_last = info.reg_off_last[0];
 717             }
 718         }
 719
 720         do {
 721             uint64_t pg = vg[reg_off >> 6];
 722             do {
 723                 if ((pg >> (reg_off & 63)) & 1) {
 724                     tlb_fn(env, za, reg_off, addr + reg_off, ra);
 725                 }
 726                 reg_off += esize;
 727             } while (reg_off & 63);
 728         } while (reg_off <= reg_last);
 729         return;
 730 #endif
 731     }
 732
 733     reg_off = info.reg_off_first[0];
 734     reg_last = info.reg_off_last[0];
 735     host = info.page[0].host;
 736
 737     while (reg_off <= reg_last) {
 738         uint64_t pg = vg[reg_off >> 6];
 739         do {
 740             if ((pg >> (reg_off & 63)) & 1) {
 741                 host_fn(za, reg_off, host + reg_off);
 742             }
 743             reg_off += 1 << esz;
 744         } while (reg_off <= reg_last && (reg_off & 63));
 745     }
 746
 747     /*
 748      * Use the slow path to manage the cross-page misalignment.
 749      * But we know this is RAM and cannot trap.
 750      */
 751     reg_off = info.reg_off_split;
 752     if (unlikely(reg_off >= 0)) {
 753         tlb_fn(env, za, reg_off, addr + reg_off, ra);
 754     }
 755
 756     reg_off = info.reg_off_first[1];
 757     if (unlikely(reg_off >= 0)) {
 758         reg_last = info.reg_off_last[1];
 759         host = info.page[1].host;
 760
 761         do {
 762             uint64_t pg = vg[reg_off >> 6];
 763             do {
 764                 if ((pg >> (reg_off & 63)) & 1) {
 765                     host_fn(za, reg_off, host + reg_off);
 766                 }
 767                 reg_off += 1 << esz;
 768             } while (reg_off & 63);
 769         } while (reg_off <= reg_last);
 770     }
 771 }
 772
 773 static inline QEMU_ALWAYS_INLINE
 774 void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr,
 775                  uint32_t desc, uintptr_t ra, int esz, bool vertical,
 776                  sve_ldst1_host_fn *host_fn,
 777                  sve_ldst1_tlb_fn *tlb_fn)
 778 {
 779     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 780     int bit55 = extract64(addr, 55, 1);
 781
 782     /* Remove mtedesc from the normal sve descriptor. */
 783     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 784
 785     /* Perform gross MTE suppression early. */
 786     if (!tbi_check(desc, bit55) ||
 787         tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
 788         mtedesc = 0;
 789     }
 790
 791     sme_st1(env, za, vg, addr, desc, ra, esz, mtedesc,
 792             vertical, host_fn, tlb_fn);
 793 }
 794
 795 #define DO_ST(L, END, ESZ)                                                 \
 796 void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg,     \
 797                                  target_ulong addr, uint32_t desc)         \
 798 {                                                                          \
 799     sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,               \
 800             sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);          \
 801 }                                                                          \
 802 void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg,     \
 803                                  target_ulong addr, uint32_t desc)         \
 804 {                                                                          \
 805     sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                \
 806             sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);            \
 807 }                                                                          \
 808 void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
 809                                      target_ulong addr, uint32_t desc)     \
 810 {                                                                          \
 811     sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,              \
 812                 sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);      \
 813 }                                                                          \
 814 void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
 815                                      target_ulong addr, uint32_t desc)     \
 816 {                                                                          \
 817     sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,               \
 818                 sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);        \
 819 }
 820
 821 DO_ST(b, , MO_8)
 822 DO_ST(h, _be, MO_16)
 823 DO_ST(h, _le, MO_16)
 824 DO_ST(s, _be, MO_32)
 825 DO_ST(s, _le, MO_32)
 826 DO_ST(d, _be, MO_64)
 827 DO_ST(d, _le, MO_64)
 828 DO_ST(q, _be, MO_128)
 829 DO_ST(q, _le, MO_128)
 830
 831 #undef DO_ST
 832
 833 void HELPER(sme_addha_s)(void *vzda, void *vzn, void *vpn,
 834                          void *vpm, uint32_t desc)
 835 {
 836     intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
 837     uint64_t *pn = vpn, *pm = vpm;
 838     uint32_t *zda = vzda, *zn = vzn;
 839
 840     for (row = 0; row < oprsz; ) {
 841         uint64_t pa = pn[row >> 4];
 842         do {
 843             if (pa & 1) {
 844                 for (col = 0; col < oprsz; ) {
 845                     uint64_t pb = pm[col >> 4];
 846                     do {
 847                         if (pb & 1) {
 848                             zda[tile_vslice_index(row) + H4(col)] += zn[H4(col)];
 849                         }
 850                         pb >>= 4;
 851                     } while (++col & 15);
 852                 }
 853             }
 854             pa >>= 4;
 855         } while (++row & 15);
 856     }
 857 }
 858
 859 void HELPER(sme_addha_d)(void *vzda, void *vzn, void *vpn,
 860                          void *vpm, uint32_t desc)
 861 {
 862     intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
 863     uint8_t *pn = vpn, *pm = vpm;
 864     uint64_t *zda = vzda, *zn = vzn;
 865
 866     for (row = 0; row < oprsz; ++row) {
 867         if (pn[H1(row)] & 1) {
 868             for (col = 0; col < oprsz; ++col) {
 869                 if (pm[H1(col)] & 1) {
 870                     zda[tile_vslice_index(row) + col] += zn[col];
 871                 }
 872             }
 873         }
 874     }
 875 }
 876
 877 void HELPER(sme_addva_s)(void *vzda, void *vzn, void *vpn,
 878                          void *vpm, uint32_t desc)
 879 {
 880     intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
 881     uint64_t *pn = vpn, *pm = vpm;
 882     uint32_t *zda = vzda, *zn = vzn;
 883
 884     for (row = 0; row < oprsz; ) {
 885         uint64_t pa = pn[row >> 4];
 886         do {
 887             if (pa & 1) {
 888                 uint32_t zn_row = zn[H4(row)];
 889                 for (col = 0; col < oprsz; ) {
 890                     uint64_t pb = pm[col >> 4];
 891                     do {
 892                         if (pb & 1) {
 893                             zda[tile_vslice_index(row) + H4(col)] += zn_row;
 894                         }
 895                         pb >>= 4;
 896                     } while (++col & 15);
 897                 }
 898             }
 899             pa >>= 4;
 900         } while (++row & 15);
 901     }
 902 }
 903
 904 void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn,
 905                          void *vpm, uint32_t desc)
 906 {
 907     intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
 908     uint8_t *pn = vpn, *pm = vpm;
 909     uint64_t *zda = vzda, *zn = vzn;
 910
 911     for (row = 0; row < oprsz; ++row) {
 912         if (pn[H1(row)] & 1) {
 913             uint64_t zn_row = zn[row];
 914             for (col = 0; col < oprsz; ++col) {
 915                 if (pm[H1(col)] & 1) {
 916                     zda[tile_vslice_index(row) + col] += zn_row;
 917                 }
 918             }
 919         }
 920     }
 921 }
 922
 923 void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn,
 924                          void *vpm, void *vst, uint32_t desc)
 925 {
 926     intptr_t row, col, oprsz = simd_maxsz(desc);
 927     uint32_t neg = simd_data(desc) << 31;
 928     uint16_t *pn = vpn, *pm = vpm;
 929     float_status fpst;
 930
 931     /*
 932      * Make a copy of float_status because this operation does not
 933      * update the cumulative fp exception status.  It also produces
 934      * default nans.
 935      */
 936     fpst = *(float_status *)vst;
 937     set_default_nan_mode(true, &fpst);
 938
 939     for (row = 0; row < oprsz; ) {
 940         uint16_t pa = pn[H2(row >> 4)];
 941         do {
 942             if (pa & 1) {
 943                 void *vza_row = vza + tile_vslice_offset(row);
 944                 uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg;
 945
 946                 for (col = 0; col < oprsz; ) {
 947                     uint16_t pb = pm[H2(col >> 4)];
 948                     do {
 949                         if (pb & 1) {
 950                             uint32_t *a = vza_row + H1_4(col);
 951                             uint32_t *m = vzm + H1_4(col);
 952                             *a = float32_muladd(n, *m, *a, 0, vst);
 953                         }
 954                         col += 4;
 955                         pb >>= 4;
 956                     } while (col & 15);
 957                 }
 958             }
 959             row += 4;
 960             pa >>= 4;
 961         } while (row & 15);
 962     }
 963 }
 964
 965 void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn,
 966                          void *vpm, void *vst, uint32_t desc)
 967 {
 968     intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
 969     uint64_t neg = (uint64_t)simd_data(desc) << 63;
 970     uint64_t *za = vza, *zn = vzn, *zm = vzm;
 971     uint8_t *pn = vpn, *pm = vpm;
 972     float_status fpst = *(float_status *)vst;
 973
 974     set_default_nan_mode(true, &fpst);
 975
 976     for (row = 0; row < oprsz; ++row) {
 977         if (pn[H1(row)] & 1) {
 978             uint64_t *za_row = &za[tile_vslice_index(row)];
 979             uint64_t n = zn[row] ^ neg;
 980
 981             for (col = 0; col < oprsz; ++col) {
 982                 if (pm[H1(col)] & 1) {
 983                     uint64_t *a = &za_row[col];
 984                     *a = float64_muladd(n, zm[col], *a, 0, &fpst);
 985                 }
 986             }
 987         }
 988     }
 989 }
 990
 991 /*
 992  * Alter PAIR as needed for controlling predicates being false,
 993  * and for NEG on an enabled row element.
 994  */
 995 static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg)
 996 {
 997     /*
 998      * The pseudocode uses a conditional negate after the conditional zero.
 999      * It is simpler here to unconditionally negate before conditional zero.
1000      */
1001     pair ^= neg;
1002     if (!(pg & 1)) {
1003         pair &= 0xffff0000u;
1004     }
1005     if (!(pg & 4)) {
1006         pair &= 0x0000ffffu;
1007     }
1008     return pair;
1009 }
1010
1011 static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2,
1012                           float_status *s_std, float_status *s_odd)
1013 {
1014     float64 e1r = float16_to_float64(e1 & 0xffff, true, s_std);
1015     float64 e1c = float16_to_float64(e1 >> 16, true, s_std);
1016     float64 e2r = float16_to_float64(e2 & 0xffff, true, s_std);
1017     float64 e2c = float16_to_float64(e2 >> 16, true, s_std);
1018     float64 t64;
1019     float32 t32;
1020
1021     /*
1022      * The ARM pseudocode function FPDot performs both multiplies
1023      * and the add with a single rounding operation.  Emulate this
1024      * by performing the first multiply in round-to-odd, then doing
1025      * the second multiply as fused multiply-add, and rounding to
1026      * float32 all in one step.
1027      */
1028     t64 = float64_mul(e1r, e2r, s_odd);
1029     t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std);
1030
1031     /* This conversion is exact, because we've already rounded. */
1032     t32 = float64_to_float32(t64, s_std);
1033
1034     /* The final accumulation step is not fused. */
1035     return float32_add(sum, t32, s_std);
1036 }
1037
1038 void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
1039                          void *vpm, void *vst, uint32_t desc)
1040 {
1041     intptr_t row, col, oprsz = simd_maxsz(desc);
1042     uint32_t neg = simd_data(desc) * 0x80008000u;
1043     uint16_t *pn = vpn, *pm = vpm;
1044     float_status fpst_odd, fpst_std;
1045
1046     /*
1047      * Make a copy of float_status because this operation does not
1048      * update the cumulative fp exception status.  It also produces
1049      * default nans.  Make a second copy with round-to-odd -- see above.
1050      */
1051     fpst_std = *(float_status *)vst;
1052     set_default_nan_mode(true, &fpst_std);
1053     fpst_odd = fpst_std;
1054     set_float_rounding_mode(float_round_to_odd, &fpst_odd);
1055
1056     for (row = 0; row < oprsz; ) {
1057         uint16_t prow = pn[H2(row >> 4)];
1058         do {
1059             void *vza_row = vza + tile_vslice_offset(row);
1060             uint32_t n = *(uint32_t *)(vzn + H1_4(row));
1061
1062             n = f16mop_adj_pair(n, prow, neg);
1063
1064             for (col = 0; col < oprsz; ) {
1065                 uint16_t pcol = pm[H2(col >> 4)];
1066                 do {
1067                     if (prow & pcol & 0b0101) {
1068                         uint32_t *a = vza_row + H1_4(col);
1069                         uint32_t m = *(uint32_t *)(vzm + H1_4(col));
1070
1071                         m = f16mop_adj_pair(m, pcol, 0);
1072                         *a = f16_dotadd(*a, n, m, &fpst_std, &fpst_odd);
1073
1074                         col += 4;
1075                         pcol >>= 4;
1076                     }
1077                 } while (col & 15);
1078             }
1079             row += 4;
1080             prow >>= 4;
1081         } while (row & 15);
1082     }
1083 }
1084
1085 void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vpn,
1086                         void *vpm, uint32_t desc)
1087 {
1088     intptr_t row, col, oprsz = simd_maxsz(desc);
1089     uint32_t neg = simd_data(desc) * 0x80008000u;
1090     uint16_t *pn = vpn, *pm = vpm;
1091
1092     for (row = 0; row < oprsz; ) {
1093         uint16_t prow = pn[H2(row >> 4)];
1094         do {
1095             void *vza_row = vza + tile_vslice_offset(row);
1096             uint32_t n = *(uint32_t *)(vzn + H1_4(row));
1097
1098             n = f16mop_adj_pair(n, prow, neg);
1099
1100             for (col = 0; col < oprsz; ) {
1101                 uint16_t pcol = pm[H2(col >> 4)];
1102                 do {
1103                     if (prow & pcol & 0b0101) {
1104                         uint32_t *a = vza_row + H1_4(col);
1105                         uint32_t m = *(uint32_t *)(vzm + H1_4(col));
1106
1107                         m = f16mop_adj_pair(m, pcol, 0);
1108                         *a = bfdotadd(*a, n, m);
1109
1110                         col += 4;
1111                         pcol >>= 4;
1112                     }
1113                 } while (col & 15);
1114             }
1115             row += 4;
1116             prow >>= 4;
1117         } while (row & 15);
1118     }
1119 }
1120
/*
 * Per-element kernel for the integer outer products:
 * (n, m, accumulator, predicate byte, negate) -> new accumulator.
 */
typedef uint64_t IMOPFn(uint64_t, uint64_t, uint64_t, uint8_t, bool);

/*
 * Walk a tile of 64-bit containers and replace each one with
 * FN(zn[row], zm[col], old value, pn[row] & pm[col], neg).
 *
 * Unlike the floating-point helpers, FN is called for every element;
 * the combined predicate byte is passed down so that FN can do the
 * masking itself.  NEG is simd_data(desc) converted to bool.
 */
static inline void do_imopa(uint64_t *za, uint64_t *zn, uint64_t *zm,
                            uint8_t *pn, uint8_t *pm,
                            uint32_t desc, IMOPFn *fn)
{
    const intptr_t nelem = simd_oprsz(desc) / 8;
    const bool subtract = simd_data(desc);
    intptr_t r, c;

    for (r = 0; r < nelem; r++) {
        const uint8_t row_pg = pn[H1(r)];
        uint64_t *tile_row = za + tile_vslice_index(r);
        const uint64_t n = zn[r];

        for (c = 0; c < nelem; c++) {
            const uint8_t pg = row_pg & pm[H1(c)];

            tile_row[c] = fn(n, zm[c], tile_row[c], pg, subtract);
        }
    }
}
1143
/*
 * Define NAME as an IMOPFn kernel operating on two 32-bit accumulators
 * packed in a 64-bit container.  Each accumulator receives the sum of
 * four byte products: bytes 0-3 of N and M feed the low accumulator,
 * bytes 4-7 the high one.  NTYPE and MTYPE select the signedness in
 * which the bytes of N and M are interpreted.  When NEG is set the
 * sums are subtracted from the accumulators instead of added.
 */
#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{                                                                           \
    uint32_t acc_lo = (uint32_t)a, acc_hi = (uint32_t)(a >> 32);            \
    uint32_t dot_lo = 0, dot_hi = 0;                                        \
    int sh;                                                                 \
                                                                            \
    /* Apply P to N as a mask, making the inactive elements 0. */           \
    n &= expand_pred_b(p);                                                  \
    for (sh = 0; sh < 32; sh += 8) {                                        \
        dot_lo += (NTYPE)(n >> sh) * (MTYPE)(m >> sh);                      \
        dot_hi += (NTYPE)(n >> (sh + 32)) * (MTYPE)(m >> (sh + 32));        \
    }                                                                       \
    if (neg) {                                                              \
        acc_lo -= dot_lo;                                                   \
        acc_hi -= dot_hi;                                                   \
    } else {                                                                \
        acc_lo += dot_lo;                                                   \
        acc_hi += dot_hi;                                                   \
    }                                                                       \
    return ((uint64_t)acc_hi << 32) | acc_lo;                               \
}
1165
1166 #define DEF_IMOP_64(NAME, NTYPE, MTYPE) \
1167 static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
1168 {                                                                           \
1169     uint64_t sum = 0;                                                       \
1170     /* Apply P to N as a mask, making the inactive elements 0. */           \
1171     n &= expand_pred_h(p);                                                  \
1172     sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0);                               \
1173     sum += (NTYPE)(n >> 16) * (MTYPE)(m >> 16);                             \
1174     sum += (NTYPE)(n >> 32) * (MTYPE)(m >> 32);                             \
1175     sum += (NTYPE)(n >> 48) * (MTYPE)(m >> 48);                             \
1176     return neg ? a - sum : a + sum;                                         \
1177 }
1178
1179 DEF_IMOP_32(smopa_s, int8_t, int8_t)
1180 DEF_IMOP_32(umopa_s, uint8_t, uint8_t)
1181 DEF_IMOP_32(sumopa_s, int8_t, uint8_t)
1182 DEF_IMOP_32(usmopa_s, uint8_t, int8_t)
1183
1184 DEF_IMOP_64(smopa_d, int16_t, int16_t)
1185 DEF_IMOP_64(umopa_d, uint16_t, uint16_t)
1186 DEF_IMOP_64(sumopa_d, int16_t, uint16_t)
1187 DEF_IMOP_64(usmopa_d, uint16_t, int16_t)
1188
1189 #define DEF_IMOPH(NAME) \
1190     void HELPER(sme_##NAME)(void *vza, void *vzn, void *vzm, void *vpn,      \
1191                             void *vpm, uint32_t desc)                        \
1192     { do_imopa(vza, vzn, vzm, vpn, vpm, desc, NAME); }
1193
1194 DEF_IMOPH(smopa_s)
1195 DEF_IMOPH(umopa_s)
1196 DEF_IMOPH(sumopa_s)
1197 DEF_IMOPH(usmopa_s)
1198 DEF_IMOPH(smopa_d)
1199 DEF_IMOPH(umopa_d)
1200 DEF_IMOPH(sumopa_d)
1201 DEF_IMOPH(usmopa_d)