target/arm/sme_helper.c

   1 /*
   2  * ARM SME Operations
   3  *
   4  * Copyright (c) 2022 Linaro, Ltd.
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19
  20 #include "qemu/osdep.h"
  21 #include "cpu.h"
  22 #include "internals.h"
  23 #include "tcg/tcg-gvec-desc.h"
  24 #include "exec/helper-proto.h"
  25 #include "exec/cpu_ldst.h"
  26 #include "exec/exec-all.h"
  27 #include "qemu/int128.h"
  28 #include "fpu/softfloat.h"
  29 #include "vec_internal.h"
  30 #include "sve_ldst_internal.h"
  31
  32 void helper_set_svcr(CPUARMState *env, uint32_t val, uint32_t mask)
  33 {
  34     aarch64_set_svcr(env, val, mask);
  35 }
  36
  37 void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl)
  38 {
  39     uint32_t i;
  40
  41     /*
  42      * Special case clearing the entire ZA space.
  43      * This falls into the CONSTRAINED UNPREDICTABLE zeroing of any
  44      * parts of the ZA storage outside of SVL.
  45      */
  46     if (imm == 0xff) {
  47         memset(env->zarray, 0, sizeof(env->zarray));
  48         return;
  49     }
  50
  51     /*
  52      * Recall that ZAnH.D[m] is spread across ZA[n+8*m],
  53      * so each row is discontiguous within ZA[].
  54      */
  55     for (i = 0; i < svl; i++) {
  56         if (imm & (1 << (i % 8))) {
  57             memset(&env->zarray[i], 0, svl);
  58         }
  59     }
  60 }
  61
  62
  63 /*
  64  * When considering the ZA storage as an array of elements of
  65  * type T, the index within that array of the Nth element of
  66  * a vertical slice of a tile can be calculated like this,
  67  * regardless of the size of type T. This is because the tiles
  68  * are interleaved, so if type T is size N bytes then row 1 of
  69  * the tile is N rows away from row 0. The division by N to
  70  * convert a byte offset into an array index and the multiplication
  71  * by N to convert from vslice-index-within-the-tile to
  72  * the index within the ZA storage cancel out.
  73  */
  74 #define tile_vslice_index(i) ((i) * sizeof(ARMVectorReg))
  75
  76 /*
  77  * When doing byte arithmetic on the ZA storage, the element
  78  * byteoff bytes away in a tile vertical slice is always this
  79  * many bytes away in the ZA storage, regardless of the
  80  * size of the tile element, assuming that byteoff is a multiple
  81  * of the element size. Again this is because of the interleaving
  82  * of the tiles. For instance if we have 1 byte per element then
  83  * each row of the ZA storage has one byte of the vslice data,
  84  * and (counting from 0) byte 8 goes in row 8 of the storage
  85  * at offset (8 * row-size-in-bytes).
  86  * If we have 8 bytes per element then each row of the ZA storage
  87  * has 8 bytes of the data, but there are 8 interleaved tiles and
  88  * so byte 8 of the data goes into row 1 of the tile,
  89  * which is again row 8 of the storage, so the offset is still
  90  * (8 * row-size-in-bytes). Similarly for other element sizes.
  91  */
  92 #define tile_vslice_offset(byteoff) ((byteoff) * sizeof(ARMVectorReg))
  93
  94
  95 /*
  96  * Move Zreg vector to ZArray column.
  97  */
  98 #define DO_MOVA_C(NAME, TYPE, H)                                        \
  99 void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc)          \
 100 {                                                                       \
 101     int i, oprsz = simd_oprsz(desc);                                    \
 102     for (i = 0; i < oprsz; ) {                                          \
 103         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
 104         do {                                                            \
 105             if (pg & 1) {                                               \
 106                 *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \
 107             }                                                           \
 108             i += sizeof(TYPE);                                          \
 109             pg >>= sizeof(TYPE);                                        \
 110         } while (i & 15);                                               \
 111     }                                                                   \
 112 }
 113
 114 DO_MOVA_C(sme_mova_cz_b, uint8_t, H1)
 115 DO_MOVA_C(sme_mova_cz_h, uint16_t, H1_2)
 116 DO_MOVA_C(sme_mova_cz_s, uint32_t, H1_4)
 117
 118 void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc)
 119 {
 120     int i, oprsz = simd_oprsz(desc) / 8;
 121     uint8_t *pg = vg;
 122     uint64_t *n = vn;
 123     uint64_t *a = za;
 124
 125     for (i = 0; i < oprsz; i++) {
 126         if (pg[H1(i)] & 1) {
 127             a[tile_vslice_index(i)] = n[i];
 128         }
 129     }
 130 }
 131
 132 void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc)
 133 {
 134     int i, oprsz = simd_oprsz(desc) / 16;
 135     uint16_t *pg = vg;
 136     Int128 *n = vn;
 137     Int128 *a = za;
 138
 139     /*
 140      * Int128 is used here simply to copy 16 bytes, and to simplify
 141      * the address arithmetic.
 142      */
 143     for (i = 0; i < oprsz; i++) {
 144         if (pg[H2(i)] & 1) {
 145             a[tile_vslice_index(i)] = n[i];
 146         }
 147     }
 148 }
 149
 150 #undef DO_MOVA_C
 151
 152 /*
 153  * Move ZArray column to Zreg vector.
 154  */
 155 #define DO_MOVA_Z(NAME, TYPE, H)                                        \
 156 void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc)          \
 157 {                                                                       \
 158     int i, oprsz = simd_oprsz(desc);                                    \
 159     for (i = 0; i < oprsz; ) {                                          \
 160         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
 161         do {                                                            \
 162             if (pg & 1) {                                               \
 163                 *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \
 164             }                                                           \
 165             i += sizeof(TYPE);                                          \
 166             pg >>= sizeof(TYPE);                                        \
 167         } while (i & 15);                                               \
 168     }                                                                   \
 169 }
 170
 171 DO_MOVA_Z(sme_mova_zc_b, uint8_t, H1)
 172 DO_MOVA_Z(sme_mova_zc_h, uint16_t, H1_2)
 173 DO_MOVA_Z(sme_mova_zc_s, uint32_t, H1_4)
 174
 175 void HELPER(sme_mova_zc_d)(void *vd, void *za, void *vg, uint32_t desc)
 176 {
 177     int i, oprsz = simd_oprsz(desc) / 8;
 178     uint8_t *pg = vg;
 179     uint64_t *d = vd;
 180     uint64_t *a = za;
 181
 182     for (i = 0; i < oprsz; i++) {
 183         if (pg[H1(i)] & 1) {
 184             d[i] = a[tile_vslice_index(i)];
 185         }
 186     }
 187 }
 188
 189 void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc)
 190 {
 191     int i, oprsz = simd_oprsz(desc) / 16;
 192     uint16_t *pg = vg;
 193     Int128 *d = vd;
 194     Int128 *a = za;
 195
 196     /*
 197      * Int128 is used here simply to copy 16 bytes, and to simplify
 198      * the address arithmetic.
 199      */
 200     for (i = 0; i < oprsz; i++, za += sizeof(ARMVectorReg)) {
 201         if (pg[H2(i)] & 1) {
 202             d[i] = a[tile_vslice_index(i)];
 203         }
 204     }
 205 }
 206
 207 #undef DO_MOVA_Z
 208
 209 /*
 210  * Clear elements in a tile slice comprising len bytes.
 211  */
 212
 213 typedef void ClearFn(void *ptr, size_t off, size_t len);
 214
 215 static void clear_horizontal(void *ptr, size_t off, size_t len)
 216 {
 217     memset(ptr + off, 0, len);
 218 }
 219
 220 static void clear_vertical_b(void *vptr, size_t off, size_t len)
 221 {
 222     for (size_t i = 0; i < len; ++i) {
 223         *(uint8_t *)(vptr + tile_vslice_offset(i + off)) = 0;
 224     }
 225 }
 226
 227 static void clear_vertical_h(void *vptr, size_t off, size_t len)
 228 {
 229     for (size_t i = 0; i < len; i += 2) {
 230         *(uint16_t *)(vptr + tile_vslice_offset(i + off)) = 0;
 231     }
 232 }
 233
 234 static void clear_vertical_s(void *vptr, size_t off, size_t len)
 235 {
 236     for (size_t i = 0; i < len; i += 4) {
 237         *(uint32_t *)(vptr + tile_vslice_offset(i + off)) = 0;
 238     }
 239 }
 240
 241 static void clear_vertical_d(void *vptr, size_t off, size_t len)
 242 {
 243     for (size_t i = 0; i < len; i += 8) {
 244         *(uint64_t *)(vptr + tile_vslice_offset(i + off)) = 0;
 245     }
 246 }
 247
 248 static void clear_vertical_q(void *vptr, size_t off, size_t len)
 249 {
 250     for (size_t i = 0; i < len; i += 16) {
 251         memset(vptr + tile_vslice_offset(i + off), 0, 16);
 252     }
 253 }
 254
 255 /*
 256  * Copy elements from an array into a tile slice comprising len bytes.
 257  */
 258
 259 typedef void CopyFn(void *dst, const void *src, size_t len);
 260
 261 static void copy_horizontal(void *dst, const void *src, size_t len)
 262 {
 263     memcpy(dst, src, len);
 264 }
 265
 266 static void copy_vertical_b(void *vdst, const void *vsrc, size_t len)
 267 {
 268     const uint8_t *src = vsrc;
 269     uint8_t *dst = vdst;
 270     size_t i;
 271
 272     for (i = 0; i < len; ++i) {
 273         dst[tile_vslice_index(i)] = src[i];
 274     }
 275 }
 276
 277 static void copy_vertical_h(void *vdst, const void *vsrc, size_t len)
 278 {
 279     const uint16_t *src = vsrc;
 280     uint16_t *dst = vdst;
 281     size_t i;
 282
 283     for (i = 0; i < len / 2; ++i) {
 284         dst[tile_vslice_index(i)] = src[i];
 285     }
 286 }
 287
 288 static void copy_vertical_s(void *vdst, const void *vsrc, size_t len)
 289 {
 290     const uint32_t *src = vsrc;
 291     uint32_t *dst = vdst;
 292     size_t i;
 293
 294     for (i = 0; i < len / 4; ++i) {
 295         dst[tile_vslice_index(i)] = src[i];
 296     }
 297 }
 298
 299 static void copy_vertical_d(void *vdst, const void *vsrc, size_t len)
 300 {
 301     const uint64_t *src = vsrc;
 302     uint64_t *dst = vdst;
 303     size_t i;
 304
 305     for (i = 0; i < len / 8; ++i) {
 306         dst[tile_vslice_index(i)] = src[i];
 307     }
 308 }
 309
 310 static void copy_vertical_q(void *vdst, const void *vsrc, size_t len)
 311 {
 312     for (size_t i = 0; i < len; i += 16) {
 313         memcpy(vdst + tile_vslice_offset(i), vsrc + i, 16);
 314     }
 315 }
 316
 317 /*
 318  * Host and TLB primitives for vertical tile slice addressing.
 319  */
 320
 321 #define DO_LD(NAME, TYPE, HOST, TLB)                                        \
 322 static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
 323 {                                                                           \
 324     TYPE val = HOST(host);                                                  \
 325     *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
 326 }                                                                           \
 327 static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
 328                         intptr_t off, target_ulong addr, uintptr_t ra)      \
 329 {                                                                           \
 330     TYPE val = TLB(env, useronly_clean_ptr(addr), ra);                      \
 331     *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
 332 }
 333
 334 #define DO_ST(NAME, TYPE, HOST, TLB)                                        \
 335 static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
 336 {                                                                           \
 337     TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
 338     HOST(host, val);                                                        \
 339 }                                                                           \
 340 static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
 341                         intptr_t off, target_ulong addr, uintptr_t ra)      \
 342 {                                                                           \
 343     TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
 344     TLB(env, useronly_clean_ptr(addr), val, ra);                            \
 345 }
 346
 347 /*
 348  * The ARMVectorReg elements are stored in host-endian 64-bit units.
 349  * For 128-bit quantities, the sequence defined by the Elem[] pseudocode
 350  * corresponds to storing the two 64-bit pieces in little-endian order.
 351  */
 352 #define DO_LDQ(HNAME, VNAME, BE, HOST, TLB)                                 \
 353 static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
 354 {                                                                           \
 355     uint64_t val0 = HOST(host), val1 = HOST(host + 8);                      \
 356     uint64_t *ptr = za + off;                                               \
 357     ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
 358 }                                                                           \
 359 static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
 360 {                                                                           \
 361     HNAME##_host(za, tile_vslice_offset(off), host);                        \
 362 }                                                                           \
 363 static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
 364                                target_ulong addr, uintptr_t ra)             \
 365 {                                                                           \
 366     uint64_t val0 = TLB(env, useronly_clean_ptr(addr), ra);                 \
 367     uint64_t val1 = TLB(env, useronly_clean_ptr(addr + 8), ra);             \
 368     uint64_t *ptr = za + off;                                               \
 369     ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
 370 }                                                                           \
 371 static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
 372                                target_ulong addr, uintptr_t ra)             \
 373 {                                                                           \
 374     HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
 375 }
 376
 377 #define DO_STQ(HNAME, VNAME, BE, HOST, TLB)                                 \
 378 static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
 379 {                                                                           \
 380     uint64_t *ptr = za + off;                                               \
 381     HOST(host, ptr[BE]);                                                    \
 382     HOST(host + 1, ptr[!BE]);                                               \
 383 }                                                                           \
 384 static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
 385 {                                                                           \
 386     HNAME##_host(za, tile_vslice_offset(off), host);                        \
 387 }                                                                           \
 388 static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
 389                                target_ulong addr, uintptr_t ra)             \
 390 {                                                                           \
 391     uint64_t *ptr = za + off;                                               \
 392     TLB(env, useronly_clean_ptr(addr), ptr[BE], ra);                        \
 393     TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra);                   \
 394 }                                                                           \
 395 static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
 396                                target_ulong addr, uintptr_t ra)             \
 397 {                                                                           \
 398     HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
 399 }
 400
 401 DO_LD(ld1b, uint8_t, ldub_p, cpu_ldub_data_ra)
 402 DO_LD(ld1h_be, uint16_t, lduw_be_p, cpu_lduw_be_data_ra)
 403 DO_LD(ld1h_le, uint16_t, lduw_le_p, cpu_lduw_le_data_ra)
 404 DO_LD(ld1s_be, uint32_t, ldl_be_p, cpu_ldl_be_data_ra)
 405 DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra)
 406 DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra)
 407 DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra)
 408
 409 DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra)
 410 DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra)
 411
 412 DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra)
 413 DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra)
 414 DO_ST(st1h_le, uint16_t, stw_le_p, cpu_stw_le_data_ra)
 415 DO_ST(st1s_be, uint32_t, stl_be_p, cpu_stl_be_data_ra)
 416 DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra)
 417 DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra)
 418 DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra)
 419
 420 DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra)
 421 DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra)
 422
 423 #undef DO_LD
 424 #undef DO_ST
 425 #undef DO_LDQ
 426 #undef DO_STQ
 427
 428 /*
 429  * Common helper for all contiguous predicated loads.
 430  */
 431
 432 static inline QEMU_ALWAYS_INLINE
 433 void sme_ld1(CPUARMState *env, void *za, uint64_t *vg,
 434              const target_ulong addr, uint32_t desc, const uintptr_t ra,
 435              const int esz, uint32_t mtedesc, bool vertical,
 436              sve_ldst1_host_fn *host_fn,
 437              sve_ldst1_tlb_fn *tlb_fn,
 438              ClearFn *clr_fn,
 439              CopyFn *cpy_fn)
 440 {
 441     const intptr_t reg_max = simd_oprsz(desc);
 442     const intptr_t esize = 1 << esz;
 443     intptr_t reg_off, reg_last;
 444     SVEContLdSt info;
 445     void *host;
 446     int flags;
 447
 448     /* Find the active elements.  */
 449     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
 450         /* The entire predicate was false; no load occurs.  */
 451         clr_fn(za, 0, reg_max);
 452         return;
 453     }
 454
 455     /* Probe the page(s).  Exit with exception for any invalid page. */
 456     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra);
 457
 458     /* Handle watchpoints for all active elements. */
 459     sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
 460                               BP_MEM_READ, ra);
 461
 462     /*
 463      * Handle mte checks for all active elements.
 464      * Since TBI must be set for MTE, !mtedesc => !mte_active.
 465      */
 466     if (mtedesc) {
 467         sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
 468                                 mtedesc, ra);
 469     }
 470
 471     flags = info.page[0].flags | info.page[1].flags;
 472     if (unlikely(flags != 0)) {
 473 #ifdef CONFIG_USER_ONLY
 474         g_assert_not_reached();
 475 #else
 476         /*
 477          * At least one page includes MMIO.
 478          * Any bus operation can fail with cpu_transaction_failed,
 479          * which for ARM will raise SyncExternal.  Perform the load
 480          * into scratch memory to preserve register state until the end.
 481          */
 482         ARMVectorReg scratch = { };
 483
 484         reg_off = info.reg_off_first[0];
 485         reg_last = info.reg_off_last[1];
 486         if (reg_last < 0) {
 487             reg_last = info.reg_off_split;
 488             if (reg_last < 0) {
 489                 reg_last = info.reg_off_last[0];
 490             }
 491         }
 492
 493         do {
 494             uint64_t pg = vg[reg_off >> 6];
 495             do {
 496                 if ((pg >> (reg_off & 63)) & 1) {
 497                     tlb_fn(env, &scratch, reg_off, addr + reg_off, ra);
 498                 }
 499                 reg_off += esize;
 500             } while (reg_off & 63);
 501         } while (reg_off <= reg_last);
 502
 503         cpy_fn(za, &scratch, reg_max);
 504         return;
 505 #endif
 506     }
 507
 508     /* The entire operation is in RAM, on valid pages. */
 509
 510     reg_off = info.reg_off_first[0];
 511     reg_last = info.reg_off_last[0];
 512     host = info.page[0].host;
 513
 514     if (!vertical) {
 515         memset(za, 0, reg_max);
 516     } else if (reg_off) {
 517         clr_fn(za, 0, reg_off);
 518     }
 519
 520     while (reg_off <= reg_last) {
 521         uint64_t pg = vg[reg_off >> 6];
 522         do {
 523             if ((pg >> (reg_off & 63)) & 1) {
 524                 host_fn(za, reg_off, host + reg_off);
 525             } else if (vertical) {
 526                 clr_fn(za, reg_off, esize);
 527             }
 528             reg_off += esize;
 529         } while (reg_off <= reg_last && (reg_off & 63));
 530     }
 531
 532     /*
 533      * Use the slow path to manage the cross-page misalignment.
 534      * But we know this is RAM and cannot trap.
 535      */
 536     reg_off = info.reg_off_split;
 537     if (unlikely(reg_off >= 0)) {
 538         tlb_fn(env, za, reg_off, addr + reg_off, ra);
 539     }
 540
 541     reg_off = info.reg_off_first[1];
 542     if (unlikely(reg_off >= 0)) {
 543         reg_last = info.reg_off_last[1];
 544         host = info.page[1].host;
 545
 546         do {
 547             uint64_t pg = vg[reg_off >> 6];
 548             do {
 549                 if ((pg >> (reg_off & 63)) & 1) {
 550                     host_fn(za, reg_off, host + reg_off);
 551                 } else if (vertical) {
 552                     clr_fn(za, reg_off, esize);
 553                 }
 554                 reg_off += esize;
 555             } while (reg_off & 63);
 556         } while (reg_off <= reg_last);
 557     }
 558 }
 559
 560 static inline QEMU_ALWAYS_INLINE
 561 void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg,
 562                  target_ulong addr, uint32_t desc, uintptr_t ra,
 563                  const int esz, bool vertical,
 564                  sve_ldst1_host_fn *host_fn,
 565                  sve_ldst1_tlb_fn *tlb_fn,
 566                  ClearFn *clr_fn,
 567                  CopyFn *cpy_fn)
 568 {
 569     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 570     int bit55 = extract64(addr, 55, 1);
 571
 572     /* Remove mtedesc from the normal sve descriptor. */
 573     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 574
 575     /* Perform gross MTE suppression early. */
 576     if (!tbi_check(desc, bit55) ||
 577         tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
 578         mtedesc = 0;
 579     }
 580
 581     sme_ld1(env, za, vg, addr, desc, ra, esz, mtedesc, vertical,
 582             host_fn, tlb_fn, clr_fn, cpy_fn);
 583 }
 584
 585 #define DO_LD(L, END, ESZ)                                                 \
 586 void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg,     \
 587                                  target_ulong addr, uint32_t desc)         \
 588 {                                                                          \
 589     sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,               \
 590             sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,           \
 591             clear_horizontal, copy_horizontal);                            \
 592 }                                                                          \
 593 void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg,     \
 594                                  target_ulong addr, uint32_t desc)         \
 595 {                                                                          \
 596     sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                \
 597             sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,             \
 598             clear_vertical_##L, copy_vertical_##L);                        \
 599 }                                                                          \
 600 void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
 601                                      target_ulong addr, uint32_t desc)     \
 602 {                                                                          \
 603     sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,              \
 604                 sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,       \
 605                 clear_horizontal, copy_horizontal);                        \
 606 }                                                                          \
 607 void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
 608                                      target_ulong addr, uint32_t desc)     \
 609 {                                                                          \
 610     sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,               \
 611                 sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,         \
 612                 clear_vertical_##L, copy_vertical_##L);                    \
 613 }
 614
 615 DO_LD(b, , MO_8)
 616 DO_LD(h, _be, MO_16)
 617 DO_LD(h, _le, MO_16)
 618 DO_LD(s, _be, MO_32)
 619 DO_LD(s, _le, MO_32)
 620 DO_LD(d, _be, MO_64)
 621 DO_LD(d, _le, MO_64)
 622 DO_LD(q, _be, MO_128)
 623 DO_LD(q, _le, MO_128)
 624
 625 #undef DO_LD
 626
 627 /*
 628  * Common helper for all contiguous predicated stores.
 629  */
 630
 631 static inline QEMU_ALWAYS_INLINE
 632 void sme_st1(CPUARMState *env, void *za, uint64_t *vg,
 633              const target_ulong addr, uint32_t desc, const uintptr_t ra,
 634              const int esz, uint32_t mtedesc, bool vertical,
 635              sve_ldst1_host_fn *host_fn,
 636              sve_ldst1_tlb_fn *tlb_fn)
 637 {
 638     const intptr_t reg_max = simd_oprsz(desc);
 639     const intptr_t esize = 1 << esz;
 640     intptr_t reg_off, reg_last;
 641     SVEContLdSt info;
 642     void *host;
 643     int flags;
 644
 645     /* Find the active elements.  */
 646     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
 647         /* The entire predicate was false; no store occurs.  */
 648         return;
 649     }
 650
 651     /* Probe the page(s).  Exit with exception for any invalid page. */
 652     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra);
 653
 654     /* Handle watchpoints for all active elements. */
 655     sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
 656                               BP_MEM_WRITE, ra);
 657
 658     /*
 659      * Handle mte checks for all active elements.
 660      * Since TBI must be set for MTE, !mtedesc => !mte_active.
 661      */
 662     if (mtedesc) {
 663         sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
 664                                 mtedesc, ra);
 665     }
 666
 667     flags = info.page[0].flags | info.page[1].flags;
 668     if (unlikely(flags != 0)) {
 669 #ifdef CONFIG_USER_ONLY
 670         g_assert_not_reached();
 671 #else
 672         /*
 673          * At least one page includes MMIO.
 674          * Any bus operation can fail with cpu_transaction_failed,
 675          * which for ARM will raise SyncExternal.  We cannot avoid
 676          * this fault and will leave with the store incomplete.
 677          */
 678         reg_off = info.reg_off_first[0];
 679         reg_last = info.reg_off_last[1];
 680         if (reg_last < 0) {
 681             reg_last = info.reg_off_split;
 682             if (reg_last < 0) {
 683                 reg_last = info.reg_off_last[0];
 684             }
 685         }
 686
 687         do {
 688             uint64_t pg = vg[reg_off >> 6];
 689             do {
 690                 if ((pg >> (reg_off & 63)) & 1) {
 691                     tlb_fn(env, za, reg_off, addr + reg_off, ra);
 692                 }
 693                 reg_off += esize;
 694             } while (reg_off & 63);
 695         } while (reg_off <= reg_last);
 696         return;
 697 #endif
 698     }
 699
 700     reg_off = info.reg_off_first[0];
 701     reg_last = info.reg_off_last[0];
 702     host = info.page[0].host;
 703
 704     while (reg_off <= reg_last) {
 705         uint64_t pg = vg[reg_off >> 6];
 706         do {
 707             if ((pg >> (reg_off & 63)) & 1) {
 708                 host_fn(za, reg_off, host + reg_off);
 709             }
 710             reg_off += 1 << esz;
 711         } while (reg_off <= reg_last && (reg_off & 63));
 712     }
 713
 714     /*
 715      * Use the slow path to manage the cross-page misalignment.
 716      * But we know this is RAM and cannot trap.
 717      */
 718     reg_off = info.reg_off_split;
 719     if (unlikely(reg_off >= 0)) {
 720         tlb_fn(env, za, reg_off, addr + reg_off, ra);
 721     }
 722
 723     reg_off = info.reg_off_first[1];
 724     if (unlikely(reg_off >= 0)) {
 725         reg_last = info.reg_off_last[1];
 726         host = info.page[1].host;
 727
 728         do {
 729             uint64_t pg = vg[reg_off >> 6];
 730             do {
 731                 if ((pg >> (reg_off & 63)) & 1) {
 732                     host_fn(za, reg_off, host + reg_off);
 733                 }
 734                 reg_off += 1 << esz;
 735             } while (reg_off & 63);
 736         } while (reg_off <= reg_last);
 737     }
 738 }
 739
 740 static inline QEMU_ALWAYS_INLINE
 741 void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr,
 742                  uint32_t desc, uintptr_t ra, int esz, bool vertical,
 743                  sve_ldst1_host_fn *host_fn,
 744                  sve_ldst1_tlb_fn *tlb_fn)
 745 {
 746     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 747     int bit55 = extract64(addr, 55, 1);
 748
 749     /* Remove mtedesc from the normal sve descriptor. */
 750     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 751
 752     /* Perform gross MTE suppression early. */
 753     if (!tbi_check(desc, bit55) ||
 754         tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
 755         mtedesc = 0;
 756     }
 757
 758     sme_st1(env, za, vg, addr, desc, ra, esz, mtedesc,
 759             vertical, host_fn, tlb_fn);
 760 }
 761
 762 #define DO_ST(L, END, ESZ)                                                 \
 763 void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg,     \
 764                                  target_ulong addr, uint32_t desc)         \
 765 {                                                                          \
 766     sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,               \
 767             sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);          \
 768 }                                                                          \
 769 void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg,     \
 770                                  target_ulong addr, uint32_t desc)         \
 771 {                                                                          \
 772     sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                \
 773             sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);            \
 774 }                                                                          \
 775 void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
 776                                      target_ulong addr, uint32_t desc)     \
 777 {                                                                          \
 778     sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,              \
 779                 sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);      \
 780 }                                                                          \
 781 void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
 782                                      target_ulong addr, uint32_t desc)     \
 783 {                                                                          \
 784     sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,               \
 785                 sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);        \
 786 }
 787
 788 DO_ST(b, , MO_8)
 789 DO_ST(h, _be, MO_16)
 790 DO_ST(h, _le, MO_16)
 791 DO_ST(s, _be, MO_32)
 792 DO_ST(s, _le, MO_32)
 793 DO_ST(d, _be, MO_64)
 794 DO_ST(d, _le, MO_64)
 795 DO_ST(q, _be, MO_128)
 796 DO_ST(q, _le, MO_128)
 797
 798 #undef DO_ST
 799
 800 void HELPER(sme_addha_s)(void *vzda, void *vzn, void *vpn,
 801                          void *vpm, uint32_t desc)
 802 {
 803     intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
 804     uint64_t *pn = vpn, *pm = vpm;
 805     uint32_t *zda = vzda, *zn = vzn;
 806
 807     for (row = 0; row < oprsz; ) {
 808         uint64_t pa = pn[row >> 4];
 809         do {
 810             if (pa & 1) {
 811                 for (col = 0; col < oprsz; ) {
 812                     uint64_t pb = pm[col >> 4];
 813                     do {
 814                         if (pb & 1) {
 815                             zda[tile_vslice_index(row) + H4(col)] += zn[H4(col)];
 816                         }
 817                         pb >>= 4;
 818                     } while (++col & 15);
 819                 }
 820             }
 821             pa >>= 4;
 822         } while (++row & 15);
 823     }
 824 }
 825
 826 void HELPER(sme_addha_d)(void *vzda, void *vzn, void *vpn,
 827                          void *vpm, uint32_t desc)
 828 {
 829     intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
 830     uint8_t *pn = vpn, *pm = vpm;
 831     uint64_t *zda = vzda, *zn = vzn;
 832
 833     for (row = 0; row < oprsz; ++row) {
 834         if (pn[H1(row)] & 1) {
 835             for (col = 0; col < oprsz; ++col) {
 836                 if (pm[H1(col)] & 1) {
 837                     zda[tile_vslice_index(row) + col] += zn[col];
 838                 }
 839             }
 840         }
 841     }
 842 }
 843
 844 void HELPER(sme_addva_s)(void *vzda, void *vzn, void *vpn,
 845                          void *vpm, uint32_t desc)
 846 {
 847     intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
 848     uint64_t *pn = vpn, *pm = vpm;
 849     uint32_t *zda = vzda, *zn = vzn;
 850
 851     for (row = 0; row < oprsz; ) {
 852         uint64_t pa = pn[row >> 4];
 853         do {
 854             if (pa & 1) {
 855                 uint32_t zn_row = zn[H4(row)];
 856                 for (col = 0; col < oprsz; ) {
 857                     uint64_t pb = pm[col >> 4];
 858                     do {
 859                         if (pb & 1) {
 860                             zda[tile_vslice_index(row) + H4(col)] += zn_row;
 861                         }
 862                         pb >>= 4;
 863                     } while (++col & 15);
 864                 }
 865             }
 866             pa >>= 4;
 867         } while (++row & 15);
 868     }
 869 }
 870
 871 void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn,
 872                          void *vpm, uint32_t desc)
 873 {
 874     intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
 875     uint8_t *pn = vpn, *pm = vpm;
 876     uint64_t *zda = vzda, *zn = vzn;
 877
 878     for (row = 0; row < oprsz; ++row) {
 879         if (pn[H1(row)] & 1) {
 880             uint64_t zn_row = zn[row];
 881             for (col = 0; col < oprsz; ++col) {
 882                 if (pm[H1(col)] & 1) {
 883                     zda[tile_vslice_index(row) + col] += zn_row;
 884                 }
 885             }
 886         }
 887     }
 888 }
 889
 890 void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn,
 891                          void *vpm, void *vst, uint32_t desc)
 892 {
 893     intptr_t row, col, oprsz = simd_maxsz(desc);
 894     uint32_t neg = simd_data(desc) << 31;
 895     uint16_t *pn = vpn, *pm = vpm;
 896     float_status fpst;
 897
 898     /*
 899      * Make a copy of float_status because this operation does not
 900      * update the cumulative fp exception status.  It also produces
 901      * default nans.
 902      */
 903     fpst = *(float_status *)vst;
 904     set_default_nan_mode(true, &fpst);
 905
 906     for (row = 0; row < oprsz; ) {
 907         uint16_t pa = pn[H2(row >> 4)];
 908         do {
 909             if (pa & 1) {
 910                 void *vza_row = vza + tile_vslice_offset(row);
 911                 uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg;
 912
 913                 for (col = 0; col < oprsz; ) {
 914                     uint16_t pb = pm[H2(col >> 4)];
 915                     do {
 916                         if (pb & 1) {
 917                             uint32_t *a = vza_row + H1_4(col);
 918                             uint32_t *m = vzm + H1_4(col);
 919                             *a = float32_muladd(n, *m, *a, 0, vst);
 920                         }
 921                         col += 4;
 922                         pb >>= 4;
 923                     } while (col & 15);
 924                 }
 925             }
 926             row += 4;
 927             pa >>= 4;
 928         } while (row & 15);
 929     }
 930 }
 931
 932 void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn,
 933                          void *vpm, void *vst, uint32_t desc)
 934 {
 935     intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
 936     uint64_t neg = (uint64_t)simd_data(desc) << 63;
 937     uint64_t *za = vza, *zn = vzn, *zm = vzm;
 938     uint8_t *pn = vpn, *pm = vpm;
 939     float_status fpst = *(float_status *)vst;
 940
 941     set_default_nan_mode(true, &fpst);
 942
 943     for (row = 0; row < oprsz; ++row) {
 944         if (pn[H1(row)] & 1) {
 945             uint64_t *za_row = &za[tile_vslice_index(row)];
 946             uint64_t n = zn[row] ^ neg;
 947
 948             for (col = 0; col < oprsz; ++col) {
 949                 if (pm[H1(col)] & 1) {
 950                     uint64_t *a = &za_row[col];
 951                     *a = float64_muladd(n, zm[col], *a, 0, &fpst);
 952                 }
 953             }
 954         }
 955     }
 956 }
 957
 958 /*
 959  * Alter PAIR as needed for controlling predicates being false,
 960  * and for NEG on an enabled row element.
 961  */
 962 static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg)
 963 {
 964     /*
 965      * The pseudocode uses a conditional negate after the conditional zero.
 966      * It is simpler here to unconditionally negate before conditional zero.
 967      */
 968     pair ^= neg;
 969     if (!(pg & 1)) {
 970         pair &= 0xffff0000u;
 971     }
 972     if (!(pg & 4)) {
 973         pair &= 0x0000ffffu;
 974     }
 975     return pair;
 976 }
 977
 978 static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2,
 979                           float_status *s_std, float_status *s_odd)
 980 {
 981     float64 e1r = float16_to_float64(e1 & 0xffff, true, s_std);
 982     float64 e1c = float16_to_float64(e1 >> 16, true, s_std);
 983     float64 e2r = float16_to_float64(e2 & 0xffff, true, s_std);
 984     float64 e2c = float16_to_float64(e2 >> 16, true, s_std);
 985     float64 t64;
 986     float32 t32;
 987
 988     /*
 989      * The ARM pseudocode function FPDot performs both multiplies
 990      * and the add with a single rounding operation.  Emulate this
 991      * by performing the first multiply in round-to-odd, then doing
 992      * the second multiply as fused multiply-add, and rounding to
 993      * float32 all in one step.
 994      */
 995     t64 = float64_mul(e1r, e2r, s_odd);
 996     t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std);
 997
 998     /* This conversion is exact, because we've already rounded. */
 999     t32 = float64_to_float32(t64, s_std);
1000
1001     /* The final accumulation step is not fused. */
1002     return float32_add(sum, t32, s_std);
1003 }
1004
1005 void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
1006                          void *vpm, void *vst, uint32_t desc)
1007 {
1008     intptr_t row, col, oprsz = simd_maxsz(desc);
1009     uint32_t neg = simd_data(desc) * 0x80008000u;
1010     uint16_t *pn = vpn, *pm = vpm;
1011     float_status fpst_odd, fpst_std;
1012
1013     /*
1014      * Make a copy of float_status because this operation does not
1015      * update the cumulative fp exception status.  It also produces
1016      * default nans.  Make a second copy with round-to-odd -- see above.
1017      */
1018     fpst_std = *(float_status *)vst;
1019     set_default_nan_mode(true, &fpst_std);
1020     fpst_odd = fpst_std;
1021     set_float_rounding_mode(float_round_to_odd, &fpst_odd);
1022
1023     for (row = 0; row < oprsz; ) {
1024         uint16_t prow = pn[H2(row >> 4)];
1025         do {
1026             void *vza_row = vza + tile_vslice_offset(row);
1027             uint32_t n = *(uint32_t *)(vzn + H1_4(row));
1028
1029             n = f16mop_adj_pair(n, prow, neg);
1030
1031             for (col = 0; col < oprsz; ) {
1032                 uint16_t pcol = pm[H2(col >> 4)];
1033                 do {
1034                     if (prow & pcol & 0b0101) {
1035                         uint32_t *a = vza_row + H1_4(col);
1036                         uint32_t m = *(uint32_t *)(vzm + H1_4(col));
1037
1038                         m = f16mop_adj_pair(m, pcol, 0);
1039                         *a = f16_dotadd(*a, n, m, &fpst_std, &fpst_odd);
1040
1041                         col += 4;
1042                         pcol >>= 4;
1043                     }
1044                 } while (col & 15);
1045             }
1046             row += 4;
1047             prow >>= 4;
1048         } while (row & 15);
1049     }
1050 }
1051
1052 void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vpn,
1053                         void *vpm, uint32_t desc)
1054 {
1055     intptr_t row, col, oprsz = simd_maxsz(desc);
1056     uint32_t neg = simd_data(desc) * 0x80008000u;
1057     uint16_t *pn = vpn, *pm = vpm;
1058
1059     for (row = 0; row < oprsz; ) {
1060         uint16_t prow = pn[H2(row >> 4)];
1061         do {
1062             void *vza_row = vza + tile_vslice_offset(row);
1063             uint32_t n = *(uint32_t *)(vzn + H1_4(row));
1064
1065             n = f16mop_adj_pair(n, prow, neg);
1066
1067             for (col = 0; col < oprsz; ) {
1068                 uint16_t pcol = pm[H2(col >> 4)];
1069                 do {
1070                     if (prow & pcol & 0b0101) {
1071                         uint32_t *a = vza_row + H1_4(col);
1072                         uint32_t m = *(uint32_t *)(vzm + H1_4(col));
1073
1074                         m = f16mop_adj_pair(m, pcol, 0);
1075                         *a = bfdotadd(*a, n, m);
1076
1077                         col += 4;
1078                         pcol >>= 4;
1079                     }
1080                 } while (col & 15);
1081             }
1082             row += 4;
1083             prow >>= 4;
1084         } while (row & 15);
1085     }
1086 }
1087
/*
 * Per-element integer outer-product kernel: takes the 64-bit row value,
 * the 64-bit column value, the current accumulator, a predicate byte
 * and the negate flag, and returns the new accumulator.
 */
typedef uint64_t IMOPFn(uint64_t, uint64_t, uint64_t, uint8_t, bool);

/*
 * Drive FN over the whole tile, 64 bits at a time.  Every (row, column)
 * position is visited; FN receives the AND of the row and column
 * predicate bytes and is responsible for applying it.
 */
static inline void do_imopa(uint64_t *za, uint64_t *zn, uint64_t *zm,
                            uint8_t *pn, uint8_t *pm,
                            uint32_t desc, IMOPFn *fn)
{
    const intptr_t nelem = simd_oprsz(desc) / 8;
    const bool negate = simd_data(desc);
    intptr_t r, c;

    for (r = 0; r < nelem; ++r) {
        const uint8_t row_pg = pn[H1(r)];
        const uint64_t row_val = zn[r];
        uint64_t *acc_row = &za[tile_vslice_index(r)];

        for (c = 0; c < nelem; ++c) {
            const uint8_t col_pg = pm[H1(c)];

            acc_row[c] = fn(row_val, zm[c], acc_row[c],
                            row_pg & col_pg, negate);
        }
    }
}
1110
/*
 * Define NAME as an IMOPFn kernel for two 32-bit accumulators packed
 * in one 64-bit word.  The eight bytes of N and M are converted to
 * NTYPE and MTYPE and multiplied pairwise; bytes 0-3 feed the low
 * accumulator and bytes 4-7 the high one.  The sums are added to, or
 * with NEG subtracted from, the corresponding halves of A.
 */
#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{                                                                           \
    uint32_t acc[2] = { 0, 0 };                                             \
    int i;                                                                  \
    /* Inactive byte elements of N are forced to 0 by the predicate. */     \
    n &= expand_pred_b(p);                                                  \
    for (i = 0; i < 8; i++) {                                               \
        acc[i / 4] += (NTYPE)(n >> (8 * i)) * (MTYPE)(m >> (8 * i));        \
    }                                                                       \
    if (neg) {                                                              \
        acc[0] = (uint32_t)a - acc[0];                                      \
        acc[1] = (uint32_t)(a >> 32) - acc[1];                              \
    } else {                                                                \
        acc[0] = (uint32_t)a + acc[0];                                      \
        acc[1] = (uint32_t)(a >> 32) + acc[1];                              \
    }                                                                       \
    return ((uint64_t)acc[1] << 32) | acc[0];                               \
}
1132
/*
 * Define NAME as an IMOPFn kernel for one 64-bit accumulator.  The four
 * 16-bit elements of N and M are converted to NTYPE and MTYPE and
 * multiplied pairwise; the sum of the four products is added to, or
 * with NEG subtracted from, A.
 *
 * The N operand is widened to int64_t before multiplying.  Without
 * that, both operands promote to int and an unsigned 16 x 16 product
 * (up to 0xffff * 0xffff) overflows int, which is undefined and in
 * practice sign-extends a wrong value into the 64-bit sum.
 */
#define DEF_IMOP_64(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{                                                                           \
    uint64_t sum = 0;                                                       \
    /* Apply P to N as a mask, making the inactive elements 0. */           \
    n &= expand_pred_h(p);                                                  \
    sum += (int64_t)(NTYPE)(n >> 0) * (MTYPE)(m >> 0);                      \
    sum += (int64_t)(NTYPE)(n >> 16) * (MTYPE)(m >> 16);                    \
    sum += (int64_t)(NTYPE)(n >> 32) * (MTYPE)(m >> 32);                    \
    sum += (int64_t)(NTYPE)(n >> 48) * (MTYPE)(m >> 48);                    \
    return neg ? a - sum : a + sum;                                         \
}
1145
/*
 * Kernels with 32-bit accumulators over 8-bit elements.  The two type
 * arguments give the signedness of the N and M elements respectively.
 */
DEF_IMOP_32(smopa_s, int8_t, int8_t)
DEF_IMOP_32(umopa_s, uint8_t, uint8_t)
DEF_IMOP_32(sumopa_s, int8_t, uint8_t)
DEF_IMOP_32(usmopa_s, uint8_t, int8_t)

/*
 * Kernels with 64-bit accumulators over 16-bit elements.  The two type
 * arguments give the signedness of the N and M elements respectively.
 */
DEF_IMOP_64(smopa_d, int16_t, int16_t)
DEF_IMOP_64(umopa_d, uint16_t, uint16_t)
DEF_IMOP_64(sumopa_d, int16_t, uint16_t)
DEF_IMOP_64(usmopa_d, uint16_t, int16_t)
1155
1156 #define DEF_IMOPH(NAME) \
1157     void HELPER(sme_##NAME)(void *vza, void *vzn, void *vzm, void *vpn,      \
1158                             void *vpm, uint32_t desc)                        \
1159     { do_imopa(vza, vzn, vzm, vpn, vpm, desc, NAME); }
1160
1161 DEF_IMOPH(smopa_s)
1162 DEF_IMOPH(umopa_s)
1163 DEF_IMOPH(sumopa_s)
1164 DEF_IMOPH(usmopa_s)
1165 DEF_IMOPH(smopa_d)
1166 DEF_IMOPH(umopa_d)
1167 DEF_IMOPH(sumopa_d)
1168 DEF_IMOPH(usmopa_d)