target/arm/sve_helper.c

   1 /*
   2  * ARM SVE Operations
   3  *
   4  * Copyright (c) 2018 Linaro, Ltd.
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19
  20 #include "qemu/osdep.h"
  21 #include "cpu.h"
  22 #include "internals.h"
  23 #include "exec/exec-all.h"
  24 #include "exec/cpu_ldst.h"
  25 #include "exec/helper-proto.h"
  26 #include "tcg/tcg-gvec-desc.h"
  27 #include "fpu/softfloat.h"
  28 #include "tcg/tcg.h"
  29
  30
  31 /* Note that vector data is stored in host-endian 64-bit chunks,
  32    so addressing units smaller than that needs a host-endian fixup.  */
  33 #ifdef HOST_WORDS_BIGENDIAN
  34 #define H1(x)   ((x) ^ 7)
  35 #define H1_2(x) ((x) ^ 6)
  36 #define H1_4(x) ((x) ^ 4)
  37 #define H2(x)   ((x) ^ 3)
  38 #define H4(x)   ((x) ^ 1)
  39 #else
  40 #define H1(x)   (x)
  41 #define H1_2(x) (x)
  42 #define H1_4(x) (x)
  43 #define H2(x)   (x)
  44 #define H4(x)   (x)
  45 #endif
  46
  47 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
  48  *
  49  * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
  50  * and bit 0 set if C is set.  Compare the definitions of these variables
  51  * within CPUARMState.
  52  */
  53
  54 /* For no G bits set, NZCV = C.  */
  55 #define PREDTEST_INIT  1
  56
  57 /* This is an iterative function, called for each Pd and Pg word
  58  * moving forward.
  59  */
  60 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
  61 {
  62     if (likely(g)) {
  63         /* Compute N from first D & G.
  64            Use bit 2 to signal first G bit seen.  */
  65         if (!(flags & 4)) {
  66             flags |= ((d & (g & -g)) != 0) << 31;
  67             flags |= 4;
  68         }
  69
  70         /* Accumulate Z from each D & G.  */
  71         flags |= ((d & g) != 0) << 1;
  72
  73         /* Compute C from last !(D & G).  Replace previous.  */
  74         flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
  75     }
  76     return flags;
  77 }
  78
  79 /* This is an iterative function, called for each Pd and Pg word
  80  * moving backward.
  81  */
  82 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
  83 {
  84     if (likely(g)) {
  85         /* Compute C from first (i.e last) !(D & G).
  86            Use bit 2 to signal first G bit seen.  */
  87         if (!(flags & 4)) {
  88             flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
  89             flags |= (d & pow2floor(g)) == 0;
  90         }
  91
  92         /* Accumulate Z from each D & G.  */
  93         flags |= ((d & g) != 0) << 1;
  94
  95         /* Compute N from last (i.e first) D & G.  Replace previous.  */
  96         flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
  97     }
  98     return flags;
  99 }
 100
 101 /* The same for a single word predicate.  */
 102 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
 103 {
 104     return iter_predtest_fwd(d, g, PREDTEST_INIT);
 105 }
 106
 107 /* The same for a multi-word predicate.  */
 108 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
 109 {
 110     uint32_t flags = PREDTEST_INIT;
 111     uint64_t *d = vd, *g = vg;
 112     uintptr_t i = 0;
 113
 114     do {
 115         flags = iter_predtest_fwd(d[i], g[i], flags);
 116     } while (++i < words);
 117
 118     return flags;
 119 }
 120
 121 /* Expand active predicate bits to bytes, for byte elements.
 122  *  for (i = 0; i < 256; ++i) {
 123  *      unsigned long m = 0;
 124  *      for (j = 0; j < 8; j++) {
 125  *          if ((i >> j) & 1) {
 126  *              m |= 0xfful << (j << 3);
 127  *          }
 128  *      }
 129  *      printf("0x%016lx,\n", m);
 130  *  }
 131  */
 132 static inline uint64_t expand_pred_b(uint8_t byte)
 133 {
 134     static const uint64_t word[256] = {
 135         0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
 136         0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
 137         0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
 138         0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
 139         0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
 140         0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
 141         0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
 142         0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
 143         0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
 144         0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
 145         0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
 146         0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
 147         0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
 148         0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
 149         0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
 150         0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
 151         0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
 152         0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
 153         0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
 154         0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
 155         0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
 156         0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
 157         0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
 158         0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
 159         0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
 160         0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
 161         0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
 162         0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
 163         0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
 164         0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
 165         0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
 166         0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
 167         0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
 168         0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
 169         0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
 170         0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
 171         0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
 172         0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
 173         0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
 174         0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
 175         0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
 176         0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
 177         0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
 178         0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
 179         0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
 180         0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
 181         0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
 182         0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
 183         0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
 184         0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
 185         0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
 186         0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
 187         0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
 188         0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
 189         0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
 190         0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
 191         0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
 192         0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
 193         0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
 194         0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
 195         0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
 196         0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
 197         0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
 198         0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
 199         0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
 200         0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
 201         0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
 202         0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
 203         0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
 204         0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
 205         0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
 206         0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
 207         0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
 208         0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
 209         0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
 210         0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
 211         0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
 212         0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
 213         0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
 214         0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
 215         0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
 216         0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
 217         0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
 218         0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
 219         0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
 220         0xffffffffffffffff,
 221     };
 222     return word[byte];
 223 }
 224
 225 /* Similarly for half-word elements.
 226  *  for (i = 0; i < 256; ++i) {
 227  *      unsigned long m = 0;
 228  *      if (i & 0xaa) {
 229  *          continue;
 230  *      }
 231  *      for (j = 0; j < 8; j += 2) {
 232  *          if ((i >> j) & 1) {
 233  *              m |= 0xfffful << (j << 3);
 234  *          }
 235  *      }
 236  *      printf("[0x%x] = 0x%016lx,\n", i, m);
 237  *  }
 238  */
 239 static inline uint64_t expand_pred_h(uint8_t byte)
 240 {
 241     static const uint64_t word[] = {
 242         [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
 243         [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
 244         [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
 245         [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
 246         [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
 247         [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
 248         [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
 249         [0x55] = 0xffffffffffffffff,
 250     };
 251     return word[byte & 0x55];
 252 }
 253
 254 /* Similarly for single word elements.  */
 255 static inline uint64_t expand_pred_s(uint8_t byte)
 256 {
 257     static const uint64_t word[] = {
 258         [0x01] = 0x00000000ffffffffull,
 259         [0x10] = 0xffffffff00000000ull,
 260         [0x11] = 0xffffffffffffffffull,
 261     };
 262     return word[byte & 0x11];
 263 }
 264
 265 /* Swap 16-bit words within a 32-bit word.  */
 266 static inline uint32_t hswap32(uint32_t h)
 267 {
 268     return rol32(h, 16);
 269 }
 270
 271 /* Swap 16-bit words within a 64-bit word.  */
 272 static inline uint64_t hswap64(uint64_t h)
 273 {
 274     uint64_t m = 0x0000ffff0000ffffull;
 275     h = rol64(h, 32);
 276     return ((h & m) << 16) | ((h >> 16) & m);
 277 }
 278
 279 /* Swap 32-bit words within a 64-bit word.  */
 280 static inline uint64_t wswap64(uint64_t h)
 281 {
 282     return rol64(h, 32);
 283 }
 284
 285 #define LOGICAL_PPPP(NAME, FUNC) \
 286 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
 287 {                                                                         \
 288     uintptr_t opr_sz = simd_oprsz(desc);                                  \
 289     uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
 290     uintptr_t i;                                                          \
 291     for (i = 0; i < opr_sz / 8; ++i) {                                    \
 292         d[i] = FUNC(n[i], m[i], g[i]);                                    \
 293     }                                                                     \
 294 }
 295
 296 #define DO_AND(N, M, G)  (((N) & (M)) & (G))
 297 #define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
 298 #define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
 299 #define DO_ORR(N, M, G)  (((N) | (M)) & (G))
 300 #define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
 301 #define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
 302 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
 303 #define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))
 304
 305 LOGICAL_PPPP(sve_and_pppp, DO_AND)
 306 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
 307 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
 308 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
 309 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
 310 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
 311 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
 312 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
 313
 314 #undef DO_AND
 315 #undef DO_BIC
 316 #undef DO_EOR
 317 #undef DO_ORR
 318 #undef DO_ORN
 319 #undef DO_NOR
 320 #undef DO_NAND
 321 #undef DO_SEL
 322 #undef LOGICAL_PPPP
 323
 324 /* Fully general three-operand expander, controlled by a predicate.
 325  * This is complicated by the host-endian storage of the register file.
 326  */
 327 /* ??? I don't expect the compiler could ever vectorize this itself.
 328  * With some tables we can convert bit masks to byte masks, and with
 329  * extra care wrt byte/word ordering we could use gcc generic vectors
 330  * and do 16 bytes at a time.
 331  */
 332 #define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
 333 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
 334 {                                                                       \
 335     intptr_t i, opr_sz = simd_oprsz(desc);                              \
 336     for (i = 0; i < opr_sz; ) {                                         \
 337         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
 338         do {                                                            \
 339             if (pg & 1) {                                               \
 340                 TYPE nn = *(TYPE *)(vn + H(i));                         \
 341                 TYPE mm = *(TYPE *)(vm + H(i));                         \
 342                 *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
 343             }                                                           \
 344             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
 345         } while (i & 15);                                               \
 346     }                                                                   \
 347 }
 348
 349 /* Similarly, specialized for 64-bit operands.  */
 350 #define DO_ZPZZ_D(NAME, TYPE, OP)                                \
 351 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
 352 {                                                               \
 353     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
 354     TYPE *d = vd, *n = vn, *m = vm;                             \
 355     uint8_t *pg = vg;                                           \
 356     for (i = 0; i < opr_sz; i += 1) {                           \
 357         if (pg[H1(i)] & 1) {                                    \
 358             TYPE nn = n[i], mm = m[i];                          \
 359             d[i] = OP(nn, mm);                                  \
 360         }                                                       \
 361     }                                                           \
 362 }
 363
 364 #define DO_AND(N, M)  (N & M)
 365 #define DO_EOR(N, M)  (N ^ M)
 366 #define DO_ORR(N, M)  (N | M)
 367 #define DO_BIC(N, M)  (N & ~M)
 368 #define DO_ADD(N, M)  (N + M)
 369 #define DO_SUB(N, M)  (N - M)
 370 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
 371 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
 372 #define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
 373 #define DO_MUL(N, M)  (N * M)
 374
 375
 376 /*
 377  * We must avoid the C undefined behaviour cases: division by
 378  * zero and signed division of INT_MIN by -1. Both of these
 379  * have architecturally defined required results for Arm.
 380  * We special case all signed divisions by -1 to avoid having
 381  * to deduce the minimum integer for the type involved.
 382  */
 383 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
 384 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
 385
 386 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
 387 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
 388 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
 389 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
 390
 391 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
 392 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
 393 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
 394 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
 395
 396 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
 397 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
 398 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
 399 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
 400
 401 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
 402 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
 403 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
 404 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
 405
 406 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
 407 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
 408 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
 409 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
 410
 411 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
 412 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
 413 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
 414 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
 415
 416 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
 417 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
 418 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
 419 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
 420
 421 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
 422 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
 423 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
 424 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
 425
 426 DO_ZPZZ(sve_smin_zpzz_b, int8_t,  H1, DO_MIN)
 427 DO_ZPZZ(sve_smin_zpzz_h, int16_t,  H1_2, DO_MIN)
 428 DO_ZPZZ(sve_smin_zpzz_s, int32_t,  H1_4, DO_MIN)
 429 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t,  DO_MIN)
 430
 431 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
 432 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
 433 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
 434 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
 435
 436 DO_ZPZZ(sve_sabd_zpzz_b, int8_t,  H1, DO_ABD)
 437 DO_ZPZZ(sve_sabd_zpzz_h, int16_t,  H1_2, DO_ABD)
 438 DO_ZPZZ(sve_sabd_zpzz_s, int32_t,  H1_4, DO_ABD)
 439 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t,  DO_ABD)
 440
 441 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
 442 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
 443 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
 444 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
 445
 446 /* Because the computation type is at least twice as large as required,
 447    these work for both signed and unsigned source types.  */
 448 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
 449 {
 450     return (n * m) >> 8;
 451 }
 452
 453 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
 454 {
 455     return (n * m) >> 16;
 456 }
 457
 458 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
 459 {
 460     return (n * m) >> 32;
 461 }
 462
 463 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
 464 {
 465     uint64_t lo, hi;
 466     muls64(&lo, &hi, n, m);
 467     return hi;
 468 }
 469
 470 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
 471 {
 472     uint64_t lo, hi;
 473     mulu64(&lo, &hi, n, m);
 474     return hi;
 475 }
 476
 477 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
 478 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
 479 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
 480 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
 481
 482 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
 483 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
 484 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
 485 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
 486
 487 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
 488 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
 489 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
 490 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
 491
 492 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
 493 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
 494
 495 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
 496 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
 497
 498 /* Note that all bits of the shift are significant
 499    and not modulo the element size.  */
 500 #define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
 501 #define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
 502 #define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)
 503
 504 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
 505 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
 506 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
 507
 508 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
 509 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
 510 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
 511
 512 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
 513 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
 514 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
 515
 516 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
 517 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
 518 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
 519
 520 #undef DO_ZPZZ
 521 #undef DO_ZPZZ_D
 522
 523 /* Three-operand expander, controlled by a predicate, in which the
 524  * third operand is "wide".  That is, for D = N op M, the same 64-bit
 525  * value of M is used with all of the narrower values of N.
 526  */
 527 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                               \
 528 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
 529 {                                                                       \
 530     intptr_t i, opr_sz = simd_oprsz(desc);                              \
 531     for (i = 0; i < opr_sz; ) {                                         \
 532         uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                     \
 533         TYPEW mm = *(TYPEW *)(vm + i);                                  \
 534         do {                                                            \
 535             if (pg & 1) {                                               \
 536                 TYPE nn = *(TYPE *)(vn + H(i));                         \
 537                 *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
 538             }                                                           \
 539             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
 540         } while (i & 7);                                                \
 541     }                                                                   \
 542 }
 543
 544 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
 545 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
 546 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
 547
 548 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
 549 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
 550 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
 551
 552 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
 553 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
 554 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
 555
 556 #undef DO_ZPZW
 557
 558 /* Fully general two-operand expander, controlled by a predicate.
 559  */
 560 #define DO_ZPZ(NAME, TYPE, H, OP)                               \
 561 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
 562 {                                                               \
 563     intptr_t i, opr_sz = simd_oprsz(desc);                      \
 564     for (i = 0; i < opr_sz; ) {                                 \
 565         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
 566         do {                                                    \
 567             if (pg & 1) {                                       \
 568                 TYPE nn = *(TYPE *)(vn + H(i));                 \
 569                 *(TYPE *)(vd + H(i)) = OP(nn);                  \
 570             }                                                   \
 571             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
 572         } while (i & 15);                                       \
 573     }                                                           \
 574 }
 575
 576 /* Similarly, specialized for 64-bit operands.  */
 577 #define DO_ZPZ_D(NAME, TYPE, OP)                                \
 578 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
 579 {                                                               \
 580     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
 581     TYPE *d = vd, *n = vn;                                      \
 582     uint8_t *pg = vg;                                           \
 583     for (i = 0; i < opr_sz; i += 1) {                           \
 584         if (pg[H1(i)] & 1) {                                    \
 585             TYPE nn = n[i];                                     \
 586             d[i] = OP(nn);                                      \
 587         }                                                       \
 588     }                                                           \
 589 }
 590
 591 #define DO_CLS_B(N)   (clrsb32(N) - 24)
 592 #define DO_CLS_H(N)   (clrsb32(N) - 16)
 593
 594 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
 595 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
 596 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
 597 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
 598
 599 #define DO_CLZ_B(N)   (clz32(N) - 24)
 600 #define DO_CLZ_H(N)   (clz32(N) - 16)
 601
 602 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
 603 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
 604 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
 605 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
 606
 607 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
 608 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
 609 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
 610 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
 611
 612 #define DO_CNOT(N)    (N == 0)
 613
 614 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
 615 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
 616 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
 617 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
 618
 619 #define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))
 620
 621 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
 622 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
 623 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
 624
 625 #define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))
 626
 627 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
 628 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
 629 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
 630
 631 #define DO_NOT(N)    (~N)
 632
 633 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
 634 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
 635 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
 636 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
 637
 638 #define DO_SXTB(N)    ((int8_t)N)
 639 #define DO_SXTH(N)    ((int16_t)N)
 640 #define DO_SXTS(N)    ((int32_t)N)
 641 #define DO_UXTB(N)    ((uint8_t)N)
 642 #define DO_UXTH(N)    ((uint16_t)N)
 643 #define DO_UXTS(N)    ((uint32_t)N)
 644
 645 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
 646 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
 647 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
 648 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
 649 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
 650 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
 651
 652 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
 653 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
 654 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
 655 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
 656 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
 657 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
 658
 659 #define DO_ABS(N)    (N < 0 ? -N : N)
 660
 661 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
 662 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
 663 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
 664 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
 665
 666 #define DO_NEG(N)    (-N)
 667
 668 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
 669 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
 670 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
 671 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
 672
 673 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
 674 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
 675 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
 676
 677 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
 678 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
 679
 680 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
 681
 682 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
 683 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
 684 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
 685 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
 686
 687 /* Three-operand expander, unpredicated, in which the third operand is "wide".
 688  */
 689 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
 690 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
 691 {                                                              \
 692     intptr_t i, opr_sz = simd_oprsz(desc);                     \
 693     for (i = 0; i < opr_sz; ) {                                \
 694         TYPEW mm = *(TYPEW *)(vm + i);                         \
 695         do {                                                   \
 696             TYPE nn = *(TYPE *)(vn + H(i));                    \
 697             *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
 698             i += sizeof(TYPE);                                 \
 699         } while (i & 7);                                       \
 700     }                                                          \
 701 }
 702
 703 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
 704 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
 705 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
 706
 707 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
 708 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
 709 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
 710
 711 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
 712 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
 713 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
 714
 715 #undef DO_ZZW
 716
 717 #undef DO_CLS_B
 718 #undef DO_CLS_H
 719 #undef DO_CLZ_B
 720 #undef DO_CLZ_H
 721 #undef DO_CNOT
 722 #undef DO_FABS
 723 #undef DO_FNEG
 724 #undef DO_ABS
 725 #undef DO_NEG
 726 #undef DO_ZPZ
 727 #undef DO_ZPZ_D
 728
 729 /* Two-operand reduction expander, controlled by a predicate.
 730  * The difference between TYPERED and TYPERET has to do with
 731  * sign-extension.  E.g. for SMAX, TYPERED must be signed,
 732  * but TYPERET must be unsigned so that e.g. a 32-bit value
 733  * is not sign-extended to the ABI uint64_t return type.
 734  */
 735 /* ??? If we were to vectorize this by hand the reduction ordering
 736  * would change.  For integer operands, this is perfectly fine.
 737  */
 738 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
 739 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
 740 {                                                          \
 741     intptr_t i, opr_sz = simd_oprsz(desc);                 \
 742     TYPERED ret = INIT;                                    \
 743     for (i = 0; i < opr_sz; ) {                            \
 744         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
 745         do {                                               \
 746             if (pg & 1) {                                  \
 747                 TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
 748                 ret = OP(ret, nn);                         \
 749             }                                              \
 750             i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
 751         } while (i & 15);                                  \
 752     }                                                      \
 753     return (TYPERET)ret;                                   \
 754 }
 755
 756 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
 757 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
 758 {                                                          \
 759     intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
 760     TYPEE *n = vn;                                         \
 761     uint8_t *pg = vg;                                      \
 762     TYPER ret = INIT;                                      \
 763     for (i = 0; i < opr_sz; i += 1) {                      \
 764         if (pg[H1(i)] & 1) {                               \
 765             TYPEE nn = n[i];                               \
 766             ret = OP(ret, nn);                             \
 767         }                                                  \
 768     }                                                      \
 769     return ret;                                            \
 770 }
 771
 772 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
 773 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
 774 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
 775 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
 776
 777 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
 778 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
 779 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
 780 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
 781
 782 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
 783 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
 784 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
 785 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
 786
 787 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
 788 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
 789 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
 790
 791 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
 792 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
 793 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
 794 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
 795
 796 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
 797 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
 798 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
 799 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
 800
 801 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
 802 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
 803 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
 804 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
 805
 806 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
 807 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
 808 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
 809 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
 810
 811 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
 812 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
 813 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
 814 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
 815
 816 #undef DO_VPZ
 817 #undef DO_VPZ_D
 818
 819 /* Two vector operand, one scalar operand, unpredicated.  */
 820 #define DO_ZZI(NAME, TYPE, OP)                                       \
 821 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
 822 {                                                                    \
 823     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);            \
 824     TYPE s = s64, *d = vd, *n = vn;                                  \
 825     for (i = 0; i < opr_sz; ++i) {                                   \
 826         d[i] = OP(n[i], s);                                          \
 827     }                                                                \
 828 }
 829
 830 #define DO_SUBR(X, Y)   (Y - X)
 831
 832 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
 833 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
 834 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
 835 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
 836
 837 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
 838 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
 839 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
 840 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
 841
 842 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
 843 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
 844 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
 845 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
 846
 847 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
 848 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
 849 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
 850 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
 851
 852 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
 853 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
 854 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
 855 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
 856
 857 #undef DO_ZZI
 858
 859 #undef DO_AND
 860 #undef DO_ORR
 861 #undef DO_EOR
 862 #undef DO_BIC
 863 #undef DO_ADD
 864 #undef DO_SUB
 865 #undef DO_MAX
 866 #undef DO_MIN
 867 #undef DO_ABD
 868 #undef DO_MUL
 869 #undef DO_DIV
 870 #undef DO_ASR
 871 #undef DO_LSR
 872 #undef DO_LSL
 873 #undef DO_SUBR
 874
 875 /* Similar to the ARM LastActiveElement pseudocode function, except the
 876    result is multiplied by the element size.  This includes the not found
 877    indication; e.g. not found for esz=3 is -8.  */
 878 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
 879 {
 880     uint64_t mask = pred_esz_masks[esz];
 881     intptr_t i = words;
 882
 883     do {
 884         uint64_t this_g = g[--i] & mask;
 885         if (this_g) {
 886             return i * 64 + (63 - clz64(this_g));
 887         }
 888     } while (i > 0);
 889     return (intptr_t)-1 << esz;
 890 }
 891
 892 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
 893 {
 894     uint32_t flags = PREDTEST_INIT;
 895     uint64_t *d = vd, *g = vg;
 896     intptr_t i = 0;
 897
 898     do {
 899         uint64_t this_d = d[i];
 900         uint64_t this_g = g[i];
 901
 902         if (this_g) {
 903             if (!(flags & 4)) {
 904                 /* Set in D the first bit of G.  */
 905                 this_d |= this_g & -this_g;
 906                 d[i] = this_d;
 907             }
 908             flags = iter_predtest_fwd(this_d, this_g, flags);
 909         }
 910     } while (++i < words);
 911
 912     return flags;
 913 }
 914
 915 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
 916 {
 917     intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
 918     intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
 919     uint32_t flags = PREDTEST_INIT;
 920     uint64_t *d = vd, *g = vg, esz_mask;
 921     intptr_t i, next;
 922
 923     next = last_active_element(vd, words, esz) + (1 << esz);
 924     esz_mask = pred_esz_masks[esz];
 925
 926     /* Similar to the pseudocode for pnext, but scaled by ESZ
 927        so that we find the correct bit.  */
 928     if (next < words * 64) {
 929         uint64_t mask = -1;
 930
 931         if (next & 63) {
 932             mask = ~((1ull << (next & 63)) - 1);
 933             next &= -64;
 934         }
 935         do {
 936             uint64_t this_g = g[next / 64] & esz_mask & mask;
 937             if (this_g != 0) {
 938                 next = (next & -64) + ctz64(this_g);
 939                 break;
 940             }
 941             next += 64;
 942             mask = -1;
 943         } while (next < words * 64);
 944     }
 945
 946     i = 0;
 947     do {
 948         uint64_t this_d = 0;
 949         if (i == next / 64) {
 950             this_d = 1ull << (next & 63);
 951         }
 952         d[i] = this_d;
 953         flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
 954     } while (++i < words);
 955
 956     return flags;
 957 }
 958
 959 /* Store zero into every active element of Zd.  We will use this for two
 960  * and three-operand predicated instructions for which logic dictates a
 961  * zero result.  In particular, logical shift by element size, which is
 962  * otherwise undefined on the host.
 963  *
 964  * For element sizes smaller than uint64_t, we use tables to expand
 965  * the N bits of the controlling predicate to a byte mask, and clear
 966  * those bytes.
 967  */
 968 void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
 969 {
 970     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
 971     uint64_t *d = vd;
 972     uint8_t *pg = vg;
 973     for (i = 0; i < opr_sz; i += 1) {
 974         d[i] &= ~expand_pred_b(pg[H1(i)]);
 975     }
 976 }
 977
 978 void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
 979 {
 980     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
 981     uint64_t *d = vd;
 982     uint8_t *pg = vg;
 983     for (i = 0; i < opr_sz; i += 1) {
 984         d[i] &= ~expand_pred_h(pg[H1(i)]);
 985     }
 986 }
 987
 988 void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
 989 {
 990     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
 991     uint64_t *d = vd;
 992     uint8_t *pg = vg;
 993     for (i = 0; i < opr_sz; i += 1) {
 994         d[i] &= ~expand_pred_s(pg[H1(i)]);
 995     }
 996 }
 997
 998 void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
 999 {
1000     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1001     uint64_t *d = vd;
1002     uint8_t *pg = vg;
1003     for (i = 0; i < opr_sz; i += 1) {
1004         if (pg[H1(i)] & 1) {
1005             d[i] = 0;
1006         }
1007     }
1008 }
1009
1010 /* Copy Zn into Zd, and store zero into inactive elements.  */
1011 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1012 {
1013     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1014     uint64_t *d = vd, *n = vn;
1015     uint8_t *pg = vg;
1016     for (i = 0; i < opr_sz; i += 1) {
1017         d[i] = n[i] & expand_pred_b(pg[H1(i)]);
1018     }
1019 }
1020
1021 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1022 {
1023     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1024     uint64_t *d = vd, *n = vn;
1025     uint8_t *pg = vg;
1026     for (i = 0; i < opr_sz; i += 1) {
1027         d[i] = n[i] & expand_pred_h(pg[H1(i)]);
1028     }
1029 }
1030
1031 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1032 {
1033     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1034     uint64_t *d = vd, *n = vn;
1035     uint8_t *pg = vg;
1036     for (i = 0; i < opr_sz; i += 1) {
1037         d[i] = n[i] & expand_pred_s(pg[H1(i)]);
1038     }
1039 }
1040
1041 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1042 {
1043     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1044     uint64_t *d = vd, *n = vn;
1045     uint8_t *pg = vg;
1046     for (i = 0; i < opr_sz; i += 1) {
1047         d[i] = n[i] & -(uint64_t)(pg[H1(i)] & 1);
1048     }
1049 }
1050
1051 /* Three-operand expander, immediate operand, controlled by a predicate.
1052  */
1053 #define DO_ZPZI(NAME, TYPE, H, OP)                              \
1054 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
1055 {                                                               \
1056     intptr_t i, opr_sz = simd_oprsz(desc);                      \
1057     TYPE imm = simd_data(desc);                                 \
1058     for (i = 0; i < opr_sz; ) {                                 \
1059         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
1060         do {                                                    \
1061             if (pg & 1) {                                       \
1062                 TYPE nn = *(TYPE *)(vn + H(i));                 \
1063                 *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
1064             }                                                   \
1065             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
1066         } while (i & 15);                                       \
1067     }                                                           \
1068 }
1069
1070 /* Similarly, specialized for 64-bit operands.  */
1071 #define DO_ZPZI_D(NAME, TYPE, OP)                               \
1072 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
1073 {                                                               \
1074     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
1075     TYPE *d = vd, *n = vn;                                      \
1076     TYPE imm = simd_data(desc);                                 \
1077     uint8_t *pg = vg;                                           \
1078     for (i = 0; i < opr_sz; i += 1) {                           \
1079         if (pg[H1(i)] & 1) {                                    \
1080             TYPE nn = n[i];                                     \
1081             d[i] = OP(nn, imm);                                 \
1082         }                                                       \
1083     }                                                           \
1084 }
1085
1086 #define DO_SHR(N, M)  (N >> M)
1087 #define DO_SHL(N, M)  (N << M)
1088
1089 /* Arithmetic shift right for division.  This rounds negative numbers
1090    toward zero as per signed division.  Therefore before shifting,
1091    when N is negative, add 2**M-1.  */
1092 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
1093
1094 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1095 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1096 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1097 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1098
1099 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1100 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1101 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1102 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1103
1104 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1105 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1106 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1107 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1108
1109 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1110 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1111 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1112 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1113
1114 #undef DO_SHR
1115 #undef DO_SHL
1116 #undef DO_ASRD
1117 #undef DO_ZPZI
1118 #undef DO_ZPZI_D
1119
1120 /* Fully general four-operand expander, controlled by a predicate.
1121  */
1122 #define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
1123 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
1124                   void *vg, uint32_t desc)                    \
1125 {                                                             \
1126     intptr_t i, opr_sz = simd_oprsz(desc);                    \
1127     for (i = 0; i < opr_sz; ) {                               \
1128         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));       \
1129         do {                                                  \
1130             if (pg & 1) {                                     \
1131                 TYPE nn = *(TYPE *)(vn + H(i));               \
1132                 TYPE mm = *(TYPE *)(vm + H(i));               \
1133                 TYPE aa = *(TYPE *)(va + H(i));               \
1134                 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);        \
1135             }                                                 \
1136             i += sizeof(TYPE), pg >>= sizeof(TYPE);           \
1137         } while (i & 15);                                     \
1138     }                                                         \
1139 }
1140
1141 /* Similarly, specialized for 64-bit operands.  */
1142 #define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
1143 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
1144                   void *vg, uint32_t desc)                    \
1145 {                                                             \
1146     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                \
1147     TYPE *d = vd, *a = va, *n = vn, *m = vm;                  \
1148     uint8_t *pg = vg;                                         \
1149     for (i = 0; i < opr_sz; i += 1) {                         \
1150         if (pg[H1(i)] & 1) {                                  \
1151             TYPE aa = a[i], nn = n[i], mm = m[i];             \
1152             d[i] = OP(aa, nn, mm);                            \
1153         }                                                     \
1154     }                                                         \
1155 }
1156
1157 #define DO_MLA(A, N, M)  (A + N * M)
1158 #define DO_MLS(A, N, M)  (A - N * M)
1159
1160 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1161 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1162
1163 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1164 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1165
1166 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1167 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1168
1169 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1170 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1171
1172 #undef DO_MLA
1173 #undef DO_MLS
1174 #undef DO_ZPZZZ
1175 #undef DO_ZPZZZ_D
1176
1177 void HELPER(sve_index_b)(void *vd, uint32_t start,
1178                          uint32_t incr, uint32_t desc)
1179 {
1180     intptr_t i, opr_sz = simd_oprsz(desc);
1181     uint8_t *d = vd;
1182     for (i = 0; i < opr_sz; i += 1) {
1183         d[H1(i)] = start + i * incr;
1184     }
1185 }
1186
1187 void HELPER(sve_index_h)(void *vd, uint32_t start,
1188                          uint32_t incr, uint32_t desc)
1189 {
1190     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1191     uint16_t *d = vd;
1192     for (i = 0; i < opr_sz; i += 1) {
1193         d[H2(i)] = start + i * incr;
1194     }
1195 }
1196
1197 void HELPER(sve_index_s)(void *vd, uint32_t start,
1198                          uint32_t incr, uint32_t desc)
1199 {
1200     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1201     uint32_t *d = vd;
1202     for (i = 0; i < opr_sz; i += 1) {
1203         d[H4(i)] = start + i * incr;
1204     }
1205 }
1206
1207 void HELPER(sve_index_d)(void *vd, uint64_t start,
1208                          uint64_t incr, uint32_t desc)
1209 {
1210     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1211     uint64_t *d = vd;
1212     for (i = 0; i < opr_sz; i += 1) {
1213         d[i] = start + i * incr;
1214     }
1215 }
1216
1217 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1218 {
1219     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1220     uint32_t sh = simd_data(desc);
1221     uint32_t *d = vd, *n = vn, *m = vm;
1222     for (i = 0; i < opr_sz; i += 1) {
1223         d[i] = n[i] + (m[i] << sh);
1224     }
1225 }
1226
1227 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1228 {
1229     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1230     uint64_t sh = simd_data(desc);
1231     uint64_t *d = vd, *n = vn, *m = vm;
1232     for (i = 0; i < opr_sz; i += 1) {
1233         d[i] = n[i] + (m[i] << sh);
1234     }
1235 }
1236
1237 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1238 {
1239     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1240     uint64_t sh = simd_data(desc);
1241     uint64_t *d = vd, *n = vn, *m = vm;
1242     for (i = 0; i < opr_sz; i += 1) {
1243         d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1244     }
1245 }
1246
1247 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1248 {
1249     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1250     uint64_t sh = simd_data(desc);
1251     uint64_t *d = vd, *n = vn, *m = vm;
1252     for (i = 0; i < opr_sz; i += 1) {
1253         d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1254     }
1255 }
1256
1257 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1258 {
1259     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
1260     static const uint16_t coeff[] = {
1261         0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1262         0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1263         0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1264         0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1265     };
1266     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1267     uint16_t *d = vd, *n = vn;
1268
1269     for (i = 0; i < opr_sz; i++) {
1270         uint16_t nn = n[i];
1271         intptr_t idx = extract32(nn, 0, 5);
1272         uint16_t exp = extract32(nn, 5, 5);
1273         d[i] = coeff[idx] | (exp << 10);
1274     }
1275 }
1276
1277 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1278 {
1279     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
1280     static const uint32_t coeff[] = {
1281         0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1282         0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1283         0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1284         0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1285         0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1286         0x1ef532, 0x20b051, 0x227043, 0x243516,
1287         0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1288         0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1289         0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1290         0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1291         0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1292         0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1293         0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1294         0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1295         0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1296         0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1297     };
1298     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1299     uint32_t *d = vd, *n = vn;
1300
1301     for (i = 0; i < opr_sz; i++) {
1302         uint32_t nn = n[i];
1303         intptr_t idx = extract32(nn, 0, 6);
1304         uint32_t exp = extract32(nn, 6, 8);
1305         d[i] = coeff[idx] | (exp << 23);
1306     }
1307 }
1308
1309 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1310 {
1311     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
1312     static const uint64_t coeff[] = {
1313         0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1314         0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1315         0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1316         0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1317         0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1318         0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1319         0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1320         0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1321         0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1322         0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1323         0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1324         0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1325         0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1326         0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1327         0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1328         0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1329         0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1330         0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1331         0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1332         0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1333         0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1334         0xFA7C1819E90D8ull,
1335     };
1336     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1337     uint64_t *d = vd, *n = vn;
1338
1339     for (i = 0; i < opr_sz; i++) {
1340         uint64_t nn = n[i];
1341         intptr_t idx = extract32(nn, 0, 6);
1342         uint64_t exp = extract32(nn, 6, 11);
1343         d[i] = coeff[idx] | (exp << 52);
1344     }
1345 }
1346
1347 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1348 {
1349     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1350     uint16_t *d = vd, *n = vn, *m = vm;
1351     for (i = 0; i < opr_sz; i += 1) {
1352         uint16_t nn = n[i];
1353         uint16_t mm = m[i];
1354         if (mm & 1) {
1355             nn = float16_one;
1356         }
1357         d[i] = nn ^ (mm & 2) << 14;
1358     }
1359 }
1360
1361 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1362 {
1363     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1364     uint32_t *d = vd, *n = vn, *m = vm;
1365     for (i = 0; i < opr_sz; i += 1) {
1366         uint32_t nn = n[i];
1367         uint32_t mm = m[i];
1368         if (mm & 1) {
1369             nn = float32_one;
1370         }
1371         d[i] = nn ^ (mm & 2) << 30;
1372     }
1373 }
1374
1375 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1376 {
1377     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1378     uint64_t *d = vd, *n = vn, *m = vm;
1379     for (i = 0; i < opr_sz; i += 1) {
1380         uint64_t nn = n[i];
1381         uint64_t mm = m[i];
1382         if (mm & 1) {
1383             nn = float64_one;
1384         }
1385         d[i] = nn ^ (mm & 2) << 62;
1386     }
1387 }
1388
1389 /*
1390  * Signed saturating addition with scalar operand.
1391  */
1392
1393 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1394 {
1395     intptr_t i, oprsz = simd_oprsz(desc);
1396
1397     for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1398         int r = *(int8_t *)(a + i) + b;
1399         if (r > INT8_MAX) {
1400             r = INT8_MAX;
1401         } else if (r < INT8_MIN) {
1402             r = INT8_MIN;
1403         }
1404         *(int8_t *)(d + i) = r;
1405     }
1406 }
1407
1408 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1409 {
1410     intptr_t i, oprsz = simd_oprsz(desc);
1411
1412     for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1413         int r = *(int16_t *)(a + i) + b;
1414         if (r > INT16_MAX) {
1415             r = INT16_MAX;
1416         } else if (r < INT16_MIN) {
1417             r = INT16_MIN;
1418         }
1419         *(int16_t *)(d + i) = r;
1420     }
1421 }
1422
1423 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1424 {
1425     intptr_t i, oprsz = simd_oprsz(desc);
1426
1427     for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1428         int64_t r = *(int32_t *)(a + i) + b;
1429         if (r > INT32_MAX) {
1430             r = INT32_MAX;
1431         } else if (r < INT32_MIN) {
1432             r = INT32_MIN;
1433         }
1434         *(int32_t *)(d + i) = r;
1435     }
1436 }
1437
1438 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1439 {
1440     intptr_t i, oprsz = simd_oprsz(desc);
1441
1442     for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1443         int64_t ai = *(int64_t *)(a + i);
1444         int64_t r = ai + b;
1445         if (((r ^ ai) & ~(ai ^ b)) < 0) {
1446             /* Signed overflow.  */
1447             r = (r < 0 ? INT64_MAX : INT64_MIN);
1448         }
1449         *(int64_t *)(d + i) = r;
1450     }
1451 }
1452
1453 /*
1454  * Unsigned saturating addition with scalar operand.
1455  */
1456
1457 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1458 {
1459     intptr_t i, oprsz = simd_oprsz(desc);
1460
1461     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1462         int r = *(uint8_t *)(a + i) + b;
1463         if (r > UINT8_MAX) {
1464             r = UINT8_MAX;
1465         } else if (r < 0) {
1466             r = 0;
1467         }
1468         *(uint8_t *)(d + i) = r;
1469     }
1470 }
1471
1472 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1473 {
1474     intptr_t i, oprsz = simd_oprsz(desc);
1475
1476     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1477         int r = *(uint16_t *)(a + i) + b;
1478         if (r > UINT16_MAX) {
1479             r = UINT16_MAX;
1480         } else if (r < 0) {
1481             r = 0;
1482         }
1483         *(uint16_t *)(d + i) = r;
1484     }
1485 }
1486
1487 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1488 {
1489     intptr_t i, oprsz = simd_oprsz(desc);
1490
1491     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1492         int64_t r = *(uint32_t *)(a + i) + b;
1493         if (r > UINT32_MAX) {
1494             r = UINT32_MAX;
1495         } else if (r < 0) {
1496             r = 0;
1497         }
1498         *(uint32_t *)(d + i) = r;
1499     }
1500 }
1501
1502 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1503 {
1504     intptr_t i, oprsz = simd_oprsz(desc);
1505
1506     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1507         uint64_t r = *(uint64_t *)(a + i) + b;
1508         if (r < b) {
1509             r = UINT64_MAX;
1510         }
1511         *(uint64_t *)(d + i) = r;
1512     }
1513 }
1514
1515 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1516 {
1517     intptr_t i, oprsz = simd_oprsz(desc);
1518
1519     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1520         uint64_t ai = *(uint64_t *)(a + i);
1521         *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1522     }
1523 }
1524
1525 /* Two operand predicated copy immediate with merge.  All valid immediates
1526  * can fit within 17 signed bits in the simd_data field.
1527  */
1528 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1529                          uint64_t mm, uint32_t desc)
1530 {
1531     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1532     uint64_t *d = vd, *n = vn;
1533     uint8_t *pg = vg;
1534
1535     mm = dup_const(MO_8, mm);
1536     for (i = 0; i < opr_sz; i += 1) {
1537         uint64_t nn = n[i];
1538         uint64_t pp = expand_pred_b(pg[H1(i)]);
1539         d[i] = (mm & pp) | (nn & ~pp);
1540     }
1541 }
1542
1543 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1544                          uint64_t mm, uint32_t desc)
1545 {
1546     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1547     uint64_t *d = vd, *n = vn;
1548     uint8_t *pg = vg;
1549
1550     mm = dup_const(MO_16, mm);
1551     for (i = 0; i < opr_sz; i += 1) {
1552         uint64_t nn = n[i];
1553         uint64_t pp = expand_pred_h(pg[H1(i)]);
1554         d[i] = (mm & pp) | (nn & ~pp);
1555     }
1556 }
1557
1558 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1559                          uint64_t mm, uint32_t desc)
1560 {
1561     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1562     uint64_t *d = vd, *n = vn;
1563     uint8_t *pg = vg;
1564
1565     mm = dup_const(MO_32, mm);
1566     for (i = 0; i < opr_sz; i += 1) {
1567         uint64_t nn = n[i];
1568         uint64_t pp = expand_pred_s(pg[H1(i)]);
1569         d[i] = (mm & pp) | (nn & ~pp);
1570     }
1571 }
1572
1573 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1574                          uint64_t mm, uint32_t desc)
1575 {
1576     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1577     uint64_t *d = vd, *n = vn;
1578     uint8_t *pg = vg;
1579
1580     for (i = 0; i < opr_sz; i += 1) {
1581         uint64_t nn = n[i];
1582         d[i] = (pg[H1(i)] & 1 ? mm : nn);
1583     }
1584 }
1585
1586 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1587 {
1588     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1589     uint64_t *d = vd;
1590     uint8_t *pg = vg;
1591
1592     val = dup_const(MO_8, val);
1593     for (i = 0; i < opr_sz; i += 1) {
1594         d[i] = val & expand_pred_b(pg[H1(i)]);
1595     }
1596 }
1597
1598 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1599 {
1600     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1601     uint64_t *d = vd;
1602     uint8_t *pg = vg;
1603
1604     val = dup_const(MO_16, val);
1605     for (i = 0; i < opr_sz; i += 1) {
1606         d[i] = val & expand_pred_h(pg[H1(i)]);
1607     }
1608 }
1609
1610 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1611 {
1612     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1613     uint64_t *d = vd;
1614     uint8_t *pg = vg;
1615
1616     val = dup_const(MO_32, val);
1617     for (i = 0; i < opr_sz; i += 1) {
1618         d[i] = val & expand_pred_s(pg[H1(i)]);
1619     }
1620 }
1621
1622 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1623 {
1624     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1625     uint64_t *d = vd;
1626     uint8_t *pg = vg;
1627
1628     for (i = 0; i < opr_sz; i += 1) {
1629         d[i] = (pg[H1(i)] & 1 ? val : 0);
1630     }
1631 }
1632
1633 /* Big-endian hosts need to frob the byte indices.  If the copy
1634  * happens to be 8-byte aligned, then no frobbing necessary.
1635  */
1636 static void swap_memmove(void *vd, void *vs, size_t n)
1637 {
1638     uintptr_t d = (uintptr_t)vd;
1639     uintptr_t s = (uintptr_t)vs;
1640     uintptr_t o = (d | s | n) & 7;
1641     size_t i;
1642
1643 #ifndef HOST_WORDS_BIGENDIAN
1644     o = 0;
1645 #endif
1646     switch (o) {
1647     case 0:
1648         memmove(vd, vs, n);
1649         break;
1650
1651     case 4:
1652         if (d < s || d >= s + n) {
1653             for (i = 0; i < n; i += 4) {
1654                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1655             }
1656         } else {
1657             for (i = n; i > 0; ) {
1658                 i -= 4;
1659                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1660             }
1661         }
1662         break;
1663
1664     case 2:
1665     case 6:
1666         if (d < s || d >= s + n) {
1667             for (i = 0; i < n; i += 2) {
1668                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1669             }
1670         } else {
1671             for (i = n; i > 0; ) {
1672                 i -= 2;
1673                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1674             }
1675         }
1676         break;
1677
1678     default:
1679         if (d < s || d >= s + n) {
1680             for (i = 0; i < n; i++) {
1681                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1682             }
1683         } else {
1684             for (i = n; i > 0; ) {
1685                 i -= 1;
1686                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1687             }
1688         }
1689         break;
1690     }
1691 }
1692
1693 /* Similarly for memset of 0.  */
1694 static void swap_memzero(void *vd, size_t n)
1695 {
1696     uintptr_t d = (uintptr_t)vd;
1697     uintptr_t o = (d | n) & 7;
1698     size_t i;
1699
1700     /* Usually, the first bit of a predicate is set, so N is 0.  */
1701     if (likely(n == 0)) {
1702         return;
1703     }
1704
1705 #ifndef HOST_WORDS_BIGENDIAN
1706     o = 0;
1707 #endif
1708     switch (o) {
1709     case 0:
1710         memset(vd, 0, n);
1711         break;
1712
1713     case 4:
1714         for (i = 0; i < n; i += 4) {
1715             *(uint32_t *)H1_4(d + i) = 0;
1716         }
1717         break;
1718
1719     case 2:
1720     case 6:
1721         for (i = 0; i < n; i += 2) {
1722             *(uint16_t *)H1_2(d + i) = 0;
1723         }
1724         break;
1725
1726     default:
1727         for (i = 0; i < n; i++) {
1728             *(uint8_t *)H1(d + i) = 0;
1729         }
1730         break;
1731     }
1732 }
1733
1734 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1735 {
1736     intptr_t opr_sz = simd_oprsz(desc);
1737     size_t n_ofs = simd_data(desc);
1738     size_t n_siz = opr_sz - n_ofs;
1739
1740     if (vd != vm) {
1741         swap_memmove(vd, vn + n_ofs, n_siz);
1742         swap_memmove(vd + n_siz, vm, n_ofs);
1743     } else if (vd != vn) {
1744         swap_memmove(vd + n_siz, vd, n_ofs);
1745         swap_memmove(vd, vn + n_ofs, n_siz);
1746     } else {
1747         /* vd == vn == vm.  Need temp space.  */
1748         ARMVectorReg tmp;
1749         swap_memmove(&tmp, vm, n_ofs);
1750         swap_memmove(vd, vd + n_ofs, n_siz);
1751         memcpy(vd + n_siz, &tmp, n_ofs);
1752     }
1753 }
1754
1755 #define DO_INSR(NAME, TYPE, H) \
1756 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1757 {                                                                  \
1758     intptr_t opr_sz = simd_oprsz(desc);                            \
1759     swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
1760     *(TYPE *)(vd + H(0)) = val;                                    \
1761 }
1762
1763 DO_INSR(sve_insr_b, uint8_t, H1)
1764 DO_INSR(sve_insr_h, uint16_t, H1_2)
1765 DO_INSR(sve_insr_s, uint32_t, H1_4)
1766 DO_INSR(sve_insr_d, uint64_t, )
1767
1768 #undef DO_INSR
1769
1770 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1771 {
1772     intptr_t i, j, opr_sz = simd_oprsz(desc);
1773     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1774         uint64_t f = *(uint64_t *)(vn + i);
1775         uint64_t b = *(uint64_t *)(vn + j);
1776         *(uint64_t *)(vd + i) = bswap64(b);
1777         *(uint64_t *)(vd + j) = bswap64(f);
1778     }
1779 }
1780
1781 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1782 {
1783     intptr_t i, j, opr_sz = simd_oprsz(desc);
1784     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1785         uint64_t f = *(uint64_t *)(vn + i);
1786         uint64_t b = *(uint64_t *)(vn + j);
1787         *(uint64_t *)(vd + i) = hswap64(b);
1788         *(uint64_t *)(vd + j) = hswap64(f);
1789     }
1790 }
1791
1792 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1793 {
1794     intptr_t i, j, opr_sz = simd_oprsz(desc);
1795     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1796         uint64_t f = *(uint64_t *)(vn + i);
1797         uint64_t b = *(uint64_t *)(vn + j);
1798         *(uint64_t *)(vd + i) = rol64(b, 32);
1799         *(uint64_t *)(vd + j) = rol64(f, 32);
1800     }
1801 }
1802
1803 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1804 {
1805     intptr_t i, j, opr_sz = simd_oprsz(desc);
1806     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1807         uint64_t f = *(uint64_t *)(vn + i);
1808         uint64_t b = *(uint64_t *)(vn + j);
1809         *(uint64_t *)(vd + i) = b;
1810         *(uint64_t *)(vd + j) = f;
1811     }
1812 }
1813
1814 #define DO_TBL(NAME, TYPE, H) \
1815 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1816 {                                                              \
1817     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1818     uintptr_t elem = opr_sz / sizeof(TYPE);                    \
1819     TYPE *d = vd, *n = vn, *m = vm;                            \
1820     ARMVectorReg tmp;                                          \
1821     if (unlikely(vd == vn)) {                                  \
1822         n = memcpy(&tmp, vn, opr_sz);                          \
1823     }                                                          \
1824     for (i = 0; i < elem; i++) {                               \
1825         TYPE j = m[H(i)];                                      \
1826         d[H(i)] = j < elem ? n[H(j)] : 0;                      \
1827     }                                                          \
1828 }
1829
1830 DO_TBL(sve_tbl_b, uint8_t, H1)
1831 DO_TBL(sve_tbl_h, uint16_t, H2)
1832 DO_TBL(sve_tbl_s, uint32_t, H4)
1833 DO_TBL(sve_tbl_d, uint64_t, )
1834
1835 #undef TBL
1836
1837 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1838 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
1839 {                                                              \
1840     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1841     TYPED *d = vd;                                             \
1842     TYPES *n = vn;                                             \
1843     ARMVectorReg tmp;                                          \
1844     if (unlikely(vn - vd < opr_sz)) {                          \
1845         n = memcpy(&tmp, n, opr_sz / 2);                       \
1846     }                                                          \
1847     for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
1848         d[HD(i)] = n[HS(i)];                                   \
1849     }                                                          \
1850 }
1851
1852 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1853 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1854 DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1855
1856 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1857 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1858 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1859
1860 #undef DO_UNPK
1861
1862 /* Mask of bits included in the even numbered predicates of width esz.
1863  * We also use this for expand_bits/compress_bits, and so extend the
1864  * same pattern out to 16-bit units.
1865  */
1866 static const uint64_t even_bit_esz_masks[5] = {
1867     0x5555555555555555ull,
1868     0x3333333333333333ull,
1869     0x0f0f0f0f0f0f0f0full,
1870     0x00ff00ff00ff00ffull,
1871     0x0000ffff0000ffffull,
1872 };
1873
1874 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1875  * For N==0, this corresponds to the operation that in qemu/bitops.h
1876  * we call half_shuffle64; this algorithm is from Hacker's Delight,
1877  * section 7-2 Shuffling Bits.
1878  */
1879 static uint64_t expand_bits(uint64_t x, int n)
1880 {
1881     int i;
1882
1883     x &= 0xffffffffu;
1884     for (i = 4; i >= n; i--) {
1885         int sh = 1 << i;
1886         x = ((x << sh) | x) & even_bit_esz_masks[i];
1887     }
1888     return x;
1889 }
1890
1891 /* Compress units of 2**(N+1) bits to units of 2**N bits.
1892  * For N==0, this corresponds to the operation that in qemu/bitops.h
1893  * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1894  * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1895  */
1896 static uint64_t compress_bits(uint64_t x, int n)
1897 {
1898     int i;
1899
1900     for (i = n; i <= 4; i++) {
1901         int sh = 1 << i;
1902         x &= even_bit_esz_masks[i];
1903         x = (x >> sh) | x;
1904     }
1905     return x & 0xffffffffu;
1906 }
1907
1908 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1909 {
1910     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1911     int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1912     intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1913     uint64_t *d = vd;
1914     intptr_t i;
1915
1916     if (oprsz <= 8) {
1917         uint64_t nn = *(uint64_t *)vn;
1918         uint64_t mm = *(uint64_t *)vm;
1919         int half = 4 * oprsz;
1920
1921         nn = extract64(nn, high * half, half);
1922         mm = extract64(mm, high * half, half);
1923         nn = expand_bits(nn, esz);
1924         mm = expand_bits(mm, esz);
1925         d[0] = nn + (mm << (1 << esz));
1926     } else {
1927         ARMPredicateReg tmp_n, tmp_m;
1928
1929         /* We produce output faster than we consume input.
1930            Therefore we must be mindful of possible overlap.  */
1931         if ((vn - vd) < (uintptr_t)oprsz) {
1932             vn = memcpy(&tmp_n, vn, oprsz);
1933         }
1934         if ((vm - vd) < (uintptr_t)oprsz) {
1935             vm = memcpy(&tmp_m, vm, oprsz);
1936         }
1937         if (high) {
1938             high = oprsz >> 1;
1939         }
1940
1941         if ((high & 3) == 0) {
1942             uint32_t *n = vn, *m = vm;
1943             high >>= 2;
1944
1945             for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1946                 uint64_t nn = n[H4(high + i)];
1947                 uint64_t mm = m[H4(high + i)];
1948
1949                 nn = expand_bits(nn, esz);
1950                 mm = expand_bits(mm, esz);
1951                 d[i] = nn + (mm << (1 << esz));
1952             }
1953         } else {
1954             uint8_t *n = vn, *m = vm;
1955             uint16_t *d16 = vd;
1956
1957             for (i = 0; i < oprsz / 2; i++) {
1958                 uint16_t nn = n[H1(high + i)];
1959                 uint16_t mm = m[H1(high + i)];
1960
1961                 nn = expand_bits(nn, esz);
1962                 mm = expand_bits(mm, esz);
1963                 d16[H2(i)] = nn + (mm << (1 << esz));
1964             }
1965         }
1966     }
1967 }
1968
1969 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1970 {
1971     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1972     int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1973     int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1974     uint64_t *d = vd, *n = vn, *m = vm;
1975     uint64_t l, h;
1976     intptr_t i;
1977
1978     if (oprsz <= 8) {
1979         l = compress_bits(n[0] >> odd, esz);
1980         h = compress_bits(m[0] >> odd, esz);
1981         d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1982     } else {
1983         ARMPredicateReg tmp_m;
1984         intptr_t oprsz_16 = oprsz / 16;
1985
1986         if ((vm - vd) < (uintptr_t)oprsz) {
1987             m = memcpy(&tmp_m, vm, oprsz);
1988         }
1989
1990         for (i = 0; i < oprsz_16; i++) {
1991             l = n[2 * i + 0];
1992             h = n[2 * i + 1];
1993             l = compress_bits(l >> odd, esz);
1994             h = compress_bits(h >> odd, esz);
1995             d[i] = l + (h << 32);
1996         }
1997
1998         /* For VL which is not a power of 2, the results from M do not
1999            align nicely with the uint64_t for D.  Put the aligned results
2000            from M into TMP_M and then copy it into place afterward.  */
2001         if (oprsz & 15) {
2002             d[i] = compress_bits(n[2 * i] >> odd, esz);
2003
2004             for (i = 0; i < oprsz_16; i++) {
2005                 l = m[2 * i + 0];
2006                 h = m[2 * i + 1];
2007                 l = compress_bits(l >> odd, esz);
2008                 h = compress_bits(h >> odd, esz);
2009                 tmp_m.p[i] = l + (h << 32);
2010             }
2011             tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
2012
2013             swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
2014         } else {
2015             for (i = 0; i < oprsz_16; i++) {
2016                 l = m[2 * i + 0];
2017                 h = m[2 * i + 1];
2018                 l = compress_bits(l >> odd, esz);
2019                 h = compress_bits(h >> odd, esz);
2020                 d[oprsz_16 + i] = l + (h << 32);
2021             }
2022         }
2023     }
2024 }
2025
2026 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2027 {
2028     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2029     uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2030     bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2031     uint64_t *d = vd, *n = vn, *m = vm;
2032     uint64_t mask;
2033     int shr, shl;
2034     intptr_t i;
2035
2036     shl = 1 << esz;
2037     shr = 0;
2038     mask = even_bit_esz_masks[esz];
2039     if (odd) {
2040         mask <<= shl;
2041         shr = shl;
2042         shl = 0;
2043     }
2044
2045     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2046         uint64_t nn = (n[i] & mask) >> shr;
2047         uint64_t mm = (m[i] & mask) << shl;
2048         d[i] = nn + mm;
2049     }
2050 }
2051
2052 /* Reverse units of 2**N bits.  */
2053 static uint64_t reverse_bits_64(uint64_t x, int n)
2054 {
2055     int i, sh;
2056
2057     x = bswap64(x);
2058     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2059         uint64_t mask = even_bit_esz_masks[i];
2060         x = ((x & mask) << sh) | ((x >> sh) & mask);
2061     }
2062     return x;
2063 }
2064
2065 static uint8_t reverse_bits_8(uint8_t x, int n)
2066 {
2067     static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2068     int i, sh;
2069
2070     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2071         x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2072     }
2073     return x;
2074 }
2075
2076 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2077 {
2078     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2079     int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2080     intptr_t i, oprsz_2 = oprsz / 2;
2081
2082     if (oprsz <= 8) {
2083         uint64_t l = *(uint64_t *)vn;
2084         l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2085         *(uint64_t *)vd = l;
2086     } else if ((oprsz & 15) == 0) {
2087         for (i = 0; i < oprsz_2; i += 8) {
2088             intptr_t ih = oprsz - 8 - i;
2089             uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2090             uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2091             *(uint64_t *)(vd + i) = h;
2092             *(uint64_t *)(vd + ih) = l;
2093         }
2094     } else {
2095         for (i = 0; i < oprsz_2; i += 1) {
2096             intptr_t il = H1(i);
2097             intptr_t ih = H1(oprsz - 1 - i);
2098             uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2099             uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2100             *(uint8_t *)(vd + il) = h;
2101             *(uint8_t *)(vd + ih) = l;
2102         }
2103     }
2104 }
2105
2106 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2107 {
2108     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2109     intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2110     uint64_t *d = vd;
2111     intptr_t i;
2112
2113     if (oprsz <= 8) {
2114         uint64_t nn = *(uint64_t *)vn;
2115         int half = 4 * oprsz;
2116
2117         nn = extract64(nn, high * half, half);
2118         nn = expand_bits(nn, 0);
2119         d[0] = nn;
2120     } else {
2121         ARMPredicateReg tmp_n;
2122
2123         /* We produce output faster than we consume input.
2124            Therefore we must be mindful of possible overlap.  */
2125         if ((vn - vd) < (uintptr_t)oprsz) {
2126             vn = memcpy(&tmp_n, vn, oprsz);
2127         }
2128         if (high) {
2129             high = oprsz >> 1;
2130         }
2131
2132         if ((high & 3) == 0) {
2133             uint32_t *n = vn;
2134             high >>= 2;
2135
2136             for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2137                 uint64_t nn = n[H4(high + i)];
2138                 d[i] = expand_bits(nn, 0);
2139             }
2140         } else {
2141             uint16_t *d16 = vd;
2142             uint8_t *n = vn;
2143
2144             for (i = 0; i < oprsz / 2; i++) {
2145                 uint16_t nn = n[H1(high + i)];
2146                 d16[H2(i)] = expand_bits(nn, 0);
2147             }
2148         }
2149     }
2150 }
2151
2152 #define DO_ZIP(NAME, TYPE, H) \
2153 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
2154 {                                                                    \
2155     intptr_t oprsz = simd_oprsz(desc);                               \
2156     intptr_t i, oprsz_2 = oprsz / 2;                                 \
2157     ARMVectorReg tmp_n, tmp_m;                                       \
2158     /* We produce output faster than we consume input.               \
2159        Therefore we must be mindful of possible overlap.  */         \
2160     if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
2161         vn = memcpy(&tmp_n, vn, oprsz_2);                            \
2162     }                                                                \
2163     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
2164         vm = memcpy(&tmp_m, vm, oprsz_2);                            \
2165     }                                                                \
2166     for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
2167         *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i));         \
2168         *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2169     }                                                                \
2170 }
2171
2172 DO_ZIP(sve_zip_b, uint8_t, H1)
2173 DO_ZIP(sve_zip_h, uint16_t, H1_2)
2174 DO_ZIP(sve_zip_s, uint32_t, H1_4)
2175 DO_ZIP(sve_zip_d, uint64_t, )
2176
2177 #define DO_UZP(NAME, TYPE, H) \
2178 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
2179 {                                                                      \
2180     intptr_t oprsz = simd_oprsz(desc);                                 \
2181     intptr_t oprsz_2 = oprsz / 2;                                      \
2182     intptr_t odd_ofs = simd_data(desc);                                \
2183     intptr_t i;                                                        \
2184     ARMVectorReg tmp_m;                                                \
2185     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
2186         vm = memcpy(&tmp_m, vm, oprsz);                                \
2187     }                                                                  \
2188     for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                      \
2189         *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs));     \
2190     }                                                                  \
2191     for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                      \
2192         *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2193     }                                                                  \
2194 }
2195
2196 DO_UZP(sve_uzp_b, uint8_t, H1)
2197 DO_UZP(sve_uzp_h, uint16_t, H1_2)
2198 DO_UZP(sve_uzp_s, uint32_t, H1_4)
2199 DO_UZP(sve_uzp_d, uint64_t, )
2200
2201 #define DO_TRN(NAME, TYPE, H) \
2202 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
2203 {                                                                      \
2204     intptr_t oprsz = simd_oprsz(desc);                                 \
2205     intptr_t odd_ofs = simd_data(desc);                                \
2206     intptr_t i;                                                        \
2207     for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
2208         TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
2209         TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
2210         *(TYPE *)(vd + H(i + 0)) = ae;                                 \
2211         *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
2212     }                                                                  \
2213 }
2214
2215 DO_TRN(sve_trn_b, uint8_t, H1)
2216 DO_TRN(sve_trn_h, uint16_t, H1_2)
2217 DO_TRN(sve_trn_s, uint32_t, H1_4)
2218 DO_TRN(sve_trn_d, uint64_t, )
2219
2220 #undef DO_ZIP
2221 #undef DO_UZP
2222 #undef DO_TRN
2223
2224 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2225 {
2226     intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2227     uint32_t *d = vd, *n = vn;
2228     uint8_t *pg = vg;
2229
2230     for (i = j = 0; i < opr_sz; i++) {
2231         if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2232             d[H4(j)] = n[H4(i)];
2233             j++;
2234         }
2235     }
2236     for (; j < opr_sz; j++) {
2237         d[H4(j)] = 0;
2238     }
2239 }
2240
2241 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2242 {
2243     intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2244     uint64_t *d = vd, *n = vn;
2245     uint8_t *pg = vg;
2246
2247     for (i = j = 0; i < opr_sz; i++) {
2248         if (pg[H1(i)] & 1) {
2249             d[j] = n[i];
2250             j++;
2251         }
2252     }
2253     for (; j < opr_sz; j++) {
2254         d[j] = 0;
2255     }
2256 }
2257
2258 /* Similar to the ARM LastActiveElement pseudocode function, except the
2259  * result is multiplied by the element size.  This includes the not found
2260  * indication; e.g. not found for esz=3 is -8.
2261  */
2262 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2263 {
2264     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2265     intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2266
2267     return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
2268 }
2269
2270 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2271 {
2272     intptr_t opr_sz = simd_oprsz(desc) / 8;
2273     int esz = simd_data(desc);
2274     uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2275     intptr_t i, first_i, last_i;
2276     ARMVectorReg tmp;
2277
2278     first_i = last_i = 0;
2279     first_g = last_g = 0;
2280
2281     /* Find the extent of the active elements within VG.  */
2282     for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2283         pg = *(uint64_t *)(vg + i) & mask;
2284         if (pg) {
2285             if (last_g == 0) {
2286                 last_g = pg;
2287                 last_i = i;
2288             }
2289             first_g = pg;
2290             first_i = i;
2291         }
2292     }
2293
2294     len = 0;
2295     if (first_g != 0) {
2296         first_i = first_i * 8 + ctz64(first_g);
2297         last_i = last_i * 8 + 63 - clz64(last_g);
2298         len = last_i - first_i + (1 << esz);
2299         if (vd == vm) {
2300             vm = memcpy(&tmp, vm, opr_sz * 8);
2301         }
2302         swap_memmove(vd, vn + first_i, len);
2303     }
2304     swap_memmove(vd + len, vm, opr_sz * 8 - len);
2305 }
2306
2307 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2308                             void *vg, uint32_t desc)
2309 {
2310     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2311     uint64_t *d = vd, *n = vn, *m = vm;
2312     uint8_t *pg = vg;
2313
2314     for (i = 0; i < opr_sz; i += 1) {
2315         uint64_t nn = n[i], mm = m[i];
2316         uint64_t pp = expand_pred_b(pg[H1(i)]);
2317         d[i] = (nn & pp) | (mm & ~pp);
2318     }
2319 }
2320
2321 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2322                             void *vg, uint32_t desc)
2323 {
2324     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2325     uint64_t *d = vd, *n = vn, *m = vm;
2326     uint8_t *pg = vg;
2327
2328     for (i = 0; i < opr_sz; i += 1) {
2329         uint64_t nn = n[i], mm = m[i];
2330         uint64_t pp = expand_pred_h(pg[H1(i)]);
2331         d[i] = (nn & pp) | (mm & ~pp);
2332     }
2333 }
2334
2335 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2336                             void *vg, uint32_t desc)
2337 {
2338     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2339     uint64_t *d = vd, *n = vn, *m = vm;
2340     uint8_t *pg = vg;
2341
2342     for (i = 0; i < opr_sz; i += 1) {
2343         uint64_t nn = n[i], mm = m[i];
2344         uint64_t pp = expand_pred_s(pg[H1(i)]);
2345         d[i] = (nn & pp) | (mm & ~pp);
2346     }
2347 }
2348
2349 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2350                             void *vg, uint32_t desc)
2351 {
2352     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2353     uint64_t *d = vd, *n = vn, *m = vm;
2354     uint8_t *pg = vg;
2355
2356     for (i = 0; i < opr_sz; i += 1) {
2357         uint64_t nn = n[i], mm = m[i];
2358         d[i] = (pg[H1(i)] & 1 ? nn : mm);
2359     }
2360 }
2361
2362 /* Two operand comparison controlled by a predicate.
2363  * ??? It is very tempting to want to be able to expand this inline
2364  * with x86 instructions, e.g.
2365  *
2366  *    vcmpeqw    zm, zn, %ymm0
2367  *    vpmovmskb  %ymm0, %eax
2368  *    and        $0x5555, %eax
2369  *    and        pg, %eax
2370  *
2371  * or even aarch64, e.g.
2372  *
2373  *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2374  *    cmeq       v0.8h, zn, zm
2375  *    and        v0.8h, v0.8h, mask
2376  *    addv       h0, v0.8h
2377  *    and        v0.8b, pg
2378  *
2379  * However, coming up with an abstraction that allows vector inputs and
2380  * a scalar output, and also handles the byte-ordering of sub-uint64_t
2381  * scalar outputs, is tricky.
2382  */
2383 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
2384 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2385 {                                                                            \
2386     intptr_t opr_sz = simd_oprsz(desc);                                      \
2387     uint32_t flags = PREDTEST_INIT;                                          \
2388     intptr_t i = opr_sz;                                                     \
2389     do {                                                                     \
2390         uint64_t out = 0, pg;                                                \
2391         do {                                                                 \
2392             i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
2393             TYPE nn = *(TYPE *)(vn + H(i));                                  \
2394             TYPE mm = *(TYPE *)(vm + H(i));                                  \
2395             out |= nn OP mm;                                                 \
2396         } while (i & 63);                                                    \
2397         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
2398         out &= pg;                                                           \
2399         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
2400         flags = iter_predtest_bwd(out, pg, flags);                           \
2401     } while (i > 0);                                                         \
2402     return flags;                                                            \
2403 }
2404
2405 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2406     DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
2407 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2408     DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2409 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2410     DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2411 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2412     DO_CMP_PPZZ(NAME, TYPE, OP,     , 0x0101010101010101ull)
2413
2414 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
2415 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2416 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2417 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2418
2419 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
2420 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2421 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2422 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2423
2424 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
2425 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2426 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2427 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2428
2429 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
2430 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2431 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2432 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2433
2434 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
2435 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2436 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2437 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2438
2439 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
2440 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2441 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2442 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2443
2444 #undef DO_CMP_PPZZ_B
2445 #undef DO_CMP_PPZZ_H
2446 #undef DO_CMP_PPZZ_S
2447 #undef DO_CMP_PPZZ_D
2448 #undef DO_CMP_PPZZ
2449
2450 /* Similar, but the second source is "wide".  */
2451 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                     \
2452 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2453 {                                                                            \
2454     intptr_t opr_sz = simd_oprsz(desc);                                      \
2455     uint32_t flags = PREDTEST_INIT;                                          \
2456     intptr_t i = opr_sz;                                                     \
2457     do {                                                                     \
2458         uint64_t out = 0, pg;                                                \
2459         do {                                                                 \
2460             TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
2461             do {                                                             \
2462                 i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
2463                 TYPE nn = *(TYPE *)(vn + H(i));                              \
2464                 out |= nn OP mm;                                             \
2465             } while (i & 7);                                                 \
2466         } while (i & 63);                                                    \
2467         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
2468         out &= pg;                                                           \
2469         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
2470         flags = iter_predtest_bwd(out, pg, flags);                           \
2471     } while (i > 0);                                                         \
2472     return flags;                                                            \
2473 }
2474
2475 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2476     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
2477 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2478     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2479 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2480     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2481
2482 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t,  uint64_t, ==)
2483 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
2484 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
2485
2486 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t,  uint64_t, !=)
2487 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
2488 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
2489
2490 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,   int64_t, >)
2491 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t,  int64_t, >)
2492 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t,  int64_t, >)
2493
2494 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,   int64_t, >=)
2495 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t,  int64_t, >=)
2496 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t,  int64_t, >=)
2497
2498 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
2499 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2500 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2501
2502 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
2503 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2504 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2505
2506 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,   int64_t, <)
2507 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t,  int64_t, <)
2508 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t,  int64_t, <)
2509
2510 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,   int64_t, <=)
2511 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t,  int64_t, <=)
2512 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t,  int64_t, <=)
2513
2514 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
2515 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2516 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2517
2518 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
2519 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2520 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2521
2522 #undef DO_CMP_PPZW_B
2523 #undef DO_CMP_PPZW_H
2524 #undef DO_CMP_PPZW_S
2525 #undef DO_CMP_PPZW
2526
2527 /* Similar, but the second source is immediate.  */
2528 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
2529 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
2530 {                                                                    \
2531     intptr_t opr_sz = simd_oprsz(desc);                              \
2532     uint32_t flags = PREDTEST_INIT;                                  \
2533     TYPE mm = simd_data(desc);                                       \
2534     intptr_t i = opr_sz;                                             \
2535     do {                                                             \
2536         uint64_t out = 0, pg;                                        \
2537         do {                                                         \
2538             i -= sizeof(TYPE), out <<= sizeof(TYPE);                 \
2539             TYPE nn = *(TYPE *)(vn + H(i));                          \
2540             out |= nn OP mm;                                         \
2541         } while (i & 63);                                            \
2542         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                    \
2543         out &= pg;                                                   \
2544         *(uint64_t *)(vd + (i >> 3)) = out;                          \
2545         flags = iter_predtest_bwd(out, pg, flags);                   \
2546     } while (i > 0);                                                 \
2547     return flags;                                                    \
2548 }
2549
2550 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2551     DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
2552 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2553     DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2554 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2555     DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2556 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2557     DO_CMP_PPZI(NAME, TYPE, OP,     , 0x0101010101010101ull)
2558
2559 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
2560 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
2561 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
2562 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
2563
2564 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
2565 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
2566 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
2567 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
2568
2569 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
2570 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
2571 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
2572 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
2573
2574 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
2575 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
2576 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
2577 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
2578
2579 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
2580 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
2581 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
2582 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
2583
2584 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
2585 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
2586 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
2587 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
2588
2589 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
2590 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
2591 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
2592 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
2593
2594 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
2595 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
2596 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
2597 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
2598
2599 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
2600 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
2601 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
2602 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
2603
2604 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
2605 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
2606 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
2607 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
2608
2609 #undef DO_CMP_PPZI_B
2610 #undef DO_CMP_PPZI_H
2611 #undef DO_CMP_PPZI_S
2612 #undef DO_CMP_PPZI_D
2613 #undef DO_CMP_PPZI
2614
2615 /* Similar to the ARM LastActive pseudocode function.  */
2616 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
2617 {
2618     intptr_t i;
2619
2620     for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
2621         uint64_t pg = *(uint64_t *)(vg + i);
2622         if (pg) {
2623             return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
2624         }
2625     }
2626     return 0;
2627 }
2628
2629 /* Compute a mask into RETB that is true for all G, up to and including
2630  * (if after) or excluding (if !after) the first G & N.
2631  * Return true if BRK found.
2632  */
2633 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
2634                         bool brk, bool after)
2635 {
2636     uint64_t b;
2637
2638     if (brk) {
2639         b = 0;
2640     } else if ((g & n) == 0) {
2641         /* For all G, no N are set; break not found.  */
2642         b = g;
2643     } else {
2644         /* Break somewhere in N.  Locate it.  */
2645         b = g & n;            /* guard true, pred true */
2646         b = b & -b;           /* first such */
2647         if (after) {
2648             b = b | (b - 1);  /* break after same */
2649         } else {
2650             b = b - 1;        /* break before same */
2651         }
2652         brk = true;
2653     }
2654
2655     *retb = b;
2656     return brk;
2657 }
2658
2659 /* Compute a zeroing BRK.  */
2660 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
2661                           intptr_t oprsz, bool after)
2662 {
2663     bool brk = false;
2664     intptr_t i;
2665
2666     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2667         uint64_t this_b, this_g = g[i];
2668
2669         brk = compute_brk(&this_b, n[i], this_g, brk, after);
2670         d[i] = this_b & this_g;
2671     }
2672 }
2673
2674 /* Likewise, but also compute flags.  */
2675 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2676                                intptr_t oprsz, bool after)
2677 {
2678     uint32_t flags = PREDTEST_INIT;
2679     bool brk = false;
2680     intptr_t i;
2681
2682     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2683         uint64_t this_b, this_d, this_g = g[i];
2684
2685         brk = compute_brk(&this_b, n[i], this_g, brk, after);
2686         d[i] = this_d = this_b & this_g;
2687         flags = iter_predtest_fwd(this_d, this_g, flags);
2688     }
2689     return flags;
2690 }
2691
2692 /* Compute a merging BRK.  */
2693 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
2694                           intptr_t oprsz, bool after)
2695 {
2696     bool brk = false;
2697     intptr_t i;
2698
2699     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2700         uint64_t this_b, this_g = g[i];
2701
2702         brk = compute_brk(&this_b, n[i], this_g, brk, after);
2703         d[i] = (this_b & this_g) | (d[i] & ~this_g);
2704     }
2705 }
2706
2707 /* Likewise, but also compute flags.  */
2708 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2709                                intptr_t oprsz, bool after)
2710 {
2711     uint32_t flags = PREDTEST_INIT;
2712     bool brk = false;
2713     intptr_t i;
2714
2715     for (i = 0; i < oprsz / 8; ++i) {
2716         uint64_t this_b, this_d = d[i], this_g = g[i];
2717
2718         brk = compute_brk(&this_b, n[i], this_g, brk, after);
2719         d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2720         flags = iter_predtest_fwd(this_d, this_g, flags);
2721     }
2722     return flags;
2723 }
2724
2725 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2726 {
2727     /* It is quicker to zero the whole predicate than loop on OPRSZ.
2728      * The compiler should turn this into 4 64-bit integer stores.
2729      */
2730     memset(d, 0, sizeof(ARMPredicateReg));
2731     return PREDTEST_INIT;
2732 }
2733
2734 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2735                        uint32_t pred_desc)
2736 {
2737     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2738     if (last_active_pred(vn, vg, oprsz)) {
2739         compute_brk_z(vd, vm, vg, oprsz, true);
2740     } else {
2741         do_zero(vd, oprsz);
2742     }
2743 }
2744
2745 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2746                             uint32_t pred_desc)
2747 {
2748     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2749     if (last_active_pred(vn, vg, oprsz)) {
2750         return compute_brks_z(vd, vm, vg, oprsz, true);
2751     } else {
2752         return do_zero(vd, oprsz);
2753     }
2754 }
2755
2756 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2757                        uint32_t pred_desc)
2758 {
2759     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2760     if (last_active_pred(vn, vg, oprsz)) {
2761         compute_brk_z(vd, vm, vg, oprsz, false);
2762     } else {
2763         do_zero(vd, oprsz);
2764     }
2765 }
2766
2767 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2768                             uint32_t pred_desc)
2769 {
2770     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2771     if (last_active_pred(vn, vg, oprsz)) {
2772         return compute_brks_z(vd, vm, vg, oprsz, false);
2773     } else {
2774         return do_zero(vd, oprsz);
2775     }
2776 }
2777
2778 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2779 {
2780     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2781     compute_brk_z(vd, vn, vg, oprsz, true);
2782 }
2783
2784 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2785 {
2786     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2787     return compute_brks_z(vd, vn, vg, oprsz, true);
2788 }
2789
2790 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2791 {
2792     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2793     compute_brk_z(vd, vn, vg, oprsz, false);
2794 }
2795
2796 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2797 {
2798     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2799     return compute_brks_z(vd, vn, vg, oprsz, false);
2800 }
2801
2802 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2803 {
2804     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2805     compute_brk_m(vd, vn, vg, oprsz, true);
2806 }
2807
2808 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2809 {
2810     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2811     return compute_brks_m(vd, vn, vg, oprsz, true);
2812 }
2813
2814 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2815 {
2816     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2817     compute_brk_m(vd, vn, vg, oprsz, false);
2818 }
2819
2820 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2821 {
2822     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2823     return compute_brks_m(vd, vn, vg, oprsz, false);
2824 }
2825
2826 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2827 {
2828     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2829
2830     if (!last_active_pred(vn, vg, oprsz)) {
2831         do_zero(vd, oprsz);
2832     }
2833 }
2834
2835 /* As if PredTest(Ones(PL), D, esz).  */
2836 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2837                               uint64_t esz_mask)
2838 {
2839     uint32_t flags = PREDTEST_INIT;
2840     intptr_t i;
2841
2842     for (i = 0; i < oprsz / 8; i++) {
2843         flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2844     }
2845     if (oprsz & 7) {
2846         uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2847         flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2848     }
2849     return flags;
2850 }
2851
2852 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2853 {
2854     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2855
2856     if (last_active_pred(vn, vg, oprsz)) {
2857         return predtest_ones(vd, oprsz, -1);
2858     } else {
2859         return do_zero(vd, oprsz);
2860     }
2861 }
2862
2863 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2864 {
2865     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2866     intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2867     uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2868     intptr_t i;
2869
2870     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2871         uint64_t t = n[i] & g[i] & mask;
2872         sum += ctpop64(t);
2873     }
2874     return sum;
2875 }
2876
2877 uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2878 {
2879     uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2880     intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2881     uint64_t esz_mask = pred_esz_masks[esz];
2882     ARMPredicateReg *d = vd;
2883     uint32_t flags;
2884     intptr_t i;
2885
2886     /* Begin with a zero predicate register.  */
2887     flags = do_zero(d, oprsz);
2888     if (count == 0) {
2889         return flags;
2890     }
2891
2892     /* Set all of the requested bits.  */
2893     for (i = 0; i < count / 64; ++i) {
2894         d->p[i] = esz_mask;
2895     }
2896     if (count & 63) {
2897         d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2898     }
2899
2900     return predtest_ones(d, oprsz, esz_mask);
2901 }
2902
2903 /* Recursive reduction on a function;
2904  * C.f. the ARM ARM function ReducePredicated.
2905  *
2906  * While it would be possible to write this without the DATA temporary,
2907  * it is much simpler to process the predicate register this way.
2908  * The recursion is bounded to depth 7 (128 fp16 elements), so there's
2909  * little to gain with a more complex non-recursive form.
2910  */
2911 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
2912 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
2913 {                                                                     \
2914     if (n == 1) {                                                     \
2915         return *data;                                                 \
2916     } else {                                                          \
2917         uintptr_t half = n / 2;                                       \
2918         TYPE lo = NAME##_reduce(data, status, half);                  \
2919         TYPE hi = NAME##_reduce(data + half, status, half);           \
2920         return TYPE##_##FUNC(lo, hi, status);                         \
2921     }                                                                 \
2922 }                                                                     \
2923 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)    \
2924 {                                                                     \
2925     uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc);  \
2926     TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
2927     for (i = 0; i < oprsz; ) {                                        \
2928         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
2929         do {                                                          \
2930             TYPE nn = *(TYPE *)(vn + H(i));                           \
2931             *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
2932             i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
2933         } while (i & 15);                                             \
2934     }                                                                 \
2935     for (; i < maxsz; i += sizeof(TYPE)) {                            \
2936         *(TYPE *)((void *)data + i) = IDENT;                          \
2937     }                                                                 \
2938     return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));             \
2939 }
2940
2941 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
2942 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
2943 DO_REDUCE(sve_faddv_d, float64,     , add, float64_zero)
2944
2945 /* Identity is floatN_default_nan, without the function call.  */
2946 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
2947 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
2948 DO_REDUCE(sve_fminnmv_d, float64,     , minnum, 0x7FF8000000000000ULL)
2949
2950 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
2951 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
2952 DO_REDUCE(sve_fmaxnmv_d, float64,     , maxnum, 0x7FF8000000000000ULL)
2953
2954 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
2955 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
2956 DO_REDUCE(sve_fminv_d, float64,     , min, float64_infinity)
2957
2958 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
2959 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
2960 DO_REDUCE(sve_fmaxv_d, float64,     , max, float64_chs(float64_infinity))
2961
2962 #undef DO_REDUCE
2963
2964 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
2965                              void *status, uint32_t desc)
2966 {
2967     intptr_t i = 0, opr_sz = simd_oprsz(desc);
2968     float16 result = nn;
2969
2970     do {
2971         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2972         do {
2973             if (pg & 1) {
2974                 float16 mm = *(float16 *)(vm + H1_2(i));
2975                 result = float16_add(result, mm, status);
2976             }
2977             i += sizeof(float16), pg >>= sizeof(float16);
2978         } while (i & 15);
2979     } while (i < opr_sz);
2980
2981     return result;
2982 }
2983
2984 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
2985                              void *status, uint32_t desc)
2986 {
2987     intptr_t i = 0, opr_sz = simd_oprsz(desc);
2988     float32 result = nn;
2989
2990     do {
2991         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2992         do {
2993             if (pg & 1) {
2994                 float32 mm = *(float32 *)(vm + H1_2(i));
2995                 result = float32_add(result, mm, status);
2996             }
2997             i += sizeof(float32), pg >>= sizeof(float32);
2998         } while (i & 15);
2999     } while (i < opr_sz);
3000
3001     return result;
3002 }
3003
3004 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
3005                              void *status, uint32_t desc)
3006 {
3007     intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
3008     uint64_t *m = vm;
3009     uint8_t *pg = vg;
3010
3011     for (i = 0; i < opr_sz; i++) {
3012         if (pg[H1(i)] & 1) {
3013             nn = float64_add(nn, m[i], status);
3014         }
3015     }
3016
3017     return nn;
3018 }
3019
3020 /* Fully general three-operand expander, controlled by a predicate,
3021  * With the extra float_status parameter.
3022  */
3023 #define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
3024 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
3025                   void *status, uint32_t desc)                  \
3026 {                                                               \
3027     intptr_t i = simd_oprsz(desc);                              \
3028     uint64_t *g = vg;                                           \
3029     do {                                                        \
3030         uint64_t pg = g[(i - 1) >> 6];                          \
3031         do {                                                    \
3032             i -= sizeof(TYPE);                                  \
3033             if (likely((pg >> (i & 63)) & 1)) {                 \
3034                 TYPE nn = *(TYPE *)(vn + H(i));                 \
3035                 TYPE mm = *(TYPE *)(vm + H(i));                 \
3036                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
3037             }                                                   \
3038         } while (i & 63);                                       \
3039     } while (i != 0);                                           \
3040 }
3041
3042 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3043 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3044 DO_ZPZZ_FP(sve_fadd_d, uint64_t,     , float64_add)
3045
3046 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3047 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3048 DO_ZPZZ_FP(sve_fsub_d, uint64_t,     , float64_sub)
3049
3050 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3051 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3052 DO_ZPZZ_FP(sve_fmul_d, uint64_t,     , float64_mul)
3053
3054 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3055 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3056 DO_ZPZZ_FP(sve_fdiv_d, uint64_t,     , float64_div)
3057
3058 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3059 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3060 DO_ZPZZ_FP(sve_fmin_d, uint64_t,     , float64_min)
3061
3062 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3063 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3064 DO_ZPZZ_FP(sve_fmax_d, uint64_t,     , float64_max)
3065
3066 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3067 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3068 DO_ZPZZ_FP(sve_fminnum_d, uint64_t,     , float64_minnum)
3069
3070 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3071 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3072 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t,     , float64_maxnum)
3073
3074 static inline float16 abd_h(float16 a, float16 b, float_status *s)
3075 {
3076     return float16_abs(float16_sub(a, b, s));
3077 }
3078
3079 static inline float32 abd_s(float32 a, float32 b, float_status *s)
3080 {
3081     return float32_abs(float32_sub(a, b, s));
3082 }
3083
3084 static inline float64 abd_d(float64 a, float64 b, float_status *s)
3085 {
3086     return float64_abs(float64_sub(a, b, s));
3087 }
3088
3089 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
3090 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
3091 DO_ZPZZ_FP(sve_fabd_d, uint64_t,     , abd_d)
3092
3093 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3094 {
3095     int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3096     return float64_scalbn(a, b_int, s);
3097 }
3098
3099 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
3100 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
3101 DO_ZPZZ_FP(sve_fscalbn_d, int64_t,     , scalbn_d)
3102
3103 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
3104 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
3105 DO_ZPZZ_FP(sve_fmulx_d, uint64_t,     , helper_vfp_mulxd)
3106
3107 #undef DO_ZPZZ_FP
3108
3109 /* Three-operand expander, with one scalar operand, controlled by
3110  * a predicate, with the extra float_status parameter.
3111  */
3112 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3113 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
3114                   void *status, uint32_t desc)                    \
3115 {                                                                 \
3116     intptr_t i = simd_oprsz(desc);                                \
3117     uint64_t *g = vg;                                             \
3118     TYPE mm = scalar;                                             \
3119     do {                                                          \
3120         uint64_t pg = g[(i - 1) >> 6];                            \
3121         do {                                                      \
3122             i -= sizeof(TYPE);                                    \
3123             if (likely((pg >> (i & 63)) & 1)) {                   \
3124                 TYPE nn = *(TYPE *)(vn + H(i));                   \
3125                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
3126             }                                                     \
3127         } while (i & 63);                                         \
3128     } while (i != 0);                                             \
3129 }
3130
3131 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
3132 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
3133 DO_ZPZS_FP(sve_fadds_d, float64,     , float64_add)
3134
3135 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
3136 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
3137 DO_ZPZS_FP(sve_fsubs_d, float64,     , float64_sub)
3138
3139 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
3140 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
3141 DO_ZPZS_FP(sve_fmuls_d, float64,     , float64_mul)
3142
3143 static inline float16 subr_h(float16 a, float16 b, float_status *s)
3144 {
3145     return float16_sub(b, a, s);
3146 }
3147
3148 static inline float32 subr_s(float32 a, float32 b, float_status *s)
3149 {
3150     return float32_sub(b, a, s);
3151 }
3152
3153 static inline float64 subr_d(float64 a, float64 b, float_status *s)
3154 {
3155     return float64_sub(b, a, s);
3156 }
3157
3158 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
3159 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
3160 DO_ZPZS_FP(sve_fsubrs_d, float64,     , subr_d)
3161
3162 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
3163 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
3164 DO_ZPZS_FP(sve_fmaxnms_d, float64,     , float64_maxnum)
3165
3166 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
3167 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
3168 DO_ZPZS_FP(sve_fminnms_d, float64,     , float64_minnum)
3169
3170 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
3171 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
3172 DO_ZPZS_FP(sve_fmaxs_d, float64,     , float64_max)
3173
3174 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
3175 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
3176 DO_ZPZS_FP(sve_fmins_d, float64,     , float64_min)
3177
3178 /* Fully general two-operand expander, controlled by a predicate,
3179  * With the extra float_status parameter.
3180  */
3181 #define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
3182 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3183 {                                                                     \
3184     intptr_t i = simd_oprsz(desc);                                    \
3185     uint64_t *g = vg;                                                 \
3186     do {                                                              \
3187         uint64_t pg = g[(i - 1) >> 6];                                \
3188         do {                                                          \
3189             i -= sizeof(TYPE);                                        \
3190             if (likely((pg >> (i & 63)) & 1)) {                       \
3191                 TYPE nn = *(TYPE *)(vn + H(i));                       \
3192                 *(TYPE *)(vd + H(i)) = OP(nn, status);                \
3193             }                                                         \
3194         } while (i & 63);                                             \
3195     } while (i != 0);                                                 \
3196 }
3197
3198 /* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
3199  * FZ16.  When converting from fp16, this affects flushing input denormals;
3200  * when converting to fp16, this affects flushing output denormals.
3201  */
3202 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
3203 {
3204     bool save = get_flush_inputs_to_zero(fpst);
3205     float32 ret;
3206
3207     set_flush_inputs_to_zero(false, fpst);
3208     ret = float16_to_float32(f, true, fpst);
3209     set_flush_inputs_to_zero(save, fpst);
3210     return ret;
3211 }
3212
3213 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
3214 {
3215     bool save = get_flush_inputs_to_zero(fpst);
3216     float64 ret;
3217
3218     set_flush_inputs_to_zero(false, fpst);
3219     ret = float16_to_float64(f, true, fpst);
3220     set_flush_inputs_to_zero(save, fpst);
3221     return ret;
3222 }
3223
3224 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
3225 {
3226     bool save = get_flush_to_zero(fpst);
3227     float16 ret;
3228
3229     set_flush_to_zero(false, fpst);
3230     ret = float32_to_float16(f, true, fpst);
3231     set_flush_to_zero(save, fpst);
3232     return ret;
3233 }
3234
3235 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
3236 {
3237     bool save = get_flush_to_zero(fpst);
3238     float16 ret;
3239
3240     set_flush_to_zero(false, fpst);
3241     ret = float64_to_float16(f, true, fpst);
3242     set_flush_to_zero(save, fpst);
3243     return ret;
3244 }
3245
3246 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3247 {
3248     if (float16_is_any_nan(f)) {
3249         float_raise(float_flag_invalid, s);
3250         return 0;
3251     }
3252     return float16_to_int16_round_to_zero(f, s);
3253 }
3254
3255 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3256 {
3257     if (float16_is_any_nan(f)) {
3258         float_raise(float_flag_invalid, s);
3259         return 0;
3260     }
3261     return float16_to_int64_round_to_zero(f, s);
3262 }
3263
3264 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3265 {
3266     if (float32_is_any_nan(f)) {
3267         float_raise(float_flag_invalid, s);
3268         return 0;
3269     }
3270     return float32_to_int64_round_to_zero(f, s);
3271 }
3272
3273 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3274 {
3275     if (float64_is_any_nan(f)) {
3276         float_raise(float_flag_invalid, s);
3277         return 0;
3278     }
3279     return float64_to_int64_round_to_zero(f, s);
3280 }
3281
3282 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3283 {
3284     if (float16_is_any_nan(f)) {
3285         float_raise(float_flag_invalid, s);
3286         return 0;
3287     }
3288     return float16_to_uint16_round_to_zero(f, s);
3289 }
3290
3291 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3292 {
3293     if (float16_is_any_nan(f)) {
3294         float_raise(float_flag_invalid, s);
3295         return 0;
3296     }
3297     return float16_to_uint64_round_to_zero(f, s);
3298 }
3299
3300 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3301 {
3302     if (float32_is_any_nan(f)) {
3303         float_raise(float_flag_invalid, s);
3304         return 0;
3305     }
3306     return float32_to_uint64_round_to_zero(f, s);
3307 }
3308
3309 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3310 {
3311     if (float64_is_any_nan(f)) {
3312         float_raise(float_flag_invalid, s);
3313         return 0;
3314     }
3315     return float64_to_uint64_round_to_zero(f, s);
3316 }
3317
3318 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
3319 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
3320 DO_ZPZ_FP(sve_fcvt_dh, uint64_t,     , sve_f64_to_f16)
3321 DO_ZPZ_FP(sve_fcvt_hd, uint64_t,     , sve_f16_to_f64)
3322 DO_ZPZ_FP(sve_fcvt_ds, uint64_t,     , float64_to_float32)
3323 DO_ZPZ_FP(sve_fcvt_sd, uint64_t,     , float32_to_float64)
3324
3325 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
3326 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
3327 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
3328 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t,     , vfp_float16_to_int64_rtz)
3329 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t,     , vfp_float32_to_int64_rtz)
3330 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t,     , helper_vfp_tosizd)
3331 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t,     , vfp_float64_to_int64_rtz)
3332
3333 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
3334 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
3335 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
3336 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t,     , vfp_float16_to_uint64_rtz)
3337 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t,     , vfp_float32_to_uint64_rtz)
3338 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t,     , helper_vfp_touizd)
3339 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t,     , vfp_float64_to_uint64_rtz)
3340
3341 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
3342 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
3343 DO_ZPZ_FP(sve_frint_d, uint64_t,     , helper_rintd)
3344
3345 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
3346 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
3347 DO_ZPZ_FP(sve_frintx_d, uint64_t,     , float64_round_to_int)
3348
3349 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
3350 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
3351 DO_ZPZ_FP(sve_frecpx_d, uint64_t,     , helper_frecpx_f64)
3352
3353 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
3354 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
3355 DO_ZPZ_FP(sve_fsqrt_d, uint64_t,     , float64_sqrt)
3356
3357 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
3358 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
3359 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
3360 DO_ZPZ_FP(sve_scvt_sd, uint64_t,     , int32_to_float64)
3361 DO_ZPZ_FP(sve_scvt_dh, uint64_t,     , int64_to_float16)
3362 DO_ZPZ_FP(sve_scvt_ds, uint64_t,     , int64_to_float32)
3363 DO_ZPZ_FP(sve_scvt_dd, uint64_t,     , int64_to_float64)
3364
3365 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
3366 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
3367 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
3368 DO_ZPZ_FP(sve_ucvt_sd, uint64_t,     , uint32_to_float64)
3369 DO_ZPZ_FP(sve_ucvt_dh, uint64_t,     , uint64_to_float16)
3370 DO_ZPZ_FP(sve_ucvt_ds, uint64_t,     , uint64_to_float32)
3371 DO_ZPZ_FP(sve_ucvt_dd, uint64_t,     , uint64_to_float64)
3372
3373 #undef DO_ZPZ_FP
3374
3375 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
3376                             float_status *status, uint32_t desc,
3377                             uint16_t neg1, uint16_t neg3)
3378 {
3379     intptr_t i = simd_oprsz(desc);
3380     uint64_t *g = vg;
3381
3382     do {
3383         uint64_t pg = g[(i - 1) >> 6];
3384         do {
3385             i -= 2;
3386             if (likely((pg >> (i & 63)) & 1)) {
3387                 float16 e1, e2, e3, r;
3388
3389                 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3390                 e2 = *(uint16_t *)(vm + H1_2(i));
3391                 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
3392                 r = float16_muladd(e1, e2, e3, 0, status);
3393                 *(uint16_t *)(vd + H1_2(i)) = r;
3394             }
3395         } while (i & 63);
3396     } while (i != 0);
3397 }
3398
3399 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3400                               void *vg, void *status, uint32_t desc)
3401 {
3402     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
3403 }
3404
3405 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3406                               void *vg, void *status, uint32_t desc)
3407 {
3408     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
3409 }
3410
3411 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3412                                void *vg, void *status, uint32_t desc)
3413 {
3414     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
3415 }
3416
3417 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3418                                void *vg, void *status, uint32_t desc)
3419 {
3420     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
3421 }
3422
3423 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
3424                             float_status *status, uint32_t desc,
3425                             uint32_t neg1, uint32_t neg3)
3426 {
3427     intptr_t i = simd_oprsz(desc);
3428     uint64_t *g = vg;
3429
3430     do {
3431         uint64_t pg = g[(i - 1) >> 6];
3432         do {
3433             i -= 4;
3434             if (likely((pg >> (i & 63)) & 1)) {
3435                 float32 e1, e2, e3, r;
3436
3437                 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3438                 e2 = *(uint32_t *)(vm + H1_4(i));
3439                 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
3440                 r = float32_muladd(e1, e2, e3, 0, status);
3441                 *(uint32_t *)(vd + H1_4(i)) = r;
3442             }
3443         } while (i & 63);
3444     } while (i != 0);
3445 }
3446
3447 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3448                               void *vg, void *status, uint32_t desc)
3449 {
3450     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
3451 }
3452
3453 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3454                               void *vg, void *status, uint32_t desc)
3455 {
3456     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
3457 }
3458
3459 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3460                                void *vg, void *status, uint32_t desc)
3461 {
3462     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
3463 }
3464
3465 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3466                                void *vg, void *status, uint32_t desc)
3467 {
3468     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
3469 }
3470
3471 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
3472                             float_status *status, uint32_t desc,
3473                             uint64_t neg1, uint64_t neg3)
3474 {
3475     intptr_t i = simd_oprsz(desc);
3476     uint64_t *g = vg;
3477
3478     do {
3479         uint64_t pg = g[(i - 1) >> 6];
3480         do {
3481             i -= 8;
3482             if (likely((pg >> (i & 63)) & 1)) {
3483                 float64 e1, e2, e3, r;
3484
3485                 e1 = *(uint64_t *)(vn + i) ^ neg1;
3486                 e2 = *(uint64_t *)(vm + i);
3487                 e3 = *(uint64_t *)(va + i) ^ neg3;
3488                 r = float64_muladd(e1, e2, e3, 0, status);
3489                 *(uint64_t *)(vd + i) = r;
3490             }
3491         } while (i & 63);
3492     } while (i != 0);
3493 }
3494
3495 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3496                               void *vg, void *status, uint32_t desc)
3497 {
3498     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
3499 }
3500
3501 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3502                               void *vg, void *status, uint32_t desc)
3503 {
3504     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
3505 }
3506
3507 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3508                                void *vg, void *status, uint32_t desc)
3509 {
3510     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
3511 }
3512
3513 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3514                                void *vg, void *status, uint32_t desc)
3515 {
3516     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
3517 }
3518
3519 /* Two operand floating-point comparison controlled by a predicate.
3520  * Unlike the integer version, we are not allowed to optimistically
3521  * compare operands, since the comparison may have side effects wrt
3522  * the FPSR.
3523  */
3524 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
3525 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
3526                   void *status, uint32_t desc)                          \
3527 {                                                                       \
3528     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
3529     uint64_t *d = vd, *g = vg;                                          \
3530     do {                                                                \
3531         uint64_t out = 0, pg = g[j];                                    \
3532         do {                                                            \
3533             i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
3534             if (likely((pg >> (i & 63)) & 1)) {                         \
3535                 TYPE nn = *(TYPE *)(vn + H(i));                         \
3536                 TYPE mm = *(TYPE *)(vm + H(i));                         \
3537                 out |= OP(TYPE, nn, mm, status);                        \
3538             }                                                           \
3539         } while (i & 63);                                               \
3540         d[j--] = out;                                                   \
3541     } while (i > 0);                                                    \
3542 }
3543
3544 #define DO_FPCMP_PPZZ_H(NAME, OP) \
3545     DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
3546 #define DO_FPCMP_PPZZ_S(NAME, OP) \
3547     DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
3548 #define DO_FPCMP_PPZZ_D(NAME, OP) \
3549     DO_FPCMP_PPZZ(NAME##_d, float64,     , OP)
3550
3551 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
3552     DO_FPCMP_PPZZ_H(NAME, OP)   \
3553     DO_FPCMP_PPZZ_S(NAME, OP)   \
3554     DO_FPCMP_PPZZ_D(NAME, OP)
3555
3556 #define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
3557 #define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
3558 #define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
3559 #define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
3560 #define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
3561 #define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
3562 #define DO_FCMUO(TYPE, X, Y, ST)  \
3563     TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
3564 #define DO_FACGE(TYPE, X, Y, ST)  \
3565     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
3566 #define DO_FACGT(TYPE, X, Y, ST)  \
3567     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
3568
3569 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
3570 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
3571 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
3572 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
3573 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
3574 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
3575 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
3576
3577 #undef DO_FPCMP_PPZZ_ALL
3578 #undef DO_FPCMP_PPZZ_D
3579 #undef DO_FPCMP_PPZZ_S
3580 #undef DO_FPCMP_PPZZ_H
3581 #undef DO_FPCMP_PPZZ
3582
3583 /* One operand floating-point comparison against zero, controlled
3584  * by a predicate.
3585  */
3586 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
3587 void HELPER(NAME)(void *vd, void *vn, void *vg,            \
3588                   void *status, uint32_t desc)             \
3589 {                                                          \
3590     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;       \
3591     uint64_t *d = vd, *g = vg;                             \
3592     do {                                                   \
3593         uint64_t out = 0, pg = g[j];                       \
3594         do {                                               \
3595             i -= sizeof(TYPE), out <<= sizeof(TYPE);       \
3596             if ((pg >> (i & 63)) & 1) {                    \
3597                 TYPE nn = *(TYPE *)(vn + H(i));            \
3598                 out |= OP(TYPE, nn, 0, status);            \
3599             }                                              \
3600         } while (i & 63);                                  \
3601         d[j--] = out;                                      \
3602     } while (i > 0);                                       \
3603 }
3604
3605 #define DO_FPCMP_PPZ0_H(NAME, OP) \
3606     DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
3607 #define DO_FPCMP_PPZ0_S(NAME, OP) \
3608     DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
3609 #define DO_FPCMP_PPZ0_D(NAME, OP) \
3610     DO_FPCMP_PPZ0(NAME##_d, float64,     , OP)
3611
3612 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
3613     DO_FPCMP_PPZ0_H(NAME, OP)   \
3614     DO_FPCMP_PPZ0_S(NAME, OP)   \
3615     DO_FPCMP_PPZ0_D(NAME, OP)
3616
3617 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
3618 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
3619 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
3620 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
3621 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
3622 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3623
3624 /* FP Trig Multiply-Add. */
3625
3626 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3627 {
3628     static const float16 coeff[16] = {
3629         0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3630         0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3631     };
3632     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
3633     intptr_t x = simd_data(desc);
3634     float16 *d = vd, *n = vn, *m = vm;
3635     for (i = 0; i < opr_sz; i++) {
3636         float16 mm = m[i];
3637         intptr_t xx = x;
3638         if (float16_is_neg(mm)) {
3639             mm = float16_abs(mm);
3640             xx += 8;
3641         }
3642         d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
3643     }
3644 }
3645
3646 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3647 {
3648     static const float32 coeff[16] = {
3649         0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3650         0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3651         0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3652         0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3653     };
3654     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
3655     intptr_t x = simd_data(desc);
3656     float32 *d = vd, *n = vn, *m = vm;
3657     for (i = 0; i < opr_sz; i++) {
3658         float32 mm = m[i];
3659         intptr_t xx = x;
3660         if (float32_is_neg(mm)) {
3661             mm = float32_abs(mm);
3662             xx += 8;
3663         }
3664         d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
3665     }
3666 }
3667
3668 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3669 {
3670     static const float64 coeff[16] = {
3671         0x3ff0000000000000ull, 0xbfc5555555555543ull,
3672         0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
3673         0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
3674         0x3de5d8408868552full, 0x0000000000000000ull,
3675         0x3ff0000000000000ull, 0xbfe0000000000000ull,
3676         0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
3677         0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
3678         0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
3679     };
3680     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
3681     intptr_t x = simd_data(desc);
3682     float64 *d = vd, *n = vn, *m = vm;
3683     for (i = 0; i < opr_sz; i++) {
3684         float64 mm = m[i];
3685         intptr_t xx = x;
3686         if (float64_is_neg(mm)) {
3687             mm = float64_abs(mm);
3688             xx += 8;
3689         }
3690         d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
3691     }
3692 }
3693
3694 /*
3695  * FP Complex Add
3696  */
3697
3698 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
3699                          void *vs, uint32_t desc)
3700 {
3701     intptr_t j, i = simd_oprsz(desc);
3702     uint64_t *g = vg;
3703     float16 neg_imag = float16_set_sign(0, simd_data(desc));
3704     float16 neg_real = float16_chs(neg_imag);
3705
3706     do {
3707         uint64_t pg = g[(i - 1) >> 6];
3708         do {
3709             float16 e0, e1, e2, e3;
3710
3711             /* I holds the real index; J holds the imag index.  */
3712             j = i - sizeof(float16);
3713             i -= 2 * sizeof(float16);
3714
3715             e0 = *(float16 *)(vn + H1_2(i));
3716             e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
3717             e2 = *(float16 *)(vn + H1_2(j));
3718             e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
3719
3720             if (likely((pg >> (i & 63)) & 1)) {
3721                 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
3722             }
3723             if (likely((pg >> (j & 63)) & 1)) {
3724                 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
3725             }
3726         } while (i & 63);
3727     } while (i != 0);
3728 }
3729
3730 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
3731                          void *vs, uint32_t desc)
3732 {
3733     intptr_t j, i = simd_oprsz(desc);
3734     uint64_t *g = vg;
3735     float32 neg_imag = float32_set_sign(0, simd_data(desc));
3736     float32 neg_real = float32_chs(neg_imag);
3737
3738     do {
3739         uint64_t pg = g[(i - 1) >> 6];
3740         do {
3741             float32 e0, e1, e2, e3;
3742
3743             /* I holds the real index; J holds the imag index.  */
3744             j = i - sizeof(float32);
3745             i -= 2 * sizeof(float32);
3746
3747             e0 = *(float32 *)(vn + H1_2(i));
3748             e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
3749             e2 = *(float32 *)(vn + H1_2(j));
3750             e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
3751
3752             if (likely((pg >> (i & 63)) & 1)) {
3753                 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
3754             }
3755             if (likely((pg >> (j & 63)) & 1)) {
3756                 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
3757             }
3758         } while (i & 63);
3759     } while (i != 0);
3760 }
3761
3762 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
3763                          void *vs, uint32_t desc)
3764 {
3765     intptr_t j, i = simd_oprsz(desc);
3766     uint64_t *g = vg;
3767     float64 neg_imag = float64_set_sign(0, simd_data(desc));
3768     float64 neg_real = float64_chs(neg_imag);
3769
3770     do {
3771         uint64_t pg = g[(i - 1) >> 6];
3772         do {
3773             float64 e0, e1, e2, e3;
3774
3775             /* I holds the real index; J holds the imag index.  */
3776             j = i - sizeof(float64);
3777             i -= 2 * sizeof(float64);
3778
3779             e0 = *(float64 *)(vn + H1_2(i));
3780             e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
3781             e2 = *(float64 *)(vn + H1_2(j));
3782             e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
3783
3784             if (likely((pg >> (i & 63)) & 1)) {
3785                 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
3786             }
3787             if (likely((pg >> (j & 63)) & 1)) {
3788                 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
3789             }
3790         } while (i & 63);
3791     } while (i != 0);
3792 }
3793
3794 /*
3795  * FP Complex Multiply
3796  */
3797
3798 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3799                                void *vg, void *status, uint32_t desc)
3800 {
3801     intptr_t j, i = simd_oprsz(desc);
3802     unsigned rot = simd_data(desc);
3803     bool flip = rot & 1;
3804     float16 neg_imag, neg_real;
3805     uint64_t *g = vg;
3806
3807     neg_imag = float16_set_sign(0, (rot & 2) != 0);
3808     neg_real = float16_set_sign(0, rot == 1 || rot == 2);
3809
3810     do {
3811         uint64_t pg = g[(i - 1) >> 6];
3812         do {
3813             float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
3814
3815             /* I holds the real index; J holds the imag index.  */
3816             j = i - sizeof(float16);
3817             i -= 2 * sizeof(float16);
3818
3819             nr = *(float16 *)(vn + H1_2(i));
3820             ni = *(float16 *)(vn + H1_2(j));
3821             mr = *(float16 *)(vm + H1_2(i));
3822             mi = *(float16 *)(vm + H1_2(j));
3823
3824             e2 = (flip ? ni : nr);
3825             e1 = (flip ? mi : mr) ^ neg_real;
3826             e4 = e2;
3827             e3 = (flip ? mr : mi) ^ neg_imag;
3828
3829             if (likely((pg >> (i & 63)) & 1)) {
3830                 d = *(float16 *)(va + H1_2(i));
3831                 d = float16_muladd(e2, e1, d, 0, status);
3832                 *(float16 *)(vd + H1_2(i)) = d;
3833             }
3834             if (likely((pg >> (j & 63)) & 1)) {
3835                 d = *(float16 *)(va + H1_2(j));
3836                 d = float16_muladd(e4, e3, d, 0, status);
3837                 *(float16 *)(vd + H1_2(j)) = d;
3838             }
3839         } while (i & 63);
3840     } while (i != 0);
3841 }
3842
3843 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3844                                void *vg, void *status, uint32_t desc)
3845 {
3846     intptr_t j, i = simd_oprsz(desc);
3847     unsigned rot = simd_data(desc);
3848     bool flip = rot & 1;
3849     float32 neg_imag, neg_real;
3850     uint64_t *g = vg;
3851
3852     neg_imag = float32_set_sign(0, (rot & 2) != 0);
3853     neg_real = float32_set_sign(0, rot == 1 || rot == 2);
3854
3855     do {
3856         uint64_t pg = g[(i - 1) >> 6];
3857         do {
3858             float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
3859
3860             /* I holds the real index; J holds the imag index.  */
3861             j = i - sizeof(float32);
3862             i -= 2 * sizeof(float32);
3863
3864             nr = *(float32 *)(vn + H1_2(i));
3865             ni = *(float32 *)(vn + H1_2(j));
3866             mr = *(float32 *)(vm + H1_2(i));
3867             mi = *(float32 *)(vm + H1_2(j));
3868
3869             e2 = (flip ? ni : nr);
3870             e1 = (flip ? mi : mr) ^ neg_real;
3871             e4 = e2;
3872             e3 = (flip ? mr : mi) ^ neg_imag;
3873
3874             if (likely((pg >> (i & 63)) & 1)) {
3875                 d = *(float32 *)(va + H1_2(i));
3876                 d = float32_muladd(e2, e1, d, 0, status);
3877                 *(float32 *)(vd + H1_2(i)) = d;
3878             }
3879             if (likely((pg >> (j & 63)) & 1)) {
3880                 d = *(float32 *)(va + H1_2(j));
3881                 d = float32_muladd(e4, e3, d, 0, status);
3882                 *(float32 *)(vd + H1_2(j)) = d;
3883             }
3884         } while (i & 63);
3885     } while (i != 0);
3886 }
3887
3888 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3889                                void *vg, void *status, uint32_t desc)
3890 {
3891     intptr_t j, i = simd_oprsz(desc);
3892     unsigned rot = simd_data(desc);
3893     bool flip = rot & 1;
3894     float64 neg_imag, neg_real;
3895     uint64_t *g = vg;
3896
3897     neg_imag = float64_set_sign(0, (rot & 2) != 0);
3898     neg_real = float64_set_sign(0, rot == 1 || rot == 2);
3899
3900     do {
3901         uint64_t pg = g[(i - 1) >> 6];
3902         do {
3903             float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
3904
3905             /* I holds the real index; J holds the imag index.  */
3906             j = i - sizeof(float64);
3907             i -= 2 * sizeof(float64);
3908
3909             nr = *(float64 *)(vn + H1_2(i));
3910             ni = *(float64 *)(vn + H1_2(j));
3911             mr = *(float64 *)(vm + H1_2(i));
3912             mi = *(float64 *)(vm + H1_2(j));
3913
3914             e2 = (flip ? ni : nr);
3915             e1 = (flip ? mi : mr) ^ neg_real;
3916             e4 = e2;
3917             e3 = (flip ? mr : mi) ^ neg_imag;
3918
3919             if (likely((pg >> (i & 63)) & 1)) {
3920                 d = *(float64 *)(va + H1_2(i));
3921                 d = float64_muladd(e2, e1, d, 0, status);
3922                 *(float64 *)(vd + H1_2(i)) = d;
3923             }
3924             if (likely((pg >> (j & 63)) & 1)) {
3925                 d = *(float64 *)(va + H1_2(j));
3926                 d = float64_muladd(e4, e3, d, 0, status);
3927                 *(float64 *)(vd + H1_2(j)) = d;
3928             }
3929         } while (i & 63);
3930     } while (i != 0);
3931 }
3932
3933 /*
3934  * Load contiguous data, protected by a governing predicate.
3935  */
3936
3937 /*
3938  * Load one element into @vd + @reg_off from @host.
3939  * The controlling predicate is known to be true.
3940  */
3941 typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);
3942
3943 /*
3944  * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
3945  * The controlling predicate is known to be true.
3946  */
3947 typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
3948                               target_ulong vaddr, uintptr_t retaddr);
3949
3950 /*
3951  * Generate the above primitives.
3952  */
3953
3954 #define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
3955 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host)  \
3956 {                                                                      \
3957     TYPEM val = HOST(host);                                            \
3958     *(TYPEE *)(vd + H(reg_off)) = val;                                 \
3959 }
3960
3961 #define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
3962 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host)  \
3963 { HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }
3964
3965 #define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
3966 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off,  \
3967                              target_ulong addr, uintptr_t ra)               \
3968 {                                                                           \
3969     *(TYPEE *)(vd + H(reg_off)) = (TYPEM)TLB(env, addr, ra);                \
3970 }
3971
3972 #define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
3973 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off,  \
3974                              target_ulong addr, uintptr_t ra)               \
3975 {                                                                           \
3976     TLB(env, addr, (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra);                 \
3977 }
3978
3979 #define DO_LD_PRIM_1(NAME, H, TE, TM)                   \
3980     DO_LD_HOST(NAME, H, TE, TM, ldub_p)                 \
3981     DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
3982
3983 DO_LD_PRIM_1(ld1bb,  H1,   uint8_t,  uint8_t)
3984 DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
3985 DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t,  int8_t)
3986 DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
3987 DO_LD_PRIM_1(ld1bss, H1_4, uint32_t,  int8_t)
3988 DO_LD_PRIM_1(ld1bdu,     , uint64_t, uint8_t)
3989 DO_LD_PRIM_1(ld1bds,     , uint64_t,  int8_t)
3990
3991 #define DO_ST_PRIM_1(NAME, H, TE, TM)                   \
3992     DO_ST_HOST(st1##NAME, H, TE, TM, stb_p)             \
3993     DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
3994
3995 DO_ST_PRIM_1(bb,   H1,  uint8_t, uint8_t)
3996 DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
3997 DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
3998 DO_ST_PRIM_1(bd,     , uint64_t, uint8_t)
3999
4000 #define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
4001     DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p)    \
4002     DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p)    \
4003     DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
4004     DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
4005
4006 #define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
4007     DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p)    \
4008     DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p)    \
4009     DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
4010     DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
4011
4012 DO_LD_PRIM_2(hh,  H1_2, uint16_t, uint16_t, lduw)
4013 DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
4014 DO_LD_PRIM_2(hss, H1_4, uint32_t,  int16_t, lduw)
4015 DO_LD_PRIM_2(hdu,     , uint64_t, uint16_t, lduw)
4016 DO_LD_PRIM_2(hds,     , uint64_t,  int16_t, lduw)
4017
4018 DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
4019 DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
4020 DO_ST_PRIM_2(hd,     , uint64_t, uint16_t, stw)
4021
4022 DO_LD_PRIM_2(ss,  H1_4, uint32_t, uint32_t, ldl)
4023 DO_LD_PRIM_2(sdu,     , uint64_t, uint32_t, ldl)
4024 DO_LD_PRIM_2(sds,     , uint64_t,  int32_t, ldl)
4025
4026 DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
4027 DO_ST_PRIM_2(sd,     , uint64_t, uint32_t, stl)
4028
4029 DO_LD_PRIM_2(dd,     , uint64_t, uint64_t, ldq)
4030 DO_ST_PRIM_2(dd,     , uint64_t, uint64_t, stq)
4031
4032 #undef DO_LD_TLB
4033 #undef DO_ST_TLB
4034 #undef DO_LD_HOST
4035 #undef DO_LD_PRIM_1
4036 #undef DO_ST_PRIM_1
4037 #undef DO_LD_PRIM_2
4038 #undef DO_ST_PRIM_2
4039
4040 /*
4041  * Skip through a sequence of inactive elements in the guarding predicate @vg,
4042  * beginning at @reg_off bounded by @reg_max.  Return the offset of the active
4043  * element >= @reg_off, or @reg_max if there were no active elements at all.
4044  */
4045 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
4046                                  intptr_t reg_max, int esz)
4047 {
4048     uint64_t pg_mask = pred_esz_masks[esz];
4049     uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
4050
4051     /* In normal usage, the first element is active.  */
4052     if (likely(pg & 1)) {
4053         return reg_off;
4054     }
4055
4056     if (pg == 0) {
4057         reg_off &= -64;
4058         do {
4059             reg_off += 64;
4060             if (unlikely(reg_off >= reg_max)) {
4061                 /* The entire predicate was false.  */
4062                 return reg_max;
4063             }
4064             pg = vg[reg_off >> 6] & pg_mask;
4065         } while (pg == 0);
4066     }
4067     reg_off += ctz64(pg);
4068
4069     /* We should never see an out of range predicate bit set.  */
4070     tcg_debug_assert(reg_off < reg_max);
4071     return reg_off;
4072 }
4073
4074 /*
4075  * Resolve the guest virtual address to info->host and info->flags.
4076  * If @nofault, return false if the page is invalid, otherwise
4077  * exit via page fault exception.
4078  */
4079
4080 typedef struct {
4081     void *host;
4082     int flags;
4083     MemTxAttrs attrs;
4084 } SVEHostPage;
4085
4086 static bool sve_probe_page(SVEHostPage *info, bool nofault,
4087                            CPUARMState *env, target_ulong addr,
4088                            int mem_off, MMUAccessType access_type,
4089                            int mmu_idx, uintptr_t retaddr)
4090 {
4091     int flags;
4092
4093     addr += mem_off;
4094     flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
4095                                &info->host, retaddr);
4096     info->flags = flags;
4097
4098     if (flags & TLB_INVALID_MASK) {
4099         g_assert(nofault);
4100         return false;
4101     }
4102
4103     /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
4104     info->host -= mem_off;
4105
4106 #ifdef CONFIG_USER_ONLY
4107     memset(&info->attrs, 0, sizeof(info->attrs));
4108 #else
4109     /*
4110      * Find the iotlbentry for addr and return the transaction attributes.
4111      * This *must* be present in the TLB because we just found the mapping.
4112      */
4113     {
4114         uintptr_t index = tlb_index(env, mmu_idx, addr);
4115
4116 # ifdef CONFIG_DEBUG_TCG
4117         CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
4118         target_ulong comparator = (access_type == MMU_DATA_LOAD
4119                                    ? entry->addr_read
4120                                    : tlb_addr_write(entry));
4121         g_assert(tlb_hit(comparator, addr));
4122 # endif
4123
4124         CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
4125         info->attrs = iotlbentry->attrs;
4126     }
4127 #endif
4128
4129     return true;
4130 }
4131
4132
4133 /*
4134  * Analyse contiguous data, protected by a governing predicate.
4135  */
4136
4137 typedef enum {
4138     FAULT_NO,
4139     FAULT_FIRST,
4140     FAULT_ALL,
4141 } SVEContFault;
4142
4143 typedef struct {
4144     /*
4145      * First and last element wholly contained within the two pages.
4146      * mem_off_first[0] and reg_off_first[0] are always set >= 0.
4147      * reg_off_last[0] may be < 0 if the first element crosses pages.
4148      * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
4149      * are set >= 0 only if there are complete elements on a second page.
4150      *
4151      * The reg_off_* offsets are relative to the internal vector register.
4152      * The mem_off_first offset is relative to the memory address; the
4153      * two offsets are different when a load operation extends, a store
4154      * operation truncates, or for multi-register operations.
4155      */
4156     int16_t mem_off_first[2];
4157     int16_t reg_off_first[2];
4158     int16_t reg_off_last[2];
4159
4160     /*
4161      * One element that is misaligned and spans both pages,
4162      * or -1 if there is no such active element.
4163      */
4164     int16_t mem_off_split;
4165     int16_t reg_off_split;
4166
4167     /*
4168      * The byte offset at which the entire operation crosses a page boundary.
4169      * Set >= 0 if and only if the entire operation spans two pages.
4170      */
4171     int16_t page_split;
4172
4173     /* TLB data for the two pages. */
4174     SVEHostPage page[2];
4175 } SVEContLdSt;
4176
4177 /*
4178  * Find first active element on each page, and a loose bound for the
4179  * final element on each page.  Identify any single element that spans
4180  * the page boundary.  Return true if there are any active elements.
4181  */
4182 static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
4183                                    uint64_t *vg, intptr_t reg_max,
4184                                    int esz, int msize)
4185 {
4186     const int esize = 1 << esz;
4187     const uint64_t pg_mask = pred_esz_masks[esz];
4188     intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
4189     intptr_t mem_off_last, mem_off_split;
4190     intptr_t page_split, elt_split;
4191     intptr_t i;
4192
4193     /* Set all of the element indices to -1, and the TLB data to 0. */
4194     memset(info, -1, offsetof(SVEContLdSt, page));
4195     memset(info->page, 0, sizeof(info->page));
4196
4197     /* Gross scan over the entire predicate to find bounds. */
4198     i = 0;
4199     do {
4200         uint64_t pg = vg[i] & pg_mask;
4201         if (pg) {
4202             reg_off_last = i * 64 + 63 - clz64(pg);
4203             if (reg_off_first < 0) {
4204                 reg_off_first = i * 64 + ctz64(pg);
4205             }
4206         }
4207     } while (++i * 64 < reg_max);
4208
4209     if (unlikely(reg_off_first < 0)) {
4210         /* No active elements, no pages touched. */
4211         return false;
4212     }
4213     tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
4214
4215     info->reg_off_first[0] = reg_off_first;
4216     info->mem_off_first[0] = (reg_off_first >> esz) * msize;
4217     mem_off_last = (reg_off_last >> esz) * msize;
4218
4219     page_split = -(addr | TARGET_PAGE_MASK);
4220     if (likely(mem_off_last + msize <= page_split)) {
4221         /* The entire operation fits within a single page. */
4222         info->reg_off_last[0] = reg_off_last;
4223         return true;
4224     }
4225
4226     info->page_split = page_split;
4227     elt_split = page_split / msize;
4228     reg_off_split = elt_split << esz;
4229     mem_off_split = elt_split * msize;
4230
4231     /*
4232      * This is the last full element on the first page, but it is not
4233      * necessarily active.  If there is no full element, i.e. the first
4234      * active element is the one that's split, this value remains -1.
4235      * It is useful as iteration bounds.
4236      */
4237     if (elt_split != 0) {
4238         info->reg_off_last[0] = reg_off_split - esize;
4239     }
4240
4241     /* Determine if an unaligned element spans the pages.  */
4242     if (page_split % msize != 0) {
4243         /* It is helpful to know if the split element is active. */
4244         if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
4245             info->reg_off_split = reg_off_split;
4246             info->mem_off_split = mem_off_split;
4247
4248             if (reg_off_split == reg_off_last) {
4249                 /* The page crossing element is last. */
4250                 return true;
4251             }
4252         }
4253         reg_off_split += esize;
4254         mem_off_split += msize;
4255     }
4256
4257     /*
4258      * We do want the first active element on the second page, because
4259      * this may affect the address reported in an exception.
4260      */
4261     reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
4262     tcg_debug_assert(reg_off_split <= reg_off_last);
4263     info->reg_off_first[1] = reg_off_split;
4264     info->mem_off_first[1] = (reg_off_split >> esz) * msize;
4265     info->reg_off_last[1] = reg_off_last;
4266     return true;
4267 }
4268
4269 /*
4270  * Resolve the guest virtual addresses to info->page[].
4271  * Control the generation of page faults with @fault.  Return false if
4272  * there is no work to do, which can only happen with @fault == FAULT_NO.
4273  */
4274 static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
4275                                 CPUARMState *env, target_ulong addr,
4276                                 MMUAccessType access_type, uintptr_t retaddr)
4277 {
4278     int mmu_idx = cpu_mmu_index(env, false);
4279     int mem_off = info->mem_off_first[0];
4280     bool nofault = fault == FAULT_NO;
4281     bool have_work = true;
4282
4283     if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
4284                         access_type, mmu_idx, retaddr)) {
4285         /* No work to be done. */
4286         return false;
4287     }
4288
4289     if (likely(info->page_split < 0)) {
4290         /* The entire operation was on the one page. */
4291         return true;
4292     }
4293
4294     /*
4295      * If the second page is invalid, then we want the fault address to be
4296      * the first byte on that page which is accessed.
4297      */
4298     if (info->mem_off_split >= 0) {
4299         /*
4300          * There is an element split across the pages.  The fault address
4301          * should be the first byte of the second page.
4302          */
4303         mem_off = info->page_split;
4304         /*
4305          * If the split element is also the first active element
4306          * of the vector, then:  For first-fault we should continue
4307          * to generate faults for the second page.  For no-fault,
4308          * we have work only if the second page is valid.
4309          */
4310         if (info->mem_off_first[0] < info->mem_off_split) {
4311             nofault = FAULT_FIRST;
4312             have_work = false;
4313         }
4314     } else {
4315         /*
4316          * There is no element split across the pages.  The fault address
4317          * should be the first active element on the second page.
4318          */
4319         mem_off = info->mem_off_first[1];
4320         /*
4321          * There must have been one active element on the first page,
4322          * so we're out of first-fault territory.
4323          */
4324         nofault = fault != FAULT_ALL;
4325     }
4326
4327     have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
4328                                 access_type, mmu_idx, retaddr);
4329     return have_work;
4330 }
4331
4332 static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
4333                                       uint64_t *vg, target_ulong addr,
4334                                       int esize, int msize, int wp_access,
4335                                       uintptr_t retaddr)
4336 {
4337 #ifndef CONFIG_USER_ONLY
4338     intptr_t mem_off, reg_off, reg_last;
4339     int flags0 = info->page[0].flags;
4340     int flags1 = info->page[1].flags;
4341
4342     if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
4343         return;
4344     }
4345
4346     /* Indicate that watchpoints are handled. */
4347     info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
4348     info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
4349
4350     if (flags0 & TLB_WATCHPOINT) {
4351         mem_off = info->mem_off_first[0];
4352         reg_off = info->reg_off_first[0];
4353         reg_last = info->reg_off_last[0];
4354
4355         while (reg_off <= reg_last) {
4356             uint64_t pg = vg[reg_off >> 6];
4357             do {
4358                 if ((pg >> (reg_off & 63)) & 1) {
4359                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
4360                                          msize, info->page[0].attrs,
4361                                          wp_access, retaddr);
4362                 }
4363                 reg_off += esize;
4364                 mem_off += msize;
4365             } while (reg_off <= reg_last && (reg_off & 63));
4366         }
4367     }
4368
4369     mem_off = info->mem_off_split;
4370     if (mem_off >= 0) {
4371         cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
4372                              info->page[0].attrs, wp_access, retaddr);
4373     }
4374
4375     mem_off = info->mem_off_first[1];
4376     if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
4377         reg_off = info->reg_off_first[1];
4378         reg_last = info->reg_off_last[1];
4379
4380         do {
4381             uint64_t pg = vg[reg_off >> 6];
4382             do {
4383                 if ((pg >> (reg_off & 63)) & 1) {
4384                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
4385                                          msize, info->page[1].attrs,
4386                                          wp_access, retaddr);
4387                 }
4388                 reg_off += esize;
4389                 mem_off += msize;
4390             } while (reg_off & 63);
4391         } while (reg_off <= reg_last);
4392     }
4393 #endif
4394 }
4395
4396 /*
4397  * Common helper for all contiguous 1,2,3,4-register predicated stores.
4398  */
4399 static inline QEMU_ALWAYS_INLINE
4400 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
4401                uint32_t desc, const uintptr_t retaddr,
4402                const int esz, const int msz, const int N,
4403                sve_ldst1_host_fn *host_fn,
4404                sve_ldst1_tlb_fn *tlb_fn)
4405 {
4406     const unsigned rd = simd_data(desc);
4407     const intptr_t reg_max = simd_oprsz(desc);
4408     intptr_t reg_off, reg_last, mem_off;
4409     SVEContLdSt info;
4410     void *host;
4411     int flags, i;
4412
4413     /* Find the active elements.  */
4414     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
4415         /* The entire predicate was false; no load occurs.  */
4416         for (i = 0; i < N; ++i) {
4417             memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
4418         }
4419         return;
4420     }
4421
4422     /* Probe the page(s).  Exit with exception for any invalid page. */
4423     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
4424
4425     /* Handle watchpoints for all active elements. */
4426     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
4427                               BP_MEM_READ, retaddr);
4428
4429     /* TODO: MTE check. */
4430
4431     flags = info.page[0].flags | info.page[1].flags;
4432     if (unlikely(flags != 0)) {
4433 #ifdef CONFIG_USER_ONLY
4434         g_assert_not_reached();
4435 #else
4436         /*
4437          * At least one page includes MMIO.
4438          * Any bus operation can fail with cpu_transaction_failed,
4439          * which for ARM will raise SyncExternal.  Perform the load
4440          * into scratch memory to preserve register state until the end.
4441          */
4442         ARMVectorReg scratch[4] = { };
4443
4444         mem_off = info.mem_off_first[0];
4445         reg_off = info.reg_off_first[0];
4446         reg_last = info.reg_off_last[1];
4447         if (reg_last < 0) {
4448             reg_last = info.reg_off_split;
4449             if (reg_last < 0) {
4450                 reg_last = info.reg_off_last[0];
4451             }
4452         }
4453
4454         do {
4455             uint64_t pg = vg[reg_off >> 6];
4456             do {
4457                 if ((pg >> (reg_off & 63)) & 1) {
4458                     for (i = 0; i < N; ++i) {
4459                         tlb_fn(env, &scratch[i], reg_off,
4460                                addr + mem_off + (i << msz), retaddr);
4461                     }
4462                 }
4463                 reg_off += 1 << esz;
4464                 mem_off += N << msz;
4465             } while (reg_off & 63);
4466         } while (reg_off <= reg_last);
4467
4468         for (i = 0; i < N; ++i) {
4469             memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
4470         }
4471         return;
4472 #endif
4473     }
4474
4475     /* The entire operation is in RAM, on valid pages. */
4476
4477     for (i = 0; i < N; ++i) {
4478         memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
4479     }
4480
4481     mem_off = info.mem_off_first[0];
4482     reg_off = info.reg_off_first[0];
4483     reg_last = info.reg_off_last[0];
4484     host = info.page[0].host;
4485
4486     while (reg_off <= reg_last) {
4487         uint64_t pg = vg[reg_off >> 6];
4488         do {
4489             if ((pg >> (reg_off & 63)) & 1) {
4490                 for (i = 0; i < N; ++i) {
4491                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
4492                             host + mem_off + (i << msz));
4493                 }
4494             }
4495             reg_off += 1 << esz;
4496             mem_off += N << msz;
4497         } while (reg_off <= reg_last && (reg_off & 63));
4498     }
4499
4500     /*
4501      * Use the slow path to manage the cross-page misalignment.
4502      * But we know this is RAM and cannot trap.
4503      */
4504     mem_off = info.mem_off_split;
4505     if (unlikely(mem_off >= 0)) {
4506         reg_off = info.reg_off_split;
4507         for (i = 0; i < N; ++i) {
4508             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
4509                    addr + mem_off + (i << msz), retaddr);
4510         }
4511     }
4512
4513     mem_off = info.mem_off_first[1];
4514     if (unlikely(mem_off >= 0)) {
4515         reg_off = info.reg_off_first[1];
4516         reg_last = info.reg_off_last[1];
4517         host = info.page[1].host;
4518
4519         do {
4520             uint64_t pg = vg[reg_off >> 6];
4521             do {
4522                 if ((pg >> (reg_off & 63)) & 1) {
4523                     for (i = 0; i < N; ++i) {
4524                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
4525                                 host + mem_off + (i << msz));
4526                     }
4527                 }
4528                 reg_off += 1 << esz;
4529                 mem_off += N << msz;
4530             } while (reg_off & 63);
4531         } while (reg_off <= reg_last);
4532     }
4533 }
4534
4535 #define DO_LD1_1(NAME, ESZ) \
4536 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,        \
4537                             target_ulong addr, uint32_t desc)  \
4538 {                                                              \
4539     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,      \
4540               sve_##NAME##_host, sve_##NAME##_tlb);            \
4541 }
4542
4543 #define DO_LD1_2(NAME, ESZ, MSZ) \
4544 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,        \
4545                                target_ulong addr, uint32_t desc)  \
4546 {                                                                 \
4547     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,          \
4548               sve_##NAME##_le_host, sve_##NAME##_le_tlb);         \
4549 }                                                                 \
4550 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,        \
4551                                target_ulong addr, uint32_t desc)  \
4552 {                                                                 \
4553     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,          \
4554               sve_##NAME##_be_host, sve_##NAME##_be_tlb);         \
4555 }
4556
4557 DO_LD1_1(ld1bb,  MO_8)
4558 DO_LD1_1(ld1bhu, MO_16)
4559 DO_LD1_1(ld1bhs, MO_16)
4560 DO_LD1_1(ld1bsu, MO_32)
4561 DO_LD1_1(ld1bss, MO_32)
4562 DO_LD1_1(ld1bdu, MO_64)
4563 DO_LD1_1(ld1bds, MO_64)
4564
4565 DO_LD1_2(ld1hh,  MO_16, MO_16)
4566 DO_LD1_2(ld1hsu, MO_32, MO_16)
4567 DO_LD1_2(ld1hss, MO_32, MO_16)
4568 DO_LD1_2(ld1hdu, MO_64, MO_16)
4569 DO_LD1_2(ld1hds, MO_64, MO_16)
4570
4571 DO_LD1_2(ld1ss,  MO_32, MO_32)
4572 DO_LD1_2(ld1sdu, MO_64, MO_32)
4573 DO_LD1_2(ld1sds, MO_64, MO_32)
4574
4575 DO_LD1_2(ld1dd,  MO_64, MO_64)
4576
4577 #undef DO_LD1_1
4578 #undef DO_LD1_2
4579
4580 #define DO_LDN_1(N) \
4581 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,        \
4582                              target_ulong addr, uint32_t desc)  \
4583 {                                                               \
4584     sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,      \
4585               sve_ld1bb_host, sve_ld1bb_tlb);                   \
4586 }
4587
4588 #define DO_LDN_2(N, SUFF, ESZ) \
4589 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,       \
4590                                     target_ulong addr, uint32_t desc) \
4591 {                                                                     \
4592     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,              \
4593               sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);       \
4594 }                                                                     \
4595 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,       \
4596                                     target_ulong addr, uint32_t desc) \
4597 {                                                                     \
4598     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,              \
4599               sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);       \
4600 }
4601
4602 DO_LDN_1(2)
4603 DO_LDN_1(3)
4604 DO_LDN_1(4)
4605
4606 DO_LDN_2(2, hh, MO_16)
4607 DO_LDN_2(3, hh, MO_16)
4608 DO_LDN_2(4, hh, MO_16)
4609
4610 DO_LDN_2(2, ss, MO_32)
4611 DO_LDN_2(3, ss, MO_32)
4612 DO_LDN_2(4, ss, MO_32)
4613
4614 DO_LDN_2(2, dd, MO_64)
4615 DO_LDN_2(3, dd, MO_64)
4616 DO_LDN_2(4, dd, MO_64)
4617
4618 #undef DO_LDN_1
4619 #undef DO_LDN_2
4620
4621 /*
4622  * Load contiguous data, first-fault and no-fault.
4623  *
4624  * For user-only, one could argue that we should hold the mmap_lock during
4625  * the operation so that there is no race between page_check_range and the
4626  * load operation.  However, unmapping pages out from under a running thread
4627  * is extraordinarily unlikely.  This theoretical race condition also affects
4628  * linux-user/ in its get_user/put_user macros.
4629  *
4630  * TODO: Construct some helpers, written in assembly, that interact with
4631  * handle_cpu_signal to produce memory ops which can properly report errors
4632  * without racing.
4633  */
4634
4635 /* Fault on byte I.  All bits in FFR from I are cleared.  The vector
4636  * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
4637  * option, which leaves subsequent data unchanged.
4638  */
4639 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
4640 {
4641     uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
4642
4643     if (i & 63) {
4644         ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
4645         i = ROUND_UP(i, 64);
4646     }
4647     for (; i < oprsz; i += 64) {
4648         ffr[i / 64] = 0;
4649     }
4650 }
4651
4652 /*
4653  * Common helper for all contiguous no-fault and first-fault loads.
4654  */
4655 static inline QEMU_ALWAYS_INLINE
4656 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
4657                    uint32_t desc, const uintptr_t retaddr,
4658                    const int esz, const int msz, const SVEContFault fault,
4659                    sve_ldst1_host_fn *host_fn,
4660                    sve_ldst1_tlb_fn *tlb_fn)
4661 {
4662     const unsigned rd = simd_data(desc);
4663     void *vd = &env->vfp.zregs[rd];
4664     const intptr_t reg_max = simd_oprsz(desc);
4665     intptr_t reg_off, mem_off, reg_last;
4666     SVEContLdSt info;
4667     int flags;
4668     void *host;
4669
4670     /* Find the active elements.  */
4671     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
4672         /* The entire predicate was false; no load occurs.  */
4673         memset(vd, 0, reg_max);
4674         return;
4675     }
4676     reg_off = info.reg_off_first[0];
4677
4678     /* Probe the page(s). */
4679     if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
4680         /* Fault on first element. */
4681         tcg_debug_assert(fault == FAULT_NO);
4682         memset(vd, 0, reg_max);
4683         goto do_fault;
4684     }
4685
4686     mem_off = info.mem_off_first[0];
4687     flags = info.page[0].flags;
4688
4689     if (fault == FAULT_FIRST) {
4690         /*
4691          * Special handling of the first active element,
4692          * if it crosses a page boundary or is MMIO.
4693          */
4694         bool is_split = mem_off == info.mem_off_split;
4695         /* TODO: MTE check. */
4696         if (unlikely(flags != 0) || unlikely(is_split)) {
4697             /*
4698              * Use the slow path for cross-page handling.
4699              * Might trap for MMIO or watchpoints.
4700              */
4701             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
4702
4703             /* After any fault, zero the other elements. */
4704             swap_memzero(vd, reg_off);
4705             reg_off += 1 << esz;
4706             mem_off += 1 << msz;
4707             swap_memzero(vd + reg_off, reg_max - reg_off);
4708
4709             if (is_split) {
4710                 goto second_page;
4711             }
4712         } else {
4713             memset(vd, 0, reg_max);
4714         }
4715     } else {
4716         memset(vd, 0, reg_max);
4717         if (unlikely(mem_off == info.mem_off_split)) {
4718             /* The first active element crosses a page boundary. */
4719             flags |= info.page[1].flags;
4720             if (unlikely(flags & TLB_MMIO)) {
4721                 /* Some page is MMIO, see below. */
4722                 goto do_fault;
4723             }
4724             if (unlikely(flags & TLB_WATCHPOINT) &&
4725                 (cpu_watchpoint_address_matches
4726                  (env_cpu(env), addr + mem_off, 1 << msz)
4727                  & BP_MEM_READ)) {
4728                 /* Watchpoint hit, see below. */
4729                 goto do_fault;
4730             }
4731             /* TODO: MTE check. */
4732             /*
4733              * Use the slow path for cross-page handling.
4734              * This is RAM, without a watchpoint, and will not trap.
4735              */
4736             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
4737             goto second_page;
4738         }
4739     }
4740
4741     /*
4742      * From this point on, all memory operations are MemSingleNF.
4743      *
4744      * Per the MemSingleNF pseudocode, a no-fault load from Device memory
4745      * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
4746      *
4747      * Unfortuately we do not have access to the memory attributes from the
4748      * PTE to tell Device memory from Normal memory.  So we make a mostly
4749      * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
4750      * This gives the right answer for the common cases of "Normal memory,
4751      * backed by host RAM" and "Device memory, backed by MMIO".
4752      * The architecture allows us to suppress an NF load and return
4753      * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
4754      * case of "Normal memory, backed by MMIO" is permitted.  The case we
4755      * get wrong is "Device memory, backed by host RAM", for which we
4756      * should return (UNKNOWN, FAULT) for but do not.
4757      *
4758      * Similarly, CPU_BP breakpoints would raise exceptions, and so
4759      * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
4760      * architectural breakpoints the same.
4761      */
4762     if (unlikely(flags & TLB_MMIO)) {
4763         goto do_fault;
4764     }
4765
4766     reg_last = info.reg_off_last[0];
4767     host = info.page[0].host;
4768
4769     do {
4770         uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
4771         do {
4772             if ((pg >> (reg_off & 63)) & 1) {
4773                 if (unlikely(flags & TLB_WATCHPOINT) &&
4774                     (cpu_watchpoint_address_matches
4775                      (env_cpu(env), addr + mem_off, 1 << msz)
4776                      & BP_MEM_READ)) {
4777                     goto do_fault;
4778                 }
4779                 /* TODO: MTE check. */
4780                 host_fn(vd, reg_off, host + mem_off);
4781             }
4782             reg_off += 1 << esz;
4783             mem_off += 1 << msz;
4784         } while (reg_off <= reg_last && (reg_off & 63));
4785     } while (reg_off <= reg_last);
4786
4787     /*
4788      * MemSingleNF is allowed to fail for any reason.  We have special
4789      * code above to handle the first element crossing a page boundary.
4790      * As an implementation choice, decline to handle a cross-page element
4791      * in any other position.
4792      */
4793     reg_off = info.reg_off_split;
4794     if (reg_off >= 0) {
4795         goto do_fault;
4796     }
4797
4798  second_page:
4799     reg_off = info.reg_off_first[1];
4800     if (likely(reg_off < 0)) {
4801         /* No active elements on the second page.  All done. */
4802         return;
4803     }
4804
4805     /*
4806      * MemSingleNF is allowed to fail for any reason.  As an implementation
4807      * choice, decline to handle elements on the second page.  This should
4808      * be low frequency as the guest walks through memory -- the next
4809      * iteration of the guest's loop should be aligned on the page boundary,
4810      * and then all following iterations will stay aligned.
4811      */
4812
4813  do_fault:
4814     record_fault(env, reg_off, reg_max);
4815 }
4816
4817 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
4818 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
4819                                  target_ulong addr, uint32_t desc)      \
4820 {                                                                       \
4821     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
4822                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
4823 }                                                                       \
4824 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
4825                                  target_ulong addr, uint32_t desc)      \
4826 {                                                                       \
4827     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO,    \
4828                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
4829 }
4830
4831 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
4832 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,         \
4833                                     target_ulong addr, uint32_t desc)   \
4834 {                                                                       \
4835     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST,  \
4836                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
4837 }                                                                       \
4838 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,         \
4839                                     target_ulong addr, uint32_t desc)   \
4840 {                                                                       \
4841     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO,     \
4842                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
4843 }                                                                       \
4844 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,         \
4845                                     target_ulong addr, uint32_t desc)   \
4846 {                                                                       \
4847     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST,  \
4848                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
4849 }                                                                       \
4850 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,         \
4851                                     target_ulong addr, uint32_t desc)   \
4852 {                                                                       \
4853     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO,     \
4854                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
4855 }
4856
4857 DO_LDFF1_LDNF1_1(bb,  MO_8)
4858 DO_LDFF1_LDNF1_1(bhu, MO_16)
4859 DO_LDFF1_LDNF1_1(bhs, MO_16)
4860 DO_LDFF1_LDNF1_1(bsu, MO_32)
4861 DO_LDFF1_LDNF1_1(bss, MO_32)
4862 DO_LDFF1_LDNF1_1(bdu, MO_64)
4863 DO_LDFF1_LDNF1_1(bds, MO_64)
4864
4865 DO_LDFF1_LDNF1_2(hh,  MO_16, MO_16)
4866 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
4867 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
4868 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
4869 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
4870
4871 DO_LDFF1_LDNF1_2(ss,  MO_32, MO_32)
4872 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
4873 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
4874
4875 DO_LDFF1_LDNF1_2(dd,  MO_64, MO_64)
4876
4877 #undef DO_LDFF1_LDNF1_1
4878 #undef DO_LDFF1_LDNF1_2
4879
4880 /*
4881  * Common helper for all contiguous 1,2,3,4-register predicated stores.
4882  */
4883
4884 static inline QEMU_ALWAYS_INLINE
4885 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, uint32_t desc,
4886                const uintptr_t retaddr, const int esz,
4887                const int msz, const int N,
4888                sve_ldst1_host_fn *host_fn,
4889                sve_ldst1_tlb_fn *tlb_fn)
4890 {
4891     const unsigned rd = simd_data(desc);
4892     const intptr_t reg_max = simd_oprsz(desc);
4893     intptr_t reg_off, reg_last, mem_off;
4894     SVEContLdSt info;
4895     void *host;
4896     int i, flags;
4897
4898     /* Find the active elements.  */
4899     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
4900         /* The entire predicate was false; no store occurs.  */
4901         return;
4902     }
4903
4904     /* Probe the page(s).  Exit with exception for any invalid page. */
4905     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
4906
4907     /* Handle watchpoints for all active elements. */
4908     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
4909                               BP_MEM_WRITE, retaddr);
4910
4911     /* TODO: MTE check. */
4912
4913     flags = info.page[0].flags | info.page[1].flags;
4914     if (unlikely(flags != 0)) {
4915 #ifdef CONFIG_USER_ONLY
4916         g_assert_not_reached();
4917 #else
4918         /*
4919          * At least one page includes MMIO.
4920          * Any bus operation can fail with cpu_transaction_failed,
4921          * which for ARM will raise SyncExternal.  We cannot avoid
4922          * this fault and will leave with the store incomplete.
4923          */
4924         mem_off = info.mem_off_first[0];
4925         reg_off = info.reg_off_first[0];
4926         reg_last = info.reg_off_last[1];
4927         if (reg_last < 0) {
4928             reg_last = info.reg_off_split;
4929             if (reg_last < 0) {
4930                 reg_last = info.reg_off_last[0];
4931             }
4932         }
4933
4934         do {
4935             uint64_t pg = vg[reg_off >> 6];
4936             do {
4937                 if ((pg >> (reg_off & 63)) & 1) {
4938                     for (i = 0; i < N; ++i) {
4939                         tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
4940                                addr + mem_off + (i << msz), retaddr);
4941                     }
4942                 }
4943                 reg_off += 1 << esz;
4944                 mem_off += N << msz;
4945             } while (reg_off & 63);
4946         } while (reg_off <= reg_last);
4947         return;
4948 #endif
4949     }
4950
4951     mem_off = info.mem_off_first[0];
4952     reg_off = info.reg_off_first[0];
4953     reg_last = info.reg_off_last[0];
4954     host = info.page[0].host;
4955
4956     while (reg_off <= reg_last) {
4957         uint64_t pg = vg[reg_off >> 6];
4958         do {
4959             if ((pg >> (reg_off & 63)) & 1) {
4960                 for (i = 0; i < N; ++i) {
4961                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
4962                             host + mem_off + (i << msz));
4963                 }
4964             }
4965             reg_off += 1 << esz;
4966             mem_off += N << msz;
4967         } while (reg_off <= reg_last && (reg_off & 63));
4968     }
4969
4970     /*
4971      * Use the slow path to manage the cross-page misalignment.
4972      * But we know this is RAM and cannot trap.
4973      */
4974     mem_off = info.mem_off_split;
4975     if (unlikely(mem_off >= 0)) {
4976         reg_off = info.reg_off_split;
4977         for (i = 0; i < N; ++i) {
4978             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
4979                    addr + mem_off + (i << msz), retaddr);
4980         }
4981     }
4982
4983     mem_off = info.mem_off_first[1];
4984     if (unlikely(mem_off >= 0)) {
4985         reg_off = info.reg_off_first[1];
4986         reg_last = info.reg_off_last[1];
4987         host = info.page[1].host;
4988
4989         do {
4990             uint64_t pg = vg[reg_off >> 6];
4991             do {
4992                 if ((pg >> (reg_off & 63)) & 1) {
4993                     for (i = 0; i < N; ++i) {
4994                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
4995                                 host + mem_off + (i << msz));
4996                     }
4997                 }
4998                 reg_off += 1 << esz;
4999                 mem_off += N << msz;
5000             } while (reg_off & 63);
5001         } while (reg_off <= reg_last);
5002     }
5003 }
5004
5005 #define DO_STN_1(N, NAME, ESZ) \
5006 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,        \
5007                                  target_ulong addr, uint32_t desc)  \
5008 {                                                                   \
5009     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N,           \
5010               sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
5011 }
5012
5013 #define DO_STN_2(N, NAME, ESZ, MSZ) \
5014 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,       \
5015                                     target_ulong addr, uint32_t desc) \
5016 {                                                                     \
5017     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,              \
5018               sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);       \
5019 }                                                                     \
5020 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,       \
5021                                     target_ulong addr, uint32_t desc) \
5022 {                                                                     \
5023     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,              \
5024               sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);       \
5025 }
5026
5027 DO_STN_1(1, bb, MO_8)
5028 DO_STN_1(1, bh, MO_16)
5029 DO_STN_1(1, bs, MO_32)
5030 DO_STN_1(1, bd, MO_64)
5031 DO_STN_1(2, bb, MO_8)
5032 DO_STN_1(3, bb, MO_8)
5033 DO_STN_1(4, bb, MO_8)
5034
5035 DO_STN_2(1, hh, MO_16, MO_16)
5036 DO_STN_2(1, hs, MO_32, MO_16)
5037 DO_STN_2(1, hd, MO_64, MO_16)
5038 DO_STN_2(2, hh, MO_16, MO_16)
5039 DO_STN_2(3, hh, MO_16, MO_16)
5040 DO_STN_2(4, hh, MO_16, MO_16)
5041
5042 DO_STN_2(1, ss, MO_32, MO_32)
5043 DO_STN_2(1, sd, MO_64, MO_32)
5044 DO_STN_2(2, ss, MO_32, MO_32)
5045 DO_STN_2(3, ss, MO_32, MO_32)
5046 DO_STN_2(4, ss, MO_32, MO_32)
5047
5048 DO_STN_2(1, dd, MO_64, MO_64)
5049 DO_STN_2(2, dd, MO_64, MO_64)
5050 DO_STN_2(3, dd, MO_64, MO_64)
5051 DO_STN_2(4, dd, MO_64, MO_64)
5052
5053 #undef DO_STN_1
5054 #undef DO_STN_2
5055
5056 /*
5057  * Loads with a vector index.
5058  */
5059
5060 /*
5061  * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
5062  */
5063 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
5064
5065 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
5066 {
5067     return *(uint32_t *)(reg + H1_4(reg_ofs));
5068 }
5069
5070 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
5071 {
5072     return *(int32_t *)(reg + H1_4(reg_ofs));
5073 }
5074
5075 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
5076 {
5077     return (uint32_t)*(uint64_t *)(reg + reg_ofs);
5078 }
5079
5080 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
5081 {
5082     return (int32_t)*(uint64_t *)(reg + reg_ofs);
5083 }
5084
5085 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
5086 {
5087     return *(uint64_t *)(reg + reg_ofs);
5088 }
5089
5090 static inline QEMU_ALWAYS_INLINE
5091 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5092                target_ulong base, uint32_t desc, uintptr_t retaddr,
5093                int esize, int msize, zreg_off_fn *off_fn,
5094                sve_ldst1_host_fn *host_fn,
5095                sve_ldst1_tlb_fn *tlb_fn)
5096 {
5097     const int mmu_idx = cpu_mmu_index(env, false);
5098     const intptr_t reg_max = simd_oprsz(desc);
5099     const int scale = simd_data(desc);
5100     ARMVectorReg scratch;
5101     intptr_t reg_off;
5102     SVEHostPage info, info2;
5103
5104     memset(&scratch, 0, reg_max);
5105     reg_off = 0;
5106     do {
5107         uint64_t pg = vg[reg_off >> 6];
5108         do {
5109             if (likely(pg & 1)) {
5110                 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
5111                 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
5112
5113                 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
5114                                mmu_idx, retaddr);
5115
5116                 if (likely(in_page >= msize)) {
5117                     if (unlikely(info.flags & TLB_WATCHPOINT)) {
5118                         cpu_check_watchpoint(env_cpu(env), addr, msize,
5119                                              info.attrs, BP_MEM_READ, retaddr);
5120                     }
5121                     /* TODO: MTE check */
5122                     host_fn(&scratch, reg_off, info.host);
5123                 } else {
5124                     /* Element crosses the page boundary. */
5125                     sve_probe_page(&info2, false, env, addr + in_page, 0,
5126                                    MMU_DATA_LOAD, mmu_idx, retaddr);
5127                     if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
5128                         cpu_check_watchpoint(env_cpu(env), addr,
5129                                              msize, info.attrs,
5130                                              BP_MEM_READ, retaddr);
5131                     }
5132                     /* TODO: MTE check */
5133                     tlb_fn(env, &scratch, reg_off, addr, retaddr);
5134                 }
5135             }
5136             reg_off += esize;
5137             pg >>= esize;
5138         } while (reg_off & 63);
5139     } while (reg_off < reg_max);
5140
5141     /* Wait until all exceptions have been raised to write back.  */
5142     memcpy(vd, &scratch, reg_max);
5143 }
5144
5145 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
5146 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
5147                                  void *vm, target_ulong base, uint32_t desc) \
5148 {                                                                            \
5149     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,             \
5150               off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
5151 }
5152
5153 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
5154 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
5155                                  void *vm, target_ulong base, uint32_t desc) \
5156 {                                                                            \
5157     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,             \
5158               off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
5159 }
5160
5161 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
5162 DO_LD1_ZPZ_S(bsu, zss, MO_8)
5163 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
5164 DO_LD1_ZPZ_D(bdu, zss, MO_8)
5165 DO_LD1_ZPZ_D(bdu, zd, MO_8)
5166
5167 DO_LD1_ZPZ_S(bss, zsu, MO_8)
5168 DO_LD1_ZPZ_S(bss, zss, MO_8)
5169 DO_LD1_ZPZ_D(bds, zsu, MO_8)
5170 DO_LD1_ZPZ_D(bds, zss, MO_8)
5171 DO_LD1_ZPZ_D(bds, zd, MO_8)
5172
5173 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
5174 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
5175 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
5176 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
5177 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
5178
5179 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
5180 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
5181 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
5182 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
5183 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
5184
5185 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
5186 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
5187 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
5188 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
5189 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
5190
5191 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
5192 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
5193 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
5194 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
5195 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
5196
5197 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
5198 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
5199 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
5200 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
5201 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
5202
5203 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
5204 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
5205 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
5206 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
5207 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
5208
5209 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
5210 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
5211 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
5212
5213 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
5214 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
5215 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
5216
5217 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
5218 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
5219 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
5220
5221 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
5222 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
5223 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
5224
5225 #undef DO_LD1_ZPZ_S
5226 #undef DO_LD1_ZPZ_D
5227
5228 /* First fault loads with a vector index.  */
5229
5230 /*
5231  * Common helpers for all gather first-faulting loads.
5232  */
5233
5234 static inline QEMU_ALWAYS_INLINE
5235 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5236                  target_ulong base, uint32_t desc, uintptr_t retaddr,
5237                  const int esz, const int msz, zreg_off_fn *off_fn,
5238                  sve_ldst1_host_fn *host_fn,
5239                  sve_ldst1_tlb_fn *tlb_fn)
5240 {
5241     const int mmu_idx = cpu_mmu_index(env, false);
5242     const intptr_t reg_max = simd_oprsz(desc);
5243     const int scale = simd_data(desc);
5244     const int esize = 1 << esz;
5245     const int msize = 1 << msz;
5246     intptr_t reg_off;
5247     SVEHostPage info;
5248     target_ulong addr, in_page;
5249
5250     /* Skip to the first true predicate.  */
5251     reg_off = find_next_active(vg, 0, reg_max, esz);
5252     if (unlikely(reg_off >= reg_max)) {
5253         /* The entire predicate was false; no load occurs.  */
5254         memset(vd, 0, reg_max);
5255         return;
5256     }
5257
5258     /*
5259      * Probe the first element, allowing faults.
5260      */
5261     addr = base + (off_fn(vm, reg_off) << scale);
5262     tlb_fn(env, vd, reg_off, addr, retaddr);
5263
5264     /* After any fault, zero the other elements. */
5265     swap_memzero(vd, reg_off);
5266     reg_off += esize;
5267     swap_memzero(vd + reg_off, reg_max - reg_off);
5268
5269     /*
5270      * Probe the remaining elements, not allowing faults.
5271      */
5272     while (reg_off < reg_max) {
5273         uint64_t pg = vg[reg_off >> 6];
5274         do {
5275             if (likely((pg >> (reg_off & 63)) & 1)) {
5276                 addr = base + (off_fn(vm, reg_off) << scale);
5277                 in_page = -(addr | TARGET_PAGE_MASK);
5278
5279                 if (unlikely(in_page < msize)) {
5280                     /* Stop if the element crosses a page boundary. */
5281                     goto fault;
5282                 }
5283
5284                 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
5285                                mmu_idx, retaddr);
5286                 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
5287                     goto fault;
5288                 }
5289                 if (unlikely(info.flags & TLB_WATCHPOINT) &&
5290                     (cpu_watchpoint_address_matches
5291                      (env_cpu(env), addr, msize) & BP_MEM_READ)) {
5292                     goto fault;
5293                 }
5294                 /* TODO: MTE check. */
5295
5296                 host_fn(vd, reg_off, info.host);
5297             }
5298             reg_off += esize;
5299         } while (reg_off & 63);
5300     }
5301     return;
5302
5303  fault:
5304     record_fault(env, reg_off, reg_max);
5305 }
5306
5307 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
5308 void HELPER(sve_ldff##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
5309                                    void *vm, target_ulong base, uint32_t desc) \
5310 {                                                                              \
5311     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,              \
5312                 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
5313 }
5314
5315 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
5316 void HELPER(sve_ldff##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
5317                                    void *vm, target_ulong base, uint32_t desc) \
5318 {                                                                              \
5319     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,              \
5320                 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
5321 }
5322
5323 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
5324 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
5325 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
5326 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
5327 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
5328
5329 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
5330 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
5331 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
5332 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
5333 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
5334
5335 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
5336 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
5337 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
5338 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
5339 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
5340
5341 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
5342 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
5343 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
5344 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
5345 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
5346
5347 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
5348 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
5349 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
5350 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
5351 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
5352
5353 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
5354 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
5355 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
5356 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
5357 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
5358
5359 DO_LDFF1_ZPZ_S(ss_le,  zsu, MO_32)
5360 DO_LDFF1_ZPZ_S(ss_le,  zss, MO_32)
5361 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
5362 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
5363 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
5364
5365 DO_LDFF1_ZPZ_S(ss_be,  zsu, MO_32)
5366 DO_LDFF1_ZPZ_S(ss_be,  zss, MO_32)
5367 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
5368 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
5369 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
5370
5371 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
5372 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
5373 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
5374
5375 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
5376 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
5377 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
5378
5379 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
5380 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
5381 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
5382
5383 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
5384 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
5385 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
5386
5387 /* Stores with a vector index.  */
5388
5389 static inline QEMU_ALWAYS_INLINE
5390 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5391                target_ulong base, uint32_t desc, uintptr_t retaddr,
5392                int esize, int msize, zreg_off_fn *off_fn,
5393                sve_ldst1_host_fn *host_fn,
5394                sve_ldst1_tlb_fn *tlb_fn)
5395 {
5396     const int mmu_idx = cpu_mmu_index(env, false);
5397     const intptr_t reg_max = simd_oprsz(desc);
5398     const int scale = simd_data(desc);
5399     void *host[ARM_MAX_VQ * 4];
5400     intptr_t reg_off, i;
5401     SVEHostPage info, info2;
5402
5403     /*
5404      * Probe all of the elements for host addresses and flags.
5405      */
5406     i = reg_off = 0;
5407     do {
5408         uint64_t pg = vg[reg_off >> 6];
5409         do {
5410             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
5411             target_ulong in_page = -(addr | TARGET_PAGE_MASK);
5412
5413             host[i] = NULL;
5414             if (likely((pg >> (reg_off & 63)) & 1)) {
5415                 if (likely(in_page >= msize)) {
5416                     sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
5417                                    mmu_idx, retaddr);
5418                     host[i] = info.host;
5419                 } else {
5420                     /*
5421                      * Element crosses the page boundary.
5422                      * Probe both pages, but do not record the host address,
5423                      * so that we use the slow path.
5424                      */
5425                     sve_probe_page(&info, false, env, addr, 0,
5426                                    MMU_DATA_STORE, mmu_idx, retaddr);
5427                     sve_probe_page(&info2, false, env, addr + in_page, 0,
5428                                    MMU_DATA_STORE, mmu_idx, retaddr);
5429                     info.flags |= info2.flags;
5430                 }
5431
5432                 if (unlikely(info.flags & TLB_WATCHPOINT)) {
5433                     cpu_check_watchpoint(env_cpu(env), addr, msize,
5434                                          info.attrs, BP_MEM_WRITE, retaddr);
5435                 }
5436                 /* TODO: MTE check. */
5437             }
5438             i += 1;
5439             reg_off += esize;
5440         } while (reg_off & 63);
5441     } while (reg_off < reg_max);
5442
5443     /*
5444      * Now that we have recognized all exceptions except SyncExternal
5445      * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
5446      *
5447      * Note for the common case of an element in RAM, not crossing a page
5448      * boundary, we have stored the host address in host[].  This doubles
5449      * as a first-level check against the predicate, since only enabled
5450      * elements have non-null host addresses.
5451      */
5452     i = reg_off = 0;
5453     do {
5454         void *h = host[i];
5455         if (likely(h != NULL)) {
5456             host_fn(vd, reg_off, h);
5457         } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
5458             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
5459             tlb_fn(env, vd, reg_off, addr, retaddr);
5460         }
5461         i += 1;
5462         reg_off += esize;
5463     } while (reg_off < reg_max);
5464 }
5465
5466 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
5467 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
5468                                  void *vm, target_ulong base, uint32_t desc) \
5469 {                                                                            \
5470     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,             \
5471               off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);       \
5472 }
5473
5474 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
5475 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
5476                                  void *vm, target_ulong base, uint32_t desc) \
5477 {                                                                            \
5478     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,             \
5479               off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);       \
5480 }
5481
5482 DO_ST1_ZPZ_S(bs, zsu, MO_8)
5483 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
5484 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
5485 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
5486 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
5487
5488 DO_ST1_ZPZ_S(bs, zss, MO_8)
5489 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
5490 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
5491 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
5492 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
5493
5494 DO_ST1_ZPZ_D(bd, zsu, MO_8)
5495 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
5496 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
5497 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
5498 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
5499 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
5500 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
5501
5502 DO_ST1_ZPZ_D(bd, zss, MO_8)
5503 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
5504 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
5505 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
5506 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
5507 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
5508 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
5509
5510 DO_ST1_ZPZ_D(bd, zd, MO_8)
5511 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
5512 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
5513 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
5514 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
5515 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
5516 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
5517
5518 #undef DO_ST1_ZPZ_S
5519 #undef DO_ST1_ZPZ_D