target/arm: Implement SVE Integer Compare - Vectors Group
[qemu.git] / target/arm/sve_helper.c
blob d11f591661e0feda7c225b088c4546ea0952526d
1 /*
2 * ARM SVE Operations
4 * Copyright (c) 2018 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/exec-all.h"
23 #include "exec/cpu_ldst.h"
24 #include "exec/helper-proto.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "fpu/softfloat.h"
29 /* Note that vector data is stored in host-endian 64-bit chunks,
30 so addressing units smaller than that needs a host-endian fixup. */
31 #ifdef HOST_WORDS_BIGENDIAN
32 #define H1(x) ((x) ^ 7)
33 #define H1_2(x) ((x) ^ 6)
34 #define H1_4(x) ((x) ^ 4)
35 #define H2(x) ((x) ^ 3)
36 #define H4(x) ((x) ^ 1)
37 #else
38 #define H1(x) (x)
39 #define H1_2(x) (x)
40 #define H1_4(x) (x)
41 #define H2(x) (x)
42 #define H4(x) (x)
43 #endif
45 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
47 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
48 * and bit 0 set if C is set. Compare the definitions of these variables
49 * within CPUARMState.
52 /* For no G bits set, NZCV = C. */
53 #define PREDTEST_INIT 1
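/* A worked example of the encoding above: for a single predicate word with
 * g = 0x0f (four active elements) and d = 0x05, the first active bit of D
 * is set so N = 1 (bit 31), some active bit of D is set so Z is clear
 * (bit 1 set), and the last active bit of D is clear so C = 1 (bit 0 set).
 * Bit 2 of the flags word is internal bookkeeping only, recording that an
 * active G bit has already been seen.
 */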
55 /* This is an iterative function, called for each Pd and Pg word
56 * moving forward.
58 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
60 if (likely(g)) {
61 /* Compute N from first D & G.
62 Use bit 2 to signal first G bit seen. */
63 if (!(flags & 4)) {
64 flags |= ((d & (g & -g)) != 0) << 31;
65 flags |= 4;
68 /* Accumulate Z from each D & G. */
69 flags |= ((d & g) != 0) << 1;
71 /* Compute C from last !(D & G). Replace previous. */
72 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
74 return flags;
77 /* This is an iterative function, called for each Pd and Pg word
78 * moving backward.
80 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
82 if (likely(g)) {
83 /* Compute C from first (i.e last) !(D & G).
84 Use bit 2 to signal first G bit seen. */
85 if (!(flags & 4)) {
86 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
87 flags |= (d & pow2floor(g)) == 0;
90 /* Accumulate Z from each D & G. */
91 flags |= ((d & g) != 0) << 1;
93 /* Compute N from last (i.e first) D & G. Replace previous. */
94 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
96 return flags;
99 /* The same for a single word predicate. */
100 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
102 return iter_predtest_fwd(d, g, PREDTEST_INIT);
105 /* The same for a multi-word predicate. */
106 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
108 uint32_t flags = PREDTEST_INIT;
109 uint64_t *d = vd, *g = vg;
110 uintptr_t i = 0;
112 do {
113 flags = iter_predtest_fwd(d[i], g[i], flags);
114 } while (++i < words);
116 return flags;
119 /* Expand active predicate bits to bytes, for byte elements.
120 * for (i = 0; i < 256; ++i) {
121 * unsigned long m = 0;
122 * for (j = 0; j < 8; j++) {
123 * if ((i >> j) & 1) {
124 * m |= 0xfful << (j << 3);
127 * printf("0x%016lx,\n", m);
130 static inline uint64_t expand_pred_b(uint8_t byte)
132 static const uint64_t word[256] = {
133 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
134 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
135 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
136 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
137 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
138 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
139 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
140 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
141 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
142 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
143 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
144 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
145 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
146 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
147 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
148 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
149 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
150 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
151 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
152 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
153 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
154 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
155 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
156 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
157 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
158 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
159 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
160 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
161 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
162 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
163 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
164 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
165 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
166 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
167 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
168 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
169 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
170 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
171 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
172 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
173 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
174 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
175 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
176 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
177 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
178 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
179 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
180 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
181 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
182 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
183 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
184 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
185 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
186 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
187 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
188 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
189 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
190 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
191 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
192 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
193 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
194 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
195 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
196 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
197 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
198 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
199 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
200 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
201 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
202 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
203 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
204 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
205 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
206 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
207 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
208 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
209 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
210 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
211 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
212 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
213 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
214 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
215 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
216 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
217 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
218 0xffffffffffffffff,
220 return word[byte];
223 /* Similarly for half-word elements.
224 * for (i = 0; i < 256; ++i) {
225 * unsigned long m = 0;
226 * if (i & 0xaa) {
227 * continue;
229 * for (j = 0; j < 8; j += 2) {
230 * if ((i >> j) & 1) {
231 * m |= 0xfffful << (j << 3);
234 * printf("[0x%x] = 0x%016lx,\n", i, m);
237 static inline uint64_t expand_pred_h(uint8_t byte)
239 static const uint64_t word[] = {
240 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
241 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
242 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
243 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
244 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
245 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
246 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
247 [0x55] = 0xffffffffffffffff,
249 return word[byte & 0x55];
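/* Example: for halfword elements only every other predicate bit is
 * significant, hence the "& 0x55" above.  A predicate byte of 0x17
 * reduces to 0x15, i.e. elements 0, 1 and 2 active, giving the mask
 * 0x0000ffffffffffff from the table.
 */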
252 /* Similarly for single word elements. */
253 static inline uint64_t expand_pred_s(uint8_t byte)
255 static const uint64_t word[] = {
256 [0x01] = 0x00000000ffffffffull,
257 [0x10] = 0xffffffff00000000ull,
258 [0x11] = 0xffffffffffffffffull,
260 return word[byte & 0x11];
263 /* Swap 16-bit words within a 32-bit word. */
264 static inline uint32_t hswap32(uint32_t h)
266 return rol32(h, 16);
269 /* Swap 16-bit words within a 64-bit word. */
270 static inline uint64_t hswap64(uint64_t h)
272 uint64_t m = 0x0000ffff0000ffffull;
273 h = rol64(h, 32);
274 return ((h & m) << 16) | ((h >> 16) & m);
277 /* Swap 32-bit words within a 64-bit word. */
278 static inline uint64_t wswap64(uint64_t h)
280 return rol64(h, 32);
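/* E.g. hswap64(0x0011223344556677) == 0x6677445522330011: the four 16-bit
 * units are reversed in order, while wswap64 merely swaps the two 32-bit
 * halves of its operand.
 */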
283 #define LOGICAL_PPPP(NAME, FUNC) \
284 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
286 uintptr_t opr_sz = simd_oprsz(desc); \
287 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
288 uintptr_t i; \
289 for (i = 0; i < opr_sz / 8; ++i) { \
290 d[i] = FUNC(n[i], m[i], g[i]); \
294 #define DO_AND(N, M, G) (((N) & (M)) & (G))
295 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
296 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
297 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
298 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
299 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
300 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
301 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
303 LOGICAL_PPPP(sve_and_pppp, DO_AND)
304 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
305 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
306 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
307 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
308 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
309 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
310 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
312 #undef DO_AND
313 #undef DO_BIC
314 #undef DO_EOR
315 #undef DO_ORR
316 #undef DO_ORN
317 #undef DO_NOR
318 #undef DO_NAND
319 #undef DO_SEL
320 #undef LOGICAL_PPPP
322 /* Fully general three-operand expander, controlled by a predicate.
323 * This is complicated by the host-endian storage of the register file.
325 /* ??? I don't expect the compiler could ever vectorize this itself.
326 * With some tables we can convert bit masks to byte masks, and with
327 * extra care wrt byte/word ordering we could use gcc generic vectors
328 * and do 16 bytes at a time.
330 #define DO_ZPZZ(NAME, TYPE, H, OP) \
331 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
333 intptr_t i, opr_sz = simd_oprsz(desc); \
334 for (i = 0; i < opr_sz; ) { \
335 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
336 do { \
337 if (pg & 1) { \
338 TYPE nn = *(TYPE *)(vn + H(i)); \
339 TYPE mm = *(TYPE *)(vm + H(i)); \
340 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
342 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
343 } while (i & 15); \
347 /* Similarly, specialized for 64-bit operands. */
348 #define DO_ZPZZ_D(NAME, TYPE, OP) \
349 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
351 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
352 TYPE *d = vd, *n = vn, *m = vm; \
353 uint8_t *pg = vg; \
354 for (i = 0; i < opr_sz; i += 1) { \
355 if (pg[H1(i)] & 1) { \
356 TYPE nn = n[i], mm = m[i]; \
357 d[i] = OP(nn, mm); \
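/* In the expanders above the governing predicate holds one bit per byte of
 * vector data, so stepping with "i += sizeof(TYPE), pg >>= sizeof(TYPE)"
 * tests only the bit that corresponds to the first byte of each element:
 * e.g. for 4-byte elements, bits 0, 4, 8 and 12 of each 16-bit predicate
 * chunk are consulted and the bits in between are ignored.
 */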
362 #define DO_AND(N, M) (N & M)
363 #define DO_EOR(N, M) (N ^ M)
364 #define DO_ORR(N, M) (N | M)
365 #define DO_BIC(N, M) (N & ~M)
366 #define DO_ADD(N, M) (N + M)
367 #define DO_SUB(N, M) (N - M)
368 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
369 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
370 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
371 #define DO_MUL(N, M) (N * M)
372 #define DO_DIV(N, M) (M ? N / M : 0)
374 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
375 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
376 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
377 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
379 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
380 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
381 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
382 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
384 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
385 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
386 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
387 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
389 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
390 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
391 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
392 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
394 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
395 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
396 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
397 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
399 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
400 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
401 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
402 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
404 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
405 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
406 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
407 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
409 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
410 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
411 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
412 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
414 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
415 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
416 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
417 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
419 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
420 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
421 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
422 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
424 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
425 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
426 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
427 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
429 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
430 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
431 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
432 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
434 /* Because the computation type is at least twice as large as required,
435 these work for both signed and unsigned source types. */
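/* For instance, do_mulh_b() below called with signed bytes -2 and 3
 * computes -6 in 32-bit arithmetic and returns 0xff (the high byte of the
 * 16-bit product 0xfffa), while the same routine called with unsigned
 * bytes 254 and 3 computes 762 and returns 0x02 -- both correct, because
 * the 32-bit intermediate cannot overflow for 8-bit inputs of either sign.
 */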
436 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
438 return (n * m) >> 8;
441 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
443 return (n * m) >> 16;
446 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
448 return (n * m) >> 32;
451 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
453 uint64_t lo, hi;
454 muls64(&lo, &hi, n, m);
455 return hi;
458 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
460 uint64_t lo, hi;
461 mulu64(&lo, &hi, n, m);
462 return hi;
465 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
466 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
467 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
468 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
470 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
471 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
472 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
473 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
475 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
476 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
477 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
478 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
480 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_DIV)
481 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_DIV)
483 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_DIV)
484 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_DIV)
486 /* Note that all bits of the shift are significant
487 and not modulo the element size. */
488 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
489 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
490 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
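/* E.g. for an 8-bit element a shift count of 8 or more yields 0 for
 * LSR/LSL, while ASR clamps the count to 7 so the result is 0 or 0xff
 * according to the sign bit -- the count comes from the whole (possibly
 * wider) element and is not reduced modulo the element size.
 */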
492 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
493 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
494 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
496 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
497 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
498 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
500 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
501 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
502 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
504 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
505 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
506 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
508 #undef DO_ZPZZ
509 #undef DO_ZPZZ_D
511 /* Three-operand expander, controlled by a predicate, in which the
512 * third operand is "wide". That is, for D = N op M, the same 64-bit
513 * value of M is used with all of the narrower values of N.
515 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
516 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
518 intptr_t i, opr_sz = simd_oprsz(desc); \
519 for (i = 0; i < opr_sz; ) { \
520 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
521 TYPEW mm = *(TYPEW *)(vm + i); \
522 do { \
523 if (pg & 1) { \
524 TYPE nn = *(TYPE *)(vn + H(i)); \
525 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
527 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
528 } while (i & 7); \
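/* The "while (i & 7)" bound above means each pass of the outer loop covers
 * exactly one 64-bit block: for example with 16-bit elements, the four
 * elements of Zn in a given block are all shifted by the single 64-bit
 * value occupying the corresponding block of Zm.
 */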
532 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
533 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
534 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
536 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
537 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
538 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
540 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
541 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
542 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
544 #undef DO_ZPZW
546 /* Fully general two-operand expander, controlled by a predicate.
548 #define DO_ZPZ(NAME, TYPE, H, OP) \
549 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
551 intptr_t i, opr_sz = simd_oprsz(desc); \
552 for (i = 0; i < opr_sz; ) { \
553 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
554 do { \
555 if (pg & 1) { \
556 TYPE nn = *(TYPE *)(vn + H(i)); \
557 *(TYPE *)(vd + H(i)) = OP(nn); \
559 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
560 } while (i & 15); \
564 /* Similarly, specialized for 64-bit operands. */
565 #define DO_ZPZ_D(NAME, TYPE, OP) \
566 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
568 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
569 TYPE *d = vd, *n = vn; \
570 uint8_t *pg = vg; \
571 for (i = 0; i < opr_sz; i += 1) { \
572 if (pg[H1(i)] & 1) { \
573 TYPE nn = n[i]; \
574 d[i] = OP(nn); \
579 #define DO_CLS_B(N) (clrsb32(N) - 24)
580 #define DO_CLS_H(N) (clrsb32(N) - 16)
582 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
583 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
584 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
585 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
587 #define DO_CLZ_B(N) (clz32(N) - 24)
588 #define DO_CLZ_H(N) (clz32(N) - 16)
590 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
591 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
592 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
593 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
595 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
596 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
597 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
598 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
600 #define DO_CNOT(N) (N == 0)
602 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
603 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
604 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
605 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
607 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
609 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
610 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
611 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
613 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
615 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
616 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
617 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
619 #define DO_NOT(N) (~N)
621 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
622 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
623 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
624 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
626 #define DO_SXTB(N) ((int8_t)N)
627 #define DO_SXTH(N) ((int16_t)N)
628 #define DO_SXTS(N) ((int32_t)N)
629 #define DO_UXTB(N) ((uint8_t)N)
630 #define DO_UXTH(N) ((uint16_t)N)
631 #define DO_UXTS(N) ((uint32_t)N)
633 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
634 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
635 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
636 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
637 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
638 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
640 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
641 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
642 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
643 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
644 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
645 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
647 #define DO_ABS(N) (N < 0 ? -N : N)
649 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
650 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
651 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
652 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
654 #define DO_NEG(N) (-N)
656 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
657 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
658 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
659 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
661 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
662 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
663 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
665 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
666 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
668 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
670 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
671 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
672 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
673 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
675 /* Three-operand expander, unpredicated, in which the third operand is "wide".
677 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
678 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
680 intptr_t i, opr_sz = simd_oprsz(desc); \
681 for (i = 0; i < opr_sz; ) { \
682 TYPEW mm = *(TYPEW *)(vm + i); \
683 do { \
684 TYPE nn = *(TYPE *)(vn + H(i)); \
685 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
686 i += sizeof(TYPE); \
687 } while (i & 7); \
691 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
692 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
693 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
695 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
696 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
697 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
699 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
700 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
701 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
703 #undef DO_ZZW
705 #undef DO_CLS_B
706 #undef DO_CLS_H
707 #undef DO_CLZ_B
708 #undef DO_CLZ_H
709 #undef DO_CNOT
710 #undef DO_FABS
711 #undef DO_FNEG
712 #undef DO_ABS
713 #undef DO_NEG
714 #undef DO_ZPZ
715 #undef DO_ZPZ_D
717 /* Two-operand reduction expander, controlled by a predicate.
718 * The difference between TYPERED and TYPERET has to do with
719 * sign-extension. E.g. for SMAX, TYPERED must be signed,
720 * but TYPERET must be unsigned so that e.g. a 32-bit value
721 * is not sign-extended to the ABI uint64_t return type.
723 /* ??? If we were to vectorize this by hand the reduction ordering
724 * would change. For integer operands, this is perfectly fine.
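/* As an example of the TYPERED/TYPERET distinction: sve_smaxv_s uses
 * TYPERED = int32_t so the comparisons are signed, but TYPERET = uint32_t
 * so a maximum of -1 is returned to the caller as 0x00000000ffffffff
 * rather than being sign-extended across the full 64-bit return value.
 */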
726 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
727 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
729 intptr_t i, opr_sz = simd_oprsz(desc); \
730 TYPERED ret = INIT; \
731 for (i = 0; i < opr_sz; ) { \
732 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
733 do { \
734 if (pg & 1) { \
735 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
736 ret = OP(ret, nn); \
738 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
739 } while (i & 15); \
741 return (TYPERET)ret; \
744 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
745 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
747 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
748 TYPEE *n = vn; \
749 uint8_t *pg = vg; \
750 TYPER ret = INIT; \
751 for (i = 0; i < opr_sz; i += 1) { \
752 if (pg[H1(i)] & 1) { \
753 TYPEE nn = n[i]; \
754 ret = OP(ret, nn); \
757 return ret; \
760 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
761 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
762 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
763 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
765 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
766 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
767 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
768 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
770 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
771 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
772 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
773 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
775 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
776 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
777 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
779 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
780 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
781 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
782 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
784 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
785 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
786 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
787 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
789 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
790 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
791 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
792 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
794 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
795 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
796 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
797 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
799 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
800 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
801 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
802 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
804 #undef DO_VPZ
805 #undef DO_VPZ_D
807 #undef DO_AND
808 #undef DO_ORR
809 #undef DO_EOR
810 #undef DO_BIC
811 #undef DO_ADD
812 #undef DO_SUB
813 #undef DO_MAX
814 #undef DO_MIN
815 #undef DO_ABD
816 #undef DO_MUL
817 #undef DO_DIV
818 #undef DO_ASR
819 #undef DO_LSR
820 #undef DO_LSL
822 /* Similar to the ARM LastActiveElement pseudocode function, except the
823 result is multiplied by the element size. This includes the not found
824 indication; e.g. not found for esz=3 is -8. */
825 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
827 uint64_t mask = pred_esz_masks[esz];
828 intptr_t i = words;
830 do {
831 uint64_t this_g = g[--i] & mask;
832 if (this_g) {
833 return i * 64 + (63 - clz64(this_g));
835 } while (i > 0);
836 return (intptr_t)-1 << esz;
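/* E.g. with esz == 2 (word elements), element k is governed by predicate
 * bit 4 * k, so if the highest active element is element 3 the function
 * returns 12; when no bits are set at all it returns -4, the scaled
 * "not found" value.
 */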
839 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
841 uint32_t flags = PREDTEST_INIT;
842 uint64_t *d = vd, *g = vg;
843 intptr_t i = 0;
845 do {
846 uint64_t this_d = d[i];
847 uint64_t this_g = g[i];
849 if (this_g) {
850 if (!(flags & 4)) {
851 /* Set in D the first bit of G. */
852 this_d |= this_g & -this_g;
853 d[i] = this_d;
855 flags = iter_predtest_fwd(this_d, this_g, flags);
857 } while (++i < words);
859 return flags;
862 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
864 intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
865 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
866 uint32_t flags = PREDTEST_INIT;
867 uint64_t *d = vd, *g = vg, esz_mask;
868 intptr_t i, next;
870 next = last_active_element(vd, words, esz) + (1 << esz);
871 esz_mask = pred_esz_masks[esz];
873 /* Similar to the pseudocode for pnext, but scaled by ESZ
874 so that we find the correct bit. */
875 if (next < words * 64) {
876 uint64_t mask = -1;
878 if (next & 63) {
879 mask = ~((1ull << (next & 63)) - 1);
880 next &= -64;
882 do {
883 uint64_t this_g = g[next / 64] & esz_mask & mask;
884 if (this_g != 0) {
885 next = (next & -64) + ctz64(this_g);
886 break;
888 next += 64;
889 mask = -1;
890 } while (next < words * 64);
893 i = 0;
894 do {
895 uint64_t this_d = 0;
896 if (i == next / 64) {
897 this_d = 1ull << (next & 63);
899 d[i] = this_d;
900 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
901 } while (++i < words);
903 return flags;
906 /* Store zero into every active element of Zd. We will use this for two
907 * and three-operand predicated instructions for which logic dictates a
908 * zero result. In particular, logical shift by element size, which is
909 * otherwise undefined on the host.
911 * For element sizes smaller than uint64_t, we use tables to expand
912 * the N bits of the controlling predicate to a byte mask, and clear
913 * those bytes.
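/* For example, with word elements a predicate byte of 0x11 expands to the
 * mask 0xffffffffffffffff and both 32-bit elements of that chunk are
 * cleared, while 0x01 expands to 0x00000000ffffffff and only the low
 * element is zeroed.
 */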
915 void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
917 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
918 uint64_t *d = vd;
919 uint8_t *pg = vg;
920 for (i = 0; i < opr_sz; i += 1) {
921 d[i] &= ~expand_pred_b(pg[H1(i)]);
925 void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
927 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
928 uint64_t *d = vd;
929 uint8_t *pg = vg;
930 for (i = 0; i < opr_sz; i += 1) {
931 d[i] &= ~expand_pred_h(pg[H1(i)]);
935 void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
937 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
938 uint64_t *d = vd;
939 uint8_t *pg = vg;
940 for (i = 0; i < opr_sz; i += 1) {
941 d[i] &= ~expand_pred_s(pg[H1(i)]);
945 void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
947 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
948 uint64_t *d = vd;
949 uint8_t *pg = vg;
950 for (i = 0; i < opr_sz; i += 1) {
951 if (pg[H1(i)] & 1) {
952 d[i] = 0;
957 /* Three-operand expander, immediate operand, controlled by a predicate.
959 #define DO_ZPZI(NAME, TYPE, H, OP) \
960 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
962 intptr_t i, opr_sz = simd_oprsz(desc); \
963 TYPE imm = simd_data(desc); \
964 for (i = 0; i < opr_sz; ) { \
965 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
966 do { \
967 if (pg & 1) { \
968 TYPE nn = *(TYPE *)(vn + H(i)); \
969 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
971 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
972 } while (i & 15); \
976 /* Similarly, specialized for 64-bit operands. */
977 #define DO_ZPZI_D(NAME, TYPE, OP) \
978 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
980 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
981 TYPE *d = vd, *n = vn; \
982 TYPE imm = simd_data(desc); \
983 uint8_t *pg = vg; \
984 for (i = 0; i < opr_sz; i += 1) { \
985 if (pg[H1(i)] & 1) { \
986 TYPE nn = n[i]; \
987 d[i] = OP(nn, imm); \
992 #define DO_SHR(N, M) (N >> M)
993 #define DO_SHL(N, M) (N << M)
995 /* Arithmetic shift right for division. This rounds negative numbers
996 toward zero as per signed division. Therefore before shifting,
997 when N is negative, add 2**M-1. */
998 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
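/* E.g. DO_ASRD(-7, 2) adds 3 before shifting, giving -4 >> 2 = -1, which
 * is -7 / 4 truncated toward zero; a plain arithmetic shift would give
 * the floor, -2.
 */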
1000 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1001 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1002 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1003 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1005 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1006 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1007 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1008 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1010 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1011 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1012 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1013 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1015 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1016 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1017 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1018 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1020 #undef DO_SHR
1021 #undef DO_SHL
1022 #undef DO_ASRD
1023 #undef DO_ZPZI
1024 #undef DO_ZPZI_D
1026 /* Fully general four-operand expander, controlled by a predicate.
1028 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
1029 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1030 void *vg, uint32_t desc) \
1032 intptr_t i, opr_sz = simd_oprsz(desc); \
1033 for (i = 0; i < opr_sz; ) { \
1034 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1035 do { \
1036 if (pg & 1) { \
1037 TYPE nn = *(TYPE *)(vn + H(i)); \
1038 TYPE mm = *(TYPE *)(vm + H(i)); \
1039 TYPE aa = *(TYPE *)(va + H(i)); \
1040 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1042 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1043 } while (i & 15); \
1047 /* Similarly, specialized for 64-bit operands. */
1048 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
1049 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1050 void *vg, uint32_t desc) \
1052 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1053 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1054 uint8_t *pg = vg; \
1055 for (i = 0; i < opr_sz; i += 1) { \
1056 if (pg[H1(i)] & 1) { \
1057 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1058 d[i] = OP(aa, nn, mm); \
1063 #define DO_MLA(A, N, M) (A + N * M)
1064 #define DO_MLS(A, N, M) (A - N * M)
1066 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1067 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1069 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1070 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1072 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1073 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1075 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1076 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1078 #undef DO_MLA
1079 #undef DO_MLS
1080 #undef DO_ZPZZZ
1081 #undef DO_ZPZZZ_D
1083 void HELPER(sve_index_b)(void *vd, uint32_t start,
1084 uint32_t incr, uint32_t desc)
1086 intptr_t i, opr_sz = simd_oprsz(desc);
1087 uint8_t *d = vd;
1088 for (i = 0; i < opr_sz; i += 1) {
1089 d[H1(i)] = start + i * incr;
1093 void HELPER(sve_index_h)(void *vd, uint32_t start,
1094 uint32_t incr, uint32_t desc)
1096 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1097 uint16_t *d = vd;
1098 for (i = 0; i < opr_sz; i += 1) {
1099 d[H2(i)] = start + i * incr;
1103 void HELPER(sve_index_s)(void *vd, uint32_t start,
1104 uint32_t incr, uint32_t desc)
1106 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1107 uint32_t *d = vd;
1108 for (i = 0; i < opr_sz; i += 1) {
1109 d[H4(i)] = start + i * incr;
1113 void HELPER(sve_index_d)(void *vd, uint64_t start,
1114 uint64_t incr, uint32_t desc)
1116 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1117 uint64_t *d = vd;
1118 for (i = 0; i < opr_sz; i += 1) {
1119 d[i] = start + i * incr;
1123 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1125 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1126 uint32_t sh = simd_data(desc);
1127 uint32_t *d = vd, *n = vn, *m = vm;
1128 for (i = 0; i < opr_sz; i += 1) {
1129 d[i] = n[i] + (m[i] << sh);
1133 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1135 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1136 uint64_t sh = simd_data(desc);
1137 uint64_t *d = vd, *n = vn, *m = vm;
1138 for (i = 0; i < opr_sz; i += 1) {
1139 d[i] = n[i] + (m[i] << sh);
1143 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1145 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1146 uint64_t sh = simd_data(desc);
1147 uint64_t *d = vd, *n = vn, *m = vm;
1148 for (i = 0; i < opr_sz; i += 1) {
1149 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1153 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1155 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1156 uint64_t sh = simd_data(desc);
1157 uint64_t *d = vd, *n = vn, *m = vm;
1158 for (i = 0; i < opr_sz; i += 1) {
1159 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1163 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1165 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1166 static const uint16_t coeff[] = {
1167 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1168 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1169 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1170 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1172 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1173 uint16_t *d = vd, *n = vn;
1175 for (i = 0; i < opr_sz; i++) {
1176 uint16_t nn = n[i];
1177 intptr_t idx = extract32(nn, 0, 5);
1178 uint16_t exp = extract32(nn, 5, 5);
1179 d[i] = coeff[idx] | (exp << 10);
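/* As a sanity check on the table above: entry i holds, to 10 bits, the
 * fractional part of 2^(i/32), e.g. coeff[16] == 0x1a8 == 424, roughly
 * (sqrt(2) - 1) * 1024, so combining it with the exponent field builds a
 * float16 whose significand approximates 2^(idx/32).
 */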
1183 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1185 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1186 static const uint32_t coeff[] = {
1187 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1188 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1189 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1190 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1191 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1192 0x1ef532, 0x20b051, 0x227043, 0x243516,
1193 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1194 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1195 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1196 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1197 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1198 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1199 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1200 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1201 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1202 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1204 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1205 uint32_t *d = vd, *n = vn;
1207 for (i = 0; i < opr_sz; i++) {
1208 uint32_t nn = n[i];
1209 intptr_t idx = extract32(nn, 0, 6);
1210 uint32_t exp = extract32(nn, 6, 8);
1211 d[i] = coeff[idx] | (exp << 23);
1215 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1217 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1218 static const uint64_t coeff[] = {
1219 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1220 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1221 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1222 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1223 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1224 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1225 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1226 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1227 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1228 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1229 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1230 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1231 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1232 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1233 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1234 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1235 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1236 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1237 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1238 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1239 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1240 0xFA7C1819E90D8ull,
1242 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1243 uint64_t *d = vd, *n = vn;
1245 for (i = 0; i < opr_sz; i++) {
1246 uint64_t nn = n[i];
1247 intptr_t idx = extract32(nn, 0, 6);
1248 uint64_t exp = extract32(nn, 6, 11);
1249 d[i] = coeff[idx] | (exp << 52);
1253 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1255 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1256 uint16_t *d = vd, *n = vn, *m = vm;
1257 for (i = 0; i < opr_sz; i += 1) {
1258 uint16_t nn = n[i];
1259 uint16_t mm = m[i];
1260 if (mm & 1) {
1261 nn = float16_one;
1263 d[i] = nn ^ (mm & 2) << 14;
1267 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1269 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1270 uint32_t *d = vd, *n = vn, *m = vm;
1271 for (i = 0; i < opr_sz; i += 1) {
1272 uint32_t nn = n[i];
1273 uint32_t mm = m[i];
1274 if (mm & 1) {
1275 nn = float32_one;
1277 d[i] = nn ^ (mm & 2) << 30;
1281 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1283 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1284 uint64_t *d = vd, *n = vn, *m = vm;
1285 for (i = 0; i < opr_sz; i += 1) {
1286 uint64_t nn = n[i];
1287 uint64_t mm = m[i];
1288 if (mm & 1) {
1289 nn = float64_one;
1291 d[i] = nn ^ (mm & 2) << 62;
1296 * Signed saturating addition with scalar operand.
1299 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1301 intptr_t i, oprsz = simd_oprsz(desc);
1303 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1304 int r = *(int8_t *)(a + i) + b;
1305 if (r > INT8_MAX) {
1306 r = INT8_MAX;
1307 } else if (r < INT8_MIN) {
1308 r = INT8_MIN;
1310 *(int8_t *)(d + i) = r;
1314 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1316 intptr_t i, oprsz = simd_oprsz(desc);
1318 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1319 int r = *(int16_t *)(a + i) + b;
1320 if (r > INT16_MAX) {
1321 r = INT16_MAX;
1322 } else if (r < INT16_MIN) {
1323 r = INT16_MIN;
1325 *(int16_t *)(d + i) = r;
1329 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1331 intptr_t i, oprsz = simd_oprsz(desc);
1333 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1334 int64_t r = *(int32_t *)(a + i) + b;
1335 if (r > INT32_MAX) {
1336 r = INT32_MAX;
1337 } else if (r < INT32_MIN) {
1338 r = INT32_MIN;
1340 *(int32_t *)(d + i) = r;
1344 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1346 intptr_t i, oprsz = simd_oprsz(desc);
1348 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1349 int64_t ai = *(int64_t *)(a + i);
1350 int64_t r = ai + b;
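/* The test below is the classic two's-complement check: overflow can only
 * occur when ai and b share a sign (~(ai ^ b) has its top bit set) and
 * the sum's sign differs from ai (r ^ ai has its top bit set).  E.g.
 * ai == INT64_MAX, b == 1 wraps r negative and saturates to INT64_MAX.
 */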
1351 if (((r ^ ai) & ~(ai ^ b)) < 0) {
1352 /* Signed overflow. */
1353 r = (r < 0 ? INT64_MAX : INT64_MIN);
1355 *(int64_t *)(d + i) = r;
1360 * Unsigned saturating addition with scalar operand.
1363 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1365 intptr_t i, oprsz = simd_oprsz(desc);
1367 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1368 int r = *(uint8_t *)(a + i) + b;
1369 if (r > UINT8_MAX) {
1370 r = UINT8_MAX;
1371 } else if (r < 0) {
1372 r = 0;
1374 *(uint8_t *)(d + i) = r;
1378 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1380 intptr_t i, oprsz = simd_oprsz(desc);
1382 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1383 int r = *(uint16_t *)(a + i) + b;
1384 if (r > UINT16_MAX) {
1385 r = UINT16_MAX;
1386 } else if (r < 0) {
1387 r = 0;
1389 *(uint16_t *)(d + i) = r;
1393 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1395 intptr_t i, oprsz = simd_oprsz(desc);
1397 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1398 int64_t r = *(uint32_t *)(a + i) + b;
1399 if (r > UINT32_MAX) {
1400 r = UINT32_MAX;
1401 } else if (r < 0) {
1402 r = 0;
1404 *(uint32_t *)(d + i) = r;
1408 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1410 intptr_t i, oprsz = simd_oprsz(desc);
1412 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1413 uint64_t r = *(uint64_t *)(a + i) + b;
1414 if (r < b) {
1415 r = UINT64_MAX;
1417 *(uint64_t *)(d + i) = r;
1421 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1423 intptr_t i, oprsz = simd_oprsz(desc);
1425 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1426 uint64_t ai = *(uint64_t *)(a + i);
1427 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1431 /* Two operand predicated copy immediate with merge. All valid immediates
1432 * can fit within 17 signed bits in the simd_data field.
1434 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1435 uint64_t mm, uint32_t desc)
1437 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1438 uint64_t *d = vd, *n = vn;
1439 uint8_t *pg = vg;
1441 mm = dup_const(MO_8, mm);
1442 for (i = 0; i < opr_sz; i += 1) {
1443 uint64_t nn = n[i];
1444 uint64_t pp = expand_pred_b(pg[H1(i)]);
1445 d[i] = (mm & pp) | (nn & ~pp);
1449 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1450 uint64_t mm, uint32_t desc)
1452 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1453 uint64_t *d = vd, *n = vn;
1454 uint8_t *pg = vg;
1456 mm = dup_const(MO_16, mm);
1457 for (i = 0; i < opr_sz; i += 1) {
1458 uint64_t nn = n[i];
1459 uint64_t pp = expand_pred_h(pg[H1(i)]);
1460 d[i] = (mm & pp) | (nn & ~pp);
1464 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1465 uint64_t mm, uint32_t desc)
1467 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1468 uint64_t *d = vd, *n = vn;
1469 uint8_t *pg = vg;
1471 mm = dup_const(MO_32, mm);
1472 for (i = 0; i < opr_sz; i += 1) {
1473 uint64_t nn = n[i];
1474 uint64_t pp = expand_pred_s(pg[H1(i)]);
1475 d[i] = (mm & pp) | (nn & ~pp);
1479 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1480 uint64_t mm, uint32_t desc)
1482 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1483 uint64_t *d = vd, *n = vn;
1484 uint8_t *pg = vg;
1486 for (i = 0; i < opr_sz; i += 1) {
1487 uint64_t nn = n[i];
1488 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1492 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1494 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1495 uint64_t *d = vd;
1496 uint8_t *pg = vg;
1498 val = dup_const(MO_8, val);
1499 for (i = 0; i < opr_sz; i += 1) {
1500 d[i] = val & expand_pred_b(pg[H1(i)]);
1504 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1506 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1507 uint64_t *d = vd;
1508 uint8_t *pg = vg;
1510 val = dup_const(MO_16, val);
1511 for (i = 0; i < opr_sz; i += 1) {
1512 d[i] = val & expand_pred_h(pg[H1(i)]);
1516 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1518 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1519 uint64_t *d = vd;
1520 uint8_t *pg = vg;
1522 val = dup_const(MO_32, val);
1523 for (i = 0; i < opr_sz; i += 1) {
1524 d[i] = val & expand_pred_s(pg[H1(i)]);
1528 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1530 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1531 uint64_t *d = vd;
1532 uint8_t *pg = vg;
1534 for (i = 0; i < opr_sz; i += 1) {
1535 d[i] = (pg[H1(i)] & 1 ? val : 0);
1539 /* Big-endian hosts need to frob the byte indices. If the copy
1540 * happens to be 8-byte aligned, then no frobbing necessary.
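/* Concretely: within each aligned 64-bit chunk, byte i of the SVE view
 * lives at host offset H1(i) = i ^ 7 on a big-endian host, so when the
 * source, destination and length are all multiples of 8 the chunks map
 * one-to-one and a plain memmove (case 0 below) suffices; otherwise the
 * copy is done in 4-, 2- or 1-byte units with the matching H1_4/H1_2/H1
 * fixups.
 */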
1542 static void swap_memmove(void *vd, void *vs, size_t n)
1544 uintptr_t d = (uintptr_t)vd;
1545 uintptr_t s = (uintptr_t)vs;
1546 uintptr_t o = (d | s | n) & 7;
1547 size_t i;
1549 #ifndef HOST_WORDS_BIGENDIAN
1550 o = 0;
1551 #endif
1552 switch (o) {
1553 case 0:
1554 memmove(vd, vs, n);
1555 break;
1557 case 4:
1558 if (d < s || d >= s + n) {
1559 for (i = 0; i < n; i += 4) {
1560 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1562 } else {
1563 for (i = n; i > 0; ) {
1564 i -= 4;
1565 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1568 break;
1570 case 2:
1571 case 6:
1572 if (d < s || d >= s + n) {
1573 for (i = 0; i < n; i += 2) {
1574 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1576 } else {
1577 for (i = n; i > 0; ) {
1578 i -= 2;
1579 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1582 break;
1584 default:
1585 if (d < s || d >= s + n) {
1586 for (i = 0; i < n; i++) {
1587 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1589 } else {
1590 for (i = n; i > 0; ) {
1591 i -= 1;
1592 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1595 break;
1599 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1601 intptr_t opr_sz = simd_oprsz(desc);
1602 size_t n_ofs = simd_data(desc);
1603 size_t n_siz = opr_sz - n_ofs;
1605 if (vd != vm) {
1606 swap_memmove(vd, vn + n_ofs, n_siz);
1607 swap_memmove(vd + n_siz, vm, n_ofs);
1608 } else if (vd != vn) {
1609 swap_memmove(vd + n_siz, vd, n_ofs);
1610 swap_memmove(vd, vn + n_ofs, n_siz);
1611 } else {
1612 /* vd == vn == vm. Need temp space. */
1613 ARMVectorReg tmp;
1614 swap_memmove(&tmp, vm, n_ofs);
1615 swap_memmove(vd, vd + n_ofs, n_siz);
1616 memcpy(vd + n_siz, &tmp, n_ofs);
1620 #define DO_INSR(NAME, TYPE, H) \
1621 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1623 intptr_t opr_sz = simd_oprsz(desc); \
1624 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1625 *(TYPE *)(vd + H(0)) = val; \
1628 DO_INSR(sve_insr_b, uint8_t, H1)
1629 DO_INSR(sve_insr_h, uint16_t, H1_2)
1630 DO_INSR(sve_insr_s, uint32_t, H1_4)
1631 DO_INSR(sve_insr_d, uint64_t, )
1633 #undef DO_INSR
1635 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1637 intptr_t i, j, opr_sz = simd_oprsz(desc);
1638 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1639 uint64_t f = *(uint64_t *)(vn + i);
1640 uint64_t b = *(uint64_t *)(vn + j);
1641 *(uint64_t *)(vd + i) = bswap64(b);
1642 *(uint64_t *)(vd + j) = bswap64(f);
1646 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1648 intptr_t i, j, opr_sz = simd_oprsz(desc);
1649 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1650 uint64_t f = *(uint64_t *)(vn + i);
1651 uint64_t b = *(uint64_t *)(vn + j);
1652 *(uint64_t *)(vd + i) = hswap64(b);
1653 *(uint64_t *)(vd + j) = hswap64(f);
1657 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1659 intptr_t i, j, opr_sz = simd_oprsz(desc);
1660 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1661 uint64_t f = *(uint64_t *)(vn + i);
1662 uint64_t b = *(uint64_t *)(vn + j);
1663 *(uint64_t *)(vd + i) = rol64(b, 32);
1664 *(uint64_t *)(vd + j) = rol64(f, 32);
1668 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1670 intptr_t i, j, opr_sz = simd_oprsz(desc);
1671 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1672 uint64_t f = *(uint64_t *)(vn + i);
1673 uint64_t b = *(uint64_t *)(vn + j);
1674 *(uint64_t *)(vd + i) = b;
1675 *(uint64_t *)(vd + j) = f;
1679 #define DO_TBL(NAME, TYPE, H) \
1680 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1682 intptr_t i, opr_sz = simd_oprsz(desc); \
1683 uintptr_t elem = opr_sz / sizeof(TYPE); \
1684 TYPE *d = vd, *n = vn, *m = vm; \
1685 ARMVectorReg tmp; \
1686 if (unlikely(vd == vn)) { \
1687 n = memcpy(&tmp, vn, opr_sz); \
1689 for (i = 0; i < elem; i++) { \
1690 TYPE j = m[H(i)]; \
1691 d[H(i)] = j < elem ? n[H(j)] : 0; \
1695 DO_TBL(sve_tbl_b, uint8_t, H1)
1696 DO_TBL(sve_tbl_h, uint16_t, H2)
1697 DO_TBL(sve_tbl_s, uint32_t, H4)
1698 DO_TBL(sve_tbl_d, uint64_t, )
1700 #undef DO_TBL
1702 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1703 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1705 intptr_t i, opr_sz = simd_oprsz(desc); \
1706 TYPED *d = vd; \
1707 TYPES *n = vn; \
1708 ARMVectorReg tmp; \
1709 if (unlikely(vn - vd < opr_sz)) { \
1710 n = memcpy(&tmp, n, opr_sz / 2); \
1712 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1713 d[HD(i)] = n[HS(i)]; \
1717 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1718 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1719 DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1721 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1722 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1723 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1725 #undef DO_UNPK
1727 /* Mask of bits included in the even numbered predicates of width esz.
1728 * We also use this for expand_bits/compress_bits, and so extend the
1729 * same pattern out to 16-bit units.
1731 static const uint64_t even_bit_esz_masks[5] = {
1732 0x5555555555555555ull,
1733 0x3333333333333333ull,
1734 0x0f0f0f0f0f0f0f0full,
1735 0x00ff00ff00ff00ffull,
1736 0x0000ffff0000ffffull,
1739 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1740 * For N==0, this corresponds to the operation that in qemu/bitops.h
1741 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1742 * section 7-2 Shuffling Bits.
1744 static uint64_t expand_bits(uint64_t x, int n)
1746 int i;
1748 x &= 0xffffffffu;
1749 for (i = 4; i >= n; i--) {
1750 int sh = 1 << i;
1751 x = ((x << sh) | x) & even_bit_esz_masks[i];
1753 return x;
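/* E.g. expand_bits(0b1011, 0) == 0b1000101: bit k of the input moves to
 * bit 2k, with zeros interleaved.  sve_zip_p below uses this to
 * interleave the active bits of two predicates.
 */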
1756 /* Compress units of 2**(N+1) bits to units of 2**N bits.
1757 * For N==0, this corresponds to the operation that in qemu/bitops.h
1758 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1759 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1761 static uint64_t compress_bits(uint64_t x, int n)
1763 int i;
1765 for (i = n; i <= 4; i++) {
1766 int sh = 1 << i;
1767 x &= even_bit_esz_masks[i];
1768 x = (x >> sh) | x;
1770 return x & 0xffffffffu;
1773 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1775 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1776 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1777 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1778 uint64_t *d = vd;
1779 intptr_t i;
1781 if (oprsz <= 8) {
1782 uint64_t nn = *(uint64_t *)vn;
1783 uint64_t mm = *(uint64_t *)vm;
1784 int half = 4 * oprsz;
1786 nn = extract64(nn, high * half, half);
1787 mm = extract64(mm, high * half, half);
1788 nn = expand_bits(nn, esz);
1789 mm = expand_bits(mm, esz);
1790 d[0] = nn + (mm << (1 << esz));
1791 } else {
1792 ARMPredicateReg tmp_n, tmp_m;
1794 /* We produce output faster than we consume input.
1795 Therefore we must be mindful of possible overlap. */
1796 if ((vn - vd) < (uintptr_t)oprsz) {
1797 vn = memcpy(&tmp_n, vn, oprsz);
1799 if ((vm - vd) < (uintptr_t)oprsz) {
1800 vm = memcpy(&tmp_m, vm, oprsz);
1802 if (high) {
1803 high = oprsz >> 1;
1806 if ((high & 3) == 0) {
1807 uint32_t *n = vn, *m = vm;
1808 high >>= 2;
1810 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1811 uint64_t nn = n[H4(high + i)];
1812 uint64_t mm = m[H4(high + i)];
1814 nn = expand_bits(nn, esz);
1815 mm = expand_bits(mm, esz);
1816 d[i] = nn + (mm << (1 << esz));
1818 } else {
1819 uint8_t *n = vn, *m = vm;
1820 uint16_t *d16 = vd;
1822 for (i = 0; i < oprsz / 2; i++) {
1823 uint16_t nn = n[H1(high + i)];
1824 uint16_t mm = m[H1(high + i)];
1826 nn = expand_bits(nn, esz);
1827 mm = expand_bits(mm, esz);
1828 d16[H2(i)] = nn + (mm << (1 << esz));
1834 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1836 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1837 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1838 int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1839 uint64_t *d = vd, *n = vn, *m = vm;
1840 uint64_t l, h;
1841 intptr_t i;
1843 if (oprsz <= 8) {
1844 l = compress_bits(n[0] >> odd, esz);
1845 h = compress_bits(m[0] >> odd, esz);
1846 d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1847 } else {
1848 ARMPredicateReg tmp_m;
1849 intptr_t oprsz_16 = oprsz / 16;
1851 if ((vm - vd) < (uintptr_t)oprsz) {
1852 m = memcpy(&tmp_m, vm, oprsz);
1855 for (i = 0; i < oprsz_16; i++) {
1856 l = n[2 * i + 0];
1857 h = n[2 * i + 1];
1858 l = compress_bits(l >> odd, esz);
1859 h = compress_bits(h >> odd, esz);
1860 d[i] = l + (h << 32);
1863 /* For VL which is not a power of 2, the results from M do not
1864 align nicely with the uint64_t for D. Put the aligned results
1865 from M into TMP_M and then copy it into place afterward. */
1866 if (oprsz & 15) {
1867 d[i] = compress_bits(n[2 * i] >> odd, esz);
1869 for (i = 0; i < oprsz_16; i++) {
1870 l = m[2 * i + 0];
1871 h = m[2 * i + 1];
1872 l = compress_bits(l >> odd, esz);
1873 h = compress_bits(h >> odd, esz);
1874 tmp_m.p[i] = l + (h << 32);
1876 tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
1878 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
1879 } else {
1880 for (i = 0; i < oprsz_16; i++) {
1881 l = m[2 * i + 0];
1882 h = m[2 * i + 1];
1883 l = compress_bits(l >> odd, esz);
1884 h = compress_bits(h >> odd, esz);
1885 d[oprsz_16 + i] = l + (h << 32);
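/* For the byte case (esz == 0, oprsz <= 8): with odd == 0, compress_bits
 * gathers the even-numbered predicate bits of N and of M, and the two
 * results are concatenated, giving UZP1 for byte elements; odd == 1
 * selects the odd-numbered bits instead, giving UZP2.
 */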
1891 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1893 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1894 uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1895 bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1896 uint64_t *d = vd, *n = vn, *m = vm;
1897 uint64_t mask;
1898 int shr, shl;
1899 intptr_t i;
1901 shl = 1 << esz;
1902 shr = 0;
1903 mask = even_bit_esz_masks[esz];
1904 if (odd) {
1905 mask <<= shl;
1906 shr = shl;
1907 shl = 0;
1910 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1911 uint64_t nn = (n[i] & mask) >> shr;
1912 uint64_t mm = (m[i] & mask) << shl;
1913 d[i] = nn + mm;
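/* For byte elements (esz == 0) with odd == 0: mask == 0x5555..., so nn
 * keeps the even-numbered predicate bits of N in place (shr == 0) while
 * mm moves those of M up by one (shl == 1); the sum packs
 * {n0, m0, n2, m2, ...} into successive bit pairs, i.e. TRN1.  With odd
 * set, the odd-numbered units are taken instead (TRN2).
 */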
1917 /* Reverse units of 2**N bits. */
1918 static uint64_t reverse_bits_64(uint64_t x, int n)
1920 int i, sh;
1922 x = bswap64(x);
1923 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
1924 uint64_t mask = even_bit_esz_masks[i];
1925 x = ((x & mask) << sh) | ((x >> sh) & mask);
1927 return x;
1930 static uint8_t reverse_bits_8(uint8_t x, int n)
1932 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
1933 int i, sh;
1935 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
1936 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
1938 return x;
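/* For example, reverse_bits_64(x, 3) is just bswap64(x), reversing the
 * byte order, and each further iteration halves the unit size, so
 * reverse_bits_64(x, 0) reverses all 64 bits.  Likewise for the 8-bit
 * variant, e.g. reverse_bits_8(0b00000110, 0) == 0b01100000.
 */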
1941 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
1943 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1944 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1945 intptr_t i, oprsz_2 = oprsz / 2;
1947 if (oprsz <= 8) {
1948 uint64_t l = *(uint64_t *)vn;
1949 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
1950 *(uint64_t *)vd = l;
1951 } else if ((oprsz & 15) == 0) {
1952 for (i = 0; i < oprsz_2; i += 8) {
1953 intptr_t ih = oprsz - 8 - i;
1954 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
1955 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
1956 *(uint64_t *)(vd + i) = h;
1957 *(uint64_t *)(vd + ih) = l;
1959 } else {
1960 for (i = 0; i < oprsz_2; i += 1) {
1961 intptr_t il = H1(i);
1962 intptr_t ih = H1(oprsz - 1 - i);
1963 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
1964 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
1965 *(uint8_t *)(vd + il) = h;
1966 *(uint8_t *)(vd + ih) = l;
1971 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
1973 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1974 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1975 uint64_t *d = vd;
1976 intptr_t i;
1978 if (oprsz <= 8) {
1979 uint64_t nn = *(uint64_t *)vn;
1980 int half = 4 * oprsz;
1982 nn = extract64(nn, high * half, half);
1983 nn = expand_bits(nn, 0);
1984 d[0] = nn;
1985 } else {
1986 ARMPredicateReg tmp_n;
1988 /* We produce output faster than we consume input.
1989 Therefore we must be mindful of possible overlap. */
1990 if ((vn - vd) < (uintptr_t)oprsz) {
1991 vn = memcpy(&tmp_n, vn, oprsz);
1993 if (high) {
1994 high = oprsz >> 1;
1997 if ((high & 3) == 0) {
1998 uint32_t *n = vn;
1999 high >>= 2;
2001 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2002 uint64_t nn = n[H4(high + i)];
2003 d[i] = expand_bits(nn, 0);
2005 } else {
2006 uint16_t *d16 = vd;
2007 uint8_t *n = vn;
2009 for (i = 0; i < oprsz / 2; i++) {
2010 uint16_t nn = n[H1(high + i)];
2011 d16[H2(i)] = expand_bits(nn, 0);
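/* For example (oprsz <= 8, illustrative operand): with nn == 0b1011 the
 * low-half unpack is expand_bits(0b1011, 0) == 0b01000101, i.e. each
 * source predicate bit i moves to bit 2*i with a zero above it,
 * matching the doubled element size of PUNPKLO/PUNPKHI.
 */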
2017 #define DO_ZIP(NAME, TYPE, H) \
2018 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2020 intptr_t oprsz = simd_oprsz(desc); \
2021 intptr_t i, oprsz_2 = oprsz / 2; \
2022 ARMVectorReg tmp_n, tmp_m; \
2023 /* We produce output faster than we consume input. \
2024 Therefore we must be mindful of possible overlap. */ \
2025 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2026 vn = memcpy(&tmp_n, vn, oprsz_2); \
2028 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2029 vm = memcpy(&tmp_m, vm, oprsz_2); \
2031 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2032 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2033 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2037 DO_ZIP(sve_zip_b, uint8_t, H1)
2038 DO_ZIP(sve_zip_h, uint16_t, H1_2)
2039 DO_ZIP(sve_zip_s, uint32_t, H1_4)
2040 DO_ZIP(sve_zip_d, uint64_t, )
2042 #define DO_UZP(NAME, TYPE, H) \
2043 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2045 intptr_t oprsz = simd_oprsz(desc); \
2046 intptr_t oprsz_2 = oprsz / 2; \
2047 intptr_t odd_ofs = simd_data(desc); \
2048 intptr_t i; \
2049 ARMVectorReg tmp_m; \
2050 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2051 vm = memcpy(&tmp_m, vm, oprsz); \
2053 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2054 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2056 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2057 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2061 DO_UZP(sve_uzp_b, uint8_t, H1)
2062 DO_UZP(sve_uzp_h, uint16_t, H1_2)
2063 DO_UZP(sve_uzp_s, uint32_t, H1_4)
2064 DO_UZP(sve_uzp_d, uint64_t, )
2066 #define DO_TRN(NAME, TYPE, H) \
2067 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2069 intptr_t oprsz = simd_oprsz(desc); \
2070 intptr_t odd_ofs = simd_data(desc); \
2071 intptr_t i; \
2072 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2073 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2074 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2075 *(TYPE *)(vd + H(i + 0)) = ae; \
2076 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2080 DO_TRN(sve_trn_b, uint8_t, H1)
2081 DO_TRN(sve_trn_h, uint16_t, H1_2)
2082 DO_TRN(sve_trn_s, uint32_t, H1_4)
2083 DO_TRN(sve_trn_d, uint64_t, )
2085 #undef DO_ZIP
2086 #undef DO_UZP
2087 #undef DO_TRN
2089 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2091 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2092 uint32_t *d = vd, *n = vn;
2093 uint8_t *pg = vg;
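/* For 32-bit elements, two elements share each predicate byte; their
 * significant predicate bits are bit 0 and bit 4 respectively.
 */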
2095 for (i = j = 0; i < opr_sz; i++) {
2096 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2097 d[H4(j)] = n[H4(i)];
2098 j++;
2101 for (; j < opr_sz; j++) {
2102 d[H4(j)] = 0;
2106 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2108 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2109 uint64_t *d = vd, *n = vn;
2110 uint8_t *pg = vg;
2112 for (i = j = 0; i < opr_sz; i++) {
2113 if (pg[H1(i)] & 1) {
2114 d[j] = n[i];
2115 j++;
2118 for (; j < opr_sz; j++) {
2119 d[j] = 0;
2123 /* Similar to the ARM LastActiveElement pseudocode function, except the
2124 * result is multiplied by the element size.  This includes the not-found
2125 * indication; e.g. not found for esz=3 is -8.  */
2127 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2129 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2130 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2132 return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
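/* For example, with esz == 2 (word elements) and the last active element
 * at index 3, the result is 3 * 4 == 12; with no active elements it is
 * -(1 << esz) == -4.
 */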
2135 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2137 intptr_t opr_sz = simd_oprsz(desc) / 8;
2138 int esz = simd_data(desc);
2139 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2140 intptr_t i, first_i, last_i;
2141 ARMVectorReg tmp;
2143 first_i = last_i = 0;
2144 first_g = last_g = 0;
2146 /* Find the extent of the active elements within VG. */
2147 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2148 pg = *(uint64_t *)(vg + i) & mask;
2149 if (pg) {
2150 if (last_g == 0) {
2151 last_g = pg;
2152 last_i = i;
2154 first_g = pg;
2155 first_i = i;
2159 len = 0;
2160 if (first_g != 0) {
2161 first_i = first_i * 8 + ctz64(first_g);
2162 last_i = last_i * 8 + 63 - clz64(last_g);
2163 len = last_i - first_i + (1 << esz);
2164 if (vd == vm) {
2165 vm = memcpy(&tmp, vm, opr_sz * 8);
2167 swap_memmove(vd, vn + first_i, len);
2169 swap_memmove(vd + len, vm, opr_sz * 8 - len);
2172 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2173 void *vg, uint32_t desc)
2175 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2176 uint64_t *d = vd, *n = vn, *m = vm;
2177 uint8_t *pg = vg;
2179 for (i = 0; i < opr_sz; i += 1) {
2180 uint64_t nn = n[i], mm = m[i];
2181 uint64_t pp = expand_pred_b(pg[H1(i)]);
2182 d[i] = (nn & pp) | (mm & ~pp);
2186 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2187 void *vg, uint32_t desc)
2189 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2190 uint64_t *d = vd, *n = vn, *m = vm;
2191 uint8_t *pg = vg;
2193 for (i = 0; i < opr_sz; i += 1) {
2194 uint64_t nn = n[i], mm = m[i];
2195 uint64_t pp = expand_pred_h(pg[H1(i)]);
2196 d[i] = (nn & pp) | (mm & ~pp);
2200 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2201 void *vg, uint32_t desc)
2203 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2204 uint64_t *d = vd, *n = vn, *m = vm;
2205 uint8_t *pg = vg;
2207 for (i = 0; i < opr_sz; i += 1) {
2208 uint64_t nn = n[i], mm = m[i];
2209 uint64_t pp = expand_pred_s(pg[H1(i)]);
2210 d[i] = (nn & pp) | (mm & ~pp);
2214 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2215 void *vg, uint32_t desc)
2217 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2218 uint64_t *d = vd, *n = vn, *m = vm;
2219 uint8_t *pg = vg;
2221 for (i = 0; i < opr_sz; i += 1) {
2222 uint64_t nn = n[i], mm = m[i];
2223 d[i] = (pg[H1(i)] & 1 ? nn : mm);
2227 /* Two-operand comparison controlled by a predicate.
2228 * ??? It is very tempting to expand this inline
2229 * with x86 instructions, e.g.
2231 * vcmpeqw zm, zn, %ymm0
2232 * vpmovmskb %ymm0, %eax
2233 * and $0x5555, %eax
2234 * and pg, %eax
2236 * or even aarch64, e.g.
2238 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2239 * cmeq v0.8h, zn, zm
2240 * and v0.8h, v0.8h, mask
2241 * addv h0, v0.8h
2242 * and v0.8b, pg
2244 * However, coming up with an abstraction that allows vector inputs and
2245 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2246 * scalar outputs, is tricky.  */
2248 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
2249 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2251 intptr_t opr_sz = simd_oprsz(desc); \
2252 uint32_t flags = PREDTEST_INIT; \
2253 intptr_t i = opr_sz; \
2254 do { \
2255 uint64_t out = 0, pg; \
2256 do { \
2257 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2258 TYPE nn = *(TYPE *)(vn + H(i)); \
2259 TYPE mm = *(TYPE *)(vm + H(i)); \
2260 out |= nn OP mm; \
2261 } while (i & 63); \
2262 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2263 out &= pg; \
2264 *(uint64_t *)(vd + (i >> 3)) = out; \
2265 flags = iter_predtest_bwd(out, pg, flags); \
2266 } while (i > 0); \
2267 return flags; \
2270 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2271 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2272 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2273 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2274 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2275 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2276 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2277 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
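/* In the predicate layout used here each element owns sizeof(TYPE)
 * predicate bits, of which only the lowest is significant; hence OUT
 * shifts by sizeof(TYPE) per element and MASK keeps only those bits of
 * PG (e.g. 0x1111... selects bits 0, 4, 8, ... for word elements).
 * Conceptually (illustrative scalar equivalent):
 *
 *    for (e = 0; e < elements; ++e) {
 *        pd[e] = pg[e] & (zn[e] OP zm[e]);
 *    }
 *
 * with the flags then computed by PredTest over the result.
 */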
2279 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
2280 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2281 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2282 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2284 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
2285 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2286 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2287 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2289 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
2290 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2291 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2292 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2294 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
2295 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2296 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2297 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2299 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
2300 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2301 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2302 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2304 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
2305 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2306 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2307 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2309 #undef DO_CMP_PPZZ_B
2310 #undef DO_CMP_PPZZ_H
2311 #undef DO_CMP_PPZZ_S
2312 #undef DO_CMP_PPZZ_D
2313 #undef DO_CMP_PPZZ
2315 /* Similar, but the second source is "wide". */
2316 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2317 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2319 intptr_t opr_sz = simd_oprsz(desc); \
2320 uint32_t flags = PREDTEST_INIT; \
2321 intptr_t i = opr_sz; \
2322 do { \
2323 uint64_t out = 0, pg; \
2324 do { \
2325 TYPEW mm = *(TYPEW *)(vm + i - 8); \
2326 do { \
2327 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2328 TYPE nn = *(TYPE *)(vn + H(i)); \
2329 out |= nn OP mm; \
2330 } while (i & 7); \
2331 } while (i & 63); \
2332 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2333 out &= pg; \
2334 *(uint64_t *)(vd + (i >> 3)) = out; \
2335 flags = iter_predtest_bwd(out, pg, flags); \
2336 } while (i > 0); \
2337 return flags; \
2340 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2341 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
2342 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2343 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2344 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2345 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
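/* Each 64-bit element of ZM is compared against all of the narrow
 * elements of ZN sharing that 64-bit slot: e.g. for the byte form one
 * MM value is tested against eight successive NN bytes (the inner loop
 * bounded by "i & 7") before the next wide element is loaded.  There is
 * no 64-bit narrow form, hence no DO_CMP_PPZW_D.
 */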
2347 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, uint8_t, uint64_t, ==)
2348 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, uint16_t, uint64_t, ==)
2349 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, uint32_t, uint64_t, ==)
2351 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, uint8_t, uint64_t, !=)
2352 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, uint16_t, uint64_t, !=)
2353 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, uint32_t, uint64_t, !=)
2355 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
2356 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
2357 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
2359 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
2360 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
2361 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
2363 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
2364 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2365 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2367 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
2368 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2369 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2371 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
2372 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
2373 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
2375 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
2376 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
2377 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
2379 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
2380 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2381 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2383 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
2384 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2385 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2387 #undef DO_CMP_PPZW_B
2388 #undef DO_CMP_PPZW_H
2389 #undef DO_CMP_PPZW_S
2390 #undef DO_CMP_PPZW