target/arm/sve_helper.c

   1 /*
   2  * ARM SVE Operations
   3  *
   4  * Copyright (c) 2018 Linaro, Ltd.
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19
  20 #include "qemu/osdep.h"
  21 #include "cpu.h"
  22 #include "exec/exec-all.h"
  23 #include "exec/cpu_ldst.h"
  24 #include "exec/helper-proto.h"
  25 #include "tcg/tcg-gvec-desc.h"
  26 #include "fpu/softfloat.h"
  27
  28
/* Note that vector data is stored in host-endian 64-bit chunks,
 * so addressing units smaller than that need a host-endian fixup.
 *
 * Each macro adjusts an offset X into a vector register.  On a
 * big-endian host the offset is XORed with a constant, which mirrors
 * the position of the unit inside its 64-bit chunk; on a little-endian
 * host the offset is used unchanged.
 *
 * In the helpers of this file H1, H1_2 and H1_4 are applied to byte
 * offsets of 1-, 2- and 4-byte units respectively.  H2 and H4 are
 * presumably meant for indexes counted in 2- and 4-byte units
 * (NOTE(review): not used in the code visible here -- confirm).
 */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x)   ((x) ^ 7)
#define H1_2(x) ((x) ^ 6)
#define H1_4(x) ((x) ^ 4)
#define H2(x)   ((x) ^ 3)
#define H4(x)   ((x) ^ 1)
#else
#define H1(x)   (x)
#define H1_2(x) (x)
#define H1_4(x) (x)
#define H2(x)   (x)
#define H4(x)   (x)
#endif
  44
/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set.  Compare the definitions of these variables
 * within CPUARMState.
 */

/* Initial flags value for a predicate test.
 * For no G bits set, NZCV = C, i.e. only bit 0 (C) is set.
 */
#define PREDTEST_INIT  1
  54
  55 /* This is an iterative function, called for each Pd and Pg word
  56  * moving forward.
  57  */
  58 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
  59 {
  60     if (likely(g)) {
  61         /* Compute N from first D & G.
  62            Use bit 2 to signal first G bit seen.  */
  63         if (!(flags & 4)) {
  64             flags |= ((d & (g & -g)) != 0) << 31;
  65             flags |= 4;
  66         }
  67
  68         /* Accumulate Z from each D & G.  */
  69         flags |= ((d & g) != 0) << 1;
  70
  71         /* Compute C from last !(D & G).  Replace previous.  */
  72         flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
  73     }
  74     return flags;
  75 }
  76
  77 /* The same for a single word predicate.  */
  78 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
  79 {
  80     return iter_predtest_fwd(d, g, PREDTEST_INIT);
  81 }
  82
  83 /* The same for a multi-word predicate.  */
  84 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
  85 {
  86     uint32_t flags = PREDTEST_INIT;
  87     uint64_t *d = vd, *g = vg;
  88     uintptr_t i = 0;
  89
  90     do {
  91         flags = iter_predtest_fwd(d[i], g[i], flags);
  92     } while (++i < words);
  93
  94     return flags;
  95 }
  96
  97 /* Expand active predicate bits to bytes, for byte elements.
  98  *  for (i = 0; i < 256; ++i) {
  99  *      unsigned long m = 0;
 100  *      for (j = 0; j < 8; j++) {
 101  *          if ((i >> j) & 1) {
 102  *              m |= 0xfful << (j << 3);
 103  *          }
 104  *      }
 105  *      printf("0x%016lx,\n", m);
 106  *  }
 107  */
 108 static inline uint64_t expand_pred_b(uint8_t byte)
 109 {
 110     static const uint64_t word[256] = {
 111         0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
 112         0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
 113         0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
 114         0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
 115         0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
 116         0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
 117         0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
 118         0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
 119         0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
 120         0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
 121         0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
 122         0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
 123         0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
 124         0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
 125         0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
 126         0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
 127         0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
 128         0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
 129         0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
 130         0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
 131         0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
 132         0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
 133         0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
 134         0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
 135         0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
 136         0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
 137         0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
 138         0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
 139         0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
 140         0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
 141         0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
 142         0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
 143         0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
 144         0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
 145         0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
 146         0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
 147         0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
 148         0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
 149         0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
 150         0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
 151         0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
 152         0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
 153         0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
 154         0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
 155         0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
 156         0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
 157         0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
 158         0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
 159         0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
 160         0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
 161         0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
 162         0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
 163         0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
 164         0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
 165         0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
 166         0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
 167         0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
 168         0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
 169         0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
 170         0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
 171         0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
 172         0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
 173         0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
 174         0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
 175         0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
 176         0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
 177         0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
 178         0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
 179         0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
 180         0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
 181         0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
 182         0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
 183         0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
 184         0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
 185         0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
 186         0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
 187         0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
 188         0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
 189         0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
 190         0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
 191         0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
 192         0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
 193         0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
 194         0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
 195         0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
 196         0xffffffffffffffff,
 197     };
 198     return word[byte];
 199 }
 200
 201 /* Similarly for half-word elements.
 202  *  for (i = 0; i < 256; ++i) {
 203  *      unsigned long m = 0;
 204  *      if (i & 0xaa) {
 205  *          continue;
 206  *      }
 207  *      for (j = 0; j < 8; j += 2) {
 208  *          if ((i >> j) & 1) {
 209  *              m |= 0xfffful << (j << 3);
 210  *          }
 211  *      }
 212  *      printf("[0x%x] = 0x%016lx,\n", i, m);
 213  *  }
 214  */
 215 static inline uint64_t expand_pred_h(uint8_t byte)
 216 {
 217     static const uint64_t word[] = {
 218         [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
 219         [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
 220         [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
 221         [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
 222         [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
 223         [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
 224         [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
 225         [0x55] = 0xffffffffffffffff,
 226     };
 227     return word[byte & 0x55];
 228 }
 229
 230 /* Similarly for single word elements.  */
 231 static inline uint64_t expand_pred_s(uint8_t byte)
 232 {
 233     static const uint64_t word[] = {
 234         [0x01] = 0x00000000ffffffffull,
 235         [0x10] = 0xffffffff00000000ull,
 236         [0x11] = 0xffffffffffffffffull,
 237     };
 238     return word[byte & 0x11];
 239 }
 240
 241 /* Swap 16-bit words within a 32-bit word.  */
 242 static inline uint32_t hswap32(uint32_t h)
 243 {
 244     return rol32(h, 16);
 245 }
 246
 247 /* Swap 16-bit words within a 64-bit word.  */
 248 static inline uint64_t hswap64(uint64_t h)
 249 {
 250     uint64_t m = 0x0000ffff0000ffffull;
 251     h = rol64(h, 32);
 252     return ((h & m) << 16) | ((h >> 16) & m);
 253 }
 254
 255 /* Swap 32-bit words within a 64-bit word.  */
 256 static inline uint64_t wswap64(uint64_t h)
 257 {
 258     return rol64(h, 32);
 259 }
 260
/* Expand a helper for a logical operation on predicate registers:
 * D = FUNC(N, M, G), evaluated one 64-bit word at a time.
 * opr_sz / 8 words are processed, so simd_oprsz(desc) is evidently
 * a size in bytes.
 */
#define LOGICAL_PPPP(NAME, FUNC) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    uintptr_t opr_sz = simd_oprsz(desc);                                  \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
    uintptr_t i;                                                          \
    for (i = 0; i < opr_sz / 8; ++i) {                                    \
        d[i] = FUNC(n[i], m[i], g[i]);                                    \
    }                                                                     \
}

/* Word-level operations.  All but SEL clear the result bits for which
 * the governing predicate G is clear; SEL takes N where G is set and
 * M where it is clear.
 */
#define DO_AND(N, M, G)  (((N) & (M)) & (G))
#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

/* The three-argument forms are not needed past this point; the names
 * DO_AND etc. are reused below with two arguments.
 */
#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP
 299
/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 *
 * NAME is the helper to define, TYPE the element type, H the host-endian
 * fixup macro for byte offsets of that element size, and OP the
 * two-argument operation.
 *
 * The vector is walked in 16-byte segments.  For each segment the 16
 * governing predicate bits are loaded (i >> 3 is the predicate byte
 * offset, i.e. one predicate bit per vector byte).  Only the lowest
 * predicate bit of each element is tested; the predicate is then shifted
 * by sizeof(TYPE) bits to reach the next element.  Elements whose
 * predicate bit is clear are not written, so the destination keeps its
 * previous contents there.
 */
/* ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; ) {                                         \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            if (pg & 1) {                                               \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                TYPE mm = *(TYPE *)(vm + H(i));                         \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
            }                                                           \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
        } while (i & 15);                                               \
    }                                                                   \
}
 324
/* Similarly, specialized for 64-bit operands.
 *
 * With 8-byte elements each element is governed by bit 0 of its own
 * predicate byte, so the vector is indexed directly as an array of TYPE
 * and the predicate as an array of bytes (with the host-endian fixup H1
 * on the byte index).  Elements whose predicate bit is clear are not
 * written.
 */
#define DO_ZPZZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn, *m = vm;                             \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i], mm = m[i];                          \
            d[i] = OP(nn, mm);                                  \
        }                                                       \
    }                                                           \
}
 339
/* Two-operand element operations used with the expanders.
 * Several of these do not parenthesize their arguments and MAX, MIN and
 * ABD evaluate them more than once, so they must only be invoked with
 * plain variables, as the expanders do.
 */
#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
/* Absolute difference: always the larger operand minus the smaller.  */
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)
/* Division yielding 0 for a zero divisor.
 * NOTE(review): only a zero divisor is special-cased; for signed types
 * the minimum value divided by -1 is still evaluated as N / M.
 */
#define DO_DIV(N, M)  (M ? N / M : 0)
 351
/* Predicated three-operand helpers, one per element size.
 * The H argument follows the element size: H1 for bytes, H1_2 for
 * half-words, H1_4 for words; 64-bit elements use DO_ZPZZ_D.
 */

/* Bitwise AND.  */
DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

/* Bitwise inclusive OR.  */
DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

/* Bitwise exclusive OR.  */
DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

/* Bit clear: N AND NOT M.  */
DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

/* Addition.  */
DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

/* Subtraction.  */
DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

/* Signed maximum: the element type is signed so that the comparison
   in DO_MAX is a signed one.  */
DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

/* Unsigned maximum.  */
DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

/* Signed minimum.  */
DO_ZPZZ(sve_smin_zpzz_b, int8_t,  H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t,  H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t,  H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t,  DO_MIN)

/* Unsigned minimum.  */
DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

/* Signed absolute difference.  */
DO_ZPZZ(sve_sabd_zpzz_b, int8_t,  H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t,  H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t,  H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t,  DO_ABD)

/* Unsigned absolute difference.  */
DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
 411
 412 /* Because the computation type is at least twice as large as required,
 413    these work for both signed and unsigned source types.  */
 414 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
 415 {
 416     return (n * m) >> 8;
 417 }
 418
 419 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
 420 {
 421     return (n * m) >> 16;
 422 }
 423
 424 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
 425 {
 426     return (n * m) >> 32;
 427 }
 428
 429 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
 430 {
 431     uint64_t lo, hi;
 432     muls64(&lo, &hi, n, m);
 433     return hi;
 434 }
 435
 436 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
 437 {
 438     uint64_t lo, hi;
 439     mulu64(&lo, &hi, n, m);
 440     return hi;
 441 }
 442
/* Predicated multiply, keeping the low half of the product.  */
DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

/* Predicated signed multiply, keeping the high half of the product.
 * The signed element type makes the operands sign-extend when passed to
 * the do_mulh_* functions.  The 64-bit form passes the raw bits to
 * do_smulh_d.
 */
DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

/* Predicated unsigned multiply, keeping the high half of the product.
 * The unsigned element type makes the operands zero-extend.
 */
DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
 457
 458 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_DIV)
 459 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_DIV)
 460
 461 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_DIV)
 462 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_DIV)
 463
/* Note that all bits of the shift are significant
   and not modulo the element size.
   ASR clamps the count to (element bits - 1).
   LSR and LSL produce 0 once the count reaches the element width.  */
#define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
#define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
#define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)
 469
 470 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
 471 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
 472 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
 473
 474 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
 475 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
 476 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
 477
 478 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
 479 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
 480 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
 481
 482 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
 483 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
 484 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
 485
 486 #undef DO_ZPZZ
 487 #undef DO_ZPZZ_D
 488
/* Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide".  That is, for D = N op M, the same 64-bit
 * value of M is used with all of the narrower values of N.
 *
 * The vector is walked in 8-byte segments.  For each segment one
 * predicate byte and one TYPEW value of M are loaded, then every TYPE
 * element of N in the segment is combined with that M value.  As with
 * the other predicated expanders, only the lowest predicate bit of each
 * element is tested and inactive elements are not written.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; ) {                                         \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                     \
        TYPEW mm = *(TYPEW *)(vm + i);                                  \
        do {                                                            \
            if (pg & 1) {                                               \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
            }                                                           \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
        } while (i & 7);                                                \
    }                                                                   \
}

/* Predicated shifts of byte elements by a 64-bit count.  */
DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

/* Predicated shifts of half-word elements by a 64-bit count.  */
DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

/* Predicated shifts of word elements by a 64-bit count.  */
DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW
 523
/* Fully general two-operand expander, controlled by a predicate.
 *
 * NAME is the helper to define, TYPE the element type, H the host-endian
 * fixup macro for byte offsets of that element size, and OP the
 * one-argument operation.
 *
 * The vector is walked in 16-byte segments, loading 16 predicate bits
 * per segment.  Only the lowest predicate bit of each element is tested,
 * and elements whose bit is clear are not written.
 */
#define DO_ZPZ(NAME, TYPE, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn);                  \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}
 541
/* Similarly, specialized for 64-bit operands.
 *
 * Each 8-byte element is governed by bit 0 of its own predicate byte,
 * so the vector is indexed as an array of TYPE and the predicate as an
 * array of bytes (with the host-endian fixup H1 on the byte index).
 * Elements whose predicate bit is clear are not written.
 */
#define DO_ZPZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn;                                      \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i];                                     \
            d[i] = OP(nn);                                      \
        }                                                       \
    }                                                           \
}
 556
/* Count leading sign bits.  The narrow forms use the 32-bit routine on
 * the element widened to 32 bits and subtract the 24 (or 16) extra
 * leading bits that the widening added.
 */
#define DO_CLS_B(N)   (clrsb32(N) - 24)
#define DO_CLS_H(N)   (clrsb32(N) - 16)

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

/* Count leading zeros, with the same adjustment for the narrow forms.  */
#define DO_CLZ_B(N)   (clz32(N) - 24)
#define DO_CLZ_H(N)   (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

/* Population count, using the ctpop routine of the element width.  */
DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
 577
/* Logical NOT: 1 if the element is zero, otherwise 0.  */
#define DO_CNOT(N)    (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

/* Floating-point absolute value on the raw bits: AND with a mask of
 * every bit except the top one, which clears the top (sign) bit.
 */
#define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

/* Floating-point negate on the raw bits: XOR with the complement of the
 * above mask, which flips the top (sign) bit of the element.
 */
#define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

/* Bitwise NOT.  */
#define DO_NOT(N)    (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
 603
/* Sign and zero extension of the low 8, 16 or 32 bits of an element.
 * The cast truncates to the narrow type; storing the result back into
 * the wider element then sign- or zero-extends it.
 */
#define DO_SXTB(N)    ((int8_t)N)
#define DO_SXTH(N)    ((int16_t)N)
#define DO_SXTS(N)    ((int32_t)N)
#define DO_UXTB(N)    ((uint8_t)N)
#define DO_UXTH(N)    ((uint16_t)N)
#define DO_UXTS(N)    ((uint32_t)N)

/* Sign extension within half-word, word and double-word elements.  */
DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

/* Zero extension within half-word, word and double-word elements.  */
DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
 624
/* Integer absolute value; the element types are signed so that the
   comparison with zero is meaningful.  */
#define DO_ABS(N)    (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

/* Integer negate, on unsigned element types.  */
#define DO_NEG(N)    (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

/* Reverse the bytes within each element.  */
DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)

/* Reverse the half-words within each element.  */
DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)

/* Reverse the words within each double-word element.  */
DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)

/* Reverse the bits within each element.  */
DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
 652
/* Three-operand expander, unpredicated, in which the third operand is "wide".
 *
 * The vector is walked in 8-byte segments.  For each segment one TYPEW
 * value of M is loaded and combined with every TYPE element of N in the
 * segment.  Every element of the destination is written.
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    for (i = 0; i < opr_sz; ) {                                \
        TYPEW mm = *(TYPEW *)(vm + i);                         \
        do {                                                   \
            TYPE nn = *(TYPE *)(vn + H(i));                    \
            *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
            i += sizeof(TYPE);                                 \
        } while (i & 7);                                       \
    }                                                          \
}

/* Unpredicated shifts of byte elements by a 64-bit count.  */
DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

/* Unpredicated shifts of half-word elements by a 64-bit count.  */
DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

/* Unpredicated shifts of word elements by a 64-bit count.  */
DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW
 682
/* The single-operand operation macros and the two-operand expanders
 * are not used past this point.
 */
#undef DO_CLS_B
#undef DO_CLS_H
#undef DO_CLZ_B
#undef DO_CLZ_H
#undef DO_CNOT
#undef DO_FABS
#undef DO_FNEG
#undef DO_ABS
#undef DO_NEG
#undef DO_ZPZ
#undef DO_ZPZ_D
 694
/* Two-operand reduction expander, controlled by a predicate.
 * The difference between TYPERED and TYPERET has to do with
 * sign-extension.  E.g. for SMAX, TYPERED must be signed,
 * but TYPERET must be unsigned so that e.g. a 32-bit value
 * is not sign-extended to the ABI uint64_t return type.
 *
 * TYPEELT is the element type read from the vector, INIT the starting
 * value of the accumulator and OP the two-argument operation folded over
 * the active elements as ret = OP(ret, element).  Elements whose
 * predicate bit is clear are skipped.  The vector is walked in 16-byte
 * segments with 16 predicate bits loaded per segment.
 */
/* ??? If we were to vectorize this by hand the reduction ordering
 * would change.  For integer operands, this is perfectly fine.
 */
#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
{                                                          \
    intptr_t i, opr_sz = simd_oprsz(desc);                 \
    TYPERED ret = INIT;                                    \
    for (i = 0; i < opr_sz; ) {                            \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
        do {                                               \
            if (pg & 1) {                                  \
                TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
                ret = OP(ret, nn);                         \
            }                                              \
            i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
        } while (i & 15);                                  \
    }                                                      \
    return (TYPERET)ret;                                   \
}
 721
 722 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
 723 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
 724 {                                                          \
 725     intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
 726     TYPEE *n = vn;                                         \
 727     uint8_t *pg = vg;                                      \
 728     TYPER ret = INIT;                                      \
 729     for (i = 0; i < opr_sz; i += 1) {                      \
 730         if (pg[H1(i)] & 1) {                               \
 731             TYPEE nn = n[i];                               \
 732             ret = OP(ret, nn);                             \
 733         }                                                  \
 734     }                                                      \
 735     return ret;                                            \
 736 }
 737
/* Bitwise reductions.  INIT is the identity of the operation:
 * zero for OR and EOR, all-ones for AND.
 */
DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)

DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)

DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)

/* Signed sum: elements are loaded as signed and accumulated in a
 * uint64_t.  No 64-bit element variant is instantiated here.
 */
DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)

/* Unsigned sum, accumulated in a uint64_t.  */
DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)

/* Maximum and minimum.  INIT is the smallest value of the element type
 * for max and the largest for min, so it is what is returned when no
 * element is active.  The signed variants reduce in a signed type but
 * return through the unsigned type of the same width.
 */
DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)

DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)

DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)

/* For the unsigned types, -1 converts to the maximum value.  */
DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)

#undef DO_VPZ
#undef DO_VPZ_D

/* The element-operation macros are no longer needed.  */
#undef DO_AND
#undef DO_ORR
#undef DO_EOR
#undef DO_BIC
#undef DO_ADD
#undef DO_SUB
#undef DO_MAX
#undef DO_MIN
#undef DO_ABD
#undef DO_MUL
#undef DO_DIV
#undef DO_ASR
#undef DO_LSR
#undef DO_LSL
 799
 800 /* Similar to the ARM LastActiveElement pseudocode function, except the
 801    result is multiplied by the element size.  This includes the not found
 802    indication; e.g. not found for esz=3 is -8.  */
 803 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
 804 {
 805     uint64_t mask = pred_esz_masks[esz];
 806     intptr_t i = words;
 807
 808     do {
 809         uint64_t this_g = g[--i] & mask;
 810         if (this_g) {
 811             return i * 64 + (63 - clz64(this_g));
 812         }
 813     } while (i > 0);
 814     return (intptr_t)-1 << esz;
 815 }
 816
 817 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
 818 {
 819     uint32_t flags = PREDTEST_INIT;
 820     uint64_t *d = vd, *g = vg;
 821     intptr_t i = 0;
 822
 823     do {
 824         uint64_t this_d = d[i];
 825         uint64_t this_g = g[i];
 826
 827         if (this_g) {
 828             if (!(flags & 4)) {
 829                 /* Set in D the first bit of G.  */
 830                 this_d |= this_g & -this_g;
 831                 d[i] = this_d;
 832             }
 833             flags = iter_predtest_fwd(this_d, this_g, flags);
 834         }
 835     } while (++i < words);
 836
 837     return flags;
 838 }
 839
/* Find the next active predicate element after the last one set in VD.
 *
 * VD is both input and output: its last set bit (after masking by
 * element size) gives the starting point, and on return it holds at
 * most one set bit.  VG is the governing predicate.  PRED_DESC holds
 * the number of 64-bit predicate words in its low SIMD_OPRSZ_BITS bits
 * and a 2-bit element-size index at SIMD_DATA_SHIFT.
 *
 * Returns the flags word accumulated by iter_predtest_fwd over every
 * word of the new VD against VG masked by element size.
 */
uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg, esz_mask;
    intptr_t i, next;

    /* Start one element past the last active element of VD.  When VD
       has none, last_active_element returns -(1 << esz) and the search
       therefore starts at bit 0.  */
    next = last_active_element(vd, words, esz) + (1 << esz);
    esz_mask = pred_esz_masks[esz];

    /* Similar to the pseudocode for pnext, but scaled by ESZ
       so that we find the correct bit.  */
    if (next < words * 64) {
        uint64_t mask = -1;

        /* Within the first word, ignore the bits below the start.  */
        if (next & 63) {
            mask = ~((1ull << (next & 63)) - 1);
            next &= -64;
        }
        do {
            uint64_t this_g = g[next / 64] & esz_mask & mask;
            if (this_g != 0) {
                next = (next & -64) + ctz64(this_g);
                break;
            }
            next += 64;
            mask = -1;
        } while (next < words * 64);
    }

    /* Rewrite VD with only bit NEXT set.  If nothing was found, NEXT is
       at least words * 64, no word index matches, and VD is all zeros.  */
    i = 0;
    do {
        uint64_t this_d = 0;
        if (i == next / 64) {
            this_d = 1ull << (next & 63);
        }
        d[i] = this_d;
        flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
    } while (++i < words);

    return flags;
}
 883
 884 /* Store zero into every active element of Zd.  We will use this for two
 885  * and three-operand predicated instructions for which logic dictates a
 886  * zero result.  In particular, logical shift by element size, which is
 887  * otherwise undefined on the host.
 888  *
 889  * For element sizes smaller than uint64_t, we use tables to expand
 890  * the N bits of the controlling predicate to a byte mask, and clear
 891  * those bytes.
 892  */
 893 void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
 894 {
 895     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
 896     uint64_t *d = vd;
 897     uint8_t *pg = vg;
 898     for (i = 0; i < opr_sz; i += 1) {
 899         d[i] &= ~expand_pred_b(pg[H1(i)]);
 900     }
 901 }
 902
 903 void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
 904 {
 905     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
 906     uint64_t *d = vd;
 907     uint8_t *pg = vg;
 908     for (i = 0; i < opr_sz; i += 1) {
 909         d[i] &= ~expand_pred_h(pg[H1(i)]);
 910     }
 911 }
 912
 913 void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
 914 {
 915     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
 916     uint64_t *d = vd;
 917     uint8_t *pg = vg;
 918     for (i = 0; i < opr_sz; i += 1) {
 919         d[i] &= ~expand_pred_s(pg[H1(i)]);
 920     }
 921 }
 922
 923 void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
 924 {
 925     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
 926     uint64_t *d = vd;
 927     uint8_t *pg = vg;
 928     for (i = 0; i < opr_sz; i += 1) {
 929         if (pg[H1(i)] & 1) {
 930             d[i] = 0;
 931         }
 932     }
 933 }
 934
 935 /* Three-operand expander, immediate operand, controlled by a predicate.
 936  */
 937 #define DO_ZPZI(NAME, TYPE, H, OP)                              \
 938 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
 939 {                                                               \
 940     intptr_t i, opr_sz = simd_oprsz(desc);                      \
 941     TYPE imm = simd_data(desc);                                 \
 942     for (i = 0; i < opr_sz; ) {                                 \
 943         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
 944         do {                                                    \
 945             if (pg & 1) {                                       \
 946                 TYPE nn = *(TYPE *)(vn + H(i));                 \
 947                 *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
 948             }                                                   \
 949             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
 950         } while (i & 15);                                       \
 951     }                                                           \
 952 }
 953
 954 /* Similarly, specialized for 64-bit operands.  */
 955 #define DO_ZPZI_D(NAME, TYPE, OP)                               \
 956 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
 957 {                                                               \
 958     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
 959     TYPE *d = vd, *n = vn;                                      \
 960     TYPE imm = simd_data(desc);                                 \
 961     uint8_t *pg = vg;                                           \
 962     for (i = 0; i < opr_sz; i += 1) {                           \
 963         if (pg[H1(i)] & 1) {                                    \
 964             TYPE nn = n[i];                                     \
 965             d[i] = OP(nn, imm);                                 \
 966         }                                                       \
 967     }                                                           \
 968 }
 969
/* Plain shifts by the immediate.  DO_SHR serves both the asr and lsr
 * helpers below; which one results depends on the signedness of the
 * TYPE it is instantiated with, so the asr forms rely on the compiler
 * shifting negative signed values arithmetically.
 */
#define DO_SHR(N, M)  (N >> M)
#define DO_SHL(N, M)  (N << M)

/* Arithmetic shift right for division.  This rounds negative numbers
   toward zero as per signed division.  Therefore before shifting,
   when N is negative, add 2**M-1.  */
#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)

/* Arithmetic shift right: signed element types.  */
DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)

/* Logical shift right: unsigned element types.  */
DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)

/* Logical shift left.  */
DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)

/* Arithmetic shift right for division: signed element types.  */
DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)

#undef DO_SHR
#undef DO_SHL
#undef DO_ASRD
#undef DO_ZPZI
#undef DO_ZPZI_D
1003
/* Expander for a predicated operation with three vector inputs (Za, Zn,
 * Zm) and one vector output, for elements narrower than 64 bits.  OP is
 * invoked as OP(a, n, m); inactive elements of Zd are left untouched.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
                  void *vg, uint32_t desc)                    \
{                                                             \
    intptr_t total = simd_oprsz(desc);                        \
    intptr_t ofs = 0;                                         \
    while (ofs < total) {                                     \
        /* Predicate bits for the next 16 vector bytes.  */   \
        uint16_t bits = *(uint16_t *)(vg + H1_2(ofs >> 3));   \
        do {                                                  \
            if (bits & 1) {                                   \
                /* Load all inputs before storing, since */   \
                /* the destination may alias an input.   */   \
                TYPE acc = *(TYPE *)(va + H(ofs));            \
                TYPE lhs = *(TYPE *)(vn + H(ofs));            \
                TYPE rhs = *(TYPE *)(vm + H(ofs));            \
                *(TYPE *)(vd + H(ofs)) = OP(acc, lhs, rhs);   \
            }                                                 \
            ofs += sizeof(TYPE);                              \
            bits >>= sizeof(TYPE);                            \
        } while (ofs & 15);                                   \
    }                                                         \
}
1024
/* The same three-input predicated expander, specialized for 64-bit
 * elements: one predicate byte per element, low bit tested.
 */
#define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
                  void *vg, uint32_t desc)                    \
{                                                             \
    intptr_t count = simd_oprsz(desc) / 8;                    \
    TYPE *dst = vd;                                           \
    TYPE *addend = va;                                        \
    TYPE *lhs = vn;                                           \
    TYPE *rhs = vm;                                           \
    uint8_t *pred = vg;                                       \
    intptr_t e = 0;                                           \
    while (e < count) {                                       \
        if (pred[H1(e)] & 1) {                                \
            TYPE aa = addend[e];                              \
            TYPE nn = lhs[e];                                 \
            TYPE mm = rhs[e];                                 \
            dst[e] = OP(aa, nn, mm);                          \
        }                                                     \
        e++;                                                  \
    }                                                         \
}
1040
1041 #define DO_MLA(A, N, M)  (A + N * M)
1042 #define DO_MLS(A, N, M)  (A - N * M)
1043
1044 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1045 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1046
1047 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1048 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1049
1050 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1051 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1052
1053 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1054 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1055
1056 #undef DO_MLA
1057 #undef DO_MLS
1058 #undef DO_ZPZZZ
1059 #undef DO_ZPZZZ_D
1060
1061 void HELPER(sve_index_b)(void *vd, uint32_t start,
1062                          uint32_t incr, uint32_t desc)
1063 {
1064     intptr_t i, opr_sz = simd_oprsz(desc);
1065     uint8_t *d = vd;
1066     for (i = 0; i < opr_sz; i += 1) {
1067         d[H1(i)] = start + i * incr;
1068     }
1069 }
1070
1071 void HELPER(sve_index_h)(void *vd, uint32_t start,
1072                          uint32_t incr, uint32_t desc)
1073 {
1074     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1075     uint16_t *d = vd;
1076     for (i = 0; i < opr_sz; i += 1) {
1077         d[H2(i)] = start + i * incr;
1078     }
1079 }
1080
1081 void HELPER(sve_index_s)(void *vd, uint32_t start,
1082                          uint32_t incr, uint32_t desc)
1083 {
1084     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1085     uint32_t *d = vd;
1086     for (i = 0; i < opr_sz; i += 1) {
1087         d[H4(i)] = start + i * incr;
1088     }
1089 }
1090
1091 void HELPER(sve_index_d)(void *vd, uint64_t start,
1092                          uint64_t incr, uint32_t desc)
1093 {
1094     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1095     uint64_t *d = vd;
1096     for (i = 0; i < opr_sz; i += 1) {
1097         d[i] = start + i * incr;
1098     }
1099 }
1100
1101 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1102 {
1103     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1104     uint32_t sh = simd_data(desc);
1105     uint32_t *d = vd, *n = vn, *m = vm;
1106     for (i = 0; i < opr_sz; i += 1) {
1107         d[i] = n[i] + (m[i] << sh);
1108     }
1109 }
1110
1111 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1112 {
1113     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1114     uint64_t sh = simd_data(desc);
1115     uint64_t *d = vd, *n = vn, *m = vm;
1116     for (i = 0; i < opr_sz; i += 1) {
1117         d[i] = n[i] + (m[i] << sh);
1118     }
1119 }
1120
1121 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1122 {
1123     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1124     uint64_t sh = simd_data(desc);
1125     uint64_t *d = vd, *n = vn, *m = vm;
1126     for (i = 0; i < opr_sz; i += 1) {
1127         d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1128     }
1129 }
1130
1131 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1132 {
1133     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1134     uint64_t sh = simd_data(desc);
1135     uint64_t *d = vd, *n = vn, *m = vm;
1136     for (i = 0; i < opr_sz; i += 1) {
1137         d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1138     }
1139 }
1140
1141 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1142 {
1143     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
1144     static const uint16_t coeff[] = {
1145         0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1146         0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1147         0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1148         0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1149     };
1150     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1151     uint16_t *d = vd, *n = vn;
1152
1153     for (i = 0; i < opr_sz; i++) {
1154         uint16_t nn = n[i];
1155         intptr_t idx = extract32(nn, 0, 5);
1156         uint16_t exp = extract32(nn, 5, 5);
1157         d[i] = coeff[idx] | (exp << 10);
1158     }
1159 }
1160
1161 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1162 {
1163     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
1164     static const uint32_t coeff[] = {
1165         0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1166         0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1167         0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1168         0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1169         0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1170         0x1ef532, 0x20b051, 0x227043, 0x243516,
1171         0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1172         0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1173         0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1174         0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1175         0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1176         0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1177         0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1178         0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1179         0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1180         0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1181     };
1182     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1183     uint32_t *d = vd, *n = vn;
1184
1185     for (i = 0; i < opr_sz; i++) {
1186         uint32_t nn = n[i];
1187         intptr_t idx = extract32(nn, 0, 6);
1188         uint32_t exp = extract32(nn, 6, 8);
1189         d[i] = coeff[idx] | (exp << 23);
1190     }
1191 }
1192
1193 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1194 {
1195     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
1196     static const uint64_t coeff[] = {
1197         0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1198         0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1199         0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1200         0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1201         0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1202         0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1203         0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1204         0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1205         0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1206         0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1207         0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1208         0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1209         0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1210         0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1211         0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1212         0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1213         0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1214         0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1215         0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1216         0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1217         0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1218         0xFA7C1819E90D8ull,
1219     };
1220     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1221     uint64_t *d = vd, *n = vn;
1222
1223     for (i = 0; i < opr_sz; i++) {
1224         uint64_t nn = n[i];
1225         intptr_t idx = extract32(nn, 0, 6);
1226         uint64_t exp = extract32(nn, 6, 11);
1227         d[i] = coeff[idx] | (exp << 52);
1228     }
1229 }
1230
1231 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1232 {
1233     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1234     uint16_t *d = vd, *n = vn, *m = vm;
1235     for (i = 0; i < opr_sz; i += 1) {
1236         uint16_t nn = n[i];
1237         uint16_t mm = m[i];
1238         if (mm & 1) {
1239             nn = float16_one;
1240         }
1241         d[i] = nn ^ (mm & 2) << 14;
1242     }
1243 }
1244
1245 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1246 {
1247     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1248     uint32_t *d = vd, *n = vn, *m = vm;
1249     for (i = 0; i < opr_sz; i += 1) {
1250         uint32_t nn = n[i];
1251         uint32_t mm = m[i];
1252         if (mm & 1) {
1253             nn = float32_one;
1254         }
1255         d[i] = nn ^ (mm & 2) << 30;
1256     }
1257 }
1258
1259 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1260 {
1261     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1262     uint64_t *d = vd, *n = vn, *m = vm;
1263     for (i = 0; i < opr_sz; i += 1) {
1264         uint64_t nn = n[i];
1265         uint64_t mm = m[i];
1266         if (mm & 1) {
1267             nn = float64_one;
1268         }
1269         d[i] = nn ^ (mm & 2) << 62;
1270     }
1271 }
1272
1273 /*
1274  * Signed saturating addition with scalar operand.
1275  */
1276
1277 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1278 {
1279     intptr_t i, oprsz = simd_oprsz(desc);
1280
1281     for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1282         int r = *(int8_t *)(a + i) + b;
1283         if (r > INT8_MAX) {
1284             r = INT8_MAX;
1285         } else if (r < INT8_MIN) {
1286             r = INT8_MIN;
1287         }
1288         *(int8_t *)(d + i) = r;
1289     }
1290 }
1291
1292 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1293 {
1294     intptr_t i, oprsz = simd_oprsz(desc);
1295
1296     for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1297         int r = *(int16_t *)(a + i) + b;
1298         if (r > INT16_MAX) {
1299             r = INT16_MAX;
1300         } else if (r < INT16_MIN) {
1301             r = INT16_MIN;
1302         }
1303         *(int16_t *)(d + i) = r;
1304     }
1305 }
1306
1307 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1308 {
1309     intptr_t i, oprsz = simd_oprsz(desc);
1310
1311     for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1312         int64_t r = *(int32_t *)(a + i) + b;
1313         if (r > INT32_MAX) {
1314             r = INT32_MAX;
1315         } else if (r < INT32_MIN) {
1316             r = INT32_MIN;
1317         }
1318         *(int32_t *)(d + i) = r;
1319     }
1320 }
1321
1322 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1323 {
1324     intptr_t i, oprsz = simd_oprsz(desc);
1325
1326     for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1327         int64_t ai = *(int64_t *)(a + i);
1328         int64_t r = ai + b;
1329         if (((r ^ ai) & ~(ai ^ b)) < 0) {
1330             /* Signed overflow.  */
1331             r = (r < 0 ? INT64_MAX : INT64_MIN);
1332         }
1333         *(int64_t *)(d + i) = r;
1334     }
1335 }
1336
1337 /*
1338  * Unsigned saturating addition with scalar operand.
1339  */
1340
1341 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1342 {
1343     intptr_t i, oprsz = simd_oprsz(desc);
1344
1345     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1346         int r = *(uint8_t *)(a + i) + b;
1347         if (r > UINT8_MAX) {
1348             r = UINT8_MAX;
1349         } else if (r < 0) {
1350             r = 0;
1351         }
1352         *(uint8_t *)(d + i) = r;
1353     }
1354 }
1355
1356 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1357 {
1358     intptr_t i, oprsz = simd_oprsz(desc);
1359
1360     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1361         int r = *(uint16_t *)(a + i) + b;
1362         if (r > UINT16_MAX) {
1363             r = UINT16_MAX;
1364         } else if (r < 0) {
1365             r = 0;
1366         }
1367         *(uint16_t *)(d + i) = r;
1368     }
1369 }
1370
1371 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1372 {
1373     intptr_t i, oprsz = simd_oprsz(desc);
1374
1375     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1376         int64_t r = *(uint32_t *)(a + i) + b;
1377         if (r > UINT32_MAX) {
1378             r = UINT32_MAX;
1379         } else if (r < 0) {
1380             r = 0;
1381         }
1382         *(uint32_t *)(d + i) = r;
1383     }
1384 }
1385
1386 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1387 {
1388     intptr_t i, oprsz = simd_oprsz(desc);
1389
1390     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1391         uint64_t r = *(uint64_t *)(a + i) + b;
1392         if (r < b) {
1393             r = UINT64_MAX;
1394         }
1395         *(uint64_t *)(d + i) = r;
1396     }
1397 }
1398
1399 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1400 {
1401     intptr_t i, oprsz = simd_oprsz(desc);
1402
1403     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1404         uint64_t ai = *(uint64_t *)(a + i);
1405         *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1406     }
1407 }
1408
1409 /* Two operand predicated copy immediate with merge.  All valid immediates
1410  * can fit within 17 signed bits in the simd_data field.
1411  */
1412 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1413                          uint64_t mm, uint32_t desc)
1414 {
1415     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1416     uint64_t *d = vd, *n = vn;
1417     uint8_t *pg = vg;
1418
1419     mm = dup_const(MO_8, mm);
1420     for (i = 0; i < opr_sz; i += 1) {
1421         uint64_t nn = n[i];
1422         uint64_t pp = expand_pred_b(pg[H1(i)]);
1423         d[i] = (mm & pp) | (nn & ~pp);
1424     }
1425 }
1426
1427 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1428                          uint64_t mm, uint32_t desc)
1429 {
1430     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1431     uint64_t *d = vd, *n = vn;
1432     uint8_t *pg = vg;
1433
1434     mm = dup_const(MO_16, mm);
1435     for (i = 0; i < opr_sz; i += 1) {
1436         uint64_t nn = n[i];
1437         uint64_t pp = expand_pred_h(pg[H1(i)]);
1438         d[i] = (mm & pp) | (nn & ~pp);
1439     }
1440 }
1441
1442 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1443                          uint64_t mm, uint32_t desc)
1444 {
1445     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1446     uint64_t *d = vd, *n = vn;
1447     uint8_t *pg = vg;
1448
1449     mm = dup_const(MO_32, mm);
1450     for (i = 0; i < opr_sz; i += 1) {
1451         uint64_t nn = n[i];
1452         uint64_t pp = expand_pred_s(pg[H1(i)]);
1453         d[i] = (mm & pp) | (nn & ~pp);
1454     }
1455 }
1456
1457 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1458                          uint64_t mm, uint32_t desc)
1459 {
1460     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1461     uint64_t *d = vd, *n = vn;
1462     uint8_t *pg = vg;
1463
1464     for (i = 0; i < opr_sz; i += 1) {
1465         uint64_t nn = n[i];
1466         d[i] = (pg[H1(i)] & 1 ? mm : nn);
1467     }
1468 }
1469
1470 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1471 {
1472     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1473     uint64_t *d = vd;
1474     uint8_t *pg = vg;
1475
1476     val = dup_const(MO_8, val);
1477     for (i = 0; i < opr_sz; i += 1) {
1478         d[i] = val & expand_pred_b(pg[H1(i)]);
1479     }
1480 }
1481
1482 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1483 {
1484     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1485     uint64_t *d = vd;
1486     uint8_t *pg = vg;
1487
1488     val = dup_const(MO_16, val);
1489     for (i = 0; i < opr_sz; i += 1) {
1490         d[i] = val & expand_pred_h(pg[H1(i)]);
1491     }
1492 }
1493
1494 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1495 {
1496     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1497     uint64_t *d = vd;
1498     uint8_t *pg = vg;
1499
1500     val = dup_const(MO_32, val);
1501     for (i = 0; i < opr_sz; i += 1) {
1502         d[i] = val & expand_pred_s(pg[H1(i)]);
1503     }
1504 }
1505
1506 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1507 {
1508     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1509     uint64_t *d = vd;
1510     uint8_t *pg = vg;
1511
1512     for (i = 0; i < opr_sz; i += 1) {
1513         d[i] = (pg[H1(i)] & 1 ? val : 0);
1514     }
1515 }
1516
/* Overlap-safe copy of N bytes from VS to VD within vector registers.
 *
 * Big-endian hosts need to frob the byte indices.  If the copy
 * happens to be 8-byte aligned, then no frobbing necessary.
 *
 * The copy unit is the largest of 8, 4, 2 or 1 bytes that divides the
 * destination address, the source address and the length.  For the
 * units smaller than 8 bytes each address is passed through the
 * matching H1_4/H1_2/H1 fixup.
 */
static void swap_memmove(void *vd, void *vs, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t s = (uintptr_t)vs;
    /* Combined misalignment of both addresses and the length.  */
    uintptr_t o = (d | s | n) & 7;
    size_t i;

    /* Little-endian hosts need no fixup: always take the memmove path.  */
#ifndef HOST_WORDS_BIGENDIAN
    o = 0;
#endif
    switch (o) {
    case 0:
        memmove(vd, vs, n);
        break;

    case 4:
        /* Copy forward unless the destination starts inside the source
           range, in which case copy backward so that source data is
           read before it is overwritten.  */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 4) {
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 4;
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        }
        break;

    case 2:
    case 6:
        /* Same direction rule, two bytes at a time.  */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 2) {
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 2;
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        }
        break;

    default:
        /* Odd misalignment: same direction rule, one byte at a time.  */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i++) {
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 1;
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        }
        break;
    }
}
1576
1577 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1578 {
1579     intptr_t opr_sz = simd_oprsz(desc);
1580     size_t n_ofs = simd_data(desc);
1581     size_t n_siz = opr_sz - n_ofs;
1582
1583     if (vd != vm) {
1584         swap_memmove(vd, vn + n_ofs, n_siz);
1585         swap_memmove(vd + n_siz, vm, n_ofs);
1586     } else if (vd != vn) {
1587         swap_memmove(vd + n_siz, vd, n_ofs);
1588         swap_memmove(vd, vn + n_ofs, n_siz);
1589     } else {
1590         /* vd == vn == vm.  Need temp space.  */
1591         ARMVectorReg tmp;
1592         swap_memmove(&tmp, vm, n_ofs);
1593         swap_memmove(vd, vd + n_ofs, n_siz);
1594         memcpy(vd + n_siz, &tmp, n_ofs);
1595     }
1596 }
1597
1598 #define DO_INSR(NAME, TYPE, H) \
1599 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1600 {                                                                  \
1601     intptr_t opr_sz = simd_oprsz(desc);                            \
1602     swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
1603     *(TYPE *)(vd + H(0)) = val;                                    \
1604 }
1605
1606 DO_INSR(sve_insr_b, uint8_t, H1)
1607 DO_INSR(sve_insr_h, uint16_t, H1_2)
1608 DO_INSR(sve_insr_s, uint32_t, H1_4)
1609 DO_INSR(sve_insr_d, uint64_t, )
1610
1611 #undef DO_INSR
1612
1613 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1614 {
1615     intptr_t i, j, opr_sz = simd_oprsz(desc);
1616     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1617         uint64_t f = *(uint64_t *)(vn + i);
1618         uint64_t b = *(uint64_t *)(vn + j);
1619         *(uint64_t *)(vd + i) = bswap64(b);
1620         *(uint64_t *)(vd + j) = bswap64(f);
1621     }
1622 }
1623
1624 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1625 {
1626     intptr_t i, j, opr_sz = simd_oprsz(desc);
1627     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1628         uint64_t f = *(uint64_t *)(vn + i);
1629         uint64_t b = *(uint64_t *)(vn + j);
1630         *(uint64_t *)(vd + i) = hswap64(b);
1631         *(uint64_t *)(vd + j) = hswap64(f);
1632     }
1633 }
1634
1635 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1636 {
1637     intptr_t i, j, opr_sz = simd_oprsz(desc);
1638     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1639         uint64_t f = *(uint64_t *)(vn + i);
1640         uint64_t b = *(uint64_t *)(vn + j);
1641         *(uint64_t *)(vd + i) = rol64(b, 32);
1642         *(uint64_t *)(vd + j) = rol64(f, 32);
1643     }
1644 }
1645
1646 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1647 {
1648     intptr_t i, j, opr_sz = simd_oprsz(desc);
1649     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1650         uint64_t f = *(uint64_t *)(vn + i);
1651         uint64_t b = *(uint64_t *)(vn + j);
1652         *(uint64_t *)(vd + i) = b;
1653         *(uint64_t *)(vd + j) = f;
1654     }
1655 }
1656
1657 #define DO_TBL(NAME, TYPE, H) \
1658 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1659 {                                                              \
1660     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1661     uintptr_t elem = opr_sz / sizeof(TYPE);                    \
1662     TYPE *d = vd, *n = vn, *m = vm;                            \
1663     ARMVectorReg tmp;                                          \
1664     if (unlikely(vd == vn)) {                                  \
1665         n = memcpy(&tmp, vn, opr_sz);                          \
1666     }                                                          \
1667     for (i = 0; i < elem; i++) {                               \
1668         TYPE j = m[H(i)];                                      \
1669         d[H(i)] = j < elem ? n[H(j)] : 0;                      \
1670     }                                                          \
1671 }
1672
1673 DO_TBL(sve_tbl_b, uint8_t, H1)
1674 DO_TBL(sve_tbl_h, uint16_t, H2)
1675 DO_TBL(sve_tbl_s, uint32_t, H4)
1676 DO_TBL(sve_tbl_d, uint64_t, )
1677
1678 #undef TBL
1679
1680 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1681 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
1682 {                                                              \
1683     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1684     TYPED *d = vd;                                             \
1685     TYPES *n = vn;                                             \
1686     ARMVectorReg tmp;                                          \
1687     if (unlikely(vn - vd < opr_sz)) {                          \
1688         n = memcpy(&tmp, n, opr_sz / 2);                       \
1689     }                                                          \
1690     for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
1691         d[HD(i)] = n[HS(i)];                                   \
1692     }                                                          \
1693 }
1694
1695 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1696 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1697 DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1698
1699 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1700 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1701 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1702
1703 #undef DO_UNPK
1704
/* Entry I selects the even-numbered units of 2**I bits within a 64-bit
 * word, i.e. the bits belonging to the even-numbered predicate elements
 * of that width.  The same pattern is continued up to 16-bit units
 * because expand_bits/compress_bits index this table as well.
 */
static const uint64_t even_bit_esz_masks[5] = {
    [0] = 0x5555555555555555ull,    /* 1-bit units */
    [1] = 0x3333333333333333ull,    /* 2-bit units */
    [2] = 0x0f0f0f0f0f0f0f0full,    /* 4-bit units */
    [3] = 0x00ff00ff00ff00ffull,    /* 8-bit units */
    [4] = 0x0000ffff0000ffffull,    /* 16-bit units */
};
1716
1717 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1718  * For N==0, this corresponds to the operation that in qemu/bitops.h
1719  * we call half_shuffle64; this algorithm is from Hacker's Delight,
1720  * section 7-2 Shuffling Bits.
1721  */
1722 static uint64_t expand_bits(uint64_t x, int n)
1723 {
1724     int i;
1725
1726     x &= 0xffffffffu;
1727     for (i = 4; i >= n; i--) {
1728         int sh = 1 << i;
1729         x = ((x << sh) | x) & even_bit_esz_masks[i];
1730     }
1731     return x;
1732 }
1733
1734 /* Compress units of 2**(N+1) bits to units of 2**N bits.
1735  * For N==0, this corresponds to the operation that in qemu/bitops.h
1736  * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1737  * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1738  */
1739 static uint64_t compress_bits(uint64_t x, int n)
1740 {
1741     int i;
1742
1743     for (i = n; i <= 4; i++) {
1744         int sh = 1 << i;
1745         x &= even_bit_esz_masks[i];
1746         x = (x >> sh) | x;
1747     }
1748     return x & 0xffffffffu;
1749 }
1750
1751 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1752 {
1753     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1754     int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1755     intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1756     uint64_t *d = vd;
1757     intptr_t i;
1758
1759     if (oprsz <= 8) {
1760         uint64_t nn = *(uint64_t *)vn;
1761         uint64_t mm = *(uint64_t *)vm;
1762         int half = 4 * oprsz;
1763
1764         nn = extract64(nn, high * half, half);
1765         mm = extract64(mm, high * half, half);
1766         nn = expand_bits(nn, esz);
1767         mm = expand_bits(mm, esz);
1768         d[0] = nn + (mm << (1 << esz));
1769     } else {
1770         ARMPredicateReg tmp_n, tmp_m;
1771
1772         /* We produce output faster than we consume input.
1773            Therefore we must be mindful of possible overlap.  */
1774         if ((vn - vd) < (uintptr_t)oprsz) {
1775             vn = memcpy(&tmp_n, vn, oprsz);
1776         }
1777         if ((vm - vd) < (uintptr_t)oprsz) {
1778             vm = memcpy(&tmp_m, vm, oprsz);
1779         }
1780         if (high) {
1781             high = oprsz >> 1;
1782         }
1783
1784         if ((high & 3) == 0) {
1785             uint32_t *n = vn, *m = vm;
1786             high >>= 2;
1787
1788             for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1789                 uint64_t nn = n[H4(high + i)];
1790                 uint64_t mm = m[H4(high + i)];
1791
1792                 nn = expand_bits(nn, esz);
1793                 mm = expand_bits(mm, esz);
1794                 d[i] = nn + (mm << (1 << esz));
1795             }
1796         } else {
1797             uint8_t *n = vn, *m = vm;
1798             uint16_t *d16 = vd;
1799
1800             for (i = 0; i < oprsz / 2; i++) {
1801                 uint16_t nn = n[H1(high + i)];
1802                 uint16_t mm = m[H1(high + i)];
1803
1804                 nn = expand_bits(nn, esz);
1805                 mm = expand_bits(mm, esz);
1806                 d16[H2(i)] = nn + (mm << (1 << esz));
1807             }
1808         }
1809     }
1810 }
1811
1812 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1813 {
1814     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1815     int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1816     int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1817     uint64_t *d = vd, *n = vn, *m = vm;
1818     uint64_t l, h;
1819     intptr_t i;
1820
1821     if (oprsz <= 8) {
1822         l = compress_bits(n[0] >> odd, esz);
1823         h = compress_bits(m[0] >> odd, esz);
1824         d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1825     } else {
1826         ARMPredicateReg tmp_m;
1827         intptr_t oprsz_16 = oprsz / 16;
1828
1829         if ((vm - vd) < (uintptr_t)oprsz) {
1830             m = memcpy(&tmp_m, vm, oprsz);
1831         }
1832
1833         for (i = 0; i < oprsz_16; i++) {
1834             l = n[2 * i + 0];
1835             h = n[2 * i + 1];
1836             l = compress_bits(l >> odd, esz);
1837             h = compress_bits(h >> odd, esz);
1838             d[i] = l + (h << 32);
1839         }
1840
1841         /* For VL which is not a power of 2, the results from M do not
1842            align nicely with the uint64_t for D.  Put the aligned results
1843            from M into TMP_M and then copy it into place afterward.  */
1844         if (oprsz & 15) {
1845             d[i] = compress_bits(n[2 * i] >> odd, esz);
1846
1847             for (i = 0; i < oprsz_16; i++) {
1848                 l = m[2 * i + 0];
1849                 h = m[2 * i + 1];
1850                 l = compress_bits(l >> odd, esz);
1851                 h = compress_bits(h >> odd, esz);
1852                 tmp_m.p[i] = l + (h << 32);
1853             }
1854             tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
1855
1856             swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
1857         } else {
1858             for (i = 0; i < oprsz_16; i++) {
1859                 l = m[2 * i + 0];
1860                 h = m[2 * i + 1];
1861                 l = compress_bits(l >> odd, esz);
1862                 h = compress_bits(h >> odd, esz);
1863                 d[oprsz_16 + i] = l + (h << 32);
1864             }
1865         }
1866     }
1867 }
1868
1869 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1870 {
1871     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1872     uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1873     bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1874     uint64_t *d = vd, *n = vn, *m = vm;
1875     uint64_t mask;
1876     int shr, shl;
1877     intptr_t i;
1878
1879     shl = 1 << esz;
1880     shr = 0;
1881     mask = even_bit_esz_masks[esz];
1882     if (odd) {
1883         mask <<= shl;
1884         shr = shl;
1885         shl = 0;
1886     }
1887
1888     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1889         uint64_t nn = (n[i] & mask) >> shr;
1890         uint64_t mm = (m[i] & mask) << shl;
1891         d[i] = nn + mm;
1892     }
1893 }
1894
1895 /* Reverse units of 2**N bits.  */
1896 static uint64_t reverse_bits_64(uint64_t x, int n)
1897 {
1898     int i, sh;
1899
1900     x = bswap64(x);
1901     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
1902         uint64_t mask = even_bit_esz_masks[i];
1903         x = ((x & mask) << sh) | ((x >> sh) & mask);
1904     }
1905     return x;
1906 }
1907
/* Reverse the order of the units of 2**N bits within a single byte:
 * exchange nibbles, then bit pairs, then single bits, stopping once
 * the unit width drops below 2**N.
 */
static uint8_t reverse_bits_8(uint8_t x, int n)
{
    static const uint8_t unit_mask[3] = { 0x55, 0x33, 0x0f };
    int level = 2;
    int dist = 4;

    while (level >= n) {
        uint8_t low = x & unit_mask[level];
        uint8_t high = (x >> dist) & unit_mask[level];

        x = (low << dist) | high;
        level--;
        dist >>= 1;
    }
    return x;
}
1918
1919 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
1920 {
1921     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1922     int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1923     intptr_t i, oprsz_2 = oprsz / 2;
1924
1925     if (oprsz <= 8) {
1926         uint64_t l = *(uint64_t *)vn;
1927         l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
1928         *(uint64_t *)vd = l;
1929     } else if ((oprsz & 15) == 0) {
1930         for (i = 0; i < oprsz_2; i += 8) {
1931             intptr_t ih = oprsz - 8 - i;
1932             uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
1933             uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
1934             *(uint64_t *)(vd + i) = h;
1935             *(uint64_t *)(vd + ih) = l;
1936         }
1937     } else {
1938         for (i = 0; i < oprsz_2; i += 1) {
1939             intptr_t il = H1(i);
1940             intptr_t ih = H1(oprsz - 1 - i);
1941             uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
1942             uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
1943             *(uint8_t *)(vd + il) = h;
1944             *(uint8_t *)(vd + ih) = l;
1945         }
1946     }
1947 }
1948
1949 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
1950 {
1951     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1952     intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1953     uint64_t *d = vd;
1954     intptr_t i;
1955
1956     if (oprsz <= 8) {
1957         uint64_t nn = *(uint64_t *)vn;
1958         int half = 4 * oprsz;
1959
1960         nn = extract64(nn, high * half, half);
1961         nn = expand_bits(nn, 0);
1962         d[0] = nn;
1963     } else {
1964         ARMPredicateReg tmp_n;
1965
1966         /* We produce output faster than we consume input.
1967            Therefore we must be mindful of possible overlap.  */
1968         if ((vn - vd) < (uintptr_t)oprsz) {
1969             vn = memcpy(&tmp_n, vn, oprsz);
1970         }
1971         if (high) {
1972             high = oprsz >> 1;
1973         }
1974
1975         if ((high & 3) == 0) {
1976             uint32_t *n = vn;
1977             high >>= 2;
1978
1979             for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1980                 uint64_t nn = n[H4(high + i)];
1981                 d[i] = expand_bits(nn, 0);
1982             }
1983         } else {
1984             uint16_t *d16 = vd;
1985             uint8_t *n = vn;
1986
1987             for (i = 0; i < oprsz / 2; i++) {
1988                 uint16_t nn = n[H1(high + i)];
1989                 d16[H2(i)] = expand_bits(nn, 0);
1990             }
1991         }
1992     }
1993 }
1994
1995 #define DO_ZIP(NAME, TYPE, H) \
1996 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
1997 {                                                                    \
1998     intptr_t oprsz = simd_oprsz(desc);                               \
1999     intptr_t i, oprsz_2 = oprsz / 2;                                 \
2000     ARMVectorReg tmp_n, tmp_m;                                       \
2001     /* We produce output faster than we consume input.               \
2002        Therefore we must be mindful of possible overlap.  */         \
2003     if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
2004         vn = memcpy(&tmp_n, vn, oprsz_2);                            \
2005     }                                                                \
2006     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
2007         vm = memcpy(&tmp_m, vm, oprsz_2);                            \
2008     }                                                                \
2009     for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
2010         *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i));         \
2011         *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2012     }                                                                \
2013 }
2014
2015 DO_ZIP(sve_zip_b, uint8_t, H1)
2016 DO_ZIP(sve_zip_h, uint16_t, H1_2)
2017 DO_ZIP(sve_zip_s, uint32_t, H1_4)
2018 DO_ZIP(sve_zip_d, uint64_t, )
2019
2020 #define DO_UZP(NAME, TYPE, H) \
2021 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
2022 {                                                                      \
2023     intptr_t oprsz = simd_oprsz(desc);                                 \
2024     intptr_t oprsz_2 = oprsz / 2;                                      \
2025     intptr_t odd_ofs = simd_data(desc);                                \
2026     intptr_t i;                                                        \
2027     ARMVectorReg tmp_m;                                                \
2028     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
2029         vm = memcpy(&tmp_m, vm, oprsz);                                \
2030     }                                                                  \
2031     for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                      \
2032         *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs));     \
2033     }                                                                  \
2034     for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                      \
2035         *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2036     }                                                                  \
2037 }
2038
2039 DO_UZP(sve_uzp_b, uint8_t, H1)
2040 DO_UZP(sve_uzp_h, uint16_t, H1_2)
2041 DO_UZP(sve_uzp_s, uint32_t, H1_4)
2042 DO_UZP(sve_uzp_d, uint64_t, )
2043
2044 #define DO_TRN(NAME, TYPE, H) \
2045 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
2046 {                                                                      \
2047     intptr_t oprsz = simd_oprsz(desc);                                 \
2048     intptr_t odd_ofs = simd_data(desc);                                \
2049     intptr_t i;                                                        \
2050     for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
2051         TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
2052         TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
2053         *(TYPE *)(vd + H(i + 0)) = ae;                                 \
2054         *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
2055     }                                                                  \
2056 }
2057
2058 DO_TRN(sve_trn_b, uint8_t, H1)
2059 DO_TRN(sve_trn_h, uint16_t, H1_2)
2060 DO_TRN(sve_trn_s, uint32_t, H1_4)
2061 DO_TRN(sve_trn_d, uint64_t, )
2062
2063 #undef DO_ZIP
2064 #undef DO_UZP
2065 #undef DO_TRN
2066
2067 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2068 {
2069     intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2070     uint32_t *d = vd, *n = vn;
2071     uint8_t *pg = vg;
2072
2073     for (i = j = 0; i < opr_sz; i++) {
2074         if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2075             d[H4(j)] = n[H4(i)];
2076             j++;
2077         }
2078     }
2079     for (; j < opr_sz; j++) {
2080         d[H4(j)] = 0;
2081     }
2082 }
2083
2084 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2085 {
2086     intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2087     uint64_t *d = vd, *n = vn;
2088     uint8_t *pg = vg;
2089
2090     for (i = j = 0; i < opr_sz; i++) {
2091         if (pg[H1(i)] & 1) {
2092             d[j] = n[i];
2093             j++;
2094         }
2095     }
2096     for (; j < opr_sz; j++) {
2097         d[j] = 0;
2098     }
2099 }
2100
2101 /* Similar to the ARM LastActiveElement pseudocode function, except the
2102  * result is multiplied by the element size.  This includes the not found
2103  * indication; e.g. not found for esz=3 is -8.
2104  */
2105 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2106 {
2107     intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2108     intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2109
2110     return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
2111 }