target/arm: Implement SVE floating-point trig select coefficient
[qemu/ar7.git] / target / arm / sve_helper.c
blob85a0639e3af8d4434d610bf9f34bea293d15b8f0
1 /*
2 * ARM SVE Operations
4 * Copyright (c) 2018 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/exec-all.h"
23 #include "exec/cpu_ldst.h"
24 #include "exec/helper-proto.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "fpu/softfloat.h"
/* Vector data is stored in host-endian 64-bit chunks, so addressing
 * units smaller than 64 bits needs a host-endian index fixup.
 * H1 is a byte index, H1_2/H1_4 are 16/32-bit indices pre-scaled to
 * bytes, and H2/H4 are 16/32-bit element indices.  On little-endian
 * hosts all of these are the identity.
 */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x)    ((x) ^ 7)
#define H1_2(x)  ((x) ^ 6)
#define H1_4(x)  ((x) ^ 4)
#define H2(x)    ((x) ^ 3)
#define H4(x)    ((x) ^ 1)
#else
#define H1(x)    (x)
#define H1_2(x)  (x)
#define H1_4(x)  (x)
#define H2(x)    (x)
#define H4(x)    (x)
#endif
/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set.  Compare the definitions of these variables
 * within CPUARMState.
 */

/* With no G bits set, PredTest yields NZCV = C alone.  */
#define PREDTEST_INIT  1
/* One step of the iterative PredTest computation, applied to a single
 * 64-bit predicate word D under governing predicate word G, moving
 * forward through the vector one word at a time.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    uint64_t msb;

    if (g == 0) {
        /* No active elements in this word; flags are unchanged.  */
        return flags;
    }

    /* N comes from the first active element overall; bit 2 of FLAGS
     * records that the first active G bit has already been seen.
     */
    if (!(flags & 4)) {
        if (d & (g & -g)) {
            flags |= 1u << 31;
        }
        flags |= 4;
    }

    /* Z is clear if any active element of D is set; accumulate.  */
    if (d & g) {
        flags |= 2;
    }

    /* C comes from the last active element: replace bit 0 with
     * !(D & pow2floor(G)).  Compute pow2floor(g), the highest set
     * bit of G, by smearing the bits downward.
     */
    msb = g;
    msb |= msb >> 1;
    msb |= msb >> 2;
    msb |= msb >> 4;
    msb |= msb >> 8;
    msb |= msb >> 16;
    msb |= msb >> 32;
    msb -= msb >> 1;

    flags = (flags & ~1u) | ((d & msb) == 0);

    return flags;
}
77 /* The same for a single word predicate. */
78 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
80 return iter_predtest_fwd(d, g, PREDTEST_INIT);
83 /* The same for a multi-word predicate. */
84 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
86 uint32_t flags = PREDTEST_INIT;
87 uint64_t *d = vd, *g = vg;
88 uintptr_t i = 0;
90 do {
91 flags = iter_predtest_fwd(d[i], g[i], flags);
92 } while (++i < words);
94 return flags;
/* Expand active predicate bits to bytes, for byte elements: bit J of
 * BYTE becomes byte J of the result, 0xff when set and 0x00 when
 * clear.  Equivalent to indexing a 256-entry lookup table built as:
 * for (i = 0; i < 256; ++i) {
 *     unsigned long m = 0;
 *     for (j = 0; j < 8; j++) {
 *         if ((i >> j) & 1) {
 *             m |= 0xfful << (j << 3);
 *         }
 *     }
 * }
 */
static inline uint64_t expand_pred_b(uint8_t byte)
{
    uint64_t mask = 0;
    int j;

    for (j = 0; j < 8; j++) {
        if (byte & (1u << j)) {
            mask |= 0xffull << (j * 8);
        }
    }
    return mask;
}
/* Similarly for half-word elements: even bit J of BYTE (J = 0, 2, 4, 6,
 * the leading bit of each 2-bit predicate group) selects half-word J/2
 * of the result.  The odd, non-leading predicate bits are ignored,
 * exactly as the table form's "& 0x55" mask did.
 */
static inline uint64_t expand_pred_h(uint8_t byte)
{
    uint64_t mask = 0;
    int j;

    for (j = 0; j < 8; j += 2) {
        if (byte & (1u << j)) {
            mask |= 0xffffull << (j * 8);
        }
    }
    return mask;
}
/* Similarly for single word elements: bits 0 and 4 of BYTE (the
 * leading bit of each 4-bit predicate group) select the low and high
 * words of the result; all other bits are ignored.
 */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    uint64_t mask = 0;

    if (byte & 0x01) {
        mask |= 0x00000000ffffffffull;
    }
    if (byte & 0x10) {
        mask |= 0xffffffff00000000ull;
    }
    return mask;
}
/* Expand a helper for a predicate logical operation: FUNC is applied
 * to each 64-bit word of operands N and M with governing predicate G.
 */
#define LOGICAL_PPPP(NAME, FUNC)                                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    uintptr_t words = simd_oprsz(desc) / 8;                              \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                         \
    uintptr_t i;                                                         \
    for (i = 0; i < words; ++i) {                                        \
        d[i] = FUNC(n[i], m[i], g[i]);                                   \
    }                                                                    \
}
/* Bitwise predicate combinations.  All but SEL mask the result with
 * the governing predicate G; SEL chooses N where G is set and M
 * where it is clear.
 */
#define DO_AND(N, M, G)  (((N) & (M)) & (G))
#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))
/* Instantiate one helper per predicate logical instruction.  */
261 LOGICAL_PPPP(sve_and_pppp, DO_AND)
262 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
263 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
264 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
265 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
266 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
267 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
268 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
/* The predicate-combiner macros are scoped to this group only.  */
270 #undef DO_AND
271 #undef DO_BIC
272 #undef DO_EOR
273 #undef DO_ORR
274 #undef DO_ORN
275 #undef DO_NOR
276 #undef DO_NAND
277 #undef DO_SEL
278 #undef LOGICAL_PPPP
/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 *
 * ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                           \
    while (i < opr_sz) {                                                 \
        /* One 16-bit predicate chunk governs 16 bytes of data.  */      \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                  \
        do {                                                             \
            if (pg & 1) {                                                \
                TYPE nn = *(TYPE *)(vn + H(i));                          \
                TYPE mm = *(TYPE *)(vm + H(i));                          \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                       \
            }                                                            \
            i += sizeof(TYPE);                                           \
            pg >>= sizeof(TYPE);                                         \
        } while (i & 15);                                                \
    }                                                                    \
}
/* Similarly, specialized for 64-bit operands: no endian fixup is
 * needed, and the predicate is consulted one byte per element.
 */
#define DO_ZPZZ_D(NAME, TYPE, OP)                                        \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                           \
    TYPE *d = vd, *n = vn, *m = vm;                                      \
    uint8_t *pg = vg;                                                    \
    for (i = 0; i < opr_sz; ++i) {                                       \
        if (pg[H1(i)] & 1) {                                             \
            TYPE lhs = n[i], rhs = m[i];                                 \
            d[i] = OP(lhs, rhs);                                         \
        }                                                                \
    }                                                                    \
}
/* Element-wise integer operations used with the ZPZZ expanders.  */
#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)

/* Division, with the architectural SVE rule that division by zero
 * yields zero.  The previous form (M ? N / M : 0) invoked the C
 * undefined behavior INT_MIN / -1 for signed types, which raises
 * SIGFPE on x86 hosts.  Evaluate each operand exactly once and
 * special-case signed division by -1 as negation, performed in
 * unsigned arithmetic so that the INT_MIN input wraps to INT_MIN,
 * which is the architectural result.  The signedness test
 * ((__typeof(N))-1 < 0) folds at compile time.
 */
#define DO_DIV(N, M)                                           \
    ({ __typeof(N) div_n = (N), div_m = (M);                   \
       div_m == 0                                              \
       ? (__typeof(N))0                                        \
       : ((__typeof(N))-1 < 0 && div_m == (__typeof(N))-1)     \
       ? (__typeof(N))(-(uint64_t)div_n)                       \
       : (__typeof(N))(div_n / div_m); })
/* Predicated integer logical/arithmetic helpers, one per element size:
 * _b/_h/_s via DO_ZPZZ with the matching H index macro, _d via
 * DO_ZPZZ_D.  Signed variants (smax/smin/sabd) use signed types so
 * the comparison in the OP macro is signed.
 */
332 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
333 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
334 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
335 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
337 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
338 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
339 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
340 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
342 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
343 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
344 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
345 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
347 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
348 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
349 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
350 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
352 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
353 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
354 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
355 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
357 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
358 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
359 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
360 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
362 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
363 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
364 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
365 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
367 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
368 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
369 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
370 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
372 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
373 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
374 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
375 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
377 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
378 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
379 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
380 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
382 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
383 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
384 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
385 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
387 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
388 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
389 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
390 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
/* Because the computation type is at least twice as large as required,
 * these work for both signed and unsigned source types.
 * High half of the 8x8 product.
 */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    int32_t prod = n * m;

    return prod >> 8;
}
/* High half of the 16x16 product.  NOTE(review): for two maximal
 * unsigned inputs the int32 product relies on wrapping overflow;
 * QEMU is built with -fwrapv, which defines that behavior.
 */
static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    int32_t prod = n * m;

    return prod >> 16;
}
/* High half of the 32x32 product, computed at 64 bits.  NOTE(review):
 * for two maximal unsigned inputs the int64 product relies on wrapping
 * overflow; QEMU is built with -fwrapv, which defines that behavior.
 */
static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    int64_t prod = n * m;

    return prod >> 32;
}
/* High half of the signed 64x64->128 product, via the host-utils
 * primitive (portable to 32-bit hosts without __int128).
 */
static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t discard, hi;

    muls64(&discard, &hi, n, m);
    return hi;
}
/* High half of the unsigned 64x64->128 product.  */
static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t discard, hi;

    mulu64(&discard, &hi, n, m);
    return hi;
}
/* Multiply, high-multiply (signed/unsigned) and divide helpers.
 * MUL low half is sign-agnostic; the _d high-multiply variants use
 * the 128-bit host-utils primitives above.  DIV exists only for
 * 32- and 64-bit elements, as in the architecture.
 */
423 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
424 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
425 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
426 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
428 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
429 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
430 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
431 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
433 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
434 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
435 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
436 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
438 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_DIV)
439 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_DIV)
441 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_DIV)
442 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_DIV)
/* Note that all bits of the shift are significant and NOT modulo the
 * element size: an over-wide ASR fills with the sign bit (shift is
 * clamped to width-1), and over-wide LSR/LSL produce zero.
 */
#define DO_ASR(N, M)  ((N) >> ((M) < sizeof(N) * 8 - 1 ? (M) : sizeof(N) * 8 - 1))
#define DO_LSR(N, M)  ((M) < sizeof(N) * 8 ? (N) >> (M) : 0)
#define DO_LSL(N, M)  ((M) < sizeof(N) * 8 ? (N) << (M) : 0)
450 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
451 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
452 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
454 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
455 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
456 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
458 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
459 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
460 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
462 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
463 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
464 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
466 #undef DO_ZPZZ
467 #undef DO_ZPZZ_D
/* Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide".  That is, for D = N op M, the same 64-bit
 * value of M is used with all of the narrower values of N within
 * each 64-bit chunk.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                           \
    while (i < opr_sz) {                                                 \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                      \
        TYPEW mm = *(TYPEW *)(vm + i);                                   \
        do {                                                             \
            if (pg & 1) {                                                \
                TYPE nn = *(TYPE *)(vn + H(i));                          \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                       \
            }                                                            \
            i += sizeof(TYPE);                                           \
            pg >>= sizeof(TYPE);                                         \
        } while (i & 7);                                                 \
    }                                                                    \
}
/* Predicated shifts by a wide (64-bit) shift-count element.  */
490 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
491 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
492 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
494 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
495 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
496 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
498 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
499 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
500 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
502 #undef DO_ZPZW
/* Fully general two-operand expander, controlled by a predicate.  */
#define DO_ZPZ(NAME, TYPE, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{                                                              \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                 \
    while (i < opr_sz) {                                       \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));        \
        do {                                                   \
            if (pg & 1) {                                      \
                TYPE nn = *(TYPE *)(vn + H(i));                \
                *(TYPE *)(vd + H(i)) = OP(nn);                 \
            }                                                  \
            i += sizeof(TYPE);                                 \
            pg >>= sizeof(TYPE);                               \
        } while (i & 15);                                      \
    }                                                          \
}
/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZ_D(NAME, TYPE, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                 \
    TYPE *d = vd, *n = vn;                                     \
    uint8_t *pg = vg;                                          \
    for (i = 0; i < opr_sz; ++i) {                             \
        if (pg[H1(i)] & 1) {                                   \
            TYPE elt = n[i];                                   \
            d[i] = OP(elt);                                    \
        }                                                      \
    }                                                          \
}
/* Count leading sign bits (excluding the sign bit itself); the 8- and
 * 16-bit forms adjust the 32-bit primitive's count.  */
537 #define DO_CLS_B(N) (clrsb32(N) - 24)
538 #define DO_CLS_H(N) (clrsb32(N) - 16)
540 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
541 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
542 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
543 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
/* Count leading zeros, with the same width adjustment.  */
545 #define DO_CLZ_B(N) (clz32(N) - 24)
546 #define DO_CLZ_H(N) (clz32(N) - 16)
548 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
549 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
550 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
551 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
/* Population count per element.  */
553 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
554 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
555 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
556 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
/* Logical invert: 1 when the element is zero, else 0.  */
558 #define DO_CNOT(N) (N == 0)
560 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
561 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
562 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
563 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
/* FP absolute value as a bit operation: clear the sign bit.
 * ((__typeof(N))-1 >> 1) is the all-ones value minus the MSB.  */
565 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
567 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
568 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
569 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
/* FP negate as a bit operation: flip the sign bit.  */
571 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
573 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
574 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
575 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
/* Bitwise invert.  */
577 #define DO_NOT(N) (~N)
579 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
580 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
581 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
582 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
/* Sign/zero extension of the low 8/16/32 bits of each element.  */
584 #define DO_SXTB(N) ((int8_t)N)
585 #define DO_SXTH(N) ((int16_t)N)
586 #define DO_SXTS(N) ((int32_t)N)
587 #define DO_UXTB(N) ((uint8_t)N)
588 #define DO_UXTH(N) ((uint16_t)N)
589 #define DO_UXTS(N) ((uint32_t)N)
591 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
592 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
593 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
594 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
595 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
596 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
598 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
599 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
600 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
601 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
602 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
603 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
/* Integer absolute value / negate.  NOTE(review): -N of the minimum
 * signed value relies on wrapping; QEMU builds with -fwrapv.  */
605 #define DO_ABS(N) (N < 0 ? -N : N)
607 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
608 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
609 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
610 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
612 #define DO_NEG(N) (-N)
614 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
615 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
616 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
617 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
/* Three-operand expander, unpredicated, in which the third operand
 * is "wide": one 64-bit M value per chunk of narrow N elements.
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                 \
    while (i < opr_sz) {                                       \
        TYPEW mm = *(TYPEW *)(vm + i);                         \
        do {                                                   \
            TYPE nn = *(TYPE *)(vn + H(i));                    \
            *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
            i += sizeof(TYPE);                                 \
        } while (i & 7);                                       \
    }                                                          \
}
/* Unpredicated shifts by a wide shift-count element.  */
635 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
636 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
637 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
639 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
640 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
641 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
643 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
644 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
645 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
647 #undef DO_ZZW
/* Retire the per-element OP macros used with DO_ZPZ above.  */
649 #undef DO_CLS_B
650 #undef DO_CLS_H
651 #undef DO_CLZ_B
652 #undef DO_CLZ_H
653 #undef DO_CNOT
654 #undef DO_FABS
655 #undef DO_FNEG
656 #undef DO_ABS
657 #undef DO_NEG
658 #undef DO_ZPZ
659 #undef DO_ZPZ_D
/* Two-operand reduction expander, controlled by a predicate.
 * The difference between TYPERED and TYPERET has to do with
 * sign-extension: e.g. for SMAX, TYPERED must be signed, but TYPERET
 * must be unsigned so that a 32-bit value is not sign-extended to the
 * ABI uint64_t return type.
 *
 * ??? If we were to vectorize this by hand the reduction ordering
 * would change.  For integer operands, this is perfectly fine.
 */
#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)     \
{                                                            \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);               \
    TYPERED ret = INIT;                                      \
    while (i < opr_sz) {                                     \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));      \
        do {                                                 \
            if (pg & 1) {                                    \
                TYPEELT nn = *(TYPEELT *)(vn + H(i));        \
                ret = OP(ret, nn);                           \
            }                                                \
            i += sizeof(TYPEELT);                            \
            pg >>= sizeof(TYPEELT);                          \
        } while (i & 15);                                    \
    }                                                        \
    return (TYPERET)ret;                                     \
}
/* Similarly, specialized for 64-bit operands.  */
#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)           \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
{                                                        \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;           \
    TYPEE *n = vn;                                       \
    uint8_t *pg = vg;                                    \
    TYPER ret = INIT;                                    \
    for (i = 0; i < opr_sz; ++i) {                       \
        if (pg[H1(i)] & 1) {                             \
            TYPEE elt = n[i];                            \
            ret = OP(ret, elt);                          \
        }                                                \
    }                                                    \
    return ret;                                          \
}
/* Horizontal reductions.  INIT is the identity for each operation
 * (all-ones for AND/UMIN, type-MIN/MAX for signed max/min).  Note
 * there is no saddv_d: the 64-bit signed sum is identical to uaddv_d.
 */
704 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
705 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
706 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
707 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
709 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
710 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
711 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
712 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
714 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
715 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
716 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
717 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
719 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
720 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
721 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
723 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
724 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
725 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
726 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
728 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
729 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
730 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
731 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
733 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
734 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
735 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
736 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
738 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
739 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
740 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
741 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
743 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
744 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
745 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
746 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
748 #undef DO_VPZ
749 #undef DO_VPZ_D
/* Retire the shared element OP macros.  */
751 #undef DO_AND
752 #undef DO_ORR
753 #undef DO_EOR
754 #undef DO_BIC
755 #undef DO_ADD
756 #undef DO_SUB
757 #undef DO_MAX
758 #undef DO_MIN
759 #undef DO_ABD
760 #undef DO_MUL
761 #undef DO_DIV
762 #undef DO_ASR
763 #undef DO_LSR
764 #undef DO_LSL
766 /* Similar to the ARM LastActiveElement pseudocode function, except the
767 result is multiplied by the element size. This includes the not found
768 indication; e.g. not found for esz=3 is -8. */
769 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
771 uint64_t mask = pred_esz_masks[esz];
772 intptr_t i = words;
774 do {
775 uint64_t this_g = g[--i] & mask;
776 if (this_g) {
777 return i * 64 + (63 - clz64(this_g));
779 } while (i > 0);
780 return (intptr_t)-1 << esz;
783 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
785 uint32_t flags = PREDTEST_INIT;
786 uint64_t *d = vd, *g = vg;
787 intptr_t i = 0;
789 do {
790 uint64_t this_d = d[i];
791 uint64_t this_g = g[i];
793 if (this_g) {
794 if (!(flags & 4)) {
795 /* Set in D the first bit of G. */
796 this_d |= this_g & -this_g;
797 d[i] = this_d;
799 flags = iter_predtest_fwd(this_d, this_g, flags);
801 } while (++i < words);
803 return flags;
806 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
808 intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
809 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
810 uint32_t flags = PREDTEST_INIT;
811 uint64_t *d = vd, *g = vg, esz_mask;
812 intptr_t i, next;
814 next = last_active_element(vd, words, esz) + (1 << esz);
815 esz_mask = pred_esz_masks[esz];
817 /* Similar to the pseudocode for pnext, but scaled by ESZ
818 so that we find the correct bit. */
819 if (next < words * 64) {
820 uint64_t mask = -1;
822 if (next & 63) {
823 mask = ~((1ull << (next & 63)) - 1);
824 next &= -64;
826 do {
827 uint64_t this_g = g[next / 64] & esz_mask & mask;
828 if (this_g != 0) {
829 next = (next & -64) + ctz64(this_g);
830 break;
832 next += 64;
833 mask = -1;
834 } while (next < words * 64);
837 i = 0;
838 do {
839 uint64_t this_d = 0;
840 if (i == next / 64) {
841 this_d = 1ull << (next & 63);
843 d[i] = this_d;
844 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
845 } while (++i < words);
847 return flags;
850 /* Store zero into every active element of Zd. We will use this for two
851 * and three-operand predicated instructions for which logic dictates a
852 * zero result. In particular, logical shift by element size, which is
853 * otherwise undefined on the host.
855 * For element sizes smaller than uint64_t, we use tables to expand
856 * the N bits of the controlling predicate to a byte mask, and clear
857 * those bytes.
859 void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
861 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
862 uint64_t *d = vd;
863 uint8_t *pg = vg;
864 for (i = 0; i < opr_sz; i += 1) {
865 d[i] &= ~expand_pred_b(pg[H1(i)]);
869 void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
871 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
872 uint64_t *d = vd;
873 uint8_t *pg = vg;
874 for (i = 0; i < opr_sz; i += 1) {
875 d[i] &= ~expand_pred_h(pg[H1(i)]);
879 void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
881 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
882 uint64_t *d = vd;
883 uint8_t *pg = vg;
884 for (i = 0; i < opr_sz; i += 1) {
885 d[i] &= ~expand_pred_s(pg[H1(i)]);
889 void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
891 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
892 uint64_t *d = vd;
893 uint8_t *pg = vg;
894 for (i = 0; i < opr_sz; i += 1) {
895 if (pg[H1(i)] & 1) {
896 d[i] = 0;
/* Three-operand expander, immediate operand, controlled by a
 * predicate.  The immediate is carried in the simd data field.
 */
#define DO_ZPZI(NAME, TYPE, H, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{                                                              \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                 \
    TYPE imm = simd_data(desc);                                \
    while (i < opr_sz) {                                       \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));        \
        do {                                                   \
            if (pg & 1) {                                      \
                TYPE nn = *(TYPE *)(vn + H(i));                \
                *(TYPE *)(vd + H(i)) = OP(nn, imm);            \
            }                                                  \
            i += sizeof(TYPE);                                 \
            pg >>= sizeof(TYPE);                               \
        } while (i & 15);                                      \
    }                                                          \
}
/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZI_D(NAME, TYPE, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                 \
    TYPE *d = vd, *n = vn;                                     \
    TYPE imm = simd_data(desc);                                \
    uint8_t *pg = vg;                                          \
    for (i = 0; i < opr_sz; ++i) {                             \
        if (pg[H1(i)] & 1) {                                   \
            TYPE elt = n[i];                                   \
            d[i] = OP(elt, imm);                               \
        }                                                      \
    }                                                          \
}
/* Immediate shifts.  The immediate is pre-validated by the decoder,
 * so no width clamping is required here.
 */
#define DO_SHR(N, M)  (N >> M)
#define DO_SHL(N, M)  (N << M)

/* Arithmetic shift right for division.  This rounds negative numbers
 * toward zero as per signed division.  Therefore before shifting,
 * when N is negative, add 2**M-1.
 */
#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
944 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
945 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
946 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
947 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
949 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
950 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
951 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
952 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
954 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
955 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
956 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
957 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
959 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
960 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
961 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
962 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
964 #undef DO_SHR
965 #undef DO_SHL
966 #undef DO_ASRD
967 #undef DO_ZPZI
968 #undef DO_ZPZI_D
/* Fully general four-operand expander, controlled by a predicate.
 *
 * As with DO_ZPZI, the predicate is read one 16-bit word at a time,
 * and each predicate bit covers one byte of vector data.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP)                             \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,       \
                  void *vg, uint32_t desc)                      \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pred = *(uint16_t *)(vg + H1_2(i >> 3));       \
        do {                                                    \
            if (pred & 1) {                                     \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                TYPE mm = *(TYPE *)(vm + H(i));                 \
                TYPE aa = *(TYPE *)(va + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);          \
            }                                                   \
            i += sizeof(TYPE), pred >>= sizeof(TYPE);           \
        } while (i & 15);                                       \
    }                                                           \
}
/* Similarly, specialized for 64-bit operands.
 * One predicate byte per element; only its low bit is significant.
 */
#define DO_ZPZZZ_D(NAME, TYPE, OP)                              \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,       \
                  void *vg, uint32_t desc)                      \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *a = va, *n = vn, *m = vm;                    \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE aa = a[i], nn = n[i], mm = m[i];               \
            d[i] = OP(aa, nn, mm);                              \
        }                                                       \
    }                                                           \
}
/* Multiply-add and multiply-subtract; unsigned arithmetic wraps
 * modulo 2**width, which matches the instruction semantics.
 */
#define DO_MLA(A, N, M) (A + N * M)
#define DO_MLS(A, N, M) (A - N * M)

DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)

DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)

DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)

DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZPZZZ
#undef DO_ZPZZZ_D
1027 void HELPER(sve_index_b)(void *vd, uint32_t start,
1028 uint32_t incr, uint32_t desc)
1030 intptr_t i, opr_sz = simd_oprsz(desc);
1031 uint8_t *d = vd;
1032 for (i = 0; i < opr_sz; i += 1) {
1033 d[H1(i)] = start + i * incr;
1037 void HELPER(sve_index_h)(void *vd, uint32_t start,
1038 uint32_t incr, uint32_t desc)
1040 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1041 uint16_t *d = vd;
1042 for (i = 0; i < opr_sz; i += 1) {
1043 d[H2(i)] = start + i * incr;
1047 void HELPER(sve_index_s)(void *vd, uint32_t start,
1048 uint32_t incr, uint32_t desc)
1050 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1051 uint32_t *d = vd;
1052 for (i = 0; i < opr_sz; i += 1) {
1053 d[H4(i)] = start + i * incr;
1057 void HELPER(sve_index_d)(void *vd, uint64_t start,
1058 uint64_t incr, uint32_t desc)
1060 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1061 uint64_t *d = vd;
1062 for (i = 0; i < opr_sz; i += 1) {
1063 d[i] = start + i * incr;
1067 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1069 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1070 uint32_t sh = simd_data(desc);
1071 uint32_t *d = vd, *n = vn, *m = vm;
1072 for (i = 0; i < opr_sz; i += 1) {
1073 d[i] = n[i] + (m[i] << sh);
1077 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1079 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1080 uint64_t sh = simd_data(desc);
1081 uint64_t *d = vd, *n = vn, *m = vm;
1082 for (i = 0; i < opr_sz; i += 1) {
1083 d[i] = n[i] + (m[i] << sh);
1087 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1089 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1090 uint64_t sh = simd_data(desc);
1091 uint64_t *d = vd, *n = vn, *m = vm;
1092 for (i = 0; i < opr_sz; i += 1) {
1093 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1097 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1099 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1100 uint64_t sh = simd_data(desc);
1101 uint64_t *d = vd, *n = vn, *m = vm;
1102 for (i = 0; i < opr_sz; i += 1) {
1103 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1107 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1109 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1110 static const uint16_t coeff[] = {
1111 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1112 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1113 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1114 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1116 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1117 uint16_t *d = vd, *n = vn;
1119 for (i = 0; i < opr_sz; i++) {
1120 uint16_t nn = n[i];
1121 intptr_t idx = extract32(nn, 0, 5);
1122 uint16_t exp = extract32(nn, 5, 5);
1123 d[i] = coeff[idx] | (exp << 10);
1127 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1129 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1130 static const uint32_t coeff[] = {
1131 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1132 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1133 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1134 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1135 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1136 0x1ef532, 0x20b051, 0x227043, 0x243516,
1137 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1138 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1139 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1140 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1141 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1142 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1143 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1144 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1145 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1146 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1148 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1149 uint32_t *d = vd, *n = vn;
1151 for (i = 0; i < opr_sz; i++) {
1152 uint32_t nn = n[i];
1153 intptr_t idx = extract32(nn, 0, 6);
1154 uint32_t exp = extract32(nn, 6, 8);
1155 d[i] = coeff[idx] | (exp << 23);
1159 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1161 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1162 static const uint64_t coeff[] = {
1163 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1164 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1165 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1166 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1167 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1168 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1169 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1170 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1171 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1172 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1173 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1174 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1175 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1176 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1177 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1178 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1179 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1180 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1181 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1182 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1183 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1184 0xFA7C1819E90D8ull,
1186 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1187 uint64_t *d = vd, *n = vn;
1189 for (i = 0; i < opr_sz; i++) {
1190 uint64_t nn = n[i];
1191 intptr_t idx = extract32(nn, 0, 6);
1192 uint64_t exp = extract32(nn, 6, 11);
1193 d[i] = coeff[idx] | (exp << 52);
1197 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1199 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1200 uint16_t *d = vd, *n = vn, *m = vm;
1201 for (i = 0; i < opr_sz; i += 1) {
1202 uint16_t nn = n[i];
1203 uint16_t mm = m[i];
1204 if (mm & 1) {
1205 nn = float16_one;
1207 d[i] = nn ^ (mm & 2) << 14;
1211 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1213 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1214 uint32_t *d = vd, *n = vn, *m = vm;
1215 for (i = 0; i < opr_sz; i += 1) {
1216 uint32_t nn = n[i];
1217 uint32_t mm = m[i];
1218 if (mm & 1) {
1219 nn = float32_one;
1221 d[i] = nn ^ (mm & 2) << 30;
1225 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1227 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1228 uint64_t *d = vd, *n = vn, *m = vm;
1229 for (i = 0; i < opr_sz; i += 1) {
1230 uint64_t nn = n[i];
1231 uint64_t mm = m[i];
1232 if (mm & 1) {
1233 nn = float64_one;
1235 d[i] = nn ^ (mm & 2) << 62;