tcg/tcg-op-gvec.c

   1 /*
   2  * Generic vector operation expansion
   3  *
   4  * Copyright (c) 2018 Linaro
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19
  20 #include "qemu/osdep.h"
  21 #include "tcg/tcg.h"
  22 #include "tcg/tcg-temp-internal.h"
  23 #include "tcg/tcg-op-common.h"
  24 #include "tcg/tcg-op-gvec-common.h"
  25 #include "tcg/tcg-gvec-desc.h"
  26
  27 #define MAX_UNROLL  4
  28
  29 #ifdef CONFIG_DEBUG_TCG
  30 static const TCGOpcode vecop_list_empty[1] = { 0 };
  31 #else
  32 #define vecop_list_empty NULL
  33 #endif
  34
  35
  36 /* Verify vector size and alignment rules.  OFS should be the OR of all
  37    of the operand offsets so that we can check them all at once.  */
  38 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
  39 {
  40     uint32_t max_align;
  41
  42     switch (oprsz) {
  43     case 8:
  44     case 16:
  45     case 32:
  46         tcg_debug_assert(oprsz <= maxsz);
  47         break;
  48     default:
  49         tcg_debug_assert(oprsz == maxsz);
  50         break;
  51     }
  52     tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));
  53
  54     max_align = maxsz >= 16 ? 15 : 7;
  55     tcg_debug_assert((maxsz & max_align) == 0);
  56     tcg_debug_assert((ofs & max_align) == 0);
  57 }
  58
  59 /* Verify vector overlap rules for two operands.  */
  60 static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
  61 {
  62     tcg_debug_assert(d == a || d + s <= a || a + s <= d);
  63 }
  64
  65 /* Verify vector overlap rules for three operands.  */
  66 static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
  67 {
  68     check_overlap_2(d, a, s);
  69     check_overlap_2(d, b, s);
  70     check_overlap_2(a, b, s);
  71 }
  72
  73 /* Verify vector overlap rules for four operands.  */
  74 static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
  75                             uint32_t c, uint32_t s)
  76 {
  77     check_overlap_2(d, a, s);
  78     check_overlap_2(d, b, s);
  79     check_overlap_2(d, c, s);
  80     check_overlap_2(a, b, s);
  81     check_overlap_2(a, c, s);
  82     check_overlap_2(b, c, s);
  83 }
  84
  85 /* Create a descriptor from components.  */
  86 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
  87 {
  88     uint32_t desc = 0;
  89
  90     check_size_align(oprsz, maxsz, 0);
  91     tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS));
  92
  93     oprsz = (oprsz / 8) - 1;
  94     maxsz = (maxsz / 8) - 1;
  95
  96     /*
  97      * We have just asserted in check_size_align that either
  98      * oprsz is {8,16,32} or matches maxsz.  Encode the final
  99      * case with '2', as that would otherwise map to 24.
 100      */
 101     if (oprsz == maxsz) {
 102         oprsz = 2;
 103     }
 104
 105     desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
 106     desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
 107     desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
 108
 109     return desc;
 110 }
 111
 112 /* Generate a call to a gvec-style helper with two vector operands.  */
 113 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
 114                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 115                         gen_helper_gvec_2 *fn)
 116 {
 117     TCGv_ptr a0, a1;
 118     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 119
 120     a0 = tcg_temp_ebb_new_ptr();
 121     a1 = tcg_temp_ebb_new_ptr();
 122
 123     tcg_gen_addi_ptr(a0, tcg_env, dofs);
 124     tcg_gen_addi_ptr(a1, tcg_env, aofs);
 125
 126     fn(a0, a1, desc);
 127
 128     tcg_temp_free_ptr(a0);
 129     tcg_temp_free_ptr(a1);
 130 }
 131
 132 /* Generate a call to a gvec-style helper with two vector operands
 133    and one scalar operand.  */
 134 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
 135                          uint32_t oprsz, uint32_t maxsz, int32_t data,
 136                          gen_helper_gvec_2i *fn)
 137 {
 138     TCGv_ptr a0, a1;
 139     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 140
 141     a0 = tcg_temp_ebb_new_ptr();
 142     a1 = tcg_temp_ebb_new_ptr();
 143
 144     tcg_gen_addi_ptr(a0, tcg_env, dofs);
 145     tcg_gen_addi_ptr(a1, tcg_env, aofs);
 146
 147     fn(a0, a1, c, desc);
 148
 149     tcg_temp_free_ptr(a0);
 150     tcg_temp_free_ptr(a1);
 151 }
 152
 153 /* Generate a call to a gvec-style helper with three vector operands.  */
 154 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 155                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 156                         gen_helper_gvec_3 *fn)
 157 {
 158     TCGv_ptr a0, a1, a2;
 159     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 160
 161     a0 = tcg_temp_ebb_new_ptr();
 162     a1 = tcg_temp_ebb_new_ptr();
 163     a2 = tcg_temp_ebb_new_ptr();
 164
 165     tcg_gen_addi_ptr(a0, tcg_env, dofs);
 166     tcg_gen_addi_ptr(a1, tcg_env, aofs);
 167     tcg_gen_addi_ptr(a2, tcg_env, bofs);
 168
 169     fn(a0, a1, a2, desc);
 170
 171     tcg_temp_free_ptr(a0);
 172     tcg_temp_free_ptr(a1);
 173     tcg_temp_free_ptr(a2);
 174 }
 175
 176 /* Generate a call to a gvec-style helper with four vector operands.  */
 177 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 178                         uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
 179                         int32_t data, gen_helper_gvec_4 *fn)
 180 {
 181     TCGv_ptr a0, a1, a2, a3;
 182     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 183
 184     a0 = tcg_temp_ebb_new_ptr();
 185     a1 = tcg_temp_ebb_new_ptr();
 186     a2 = tcg_temp_ebb_new_ptr();
 187     a3 = tcg_temp_ebb_new_ptr();
 188
 189     tcg_gen_addi_ptr(a0, tcg_env, dofs);
 190     tcg_gen_addi_ptr(a1, tcg_env, aofs);
 191     tcg_gen_addi_ptr(a2, tcg_env, bofs);
 192     tcg_gen_addi_ptr(a3, tcg_env, cofs);
 193
 194     fn(a0, a1, a2, a3, desc);
 195
 196     tcg_temp_free_ptr(a0);
 197     tcg_temp_free_ptr(a1);
 198     tcg_temp_free_ptr(a2);
 199     tcg_temp_free_ptr(a3);
 200 }
 201
 202 /* Generate a call to a gvec-style helper with five vector operands.  */
 203 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 204                         uint32_t cofs, uint32_t xofs, uint32_t oprsz,
 205                         uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
 206 {
 207     TCGv_ptr a0, a1, a2, a3, a4;
 208     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 209
 210     a0 = tcg_temp_ebb_new_ptr();
 211     a1 = tcg_temp_ebb_new_ptr();
 212     a2 = tcg_temp_ebb_new_ptr();
 213     a3 = tcg_temp_ebb_new_ptr();
 214     a4 = tcg_temp_ebb_new_ptr();
 215
 216     tcg_gen_addi_ptr(a0, tcg_env, dofs);
 217     tcg_gen_addi_ptr(a1, tcg_env, aofs);
 218     tcg_gen_addi_ptr(a2, tcg_env, bofs);
 219     tcg_gen_addi_ptr(a3, tcg_env, cofs);
 220     tcg_gen_addi_ptr(a4, tcg_env, xofs);
 221
 222     fn(a0, a1, a2, a3, a4, desc);
 223
 224     tcg_temp_free_ptr(a0);
 225     tcg_temp_free_ptr(a1);
 226     tcg_temp_free_ptr(a2);
 227     tcg_temp_free_ptr(a3);
 228     tcg_temp_free_ptr(a4);
 229 }
 230
/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand.  */
 233 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
 234                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
 235                         int32_t data, gen_helper_gvec_2_ptr *fn)
 236 {
 237     TCGv_ptr a0, a1;
 238     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 239
 240     a0 = tcg_temp_ebb_new_ptr();
 241     a1 = tcg_temp_ebb_new_ptr();
 242
 243     tcg_gen_addi_ptr(a0, tcg_env, dofs);
 244     tcg_gen_addi_ptr(a1, tcg_env, aofs);
 245
 246     fn(a0, a1, ptr, desc);
 247
 248     tcg_temp_free_ptr(a0);
 249     tcg_temp_free_ptr(a1);
 250 }
 251
 252 /* Generate a call to a gvec-style helper with three vector operands
 253    and an extra pointer operand.  */
 254 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 255                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
 256                         int32_t data, gen_helper_gvec_3_ptr *fn)
 257 {
 258     TCGv_ptr a0, a1, a2;
 259     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 260
 261     a0 = tcg_temp_ebb_new_ptr();
 262     a1 = tcg_temp_ebb_new_ptr();
 263     a2 = tcg_temp_ebb_new_ptr();
 264
 265     tcg_gen_addi_ptr(a0, tcg_env, dofs);
 266     tcg_gen_addi_ptr(a1, tcg_env, aofs);
 267     tcg_gen_addi_ptr(a2, tcg_env, bofs);
 268
 269     fn(a0, a1, a2, ptr, desc);
 270
 271     tcg_temp_free_ptr(a0);
 272     tcg_temp_free_ptr(a1);
 273     tcg_temp_free_ptr(a2);
 274 }
 275
 276 /* Generate a call to a gvec-style helper with four vector operands
 277    and an extra pointer operand.  */
 278 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 279                         uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
 280                         uint32_t maxsz, int32_t data,
 281                         gen_helper_gvec_4_ptr *fn)
 282 {
 283     TCGv_ptr a0, a1, a2, a3;
 284     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 285
 286     a0 = tcg_temp_ebb_new_ptr();
 287     a1 = tcg_temp_ebb_new_ptr();
 288     a2 = tcg_temp_ebb_new_ptr();
 289     a3 = tcg_temp_ebb_new_ptr();
 290
 291     tcg_gen_addi_ptr(a0, tcg_env, dofs);
 292     tcg_gen_addi_ptr(a1, tcg_env, aofs);
 293     tcg_gen_addi_ptr(a2, tcg_env, bofs);
 294     tcg_gen_addi_ptr(a3, tcg_env, cofs);
 295
 296     fn(a0, a1, a2, a3, ptr, desc);
 297
 298     tcg_temp_free_ptr(a0);
 299     tcg_temp_free_ptr(a1);
 300     tcg_temp_free_ptr(a2);
 301     tcg_temp_free_ptr(a3);
 302 }
 303
 304 /* Generate a call to a gvec-style helper with five vector operands
 305    and an extra pointer operand.  */
 306 void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 307                         uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
 308                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 309                         gen_helper_gvec_5_ptr *fn)
 310 {
 311     TCGv_ptr a0, a1, a2, a3, a4;
 312     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 313
 314     a0 = tcg_temp_ebb_new_ptr();
 315     a1 = tcg_temp_ebb_new_ptr();
 316     a2 = tcg_temp_ebb_new_ptr();
 317     a3 = tcg_temp_ebb_new_ptr();
 318     a4 = tcg_temp_ebb_new_ptr();
 319
 320     tcg_gen_addi_ptr(a0, tcg_env, dofs);
 321     tcg_gen_addi_ptr(a1, tcg_env, aofs);
 322     tcg_gen_addi_ptr(a2, tcg_env, bofs);
 323     tcg_gen_addi_ptr(a3, tcg_env, cofs);
 324     tcg_gen_addi_ptr(a4, tcg_env, eofs);
 325
 326     fn(a0, a1, a2, a3, a4, ptr, desc);
 327
 328     tcg_temp_free_ptr(a0);
 329     tcg_temp_free_ptr(a1);
 330     tcg_temp_free_ptr(a2);
 331     tcg_temp_free_ptr(a3);
 332     tcg_temp_free_ptr(a4);
 333 }
 334
 335 /* Return true if we want to implement something of OPRSZ bytes
 336    in units of LNSZ.  This limits the expansion of inline code.  */
 337 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
 338 {
 339     uint32_t q, r;
 340
 341     if (oprsz < lnsz) {
 342         return false;
 343     }
 344
 345     q = oprsz / lnsz;
 346     r = oprsz % lnsz;
 347     tcg_debug_assert((r & 7) == 0);
 348
 349     if (lnsz < 16) {
 350         /* For sizes below 16, accept no remainder. */
 351         if (r != 0) {
 352             return false;
 353         }
 354     } else {
 355         /*
 356          * Recall that ARM SVE allows vector sizes that are not a
 357          * power of 2, but always a multiple of 16.  The intent is
 358          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 359          * In addition, expand_clr needs to handle a multiple of 8.
 360          * Thus we can handle the tail with one more operation per
 361          * diminishing power of 2.
 362          */
 363         q += ctpop32(r);
 364     }
 365
 366     return q <= MAX_UNROLL;
 367 }
 368
/* Forward declaration: do_dup_store and do_dup call expand_clr before
   its definition further down in this file.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz);
 370
 371 /* Duplicate C as per VECE.  */
 372 uint64_t (dup_const)(unsigned vece, uint64_t c)
 373 {
 374     switch (vece) {
 375     case MO_8:
 376         return 0x0101010101010101ull * (uint8_t)c;
 377     case MO_16:
 378         return 0x0001000100010001ull * (uint16_t)c;
 379     case MO_32:
 380         return 0x0000000100000001ull * (uint32_t)c;
 381     case MO_64:
 382         return c;
 383     default:
 384         g_assert_not_reached();
 385     }
 386 }
 387
 388 /* Duplicate IN into OUT as per VECE.  */
 389 void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
 390 {
 391     switch (vece) {
 392     case MO_8:
 393         tcg_gen_ext8u_i32(out, in);
 394         tcg_gen_muli_i32(out, out, 0x01010101);
 395         break;
 396     case MO_16:
 397         tcg_gen_deposit_i32(out, in, in, 16, 16);
 398         break;
 399     case MO_32:
 400         tcg_gen_mov_i32(out, in);
 401         break;
 402     default:
 403         g_assert_not_reached();
 404     }
 405 }
 406
 407 void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
 408 {
 409     switch (vece) {
 410     case MO_8:
 411         tcg_gen_ext8u_i64(out, in);
 412         tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
 413         break;
 414     case MO_16:
 415         tcg_gen_ext16u_i64(out, in);
 416         tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
 417         break;
 418     case MO_32:
 419         tcg_gen_deposit_i64(out, in, in, 32, 32);
 420         break;
 421     case MO_64:
 422         tcg_gen_mov_i64(out, in);
 423         break;
 424     default:
 425         g_assert_not_reached();
 426     }
 427 }
 428
 429 /* Select a supported vector type for implementing an operation on SIZE
 430  * bytes.  If OP is 0, assume that the real operation to be performed is
 * required by all backends.  Otherwise, make sure that OP can be performed
 432  * on elements of size VECE in the selected type.  Do not select V64 if
 433  * PREFER_I64 is true.  Return 0 if no vector type is selected.
 434  */
 435 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
 436                                   uint32_t size, bool prefer_i64)
 437 {
 438     /*
 439      * Recall that ARM SVE allows vector sizes that are not a
 440      * power of 2, but always a multiple of 16.  The intent is
 441      * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 442      * It is hard to imagine a case in which v256 is supported
 443      * but v128 is not, but check anyway.
 444      * In addition, expand_clr needs to handle a multiple of 8.
 445      */
 446     if (TCG_TARGET_HAS_v256 &&
 447         check_size_impl(size, 32) &&
 448         tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
 449         (!(size & 16) ||
 450          (TCG_TARGET_HAS_v128 &&
 451           tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
 452         (!(size & 8) ||
 453          (TCG_TARGET_HAS_v64 &&
 454           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
 455         return TCG_TYPE_V256;
 456     }
 457     if (TCG_TARGET_HAS_v128 &&
 458         check_size_impl(size, 16) &&
 459         tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
 460         (!(size & 8) ||
 461          (TCG_TARGET_HAS_v64 &&
 462           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
 463         return TCG_TYPE_V128;
 464     }
 465     if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
 466         && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
 467         return TCG_TYPE_V64;
 468     }
 469     return 0;
 470 }
 471
 472 static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
 473                          uint32_t maxsz, TCGv_vec t_vec)
 474 {
 475     uint32_t i = 0;
 476
 477     tcg_debug_assert(oprsz >= 8);
 478
 479     /*
 480      * This may be expand_clr for the tail of an operation, e.g.
 481      * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
 482      * are misaligned wrt the maximum vector size, so do that first.
 483      */
 484     if (dofs & 8) {
 485         tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V64);
 486         i += 8;
 487     }
 488
 489     switch (type) {
 490     case TCG_TYPE_V256:
 491         /*
 492          * Recall that ARM SVE allows vector sizes that are not a
 493          * power of 2, but always a multiple of 16.  The intent is
 494          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 495          */
 496         for (; i + 32 <= oprsz; i += 32) {
 497             tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V256);
 498         }
 499         /* fallthru */
 500     case TCG_TYPE_V128:
 501         for (; i + 16 <= oprsz; i += 16) {
 502             tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V128);
 503         }
 504         break;
 505     case TCG_TYPE_V64:
 506         for (; i < oprsz; i += 8) {
 507             tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V64);
 508         }
 509         break;
 510     default:
 511         g_assert_not_reached();
 512     }
 513
 514     if (oprsz < maxsz) {
 515         expand_clr(dofs + oprsz, maxsz - oprsz);
 516     }
 517 }
 518
 519 /* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 520  * Only one of IN_32 or IN_64 may be set;
 521  * IN_C is used if IN_32 and IN_64 are unset.
 522  */
 523 static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
 524                    uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
 525                    uint64_t in_c)
 526 {
 527     TCGType type;
 528     TCGv_i64 t_64;
 529     TCGv_i32 t_32, t_desc;
 530     TCGv_ptr t_ptr;
 531     uint32_t i;
 532
 533     assert(vece <= (in_32 ? MO_32 : MO_64));
 534     assert(in_32 == NULL || in_64 == NULL);
 535
 536     /* If we're storing 0, expand oprsz to maxsz.  */
 537     if (in_32 == NULL && in_64 == NULL) {
 538         in_c = dup_const(vece, in_c);
 539         if (in_c == 0) {
 540             oprsz = maxsz;
 541             vece = MO_8;
 542         } else if (in_c == dup_const(MO_8, in_c)) {
 543             vece = MO_8;
 544         }
 545     }
 546
 547     /* Implement inline with a vector type, if possible.
 548      * Prefer integer when 64-bit host and no variable dup.
 549      */
 550     type = choose_vector_type(NULL, vece, oprsz,
 551                               (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
 552                                && (in_64 == NULL || vece == MO_64)));
 553     if (type != 0) {
 554         TCGv_vec t_vec = tcg_temp_new_vec(type);
 555
 556         if (in_32) {
 557             tcg_gen_dup_i32_vec(vece, t_vec, in_32);
 558         } else if (in_64) {
 559             tcg_gen_dup_i64_vec(vece, t_vec, in_64);
 560         } else {
 561             tcg_gen_dupi_vec(vece, t_vec, in_c);
 562         }
 563         do_dup_store(type, dofs, oprsz, maxsz, t_vec);
 564         return;
 565     }
 566
 567     /* Otherwise, inline with an integer type, unless "large".  */
 568     if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
 569         t_64 = NULL;
 570         t_32 = NULL;
 571
 572         if (in_32) {
 573             /* We are given a 32-bit variable input.  For a 64-bit host,
 574                use a 64-bit operation unless the 32-bit operation would
 575                be simple enough.  */
 576             if (TCG_TARGET_REG_BITS == 64
 577                 && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
 578                 t_64 = tcg_temp_ebb_new_i64();
 579                 tcg_gen_extu_i32_i64(t_64, in_32);
 580                 tcg_gen_dup_i64(vece, t_64, t_64);
 581             } else {
 582                 t_32 = tcg_temp_ebb_new_i32();
 583                 tcg_gen_dup_i32(vece, t_32, in_32);
 584             }
 585         } else if (in_64) {
 586             /* We are given a 64-bit variable input.  */
 587             t_64 = tcg_temp_ebb_new_i64();
 588             tcg_gen_dup_i64(vece, t_64, in_64);
 589         } else {
 590             /* We are given a constant input.  */
 591             /* For 64-bit hosts, use 64-bit constants for "simple" constants
 592                or when we'd need too many 32-bit stores, or when a 64-bit
 593                constant is really required.  */
 594             if (vece == MO_64
 595                 || (TCG_TARGET_REG_BITS == 64
 596                     && (in_c == 0 || in_c == -1
 597                         || !check_size_impl(oprsz, 4)))) {
 598                 t_64 = tcg_constant_i64(in_c);
 599             } else {
 600                 t_32 = tcg_constant_i32(in_c);
 601             }
 602         }
 603
 604         /* Implement inline if we picked an implementation size above.  */
 605         if (t_32) {
 606             for (i = 0; i < oprsz; i += 4) {
 607                 tcg_gen_st_i32(t_32, tcg_env, dofs + i);
 608             }
 609             tcg_temp_free_i32(t_32);
 610             goto done;
 611         }
 612         if (t_64) {
 613             for (i = 0; i < oprsz; i += 8) {
 614                 tcg_gen_st_i64(t_64, tcg_env, dofs + i);
 615             }
 616             tcg_temp_free_i64(t_64);
 617             goto done;
 618         }
 619     }
 620
 621     /* Otherwise implement out of line.  */
 622     t_ptr = tcg_temp_ebb_new_ptr();
 623     tcg_gen_addi_ptr(t_ptr, tcg_env, dofs);
 624
 625     /*
 626      * This may be expand_clr for the tail of an operation, e.g.
 627      * oprsz == 8 && maxsz == 64.  The size of the clear is misaligned
 628      * wrt simd_desc and will assert.  Simply pass all replicated byte
 629      * stores through to memset.
 630      */
 631     if (oprsz == maxsz && vece == MO_8) {
 632         TCGv_ptr t_size = tcg_constant_ptr(oprsz);
 633         TCGv_i32 t_val;
 634
 635         if (in_32) {
 636             t_val = in_32;
 637         } else if (in_64) {
 638             t_val = tcg_temp_ebb_new_i32();
 639             tcg_gen_extrl_i64_i32(t_val, in_64);
 640         } else {
 641             t_val = tcg_constant_i32(in_c);
 642         }
 643         gen_helper_memset(t_ptr, t_ptr, t_val, t_size);
 644
 645         if (in_64) {
 646             tcg_temp_free_i32(t_val);
 647         }
 648         tcg_temp_free_ptr(t_ptr);
 649         return;
 650     }
 651
 652     t_desc = tcg_constant_i32(simd_desc(oprsz, maxsz, 0));
 653
 654     if (vece == MO_64) {
 655         if (in_64) {
 656             gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
 657         } else {
 658             t_64 = tcg_constant_i64(in_c);
 659             gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
 660         }
 661     } else {
 662         typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
 663         static dup_fn * const fns[3] = {
 664             gen_helper_gvec_dup8,
 665             gen_helper_gvec_dup16,
 666             gen_helper_gvec_dup32
 667         };
 668
 669         if (in_32) {
 670             fns[vece](t_ptr, t_desc, in_32);
 671         } else if (in_64) {
 672             t_32 = tcg_temp_ebb_new_i32();
 673             tcg_gen_extrl_i64_i32(t_32, in_64);
 674             fns[vece](t_ptr, t_desc, t_32);
 675             tcg_temp_free_i32(t_32);
 676         } else {
 677             if (vece == MO_8) {
 678                 in_c &= 0xff;
 679             } else if (vece == MO_16) {
 680                 in_c &= 0xffff;
 681             }
 682             t_32 = tcg_constant_i32(in_c);
 683             fns[vece](t_ptr, t_desc, t_32);
 684         }
 685     }
 686
 687     tcg_temp_free_ptr(t_ptr);
 688     return;
 689
 690  done:
 691     if (oprsz < maxsz) {
 692         expand_clr(dofs + oprsz, maxsz - oprsz);
 693     }
 694 }
 695
 696 /* Likewise, but with zero.  */
 697 static void expand_clr(uint32_t dofs, uint32_t maxsz)
 698 {
 699     do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
 700 }
 701
 702 /* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
 703 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 704                          bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
 705 {
 706     TCGv_i32 t0 = tcg_temp_new_i32();
 707     TCGv_i32 t1 = tcg_temp_new_i32();
 708     uint32_t i;
 709
 710     for (i = 0; i < oprsz; i += 4) {
 711         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
 712         if (load_dest) {
 713             tcg_gen_ld_i32(t1, tcg_env, dofs + i);
 714         }
 715         fni(t1, t0);
 716         tcg_gen_st_i32(t1, tcg_env, dofs + i);
 717     }
 718     tcg_temp_free_i32(t0);
 719     tcg_temp_free_i32(t1);
 720 }
 721
 722 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 723                           int32_t c, bool load_dest,
 724                           void (*fni)(TCGv_i32, TCGv_i32, int32_t))
 725 {
 726     TCGv_i32 t0 = tcg_temp_new_i32();
 727     TCGv_i32 t1 = tcg_temp_new_i32();
 728     uint32_t i;
 729
 730     for (i = 0; i < oprsz; i += 4) {
 731         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
 732         if (load_dest) {
 733             tcg_gen_ld_i32(t1, tcg_env, dofs + i);
 734         }
 735         fni(t1, t0, c);
 736         tcg_gen_st_i32(t1, tcg_env, dofs + i);
 737     }
 738     tcg_temp_free_i32(t0);
 739     tcg_temp_free_i32(t1);
 740 }
 741
 742 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 743                           TCGv_i32 c, bool scalar_first,
 744                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
 745 {
 746     TCGv_i32 t0 = tcg_temp_new_i32();
 747     TCGv_i32 t1 = tcg_temp_new_i32();
 748     uint32_t i;
 749
 750     for (i = 0; i < oprsz; i += 4) {
 751         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
 752         if (scalar_first) {
 753             fni(t1, c, t0);
 754         } else {
 755             fni(t1, t0, c);
 756         }
 757         tcg_gen_st_i32(t1, tcg_env, dofs + i);
 758     }
 759     tcg_temp_free_i32(t0);
 760     tcg_temp_free_i32(t1);
 761 }
 762
 763 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
 764 static void expand_3_i32(uint32_t dofs, uint32_t aofs,
 765                          uint32_t bofs, uint32_t oprsz, bool load_dest,
 766                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
 767 {
 768     TCGv_i32 t0 = tcg_temp_new_i32();
 769     TCGv_i32 t1 = tcg_temp_new_i32();
 770     TCGv_i32 t2 = tcg_temp_new_i32();
 771     uint32_t i;
 772
 773     for (i = 0; i < oprsz; i += 4) {
 774         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
 775         tcg_gen_ld_i32(t1, tcg_env, bofs + i);
 776         if (load_dest) {
 777             tcg_gen_ld_i32(t2, tcg_env, dofs + i);
 778         }
 779         fni(t2, t0, t1);
 780         tcg_gen_st_i32(t2, tcg_env, dofs + i);
 781     }
 782     tcg_temp_free_i32(t2);
 783     tcg_temp_free_i32(t1);
 784     tcg_temp_free_i32(t0);
 785 }
 786
 787 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 788                           uint32_t oprsz, int32_t c,
 789                           bool load_dest, bool write_aofs,
 790                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
 791 {
 792     TCGv_i32 t0 = tcg_temp_new_i32();
 793     TCGv_i32 t1 = tcg_temp_new_i32();
 794     TCGv_i32 t2 = tcg_temp_new_i32();
 795     uint32_t i;
 796
 797     for (i = 0; i < oprsz; i += 4) {
 798         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
 799         tcg_gen_ld_i32(t1, tcg_env, bofs + i);
 800         if (load_dest) {
 801             tcg_gen_ld_i32(t2, tcg_env, dofs + i);
 802         }
 803         fni(t2, t0, t1, c);
 804         tcg_gen_st_i32(t2, tcg_env, dofs + i);
 805         if (write_aofs) {
 806             tcg_gen_st_i32(t0, tcg_env, aofs + i);
 807         }
 808     }
 809     tcg_temp_free_i32(t0);
 810     tcg_temp_free_i32(t1);
 811     tcg_temp_free_i32(t2);
 812 }
 813
/* Expand OPSZ bytes worth of four-operand operations using i32 elements.  */
 815 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 816                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
 817                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
 818 {
 819     TCGv_i32 t0 = tcg_temp_new_i32();
 820     TCGv_i32 t1 = tcg_temp_new_i32();
 821     TCGv_i32 t2 = tcg_temp_new_i32();
 822     TCGv_i32 t3 = tcg_temp_new_i32();
 823     uint32_t i;
 824
 825     for (i = 0; i < oprsz; i += 4) {
 826         tcg_gen_ld_i32(t1, tcg_env, aofs + i);
 827         tcg_gen_ld_i32(t2, tcg_env, bofs + i);
 828         tcg_gen_ld_i32(t3, tcg_env, cofs + i);
 829         fni(t0, t1, t2, t3);
 830         tcg_gen_st_i32(t0, tcg_env, dofs + i);
 831         if (write_aofs) {
 832             tcg_gen_st_i32(t1, tcg_env, aofs + i);
 833         }
 834     }
 835     tcg_temp_free_i32(t3);
 836     tcg_temp_free_i32(t2);
 837     tcg_temp_free_i32(t1);
 838     tcg_temp_free_i32(t0);
 839 }
 840
 841 static void expand_4i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 842                           uint32_t cofs, uint32_t oprsz, int32_t c,
 843                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32,
 844                                       int32_t))
 845 {
 846     TCGv_i32 t0 = tcg_temp_new_i32();
 847     TCGv_i32 t1 = tcg_temp_new_i32();
 848     TCGv_i32 t2 = tcg_temp_new_i32();
 849     TCGv_i32 t3 = tcg_temp_new_i32();
 850     uint32_t i;
 851
 852     for (i = 0; i < oprsz; i += 4) {
 853         tcg_gen_ld_i32(t1, tcg_env, aofs + i);
 854         tcg_gen_ld_i32(t2, tcg_env, bofs + i);
 855         tcg_gen_ld_i32(t3, tcg_env, cofs + i);
 856         fni(t0, t1, t2, t3, c);
 857         tcg_gen_st_i32(t0, tcg_env, dofs + i);
 858     }
 859     tcg_temp_free_i32(t3);
 860     tcg_temp_free_i32(t2);
 861     tcg_temp_free_i32(t1);
 862     tcg_temp_free_i32(t0);
 863 }
 864
 865 /* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
 866 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 867                          bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
 868 {
 869     TCGv_i64 t0 = tcg_temp_new_i64();
 870     TCGv_i64 t1 = tcg_temp_new_i64();
 871     uint32_t i;
 872
 873     for (i = 0; i < oprsz; i += 8) {
 874         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
 875         if (load_dest) {
 876             tcg_gen_ld_i64(t1, tcg_env, dofs + i);
 877         }
 878         fni(t1, t0);
 879         tcg_gen_st_i64(t1, tcg_env, dofs + i);
 880     }
 881     tcg_temp_free_i64(t0);
 882     tcg_temp_free_i64(t1);
 883 }
 884
 885 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 886                           int64_t c, bool load_dest,
 887                           void (*fni)(TCGv_i64, TCGv_i64, int64_t))
 888 {
 889     TCGv_i64 t0 = tcg_temp_new_i64();
 890     TCGv_i64 t1 = tcg_temp_new_i64();
 891     uint32_t i;
 892
 893     for (i = 0; i < oprsz; i += 8) {
 894         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
 895         if (load_dest) {
 896             tcg_gen_ld_i64(t1, tcg_env, dofs + i);
 897         }
 898         fni(t1, t0, c);
 899         tcg_gen_st_i64(t1, tcg_env, dofs + i);
 900     }
 901     tcg_temp_free_i64(t0);
 902     tcg_temp_free_i64(t1);
 903 }
 904
 905 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 906                           TCGv_i64 c, bool scalar_first,
 907                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
 908 {
 909     TCGv_i64 t0 = tcg_temp_new_i64();
 910     TCGv_i64 t1 = tcg_temp_new_i64();
 911     uint32_t i;
 912
 913     for (i = 0; i < oprsz; i += 8) {
 914         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
 915         if (scalar_first) {
 916             fni(t1, c, t0);
 917         } else {
 918             fni(t1, t0, c);
 919         }
 920         tcg_gen_st_i64(t1, tcg_env, dofs + i);
 921     }
 922     tcg_temp_free_i64(t0);
 923     tcg_temp_free_i64(t1);
 924 }
 925
 926 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
 927 static void expand_3_i64(uint32_t dofs, uint32_t aofs,
 928                          uint32_t bofs, uint32_t oprsz, bool load_dest,
 929                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
 930 {
 931     TCGv_i64 t0 = tcg_temp_new_i64();
 932     TCGv_i64 t1 = tcg_temp_new_i64();
 933     TCGv_i64 t2 = tcg_temp_new_i64();
 934     uint32_t i;
 935
 936     for (i = 0; i < oprsz; i += 8) {
 937         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
 938         tcg_gen_ld_i64(t1, tcg_env, bofs + i);
 939         if (load_dest) {
 940             tcg_gen_ld_i64(t2, tcg_env, dofs + i);
 941         }
 942         fni(t2, t0, t1);
 943         tcg_gen_st_i64(t2, tcg_env, dofs + i);
 944     }
 945     tcg_temp_free_i64(t2);
 946     tcg_temp_free_i64(t1);
 947     tcg_temp_free_i64(t0);
 948 }
 949
 950 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 951                           uint32_t oprsz, int64_t c,
 952                           bool load_dest, bool write_aofs,
 953                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
 954 {
 955     TCGv_i64 t0 = tcg_temp_new_i64();
 956     TCGv_i64 t1 = tcg_temp_new_i64();
 957     TCGv_i64 t2 = tcg_temp_new_i64();
 958     uint32_t i;
 959
 960     for (i = 0; i < oprsz; i += 8) {
 961         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
 962         tcg_gen_ld_i64(t1, tcg_env, bofs + i);
 963         if (load_dest) {
 964             tcg_gen_ld_i64(t2, tcg_env, dofs + i);
 965         }
 966         fni(t2, t0, t1, c);
 967         tcg_gen_st_i64(t2, tcg_env, dofs + i);
 968         if (write_aofs) {
 969             tcg_gen_st_i64(t0, tcg_env, aofs + i);
 970         }
 971     }
 972     tcg_temp_free_i64(t0);
 973     tcg_temp_free_i64(t1);
 974     tcg_temp_free_i64(t2);
 975 }
 976
 977 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
 978 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 979                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
 980                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
 981 {
 982     TCGv_i64 t0 = tcg_temp_new_i64();
 983     TCGv_i64 t1 = tcg_temp_new_i64();
 984     TCGv_i64 t2 = tcg_temp_new_i64();
 985     TCGv_i64 t3 = tcg_temp_new_i64();
 986     uint32_t i;
 987
 988     for (i = 0; i < oprsz; i += 8) {
 989         tcg_gen_ld_i64(t1, tcg_env, aofs + i);
 990         tcg_gen_ld_i64(t2, tcg_env, bofs + i);
 991         tcg_gen_ld_i64(t3, tcg_env, cofs + i);
 992         fni(t0, t1, t2, t3);
 993         tcg_gen_st_i64(t0, tcg_env, dofs + i);
 994         if (write_aofs) {
 995             tcg_gen_st_i64(t1, tcg_env, aofs + i);
 996         }
 997     }
 998     tcg_temp_free_i64(t3);
 999     tcg_temp_free_i64(t2);
1000     tcg_temp_free_i64(t1);
1001     tcg_temp_free_i64(t0);
1002 }
1003
1004 static void expand_4i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1005                           uint32_t cofs, uint32_t oprsz, int64_t c,
1006                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64,
1007                                       int64_t))
1008 {
1009     TCGv_i64 t0 = tcg_temp_new_i64();
1010     TCGv_i64 t1 = tcg_temp_new_i64();
1011     TCGv_i64 t2 = tcg_temp_new_i64();
1012     TCGv_i64 t3 = tcg_temp_new_i64();
1013     uint32_t i;
1014
1015     for (i = 0; i < oprsz; i += 8) {
1016         tcg_gen_ld_i64(t1, tcg_env, aofs + i);
1017         tcg_gen_ld_i64(t2, tcg_env, bofs + i);
1018         tcg_gen_ld_i64(t3, tcg_env, cofs + i);
1019         fni(t0, t1, t2, t3, c);
1020         tcg_gen_st_i64(t0, tcg_env, dofs + i);
1021     }
1022     tcg_temp_free_i64(t3);
1023     tcg_temp_free_i64(t2);
1024     tcg_temp_free_i64(t1);
1025     tcg_temp_free_i64(t0);
1026 }
1027
1028 /* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
1029 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1030                          uint32_t oprsz, uint32_t tysz, TCGType type,
1031                          bool load_dest,
1032                          void (*fni)(unsigned, TCGv_vec, TCGv_vec))
1033 {
1034     for (uint32_t i = 0; i < oprsz; i += tysz) {
1035         TCGv_vec t0 = tcg_temp_new_vec(type);
1036         TCGv_vec t1 = tcg_temp_new_vec(type);
1037
1038         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1039         if (load_dest) {
1040             tcg_gen_ld_vec(t1, tcg_env, dofs + i);
1041         }
1042         fni(vece, t1, t0);
1043         tcg_gen_st_vec(t1, tcg_env, dofs + i);
1044     }
1045 }
1046
1047 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand
1048    using host vectors.  */
1049 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1050                           uint32_t oprsz, uint32_t tysz, TCGType type,
1051                           int64_t c, bool load_dest,
1052                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
1053 {
1054     for (uint32_t i = 0; i < oprsz; i += tysz) {
1055         TCGv_vec t0 = tcg_temp_new_vec(type);
1056         TCGv_vec t1 = tcg_temp_new_vec(type);
1057
1058         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1059         if (load_dest) {
1060             tcg_gen_ld_vec(t1, tcg_env, dofs + i);
1061         }
1062         fni(vece, t1, t0, c);
1063         tcg_gen_st_vec(t1, tcg_env, dofs + i);
1064     }
1065 }
1066
1067 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1068                           uint32_t oprsz, uint32_t tysz, TCGType type,
1069                           TCGv_vec c, bool scalar_first,
1070                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1071 {
1072     for (uint32_t i = 0; i < oprsz; i += tysz) {
1073         TCGv_vec t0 = tcg_temp_new_vec(type);
1074         TCGv_vec t1 = tcg_temp_new_vec(type);
1075
1076         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1077         if (scalar_first) {
1078             fni(vece, t1, c, t0);
1079         } else {
1080             fni(vece, t1, t0, c);
1081         }
1082         tcg_gen_st_vec(t1, tcg_env, dofs + i);
1083     }
1084 }
1085
1086 /* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
1087 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1088                          uint32_t bofs, uint32_t oprsz,
1089                          uint32_t tysz, TCGType type, bool load_dest,
1090                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1091 {
1092     for (uint32_t i = 0; i < oprsz; i += tysz) {
1093         TCGv_vec t0 = tcg_temp_new_vec(type);
1094         TCGv_vec t1 = tcg_temp_new_vec(type);
1095         TCGv_vec t2 = tcg_temp_new_vec(type);
1096
1097         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1098         tcg_gen_ld_vec(t1, tcg_env, bofs + i);
1099         if (load_dest) {
1100             tcg_gen_ld_vec(t2, tcg_env, dofs + i);
1101         }
1102         fni(vece, t2, t0, t1);
1103         tcg_gen_st_vec(t2, tcg_env, dofs + i);
1104     }
1105 }
1106
1107 /*
1108  * Expand OPSZ bytes worth of three-vector operands and an immediate operand
1109  * using host vectors.
1110  */
1111 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1112                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
1113                           TCGType type, int64_t c,
1114                           bool load_dest, bool write_aofs,
1115                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
1116                                       int64_t))
1117 {
1118     for (uint32_t i = 0; i < oprsz; i += tysz) {
1119         TCGv_vec t0 = tcg_temp_new_vec(type);
1120         TCGv_vec t1 = tcg_temp_new_vec(type);
1121         TCGv_vec t2 = tcg_temp_new_vec(type);
1122
1123         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1124         tcg_gen_ld_vec(t1, tcg_env, bofs + i);
1125         if (load_dest) {
1126             tcg_gen_ld_vec(t2, tcg_env, dofs + i);
1127         }
1128         fni(vece, t2, t0, t1, c);
1129         tcg_gen_st_vec(t2, tcg_env, dofs + i);
1130         if (write_aofs) {
1131             tcg_gen_st_vec(t0, tcg_env, aofs + i);
1132         }
1133     }
1134 }
1135
1136 /* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
1137 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1138                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1139                          uint32_t tysz, TCGType type, bool write_aofs,
1140                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1141                                      TCGv_vec, TCGv_vec))
1142 {
1143     for (uint32_t i = 0; i < oprsz; i += tysz) {
1144         TCGv_vec t0 = tcg_temp_new_vec(type);
1145         TCGv_vec t1 = tcg_temp_new_vec(type);
1146         TCGv_vec t2 = tcg_temp_new_vec(type);
1147         TCGv_vec t3 = tcg_temp_new_vec(type);
1148
1149         tcg_gen_ld_vec(t1, tcg_env, aofs + i);
1150         tcg_gen_ld_vec(t2, tcg_env, bofs + i);
1151         tcg_gen_ld_vec(t3, tcg_env, cofs + i);
1152         fni(vece, t0, t1, t2, t3);
1153         tcg_gen_st_vec(t0, tcg_env, dofs + i);
1154         if (write_aofs) {
1155             tcg_gen_st_vec(t1, tcg_env, aofs + i);
1156         }
1157     }
1158 }
1159
1160 /*
1161  * Expand OPSZ bytes worth of four-vector operands and an immediate operand
1162  * using host vectors.
1163  */
1164 static void expand_4i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1165                           uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1166                           uint32_t tysz, TCGType type, int64_t c,
1167                           void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1168                                      TCGv_vec, TCGv_vec, int64_t))
1169 {
1170     for (uint32_t i = 0; i < oprsz; i += tysz) {
1171         TCGv_vec t0 = tcg_temp_new_vec(type);
1172         TCGv_vec t1 = tcg_temp_new_vec(type);
1173         TCGv_vec t2 = tcg_temp_new_vec(type);
1174         TCGv_vec t3 = tcg_temp_new_vec(type);
1175
1176         tcg_gen_ld_vec(t1, tcg_env, aofs + i);
1177         tcg_gen_ld_vec(t2, tcg_env, bofs + i);
1178         tcg_gen_ld_vec(t3, tcg_env, cofs + i);
1179         fni(vece, t0, t1, t2, t3, c);
1180         tcg_gen_st_vec(t0, tcg_env, dofs + i);
1181     }
1182 }
1183
1184 /* Expand a vector two-operand operation.  */
1185 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
1186                     uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
1187 {
1188     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1189     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1190     TCGType type;
1191     uint32_t some;
1192
1193     check_size_align(oprsz, maxsz, dofs | aofs);
1194     check_overlap_2(dofs, aofs, maxsz);
1195
1196     type = 0;
1197     if (g->fniv) {
1198         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1199     }
1200     switch (type) {
1201     case TCG_TYPE_V256:
1202         /* Recall that ARM SVE allows vector sizes that are not a
1203          * power of 2, but always a multiple of 16.  The intent is
1204          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1205          */
1206         some = QEMU_ALIGN_DOWN(oprsz, 32);
1207         expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1208                      g->load_dest, g->fniv);
1209         if (some == oprsz) {
1210             break;
1211         }
1212         dofs += some;
1213         aofs += some;
1214         oprsz -= some;
1215         maxsz -= some;
1216         /* fallthru */
1217     case TCG_TYPE_V128:
1218         expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1219                      g->load_dest, g->fniv);
1220         break;
1221     case TCG_TYPE_V64:
1222         expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1223                      g->load_dest, g->fniv);
1224         break;
1225
1226     case 0:
1227         if (g->fni8 && check_size_impl(oprsz, 8)) {
1228             expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
1229         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1230             expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4);
1231         } else {
1232             assert(g->fno != NULL);
1233             tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
1234             oprsz = maxsz;
1235         }
1236         break;
1237
1238     default:
1239         g_assert_not_reached();
1240     }
1241     tcg_swap_vecop_list(hold_list);
1242
1243     if (oprsz < maxsz) {
1244         expand_clr(dofs + oprsz, maxsz - oprsz);
1245     }
1246 }
1247
1248 /* Expand a vector operation with two vectors and an immediate.  */
1249 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1250                      uint32_t maxsz, int64_t c, const GVecGen2i *g)
1251 {
1252     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1253     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1254     TCGType type;
1255     uint32_t some;
1256
1257     check_size_align(oprsz, maxsz, dofs | aofs);
1258     check_overlap_2(dofs, aofs, maxsz);
1259
1260     type = 0;
1261     if (g->fniv) {
1262         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1263     }
1264     switch (type) {
1265     case TCG_TYPE_V256:
1266         /* Recall that ARM SVE allows vector sizes that are not a
1267          * power of 2, but always a multiple of 16.  The intent is
1268          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1269          */
1270         some = QEMU_ALIGN_DOWN(oprsz, 32);
1271         expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1272                       c, g->load_dest, g->fniv);
1273         if (some == oprsz) {
1274             break;
1275         }
1276         dofs += some;
1277         aofs += some;
1278         oprsz -= some;
1279         maxsz -= some;
1280         /* fallthru */
1281     case TCG_TYPE_V128:
1282         expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1283                       c, g->load_dest, g->fniv);
1284         break;
1285     case TCG_TYPE_V64:
1286         expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1287                       c, g->load_dest, g->fniv);
1288         break;
1289
1290     case 0:
1291         if (g->fni8 && check_size_impl(oprsz, 8)) {
1292             expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
1293         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1294             expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
1295         } else {
1296             if (g->fno) {
1297                 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
1298             } else {
1299                 TCGv_i64 tcg_c = tcg_constant_i64(c);
1300                 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
1301                                     maxsz, c, g->fnoi);
1302             }
1303             oprsz = maxsz;
1304         }
1305         break;
1306
1307     default:
1308         g_assert_not_reached();
1309     }
1310     tcg_swap_vecop_list(hold_list);
1311
1312     if (oprsz < maxsz) {
1313         expand_clr(dofs + oprsz, maxsz - oprsz);
1314     }
1315 }
1316
1317 /* Expand a vector operation with two vectors and a scalar.  */
1318 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1319                      uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
1320 {
1321     TCGType type;
1322
1323     check_size_align(oprsz, maxsz, dofs | aofs);
1324     check_overlap_2(dofs, aofs, maxsz);
1325
1326     type = 0;
1327     if (g->fniv) {
1328         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1329     }
1330     if (type != 0) {
1331         const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1332         const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1333         TCGv_vec t_vec = tcg_temp_new_vec(type);
1334         uint32_t some;
1335
1336         tcg_gen_dup_i64_vec(g->vece, t_vec, c);
1337
1338         switch (type) {
1339         case TCG_TYPE_V256:
1340             /* Recall that ARM SVE allows vector sizes that are not a
1341              * power of 2, but always a multiple of 16.  The intent is
1342              * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1343              */
1344             some = QEMU_ALIGN_DOWN(oprsz, 32);
1345             expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1346                           t_vec, g->scalar_first, g->fniv);
1347             if (some == oprsz) {
1348                 break;
1349             }
1350             dofs += some;
1351             aofs += some;
1352             oprsz -= some;
1353             maxsz -= some;
1354             /* fallthru */
1355
1356         case TCG_TYPE_V128:
1357             expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1358                           t_vec, g->scalar_first, g->fniv);
1359             break;
1360
1361         case TCG_TYPE_V64:
1362             expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1363                           t_vec, g->scalar_first, g->fniv);
1364             break;
1365
1366         default:
1367             g_assert_not_reached();
1368         }
1369         tcg_temp_free_vec(t_vec);
1370         tcg_swap_vecop_list(hold_list);
1371     } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1372         TCGv_i64 t64 = tcg_temp_new_i64();
1373
1374         tcg_gen_dup_i64(g->vece, t64, c);
1375         expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
1376         tcg_temp_free_i64(t64);
1377     } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1378         TCGv_i32 t32 = tcg_temp_new_i32();
1379
1380         tcg_gen_extrl_i64_i32(t32, c);
1381         tcg_gen_dup_i32(g->vece, t32, t32);
1382         expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
1383         tcg_temp_free_i32(t32);
1384     } else {
1385         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
1386         return;
1387     }
1388
1389     if (oprsz < maxsz) {
1390         expand_clr(dofs + oprsz, maxsz - oprsz);
1391     }
1392 }
1393
1394 /* Expand a vector three-operand operation.  */
1395 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1396                     uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
1397 {
1398     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1399     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1400     TCGType type;
1401     uint32_t some;
1402
1403     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1404     check_overlap_3(dofs, aofs, bofs, maxsz);
1405
1406     type = 0;
1407     if (g->fniv) {
1408         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1409     }
1410     switch (type) {
1411     case TCG_TYPE_V256:
1412         /* Recall that ARM SVE allows vector sizes that are not a
1413          * power of 2, but always a multiple of 16.  The intent is
1414          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1415          */
1416         some = QEMU_ALIGN_DOWN(oprsz, 32);
1417         expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1418                      g->load_dest, g->fniv);
1419         if (some == oprsz) {
1420             break;
1421         }
1422         dofs += some;
1423         aofs += some;
1424         bofs += some;
1425         oprsz -= some;
1426         maxsz -= some;
1427         /* fallthru */
1428     case TCG_TYPE_V128:
1429         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1430                      g->load_dest, g->fniv);
1431         break;
1432     case TCG_TYPE_V64:
1433         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1434                      g->load_dest, g->fniv);
1435         break;
1436
1437     case 0:
1438         if (g->fni8 && check_size_impl(oprsz, 8)) {
1439             expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
1440         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1441             expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
1442         } else {
1443             assert(g->fno != NULL);
1444             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
1445                                maxsz, g->data, g->fno);
1446             oprsz = maxsz;
1447         }
1448         break;
1449
1450     default:
1451         g_assert_not_reached();
1452     }
1453     tcg_swap_vecop_list(hold_list);
1454
1455     if (oprsz < maxsz) {
1456         expand_clr(dofs + oprsz, maxsz - oprsz);
1457     }
1458 }
1459
1460 /* Expand a vector operation with three vectors and an immediate.  */
1461 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1462                      uint32_t oprsz, uint32_t maxsz, int64_t c,
1463                      const GVecGen3i *g)
1464 {
1465     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1466     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1467     TCGType type;
1468     uint32_t some;
1469
1470     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1471     check_overlap_3(dofs, aofs, bofs, maxsz);
1472
1473     type = 0;
1474     if (g->fniv) {
1475         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1476     }
1477     switch (type) {
1478     case TCG_TYPE_V256:
1479         /*
1480          * Recall that ARM SVE allows vector sizes that are not a
1481          * power of 2, but always a multiple of 16.  The intent is
1482          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1483          */
1484         some = QEMU_ALIGN_DOWN(oprsz, 32);
1485         expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1486                       c, g->load_dest, g->write_aofs, g->fniv);
1487         if (some == oprsz) {
1488             break;
1489         }
1490         dofs += some;
1491         aofs += some;
1492         bofs += some;
1493         oprsz -= some;
1494         maxsz -= some;
1495         /* fallthru */
1496     case TCG_TYPE_V128:
1497         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1498                       c, g->load_dest, g->write_aofs, g->fniv);
1499         break;
1500     case TCG_TYPE_V64:
1501         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1502                       c, g->load_dest, g->write_aofs, g->fniv);
1503         break;
1504
1505     case 0:
1506         if (g->fni8 && check_size_impl(oprsz, 8)) {
1507             expand_3i_i64(dofs, aofs, bofs, oprsz, c,
1508                           g->load_dest, g->write_aofs, g->fni8);
1509         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1510             expand_3i_i32(dofs, aofs, bofs, oprsz, c,
1511                           g->load_dest, g->write_aofs, g->fni4);
1512         } else {
1513             assert(g->fno != NULL);
1514             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
1515             oprsz = maxsz;
1516         }
1517         break;
1518
1519     default:
1520         g_assert_not_reached();
1521     }
1522     tcg_swap_vecop_list(hold_list);
1523
1524     if (oprsz < maxsz) {
1525         expand_clr(dofs + oprsz, maxsz - oprsz);
1526     }
1527 }
1528
1529 /* Expand a vector four-operand operation.  */
1530 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1531                     uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
1532 {
1533     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1534     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1535     TCGType type;
1536     uint32_t some;
1537
1538     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1539     check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1540
1541     type = 0;
1542     if (g->fniv) {
1543         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1544     }
1545     switch (type) {
1546     case TCG_TYPE_V256:
1547         /* Recall that ARM SVE allows vector sizes that are not a
1548          * power of 2, but always a multiple of 16.  The intent is
1549          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1550          */
1551         some = QEMU_ALIGN_DOWN(oprsz, 32);
1552         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
1553                      32, TCG_TYPE_V256, g->write_aofs, g->fniv);
1554         if (some == oprsz) {
1555             break;
1556         }
1557         dofs += some;
1558         aofs += some;
1559         bofs += some;
1560         cofs += some;
1561         oprsz -= some;
1562         maxsz -= some;
1563         /* fallthru */
1564     case TCG_TYPE_V128:
1565         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1566                      16, TCG_TYPE_V128, g->write_aofs, g->fniv);
1567         break;
1568     case TCG_TYPE_V64:
1569         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1570                      8, TCG_TYPE_V64, g->write_aofs, g->fniv);
1571         break;
1572
1573     case 0:
1574         if (g->fni8 && check_size_impl(oprsz, 8)) {
1575             expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
1576                          g->write_aofs, g->fni8);
1577         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1578             expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
1579                          g->write_aofs, g->fni4);
1580         } else {
1581             assert(g->fno != NULL);
1582             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1583                                oprsz, maxsz, g->data, g->fno);
1584             oprsz = maxsz;
1585         }
1586         break;
1587
1588     default:
1589         g_assert_not_reached();
1590     }
1591     tcg_swap_vecop_list(hold_list);
1592
1593     if (oprsz < maxsz) {
1594         expand_clr(dofs + oprsz, maxsz - oprsz);
1595     }
1596 }
1597
1598 /* Expand a vector four-operand operation.  */
1599 void tcg_gen_gvec_4i(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1600                      uint32_t oprsz, uint32_t maxsz, int64_t c,
1601                      const GVecGen4i *g)
1602 {
1603     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1604     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1605     TCGType type;
1606     uint32_t some;
1607
1608     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1609     check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1610
1611     type = 0;
1612     if (g->fniv) {
1613         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1614     }
1615     switch (type) {
1616     case TCG_TYPE_V256:
1617         /*
1618          * Recall that ARM SVE allows vector sizes that are not a
1619          * power of 2, but always a multiple of 16.  The intent is
1620          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1621          */
1622         some = QEMU_ALIGN_DOWN(oprsz, 32);
1623         expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, some,
1624                       32, TCG_TYPE_V256, c, g->fniv);
1625         if (some == oprsz) {
1626             break;
1627         }
1628         dofs += some;
1629         aofs += some;
1630         bofs += some;
1631         cofs += some;
1632         oprsz -= some;
1633         maxsz -= some;
1634         /* fallthru */
1635     case TCG_TYPE_V128:
1636         expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1637                        16, TCG_TYPE_V128, c, g->fniv);
1638         break;
1639     case TCG_TYPE_V64:
1640         expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1641                       8, TCG_TYPE_V64, c, g->fniv);
1642         break;
1643
1644     case 0:
1645         if (g->fni8 && check_size_impl(oprsz, 8)) {
1646             expand_4i_i64(dofs, aofs, bofs, cofs, oprsz, c, g->fni8);
1647         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1648             expand_4i_i32(dofs, aofs, bofs, cofs, oprsz, c, g->fni4);
1649         } else {
1650             assert(g->fno != NULL);
1651             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1652                                oprsz, maxsz, c, g->fno);
1653             oprsz = maxsz;
1654         }
1655         break;
1656
1657     default:
1658         g_assert_not_reached();
1659     }
1660     tcg_swap_vecop_list(hold_list);
1661
1662     if (oprsz < maxsz) {
1663         expand_clr(dofs + oprsz, maxsz - oprsz);
1664     }
1665 }
1666
1667 /*
1668  * Expand specific vector operations.
1669  */
1670
/*
 * Host-vector callback used as .fniv by tcg_gen_gvec_mov: emit a vector
 * move with A as the first and B as the second operand of
 * tcg_gen_mov_vec().  VECE is accepted only to match the callback
 * signature and is not used.
 */
static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mov_vec(a, b);
}
1675
1676 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1677                       uint32_t oprsz, uint32_t maxsz)
1678 {
1679     static const GVecGen2 g = {
1680         .fni8 = tcg_gen_mov_i64,
1681         .fniv = vec_mov2,
1682         .fno = gen_helper_gvec_mov,
1683         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1684     };
1685     if (dofs != aofs) {
1686         tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1687     } else {
1688         check_size_align(oprsz, maxsz, dofs);
1689         if (oprsz < maxsz) {
1690             expand_clr(dofs + oprsz, maxsz - oprsz);
1691         }
1692     }
1693 }
1694
/*
 * Duplicate the 32-bit value IN across the vector at DOFS by way of
 * do_dup().  The sizes and DOFS are validated with check_size_align(),
 * and VECE must not exceed MO_32 (checked by tcg_debug_assert only).
 */
void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i32 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_32);
    /*
     * NOTE(review): the trailing arguments are presumably do_dup()'s
     * i32 source, i64 source and constant slots - confirm against its
     * definition.  Only the i32 source is supplied here.
     */
    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
}
1702
/*
 * Duplicate the 64-bit value IN across the vector at DOFS by way of
 * do_dup().  The sizes and DOFS are validated with check_size_align(),
 * and VECE must not exceed MO_64 (checked by tcg_debug_assert only).
 */
void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i64 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_64);
    /*
     * NOTE(review): the trailing arguments are presumably do_dup()'s
     * i32 source, i64 source and constant slots - confirm against its
     * definition.  Only the i64 source is supplied here.
     */
    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
}
1710
/*
 * Duplicate an element of size VECE, loaded from env offset AOFS,
 * across the vector at env offset DOFS.
 *
 * VECE up to MO_64 is handled either through a host vector temp (when
 * choose_vector_type() returns a non-zero type) or by loading the
 * element into an i32/i64 temp and deferring to do_dup().  VECE values
 * 4 and 5 denote 128-bit and 256-bit elements; these are copied in
 * 16- or 32-byte chunks using V256, V128 or groups of i64 temps,
 * depending on what the host provides.  Any other VECE aborts.
 *
 * In the 128/256-bit cases the bytes from OPRSZ up to MAXSZ are handed
 * to expand_clr() here; in the smaller cases MAXSZ is forwarded to
 * do_dup()/do_dup_store().
 */
void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t maxsz)
{
    check_size_align(oprsz, maxsz, dofs);
    if (vece <= MO_64) {
        /* Scalar-sized element: use a host vector if one was selected.  */
        TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
        if (type != 0) {
            TCGv_vec t_vec = tcg_temp_new_vec(type);
            tcg_gen_dup_mem_vec(vece, t_vec, tcg_env, aofs);
            do_dup_store(type, dofs, oprsz, maxsz, t_vec);
        } else if (vece <= MO_32) {
            /* No vector type: load the element into an i32 temp.  */
            TCGv_i32 in = tcg_temp_ebb_new_i32();
            switch (vece) {
            case MO_8:
                tcg_gen_ld8u_i32(in, tcg_env, aofs);
                break;
            case MO_16:
                tcg_gen_ld16u_i32(in, tcg_env, aofs);
                break;
            default:
                tcg_gen_ld_i32(in, tcg_env, aofs);
                break;
            }
            do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
            tcg_temp_free_i32(in);
        } else {
            /* MO_64 without a vector type: load into an i64 temp.  */
            TCGv_i64 in = tcg_temp_ebb_new_i64();
            tcg_gen_ld_i64(in, tcg_env, aofs);
            do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
            tcg_temp_free_i64(in);
        }
    } else if (vece == 4) {
        /* 128-bit duplicate.  */
        int i;

        tcg_debug_assert(oprsz >= 16);
        if (TCG_TARGET_HAS_v128) {
            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);

            tcg_gen_ld_vec(in, tcg_env, aofs);
            /*
             * When source and destination coincide, the first 16 bytes
             * already hold the element, so start storing at offset 16.
             */
            for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
                tcg_gen_st_vec(in, tcg_env, dofs + i);
            }
        } else {
            /* No V128: carry the element as two 64-bit halves.  */
            TCGv_i64 in0 = tcg_temp_ebb_new_i64();
            TCGv_i64 in1 = tcg_temp_ebb_new_i64();

            tcg_gen_ld_i64(in0, tcg_env, aofs);
            tcg_gen_ld_i64(in1, tcg_env, aofs + 8);
            for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
                tcg_gen_st_i64(in0, tcg_env, dofs + i);
                tcg_gen_st_i64(in1, tcg_env, dofs + i + 8);
            }
            tcg_temp_free_i64(in0);
            tcg_temp_free_i64(in1);
        }
        /* Handle the tail between the operation size and the max size.  */
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    } else if (vece == 5) {
        /* 256-bit duplicate.  */
        int i;

        tcg_debug_assert(oprsz >= 32);
        tcg_debug_assert(oprsz % 32 == 0);
        if (TCG_TARGET_HAS_v256) {
            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);

            tcg_gen_ld_vec(in, tcg_env, aofs);
            /* As above: skip the first chunk when storing in place.  */
            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
                tcg_gen_st_vec(in, tcg_env, dofs + i);
            }
        } else if (TCG_TARGET_HAS_v128) {
            /* No V256: carry the element as two 128-bit halves.  */
            TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
            TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);

            tcg_gen_ld_vec(in0, tcg_env, aofs);
            tcg_gen_ld_vec(in1, tcg_env, aofs + 16);
            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
                tcg_gen_st_vec(in0, tcg_env, dofs + i);
                tcg_gen_st_vec(in1, tcg_env, dofs + i + 16);
            }
        } else {
            /* No host vectors: carry the element as four i64 temps.  */
            TCGv_i64 in[4];
            int j;

            for (j = 0; j < 4; ++j) {
                in[j] = tcg_temp_ebb_new_i64();
                tcg_gen_ld_i64(in[j], tcg_env, aofs + j * 8);
            }
            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
                for (j = 0; j < 4; ++j) {
                    tcg_gen_st_i64(in[j], tcg_env, dofs + i + j * 8);
                }
            }
            for (j = 0; j < 4; ++j) {
                tcg_temp_free_i64(in[j]);
            }
        }
        /* Handle the tail between the operation size and the max size.  */
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    } else {
        g_assert_not_reached();
    }
}
1817
/*
 * Expand a duplicate of the immediate X, treated as an element of size
 * VECE, into the vector at env offset DOFS.
 *
 * OPRSZ/MAXSZ/DOFS are validated with check_size_align(); the expansion
 * itself is done by do_dup().
 */
void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, uint64_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    /* Only the immediate is supplied: no i32 or i64 source temp.  */
    do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x);
}
1824
1825 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1826                       uint32_t oprsz, uint32_t maxsz)
1827 {
1828     static const GVecGen2 g = {
1829         .fni8 = tcg_gen_not_i64,
1830         .fniv = tcg_gen_not_vec,
1831         .fno = gen_helper_gvec_not,
1832         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1833     };
1834     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1835 }
1836
/* Perform a vector addition using normal addition and a mask.  The mask
   should be the sign bit of each lane.  This 6-operation form is more
   efficient than separate additions when there are 4 or more lanes in
   the 64-bit operation.

   The value produced is
       d = ((a & ~m) + (b & ~m)) ^ ((a ^ b) & m)
   With the sign bit of every lane removed from both inputs, the 64-bit
   add cannot carry out of one lane into the next; the sign bit of each
   lane is then completed by xor-ing in the carry-less sum of the two
   original sign bits.  All reads of A and B are emitted before the
   first write to D.  */
static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_ebb_new_i64();
    TCGv_i64 t2 = tcg_temp_ebb_new_i64();
    TCGv_i64 t3 = tcg_temp_ebb_new_i64();

    /* t1/t2: inputs with the lane sign bits cleared.  */
    tcg_gen_andc_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    /* t3: carry-less sum; only its sign-bit positions are kept below.  */
    tcg_gen_xor_i64(t3, a, b);
    tcg_gen_add_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    /* Fold the sign-bit sum into the carry that reached each sign bit.  */
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}
1858
1859 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1860 {
1861     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
1862     gen_addv_mask(d, a, b, m);
1863 }
1864
/*
 * Lane-wise addition of 8-bit elements packed in 32-bit values.
 * Same six-operation sequence as gen_addv_mask(), on i32 temps:
 *     d = ((a & ~m) + (b & ~m)) ^ ((a ^ b) & m)
 * where m is the sign bit of each 8-bit lane.  All reads of A and B
 * are emitted before the first write to D.
 */
void tcg_gen_vec_add8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    /* Per-lane sign-bit mask, truncated to 32 bits.  */
    TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
    TCGv_i32 t1 = tcg_temp_ebb_new_i32();
    TCGv_i32 t2 = tcg_temp_ebb_new_i32();
    TCGv_i32 t3 = tcg_temp_ebb_new_i32();

    /* Strip the sign bits so the add cannot carry between lanes.  */
    tcg_gen_andc_i32(t1, a, m);
    tcg_gen_andc_i32(t2, b, m);
    tcg_gen_xor_i32(t3, a, b);
    tcg_gen_add_i32(d, t1, t2);
    /* Restore each lane's sign bit from the carry-less sum.  */
    tcg_gen_and_i32(t3, t3, m);
    tcg_gen_xor_i32(d, d, t3);

    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t3);
}
1883
1884 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1885 {
1886     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
1887     gen_addv_mask(d, a, b, m);
1888 }
1889
/*
 * Lane-wise addition of the two 16-bit elements packed in 32-bit values.
 *
 * The low lane is taken from the plain sum a + b.  The high lane is
 * taken from (a & ~0xffff) + b: with the low half of A cleared, the low
 * halves cannot produce a carry into the high lane.  The two results
 * are merged with a deposit.  A and B are fully read before D is
 * written.
 */
void tcg_gen_vec_add16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t1 = tcg_temp_ebb_new_i32();
    TCGv_i32 t2 = tcg_temp_ebb_new_i32();

    tcg_gen_andi_i32(t1, a, ~0xffff);
    /* t2: correct in bits [15:0].  */
    tcg_gen_add_i32(t2, a, b);
    /* t1: correct in bits [31:16].  */
    tcg_gen_add_i32(t1, t1, b);
    /* d = t1 with its low 16 bits replaced by those of t2.  */
    tcg_gen_deposit_i32(d, t1, t2, 0, 16);

    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t2);
}
1903
/*
 * Lane-wise addition of the two 32-bit elements packed in 64-bit values.
 *
 * The low lane is taken from the plain sum a + b.  The high lane is
 * taken from (a & ~0xffffffff) + b: with the low half of A cleared, the
 * low halves cannot produce a carry into the high lane.  The two
 * results are merged with a deposit.  A and B are fully read before D
 * is written.
 */
void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_ebb_new_i64();
    TCGv_i64 t2 = tcg_temp_ebb_new_i64();

    tcg_gen_andi_i64(t1, a, ~0xffffffffull);
    /* t2: correct in bits [31:0].  */
    tcg_gen_add_i64(t2, a, b);
    /* t1: correct in bits [63:32].  */
    tcg_gen_add_i64(t1, t1, b);
    /* d = t1 with its low 32 bits replaced by those of t2.  */
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}
1917
1918 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1919
1920 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1921                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1922 {
1923     static const GVecGen3 g[4] = {
1924         { .fni8 = tcg_gen_vec_add8_i64,
1925           .fniv = tcg_gen_add_vec,
1926           .fno = gen_helper_gvec_add8,
1927           .opt_opc = vecop_list_add,
1928           .vece = MO_8 },
1929         { .fni8 = tcg_gen_vec_add16_i64,
1930           .fniv = tcg_gen_add_vec,
1931           .fno = gen_helper_gvec_add16,
1932           .opt_opc = vecop_list_add,
1933           .vece = MO_16 },
1934         { .fni4 = tcg_gen_add_i32,
1935           .fniv = tcg_gen_add_vec,
1936           .fno = gen_helper_gvec_add32,
1937           .opt_opc = vecop_list_add,
1938           .vece = MO_32 },
1939         { .fni8 = tcg_gen_add_i64,
1940           .fniv = tcg_gen_add_vec,
1941           .fno = gen_helper_gvec_add64,
1942           .opt_opc = vecop_list_add,
1943           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1944           .vece = MO_64 },
1945     };
1946
1947     tcg_debug_assert(vece <= MO_64);
1948     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1949 }
1950
1951 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1952                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1953 {
1954     static const GVecGen2s g[4] = {
1955         { .fni8 = tcg_gen_vec_add8_i64,
1956           .fniv = tcg_gen_add_vec,
1957           .fno = gen_helper_gvec_adds8,
1958           .opt_opc = vecop_list_add,
1959           .vece = MO_8 },
1960         { .fni8 = tcg_gen_vec_add16_i64,
1961           .fniv = tcg_gen_add_vec,
1962           .fno = gen_helper_gvec_adds16,
1963           .opt_opc = vecop_list_add,
1964           .vece = MO_16 },
1965         { .fni4 = tcg_gen_add_i32,
1966           .fniv = tcg_gen_add_vec,
1967           .fno = gen_helper_gvec_adds32,
1968           .opt_opc = vecop_list_add,
1969           .vece = MO_32 },
1970         { .fni8 = tcg_gen_add_i64,
1971           .fniv = tcg_gen_add_vec,
1972           .fno = gen_helper_gvec_adds64,
1973           .opt_opc = vecop_list_add,
1974           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1975           .vece = MO_64 },
1976     };
1977
1978     tcg_debug_assert(vece <= MO_64);
1979     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1980 }
1981
/*
 * Immediate form of tcg_gen_gvec_adds(): C is wrapped in a constant
 * i64 temp and passed on as the scalar operand.
 */
void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_adds(vece, dofs, aofs, tcg_constant_i64(c), oprsz, maxsz);
}
1988
1989 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
1990
1991 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
1992                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1993 {
1994     static const GVecGen2s g[4] = {
1995         { .fni8 = tcg_gen_vec_sub8_i64,
1996           .fniv = tcg_gen_sub_vec,
1997           .fno = gen_helper_gvec_subs8,
1998           .opt_opc = vecop_list_sub,
1999           .vece = MO_8 },
2000         { .fni8 = tcg_gen_vec_sub16_i64,
2001           .fniv = tcg_gen_sub_vec,
2002           .fno = gen_helper_gvec_subs16,
2003           .opt_opc = vecop_list_sub,
2004           .vece = MO_16 },
2005         { .fni4 = tcg_gen_sub_i32,
2006           .fniv = tcg_gen_sub_vec,
2007           .fno = gen_helper_gvec_subs32,
2008           .opt_opc = vecop_list_sub,
2009           .vece = MO_32 },
2010         { .fni8 = tcg_gen_sub_i64,
2011           .fniv = tcg_gen_sub_vec,
2012           .fno = gen_helper_gvec_subs64,
2013           .opt_opc = vecop_list_sub,
2014           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2015           .vece = MO_64 },
2016     };
2017
2018     tcg_debug_assert(vece <= MO_64);
2019     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2020 }
2021
/* Perform a vector subtraction using normal subtraction and a mask.
   Compare gen_addv_mask above.

   The value produced is
       d = ((a | m) - (b & ~m)) ^ (~(a ^ b) & m)
   where M is the mask passed by the caller (the callers in this file
   pass the sign bit of each lane).  With that bit forced to 1 in the
   minuend and to 0 in the subtrahend, the 64-bit subtract cannot borrow
   from the next lane; the final xor then corrects the masked bit of
   each lane.  All reads of A and B are emitted before the first write
   to D.  */
static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_ebb_new_i64();
    TCGv_i64 t2 = tcg_temp_ebb_new_i64();
    TCGv_i64 t3 = tcg_temp_ebb_new_i64();

    /* t1: minuend with masked bits set; t2: subtrahend with them clear.  */
    tcg_gen_or_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    /* t3: ~(a ^ b); only the masked positions are kept below.  */
    tcg_gen_eqv_i64(t3, a, b);
    tcg_gen_sub_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}
2041
2042 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2043 {
2044     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2045     gen_subv_mask(d, a, b, m);
2046 }
2047
/*
 * Lane-wise subtraction of 8-bit elements packed in 32-bit values.
 * Same six-operation sequence as gen_subv_mask(), on i32 temps:
 *     d = ((a | m) - (b & ~m)) ^ (~(a ^ b) & m)
 * where m is the sign bit of each 8-bit lane.  All reads of A and B
 * are emitted before the first write to D.
 */
void tcg_gen_vec_sub8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    /* Per-lane sign-bit mask, truncated to 32 bits.  */
    TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
    TCGv_i32 t1 = tcg_temp_ebb_new_i32();
    TCGv_i32 t2 = tcg_temp_ebb_new_i32();
    TCGv_i32 t3 = tcg_temp_ebb_new_i32();

    /* Force the sign bits so the subtract cannot borrow across lanes.  */
    tcg_gen_or_i32(t1, a, m);
    tcg_gen_andc_i32(t2, b, m);
    tcg_gen_eqv_i32(t3, a, b);
    tcg_gen_sub_i32(d, t1, t2);
    /* Correct each lane's sign bit.  */
    tcg_gen_and_i32(t3, t3, m);
    tcg_gen_xor_i32(d, d, t3);

    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t3);
}
2066
2067 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2068 {
2069     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2070     gen_subv_mask(d, a, b, m);
2071 }
2072
/*
 * Lane-wise subtraction of the two 16-bit elements packed in 32-bit
 * values.
 *
 * The low lane is taken from the plain difference a - b.  The high lane
 * is taken from a - (b & ~0xffff): with the low half of the subtrahend
 * cleared, the low halves cannot produce a borrow from the high lane.
 * The two results are merged with a deposit.  A and B are fully read
 * before D is written.
 */
void tcg_gen_vec_sub16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t1 = tcg_temp_ebb_new_i32();
    TCGv_i32 t2 = tcg_temp_ebb_new_i32();

    tcg_gen_andi_i32(t1, b, ~0xffff);
    /* t2: correct in bits [15:0].  */
    tcg_gen_sub_i32(t2, a, b);
    /* t1: correct in bits [31:16].  */
    tcg_gen_sub_i32(t1, a, t1);
    /* d = t1 with its low 16 bits replaced by those of t2.  */
    tcg_gen_deposit_i32(d, t1, t2, 0, 16);

    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t2);
}
2086
/*
 * Lane-wise subtraction of the two 32-bit elements packed in 64-bit
 * values.
 *
 * The low lane is taken from the plain difference a - b.  The high lane
 * is taken from a - (b & ~0xffffffff): with the low half of the
 * subtrahend cleared, the low halves cannot produce a borrow from the
 * high lane.  The two results are merged with a deposit.  A and B are
 * fully read before D is written.
 */
void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_ebb_new_i64();
    TCGv_i64 t2 = tcg_temp_ebb_new_i64();

    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
    /* t2: correct in bits [31:0].  */
    tcg_gen_sub_i64(t2, a, b);
    /* t1: correct in bits [63:32].  */
    tcg_gen_sub_i64(t1, a, t1);
    /* d = t1 with its low 32 bits replaced by those of t2.  */
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}
2100
2101 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
2102                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2103 {
2104     static const GVecGen3 g[4] = {
2105         { .fni8 = tcg_gen_vec_sub8_i64,
2106           .fniv = tcg_gen_sub_vec,
2107           .fno = gen_helper_gvec_sub8,
2108           .opt_opc = vecop_list_sub,
2109           .vece = MO_8 },
2110         { .fni8 = tcg_gen_vec_sub16_i64,
2111           .fniv = tcg_gen_sub_vec,
2112           .fno = gen_helper_gvec_sub16,
2113           .opt_opc = vecop_list_sub,
2114           .vece = MO_16 },
2115         { .fni4 = tcg_gen_sub_i32,
2116           .fniv = tcg_gen_sub_vec,
2117           .fno = gen_helper_gvec_sub32,
2118           .opt_opc = vecop_list_sub,
2119           .vece = MO_32 },
2120         { .fni8 = tcg_gen_sub_i64,
2121           .fniv = tcg_gen_sub_vec,
2122           .fno = gen_helper_gvec_sub64,
2123           .opt_opc = vecop_list_sub,
2124           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2125           .vece = MO_64 },
2126     };
2127
2128     tcg_debug_assert(vece <= MO_64);
2129     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2130 }
2131
2132 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
2133
2134 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
2135                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2136 {
2137     static const GVecGen3 g[4] = {
2138         { .fniv = tcg_gen_mul_vec,
2139           .fno = gen_helper_gvec_mul8,
2140           .opt_opc = vecop_list_mul,
2141           .vece = MO_8 },
2142         { .fniv = tcg_gen_mul_vec,
2143           .fno = gen_helper_gvec_mul16,
2144           .opt_opc = vecop_list_mul,
2145           .vece = MO_16 },
2146         { .fni4 = tcg_gen_mul_i32,
2147           .fniv = tcg_gen_mul_vec,
2148           .fno = gen_helper_gvec_mul32,
2149           .opt_opc = vecop_list_mul,
2150           .vece = MO_32 },
2151         { .fni8 = tcg_gen_mul_i64,
2152           .fniv = tcg_gen_mul_vec,
2153           .fno = gen_helper_gvec_mul64,
2154           .opt_opc = vecop_list_mul,
2155           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2156           .vece = MO_64 },
2157     };
2158
2159     tcg_debug_assert(vece <= MO_64);
2160     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2161 }
2162
2163 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
2164                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2165 {
2166     static const GVecGen2s g[4] = {
2167         { .fniv = tcg_gen_mul_vec,
2168           .fno = gen_helper_gvec_muls8,
2169           .opt_opc = vecop_list_mul,
2170           .vece = MO_8 },
2171         { .fniv = tcg_gen_mul_vec,
2172           .fno = gen_helper_gvec_muls16,
2173           .opt_opc = vecop_list_mul,
2174           .vece = MO_16 },
2175         { .fni4 = tcg_gen_mul_i32,
2176           .fniv = tcg_gen_mul_vec,
2177           .fno = gen_helper_gvec_muls32,
2178           .opt_opc = vecop_list_mul,
2179           .vece = MO_32 },
2180         { .fni8 = tcg_gen_mul_i64,
2181           .fniv = tcg_gen_mul_vec,
2182           .fno = gen_helper_gvec_muls64,
2183           .opt_opc = vecop_list_mul,
2184           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2185           .vece = MO_64 },
2186     };
2187
2188     tcg_debug_assert(vece <= MO_64);
2189     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2190 }
2191
/*
 * Immediate form of tcg_gen_gvec_muls(): C is wrapped in a constant
 * i64 temp and passed on as the scalar operand.
 */
void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_muls(vece, dofs, aofs, tcg_constant_i64(c), oprsz, maxsz);
}
2198
2199 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2200                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2201 {
2202     static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
2203     static const GVecGen3 g[4] = {
2204         { .fniv = tcg_gen_ssadd_vec,
2205           .fno = gen_helper_gvec_ssadd8,
2206           .opt_opc = vecop_list,
2207           .vece = MO_8 },
2208         { .fniv = tcg_gen_ssadd_vec,
2209           .fno = gen_helper_gvec_ssadd16,
2210           .opt_opc = vecop_list,
2211           .vece = MO_16 },
2212         { .fniv = tcg_gen_ssadd_vec,
2213           .fno = gen_helper_gvec_ssadd32,
2214           .opt_opc = vecop_list,
2215           .vece = MO_32 },
2216         { .fniv = tcg_gen_ssadd_vec,
2217           .fno = gen_helper_gvec_ssadd64,
2218           .opt_opc = vecop_list,
2219           .vece = MO_64 },
2220     };
2221     tcg_debug_assert(vece <= MO_64);
2222     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2223 }
2224
2225 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
2226                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2227 {
2228     static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
2229     static const GVecGen3 g[4] = {
2230         { .fniv = tcg_gen_sssub_vec,
2231           .fno = gen_helper_gvec_sssub8,
2232           .opt_opc = vecop_list,
2233           .vece = MO_8 },
2234         { .fniv = tcg_gen_sssub_vec,
2235           .fno = gen_helper_gvec_sssub16,
2236           .opt_opc = vecop_list,
2237           .vece = MO_16 },
2238         { .fniv = tcg_gen_sssub_vec,
2239           .fno = gen_helper_gvec_sssub32,
2240           .opt_opc = vecop_list,
2241           .vece = MO_32 },
2242         { .fniv = tcg_gen_sssub_vec,
2243           .fno = gen_helper_gvec_sssub64,
2244           .opt_opc = vecop_list,
2245           .vece = MO_64 },
2246     };
2247     tcg_debug_assert(vece <= MO_64);
2248     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2249 }
2250
/*
 * 32-bit unsigned saturating add: d = a + b, replaced by all-ones when
 * the sum wrapped (detected as sum <u a).
 *
 * NOTE(review): A is read again after D has been written, so D must not
 * be the same temp as A; callers are outside this block - confirm.
 */
static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 max = tcg_constant_i32(-1);
    tcg_gen_add_i32(d, a, b);
    /* d = (d <u a) ? max : d  */
    tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
}
2257
/*
 * 64-bit unsigned saturating add: d = a + b, replaced by all-ones when
 * the sum wrapped (detected as sum <u a).
 *
 * NOTE(review): A is read again after D has been written, so D must not
 * be the same temp as A; callers are outside this block - confirm.
 */
static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 max = tcg_constant_i64(-1);
    tcg_gen_add_i64(d, a, b);
    /* d = (d <u a) ? max : d  */
    tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
}
2264
2265 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2266                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2267 {
2268     static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
2269     static const GVecGen3 g[4] = {
2270         { .fniv = tcg_gen_usadd_vec,
2271           .fno = gen_helper_gvec_usadd8,
2272           .opt_opc = vecop_list,
2273           .vece = MO_8 },
2274         { .fniv = tcg_gen_usadd_vec,
2275           .fno = gen_helper_gvec_usadd16,
2276           .opt_opc = vecop_list,
2277           .vece = MO_16 },
2278         { .fni4 = tcg_gen_usadd_i32,
2279           .fniv = tcg_gen_usadd_vec,
2280           .fno = gen_helper_gvec_usadd32,
2281           .opt_opc = vecop_list,
2282           .vece = MO_32 },
2283         { .fni8 = tcg_gen_usadd_i64,
2284           .fniv = tcg_gen_usadd_vec,
2285           .fno = gen_helper_gvec_usadd64,
2286           .opt_opc = vecop_list,
2287           .vece = MO_64 }
2288     };
2289     tcg_debug_assert(vece <= MO_64);
2290     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2291 }
2292
/*
 * 32-bit unsigned saturating subtract: d = a - b, replaced by zero when
 * a <u b.
 *
 * NOTE(review): A and B are read again after D has been written, so D
 * must not be the same temp as A or B; callers are outside this block -
 * confirm.
 */
static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 min = tcg_constant_i32(0);
    tcg_gen_sub_i32(d, a, b);
    /* d = (a <u b) ? min : d  */
    tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
}
2299
/*
 * 64-bit unsigned saturating subtract: d = a - b, replaced by zero when
 * a <u b.
 *
 * NOTE(review): A and B are read again after D has been written, so D
 * must not be the same temp as A or B; callers are outside this block -
 * confirm.
 */
static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 min = tcg_constant_i64(0);
    tcg_gen_sub_i64(d, a, b);
    /* d = (a <u b) ? min : d  */
    tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
}
2306
2307 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
2308                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2309 {
2310     static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
2311     static const GVecGen3 g[4] = {
2312         { .fniv = tcg_gen_ussub_vec,
2313           .fno = gen_helper_gvec_ussub8,
2314           .opt_opc = vecop_list,
2315           .vece = MO_8 },
2316         { .fniv = tcg_gen_ussub_vec,
2317           .fno = gen_helper_gvec_ussub16,
2318           .opt_opc = vecop_list,
2319           .vece = MO_16 },
2320         { .fni4 = tcg_gen_ussub_i32,
2321           .fniv = tcg_gen_ussub_vec,
2322           .fno = gen_helper_gvec_ussub32,
2323           .opt_opc = vecop_list,
2324           .vece = MO_32 },
2325         { .fni8 = tcg_gen_ussub_i64,
2326           .fniv = tcg_gen_ussub_vec,
2327           .fno = gen_helper_gvec_ussub64,
2328           .opt_opc = vecop_list,
2329           .vece = MO_64 }
2330     };
2331     tcg_debug_assert(vece <= MO_64);
2332     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2333 }
2334
2335 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
2336                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2337 {
2338     static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
2339     static const GVecGen3 g[4] = {
2340         { .fniv = tcg_gen_smin_vec,
2341           .fno = gen_helper_gvec_smin8,
2342           .opt_opc = vecop_list,
2343           .vece = MO_8 },
2344         { .fniv = tcg_gen_smin_vec,
2345           .fno = gen_helper_gvec_smin16,
2346           .opt_opc = vecop_list,
2347           .vece = MO_16 },
2348         { .fni4 = tcg_gen_smin_i32,
2349           .fniv = tcg_gen_smin_vec,
2350           .fno = gen_helper_gvec_smin32,
2351           .opt_opc = vecop_list,
2352           .vece = MO_32 },
2353         { .fni8 = tcg_gen_smin_i64,
2354           .fniv = tcg_gen_smin_vec,
2355           .fno = gen_helper_gvec_smin64,
2356           .opt_opc = vecop_list,
2357           .vece = MO_64 }
2358     };
2359     tcg_debug_assert(vece <= MO_64);
2360     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2361 }
2362
2363 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2364                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2365 {
2366     static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2367     static const GVecGen3 g[4] = {
2368         { .fniv = tcg_gen_umin_vec,
2369           .fno = gen_helper_gvec_umin8,
2370           .opt_opc = vecop_list,
2371           .vece = MO_8 },
2372         { .fniv = tcg_gen_umin_vec,
2373           .fno = gen_helper_gvec_umin16,
2374           .opt_opc = vecop_list,
2375           .vece = MO_16 },
2376         { .fni4 = tcg_gen_umin_i32,
2377           .fniv = tcg_gen_umin_vec,
2378           .fno = gen_helper_gvec_umin32,
2379           .opt_opc = vecop_list,
2380           .vece = MO_32 },
2381         { .fni8 = tcg_gen_umin_i64,
2382           .fniv = tcg_gen_umin_vec,
2383           .fno = gen_helper_gvec_umin64,
2384           .opt_opc = vecop_list,
2385           .vece = MO_64 }
2386     };
2387     tcg_debug_assert(vece <= MO_64);
2388     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2389 }
2390
2391 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2392                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2393 {
2394     static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2395     static const GVecGen3 g[4] = {
2396         { .fniv = tcg_gen_smax_vec,
2397           .fno = gen_helper_gvec_smax8,
2398           .opt_opc = vecop_list,
2399           .vece = MO_8 },
2400         { .fniv = tcg_gen_smax_vec,
2401           .fno = gen_helper_gvec_smax16,
2402           .opt_opc = vecop_list,
2403           .vece = MO_16 },
2404         { .fni4 = tcg_gen_smax_i32,
2405           .fniv = tcg_gen_smax_vec,
2406           .fno = gen_helper_gvec_smax32,
2407           .opt_opc = vecop_list,
2408           .vece = MO_32 },
2409         { .fni8 = tcg_gen_smax_i64,
2410           .fniv = tcg_gen_smax_vec,
2411           .fno = gen_helper_gvec_smax64,
2412           .opt_opc = vecop_list,
2413           .vece = MO_64 }
2414     };
2415     tcg_debug_assert(vece <= MO_64);
2416     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2417 }
2418
2419 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2420                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2421 {
2422     static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2423     static const GVecGen3 g[4] = {
2424         { .fniv = tcg_gen_umax_vec,
2425           .fno = gen_helper_gvec_umax8,
2426           .opt_opc = vecop_list,
2427           .vece = MO_8 },
2428         { .fniv = tcg_gen_umax_vec,
2429           .fno = gen_helper_gvec_umax16,
2430           .opt_opc = vecop_list,
2431           .vece = MO_16 },
2432         { .fni4 = tcg_gen_umax_i32,
2433           .fniv = tcg_gen_umax_vec,
2434           .fno = gen_helper_gvec_umax32,
2435           .opt_opc = vecop_list,
2436           .vece = MO_32 },
2437         { .fni8 = tcg_gen_umax_i64,
2438           .fniv = tcg_gen_umax_vec,
2439           .fno = gen_helper_gvec_umax64,
2440           .opt_opc = vecop_list,
2441           .vece = MO_64 }
2442     };
2443     tcg_debug_assert(vece <= MO_64);
2444     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2445 }
2446
/* Perform a vector negation using normal negation and a mask.
   Compare gen_subv_mask above.

   The value produced is
       d = (m - (b & ~m)) ^ (m & ~b)
   which is the gen_subv_mask formula with a zero minuend:
   (0 | m) is M and ~(0 ^ b) & m is m & ~b.  B is fully read before D
   is written.  */
static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t2 = tcg_temp_ebb_new_i64();
    TCGv_i64 t3 = tcg_temp_ebb_new_i64();

    /* t3: correction for the masked bit of each lane.  */
    tcg_gen_andc_i64(t3, m, b);
    /* t2: subtrahend with the masked bits cleared.  */
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_sub_i64(d, m, t2);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}
2462
2463 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2464 {
2465     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2466     gen_negv_mask(d, b, m);
2467 }
2468
2469 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2470 {
2471     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2472     gen_negv_mask(d, b, m);
2473 }
2474
/*
 * Lane-wise negation of the two 32-bit elements packed in a 64-bit
 * value.
 *
 * The low lane is taken from the plain negation -b.  The high lane is
 * taken from -(b & ~0xffffffff): with the low half cleared, negation
 * produces no borrow out of the low lane.  The two results are merged
 * with a deposit.  B is fully read before D is written.
 */
void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_ebb_new_i64();
    TCGv_i64 t2 = tcg_temp_ebb_new_i64();

    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
    /* t2: correct in bits [31:0].  */
    tcg_gen_neg_i64(t2, b);
    /* t1: correct in bits [63:32].  */
    tcg_gen_neg_i64(t1, t1);
    /* d = t1 with its low 32 bits replaced by those of t2.  */
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}
2488
2489 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2490                       uint32_t oprsz, uint32_t maxsz)
2491 {
2492     static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2493     static const GVecGen2 g[4] = {
2494         { .fni8 = tcg_gen_vec_neg8_i64,
2495           .fniv = tcg_gen_neg_vec,
2496           .fno = gen_helper_gvec_neg8,
2497           .opt_opc = vecop_list,
2498           .vece = MO_8 },
2499         { .fni8 = tcg_gen_vec_neg16_i64,
2500           .fniv = tcg_gen_neg_vec,
2501           .fno = gen_helper_gvec_neg16,
2502           .opt_opc = vecop_list,
2503           .vece = MO_16 },
2504         { .fni4 = tcg_gen_neg_i32,
2505           .fniv = tcg_gen_neg_vec,
2506           .fno = gen_helper_gvec_neg32,
2507           .opt_opc = vecop_list,
2508           .vece = MO_32 },
2509         { .fni8 = tcg_gen_neg_i64,
2510           .fniv = tcg_gen_neg_vec,
2511           .fno = gen_helper_gvec_neg64,
2512           .opt_opc = vecop_list,
2513           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2514           .vece = MO_64 },
2515     };
2516
2517     tcg_debug_assert(vece <= MO_64);
2518     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2519 }
2520
/*
 * Lane-wise absolute value of the elements of size VECE packed in the
 * 64-bit value B, written to D.
 *
 * A per-lane all-ones mask is built for the negative lanes; those lanes
 * are then inverted and incremented, leaving the other lanes untouched.
 *
 * NOTE: the multiplier (1 << nbit) - 1 is evaluated in int, so this
 * helper is only usable for lane widths below the width of int; the
 * callers visible in this file pass MO_8 and MO_16.
 */
static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
{
    TCGv_i64 t = tcg_temp_ebb_new_i64();
    /* Lane width in bits.  */
    int nbit = 8 << vece;

    /* Create -1 for each negative element.  */
    /* Move each lane's sign bit down to the lane's bit 0 ...  */
    tcg_gen_shri_i64(t, b, nbit - 1);
    /* ... isolate it ...  */
    tcg_gen_andi_i64(t, t, dup_const(vece, 1));
    /* ... and widen the 1 into a full lane of ones.  */
    tcg_gen_muli_i64(t, t, (1 << nbit) - 1);

    /*
     * Invert (via xor -1) and add one.
     * Because of the ordering the msb is cleared,
     * so we never have carry into the next element.
     */
    tcg_gen_xor_i64(d, b, t);
    /* Reduce the mask back to a 1 in bit 0 of each negative lane.  */
    tcg_gen_andi_i64(t, t, dup_const(vece, 1));
    tcg_gen_add_i64(d, d, t);

    tcg_temp_free_i64(t);
}
2542
/* Lane-wise absolute value of 8-bit elements packed in a 64-bit value.  */
static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
{
    gen_absv_mask(d, b, MO_8);
}
2547
/* Lane-wise absolute value of 16-bit elements packed in a 64-bit value.  */
static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
{
    gen_absv_mask(d, b, MO_16);
}
2552
2553 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2554                       uint32_t oprsz, uint32_t maxsz)
2555 {
2556     static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2557     static const GVecGen2 g[4] = {
2558         { .fni8 = tcg_gen_vec_abs8_i64,
2559           .fniv = tcg_gen_abs_vec,
2560           .fno = gen_helper_gvec_abs8,
2561           .opt_opc = vecop_list,
2562           .vece = MO_8 },
2563         { .fni8 = tcg_gen_vec_abs16_i64,
2564           .fniv = tcg_gen_abs_vec,
2565           .fno = gen_helper_gvec_abs16,
2566           .opt_opc = vecop_list,
2567           .vece = MO_16 },
2568         { .fni4 = tcg_gen_abs_i32,
2569           .fniv = tcg_gen_abs_vec,
2570           .fno = gen_helper_gvec_abs32,
2571           .opt_opc = vecop_list,
2572           .vece = MO_32 },
2573         { .fni8 = tcg_gen_abs_i64,
2574           .fniv = tcg_gen_abs_vec,
2575           .fno = gen_helper_gvec_abs64,
2576           .opt_opc = vecop_list,
2577           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2578           .vece = MO_64 },
2579     };
2580
2581     tcg_debug_assert(vece <= MO_64);
2582     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2583 }
2584
2585 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2586                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2587 {
2588     static const GVecGen3 g = {
2589         .fni8 = tcg_gen_and_i64,
2590         .fniv = tcg_gen_and_vec,
2591         .fno = gen_helper_gvec_and,
2592         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2593     };
2594
2595     if (aofs == bofs) {
2596         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2597     } else {
2598         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2599     }
2600 }
2601
2602 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2603                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2604 {
2605     static const GVecGen3 g = {
2606         .fni8 = tcg_gen_or_i64,
2607         .fniv = tcg_gen_or_vec,
2608         .fno = gen_helper_gvec_or,
2609         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2610     };
2611
2612     if (aofs == bofs) {
2613         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2614     } else {
2615         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2616     }
2617 }
2618
2619 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2620                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2621 {
2622     static const GVecGen3 g = {
2623         .fni8 = tcg_gen_xor_i64,
2624         .fniv = tcg_gen_xor_vec,
2625         .fno = gen_helper_gvec_xor,
2626         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2627     };
2628
2629     if (aofs == bofs) {
2630         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2631     } else {
2632         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2633     }
2634 }
2635
2636 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2637                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2638 {
2639     static const GVecGen3 g = {
2640         .fni8 = tcg_gen_andc_i64,
2641         .fniv = tcg_gen_andc_vec,
2642         .fno = gen_helper_gvec_andc,
2643         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2644     };
2645
2646     if (aofs == bofs) {
2647         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2648     } else {
2649         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2650     }
2651 }
2652
2653 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2654                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2655 {
2656     static const GVecGen3 g = {
2657         .fni8 = tcg_gen_orc_i64,
2658         .fniv = tcg_gen_orc_vec,
2659         .fno = gen_helper_gvec_orc,
2660         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2661     };
2662
2663     if (aofs == bofs) {
2664         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2665     } else {
2666         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2667     }
2668 }
2669
2670 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2671                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2672 {
2673     static const GVecGen3 g = {
2674         .fni8 = tcg_gen_nand_i64,
2675         .fniv = tcg_gen_nand_vec,
2676         .fno = gen_helper_gvec_nand,
2677         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2678     };
2679
2680     if (aofs == bofs) {
2681         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2682     } else {
2683         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2684     }
2685 }
2686
2687 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2688                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2689 {
2690     static const GVecGen3 g = {
2691         .fni8 = tcg_gen_nor_i64,
2692         .fniv = tcg_gen_nor_vec,
2693         .fno = gen_helper_gvec_nor,
2694         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2695     };
2696
2697     if (aofs == bofs) {
2698         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2699     } else {
2700         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2701     }
2702 }
2703
2704 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2705                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2706 {
2707     static const GVecGen3 g = {
2708         .fni8 = tcg_gen_eqv_i64,
2709         .fniv = tcg_gen_eqv_vec,
2710         .fno = gen_helper_gvec_eqv,
2711         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2712     };
2713
2714     if (aofs == bofs) {
2715         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2716     } else {
2717         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2718     }
2719 }
2720
2721 static const GVecGen2s gop_ands = {
2722     .fni8 = tcg_gen_and_i64,
2723     .fniv = tcg_gen_and_vec,
2724     .fno = gen_helper_gvec_ands,
2725     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2726     .vece = MO_64
2727 };
2728
2729 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2730                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2731 {
2732     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2733     tcg_gen_dup_i64(vece, tmp, c);
2734     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2735     tcg_temp_free_i64(tmp);
2736 }
2737
2738 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2739                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2740 {
2741     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2742     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2743 }
2744
2745 void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs,
2746                         TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2747 {
2748     static GVecGen2s g = {
2749         .fni8 = tcg_gen_andc_i64,
2750         .fniv = tcg_gen_andc_vec,
2751         .fno = gen_helper_gvec_andcs,
2752         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2753         .vece = MO_64
2754     };
2755
2756     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2757     tcg_gen_dup_i64(vece, tmp, c);
2758     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &g);
2759     tcg_temp_free_i64(tmp);
2760 }
2761
2762 static const GVecGen2s gop_xors = {
2763     .fni8 = tcg_gen_xor_i64,
2764     .fniv = tcg_gen_xor_vec,
2765     .fno = gen_helper_gvec_xors,
2766     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2767     .vece = MO_64
2768 };
2769
2770 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2771                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2772 {
2773     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2774     tcg_gen_dup_i64(vece, tmp, c);
2775     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2776     tcg_temp_free_i64(tmp);
2777 }
2778
2779 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2780                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2781 {
2782     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2783     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2784 }
2785
2786 static const GVecGen2s gop_ors = {
2787     .fni8 = tcg_gen_or_i64,
2788     .fniv = tcg_gen_or_vec,
2789     .fno = gen_helper_gvec_ors,
2790     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2791     .vece = MO_64
2792 };
2793
2794 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2795                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2796 {
2797     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2798     tcg_gen_dup_i64(vece, tmp, c);
2799     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2800     tcg_temp_free_i64(tmp);
2801 }
2802
2803 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2804                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2805 {
2806     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2807     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2808 }
2809
2810 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2811 {
2812     uint64_t mask = dup_const(MO_8, 0xff << c);
2813     tcg_gen_shli_i64(d, a, c);
2814     tcg_gen_andi_i64(d, d, mask);
2815 }
2816
2817 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2818 {
2819     uint64_t mask = dup_const(MO_16, 0xffff << c);
2820     tcg_gen_shli_i64(d, a, c);
2821     tcg_gen_andi_i64(d, d, mask);
2822 }
2823
2824 void tcg_gen_vec_shl8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2825 {
2826     uint32_t mask = dup_const(MO_8, 0xff << c);
2827     tcg_gen_shli_i32(d, a, c);
2828     tcg_gen_andi_i32(d, d, mask);
2829 }
2830
2831 void tcg_gen_vec_shl16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2832 {
2833     uint32_t mask = dup_const(MO_16, 0xffff << c);
2834     tcg_gen_shli_i32(d, a, c);
2835     tcg_gen_andi_i32(d, d, mask);
2836 }
2837
2838 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2839                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2840 {
2841     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2842     static const GVecGen2i g[4] = {
2843         { .fni8 = tcg_gen_vec_shl8i_i64,
2844           .fniv = tcg_gen_shli_vec,
2845           .fno = gen_helper_gvec_shl8i,
2846           .opt_opc = vecop_list,
2847           .vece = MO_8 },
2848         { .fni8 = tcg_gen_vec_shl16i_i64,
2849           .fniv = tcg_gen_shli_vec,
2850           .fno = gen_helper_gvec_shl16i,
2851           .opt_opc = vecop_list,
2852           .vece = MO_16 },
2853         { .fni4 = tcg_gen_shli_i32,
2854           .fniv = tcg_gen_shli_vec,
2855           .fno = gen_helper_gvec_shl32i,
2856           .opt_opc = vecop_list,
2857           .vece = MO_32 },
2858         { .fni8 = tcg_gen_shli_i64,
2859           .fniv = tcg_gen_shli_vec,
2860           .fno = gen_helper_gvec_shl64i,
2861           .opt_opc = vecop_list,
2862           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2863           .vece = MO_64 },
2864     };
2865
2866     tcg_debug_assert(vece <= MO_64);
2867     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2868     if (shift == 0) {
2869         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2870     } else {
2871         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2872     }
2873 }
2874
2875 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2876 {
2877     uint64_t mask = dup_const(MO_8, 0xff >> c);
2878     tcg_gen_shri_i64(d, a, c);
2879     tcg_gen_andi_i64(d, d, mask);
2880 }
2881
2882 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2883 {
2884     uint64_t mask = dup_const(MO_16, 0xffff >> c);
2885     tcg_gen_shri_i64(d, a, c);
2886     tcg_gen_andi_i64(d, d, mask);
2887 }
2888
2889 void tcg_gen_vec_shr8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2890 {
2891     uint32_t mask = dup_const(MO_8, 0xff >> c);
2892     tcg_gen_shri_i32(d, a, c);
2893     tcg_gen_andi_i32(d, d, mask);
2894 }
2895
2896 void tcg_gen_vec_shr16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2897 {
2898     uint32_t mask = dup_const(MO_16, 0xffff >> c);
2899     tcg_gen_shri_i32(d, a, c);
2900     tcg_gen_andi_i32(d, d, mask);
2901 }
2902
2903 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2904                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2905 {
2906     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
2907     static const GVecGen2i g[4] = {
2908         { .fni8 = tcg_gen_vec_shr8i_i64,
2909           .fniv = tcg_gen_shri_vec,
2910           .fno = gen_helper_gvec_shr8i,
2911           .opt_opc = vecop_list,
2912           .vece = MO_8 },
2913         { .fni8 = tcg_gen_vec_shr16i_i64,
2914           .fniv = tcg_gen_shri_vec,
2915           .fno = gen_helper_gvec_shr16i,
2916           .opt_opc = vecop_list,
2917           .vece = MO_16 },
2918         { .fni4 = tcg_gen_shri_i32,
2919           .fniv = tcg_gen_shri_vec,
2920           .fno = gen_helper_gvec_shr32i,
2921           .opt_opc = vecop_list,
2922           .vece = MO_32 },
2923         { .fni8 = tcg_gen_shri_i64,
2924           .fniv = tcg_gen_shri_vec,
2925           .fno = gen_helper_gvec_shr64i,
2926           .opt_opc = vecop_list,
2927           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2928           .vece = MO_64 },
2929     };
2930
2931     tcg_debug_assert(vece <= MO_64);
2932     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2933     if (shift == 0) {
2934         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2935     } else {
2936         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2937     }
2938 }
2939
2940 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2941 {
2942     uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2943     uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2944     TCGv_i64 s = tcg_temp_ebb_new_i64();
2945
2946     tcg_gen_shri_i64(d, a, c);
2947     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2948     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2949     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2950     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2951     tcg_temp_free_i64(s);
2952 }
2953
2954 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2955 {
2956     uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2957     uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2958     TCGv_i64 s = tcg_temp_ebb_new_i64();
2959
2960     tcg_gen_shri_i64(d, a, c);
2961     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2962     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2963     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2964     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2965     tcg_temp_free_i64(s);
2966 }
2967
2968 void tcg_gen_vec_sar8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2969 {
2970     uint32_t s_mask = dup_const(MO_8, 0x80 >> c);
2971     uint32_t c_mask = dup_const(MO_8, 0xff >> c);
2972     TCGv_i32 s = tcg_temp_ebb_new_i32();
2973
2974     tcg_gen_shri_i32(d, a, c);
2975     tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
2976     tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
2977     tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
2978     tcg_gen_or_i32(d, d, s);         /* include sign extension */
2979     tcg_temp_free_i32(s);
2980 }
2981
2982 void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2983 {
2984     uint32_t s_mask = dup_const(MO_16, 0x8000 >> c);
2985     uint32_t c_mask = dup_const(MO_16, 0xffff >> c);
2986     TCGv_i32 s = tcg_temp_ebb_new_i32();
2987
2988     tcg_gen_shri_i32(d, a, c);
2989     tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
2990     tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
2991     tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
2992     tcg_gen_or_i32(d, d, s);         /* include sign extension */
2993     tcg_temp_free_i32(s);
2994 }
2995
2996 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2997                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2998 {
2999     static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
3000     static const GVecGen2i g[4] = {
3001         { .fni8 = tcg_gen_vec_sar8i_i64,
3002           .fniv = tcg_gen_sari_vec,
3003           .fno = gen_helper_gvec_sar8i,
3004           .opt_opc = vecop_list,
3005           .vece = MO_8 },
3006         { .fni8 = tcg_gen_vec_sar16i_i64,
3007           .fniv = tcg_gen_sari_vec,
3008           .fno = gen_helper_gvec_sar16i,
3009           .opt_opc = vecop_list,
3010           .vece = MO_16 },
3011         { .fni4 = tcg_gen_sari_i32,
3012           .fniv = tcg_gen_sari_vec,
3013           .fno = gen_helper_gvec_sar32i,
3014           .opt_opc = vecop_list,
3015           .vece = MO_32 },
3016         { .fni8 = tcg_gen_sari_i64,
3017           .fniv = tcg_gen_sari_vec,
3018           .fno = gen_helper_gvec_sar64i,
3019           .opt_opc = vecop_list,
3020           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3021           .vece = MO_64 },
3022     };
3023
3024     tcg_debug_assert(vece <= MO_64);
3025     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3026     if (shift == 0) {
3027         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
3028     } else {
3029         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
3030     }
3031 }
3032
3033 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
3034 {
3035     uint64_t mask = dup_const(MO_8, 0xff << c);
3036
3037     tcg_gen_shli_i64(d, a, c);
3038     tcg_gen_shri_i64(a, a, 8 - c);
3039     tcg_gen_andi_i64(d, d, mask);
3040     tcg_gen_andi_i64(a, a, ~mask);
3041     tcg_gen_or_i64(d, d, a);
3042 }
3043
3044 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
3045 {
3046     uint64_t mask = dup_const(MO_16, 0xffff << c);
3047
3048     tcg_gen_shli_i64(d, a, c);
3049     tcg_gen_shri_i64(a, a, 16 - c);
3050     tcg_gen_andi_i64(d, d, mask);
3051     tcg_gen_andi_i64(a, a, ~mask);
3052     tcg_gen_or_i64(d, d, a);
3053 }
3054
3055 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
3056                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
3057 {
3058     static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 };
3059     static const GVecGen2i g[4] = {
3060         { .fni8 = tcg_gen_vec_rotl8i_i64,
3061           .fniv = tcg_gen_rotli_vec,
3062           .fno = gen_helper_gvec_rotl8i,
3063           .opt_opc = vecop_list,
3064           .vece = MO_8 },
3065         { .fni8 = tcg_gen_vec_rotl16i_i64,
3066           .fniv = tcg_gen_rotli_vec,
3067           .fno = gen_helper_gvec_rotl16i,
3068           .opt_opc = vecop_list,
3069           .vece = MO_16 },
3070         { .fni4 = tcg_gen_rotli_i32,
3071           .fniv = tcg_gen_rotli_vec,
3072           .fno = gen_helper_gvec_rotl32i,
3073           .opt_opc = vecop_list,
3074           .vece = MO_32 },
3075         { .fni8 = tcg_gen_rotli_i64,
3076           .fniv = tcg_gen_rotli_vec,
3077           .fno = gen_helper_gvec_rotl64i,
3078           .opt_opc = vecop_list,
3079           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3080           .vece = MO_64 },
3081     };
3082
3083     tcg_debug_assert(vece <= MO_64);
3084     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3085     if (shift == 0) {
3086         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
3087     } else {
3088         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
3089     }
3090 }
3091
3092 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
3093                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
3094 {
3095     tcg_debug_assert(vece <= MO_64);
3096     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3097     tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1),
3098                        oprsz, maxsz);
3099 }
3100
/*
 * Specialized generation of vector shifts by a non-constant scalar.
 */
3104
3105 typedef struct {
3106     void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
3107     void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
3108     void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
3109     void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
3110     gen_helper_gvec_2 *fno[4];
3111     TCGOpcode s_list[2];
3112     TCGOpcode v_list[2];
3113 } GVecGen2sh;
3114
3115 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3116                            uint32_t oprsz, uint32_t tysz, TCGType type,
3117                            TCGv_i32 shift,
3118                            void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
3119 {
3120     for (uint32_t i = 0; i < oprsz; i += tysz) {
3121         TCGv_vec t0 = tcg_temp_new_vec(type);
3122         TCGv_vec t1 = tcg_temp_new_vec(type);
3123
3124         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
3125         fni(vece, t1, t0, shift);
3126         tcg_gen_st_vec(t1, tcg_env, dofs + i);
3127     }
3128 }
3129
3130 static void
3131 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
3132                uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
3133 {
3134     TCGType type;
3135     uint32_t some;
3136
3137     check_size_align(oprsz, maxsz, dofs | aofs);
3138     check_overlap_2(dofs, aofs, maxsz);
3139
3140     /* If the backend has a scalar expansion, great.  */
3141     type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
3142     if (type) {
3143         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
3144         switch (type) {
3145         case TCG_TYPE_V256:
3146             some = QEMU_ALIGN_DOWN(oprsz, 32);
3147             expand_2sh_vec(vece, dofs, aofs, some, 32,
3148                            TCG_TYPE_V256, shift, g->fniv_s);
3149             if (some == oprsz) {
3150                 break;
3151             }
3152             dofs += some;
3153             aofs += some;
3154             oprsz -= some;
3155             maxsz -= some;
3156             /* fallthru */
3157         case TCG_TYPE_V128:
3158             expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
3159                            TCG_TYPE_V128, shift, g->fniv_s);
3160             break;
3161         case TCG_TYPE_V64:
3162             expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
3163                            TCG_TYPE_V64, shift, g->fniv_s);
3164             break;
3165         default:
3166             g_assert_not_reached();
3167         }
3168         tcg_swap_vecop_list(hold_list);
3169         goto clear_tail;
3170     }
3171
3172     /* If the backend supports variable vector shifts, also cool.  */
3173     type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
3174     if (type) {
3175         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
3176         TCGv_vec v_shift = tcg_temp_new_vec(type);
3177
3178         if (vece == MO_64) {
3179             TCGv_i64 sh64 = tcg_temp_ebb_new_i64();
3180             tcg_gen_extu_i32_i64(sh64, shift);
3181             tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
3182             tcg_temp_free_i64(sh64);
3183         } else {
3184             tcg_gen_dup_i32_vec(vece, v_shift, shift);
3185         }
3186
3187         switch (type) {
3188         case TCG_TYPE_V256:
3189             some = QEMU_ALIGN_DOWN(oprsz, 32);
3190             expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
3191                           v_shift, false, g->fniv_v);
3192             if (some == oprsz) {
3193                 break;
3194             }
3195             dofs += some;
3196             aofs += some;
3197             oprsz -= some;
3198             maxsz -= some;
3199             /* fallthru */
3200         case TCG_TYPE_V128:
3201             expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
3202                           v_shift, false, g->fniv_v);
3203             break;
3204         case TCG_TYPE_V64:
3205             expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
3206                           v_shift, false, g->fniv_v);
3207             break;
3208         default:
3209             g_assert_not_reached();
3210         }
3211         tcg_temp_free_vec(v_shift);
3212         tcg_swap_vecop_list(hold_list);
3213         goto clear_tail;
3214     }
3215
3216     /* Otherwise fall back to integral... */
3217     if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3218         expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
3219     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3220         TCGv_i64 sh64 = tcg_temp_ebb_new_i64();
3221         tcg_gen_extu_i32_i64(sh64, shift);
3222         expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
3223         tcg_temp_free_i64(sh64);
3224     } else {
3225         TCGv_ptr a0 = tcg_temp_ebb_new_ptr();
3226         TCGv_ptr a1 = tcg_temp_ebb_new_ptr();
3227         TCGv_i32 desc = tcg_temp_ebb_new_i32();
3228
3229         tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
3230         tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
3231         tcg_gen_addi_ptr(a0, tcg_env, dofs);
3232         tcg_gen_addi_ptr(a1, tcg_env, aofs);
3233
3234         g->fno[vece](a0, a1, desc);
3235
3236         tcg_temp_free_ptr(a0);
3237         tcg_temp_free_ptr(a1);
3238         tcg_temp_free_i32(desc);
3239         return;
3240     }
3241
3242  clear_tail:
3243     if (oprsz < maxsz) {
3244         expand_clr(dofs + oprsz, maxsz - oprsz);
3245     }
3246 }
3247
3248 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
3249                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3250 {
3251     static const GVecGen2sh g = {
3252         .fni4 = tcg_gen_shl_i32,
3253         .fni8 = tcg_gen_shl_i64,
3254         .fniv_s = tcg_gen_shls_vec,
3255         .fniv_v = tcg_gen_shlv_vec,
3256         .fno = {
3257             gen_helper_gvec_shl8i,
3258             gen_helper_gvec_shl16i,
3259             gen_helper_gvec_shl32i,
3260             gen_helper_gvec_shl64i,
3261         },
3262         .s_list = { INDEX_op_shls_vec, 0 },
3263         .v_list = { INDEX_op_shlv_vec, 0 },
3264     };
3265
3266     tcg_debug_assert(vece <= MO_64);
3267     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3268 }
3269
3270 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3271                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3272 {
3273     static const GVecGen2sh g = {
3274         .fni4 = tcg_gen_shr_i32,
3275         .fni8 = tcg_gen_shr_i64,
3276         .fniv_s = tcg_gen_shrs_vec,
3277         .fniv_v = tcg_gen_shrv_vec,
3278         .fno = {
3279             gen_helper_gvec_shr8i,
3280             gen_helper_gvec_shr16i,
3281             gen_helper_gvec_shr32i,
3282             gen_helper_gvec_shr64i,
3283         },
3284         .s_list = { INDEX_op_shrs_vec, 0 },
3285         .v_list = { INDEX_op_shrv_vec, 0 },
3286     };
3287
3288     tcg_debug_assert(vece <= MO_64);
3289     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3290 }
3291
3292 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
3293                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3294 {
3295     static const GVecGen2sh g = {
3296         .fni4 = tcg_gen_sar_i32,
3297         .fni8 = tcg_gen_sar_i64,
3298         .fniv_s = tcg_gen_sars_vec,
3299         .fniv_v = tcg_gen_sarv_vec,
3300         .fno = {
3301             gen_helper_gvec_sar8i,
3302             gen_helper_gvec_sar16i,
3303             gen_helper_gvec_sar32i,
3304             gen_helper_gvec_sar64i,
3305         },
3306         .s_list = { INDEX_op_sars_vec, 0 },
3307         .v_list = { INDEX_op_sarv_vec, 0 },
3308     };
3309
3310     tcg_debug_assert(vece <= MO_64);
3311     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3312 }
3313
3314 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
3315                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3316 {
3317     static const GVecGen2sh g = {
3318         .fni4 = tcg_gen_rotl_i32,
3319         .fni8 = tcg_gen_rotl_i64,
3320         .fniv_s = tcg_gen_rotls_vec,
3321         .fniv_v = tcg_gen_rotlv_vec,
3322         .fno = {
3323             gen_helper_gvec_rotl8i,
3324             gen_helper_gvec_rotl16i,
3325             gen_helper_gvec_rotl32i,
3326             gen_helper_gvec_rotl64i,
3327         },
3328         .s_list = { INDEX_op_rotls_vec, 0 },
3329         .v_list = { INDEX_op_rotlv_vec, 0 },
3330     };
3331
3332     tcg_debug_assert(vece <= MO_64);
3333     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3334 }
3335
3336 void tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3337                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3338 {
3339     TCGv_i32 tmp = tcg_temp_ebb_new_i32();
3340
3341     tcg_gen_neg_i32(tmp, shift);
3342     tcg_gen_andi_i32(tmp, tmp, (8 << vece) - 1);
3343     tcg_gen_gvec_rotls(vece, dofs, aofs, tmp, oprsz, maxsz);
3344     tcg_temp_free_i32(tmp);
3345 }
3346
/*
 * Expand D = A << (B % element bits)
 *
 * Unlike scalar shifts, where it is easy for the target front end
 * to include the modulo as part of the expansion, vector shifts have
 * the modulo applied here.  If the target naturally includes the
 * modulo as part of the operation, great!  If the target has some
 * other behaviour from out-of-range shifts, then it could not use
 * this function anyway, and would need to do its own expansion with
 * custom functions.
 */
3357 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
3358                                  TCGv_vec a, TCGv_vec b)
3359 {
3360     TCGv_vec t = tcg_temp_new_vec_matching(d);
3361     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3362
3363     tcg_gen_and_vec(vece, t, b, m);
3364     tcg_gen_shlv_vec(vece, d, a, t);
3365     tcg_temp_free_vec(t);
3366 }
3367
3368 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3369 {
3370     TCGv_i32 t = tcg_temp_ebb_new_i32();
3371
3372     tcg_gen_andi_i32(t, b, 31);
3373     tcg_gen_shl_i32(d, a, t);
3374     tcg_temp_free_i32(t);
3375 }
3376
3377 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3378 {
3379     TCGv_i64 t = tcg_temp_ebb_new_i64();
3380
3381     tcg_gen_andi_i64(t, b, 63);
3382     tcg_gen_shl_i64(d, a, t);
3383     tcg_temp_free_i64(t);
3384 }
3385
3386 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3387                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3388 {
3389     static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
3390     static const GVecGen3 g[4] = {
3391         { .fniv = tcg_gen_shlv_mod_vec,
3392           .fno = gen_helper_gvec_shl8v,
3393           .opt_opc = vecop_list,
3394           .vece = MO_8 },
3395         { .fniv = tcg_gen_shlv_mod_vec,
3396           .fno = gen_helper_gvec_shl16v,
3397           .opt_opc = vecop_list,
3398           .vece = MO_16 },
3399         { .fni4 = tcg_gen_shl_mod_i32,
3400           .fniv = tcg_gen_shlv_mod_vec,
3401           .fno = gen_helper_gvec_shl32v,
3402           .opt_opc = vecop_list,
3403           .vece = MO_32 },
3404         { .fni8 = tcg_gen_shl_mod_i64,
3405           .fniv = tcg_gen_shlv_mod_vec,
3406           .fno = gen_helper_gvec_shl64v,
3407           .opt_opc = vecop_list,
3408           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3409           .vece = MO_64 },
3410     };
3411
3412     tcg_debug_assert(vece <= MO_64);
3413     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3414 }
3415
3416 /*
3417  * Similarly for logical right shifts.
3418  */
3419
3420 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
3421                                  TCGv_vec a, TCGv_vec b)
3422 {
3423     TCGv_vec t = tcg_temp_new_vec_matching(d);
3424     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3425
3426     tcg_gen_and_vec(vece, t, b, m);
3427     tcg_gen_shrv_vec(vece, d, a, t);
3428     tcg_temp_free_vec(t);
3429 }
3430
3431 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3432 {
3433     TCGv_i32 t = tcg_temp_ebb_new_i32();
3434
3435     tcg_gen_andi_i32(t, b, 31);
3436     tcg_gen_shr_i32(d, a, t);
3437     tcg_temp_free_i32(t);
3438 }
3439
3440 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3441 {
3442     TCGv_i64 t = tcg_temp_ebb_new_i64();
3443
3444     tcg_gen_andi_i64(t, b, 63);
3445     tcg_gen_shr_i64(d, a, t);
3446     tcg_temp_free_i64(t);
3447 }
3448
3449 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3450                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3451 {
3452     static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
3453     static const GVecGen3 g[4] = {
3454         { .fniv = tcg_gen_shrv_mod_vec,
3455           .fno = gen_helper_gvec_shr8v,
3456           .opt_opc = vecop_list,
3457           .vece = MO_8 },
3458         { .fniv = tcg_gen_shrv_mod_vec,
3459           .fno = gen_helper_gvec_shr16v,
3460           .opt_opc = vecop_list,
3461           .vece = MO_16 },
3462         { .fni4 = tcg_gen_shr_mod_i32,
3463           .fniv = tcg_gen_shrv_mod_vec,
3464           .fno = gen_helper_gvec_shr32v,
3465           .opt_opc = vecop_list,
3466           .vece = MO_32 },
3467         { .fni8 = tcg_gen_shr_mod_i64,
3468           .fniv = tcg_gen_shrv_mod_vec,
3469           .fno = gen_helper_gvec_shr64v,
3470           .opt_opc = vecop_list,
3471           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3472           .vece = MO_64 },
3473     };
3474
3475     tcg_debug_assert(vece <= MO_64);
3476     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3477 }
3478
3479 /*
3480  * Similarly for arithmetic right shifts.
3481  */
3482
3483 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
3484                                  TCGv_vec a, TCGv_vec b)
3485 {
3486     TCGv_vec t = tcg_temp_new_vec_matching(d);
3487     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3488
3489     tcg_gen_and_vec(vece, t, b, m);
3490     tcg_gen_sarv_vec(vece, d, a, t);
3491     tcg_temp_free_vec(t);
3492 }
3493
3494 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3495 {
3496     TCGv_i32 t = tcg_temp_ebb_new_i32();
3497
3498     tcg_gen_andi_i32(t, b, 31);
3499     tcg_gen_sar_i32(d, a, t);
3500     tcg_temp_free_i32(t);
3501 }
3502
3503 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3504 {
3505     TCGv_i64 t = tcg_temp_ebb_new_i64();
3506
3507     tcg_gen_andi_i64(t, b, 63);
3508     tcg_gen_sar_i64(d, a, t);
3509     tcg_temp_free_i64(t);
3510 }
3511
3512 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3513                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3514 {
3515     static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3516     static const GVecGen3 g[4] = {
3517         { .fniv = tcg_gen_sarv_mod_vec,
3518           .fno = gen_helper_gvec_sar8v,
3519           .opt_opc = vecop_list,
3520           .vece = MO_8 },
3521         { .fniv = tcg_gen_sarv_mod_vec,
3522           .fno = gen_helper_gvec_sar16v,
3523           .opt_opc = vecop_list,
3524           .vece = MO_16 },
3525         { .fni4 = tcg_gen_sar_mod_i32,
3526           .fniv = tcg_gen_sarv_mod_vec,
3527           .fno = gen_helper_gvec_sar32v,
3528           .opt_opc = vecop_list,
3529           .vece = MO_32 },
3530         { .fni8 = tcg_gen_sar_mod_i64,
3531           .fniv = tcg_gen_sarv_mod_vec,
3532           .fno = gen_helper_gvec_sar64v,
3533           .opt_opc = vecop_list,
3534           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3535           .vece = MO_64 },
3536     };
3537
3538     tcg_debug_assert(vece <= MO_64);
3539     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3540 }
3541
3542 /*
3543  * Similarly for rotates.
3544  */
3545
3546 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d,
3547                                   TCGv_vec a, TCGv_vec b)
3548 {
3549     TCGv_vec t = tcg_temp_new_vec_matching(d);
3550     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3551
3552     tcg_gen_and_vec(vece, t, b, m);
3553     tcg_gen_rotlv_vec(vece, d, a, t);
3554     tcg_temp_free_vec(t);
3555 }
3556
3557 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3558 {
3559     TCGv_i32 t = tcg_temp_ebb_new_i32();
3560
3561     tcg_gen_andi_i32(t, b, 31);
3562     tcg_gen_rotl_i32(d, a, t);
3563     tcg_temp_free_i32(t);
3564 }
3565
3566 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3567 {
3568     TCGv_i64 t = tcg_temp_ebb_new_i64();
3569
3570     tcg_gen_andi_i64(t, b, 63);
3571     tcg_gen_rotl_i64(d, a, t);
3572     tcg_temp_free_i64(t);
3573 }
3574
3575 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3576                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3577 {
3578     static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 };
3579     static const GVecGen3 g[4] = {
3580         { .fniv = tcg_gen_rotlv_mod_vec,
3581           .fno = gen_helper_gvec_rotl8v,
3582           .opt_opc = vecop_list,
3583           .vece = MO_8 },
3584         { .fniv = tcg_gen_rotlv_mod_vec,
3585           .fno = gen_helper_gvec_rotl16v,
3586           .opt_opc = vecop_list,
3587           .vece = MO_16 },
3588         { .fni4 = tcg_gen_rotl_mod_i32,
3589           .fniv = tcg_gen_rotlv_mod_vec,
3590           .fno = gen_helper_gvec_rotl32v,
3591           .opt_opc = vecop_list,
3592           .vece = MO_32 },
3593         { .fni8 = tcg_gen_rotl_mod_i64,
3594           .fniv = tcg_gen_rotlv_mod_vec,
3595           .fno = gen_helper_gvec_rotl64v,
3596           .opt_opc = vecop_list,
3597           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3598           .vece = MO_64 },
3599     };
3600
3601     tcg_debug_assert(vece <= MO_64);
3602     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3603 }
3604
3605 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d,
3606                                   TCGv_vec a, TCGv_vec b)
3607 {
3608     TCGv_vec t = tcg_temp_new_vec_matching(d);
3609     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3610
3611     tcg_gen_and_vec(vece, t, b, m);
3612     tcg_gen_rotrv_vec(vece, d, a, t);
3613     tcg_temp_free_vec(t);
3614 }
3615
3616 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3617 {
3618     TCGv_i32 t = tcg_temp_ebb_new_i32();
3619
3620     tcg_gen_andi_i32(t, b, 31);
3621     tcg_gen_rotr_i32(d, a, t);
3622     tcg_temp_free_i32(t);
3623 }
3624
3625 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3626 {
3627     TCGv_i64 t = tcg_temp_ebb_new_i64();
3628
3629     tcg_gen_andi_i64(t, b, 63);
3630     tcg_gen_rotr_i64(d, a, t);
3631     tcg_temp_free_i64(t);
3632 }
3633
3634 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3635                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3636 {
3637     static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 };
3638     static const GVecGen3 g[4] = {
3639         { .fniv = tcg_gen_rotrv_mod_vec,
3640           .fno = gen_helper_gvec_rotr8v,
3641           .opt_opc = vecop_list,
3642           .vece = MO_8 },
3643         { .fniv = tcg_gen_rotrv_mod_vec,
3644           .fno = gen_helper_gvec_rotr16v,
3645           .opt_opc = vecop_list,
3646           .vece = MO_16 },
3647         { .fni4 = tcg_gen_rotr_mod_i32,
3648           .fniv = tcg_gen_rotrv_mod_vec,
3649           .fno = gen_helper_gvec_rotr32v,
3650           .opt_opc = vecop_list,
3651           .vece = MO_32 },
3652         { .fni8 = tcg_gen_rotr_mod_i64,
3653           .fniv = tcg_gen_rotrv_mod_vec,
3654           .fno = gen_helper_gvec_rotr64v,
3655           .opt_opc = vecop_list,
3656           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3657           .vece = MO_64 },
3658     };
3659
3660     tcg_debug_assert(vece <= MO_64);
3661     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3662 }
3663
3664 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
3665 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3666                            uint32_t oprsz, TCGCond cond)
3667 {
3668     TCGv_i32 t0 = tcg_temp_ebb_new_i32();
3669     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
3670     uint32_t i;
3671
3672     for (i = 0; i < oprsz; i += 4) {
3673         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
3674         tcg_gen_ld_i32(t1, tcg_env, bofs + i);
3675         tcg_gen_negsetcond_i32(cond, t0, t0, t1);
3676         tcg_gen_st_i32(t0, tcg_env, dofs + i);
3677     }
3678     tcg_temp_free_i32(t1);
3679     tcg_temp_free_i32(t0);
3680 }
3681
3682 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3683                            uint32_t oprsz, TCGCond cond)
3684 {
3685     TCGv_i64 t0 = tcg_temp_ebb_new_i64();
3686     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
3687     uint32_t i;
3688
3689     for (i = 0; i < oprsz; i += 8) {
3690         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
3691         tcg_gen_ld_i64(t1, tcg_env, bofs + i);
3692         tcg_gen_negsetcond_i64(cond, t0, t0, t1);
3693         tcg_gen_st_i64(t0, tcg_env, dofs + i);
3694     }
3695     tcg_temp_free_i64(t1);
3696     tcg_temp_free_i64(t0);
3697 }
3698
3699 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3700                            uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3701                            TCGType type, TCGCond cond)
3702 {
3703     for (uint32_t i = 0; i < oprsz; i += tysz) {
3704         TCGv_vec t0 = tcg_temp_new_vec(type);
3705         TCGv_vec t1 = tcg_temp_new_vec(type);
3706         TCGv_vec t2 = tcg_temp_new_vec(type);
3707
3708         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
3709         tcg_gen_ld_vec(t1, tcg_env, bofs + i);
3710         tcg_gen_cmp_vec(cond, vece, t2, t0, t1);
3711         tcg_gen_st_vec(t2, tcg_env, dofs + i);
3712     }
3713 }
3714
3715 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
3716                       uint32_t aofs, uint32_t bofs,
3717                       uint32_t oprsz, uint32_t maxsz)
3718 {
3719     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3720     static gen_helper_gvec_3 * const eq_fn[4] = {
3721         gen_helper_gvec_eq8, gen_helper_gvec_eq16,
3722         gen_helper_gvec_eq32, gen_helper_gvec_eq64
3723     };
3724     static gen_helper_gvec_3 * const ne_fn[4] = {
3725         gen_helper_gvec_ne8, gen_helper_gvec_ne16,
3726         gen_helper_gvec_ne32, gen_helper_gvec_ne64
3727     };
3728     static gen_helper_gvec_3 * const lt_fn[4] = {
3729         gen_helper_gvec_lt8, gen_helper_gvec_lt16,
3730         gen_helper_gvec_lt32, gen_helper_gvec_lt64
3731     };
3732     static gen_helper_gvec_3 * const le_fn[4] = {
3733         gen_helper_gvec_le8, gen_helper_gvec_le16,
3734         gen_helper_gvec_le32, gen_helper_gvec_le64
3735     };
3736     static gen_helper_gvec_3 * const ltu_fn[4] = {
3737         gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
3738         gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
3739     };
3740     static gen_helper_gvec_3 * const leu_fn[4] = {
3741         gen_helper_gvec_leu8, gen_helper_gvec_leu16,
3742         gen_helper_gvec_leu32, gen_helper_gvec_leu64
3743     };
3744     static gen_helper_gvec_3 * const * const fns[16] = {
3745         [TCG_COND_EQ] = eq_fn,
3746         [TCG_COND_NE] = ne_fn,
3747         [TCG_COND_LT] = lt_fn,
3748         [TCG_COND_LE] = le_fn,
3749         [TCG_COND_LTU] = ltu_fn,
3750         [TCG_COND_LEU] = leu_fn,
3751     };
3752
3753     const TCGOpcode *hold_list;
3754     TCGType type;
3755     uint32_t some;
3756
3757     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
3758     check_overlap_3(dofs, aofs, bofs, maxsz);
3759
3760     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3761         do_dup(MO_8, dofs, oprsz, maxsz,
3762                NULL, NULL, -(cond == TCG_COND_ALWAYS));
3763         return;
3764     }
3765
3766     /*
3767      * Implement inline with a vector type, if possible.
3768      * Prefer integer when 64-bit host and 64-bit comparison.
3769      */
3770     hold_list = tcg_swap_vecop_list(cmp_list);
3771     type = choose_vector_type(cmp_list, vece, oprsz,
3772                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3773     switch (type) {
3774     case TCG_TYPE_V256:
3775         /* Recall that ARM SVE allows vector sizes that are not a
3776          * power of 2, but always a multiple of 16.  The intent is
3777          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
3778          */
3779         some = QEMU_ALIGN_DOWN(oprsz, 32);
3780         expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
3781         if (some == oprsz) {
3782             break;
3783         }
3784         dofs += some;
3785         aofs += some;
3786         bofs += some;
3787         oprsz -= some;
3788         maxsz -= some;
3789         /* fallthru */
3790     case TCG_TYPE_V128:
3791         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
3792         break;
3793     case TCG_TYPE_V64:
3794         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
3795         break;
3796
3797     case 0:
3798         if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3799             expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
3800         } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3801             expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
3802         } else {
3803             gen_helper_gvec_3 * const *fn = fns[cond];
3804
3805             if (fn == NULL) {
3806                 uint32_t tmp;
3807                 tmp = aofs, aofs = bofs, bofs = tmp;
3808                 cond = tcg_swap_cond(cond);
3809                 fn = fns[cond];
3810                 assert(fn != NULL);
3811             }
3812             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
3813             oprsz = maxsz;
3814         }
3815         break;
3816
3817     default:
3818         g_assert_not_reached();
3819     }
3820     tcg_swap_vecop_list(hold_list);
3821
3822     if (oprsz < maxsz) {
3823         expand_clr(dofs + oprsz, maxsz - oprsz);
3824     }
3825 }
3826
3827 static void expand_cmps_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3828                             uint32_t oprsz, uint32_t tysz, TCGType type,
3829                             TCGCond cond, TCGv_vec c)
3830 {
3831     TCGv_vec t0 = tcg_temp_new_vec(type);
3832     TCGv_vec t1 = tcg_temp_new_vec(type);
3833     uint32_t i;
3834
3835     for (i = 0; i < oprsz; i += tysz) {
3836         tcg_gen_ld_vec(t1, tcg_env, aofs + i);
3837         tcg_gen_cmp_vec(cond, vece, t0, t1, c);
3838         tcg_gen_st_vec(t0, tcg_env, dofs + i);
3839     }
3840 }
3841
3842 void tcg_gen_gvec_cmps(TCGCond cond, unsigned vece, uint32_t dofs,
3843                        uint32_t aofs, TCGv_i64 c,
3844                        uint32_t oprsz, uint32_t maxsz)
3845 {
3846     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3847     static gen_helper_gvec_2i * const eq_fn[4] = {
3848         gen_helper_gvec_eqs8, gen_helper_gvec_eqs16,
3849         gen_helper_gvec_eqs32, gen_helper_gvec_eqs64
3850     };
3851     static gen_helper_gvec_2i * const lt_fn[4] = {
3852         gen_helper_gvec_lts8, gen_helper_gvec_lts16,
3853         gen_helper_gvec_lts32, gen_helper_gvec_lts64
3854     };
3855     static gen_helper_gvec_2i * const le_fn[4] = {
3856         gen_helper_gvec_les8, gen_helper_gvec_les16,
3857         gen_helper_gvec_les32, gen_helper_gvec_les64
3858     };
3859     static gen_helper_gvec_2i * const ltu_fn[4] = {
3860         gen_helper_gvec_ltus8, gen_helper_gvec_ltus16,
3861         gen_helper_gvec_ltus32, gen_helper_gvec_ltus64
3862     };
3863     static gen_helper_gvec_2i * const leu_fn[4] = {
3864         gen_helper_gvec_leus8, gen_helper_gvec_leus16,
3865         gen_helper_gvec_leus32, gen_helper_gvec_leus64
3866     };
3867     static gen_helper_gvec_2i * const * const fns[16] = {
3868         [TCG_COND_EQ] = eq_fn,
3869         [TCG_COND_LT] = lt_fn,
3870         [TCG_COND_LE] = le_fn,
3871         [TCG_COND_LTU] = ltu_fn,
3872         [TCG_COND_LEU] = leu_fn,
3873     };
3874
3875     TCGType type;
3876
3877     check_size_align(oprsz, maxsz, dofs | aofs);
3878     check_overlap_2(dofs, aofs, maxsz);
3879
3880     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3881         do_dup(MO_8, dofs, oprsz, maxsz,
3882                NULL, NULL, -(cond == TCG_COND_ALWAYS));
3883         return;
3884     }
3885
3886     /*
3887      * Implement inline with a vector type, if possible.
3888      * Prefer integer when 64-bit host and 64-bit comparison.
3889      */
3890     type = choose_vector_type(cmp_list, vece, oprsz,
3891                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3892     if (type != 0) {
3893         const TCGOpcode *hold_list = tcg_swap_vecop_list(cmp_list);
3894         TCGv_vec t_vec = tcg_temp_new_vec(type);
3895         uint32_t some;
3896
3897         tcg_gen_dup_i64_vec(vece, t_vec, c);
3898         switch (type) {
3899         case TCG_TYPE_V256:
3900             some = QEMU_ALIGN_DOWN(oprsz, 32);
3901             expand_cmps_vec(vece, dofs, aofs, some, 32,
3902                             TCG_TYPE_V256, cond, t_vec);
3903             aofs += some;
3904             dofs += some;
3905             oprsz -= some;
3906             maxsz -= some;
3907             /* fallthru */
3908
3909         case TCG_TYPE_V128:
3910             some = QEMU_ALIGN_DOWN(oprsz, 16);
3911             expand_cmps_vec(vece, dofs, aofs, some, 16,
3912                             TCG_TYPE_V128, cond, t_vec);
3913             break;
3914
3915         case TCG_TYPE_V64:
3916             some = QEMU_ALIGN_DOWN(oprsz, 8);
3917             expand_cmps_vec(vece, dofs, aofs, some, 8,
3918                             TCG_TYPE_V64, cond, t_vec);
3919             break;
3920
3921         default:
3922             g_assert_not_reached();
3923         }
3924         tcg_temp_free_vec(t_vec);
3925         tcg_swap_vecop_list(hold_list);
3926     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3927         TCGv_i64 t0 = tcg_temp_ebb_new_i64();
3928         uint32_t i;
3929
3930         for (i = 0; i < oprsz; i += 8) {
3931             tcg_gen_ld_i64(t0, tcg_env, aofs + i);
3932             tcg_gen_negsetcond_i64(cond, t0, t0, c);
3933             tcg_gen_st_i64(t0, tcg_env, dofs + i);
3934         }
3935         tcg_temp_free_i64(t0);
3936     } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3937         TCGv_i32 t0 = tcg_temp_ebb_new_i32();
3938         TCGv_i32 t1 = tcg_temp_ebb_new_i32();
3939         uint32_t i;
3940
3941         tcg_gen_extrl_i64_i32(t1, c);
3942         for (i = 0; i < oprsz; i += 4) {
3943             tcg_gen_ld_i32(t0, tcg_env, aofs + i);
3944             tcg_gen_negsetcond_i32(cond, t0, t0, t1);
3945             tcg_gen_st_i32(t0, tcg_env, dofs + i);
3946         }
3947         tcg_temp_free_i32(t0);
3948         tcg_temp_free_i32(t1);
3949     } else {
3950         gen_helper_gvec_2i * const *fn = fns[cond];
3951         bool inv = false;
3952
3953         if (fn == NULL) {
3954             cond = tcg_invert_cond(cond);
3955             fn = fns[cond];
3956             assert(fn != NULL);
3957             inv = true;
3958         }
3959         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, inv, fn[vece]);
3960         return;
3961     }
3962
3963     if (oprsz < maxsz) {
3964         expand_clr(dofs + oprsz, maxsz - oprsz);
3965     }
3966 }
3967
3968 void tcg_gen_gvec_cmpi(TCGCond cond, unsigned vece, uint32_t dofs,
3969                        uint32_t aofs, int64_t c,
3970                        uint32_t oprsz, uint32_t maxsz)
3971 {
3972     TCGv_i64 tmp = tcg_constant_i64(c);
3973     tcg_gen_gvec_cmps(cond, vece, dofs, aofs, tmp, oprsz, maxsz);
3974 }
3975
3976 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
3977 {
3978     TCGv_i64 t = tcg_temp_ebb_new_i64();
3979
3980     tcg_gen_and_i64(t, b, a);
3981     tcg_gen_andc_i64(d, c, a);
3982     tcg_gen_or_i64(d, d, t);
3983     tcg_temp_free_i64(t);
3984 }
3985
3986 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
3987                          uint32_t bofs, uint32_t cofs,
3988                          uint32_t oprsz, uint32_t maxsz)
3989 {
3990     static const GVecGen4 g = {
3991         .fni8 = tcg_gen_bitsel_i64,
3992         .fniv = tcg_gen_bitsel_vec,
3993         .fno = gen_helper_gvec_bitsel,
3994     };
3995
3996     tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
3997 }