tcg/tcg-op-gvec.c

   1 /*
   2  * Generic vector operation expansion
   3  *
   4  * Copyright (c) 2018 Linaro
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19
  20 #include "qemu/osdep.h"
  21 #include "tcg/tcg.h"
  22 #include "tcg/tcg-op.h"
  23 #include "tcg/tcg-op-gvec.h"
  24 #include "tcg/tcg-gvec-desc.h"
  25
/*
 * Upper bound on the number of inline operations that check_size_impl
 * is willing to accept for a single expansion.
 */
#define MAX_UNROLL  4

/*
 * Placeholder vector-opcode list: a one-entry array holding 0 when
 * CONFIG_DEBUG_TCG is defined, and plain NULL otherwise.
 */
#ifdef CONFIG_DEBUG_TCG
static const TCGOpcode vecop_list_empty[1] = { 0 };
#else
#define vecop_list_empty NULL
#endif
  33
  34
  35 /* Verify vector size and alignment rules.  OFS should be the OR of all
  36    of the operand offsets so that we can check them all at once.  */
  37 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
  38 {
  39     uint32_t max_align;
  40
  41     switch (oprsz) {
  42     case 8:
  43     case 16:
  44     case 32:
  45         tcg_debug_assert(oprsz <= maxsz);
  46         break;
  47     default:
  48         tcg_debug_assert(oprsz == maxsz);
  49         break;
  50     }
  51     tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));
  52
  53     max_align = maxsz >= 16 ? 15 : 7;
  54     tcg_debug_assert((maxsz & max_align) == 0);
  55     tcg_debug_assert((ofs & max_align) == 0);
  56 }
  57
  58 /* Verify vector overlap rules for two operands.  */
  59 static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
  60 {
  61     tcg_debug_assert(d == a || d + s <= a || a + s <= d);
  62 }
  63
  64 /* Verify vector overlap rules for three operands.  */
  65 static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
  66 {
  67     check_overlap_2(d, a, s);
  68     check_overlap_2(d, b, s);
  69     check_overlap_2(a, b, s);
  70 }
  71
  72 /* Verify vector overlap rules for four operands.  */
  73 static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
  74                             uint32_t c, uint32_t s)
  75 {
  76     check_overlap_2(d, a, s);
  77     check_overlap_2(d, b, s);
  78     check_overlap_2(d, c, s);
  79     check_overlap_2(a, b, s);
  80     check_overlap_2(a, c, s);
  81     check_overlap_2(b, c, s);
  82 }
  83
  84 /* Create a descriptor from components.  */
  85 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
  86 {
  87     uint32_t desc = 0;
  88
  89     check_size_align(oprsz, maxsz, 0);
  90     tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS));
  91
  92     oprsz = (oprsz / 8) - 1;
  93     maxsz = (maxsz / 8) - 1;
  94
  95     /*
  96      * We have just asserted in check_size_align that either
  97      * oprsz is {8,16,32} or matches maxsz.  Encode the final
  98      * case with '2', as that would otherwise map to 24.
  99      */
 100     if (oprsz == maxsz) {
 101         oprsz = 2;
 102     }
 103
 104     desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
 105     desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
 106     desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
 107
 108     return desc;
 109 }
 110
 111 /* Generate a call to a gvec-style helper with two vector operands.  */
 112 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
 113                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 114                         gen_helper_gvec_2 *fn)
 115 {
 116     TCGv_ptr a0, a1;
 117     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 118
 119     a0 = tcg_temp_ebb_new_ptr();
 120     a1 = tcg_temp_ebb_new_ptr();
 121
 122     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 123     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 124
 125     fn(a0, a1, desc);
 126
 127     tcg_temp_free_ptr(a0);
 128     tcg_temp_free_ptr(a1);
 129 }
 130
 131 /* Generate a call to a gvec-style helper with two vector operands
 132    and one scalar operand.  */
 133 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
 134                          uint32_t oprsz, uint32_t maxsz, int32_t data,
 135                          gen_helper_gvec_2i *fn)
 136 {
 137     TCGv_ptr a0, a1;
 138     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 139
 140     a0 = tcg_temp_ebb_new_ptr();
 141     a1 = tcg_temp_ebb_new_ptr();
 142
 143     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 144     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 145
 146     fn(a0, a1, c, desc);
 147
 148     tcg_temp_free_ptr(a0);
 149     tcg_temp_free_ptr(a1);
 150 }
 151
 152 /* Generate a call to a gvec-style helper with three vector operands.  */
 153 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 154                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 155                         gen_helper_gvec_3 *fn)
 156 {
 157     TCGv_ptr a0, a1, a2;
 158     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 159
 160     a0 = tcg_temp_ebb_new_ptr();
 161     a1 = tcg_temp_ebb_new_ptr();
 162     a2 = tcg_temp_ebb_new_ptr();
 163
 164     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 165     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 166     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 167
 168     fn(a0, a1, a2, desc);
 169
 170     tcg_temp_free_ptr(a0);
 171     tcg_temp_free_ptr(a1);
 172     tcg_temp_free_ptr(a2);
 173 }
 174
 175 /* Generate a call to a gvec-style helper with four vector operands.  */
 176 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 177                         uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
 178                         int32_t data, gen_helper_gvec_4 *fn)
 179 {
 180     TCGv_ptr a0, a1, a2, a3;
 181     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 182
 183     a0 = tcg_temp_ebb_new_ptr();
 184     a1 = tcg_temp_ebb_new_ptr();
 185     a2 = tcg_temp_ebb_new_ptr();
 186     a3 = tcg_temp_ebb_new_ptr();
 187
 188     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 189     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 190     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 191     tcg_gen_addi_ptr(a3, cpu_env, cofs);
 192
 193     fn(a0, a1, a2, a3, desc);
 194
 195     tcg_temp_free_ptr(a0);
 196     tcg_temp_free_ptr(a1);
 197     tcg_temp_free_ptr(a2);
 198     tcg_temp_free_ptr(a3);
 199 }
 200
 201 /* Generate a call to a gvec-style helper with five vector operands.  */
 202 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 203                         uint32_t cofs, uint32_t xofs, uint32_t oprsz,
 204                         uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
 205 {
 206     TCGv_ptr a0, a1, a2, a3, a4;
 207     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 208
 209     a0 = tcg_temp_ebb_new_ptr();
 210     a1 = tcg_temp_ebb_new_ptr();
 211     a2 = tcg_temp_ebb_new_ptr();
 212     a3 = tcg_temp_ebb_new_ptr();
 213     a4 = tcg_temp_ebb_new_ptr();
 214
 215     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 216     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 217     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 218     tcg_gen_addi_ptr(a3, cpu_env, cofs);
 219     tcg_gen_addi_ptr(a4, cpu_env, xofs);
 220
 221     fn(a0, a1, a2, a3, a4, desc);
 222
 223     tcg_temp_free_ptr(a0);
 224     tcg_temp_free_ptr(a1);
 225     tcg_temp_free_ptr(a2);
 226     tcg_temp_free_ptr(a3);
 227     tcg_temp_free_ptr(a4);
 228 }
 229
 230 /* Generate a call to a gvec-style helper with three vector operands
 231    and an extra pointer operand.  */
 232 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
 233                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
 234                         int32_t data, gen_helper_gvec_2_ptr *fn)
 235 {
 236     TCGv_ptr a0, a1;
 237     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 238
 239     a0 = tcg_temp_ebb_new_ptr();
 240     a1 = tcg_temp_ebb_new_ptr();
 241
 242     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 243     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 244
 245     fn(a0, a1, ptr, desc);
 246
 247     tcg_temp_free_ptr(a0);
 248     tcg_temp_free_ptr(a1);
 249 }
 250
 251 /* Generate a call to a gvec-style helper with three vector operands
 252    and an extra pointer operand.  */
 253 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 254                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
 255                         int32_t data, gen_helper_gvec_3_ptr *fn)
 256 {
 257     TCGv_ptr a0, a1, a2;
 258     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 259
 260     a0 = tcg_temp_ebb_new_ptr();
 261     a1 = tcg_temp_ebb_new_ptr();
 262     a2 = tcg_temp_ebb_new_ptr();
 263
 264     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 265     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 266     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 267
 268     fn(a0, a1, a2, ptr, desc);
 269
 270     tcg_temp_free_ptr(a0);
 271     tcg_temp_free_ptr(a1);
 272     tcg_temp_free_ptr(a2);
 273 }
 274
 275 /* Generate a call to a gvec-style helper with four vector operands
 276    and an extra pointer operand.  */
 277 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 278                         uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
 279                         uint32_t maxsz, int32_t data,
 280                         gen_helper_gvec_4_ptr *fn)
 281 {
 282     TCGv_ptr a0, a1, a2, a3;
 283     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 284
 285     a0 = tcg_temp_ebb_new_ptr();
 286     a1 = tcg_temp_ebb_new_ptr();
 287     a2 = tcg_temp_ebb_new_ptr();
 288     a3 = tcg_temp_ebb_new_ptr();
 289
 290     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 291     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 292     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 293     tcg_gen_addi_ptr(a3, cpu_env, cofs);
 294
 295     fn(a0, a1, a2, a3, ptr, desc);
 296
 297     tcg_temp_free_ptr(a0);
 298     tcg_temp_free_ptr(a1);
 299     tcg_temp_free_ptr(a2);
 300     tcg_temp_free_ptr(a3);
 301 }
 302
 303 /* Generate a call to a gvec-style helper with five vector operands
 304    and an extra pointer operand.  */
 305 void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 306                         uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
 307                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 308                         gen_helper_gvec_5_ptr *fn)
 309 {
 310     TCGv_ptr a0, a1, a2, a3, a4;
 311     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 312
 313     a0 = tcg_temp_ebb_new_ptr();
 314     a1 = tcg_temp_ebb_new_ptr();
 315     a2 = tcg_temp_ebb_new_ptr();
 316     a3 = tcg_temp_ebb_new_ptr();
 317     a4 = tcg_temp_ebb_new_ptr();
 318
 319     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 320     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 321     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 322     tcg_gen_addi_ptr(a3, cpu_env, cofs);
 323     tcg_gen_addi_ptr(a4, cpu_env, eofs);
 324
 325     fn(a0, a1, a2, a3, a4, ptr, desc);
 326
 327     tcg_temp_free_ptr(a0);
 328     tcg_temp_free_ptr(a1);
 329     tcg_temp_free_ptr(a2);
 330     tcg_temp_free_ptr(a3);
 331     tcg_temp_free_ptr(a4);
 332 }
 333
 334 /* Return true if we want to implement something of OPRSZ bytes
 335    in units of LNSZ.  This limits the expansion of inline code.  */
 336 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
 337 {
 338     uint32_t q, r;
 339
 340     if (oprsz < lnsz) {
 341         return false;
 342     }
 343
 344     q = oprsz / lnsz;
 345     r = oprsz % lnsz;
 346     tcg_debug_assert((r & 7) == 0);
 347
 348     if (lnsz < 16) {
 349         /* For sizes below 16, accept no remainder. */
 350         if (r != 0) {
 351             return false;
 352         }
 353     } else {
 354         /*
 355          * Recall that ARM SVE allows vector sizes that are not a
 356          * power of 2, but always a multiple of 16.  The intent is
 357          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 358          * In addition, expand_clr needs to handle a multiple of 8.
 359          * Thus we can handle the tail with one more operation per
 360          * diminishing power of 2.
 361          */
 362         q += ctpop32(r);
 363     }
 364
 365     return q <= MAX_UNROLL;
 366 }
 367
/*
 * Forward declaration: do_dup_store and do_dup call expand_clr, which is
 * itself defined after do_dup.
 */
static void expand_clr(uint32_t dofs, uint32_t maxsz);
 369
 370 /* Duplicate C as per VECE.  */
 371 uint64_t (dup_const)(unsigned vece, uint64_t c)
 372 {
 373     switch (vece) {
 374     case MO_8:
 375         return 0x0101010101010101ull * (uint8_t)c;
 376     case MO_16:
 377         return 0x0001000100010001ull * (uint16_t)c;
 378     case MO_32:
 379         return 0x0000000100000001ull * (uint32_t)c;
 380     case MO_64:
 381         return c;
 382     default:
 383         g_assert_not_reached();
 384     }
 385 }
 386
 387 /* Duplicate IN into OUT as per VECE.  */
 388 void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
 389 {
 390     switch (vece) {
 391     case MO_8:
 392         tcg_gen_ext8u_i32(out, in);
 393         tcg_gen_muli_i32(out, out, 0x01010101);
 394         break;
 395     case MO_16:
 396         tcg_gen_deposit_i32(out, in, in, 16, 16);
 397         break;
 398     case MO_32:
 399         tcg_gen_mov_i32(out, in);
 400         break;
 401     default:
 402         g_assert_not_reached();
 403     }
 404 }
 405
 406 void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
 407 {
 408     switch (vece) {
 409     case MO_8:
 410         tcg_gen_ext8u_i64(out, in);
 411         tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
 412         break;
 413     case MO_16:
 414         tcg_gen_ext16u_i64(out, in);
 415         tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
 416         break;
 417     case MO_32:
 418         tcg_gen_deposit_i64(out, in, in, 32, 32);
 419         break;
 420     case MO_64:
 421         tcg_gen_mov_i64(out, in);
 422         break;
 423     default:
 424         g_assert_not_reached();
 425     }
 426 }
 427
 428 /* Select a supported vector type for implementing an operation on SIZE
 429  * bytes.  If OP is 0, assume that the real operation to be performed is
 430  * required by all backends.  Otherwise, make sure than OP can be performed
 431  * on elements of size VECE in the selected type.  Do not select V64 if
 432  * PREFER_I64 is true.  Return 0 if no vector type is selected.
 433  */
 434 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
 435                                   uint32_t size, bool prefer_i64)
 436 {
 437     /*
 438      * Recall that ARM SVE allows vector sizes that are not a
 439      * power of 2, but always a multiple of 16.  The intent is
 440      * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 441      * It is hard to imagine a case in which v256 is supported
 442      * but v128 is not, but check anyway.
 443      * In addition, expand_clr needs to handle a multiple of 8.
 444      */
 445     if (TCG_TARGET_HAS_v256 &&
 446         check_size_impl(size, 32) &&
 447         tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
 448         (!(size & 16) ||
 449          (TCG_TARGET_HAS_v128 &&
 450           tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
 451         (!(size & 8) ||
 452          (TCG_TARGET_HAS_v64 &&
 453           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
 454         return TCG_TYPE_V256;
 455     }
 456     if (TCG_TARGET_HAS_v128 &&
 457         check_size_impl(size, 16) &&
 458         tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
 459         (!(size & 8) ||
 460          (TCG_TARGET_HAS_v64 &&
 461           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
 462         return TCG_TYPE_V128;
 463     }
 464     if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
 465         && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
 466         return TCG_TYPE_V64;
 467     }
 468     return 0;
 469 }
 470
 471 static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
 472                          uint32_t maxsz, TCGv_vec t_vec)
 473 {
 474     uint32_t i = 0;
 475
 476     tcg_debug_assert(oprsz >= 8);
 477
 478     /*
 479      * This may be expand_clr for the tail of an operation, e.g.
 480      * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
 481      * are misaligned wrt the maximum vector size, so do that first.
 482      */
 483     if (dofs & 8) {
 484         tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
 485         i += 8;
 486     }
 487
 488     switch (type) {
 489     case TCG_TYPE_V256:
 490         /*
 491          * Recall that ARM SVE allows vector sizes that are not a
 492          * power of 2, but always a multiple of 16.  The intent is
 493          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 494          */
 495         for (; i + 32 <= oprsz; i += 32) {
 496             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
 497         }
 498         /* fallthru */
 499     case TCG_TYPE_V128:
 500         for (; i + 16 <= oprsz; i += 16) {
 501             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
 502         }
 503         break;
 504     case TCG_TYPE_V64:
 505         for (; i < oprsz; i += 8) {
 506             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
 507         }
 508         break;
 509     default:
 510         g_assert_not_reached();
 511     }
 512
 513     if (oprsz < maxsz) {
 514         expand_clr(dofs + oprsz, maxsz - oprsz);
 515     }
 516 }
 517
/*
 * Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C,
 * replicated as per VECE.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 *
 * Three strategies are tried in order:
 *   1. inline stores of a host vector register (choose_vector_type);
 *   2. inline stores of a 32-bit or 64-bit integer temp, when
 *      check_size_impl accepts the size;
 *   3. an out-of-line call, either to memset or to a gvec_dup helper.
 *
 * On the inline paths any bytes between OPRSZ and MAXSZ are cleared with
 * expand_clr; the gvec_dup helpers receive MAXSZ through the descriptor.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    /* A 32-bit input cannot supply a 64-bit element. */
    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz.  */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
            vece = MO_8;
        } else if (in_c == dup_const(MO_8, in_c)) {
            /* Every byte of the constant is the same: treat as bytes. */
            vece = MO_8;
        }
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and no variable dup.
     */
    type = choose_vector_type(NULL, vece, oprsz,
                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                               && (in_64 == NULL || vece == MO_64)));
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            tcg_gen_dupi_vec(vece, t_vec, in_c);
        }
        /* do_dup_store also clears the oprsz..maxsz tail. */
        do_dup_store(type, dofs, oprsz, maxsz, t_vec);
        tcg_temp_free_vec(t_vec);
        return;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        /* Exactly one of t_64 / t_32 is set below to pick the store width. */
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_ebb_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                tcg_gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_ebb_new_i32();
                tcg_gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_ebb_new_i64();
            tcg_gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_constant_i64(in_c);
            } else {
                t_32 = tcg_constant_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_ebb_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);

    /*
     * This may be expand_clr for the tail of an operation, e.g.
     * oprsz == 8 && maxsz == 64.  The size of the clear is misaligned
     * wrt simd_desc and will assert.  Simply pass all replicated byte
     * stores through to memset.
     */
    if (oprsz == maxsz && vece == MO_8) {
        TCGv_ptr t_size = tcg_constant_ptr(oprsz);
        TCGv_i32 t_val;

        if (in_32) {
            t_val = in_32;
        } else if (in_64) {
            /* memset takes a 32-bit value: use the low half of in_64. */
            t_val = tcg_temp_ebb_new_i32();
            tcg_gen_extrl_i64_i32(t_val, in_64);
        } else {
            t_val = tcg_constant_i32(in_c);
        }
        gen_helper_memset(t_ptr, t_ptr, t_val, t_size);

        /* t_val is a freshly allocated temp only in the in_64 case. */
        if (in_64) {
            tcg_temp_free_i32(t_val);
        }
        tcg_temp_free_ptr(t_ptr);
        return;
    }

    t_desc = tcg_constant_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_constant_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
        }
    } else {
        /* Helpers for 8-, 16- and 32-bit elements, indexed by vece. */
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else if (in_64) {
            /* These helpers take a 32-bit value: use the low half. */
            t_32 = tcg_temp_ebb_new_i32();
            tcg_gen_extrl_i64_i32(t_32, in_64);
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        } else {
            /* in_c was replicated above; reduce it to a single element. */
            if (vece == MO_8) {
                in_c &= 0xff;
            } else if (vece == MO_16) {
                in_c &= 0xffff;
            }
            t_32 = tcg_constant_i32(in_c);
            fns[vece](t_ptr, t_desc, t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    return;

    /* Reached only from the inline integer stores above. */
 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
 695
 696 /* Likewise, but with zero.  */
 697 static void expand_clr(uint32_t dofs, uint32_t maxsz)
 698 {
 699     do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
 700 }
 701
 702 /* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
 703 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 704                          bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
 705 {
 706     TCGv_i32 t0 = tcg_temp_new_i32();
 707     TCGv_i32 t1 = tcg_temp_new_i32();
 708     uint32_t i;
 709
 710     for (i = 0; i < oprsz; i += 4) {
 711         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 712         if (load_dest) {
 713             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
 714         }
 715         fni(t1, t0);
 716         tcg_gen_st_i32(t1, cpu_env, dofs + i);
 717     }
 718     tcg_temp_free_i32(t0);
 719     tcg_temp_free_i32(t1);
 720 }
 721
 722 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 723                           int32_t c, bool load_dest,
 724                           void (*fni)(TCGv_i32, TCGv_i32, int32_t))
 725 {
 726     TCGv_i32 t0 = tcg_temp_new_i32();
 727     TCGv_i32 t1 = tcg_temp_new_i32();
 728     uint32_t i;
 729
 730     for (i = 0; i < oprsz; i += 4) {
 731         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 732         if (load_dest) {
 733             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
 734         }
 735         fni(t1, t0, c);
 736         tcg_gen_st_i32(t1, cpu_env, dofs + i);
 737     }
 738     tcg_temp_free_i32(t0);
 739     tcg_temp_free_i32(t1);
 740 }
 741
 742 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 743                           TCGv_i32 c, bool scalar_first,
 744                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
 745 {
 746     TCGv_i32 t0 = tcg_temp_new_i32();
 747     TCGv_i32 t1 = tcg_temp_new_i32();
 748     uint32_t i;
 749
 750     for (i = 0; i < oprsz; i += 4) {
 751         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 752         if (scalar_first) {
 753             fni(t1, c, t0);
 754         } else {
 755             fni(t1, t0, c);
 756         }
 757         tcg_gen_st_i32(t1, cpu_env, dofs + i);
 758     }
 759     tcg_temp_free_i32(t0);
 760     tcg_temp_free_i32(t1);
 761 }
 762
 763 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
 764 static void expand_3_i32(uint32_t dofs, uint32_t aofs,
 765                          uint32_t bofs, uint32_t oprsz, bool load_dest,
 766                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
 767 {
 768     TCGv_i32 t0 = tcg_temp_new_i32();
 769     TCGv_i32 t1 = tcg_temp_new_i32();
 770     TCGv_i32 t2 = tcg_temp_new_i32();
 771     uint32_t i;
 772
 773     for (i = 0; i < oprsz; i += 4) {
 774         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 775         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
 776         if (load_dest) {
 777             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
 778         }
 779         fni(t2, t0, t1);
 780         tcg_gen_st_i32(t2, cpu_env, dofs + i);
 781     }
 782     tcg_temp_free_i32(t2);
 783     tcg_temp_free_i32(t1);
 784     tcg_temp_free_i32(t0);
 785 }
 786
 787 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 788                           uint32_t oprsz, int32_t c, bool load_dest,
 789                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
 790 {
 791     TCGv_i32 t0 = tcg_temp_new_i32();
 792     TCGv_i32 t1 = tcg_temp_new_i32();
 793     TCGv_i32 t2 = tcg_temp_new_i32();
 794     uint32_t i;
 795
 796     for (i = 0; i < oprsz; i += 4) {
 797         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 798         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
 799         if (load_dest) {
 800             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
 801         }
 802         fni(t2, t0, t1, c);
 803         tcg_gen_st_i32(t2, cpu_env, dofs + i);
 804     }
 805     tcg_temp_free_i32(t0);
 806     tcg_temp_free_i32(t1);
 807     tcg_temp_free_i32(t2);
 808 }
 809
 810 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
 811 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 812                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
 813                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
 814 {
 815     TCGv_i32 t0 = tcg_temp_new_i32();
 816     TCGv_i32 t1 = tcg_temp_new_i32();
 817     TCGv_i32 t2 = tcg_temp_new_i32();
 818     TCGv_i32 t3 = tcg_temp_new_i32();
 819     uint32_t i;
 820
 821     for (i = 0; i < oprsz; i += 4) {
 822         tcg_gen_ld_i32(t1, cpu_env, aofs + i);
 823         tcg_gen_ld_i32(t2, cpu_env, bofs + i);
 824         tcg_gen_ld_i32(t3, cpu_env, cofs + i);
 825         fni(t0, t1, t2, t3);
 826         tcg_gen_st_i32(t0, cpu_env, dofs + i);
 827         if (write_aofs) {
 828             tcg_gen_st_i32(t1, cpu_env, aofs + i);
 829         }
 830     }
 831     tcg_temp_free_i32(t3);
 832     tcg_temp_free_i32(t2);
 833     tcg_temp_free_i32(t1);
 834     tcg_temp_free_i32(t0);
 835 }
 836
 837 static void expand_4i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 838                           uint32_t cofs, uint32_t oprsz, int32_t c,
 839                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32,
 840                                       int32_t))
 841 {
 842     TCGv_i32 t0 = tcg_temp_new_i32();
 843     TCGv_i32 t1 = tcg_temp_new_i32();
 844     TCGv_i32 t2 = tcg_temp_new_i32();
 845     TCGv_i32 t3 = tcg_temp_new_i32();
 846     uint32_t i;
 847
 848     for (i = 0; i < oprsz; i += 4) {
 849         tcg_gen_ld_i32(t1, cpu_env, aofs + i);
 850         tcg_gen_ld_i32(t2, cpu_env, bofs + i);
 851         tcg_gen_ld_i32(t3, cpu_env, cofs + i);
 852         fni(t0, t1, t2, t3, c);
 853         tcg_gen_st_i32(t0, cpu_env, dofs + i);
 854     }
 855     tcg_temp_free_i32(t3);
 856     tcg_temp_free_i32(t2);
 857     tcg_temp_free_i32(t1);
 858     tcg_temp_free_i32(t0);
 859 }
 860
 861 /* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
 862 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 863                          bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
 864 {
 865     TCGv_i64 t0 = tcg_temp_new_i64();
 866     TCGv_i64 t1 = tcg_temp_new_i64();
 867     uint32_t i;
 868
 869     for (i = 0; i < oprsz; i += 8) {
 870         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 871         if (load_dest) {
 872             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
 873         }
 874         fni(t1, t0);
 875         tcg_gen_st_i64(t1, cpu_env, dofs + i);
 876     }
 877     tcg_temp_free_i64(t0);
 878     tcg_temp_free_i64(t1);
 879 }
 880
 881 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 882                           int64_t c, bool load_dest,
 883                           void (*fni)(TCGv_i64, TCGv_i64, int64_t))
 884 {
 885     TCGv_i64 t0 = tcg_temp_new_i64();
 886     TCGv_i64 t1 = tcg_temp_new_i64();
 887     uint32_t i;
 888
 889     for (i = 0; i < oprsz; i += 8) {
 890         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 891         if (load_dest) {
 892             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
 893         }
 894         fni(t1, t0, c);
 895         tcg_gen_st_i64(t1, cpu_env, dofs + i);
 896     }
 897     tcg_temp_free_i64(t0);
 898     tcg_temp_free_i64(t1);
 899 }
 900
 901 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 902                           TCGv_i64 c, bool scalar_first,
 903                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
 904 {
 905     TCGv_i64 t0 = tcg_temp_new_i64();
 906     TCGv_i64 t1 = tcg_temp_new_i64();
 907     uint32_t i;
 908
 909     for (i = 0; i < oprsz; i += 8) {
 910         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 911         if (scalar_first) {
 912             fni(t1, c, t0);
 913         } else {
 914             fni(t1, t0, c);
 915         }
 916         tcg_gen_st_i64(t1, cpu_env, dofs + i);
 917     }
 918     tcg_temp_free_i64(t0);
 919     tcg_temp_free_i64(t1);
 920 }
 921
 922 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
 923 static void expand_3_i64(uint32_t dofs, uint32_t aofs,
 924                          uint32_t bofs, uint32_t oprsz, bool load_dest,
 925                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
 926 {
 927     TCGv_i64 t0 = tcg_temp_new_i64();
 928     TCGv_i64 t1 = tcg_temp_new_i64();
 929     TCGv_i64 t2 = tcg_temp_new_i64();
 930     uint32_t i;
 931
 932     for (i = 0; i < oprsz; i += 8) {
 933         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 934         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
 935         if (load_dest) {
 936             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
 937         }
 938         fni(t2, t0, t1);
 939         tcg_gen_st_i64(t2, cpu_env, dofs + i);
 940     }
 941     tcg_temp_free_i64(t2);
 942     tcg_temp_free_i64(t1);
 943     tcg_temp_free_i64(t0);
 944 }
 945
 946 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 947                           uint32_t oprsz, int64_t c, bool load_dest,
 948                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
 949 {
 950     TCGv_i64 t0 = tcg_temp_new_i64();
 951     TCGv_i64 t1 = tcg_temp_new_i64();
 952     TCGv_i64 t2 = tcg_temp_new_i64();
 953     uint32_t i;
 954
 955     for (i = 0; i < oprsz; i += 8) {
 956         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 957         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
 958         if (load_dest) {
 959             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
 960         }
 961         fni(t2, t0, t1, c);
 962         tcg_gen_st_i64(t2, cpu_env, dofs + i);
 963     }
 964     tcg_temp_free_i64(t0);
 965     tcg_temp_free_i64(t1);
 966     tcg_temp_free_i64(t2);
 967 }
 968
 969 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
 970 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 971                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
 972                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
 973 {
 974     TCGv_i64 t0 = tcg_temp_new_i64();
 975     TCGv_i64 t1 = tcg_temp_new_i64();
 976     TCGv_i64 t2 = tcg_temp_new_i64();
 977     TCGv_i64 t3 = tcg_temp_new_i64();
 978     uint32_t i;
 979
 980     for (i = 0; i < oprsz; i += 8) {
 981         tcg_gen_ld_i64(t1, cpu_env, aofs + i);
 982         tcg_gen_ld_i64(t2, cpu_env, bofs + i);
 983         tcg_gen_ld_i64(t3, cpu_env, cofs + i);
 984         fni(t0, t1, t2, t3);
 985         tcg_gen_st_i64(t0, cpu_env, dofs + i);
 986         if (write_aofs) {
 987             tcg_gen_st_i64(t1, cpu_env, aofs + i);
 988         }
 989     }
 990     tcg_temp_free_i64(t3);
 991     tcg_temp_free_i64(t2);
 992     tcg_temp_free_i64(t1);
 993     tcg_temp_free_i64(t0);
 994 }
 995
 996 static void expand_4i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 997                           uint32_t cofs, uint32_t oprsz, int64_t c,
 998                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64,
 999                                       int64_t))
1000 {
1001     TCGv_i64 t0 = tcg_temp_new_i64();
1002     TCGv_i64 t1 = tcg_temp_new_i64();
1003     TCGv_i64 t2 = tcg_temp_new_i64();
1004     TCGv_i64 t3 = tcg_temp_new_i64();
1005     uint32_t i;
1006
1007     for (i = 0; i < oprsz; i += 8) {
1008         tcg_gen_ld_i64(t1, cpu_env, aofs + i);
1009         tcg_gen_ld_i64(t2, cpu_env, bofs + i);
1010         tcg_gen_ld_i64(t3, cpu_env, cofs + i);
1011         fni(t0, t1, t2, t3, c);
1012         tcg_gen_st_i64(t0, cpu_env, dofs + i);
1013     }
1014     tcg_temp_free_i64(t3);
1015     tcg_temp_free_i64(t2);
1016     tcg_temp_free_i64(t1);
1017     tcg_temp_free_i64(t0);
1018 }
1019
1020 /* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
1021 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1022                          uint32_t oprsz, uint32_t tysz, TCGType type,
1023                          bool load_dest,
1024                          void (*fni)(unsigned, TCGv_vec, TCGv_vec))
1025 {
1026     TCGv_vec t0 = tcg_temp_new_vec(type);
1027     TCGv_vec t1 = tcg_temp_new_vec(type);
1028     uint32_t i;
1029
1030     for (i = 0; i < oprsz; i += tysz) {
1031         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1032         if (load_dest) {
1033             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
1034         }
1035         fni(vece, t1, t0);
1036         tcg_gen_st_vec(t1, cpu_env, dofs + i);
1037     }
1038     tcg_temp_free_vec(t0);
1039     tcg_temp_free_vec(t1);
1040 }
1041
1042 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand
1043    using host vectors.  */
1044 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1045                           uint32_t oprsz, uint32_t tysz, TCGType type,
1046                           int64_t c, bool load_dest,
1047                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
1048 {
1049     TCGv_vec t0 = tcg_temp_new_vec(type);
1050     TCGv_vec t1 = tcg_temp_new_vec(type);
1051     uint32_t i;
1052
1053     for (i = 0; i < oprsz; i += tysz) {
1054         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1055         if (load_dest) {
1056             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
1057         }
1058         fni(vece, t1, t0, c);
1059         tcg_gen_st_vec(t1, cpu_env, dofs + i);
1060     }
1061     tcg_temp_free_vec(t0);
1062     tcg_temp_free_vec(t1);
1063 }
1064
1065 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1066                           uint32_t oprsz, uint32_t tysz, TCGType type,
1067                           TCGv_vec c, bool scalar_first,
1068                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1069 {
1070     TCGv_vec t0 = tcg_temp_new_vec(type);
1071     TCGv_vec t1 = tcg_temp_new_vec(type);
1072     uint32_t i;
1073
1074     for (i = 0; i < oprsz; i += tysz) {
1075         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1076         if (scalar_first) {
1077             fni(vece, t1, c, t0);
1078         } else {
1079             fni(vece, t1, t0, c);
1080         }
1081         tcg_gen_st_vec(t1, cpu_env, dofs + i);
1082     }
1083     tcg_temp_free_vec(t0);
1084     tcg_temp_free_vec(t1);
1085 }
1086
1087 /* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
1088 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1089                          uint32_t bofs, uint32_t oprsz,
1090                          uint32_t tysz, TCGType type, bool load_dest,
1091                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1092 {
1093     TCGv_vec t0 = tcg_temp_new_vec(type);
1094     TCGv_vec t1 = tcg_temp_new_vec(type);
1095     TCGv_vec t2 = tcg_temp_new_vec(type);
1096     uint32_t i;
1097
1098     for (i = 0; i < oprsz; i += tysz) {
1099         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1100         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1101         if (load_dest) {
1102             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1103         }
1104         fni(vece, t2, t0, t1);
1105         tcg_gen_st_vec(t2, cpu_env, dofs + i);
1106     }
1107     tcg_temp_free_vec(t2);
1108     tcg_temp_free_vec(t1);
1109     tcg_temp_free_vec(t0);
1110 }
1111
1112 /*
1113  * Expand OPSZ bytes worth of three-vector operands and an immediate operand
1114  * using host vectors.
1115  */
1116 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1117                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
1118                           TCGType type, int64_t c, bool load_dest,
1119                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
1120                                       int64_t))
1121 {
1122     TCGv_vec t0 = tcg_temp_new_vec(type);
1123     TCGv_vec t1 = tcg_temp_new_vec(type);
1124     TCGv_vec t2 = tcg_temp_new_vec(type);
1125     uint32_t i;
1126
1127     for (i = 0; i < oprsz; i += tysz) {
1128         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1129         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1130         if (load_dest) {
1131             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1132         }
1133         fni(vece, t2, t0, t1, c);
1134         tcg_gen_st_vec(t2, cpu_env, dofs + i);
1135     }
1136     tcg_temp_free_vec(t0);
1137     tcg_temp_free_vec(t1);
1138     tcg_temp_free_vec(t2);
1139 }
1140
1141 /* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
1142 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1143                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1144                          uint32_t tysz, TCGType type, bool write_aofs,
1145                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1146                                      TCGv_vec, TCGv_vec))
1147 {
1148     TCGv_vec t0 = tcg_temp_new_vec(type);
1149     TCGv_vec t1 = tcg_temp_new_vec(type);
1150     TCGv_vec t2 = tcg_temp_new_vec(type);
1151     TCGv_vec t3 = tcg_temp_new_vec(type);
1152     uint32_t i;
1153
1154     for (i = 0; i < oprsz; i += tysz) {
1155         tcg_gen_ld_vec(t1, cpu_env, aofs + i);
1156         tcg_gen_ld_vec(t2, cpu_env, bofs + i);
1157         tcg_gen_ld_vec(t3, cpu_env, cofs + i);
1158         fni(vece, t0, t1, t2, t3);
1159         tcg_gen_st_vec(t0, cpu_env, dofs + i);
1160         if (write_aofs) {
1161             tcg_gen_st_vec(t1, cpu_env, aofs + i);
1162         }
1163     }
1164     tcg_temp_free_vec(t3);
1165     tcg_temp_free_vec(t2);
1166     tcg_temp_free_vec(t1);
1167     tcg_temp_free_vec(t0);
1168 }
1169
1170 /*
1171  * Expand OPSZ bytes worth of four-vector operands and an immediate operand
1172  * using host vectors.
1173  */
1174 static void expand_4i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1175                           uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1176                           uint32_t tysz, TCGType type, int64_t c,
1177                           void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1178                                      TCGv_vec, TCGv_vec, int64_t))
1179 {
1180     TCGv_vec t0 = tcg_temp_new_vec(type);
1181     TCGv_vec t1 = tcg_temp_new_vec(type);
1182     TCGv_vec t2 = tcg_temp_new_vec(type);
1183     TCGv_vec t3 = tcg_temp_new_vec(type);
1184     uint32_t i;
1185
1186     for (i = 0; i < oprsz; i += tysz) {
1187         tcg_gen_ld_vec(t1, cpu_env, aofs + i);
1188         tcg_gen_ld_vec(t2, cpu_env, bofs + i);
1189         tcg_gen_ld_vec(t3, cpu_env, cofs + i);
1190         fni(vece, t0, t1, t2, t3, c);
1191         tcg_gen_st_vec(t0, cpu_env, dofs + i);
1192     }
1193     tcg_temp_free_vec(t3);
1194     tcg_temp_free_vec(t2);
1195     tcg_temp_free_vec(t1);
1196     tcg_temp_free_vec(t0);
1197 }
1198
/*
 * Expand a vector two-operand operation.
 *
 * DOFS and AOFS are the destination and source offsets (the expanders
 * use them relative to cpu_env).  OPRSZ is the number of bytes operated
 * on and MAXSZ the full size of the destination; the bytes between
 * OPRSZ and MAXSZ are handed to expand_clr() at the end.
 *
 * G supplies the candidate implementations, tried in this order:
 *   .fniv       - inline with a host vector type, if one is chosen;
 *   .fni8/.fni4 - inline with 64-bit / 32-bit integer elements;
 *   .fno        - out-of-line helper (asserted non-NULL when reached).
 */
void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
{
    /* Install this operation's vecop list; restored after the switch. */
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    /* 0 means "no host vector type": use the integer or helper paths. */
    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                     g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        /*
         * Step past the part already expanded; the remainder is handled
         * by the V128 case.  MAXSZ shrinks by the same amount so the
         * tail computation after the switch is unaffected.
         */
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                     g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                     g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
            /* The helper was given MAXSZ; skip the tail handling below. */
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    /* Inline expansions only covered OPRSZ bytes; deal with the rest. */
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
1262
1263 /* Expand a vector operation with two vectors and an immediate.  */
1264 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1265                      uint32_t maxsz, int64_t c, const GVecGen2i *g)
1266 {
1267     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1268     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1269     TCGType type;
1270     uint32_t some;
1271
1272     check_size_align(oprsz, maxsz, dofs | aofs);
1273     check_overlap_2(dofs, aofs, maxsz);
1274
1275     type = 0;
1276     if (g->fniv) {
1277         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1278     }
1279     switch (type) {
1280     case TCG_TYPE_V256:
1281         /* Recall that ARM SVE allows vector sizes that are not a
1282          * power of 2, but always a multiple of 16.  The intent is
1283          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1284          */
1285         some = QEMU_ALIGN_DOWN(oprsz, 32);
1286         expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1287                       c, g->load_dest, g->fniv);
1288         if (some == oprsz) {
1289             break;
1290         }
1291         dofs += some;
1292         aofs += some;
1293         oprsz -= some;
1294         maxsz -= some;
1295         /* fallthru */
1296     case TCG_TYPE_V128:
1297         expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1298                       c, g->load_dest, g->fniv);
1299         break;
1300     case TCG_TYPE_V64:
1301         expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1302                       c, g->load_dest, g->fniv);
1303         break;
1304
1305     case 0:
1306         if (g->fni8 && check_size_impl(oprsz, 8)) {
1307             expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
1308         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1309             expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
1310         } else {
1311             if (g->fno) {
1312                 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
1313             } else {
1314                 TCGv_i64 tcg_c = tcg_constant_i64(c);
1315                 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
1316                                     maxsz, c, g->fnoi);
1317             }
1318             oprsz = maxsz;
1319         }
1320         break;
1321
1322     default:
1323         g_assert_not_reached();
1324     }
1325     tcg_swap_vecop_list(hold_list);
1326
1327     if (oprsz < maxsz) {
1328         expand_clr(dofs + oprsz, maxsz - oprsz);
1329     }
1330 }
1331
1332 /* Expand a vector operation with two vectors and a scalar.  */
1333 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1334                      uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
1335 {
1336     TCGType type;
1337
1338     check_size_align(oprsz, maxsz, dofs | aofs);
1339     check_overlap_2(dofs, aofs, maxsz);
1340
1341     type = 0;
1342     if (g->fniv) {
1343         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1344     }
1345     if (type != 0) {
1346         const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1347         const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1348         TCGv_vec t_vec = tcg_temp_new_vec(type);
1349         uint32_t some;
1350
1351         tcg_gen_dup_i64_vec(g->vece, t_vec, c);
1352
1353         switch (type) {
1354         case TCG_TYPE_V256:
1355             /* Recall that ARM SVE allows vector sizes that are not a
1356              * power of 2, but always a multiple of 16.  The intent is
1357              * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1358              */
1359             some = QEMU_ALIGN_DOWN(oprsz, 32);
1360             expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1361                           t_vec, g->scalar_first, g->fniv);
1362             if (some == oprsz) {
1363                 break;
1364             }
1365             dofs += some;
1366             aofs += some;
1367             oprsz -= some;
1368             maxsz -= some;
1369             /* fallthru */
1370
1371         case TCG_TYPE_V128:
1372             expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1373                           t_vec, g->scalar_first, g->fniv);
1374             break;
1375
1376         case TCG_TYPE_V64:
1377             expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1378                           t_vec, g->scalar_first, g->fniv);
1379             break;
1380
1381         default:
1382             g_assert_not_reached();
1383         }
1384         tcg_temp_free_vec(t_vec);
1385         tcg_swap_vecop_list(hold_list);
1386     } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1387         TCGv_i64 t64 = tcg_temp_new_i64();
1388
1389         tcg_gen_dup_i64(g->vece, t64, c);
1390         expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
1391         tcg_temp_free_i64(t64);
1392     } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1393         TCGv_i32 t32 = tcg_temp_new_i32();
1394
1395         tcg_gen_extrl_i64_i32(t32, c);
1396         tcg_gen_dup_i32(g->vece, t32, t32);
1397         expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
1398         tcg_temp_free_i32(t32);
1399     } else {
1400         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
1401         return;
1402     }
1403
1404     if (oprsz < maxsz) {
1405         expand_clr(dofs + oprsz, maxsz - oprsz);
1406     }
1407 }
1408
/*
 * Expand a vector three-operand operation.
 *
 * DOFS is the destination offset and AOFS/BOFS the source offsets (the
 * expanders use them relative to cpu_env).  OPRSZ is the number of bytes
 * operated on and MAXSZ the full size of the destination; the bytes
 * between OPRSZ and MAXSZ are handed to expand_clr() at the end.
 *
 * G supplies the candidate implementations, tried in this order:
 *   .fniv       - inline with a host vector type, if one is chosen;
 *   .fni8/.fni4 - inline with 64-bit / 32-bit integer elements;
 *   .fno        - out-of-line helper (asserted non-NULL when reached).
 */
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
{
    /* Install this operation's vecop list; restored after the switch. */
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    /* 0 means "no host vector type": use the integer or helper paths. */
    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                     g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        /*
         * Step past the part already expanded; the remainder is handled
         * by the V128 case.  MAXSZ shrinks by the same amount so the
         * tail computation after the switch is unaffected.
         */
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                     g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                     g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
                               maxsz, g->data, g->fno);
            /* The helper was given MAXSZ; skip the tail handling below. */
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    /* Inline expansions only covered OPRSZ bytes; deal with the rest. */
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
1474
/*
 * Expand a vector operation with three vectors and an immediate.
 *
 * DOFS is the destination offset and AOFS/BOFS the source offsets (the
 * expanders use them relative to cpu_env).  OPRSZ is the number of bytes
 * operated on and MAXSZ the full size of the destination; the bytes
 * between OPRSZ and MAXSZ are handed to expand_clr() at the end.  C is
 * the immediate operand.
 *
 * G supplies the candidate implementations, tried in this order:
 *   .fniv       - inline with a host vector type, if one is chosen;
 *   .fni8/.fni4 - inline with 64-bit / 32-bit integer elements;
 *   .fno        - out-of-line helper (asserted non-NULL when reached),
 *                 with C passed as the data argument.
 */
void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                     uint32_t oprsz, uint32_t maxsz, int64_t c,
                     const GVecGen3i *g)
{
    /* Install this operation's vecop list; restored after the switch. */
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    /* 0 means "no host vector type": use the integer or helper paths. */
    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        /*
         * Step past the part already expanded; the remainder is handled
         * by the V128 case.  MAXSZ shrinks by the same amount so the
         * tail computation after the switch is unaffected.
         */
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
            /* The helper was given MAXSZ; skip the tail handling below. */
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    /* Inline expansions only covered OPRSZ bytes; deal with the rest. */
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
1541
/*
 * Expand a vector four-operand operation.
 *
 * DOFS is the destination offset and AOFS/BOFS/COFS the source offsets
 * (the expanders use them relative to cpu_env).  OPRSZ is the number of
 * bytes operated on and MAXSZ the full size of the destination; the
 * bytes between OPRSZ and MAXSZ are handed to expand_clr() at the end.
 *
 * G supplies the candidate implementations, tried in this order:
 *   .fniv       - inline with a host vector type, if one is chosen;
 *   .fni8/.fni4 - inline with 64-bit / 32-bit integer elements;
 *   .fno        - out-of-line helper (asserted non-NULL when reached).
 * G->write_aofs is forwarded to the inline expanders.
 */
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
{
    /* Install this operation's vecop list; restored after the switch. */
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    /* 0 means "no host vector type": use the integer or helper paths. */
    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
                     32, TCG_TYPE_V256, g->write_aofs, g->fniv);
        if (some == oprsz) {
            break;
        }
        /*
         * Step past the part already expanded; the remainder is handled
         * by the V128 case.  MAXSZ shrinks by the same amount so the
         * tail computation after the switch is unaffected.
         */
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     16, TCG_TYPE_V128, g->write_aofs, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     8, TCG_TYPE_V64, g->write_aofs, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                               oprsz, maxsz, g->data, g->fno);
            /* The helper was given MAXSZ; skip the tail handling below. */
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    /* Inline expansions only covered OPRSZ bytes; deal with the rest. */
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
1610
/*
 * Expand a vector operation with four vectors and an immediate.
 *
 * DOFS is the destination offset and AOFS/BOFS/COFS the source offsets
 * (the expanders use them relative to cpu_env).  OPRSZ is the number of
 * bytes operated on and MAXSZ the full size of the destination; the
 * bytes between OPRSZ and MAXSZ are handed to expand_clr() at the end.
 * C is the immediate operand.
 *
 * G supplies the candidate implementations, tried in this order:
 *   .fniv       - inline with a host vector type, if one is chosen;
 *   .fni8/.fni4 - inline with 64-bit / 32-bit integer elements;
 *   .fno        - out-of-line helper (asserted non-NULL when reached),
 *                 with C passed as the data argument.
 */
void tcg_gen_gvec_4i(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                     uint32_t oprsz, uint32_t maxsz, int64_t c,
                     const GVecGen4i *g)
{
    /* Install this operation's vecop list; restored after the switch. */
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    /* 0 means "no host vector type": use the integer or helper paths. */
    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, some,
                      32, TCG_TYPE_V256, c, g->fniv);
        if (some == oprsz) {
            break;
        }
        /*
         * Step past the part already expanded; the remainder is handled
         * by the V128 case.  MAXSZ shrinks by the same amount so the
         * tail computation after the switch is unaffected.
         */
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                       16, TCG_TYPE_V128, c, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                      8, TCG_TYPE_V64, c, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_4i_i64(dofs, aofs, bofs, cofs, oprsz, c, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_4i_i32(dofs, aofs, bofs, cofs, oprsz, c, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                               oprsz, maxsz, c, g->fno);
            /* The helper was given MAXSZ; skip the tail handling below. */
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    /* Inline expansions only covered OPRSZ bytes; deal with the rest. */
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
1679
1680 /*
1681  * Expand specific vector operations.
1682  */
1683
/*
 * Adapter that gives tcg_gen_mov_vec() the (vece, dest, src) signature
 * expected of a two-operand host-vector callback; it is installed as
 * .fniv by tcg_gen_gvec_mov().  VECE is ignored.
 */
static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mov_vec(a, b);
}
1688
1689 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1690                       uint32_t oprsz, uint32_t maxsz)
1691 {
1692     static const GVecGen2 g = {
1693         .fni8 = tcg_gen_mov_i64,
1694         .fniv = vec_mov2,
1695         .fno = gen_helper_gvec_mov,
1696         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1697     };
1698     if (dofs != aofs) {
1699         tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1700     } else {
1701         check_size_align(oprsz, maxsz, dofs);
1702         if (oprsz < maxsz) {
1703             expand_clr(dofs + oprsz, maxsz - oprsz);
1704         }
1705     }
1706 }
1707
/*
 * Expand a duplicate of the 32-bit TCG value IN into the vector at DOFS.
 *
 * OPRSZ and MAXSZ are checked together with DOFS by check_size_align(),
 * and VECE must not exceed MO_32 (debug assertion).  The work is done by
 * do_dup(), which is given IN as its 32-bit input, no 64-bit input and
 * a zero constant argument.
 */
void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i32 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_32);
    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
}
1715
/*
 * Expand a duplicate of the 64-bit TCG value IN into the vector at DOFS.
 *
 * OPRSZ and MAXSZ are checked together with DOFS by check_size_align(),
 * and VECE must not exceed MO_64 (debug assertion).  The work is done by
 * do_dup(), which is given IN as its 64-bit input, no 32-bit input and
 * a zero constant argument.
 */
void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i64 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_64);
    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
}
1723
1724 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1725                           uint32_t oprsz, uint32_t maxsz)
1726 {
1727     check_size_align(oprsz, maxsz, dofs);
1728     if (vece <= MO_64) {
1729         TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
1730         if (type != 0) {
1731             TCGv_vec t_vec = tcg_temp_new_vec(type);
1732             tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
1733             do_dup_store(type, dofs, oprsz, maxsz, t_vec);
1734             tcg_temp_free_vec(t_vec);
1735         } else if (vece <= MO_32) {
1736             TCGv_i32 in = tcg_temp_ebb_new_i32();
1737             switch (vece) {
1738             case MO_8:
1739                 tcg_gen_ld8u_i32(in, cpu_env, aofs);
1740                 break;
1741             case MO_16:
1742                 tcg_gen_ld16u_i32(in, cpu_env, aofs);
1743                 break;
1744             default:
1745                 tcg_gen_ld_i32(in, cpu_env, aofs);
1746                 break;
1747             }
1748             do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1749             tcg_temp_free_i32(in);
1750         } else {
1751             TCGv_i64 in = tcg_temp_ebb_new_i64();
1752             tcg_gen_ld_i64(in, cpu_env, aofs);
1753             do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1754             tcg_temp_free_i64(in);
1755         }
1756     } else if (vece == 4) {
1757         /* 128-bit duplicate.  */
1758         int i;
1759
1760         tcg_debug_assert(oprsz >= 16);
1761         if (TCG_TARGET_HAS_v128) {
1762             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1763
1764             tcg_gen_ld_vec(in, cpu_env, aofs);
1765             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1766                 tcg_gen_st_vec(in, cpu_env, dofs + i);
1767             }
1768             tcg_temp_free_vec(in);
1769         } else {
1770             TCGv_i64 in0 = tcg_temp_ebb_new_i64();
1771             TCGv_i64 in1 = tcg_temp_ebb_new_i64();
1772
1773             tcg_gen_ld_i64(in0, cpu_env, aofs);
1774             tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
1775             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1776                 tcg_gen_st_i64(in0, cpu_env, dofs + i);
1777                 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
1778             }
1779             tcg_temp_free_i64(in0);
1780             tcg_temp_free_i64(in1);
1781         }
1782         if (oprsz < maxsz) {
1783             expand_clr(dofs + oprsz, maxsz - oprsz);
1784         }
1785     } else if (vece == 5) {
1786         /* 256-bit duplicate.  */
1787         int i;
1788
1789         tcg_debug_assert(oprsz >= 32);
1790         tcg_debug_assert(oprsz % 32 == 0);
1791         if (TCG_TARGET_HAS_v256) {
1792             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);
1793
1794             tcg_gen_ld_vec(in, cpu_env, aofs);
1795             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1796                 tcg_gen_st_vec(in, cpu_env, dofs + i);
1797             }
1798             tcg_temp_free_vec(in);
1799         } else if (TCG_TARGET_HAS_v128) {
1800             TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
1801             TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);
1802
1803             tcg_gen_ld_vec(in0, cpu_env, aofs);
1804             tcg_gen_ld_vec(in1, cpu_env, aofs + 16);
1805             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1806                 tcg_gen_st_vec(in0, cpu_env, dofs + i);
1807                 tcg_gen_st_vec(in1, cpu_env, dofs + i + 16);
1808             }
1809             tcg_temp_free_vec(in0);
1810             tcg_temp_free_vec(in1);
1811         } else {
1812             TCGv_i64 in[4];
1813             int j;
1814
1815             for (j = 0; j < 4; ++j) {
1816                 in[j] = tcg_temp_ebb_new_i64();
1817                 tcg_gen_ld_i64(in[j], cpu_env, aofs + j * 8);
1818             }
1819             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1820                 for (j = 0; j < 4; ++j) {
1821                     tcg_gen_st_i64(in[j], cpu_env, dofs + i + j * 8);
1822                 }
1823             }
1824             for (j = 0; j < 4; ++j) {
1825                 tcg_temp_free_i64(in[j]);
1826             }
1827         }
1828         if (oprsz < maxsz) {
1829             expand_clr(dofs + oprsz, maxsz - oprsz);
1830         }
1831     } else {
1832         g_assert_not_reached();
1833     }
1834 }
1835
1836 void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
1837                           uint32_t maxsz, uint64_t x)
1838 {
1839     check_size_align(oprsz, maxsz, dofs);
1840     do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x);
1841 }
1842
1843 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1844                       uint32_t oprsz, uint32_t maxsz)
1845 {
1846     static const GVecGen2 g = {
1847         .fni8 = tcg_gen_not_i64,
1848         .fniv = tcg_gen_not_vec,
1849         .fno = gen_helper_gvec_not,
1850         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1851     };
1852     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1853 }
1854
1855 /* Perform a vector addition using normal addition and a mask.  The mask
1856    should be the sign bit of each lane.  This 6-operation form is more
1857    efficient than separate additions when there are 4 or more lanes in
1858    the 64-bit operation.  */
1859 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1860 {
1861     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
1862     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
1863     TCGv_i64 t3 = tcg_temp_ebb_new_i64();
1864
1865     tcg_gen_andc_i64(t1, a, m);
1866     tcg_gen_andc_i64(t2, b, m);
1867     tcg_gen_xor_i64(t3, a, b);
1868     tcg_gen_add_i64(d, t1, t2);
1869     tcg_gen_and_i64(t3, t3, m);
1870     tcg_gen_xor_i64(d, d, t3);
1871
1872     tcg_temp_free_i64(t1);
1873     tcg_temp_free_i64(t2);
1874     tcg_temp_free_i64(t3);
1875 }
1876
1877 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1878 {
1879     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
1880     gen_addv_mask(d, a, b, m);
1881 }
1882
1883 void tcg_gen_vec_add8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1884 {
1885     TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
1886     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
1887     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
1888     TCGv_i32 t3 = tcg_temp_ebb_new_i32();
1889
1890     tcg_gen_andc_i32(t1, a, m);
1891     tcg_gen_andc_i32(t2, b, m);
1892     tcg_gen_xor_i32(t3, a, b);
1893     tcg_gen_add_i32(d, t1, t2);
1894     tcg_gen_and_i32(t3, t3, m);
1895     tcg_gen_xor_i32(d, d, t3);
1896
1897     tcg_temp_free_i32(t1);
1898     tcg_temp_free_i32(t2);
1899     tcg_temp_free_i32(t3);
1900 }
1901
1902 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1903 {
1904     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
1905     gen_addv_mask(d, a, b, m);
1906 }
1907
1908 void tcg_gen_vec_add16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1909 {
1910     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
1911     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
1912
1913     tcg_gen_andi_i32(t1, a, ~0xffff);
1914     tcg_gen_add_i32(t2, a, b);
1915     tcg_gen_add_i32(t1, t1, b);
1916     tcg_gen_deposit_i32(d, t1, t2, 0, 16);
1917
1918     tcg_temp_free_i32(t1);
1919     tcg_temp_free_i32(t2);
1920 }
1921
1922 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1923 {
1924     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
1925     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
1926
1927     tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1928     tcg_gen_add_i64(t2, a, b);
1929     tcg_gen_add_i64(t1, t1, b);
1930     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1931
1932     tcg_temp_free_i64(t1);
1933     tcg_temp_free_i64(t2);
1934 }
1935
1936 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1937
1938 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1939                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1940 {
1941     static const GVecGen3 g[4] = {
1942         { .fni8 = tcg_gen_vec_add8_i64,
1943           .fniv = tcg_gen_add_vec,
1944           .fno = gen_helper_gvec_add8,
1945           .opt_opc = vecop_list_add,
1946           .vece = MO_8 },
1947         { .fni8 = tcg_gen_vec_add16_i64,
1948           .fniv = tcg_gen_add_vec,
1949           .fno = gen_helper_gvec_add16,
1950           .opt_opc = vecop_list_add,
1951           .vece = MO_16 },
1952         { .fni4 = tcg_gen_add_i32,
1953           .fniv = tcg_gen_add_vec,
1954           .fno = gen_helper_gvec_add32,
1955           .opt_opc = vecop_list_add,
1956           .vece = MO_32 },
1957         { .fni8 = tcg_gen_add_i64,
1958           .fniv = tcg_gen_add_vec,
1959           .fno = gen_helper_gvec_add64,
1960           .opt_opc = vecop_list_add,
1961           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1962           .vece = MO_64 },
1963     };
1964
1965     tcg_debug_assert(vece <= MO_64);
1966     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1967 }
1968
1969 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1970                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1971 {
1972     static const GVecGen2s g[4] = {
1973         { .fni8 = tcg_gen_vec_add8_i64,
1974           .fniv = tcg_gen_add_vec,
1975           .fno = gen_helper_gvec_adds8,
1976           .opt_opc = vecop_list_add,
1977           .vece = MO_8 },
1978         { .fni8 = tcg_gen_vec_add16_i64,
1979           .fniv = tcg_gen_add_vec,
1980           .fno = gen_helper_gvec_adds16,
1981           .opt_opc = vecop_list_add,
1982           .vece = MO_16 },
1983         { .fni4 = tcg_gen_add_i32,
1984           .fniv = tcg_gen_add_vec,
1985           .fno = gen_helper_gvec_adds32,
1986           .opt_opc = vecop_list_add,
1987           .vece = MO_32 },
1988         { .fni8 = tcg_gen_add_i64,
1989           .fniv = tcg_gen_add_vec,
1990           .fno = gen_helper_gvec_adds64,
1991           .opt_opc = vecop_list_add,
1992           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1993           .vece = MO_64 },
1994     };
1995
1996     tcg_debug_assert(vece <= MO_64);
1997     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1998 }
1999
/*
 * Add the immediate C to every element of the vector at AOFS, result
 * to DOFS.  The immediate is wrapped in a constant and forwarded to
 * tcg_gen_gvec_adds.
 */
void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_adds(vece, dofs, aofs, tcg_constant_i64(c), oprsz, maxsz);
}
2006
2007 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
2008
2009 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
2010                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2011 {
2012     static const GVecGen2s g[4] = {
2013         { .fni8 = tcg_gen_vec_sub8_i64,
2014           .fniv = tcg_gen_sub_vec,
2015           .fno = gen_helper_gvec_subs8,
2016           .opt_opc = vecop_list_sub,
2017           .vece = MO_8 },
2018         { .fni8 = tcg_gen_vec_sub16_i64,
2019           .fniv = tcg_gen_sub_vec,
2020           .fno = gen_helper_gvec_subs16,
2021           .opt_opc = vecop_list_sub,
2022           .vece = MO_16 },
2023         { .fni4 = tcg_gen_sub_i32,
2024           .fniv = tcg_gen_sub_vec,
2025           .fno = gen_helper_gvec_subs32,
2026           .opt_opc = vecop_list_sub,
2027           .vece = MO_32 },
2028         { .fni8 = tcg_gen_sub_i64,
2029           .fniv = tcg_gen_sub_vec,
2030           .fno = gen_helper_gvec_subs64,
2031           .opt_opc = vecop_list_sub,
2032           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2033           .vece = MO_64 },
2034     };
2035
2036     tcg_debug_assert(vece <= MO_64);
2037     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2038 }
2039
2040 /* Perform a vector subtraction using normal subtraction and a mask.
2041    Compare gen_addv_mask above.  */
2042 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
2043 {
2044     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
2045     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2046     TCGv_i64 t3 = tcg_temp_ebb_new_i64();
2047
2048     tcg_gen_or_i64(t1, a, m);
2049     tcg_gen_andc_i64(t2, b, m);
2050     tcg_gen_eqv_i64(t3, a, b);
2051     tcg_gen_sub_i64(d, t1, t2);
2052     tcg_gen_and_i64(t3, t3, m);
2053     tcg_gen_xor_i64(d, d, t3);
2054
2055     tcg_temp_free_i64(t1);
2056     tcg_temp_free_i64(t2);
2057     tcg_temp_free_i64(t3);
2058 }
2059
2060 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2061 {
2062     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2063     gen_subv_mask(d, a, b, m);
2064 }
2065
2066 void tcg_gen_vec_sub8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2067 {
2068     TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
2069     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
2070     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
2071     TCGv_i32 t3 = tcg_temp_ebb_new_i32();
2072
2073     tcg_gen_or_i32(t1, a, m);
2074     tcg_gen_andc_i32(t2, b, m);
2075     tcg_gen_eqv_i32(t3, a, b);
2076     tcg_gen_sub_i32(d, t1, t2);
2077     tcg_gen_and_i32(t3, t3, m);
2078     tcg_gen_xor_i32(d, d, t3);
2079
2080     tcg_temp_free_i32(t1);
2081     tcg_temp_free_i32(t2);
2082     tcg_temp_free_i32(t3);
2083 }
2084
2085 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2086 {
2087     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2088     gen_subv_mask(d, a, b, m);
2089 }
2090
2091 void tcg_gen_vec_sub16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2092 {
2093     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
2094     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
2095
2096     tcg_gen_andi_i32(t1, b, ~0xffff);
2097     tcg_gen_sub_i32(t2, a, b);
2098     tcg_gen_sub_i32(t1, a, t1);
2099     tcg_gen_deposit_i32(d, t1, t2, 0, 16);
2100
2101     tcg_temp_free_i32(t1);
2102     tcg_temp_free_i32(t2);
2103 }
2104
2105 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2106 {
2107     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
2108     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2109
2110     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2111     tcg_gen_sub_i64(t2, a, b);
2112     tcg_gen_sub_i64(t1, a, t1);
2113     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2114
2115     tcg_temp_free_i64(t1);
2116     tcg_temp_free_i64(t2);
2117 }
2118
2119 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
2120                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2121 {
2122     static const GVecGen3 g[4] = {
2123         { .fni8 = tcg_gen_vec_sub8_i64,
2124           .fniv = tcg_gen_sub_vec,
2125           .fno = gen_helper_gvec_sub8,
2126           .opt_opc = vecop_list_sub,
2127           .vece = MO_8 },
2128         { .fni8 = tcg_gen_vec_sub16_i64,
2129           .fniv = tcg_gen_sub_vec,
2130           .fno = gen_helper_gvec_sub16,
2131           .opt_opc = vecop_list_sub,
2132           .vece = MO_16 },
2133         { .fni4 = tcg_gen_sub_i32,
2134           .fniv = tcg_gen_sub_vec,
2135           .fno = gen_helper_gvec_sub32,
2136           .opt_opc = vecop_list_sub,
2137           .vece = MO_32 },
2138         { .fni8 = tcg_gen_sub_i64,
2139           .fniv = tcg_gen_sub_vec,
2140           .fno = gen_helper_gvec_sub64,
2141           .opt_opc = vecop_list_sub,
2142           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2143           .vece = MO_64 },
2144     };
2145
2146     tcg_debug_assert(vece <= MO_64);
2147     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2148 }
2149
2150 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
2151
2152 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
2153                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2154 {
2155     static const GVecGen3 g[4] = {
2156         { .fniv = tcg_gen_mul_vec,
2157           .fno = gen_helper_gvec_mul8,
2158           .opt_opc = vecop_list_mul,
2159           .vece = MO_8 },
2160         { .fniv = tcg_gen_mul_vec,
2161           .fno = gen_helper_gvec_mul16,
2162           .opt_opc = vecop_list_mul,
2163           .vece = MO_16 },
2164         { .fni4 = tcg_gen_mul_i32,
2165           .fniv = tcg_gen_mul_vec,
2166           .fno = gen_helper_gvec_mul32,
2167           .opt_opc = vecop_list_mul,
2168           .vece = MO_32 },
2169         { .fni8 = tcg_gen_mul_i64,
2170           .fniv = tcg_gen_mul_vec,
2171           .fno = gen_helper_gvec_mul64,
2172           .opt_opc = vecop_list_mul,
2173           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2174           .vece = MO_64 },
2175     };
2176
2177     tcg_debug_assert(vece <= MO_64);
2178     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2179 }
2180
2181 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
2182                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2183 {
2184     static const GVecGen2s g[4] = {
2185         { .fniv = tcg_gen_mul_vec,
2186           .fno = gen_helper_gvec_muls8,
2187           .opt_opc = vecop_list_mul,
2188           .vece = MO_8 },
2189         { .fniv = tcg_gen_mul_vec,
2190           .fno = gen_helper_gvec_muls16,
2191           .opt_opc = vecop_list_mul,
2192           .vece = MO_16 },
2193         { .fni4 = tcg_gen_mul_i32,
2194           .fniv = tcg_gen_mul_vec,
2195           .fno = gen_helper_gvec_muls32,
2196           .opt_opc = vecop_list_mul,
2197           .vece = MO_32 },
2198         { .fni8 = tcg_gen_mul_i64,
2199           .fniv = tcg_gen_mul_vec,
2200           .fno = gen_helper_gvec_muls64,
2201           .opt_opc = vecop_list_mul,
2202           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2203           .vece = MO_64 },
2204     };
2205
2206     tcg_debug_assert(vece <= MO_64);
2207     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2208 }
2209
/*
 * Multiply every element of the vector at AOFS by the immediate C,
 * result to DOFS.  The immediate is wrapped in a constant and
 * forwarded to tcg_gen_gvec_muls.
 */
void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_muls(vece, dofs, aofs, tcg_constant_i64(c), oprsz, maxsz);
}
2216
2217 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2218                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2219 {
2220     static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
2221     static const GVecGen3 g[4] = {
2222         { .fniv = tcg_gen_ssadd_vec,
2223           .fno = gen_helper_gvec_ssadd8,
2224           .opt_opc = vecop_list,
2225           .vece = MO_8 },
2226         { .fniv = tcg_gen_ssadd_vec,
2227           .fno = gen_helper_gvec_ssadd16,
2228           .opt_opc = vecop_list,
2229           .vece = MO_16 },
2230         { .fniv = tcg_gen_ssadd_vec,
2231           .fno = gen_helper_gvec_ssadd32,
2232           .opt_opc = vecop_list,
2233           .vece = MO_32 },
2234         { .fniv = tcg_gen_ssadd_vec,
2235           .fno = gen_helper_gvec_ssadd64,
2236           .opt_opc = vecop_list,
2237           .vece = MO_64 },
2238     };
2239     tcg_debug_assert(vece <= MO_64);
2240     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2241 }
2242
2243 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
2244                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2245 {
2246     static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
2247     static const GVecGen3 g[4] = {
2248         { .fniv = tcg_gen_sssub_vec,
2249           .fno = gen_helper_gvec_sssub8,
2250           .opt_opc = vecop_list,
2251           .vece = MO_8 },
2252         { .fniv = tcg_gen_sssub_vec,
2253           .fno = gen_helper_gvec_sssub16,
2254           .opt_opc = vecop_list,
2255           .vece = MO_16 },
2256         { .fniv = tcg_gen_sssub_vec,
2257           .fno = gen_helper_gvec_sssub32,
2258           .opt_opc = vecop_list,
2259           .vece = MO_32 },
2260         { .fniv = tcg_gen_sssub_vec,
2261           .fno = gen_helper_gvec_sssub64,
2262           .opt_opc = vecop_list,
2263           .vece = MO_64 },
2264     };
2265     tcg_debug_assert(vece <= MO_64);
2266     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2267 }
2268
2269 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2270 {
2271     TCGv_i32 max = tcg_constant_i32(-1);
2272     tcg_gen_add_i32(d, a, b);
2273     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
2274 }
2275
2276 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2277 {
2278     TCGv_i64 max = tcg_constant_i64(-1);
2279     tcg_gen_add_i64(d, a, b);
2280     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
2281 }
2282
2283 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2284                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2285 {
2286     static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
2287     static const GVecGen3 g[4] = {
2288         { .fniv = tcg_gen_usadd_vec,
2289           .fno = gen_helper_gvec_usadd8,
2290           .opt_opc = vecop_list,
2291           .vece = MO_8 },
2292         { .fniv = tcg_gen_usadd_vec,
2293           .fno = gen_helper_gvec_usadd16,
2294           .opt_opc = vecop_list,
2295           .vece = MO_16 },
2296         { .fni4 = tcg_gen_usadd_i32,
2297           .fniv = tcg_gen_usadd_vec,
2298           .fno = gen_helper_gvec_usadd32,
2299           .opt_opc = vecop_list,
2300           .vece = MO_32 },
2301         { .fni8 = tcg_gen_usadd_i64,
2302           .fniv = tcg_gen_usadd_vec,
2303           .fno = gen_helper_gvec_usadd64,
2304           .opt_opc = vecop_list,
2305           .vece = MO_64 }
2306     };
2307     tcg_debug_assert(vece <= MO_64);
2308     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2309 }
2310
2311 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2312 {
2313     TCGv_i32 min = tcg_constant_i32(0);
2314     tcg_gen_sub_i32(d, a, b);
2315     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
2316 }
2317
2318 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2319 {
2320     TCGv_i64 min = tcg_constant_i64(0);
2321     tcg_gen_sub_i64(d, a, b);
2322     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
2323 }
2324
2325 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
2326                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2327 {
2328     static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
2329     static const GVecGen3 g[4] = {
2330         { .fniv = tcg_gen_ussub_vec,
2331           .fno = gen_helper_gvec_ussub8,
2332           .opt_opc = vecop_list,
2333           .vece = MO_8 },
2334         { .fniv = tcg_gen_ussub_vec,
2335           .fno = gen_helper_gvec_ussub16,
2336           .opt_opc = vecop_list,
2337           .vece = MO_16 },
2338         { .fni4 = tcg_gen_ussub_i32,
2339           .fniv = tcg_gen_ussub_vec,
2340           .fno = gen_helper_gvec_ussub32,
2341           .opt_opc = vecop_list,
2342           .vece = MO_32 },
2343         { .fni8 = tcg_gen_ussub_i64,
2344           .fniv = tcg_gen_ussub_vec,
2345           .fno = gen_helper_gvec_ussub64,
2346           .opt_opc = vecop_list,
2347           .vece = MO_64 }
2348     };
2349     tcg_debug_assert(vece <= MO_64);
2350     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2351 }
2352
2353 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
2354                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2355 {
2356     static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
2357     static const GVecGen3 g[4] = {
2358         { .fniv = tcg_gen_smin_vec,
2359           .fno = gen_helper_gvec_smin8,
2360           .opt_opc = vecop_list,
2361           .vece = MO_8 },
2362         { .fniv = tcg_gen_smin_vec,
2363           .fno = gen_helper_gvec_smin16,
2364           .opt_opc = vecop_list,
2365           .vece = MO_16 },
2366         { .fni4 = tcg_gen_smin_i32,
2367           .fniv = tcg_gen_smin_vec,
2368           .fno = gen_helper_gvec_smin32,
2369           .opt_opc = vecop_list,
2370           .vece = MO_32 },
2371         { .fni8 = tcg_gen_smin_i64,
2372           .fniv = tcg_gen_smin_vec,
2373           .fno = gen_helper_gvec_smin64,
2374           .opt_opc = vecop_list,
2375           .vece = MO_64 }
2376     };
2377     tcg_debug_assert(vece <= MO_64);
2378     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2379 }
2380
2381 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2382                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2383 {
2384     static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2385     static const GVecGen3 g[4] = {
2386         { .fniv = tcg_gen_umin_vec,
2387           .fno = gen_helper_gvec_umin8,
2388           .opt_opc = vecop_list,
2389           .vece = MO_8 },
2390         { .fniv = tcg_gen_umin_vec,
2391           .fno = gen_helper_gvec_umin16,
2392           .opt_opc = vecop_list,
2393           .vece = MO_16 },
2394         { .fni4 = tcg_gen_umin_i32,
2395           .fniv = tcg_gen_umin_vec,
2396           .fno = gen_helper_gvec_umin32,
2397           .opt_opc = vecop_list,
2398           .vece = MO_32 },
2399         { .fni8 = tcg_gen_umin_i64,
2400           .fniv = tcg_gen_umin_vec,
2401           .fno = gen_helper_gvec_umin64,
2402           .opt_opc = vecop_list,
2403           .vece = MO_64 }
2404     };
2405     tcg_debug_assert(vece <= MO_64);
2406     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2407 }
2408
2409 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2410                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2411 {
2412     static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2413     static const GVecGen3 g[4] = {
2414         { .fniv = tcg_gen_smax_vec,
2415           .fno = gen_helper_gvec_smax8,
2416           .opt_opc = vecop_list,
2417           .vece = MO_8 },
2418         { .fniv = tcg_gen_smax_vec,
2419           .fno = gen_helper_gvec_smax16,
2420           .opt_opc = vecop_list,
2421           .vece = MO_16 },
2422         { .fni4 = tcg_gen_smax_i32,
2423           .fniv = tcg_gen_smax_vec,
2424           .fno = gen_helper_gvec_smax32,
2425           .opt_opc = vecop_list,
2426           .vece = MO_32 },
2427         { .fni8 = tcg_gen_smax_i64,
2428           .fniv = tcg_gen_smax_vec,
2429           .fno = gen_helper_gvec_smax64,
2430           .opt_opc = vecop_list,
2431           .vece = MO_64 }
2432     };
2433     tcg_debug_assert(vece <= MO_64);
2434     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2435 }
2436
2437 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2438                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2439 {
2440     static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2441     static const GVecGen3 g[4] = {
2442         { .fniv = tcg_gen_umax_vec,
2443           .fno = gen_helper_gvec_umax8,
2444           .opt_opc = vecop_list,
2445           .vece = MO_8 },
2446         { .fniv = tcg_gen_umax_vec,
2447           .fno = gen_helper_gvec_umax16,
2448           .opt_opc = vecop_list,
2449           .vece = MO_16 },
2450         { .fni4 = tcg_gen_umax_i32,
2451           .fniv = tcg_gen_umax_vec,
2452           .fno = gen_helper_gvec_umax32,
2453           .opt_opc = vecop_list,
2454           .vece = MO_32 },
2455         { .fni8 = tcg_gen_umax_i64,
2456           .fniv = tcg_gen_umax_vec,
2457           .fno = gen_helper_gvec_umax64,
2458           .opt_opc = vecop_list,
2459           .vece = MO_64 }
2460     };
2461     tcg_debug_assert(vece <= MO_64);
2462     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2463 }
2464
2465 /* Perform a vector negation using normal negation and a mask.
2466    Compare gen_subv_mask above.  */
2467 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2468 {
2469     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2470     TCGv_i64 t3 = tcg_temp_ebb_new_i64();
2471
2472     tcg_gen_andc_i64(t3, m, b);
2473     tcg_gen_andc_i64(t2, b, m);
2474     tcg_gen_sub_i64(d, m, t2);
2475     tcg_gen_xor_i64(d, d, t3);
2476
2477     tcg_temp_free_i64(t2);
2478     tcg_temp_free_i64(t3);
2479 }
2480
2481 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2482 {
2483     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2484     gen_negv_mask(d, b, m);
2485 }
2486
2487 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2488 {
2489     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2490     gen_negv_mask(d, b, m);
2491 }
2492
2493 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2494 {
2495     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
2496     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2497
2498     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2499     tcg_gen_neg_i64(t2, b);
2500     tcg_gen_neg_i64(t1, t1);
2501     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2502
2503     tcg_temp_free_i64(t1);
2504     tcg_temp_free_i64(t2);
2505 }
2506
2507 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2508                       uint32_t oprsz, uint32_t maxsz)
2509 {
2510     static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2511     static const GVecGen2 g[4] = {
2512         { .fni8 = tcg_gen_vec_neg8_i64,
2513           .fniv = tcg_gen_neg_vec,
2514           .fno = gen_helper_gvec_neg8,
2515           .opt_opc = vecop_list,
2516           .vece = MO_8 },
2517         { .fni8 = tcg_gen_vec_neg16_i64,
2518           .fniv = tcg_gen_neg_vec,
2519           .fno = gen_helper_gvec_neg16,
2520           .opt_opc = vecop_list,
2521           .vece = MO_16 },
2522         { .fni4 = tcg_gen_neg_i32,
2523           .fniv = tcg_gen_neg_vec,
2524           .fno = gen_helper_gvec_neg32,
2525           .opt_opc = vecop_list,
2526           .vece = MO_32 },
2527         { .fni8 = tcg_gen_neg_i64,
2528           .fniv = tcg_gen_neg_vec,
2529           .fno = gen_helper_gvec_neg64,
2530           .opt_opc = vecop_list,
2531           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2532           .vece = MO_64 },
2533     };
2534
2535     tcg_debug_assert(vece <= MO_64);
2536     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2537 }
2538
2539 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2540 {
2541     TCGv_i64 t = tcg_temp_ebb_new_i64();
2542     int nbit = 8 << vece;
2543
2544     /* Create -1 for each negative element.  */
2545     tcg_gen_shri_i64(t, b, nbit - 1);
2546     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2547     tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2548
2549     /*
2550      * Invert (via xor -1) and add one.
2551      * Because of the ordering the msb is cleared,
2552      * so we never have carry into the next element.
2553      */
2554     tcg_gen_xor_i64(d, b, t);
2555     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2556     tcg_gen_add_i64(d, d, t);
2557
2558     tcg_temp_free_i64(t);
2559 }
2560
2561 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2562 {
2563     gen_absv_mask(d, b, MO_8);
2564 }
2565
2566 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2567 {
2568     gen_absv_mask(d, b, MO_16);
2569 }
2570
2571 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2572                       uint32_t oprsz, uint32_t maxsz)
2573 {
2574     static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2575     static const GVecGen2 g[4] = {
2576         { .fni8 = tcg_gen_vec_abs8_i64,
2577           .fniv = tcg_gen_abs_vec,
2578           .fno = gen_helper_gvec_abs8,
2579           .opt_opc = vecop_list,
2580           .vece = MO_8 },
2581         { .fni8 = tcg_gen_vec_abs16_i64,
2582           .fniv = tcg_gen_abs_vec,
2583           .fno = gen_helper_gvec_abs16,
2584           .opt_opc = vecop_list,
2585           .vece = MO_16 },
2586         { .fni4 = tcg_gen_abs_i32,
2587           .fniv = tcg_gen_abs_vec,
2588           .fno = gen_helper_gvec_abs32,
2589           .opt_opc = vecop_list,
2590           .vece = MO_32 },
2591         { .fni8 = tcg_gen_abs_i64,
2592           .fniv = tcg_gen_abs_vec,
2593           .fno = gen_helper_gvec_abs64,
2594           .opt_opc = vecop_list,
2595           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2596           .vece = MO_64 },
2597     };
2598
2599     tcg_debug_assert(vece <= MO_64);
2600     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2601 }
2602
2603 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2604                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2605 {
2606     static const GVecGen3 g = {
2607         .fni8 = tcg_gen_and_i64,
2608         .fniv = tcg_gen_and_vec,
2609         .fno = gen_helper_gvec_and,
2610         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2611     };
2612
2613     if (aofs == bofs) {
2614         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2615     } else {
2616         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2617     }
2618 }
2619
2620 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2621                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2622 {
2623     static const GVecGen3 g = {
2624         .fni8 = tcg_gen_or_i64,
2625         .fniv = tcg_gen_or_vec,
2626         .fno = gen_helper_gvec_or,
2627         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2628     };
2629
2630     if (aofs == bofs) {
2631         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2632     } else {
2633         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2634     }
2635 }
2636
2637 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2638                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2639 {
2640     static const GVecGen3 g = {
2641         .fni8 = tcg_gen_xor_i64,
2642         .fniv = tcg_gen_xor_vec,
2643         .fno = gen_helper_gvec_xor,
2644         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2645     };
2646
2647     if (aofs == bofs) {
2648         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2649     } else {
2650         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2651     }
2652 }
2653
2654 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2655                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2656 {
2657     static const GVecGen3 g = {
2658         .fni8 = tcg_gen_andc_i64,
2659         .fniv = tcg_gen_andc_vec,
2660         .fno = gen_helper_gvec_andc,
2661         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2662     };
2663
2664     if (aofs == bofs) {
2665         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2666     } else {
2667         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2668     }
2669 }
2670
2671 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2672                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2673 {
2674     static const GVecGen3 g = {
2675         .fni8 = tcg_gen_orc_i64,
2676         .fniv = tcg_gen_orc_vec,
2677         .fno = gen_helper_gvec_orc,
2678         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2679     };
2680
2681     if (aofs == bofs) {
2682         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2683     } else {
2684         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2685     }
2686 }
2687
2688 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2689                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2690 {
2691     static const GVecGen3 g = {
2692         .fni8 = tcg_gen_nand_i64,
2693         .fniv = tcg_gen_nand_vec,
2694         .fno = gen_helper_gvec_nand,
2695         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2696     };
2697
2698     if (aofs == bofs) {
2699         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2700     } else {
2701         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2702     }
2703 }
2704
2705 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2706                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2707 {
2708     static const GVecGen3 g = {
2709         .fni8 = tcg_gen_nor_i64,
2710         .fniv = tcg_gen_nor_vec,
2711         .fno = gen_helper_gvec_nor,
2712         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2713     };
2714
2715     if (aofs == bofs) {
2716         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2717     } else {
2718         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2719     }
2720 }
2721
2722 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2723                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2724 {
2725     static const GVecGen3 g = {
2726         .fni8 = tcg_gen_eqv_i64,
2727         .fniv = tcg_gen_eqv_vec,
2728         .fno = gen_helper_gvec_eqv,
2729         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2730     };
2731
2732     if (aofs == bofs) {
2733         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2734     } else {
2735         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2736     }
2737 }
2738
2739 static const GVecGen2s gop_ands = {
2740     .fni8 = tcg_gen_and_i64,
2741     .fniv = tcg_gen_and_vec,
2742     .fno = gen_helper_gvec_ands,
2743     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2744     .vece = MO_64
2745 };
2746
2747 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2748                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2749 {
2750     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2751     tcg_gen_dup_i64(vece, tmp, c);
2752     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2753     tcg_temp_free_i64(tmp);
2754 }
2755
2756 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2757                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2758 {
2759     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2760     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2761 }
2762
2763 static const GVecGen2s gop_xors = {
2764     .fni8 = tcg_gen_xor_i64,
2765     .fniv = tcg_gen_xor_vec,
2766     .fno = gen_helper_gvec_xors,
2767     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2768     .vece = MO_64
2769 };
2770
2771 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2772                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2773 {
2774     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2775     tcg_gen_dup_i64(vece, tmp, c);
2776     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2777     tcg_temp_free_i64(tmp);
2778 }
2779
2780 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2781                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2782 {
2783     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2784     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2785 }
2786
2787 static const GVecGen2s gop_ors = {
2788     .fni8 = tcg_gen_or_i64,
2789     .fniv = tcg_gen_or_vec,
2790     .fno = gen_helper_gvec_ors,
2791     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2792     .vece = MO_64
2793 };
2794
2795 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2796                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2797 {
2798     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2799     tcg_gen_dup_i64(vece, tmp, c);
2800     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2801     tcg_temp_free_i64(tmp);
2802 }
2803
2804 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2805                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2806 {
2807     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2808     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2809 }
2810
2811 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2812 {
2813     uint64_t mask = dup_const(MO_8, 0xff << c);
2814     tcg_gen_shli_i64(d, a, c);
2815     tcg_gen_andi_i64(d, d, mask);
2816 }
2817
2818 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2819 {
2820     uint64_t mask = dup_const(MO_16, 0xffff << c);
2821     tcg_gen_shli_i64(d, a, c);
2822     tcg_gen_andi_i64(d, d, mask);
2823 }
2824
2825 void tcg_gen_vec_shl8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2826 {
2827     uint32_t mask = dup_const(MO_8, 0xff << c);
2828     tcg_gen_shli_i32(d, a, c);
2829     tcg_gen_andi_i32(d, d, mask);
2830 }
2831
2832 void tcg_gen_vec_shl16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2833 {
2834     uint32_t mask = dup_const(MO_16, 0xffff << c);
2835     tcg_gen_shli_i32(d, a, c);
2836     tcg_gen_andi_i32(d, d, mask);
2837 }
2838
2839 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2840                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2841 {
2842     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2843     static const GVecGen2i g[4] = {
2844         { .fni8 = tcg_gen_vec_shl8i_i64,
2845           .fniv = tcg_gen_shli_vec,
2846           .fno = gen_helper_gvec_shl8i,
2847           .opt_opc = vecop_list,
2848           .vece = MO_8 },
2849         { .fni8 = tcg_gen_vec_shl16i_i64,
2850           .fniv = tcg_gen_shli_vec,
2851           .fno = gen_helper_gvec_shl16i,
2852           .opt_opc = vecop_list,
2853           .vece = MO_16 },
2854         { .fni4 = tcg_gen_shli_i32,
2855           .fniv = tcg_gen_shli_vec,
2856           .fno = gen_helper_gvec_shl32i,
2857           .opt_opc = vecop_list,
2858           .vece = MO_32 },
2859         { .fni8 = tcg_gen_shli_i64,
2860           .fniv = tcg_gen_shli_vec,
2861           .fno = gen_helper_gvec_shl64i,
2862           .opt_opc = vecop_list,
2863           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2864           .vece = MO_64 },
2865     };
2866
2867     tcg_debug_assert(vece <= MO_64);
2868     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2869     if (shift == 0) {
2870         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2871     } else {
2872         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2873     }
2874 }
2875
2876 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2877 {
2878     uint64_t mask = dup_const(MO_8, 0xff >> c);
2879     tcg_gen_shri_i64(d, a, c);
2880     tcg_gen_andi_i64(d, d, mask);
2881 }
2882
2883 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2884 {
2885     uint64_t mask = dup_const(MO_16, 0xffff >> c);
2886     tcg_gen_shri_i64(d, a, c);
2887     tcg_gen_andi_i64(d, d, mask);
2888 }
2889
2890 void tcg_gen_vec_shr8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2891 {
2892     uint32_t mask = dup_const(MO_8, 0xff >> c);
2893     tcg_gen_shri_i32(d, a, c);
2894     tcg_gen_andi_i32(d, d, mask);
2895 }
2896
2897 void tcg_gen_vec_shr16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2898 {
2899     uint32_t mask = dup_const(MO_16, 0xffff >> c);
2900     tcg_gen_shri_i32(d, a, c);
2901     tcg_gen_andi_i32(d, d, mask);
2902 }
2903
2904 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2905                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2906 {
2907     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
2908     static const GVecGen2i g[4] = {
2909         { .fni8 = tcg_gen_vec_shr8i_i64,
2910           .fniv = tcg_gen_shri_vec,
2911           .fno = gen_helper_gvec_shr8i,
2912           .opt_opc = vecop_list,
2913           .vece = MO_8 },
2914         { .fni8 = tcg_gen_vec_shr16i_i64,
2915           .fniv = tcg_gen_shri_vec,
2916           .fno = gen_helper_gvec_shr16i,
2917           .opt_opc = vecop_list,
2918           .vece = MO_16 },
2919         { .fni4 = tcg_gen_shri_i32,
2920           .fniv = tcg_gen_shri_vec,
2921           .fno = gen_helper_gvec_shr32i,
2922           .opt_opc = vecop_list,
2923           .vece = MO_32 },
2924         { .fni8 = tcg_gen_shri_i64,
2925           .fniv = tcg_gen_shri_vec,
2926           .fno = gen_helper_gvec_shr64i,
2927           .opt_opc = vecop_list,
2928           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2929           .vece = MO_64 },
2930     };
2931
2932     tcg_debug_assert(vece <= MO_64);
2933     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2934     if (shift == 0) {
2935         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2936     } else {
2937         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2938     }
2939 }
2940
2941 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2942 {
2943     uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2944     uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2945     TCGv_i64 s = tcg_temp_ebb_new_i64();
2946
2947     tcg_gen_shri_i64(d, a, c);
2948     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2949     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2950     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2951     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2952     tcg_temp_free_i64(s);
2953 }
2954
2955 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2956 {
2957     uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2958     uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2959     TCGv_i64 s = tcg_temp_ebb_new_i64();
2960
2961     tcg_gen_shri_i64(d, a, c);
2962     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2963     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2964     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2965     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2966     tcg_temp_free_i64(s);
2967 }
2968
2969 void tcg_gen_vec_sar8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2970 {
2971     uint32_t s_mask = dup_const(MO_8, 0x80 >> c);
2972     uint32_t c_mask = dup_const(MO_8, 0xff >> c);
2973     TCGv_i32 s = tcg_temp_ebb_new_i32();
2974
2975     tcg_gen_shri_i32(d, a, c);
2976     tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
2977     tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
2978     tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
2979     tcg_gen_or_i32(d, d, s);         /* include sign extension */
2980     tcg_temp_free_i32(s);
2981 }
2982
2983 void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2984 {
2985     uint32_t s_mask = dup_const(MO_16, 0x8000 >> c);
2986     uint32_t c_mask = dup_const(MO_16, 0xffff >> c);
2987     TCGv_i32 s = tcg_temp_ebb_new_i32();
2988
2989     tcg_gen_shri_i32(d, a, c);
2990     tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
2991     tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
2992     tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
2993     tcg_gen_or_i32(d, d, s);         /* include sign extension */
2994     tcg_temp_free_i32(s);
2995 }
2996
2997 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2998                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2999 {
3000     static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
3001     static const GVecGen2i g[4] = {
3002         { .fni8 = tcg_gen_vec_sar8i_i64,
3003           .fniv = tcg_gen_sari_vec,
3004           .fno = gen_helper_gvec_sar8i,
3005           .opt_opc = vecop_list,
3006           .vece = MO_8 },
3007         { .fni8 = tcg_gen_vec_sar16i_i64,
3008           .fniv = tcg_gen_sari_vec,
3009           .fno = gen_helper_gvec_sar16i,
3010           .opt_opc = vecop_list,
3011           .vece = MO_16 },
3012         { .fni4 = tcg_gen_sari_i32,
3013           .fniv = tcg_gen_sari_vec,
3014           .fno = gen_helper_gvec_sar32i,
3015           .opt_opc = vecop_list,
3016           .vece = MO_32 },
3017         { .fni8 = tcg_gen_sari_i64,
3018           .fniv = tcg_gen_sari_vec,
3019           .fno = gen_helper_gvec_sar64i,
3020           .opt_opc = vecop_list,
3021           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3022           .vece = MO_64 },
3023     };
3024
3025     tcg_debug_assert(vece <= MO_64);
3026     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3027     if (shift == 0) {
3028         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
3029     } else {
3030         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
3031     }
3032 }
3033
3034 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
3035 {
3036     uint64_t mask = dup_const(MO_8, 0xff << c);
3037
3038     tcg_gen_shli_i64(d, a, c);
3039     tcg_gen_shri_i64(a, a, 8 - c);
3040     tcg_gen_andi_i64(d, d, mask);
3041     tcg_gen_andi_i64(a, a, ~mask);
3042     tcg_gen_or_i64(d, d, a);
3043 }
3044
3045 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
3046 {
3047     uint64_t mask = dup_const(MO_16, 0xffff << c);
3048
3049     tcg_gen_shli_i64(d, a, c);
3050     tcg_gen_shri_i64(a, a, 16 - c);
3051     tcg_gen_andi_i64(d, d, mask);
3052     tcg_gen_andi_i64(a, a, ~mask);
3053     tcg_gen_or_i64(d, d, a);
3054 }
3055
3056 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
3057                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
3058 {
3059     static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 };
3060     static const GVecGen2i g[4] = {
3061         { .fni8 = tcg_gen_vec_rotl8i_i64,
3062           .fniv = tcg_gen_rotli_vec,
3063           .fno = gen_helper_gvec_rotl8i,
3064           .opt_opc = vecop_list,
3065           .vece = MO_8 },
3066         { .fni8 = tcg_gen_vec_rotl16i_i64,
3067           .fniv = tcg_gen_rotli_vec,
3068           .fno = gen_helper_gvec_rotl16i,
3069           .opt_opc = vecop_list,
3070           .vece = MO_16 },
3071         { .fni4 = tcg_gen_rotli_i32,
3072           .fniv = tcg_gen_rotli_vec,
3073           .fno = gen_helper_gvec_rotl32i,
3074           .opt_opc = vecop_list,
3075           .vece = MO_32 },
3076         { .fni8 = tcg_gen_rotli_i64,
3077           .fniv = tcg_gen_rotli_vec,
3078           .fno = gen_helper_gvec_rotl64i,
3079           .opt_opc = vecop_list,
3080           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3081           .vece = MO_64 },
3082     };
3083
3084     tcg_debug_assert(vece <= MO_64);
3085     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3086     if (shift == 0) {
3087         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
3088     } else {
3089         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
3090     }
3091 }
3092
3093 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
3094                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
3095 {
3096     tcg_debug_assert(vece <= MO_64);
3097     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3098     tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1),
3099                        oprsz, maxsz);
3100 }
3101
/*
 * Specialized generation of vector shifts by a non-constant scalar.
 */
3105
/*
 * Table of expanders for one shift/rotate operation whose count is a
 * single run-time scalar applied to every element.  Consumed by
 * do_gvec_shifts().
 */
typedef struct {
    /* Integer expansion on 32-bit values; used for MO_32 elements.  */
    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
    /* Integer expansion on 64-bit values; used for MO_64 elements.  */
    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
    /* Vector expansion taking the shift count as a 32-bit scalar.  */
    void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
    /* Vector expansion taking the shift count as a vector.  */
    void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
    /* Out-of-line helpers, indexed by element size (vece).  */
    gen_helper_gvec_2 *fno[4];
    /* Opcode list passed to choose_vector_type() for the fniv_s path.  */
    TCGOpcode s_list[2];
    /* Opcode list passed to choose_vector_type() for the fniv_v path.  */
    TCGOpcode v_list[2];
} GVecGen2sh;
3115
3116 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3117                            uint32_t oprsz, uint32_t tysz, TCGType type,
3118                            TCGv_i32 shift,
3119                            void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
3120 {
3121     TCGv_vec t0 = tcg_temp_new_vec(type);
3122     uint32_t i;
3123
3124     for (i = 0; i < oprsz; i += tysz) {
3125         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
3126         fni(vece, t0, t0, shift);
3127         tcg_gen_st_vec(t0, cpu_env, dofs + i);
3128     }
3129     tcg_temp_free_vec(t0);
3130 }
3131
/*
 * Expand a shift/rotate of every element of the vector at AOFS by the
 * same run-time scalar SHIFT, storing to DOFS.
 *
 * The expansion strategies in G are tried in this order:
 *   1. a host vector op taking a scalar count (g->s_list / g->fniv_s);
 *   2. a host vector op taking a vector count, with SHIFT broadcast
 *      into a vector first (g->v_list / g->fniv_v);
 *   3. inline integer expansion for MO_32 / MO_64 elements, when
 *      check_size_impl() accepts OPRSZ (g->fni4 / g->fni8);
 *   4. the out-of-line helper g->fno[vece].
 *
 * For 1-3 the bytes between OPRSZ and MAXSZ are cleared here; the
 * helper path returns without doing so (presumably the helper handles
 * the tail itself from the descriptor -- confirm against the helpers).
 */
static void
do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
               uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    /* If the backend has a scalar expansion, great.  */
    type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
    if (type) {
        /* Swap in a NULL vecop list while expanding; restored below.  */
        const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
        switch (type) {
        case TCG_TYPE_V256:
            /*
             * Handle the largest multiple of 32 bytes with V256, then
             * fall through so V128 finishes any remainder.
             */
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2sh_vec(vece, dofs, aofs, some, 32,
                           TCG_TYPE_V256, shift, g->fniv_s);
            if (some == oprsz) {
                break;
            }
            /*
             * Advance past the part already done.  dofs + oprsz and
             * maxsz - oprsz are unchanged by this, so the tail clear
             * at the end still covers the original range.
             */
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */
        case TCG_TYPE_V128:
            expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
                           TCG_TYPE_V128, shift, g->fniv_s);
            break;
        case TCG_TYPE_V64:
            expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
                           TCG_TYPE_V64, shift, g->fniv_s);
            break;
        default:
            g_assert_not_reached();
        }
        tcg_swap_vecop_list(hold_list);
        goto clear_tail;
    }

    /* If the backend supports variable vector shifts, also cool.  */
    type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
    if (type) {
        /* Swap in a NULL vecop list while expanding; restored below.  */
        const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
        TCGv_vec v_shift = tcg_temp_new_vec(type);

        /* Broadcast the scalar count into every element of v_shift.  */
        if (vece == MO_64) {
            /* 64-bit elements need the 32-bit count zero-extended first.  */
            TCGv_i64 sh64 = tcg_temp_ebb_new_i64();
            tcg_gen_extu_i32_i64(sh64, shift);
            tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
            tcg_temp_free_i64(sh64);
        } else {
            tcg_gen_dup_i32_vec(vece, v_shift, shift);
        }

        switch (type) {
        case TCG_TYPE_V256:
            /* As above: V256 for the bulk, V128 for any remainder.  */
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                          v_shift, false, g->fniv_v);
            if (some == oprsz) {
                break;
            }
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */
        case TCG_TYPE_V128:
            expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                          v_shift, false, g->fniv_v);
            break;
        case TCG_TYPE_V64:
            expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                          v_shift, false, g->fniv_v);
            break;
        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(v_shift);
        tcg_swap_vecop_list(hold_list);
        goto clear_tail;
    }

    /* Otherwise fall back to integral... */
    if (vece == MO_32 && check_size_impl(oprsz, 4)) {
        expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
    } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
        /* The 64-bit expander takes a 64-bit count: zero-extend SHIFT.  */
        TCGv_i64 sh64 = tcg_temp_ebb_new_i64();
        tcg_gen_extu_i32_i64(sh64, shift);
        expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
        tcg_temp_free_i64(sh64);
    } else {
        /* ... or, failing that, call the out-of-line helper.  */
        TCGv_ptr a0 = tcg_temp_ebb_new_ptr();
        TCGv_ptr a1 = tcg_temp_ebb_new_ptr();
        TCGv_i32 desc = tcg_temp_ebb_new_i32();

        /*
         * Build the descriptor at run time: the shift count goes at
         * SIMD_DATA_SHIFT, ORed with the constant simd_desc() for
         * these sizes (built with a data argument of 0).
         */
        tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
        tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
        tcg_gen_addi_ptr(a0, cpu_env, dofs);
        tcg_gen_addi_ptr(a1, cpu_env, aofs);

        g->fno[vece](a0, a1, desc);

        tcg_temp_free_ptr(a0);
        tcg_temp_free_ptr(a1);
        tcg_temp_free_i32(desc);
        /* Skip the explicit tail clear on this path.  */
        return;
    }

 clear_tail:
    /* Zero the bytes of the destination between oprsz and maxsz.  */
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
3249
3250 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
3251                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3252 {
3253     static const GVecGen2sh g = {
3254         .fni4 = tcg_gen_shl_i32,
3255         .fni8 = tcg_gen_shl_i64,
3256         .fniv_s = tcg_gen_shls_vec,
3257         .fniv_v = tcg_gen_shlv_vec,
3258         .fno = {
3259             gen_helper_gvec_shl8i,
3260             gen_helper_gvec_shl16i,
3261             gen_helper_gvec_shl32i,
3262             gen_helper_gvec_shl64i,
3263         },
3264         .s_list = { INDEX_op_shls_vec, 0 },
3265         .v_list = { INDEX_op_shlv_vec, 0 },
3266     };
3267
3268     tcg_debug_assert(vece <= MO_64);
3269     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3270 }
3271
3272 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3273                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3274 {
3275     static const GVecGen2sh g = {
3276         .fni4 = tcg_gen_shr_i32,
3277         .fni8 = tcg_gen_shr_i64,
3278         .fniv_s = tcg_gen_shrs_vec,
3279         .fniv_v = tcg_gen_shrv_vec,
3280         .fno = {
3281             gen_helper_gvec_shr8i,
3282             gen_helper_gvec_shr16i,
3283             gen_helper_gvec_shr32i,
3284             gen_helper_gvec_shr64i,
3285         },
3286         .s_list = { INDEX_op_shrs_vec, 0 },
3287         .v_list = { INDEX_op_shrv_vec, 0 },
3288     };
3289
3290     tcg_debug_assert(vece <= MO_64);
3291     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3292 }
3293
3294 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
3295                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3296 {
3297     static const GVecGen2sh g = {
3298         .fni4 = tcg_gen_sar_i32,
3299         .fni8 = tcg_gen_sar_i64,
3300         .fniv_s = tcg_gen_sars_vec,
3301         .fniv_v = tcg_gen_sarv_vec,
3302         .fno = {
3303             gen_helper_gvec_sar8i,
3304             gen_helper_gvec_sar16i,
3305             gen_helper_gvec_sar32i,
3306             gen_helper_gvec_sar64i,
3307         },
3308         .s_list = { INDEX_op_sars_vec, 0 },
3309         .v_list = { INDEX_op_sarv_vec, 0 },
3310     };
3311
3312     tcg_debug_assert(vece <= MO_64);
3313     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3314 }
3315
3316 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
3317                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3318 {
3319     static const GVecGen2sh g = {
3320         .fni4 = tcg_gen_rotl_i32,
3321         .fni8 = tcg_gen_rotl_i64,
3322         .fniv_s = tcg_gen_rotls_vec,
3323         .fniv_v = tcg_gen_rotlv_vec,
3324         .fno = {
3325             gen_helper_gvec_rotl8i,
3326             gen_helper_gvec_rotl16i,
3327             gen_helper_gvec_rotl32i,
3328             gen_helper_gvec_rotl64i,
3329         },
3330         .s_list = { INDEX_op_rotls_vec, 0 },
3331         .v_list = { INDEX_op_rotlv_vec, 0 },
3332     };
3333
3334     tcg_debug_assert(vece <= MO_64);
3335     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3336 }
3337
/*
 * Expand D = A << (B % element bits)
 *
 * Unlike scalar shifts, where it is easy for the target front end
 * to include the modulo as part of the expansion, the vector
 * expansion applies the modulo here.  If the target naturally
 * includes the modulo as part of the operation, great!  If the
 * target has some other behaviour for out-of-range shifts, then it
 * could not use this function anyway, and would need to do its own
 * expansion with custom functions.
 */
3348 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
3349                                  TCGv_vec a, TCGv_vec b)
3350 {
3351     TCGv_vec t = tcg_temp_new_vec_matching(d);
3352     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3353
3354     tcg_gen_and_vec(vece, t, b, m);
3355     tcg_gen_shlv_vec(vece, d, a, t);
3356     tcg_temp_free_vec(t);
3357 }
3358
3359 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3360 {
3361     TCGv_i32 t = tcg_temp_ebb_new_i32();
3362
3363     tcg_gen_andi_i32(t, b, 31);
3364     tcg_gen_shl_i32(d, a, t);
3365     tcg_temp_free_i32(t);
3366 }
3367
3368 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3369 {
3370     TCGv_i64 t = tcg_temp_ebb_new_i64();
3371
3372     tcg_gen_andi_i64(t, b, 63);
3373     tcg_gen_shl_i64(d, a, t);
3374     tcg_temp_free_i64(t);
3375 }
3376
3377 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3378                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3379 {
3380     static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
3381     static const GVecGen3 g[4] = {
3382         { .fniv = tcg_gen_shlv_mod_vec,
3383           .fno = gen_helper_gvec_shl8v,
3384           .opt_opc = vecop_list,
3385           .vece = MO_8 },
3386         { .fniv = tcg_gen_shlv_mod_vec,
3387           .fno = gen_helper_gvec_shl16v,
3388           .opt_opc = vecop_list,
3389           .vece = MO_16 },
3390         { .fni4 = tcg_gen_shl_mod_i32,
3391           .fniv = tcg_gen_shlv_mod_vec,
3392           .fno = gen_helper_gvec_shl32v,
3393           .opt_opc = vecop_list,
3394           .vece = MO_32 },
3395         { .fni8 = tcg_gen_shl_mod_i64,
3396           .fniv = tcg_gen_shlv_mod_vec,
3397           .fno = gen_helper_gvec_shl64v,
3398           .opt_opc = vecop_list,
3399           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3400           .vece = MO_64 },
3401     };
3402
3403     tcg_debug_assert(vece <= MO_64);
3404     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3405 }
3406
3407 /*
3408  * Similarly for logical right shifts.
3409  */
3410
3411 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
3412                                  TCGv_vec a, TCGv_vec b)
3413 {
3414     TCGv_vec t = tcg_temp_new_vec_matching(d);
3415     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3416
3417     tcg_gen_and_vec(vece, t, b, m);
3418     tcg_gen_shrv_vec(vece, d, a, t);
3419     tcg_temp_free_vec(t);
3420 }
3421
3422 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3423 {
3424     TCGv_i32 t = tcg_temp_ebb_new_i32();
3425
3426     tcg_gen_andi_i32(t, b, 31);
3427     tcg_gen_shr_i32(d, a, t);
3428     tcg_temp_free_i32(t);
3429 }
3430
3431 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3432 {
3433     TCGv_i64 t = tcg_temp_ebb_new_i64();
3434
3435     tcg_gen_andi_i64(t, b, 63);
3436     tcg_gen_shr_i64(d, a, t);
3437     tcg_temp_free_i64(t);
3438 }
3439
3440 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3441                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3442 {
3443     static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
3444     static const GVecGen3 g[4] = {
3445         { .fniv = tcg_gen_shrv_mod_vec,
3446           .fno = gen_helper_gvec_shr8v,
3447           .opt_opc = vecop_list,
3448           .vece = MO_8 },
3449         { .fniv = tcg_gen_shrv_mod_vec,
3450           .fno = gen_helper_gvec_shr16v,
3451           .opt_opc = vecop_list,
3452           .vece = MO_16 },
3453         { .fni4 = tcg_gen_shr_mod_i32,
3454           .fniv = tcg_gen_shrv_mod_vec,
3455           .fno = gen_helper_gvec_shr32v,
3456           .opt_opc = vecop_list,
3457           .vece = MO_32 },
3458         { .fni8 = tcg_gen_shr_mod_i64,
3459           .fniv = tcg_gen_shrv_mod_vec,
3460           .fno = gen_helper_gvec_shr64v,
3461           .opt_opc = vecop_list,
3462           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3463           .vece = MO_64 },
3464     };
3465
3466     tcg_debug_assert(vece <= MO_64);
3467     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3468 }
3469
3470 /*
3471  * Similarly for arithmetic right shifts.
3472  */
3473
3474 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
3475                                  TCGv_vec a, TCGv_vec b)
3476 {
3477     TCGv_vec t = tcg_temp_new_vec_matching(d);
3478     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3479
3480     tcg_gen_and_vec(vece, t, b, m);
3481     tcg_gen_sarv_vec(vece, d, a, t);
3482     tcg_temp_free_vec(t);
3483 }
3484
3485 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3486 {
3487     TCGv_i32 t = tcg_temp_ebb_new_i32();
3488
3489     tcg_gen_andi_i32(t, b, 31);
3490     tcg_gen_sar_i32(d, a, t);
3491     tcg_temp_free_i32(t);
3492 }
3493
3494 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3495 {
3496     TCGv_i64 t = tcg_temp_ebb_new_i64();
3497
3498     tcg_gen_andi_i64(t, b, 63);
3499     tcg_gen_sar_i64(d, a, t);
3500     tcg_temp_free_i64(t);
3501 }
3502
3503 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3504                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3505 {
3506     static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3507     static const GVecGen3 g[4] = {
3508         { .fniv = tcg_gen_sarv_mod_vec,
3509           .fno = gen_helper_gvec_sar8v,
3510           .opt_opc = vecop_list,
3511           .vece = MO_8 },
3512         { .fniv = tcg_gen_sarv_mod_vec,
3513           .fno = gen_helper_gvec_sar16v,
3514           .opt_opc = vecop_list,
3515           .vece = MO_16 },
3516         { .fni4 = tcg_gen_sar_mod_i32,
3517           .fniv = tcg_gen_sarv_mod_vec,
3518           .fno = gen_helper_gvec_sar32v,
3519           .opt_opc = vecop_list,
3520           .vece = MO_32 },
3521         { .fni8 = tcg_gen_sar_mod_i64,
3522           .fniv = tcg_gen_sarv_mod_vec,
3523           .fno = gen_helper_gvec_sar64v,
3524           .opt_opc = vecop_list,
3525           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3526           .vece = MO_64 },
3527     };
3528
3529     tcg_debug_assert(vece <= MO_64);
3530     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3531 }
3532
3533 /*
3534  * Similarly for rotates.
3535  */
3536
3537 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d,
3538                                   TCGv_vec a, TCGv_vec b)
3539 {
3540     TCGv_vec t = tcg_temp_new_vec_matching(d);
3541     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3542
3543     tcg_gen_and_vec(vece, t, b, m);
3544     tcg_gen_rotlv_vec(vece, d, a, t);
3545     tcg_temp_free_vec(t);
3546 }
3547
3548 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3549 {
3550     TCGv_i32 t = tcg_temp_ebb_new_i32();
3551
3552     tcg_gen_andi_i32(t, b, 31);
3553     tcg_gen_rotl_i32(d, a, t);
3554     tcg_temp_free_i32(t);
3555 }
3556
3557 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3558 {
3559     TCGv_i64 t = tcg_temp_ebb_new_i64();
3560
3561     tcg_gen_andi_i64(t, b, 63);
3562     tcg_gen_rotl_i64(d, a, t);
3563     tcg_temp_free_i64(t);
3564 }
3565
3566 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3567                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3568 {
3569     static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 };
3570     static const GVecGen3 g[4] = {
3571         { .fniv = tcg_gen_rotlv_mod_vec,
3572           .fno = gen_helper_gvec_rotl8v,
3573           .opt_opc = vecop_list,
3574           .vece = MO_8 },
3575         { .fniv = tcg_gen_rotlv_mod_vec,
3576           .fno = gen_helper_gvec_rotl16v,
3577           .opt_opc = vecop_list,
3578           .vece = MO_16 },
3579         { .fni4 = tcg_gen_rotl_mod_i32,
3580           .fniv = tcg_gen_rotlv_mod_vec,
3581           .fno = gen_helper_gvec_rotl32v,
3582           .opt_opc = vecop_list,
3583           .vece = MO_32 },
3584         { .fni8 = tcg_gen_rotl_mod_i64,
3585           .fniv = tcg_gen_rotlv_mod_vec,
3586           .fno = gen_helper_gvec_rotl64v,
3587           .opt_opc = vecop_list,
3588           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3589           .vece = MO_64 },
3590     };
3591
3592     tcg_debug_assert(vece <= MO_64);
3593     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3594 }
3595
3596 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d,
3597                                   TCGv_vec a, TCGv_vec b)
3598 {
3599     TCGv_vec t = tcg_temp_new_vec_matching(d);
3600     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3601
3602     tcg_gen_and_vec(vece, t, b, m);
3603     tcg_gen_rotrv_vec(vece, d, a, t);
3604     tcg_temp_free_vec(t);
3605 }
3606
3607 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3608 {
3609     TCGv_i32 t = tcg_temp_ebb_new_i32();
3610
3611     tcg_gen_andi_i32(t, b, 31);
3612     tcg_gen_rotr_i32(d, a, t);
3613     tcg_temp_free_i32(t);
3614 }
3615
3616 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3617 {
3618     TCGv_i64 t = tcg_temp_ebb_new_i64();
3619
3620     tcg_gen_andi_i64(t, b, 63);
3621     tcg_gen_rotr_i64(d, a, t);
3622     tcg_temp_free_i64(t);
3623 }
3624
3625 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3626                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3627 {
3628     static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 };
3629     static const GVecGen3 g[4] = {
3630         { .fniv = tcg_gen_rotrv_mod_vec,
3631           .fno = gen_helper_gvec_rotr8v,
3632           .opt_opc = vecop_list,
3633           .vece = MO_8 },
3634         { .fniv = tcg_gen_rotrv_mod_vec,
3635           .fno = gen_helper_gvec_rotr16v,
3636           .opt_opc = vecop_list,
3637           .vece = MO_16 },
3638         { .fni4 = tcg_gen_rotr_mod_i32,
3639           .fniv = tcg_gen_rotrv_mod_vec,
3640           .fno = gen_helper_gvec_rotr32v,
3641           .opt_opc = vecop_list,
3642           .vece = MO_32 },
3643         { .fni8 = tcg_gen_rotr_mod_i64,
3644           .fniv = tcg_gen_rotrv_mod_vec,
3645           .fno = gen_helper_gvec_rotr64v,
3646           .opt_opc = vecop_list,
3647           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3648           .vece = MO_64 },
3649     };
3650
3651     tcg_debug_assert(vece <= MO_64);
3652     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3653 }
3654
3655 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
3656 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3657                            uint32_t oprsz, TCGCond cond)
3658 {
3659     TCGv_i32 t0 = tcg_temp_ebb_new_i32();
3660     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
3661     uint32_t i;
3662
3663     for (i = 0; i < oprsz; i += 4) {
3664         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
3665         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
3666         tcg_gen_setcond_i32(cond, t0, t0, t1);
3667         tcg_gen_neg_i32(t0, t0);
3668         tcg_gen_st_i32(t0, cpu_env, dofs + i);
3669     }
3670     tcg_temp_free_i32(t1);
3671     tcg_temp_free_i32(t0);
3672 }
3673
3674 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3675                            uint32_t oprsz, TCGCond cond)
3676 {
3677     TCGv_i64 t0 = tcg_temp_ebb_new_i64();
3678     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
3679     uint32_t i;
3680
3681     for (i = 0; i < oprsz; i += 8) {
3682         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
3683         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
3684         tcg_gen_setcond_i64(cond, t0, t0, t1);
3685         tcg_gen_neg_i64(t0, t0);
3686         tcg_gen_st_i64(t0, cpu_env, dofs + i);
3687     }
3688     tcg_temp_free_i64(t1);
3689     tcg_temp_free_i64(t0);
3690 }
3691
3692 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3693                            uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3694                            TCGType type, TCGCond cond)
3695 {
3696     TCGv_vec t0 = tcg_temp_new_vec(type);
3697     TCGv_vec t1 = tcg_temp_new_vec(type);
3698     uint32_t i;
3699
3700     for (i = 0; i < oprsz; i += tysz) {
3701         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
3702         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
3703         tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
3704         tcg_gen_st_vec(t0, cpu_env, dofs + i);
3705     }
3706     tcg_temp_free_vec(t1);
3707     tcg_temp_free_vec(t0);
3708 }
3709
3710 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
3711                       uint32_t aofs, uint32_t bofs,
3712                       uint32_t oprsz, uint32_t maxsz)
3713 {
3714     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3715     static gen_helper_gvec_3 * const eq_fn[4] = {
3716         gen_helper_gvec_eq8, gen_helper_gvec_eq16,
3717         gen_helper_gvec_eq32, gen_helper_gvec_eq64
3718     };
3719     static gen_helper_gvec_3 * const ne_fn[4] = {
3720         gen_helper_gvec_ne8, gen_helper_gvec_ne16,
3721         gen_helper_gvec_ne32, gen_helper_gvec_ne64
3722     };
3723     static gen_helper_gvec_3 * const lt_fn[4] = {
3724         gen_helper_gvec_lt8, gen_helper_gvec_lt16,
3725         gen_helper_gvec_lt32, gen_helper_gvec_lt64
3726     };
3727     static gen_helper_gvec_3 * const le_fn[4] = {
3728         gen_helper_gvec_le8, gen_helper_gvec_le16,
3729         gen_helper_gvec_le32, gen_helper_gvec_le64
3730     };
3731     static gen_helper_gvec_3 * const ltu_fn[4] = {
3732         gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
3733         gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
3734     };
3735     static gen_helper_gvec_3 * const leu_fn[4] = {
3736         gen_helper_gvec_leu8, gen_helper_gvec_leu16,
3737         gen_helper_gvec_leu32, gen_helper_gvec_leu64
3738     };
3739     static gen_helper_gvec_3 * const * const fns[16] = {
3740         [TCG_COND_EQ] = eq_fn,
3741         [TCG_COND_NE] = ne_fn,
3742         [TCG_COND_LT] = lt_fn,
3743         [TCG_COND_LE] = le_fn,
3744         [TCG_COND_LTU] = ltu_fn,
3745         [TCG_COND_LEU] = leu_fn,
3746     };
3747
3748     const TCGOpcode *hold_list;
3749     TCGType type;
3750     uint32_t some;
3751
3752     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
3753     check_overlap_3(dofs, aofs, bofs, maxsz);
3754
3755     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3756         do_dup(MO_8, dofs, oprsz, maxsz,
3757                NULL, NULL, -(cond == TCG_COND_ALWAYS));
3758         return;
3759     }
3760
3761     /*
3762      * Implement inline with a vector type, if possible.
3763      * Prefer integer when 64-bit host and 64-bit comparison.
3764      */
3765     hold_list = tcg_swap_vecop_list(cmp_list);
3766     type = choose_vector_type(cmp_list, vece, oprsz,
3767                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3768     switch (type) {
3769     case TCG_TYPE_V256:
3770         /* Recall that ARM SVE allows vector sizes that are not a
3771          * power of 2, but always a multiple of 16.  The intent is
3772          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
3773          */
3774         some = QEMU_ALIGN_DOWN(oprsz, 32);
3775         expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
3776         if (some == oprsz) {
3777             break;
3778         }
3779         dofs += some;
3780         aofs += some;
3781         bofs += some;
3782         oprsz -= some;
3783         maxsz -= some;
3784         /* fallthru */
3785     case TCG_TYPE_V128:
3786         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
3787         break;
3788     case TCG_TYPE_V64:
3789         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
3790         break;
3791
3792     case 0:
3793         if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3794             expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
3795         } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3796             expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
3797         } else {
3798             gen_helper_gvec_3 * const *fn = fns[cond];
3799
3800             if (fn == NULL) {
3801                 uint32_t tmp;
3802                 tmp = aofs, aofs = bofs, bofs = tmp;
3803                 cond = tcg_swap_cond(cond);
3804                 fn = fns[cond];
3805                 assert(fn != NULL);
3806             }
3807             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
3808             oprsz = maxsz;
3809         }
3810         break;
3811
3812     default:
3813         g_assert_not_reached();
3814     }
3815     tcg_swap_vecop_list(hold_list);
3816
3817     if (oprsz < maxsz) {
3818         expand_clr(dofs + oprsz, maxsz - oprsz);
3819     }
3820 }
3821
3822 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
3823 {
3824     TCGv_i64 t = tcg_temp_ebb_new_i64();
3825
3826     tcg_gen_and_i64(t, b, a);
3827     tcg_gen_andc_i64(d, c, a);
3828     tcg_gen_or_i64(d, d, t);
3829     tcg_temp_free_i64(t);
3830 }
3831
3832 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
3833                          uint32_t bofs, uint32_t cofs,
3834                          uint32_t oprsz, uint32_t maxsz)
3835 {
3836     static const GVecGen4 g = {
3837         .fni8 = tcg_gen_bitsel_i64,
3838         .fniv = tcg_gen_bitsel_vec,
3839         .fno = gen_helper_gvec_bitsel,
3840     };
3841
3842     tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
3843 }