tcg/tcg-op-gvec.c

   1 /*
   2  * Generic vector operation expansion
   3  *
   4  * Copyright (c) 2018 Linaro
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19
  20 #include "qemu/osdep.h"
  21 #include "tcg/tcg.h"
  22 #include "tcg/tcg-op.h"
  23 #include "tcg/tcg-op-gvec.h"
  24 #include "qemu/main-loop.h"
  25 #include "tcg/tcg-gvec-desc.h"
  26
  27 #define MAX_UNROLL  4
  28
  29 #ifdef CONFIG_DEBUG_TCG
  30 static const TCGOpcode vecop_list_empty[1] = { 0 };
  31 #else
  32 #define vecop_list_empty NULL
  33 #endif
  34
  35
  36 /* Verify vector size and alignment rules.  OFS should be the OR of all
  37    of the operand offsets so that we can check them all at once.  */
  38 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
  39 {
  40     uint32_t max_align;
  41
  42     switch (oprsz) {
  43     case 8:
  44     case 16:
  45     case 32:
  46         tcg_debug_assert(oprsz <= maxsz);
  47         break;
  48     default:
  49         tcg_debug_assert(oprsz == maxsz);
  50         break;
  51     }
  52     tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));
  53
  54     max_align = maxsz >= 16 ? 15 : 7;
  55     tcg_debug_assert((maxsz & max_align) == 0);
  56     tcg_debug_assert((ofs & max_align) == 0);
  57 }
  58
  59 /* Verify vector overlap rules for two operands.  */
  60 static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
  61 {
  62     tcg_debug_assert(d == a || d + s <= a || a + s <= d);
  63 }
  64
  65 /* Verify vector overlap rules for three operands.  */
  66 static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
  67 {
  68     check_overlap_2(d, a, s);
  69     check_overlap_2(d, b, s);
  70     check_overlap_2(a, b, s);
  71 }
  72
  73 /* Verify vector overlap rules for four operands.  */
  74 static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
  75                             uint32_t c, uint32_t s)
  76 {
  77     check_overlap_2(d, a, s);
  78     check_overlap_2(d, b, s);
  79     check_overlap_2(d, c, s);
  80     check_overlap_2(a, b, s);
  81     check_overlap_2(a, c, s);
  82     check_overlap_2(b, c, s);
  83 }
  84
  85 /* Create a descriptor from components.  */
  86 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
  87 {
  88     uint32_t desc = 0;
  89
  90     check_size_align(oprsz, maxsz, 0);
  91     tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS));
  92
  93     oprsz = (oprsz / 8) - 1;
  94     maxsz = (maxsz / 8) - 1;
  95
  96     /*
  97      * We have just asserted in check_size_align that either
  98      * oprsz is {8,16,32} or matches maxsz.  Encode the final
  99      * case with '2', as that would otherwise map to 24.
 100      */
 101     if (oprsz == maxsz) {
 102         oprsz = 2;
 103     }
 104
 105     desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
 106     desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
 107     desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
 108
 109     return desc;
 110 }
 111
 112 /* Generate a call to a gvec-style helper with two vector operands.  */
 113 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
 114                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 115                         gen_helper_gvec_2 *fn)
 116 {
 117     TCGv_ptr a0, a1;
 118     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 119
 120     a0 = tcg_temp_new_ptr();
 121     a1 = tcg_temp_new_ptr();
 122
 123     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 124     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 125
 126     fn(a0, a1, desc);
 127
 128     tcg_temp_free_ptr(a0);
 129     tcg_temp_free_ptr(a1);
 130     tcg_temp_free_i32(desc);
 131 }
 132
 133 /* Generate a call to a gvec-style helper with two vector operands
 134    and one scalar operand.  */
 135 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
 136                          uint32_t oprsz, uint32_t maxsz, int32_t data,
 137                          gen_helper_gvec_2i *fn)
 138 {
 139     TCGv_ptr a0, a1;
 140     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 141
 142     a0 = tcg_temp_new_ptr();
 143     a1 = tcg_temp_new_ptr();
 144
 145     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 146     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 147
 148     fn(a0, a1, c, desc);
 149
 150     tcg_temp_free_ptr(a0);
 151     tcg_temp_free_ptr(a1);
 152     tcg_temp_free_i32(desc);
 153 }
 154
 155 /* Generate a call to a gvec-style helper with three vector operands.  */
 156 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 157                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 158                         gen_helper_gvec_3 *fn)
 159 {
 160     TCGv_ptr a0, a1, a2;
 161     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 162
 163     a0 = tcg_temp_new_ptr();
 164     a1 = tcg_temp_new_ptr();
 165     a2 = tcg_temp_new_ptr();
 166
 167     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 168     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 169     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 170
 171     fn(a0, a1, a2, desc);
 172
 173     tcg_temp_free_ptr(a0);
 174     tcg_temp_free_ptr(a1);
 175     tcg_temp_free_ptr(a2);
 176     tcg_temp_free_i32(desc);
 177 }
 178
 179 /* Generate a call to a gvec-style helper with four vector operands.  */
 180 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 181                         uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
 182                         int32_t data, gen_helper_gvec_4 *fn)
 183 {
 184     TCGv_ptr a0, a1, a2, a3;
 185     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 186
 187     a0 = tcg_temp_new_ptr();
 188     a1 = tcg_temp_new_ptr();
 189     a2 = tcg_temp_new_ptr();
 190     a3 = tcg_temp_new_ptr();
 191
 192     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 193     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 194     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 195     tcg_gen_addi_ptr(a3, cpu_env, cofs);
 196
 197     fn(a0, a1, a2, a3, desc);
 198
 199     tcg_temp_free_ptr(a0);
 200     tcg_temp_free_ptr(a1);
 201     tcg_temp_free_ptr(a2);
 202     tcg_temp_free_ptr(a3);
 203     tcg_temp_free_i32(desc);
 204 }
 205
 206 /* Generate a call to a gvec-style helper with five vector operands.  */
 207 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 208                         uint32_t cofs, uint32_t xofs, uint32_t oprsz,
 209                         uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
 210 {
 211     TCGv_ptr a0, a1, a2, a3, a4;
 212     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 213
 214     a0 = tcg_temp_new_ptr();
 215     a1 = tcg_temp_new_ptr();
 216     a2 = tcg_temp_new_ptr();
 217     a3 = tcg_temp_new_ptr();
 218     a4 = tcg_temp_new_ptr();
 219
 220     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 221     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 222     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 223     tcg_gen_addi_ptr(a3, cpu_env, cofs);
 224     tcg_gen_addi_ptr(a4, cpu_env, xofs);
 225
 226     fn(a0, a1, a2, a3, a4, desc);
 227
 228     tcg_temp_free_ptr(a0);
 229     tcg_temp_free_ptr(a1);
 230     tcg_temp_free_ptr(a2);
 231     tcg_temp_free_ptr(a3);
 232     tcg_temp_free_ptr(a4);
 233     tcg_temp_free_i32(desc);
 234 }
 235
 236 /* Generate a call to a gvec-style helper with three vector operands
 237    and an extra pointer operand.  */
 238 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
 239                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
 240                         int32_t data, gen_helper_gvec_2_ptr *fn)
 241 {
 242     TCGv_ptr a0, a1;
 243     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 244
 245     a0 = tcg_temp_new_ptr();
 246     a1 = tcg_temp_new_ptr();
 247
 248     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 249     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 250
 251     fn(a0, a1, ptr, desc);
 252
 253     tcg_temp_free_ptr(a0);
 254     tcg_temp_free_ptr(a1);
 255     tcg_temp_free_i32(desc);
 256 }
 257
 258 /* Generate a call to a gvec-style helper with three vector operands
 259    and an extra pointer operand.  */
 260 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 261                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
 262                         int32_t data, gen_helper_gvec_3_ptr *fn)
 263 {
 264     TCGv_ptr a0, a1, a2;
 265     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 266
 267     a0 = tcg_temp_new_ptr();
 268     a1 = tcg_temp_new_ptr();
 269     a2 = tcg_temp_new_ptr();
 270
 271     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 272     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 273     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 274
 275     fn(a0, a1, a2, ptr, desc);
 276
 277     tcg_temp_free_ptr(a0);
 278     tcg_temp_free_ptr(a1);
 279     tcg_temp_free_ptr(a2);
 280     tcg_temp_free_i32(desc);
 281 }
 282
 283 /* Generate a call to a gvec-style helper with four vector operands
 284    and an extra pointer operand.  */
 285 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 286                         uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
 287                         uint32_t maxsz, int32_t data,
 288                         gen_helper_gvec_4_ptr *fn)
 289 {
 290     TCGv_ptr a0, a1, a2, a3;
 291     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 292
 293     a0 = tcg_temp_new_ptr();
 294     a1 = tcg_temp_new_ptr();
 295     a2 = tcg_temp_new_ptr();
 296     a3 = tcg_temp_new_ptr();
 297
 298     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 299     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 300     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 301     tcg_gen_addi_ptr(a3, cpu_env, cofs);
 302
 303     fn(a0, a1, a2, a3, ptr, desc);
 304
 305     tcg_temp_free_ptr(a0);
 306     tcg_temp_free_ptr(a1);
 307     tcg_temp_free_ptr(a2);
 308     tcg_temp_free_ptr(a3);
 309     tcg_temp_free_i32(desc);
 310 }
 311
 312 /* Generate a call to a gvec-style helper with five vector operands
 313    and an extra pointer operand.  */
 314 void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 315                         uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
 316                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 317                         gen_helper_gvec_5_ptr *fn)
 318 {
 319     TCGv_ptr a0, a1, a2, a3, a4;
 320     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 321
 322     a0 = tcg_temp_new_ptr();
 323     a1 = tcg_temp_new_ptr();
 324     a2 = tcg_temp_new_ptr();
 325     a3 = tcg_temp_new_ptr();
 326     a4 = tcg_temp_new_ptr();
 327
 328     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 329     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 330     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 331     tcg_gen_addi_ptr(a3, cpu_env, cofs);
 332     tcg_gen_addi_ptr(a4, cpu_env, eofs);
 333
 334     fn(a0, a1, a2, a3, a4, ptr, desc);
 335
 336     tcg_temp_free_ptr(a0);
 337     tcg_temp_free_ptr(a1);
 338     tcg_temp_free_ptr(a2);
 339     tcg_temp_free_ptr(a3);
 340     tcg_temp_free_ptr(a4);
 341     tcg_temp_free_i32(desc);
 342 }
 343
 344 /* Return true if we want to implement something of OPRSZ bytes
 345    in units of LNSZ.  This limits the expansion of inline code.  */
 346 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
 347 {
 348     uint32_t q, r;
 349
 350     if (oprsz < lnsz) {
 351         return false;
 352     }
 353
 354     q = oprsz / lnsz;
 355     r = oprsz % lnsz;
 356     tcg_debug_assert((r & 7) == 0);
 357
 358     if (lnsz < 16) {
 359         /* For sizes below 16, accept no remainder. */
 360         if (r != 0) {
 361             return false;
 362         }
 363     } else {
 364         /*
 365          * Recall that ARM SVE allows vector sizes that are not a
 366          * power of 2, but always a multiple of 16.  The intent is
 367          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 368          * In addition, expand_clr needs to handle a multiple of 8.
 369          * Thus we can handle the tail with one more operation per
 370          * diminishing power of 2.
 371          */
 372         q += ctpop32(r);
 373     }
 374
 375     return q <= MAX_UNROLL;
 376 }
 377
 378 static void expand_clr(uint32_t dofs, uint32_t maxsz);
 379
 380 /* Duplicate C as per VECE.  */
 381 uint64_t (dup_const)(unsigned vece, uint64_t c)
 382 {
 383     switch (vece) {
 384     case MO_8:
 385         return 0x0101010101010101ull * (uint8_t)c;
 386     case MO_16:
 387         return 0x0001000100010001ull * (uint16_t)c;
 388     case MO_32:
 389         return 0x0000000100000001ull * (uint32_t)c;
 390     case MO_64:
 391         return c;
 392     default:
 393         g_assert_not_reached();
 394     }
 395 }
 396
 397 /* Duplicate IN into OUT as per VECE.  */
 398 static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
 399 {
 400     switch (vece) {
 401     case MO_8:
 402         tcg_gen_ext8u_i32(out, in);
 403         tcg_gen_muli_i32(out, out, 0x01010101);
 404         break;
 405     case MO_16:
 406         tcg_gen_deposit_i32(out, in, in, 16, 16);
 407         break;
 408     case MO_32:
 409         tcg_gen_mov_i32(out, in);
 410         break;
 411     default:
 412         g_assert_not_reached();
 413     }
 414 }
 415
 416 static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
 417 {
 418     switch (vece) {
 419     case MO_8:
 420         tcg_gen_ext8u_i64(out, in);
 421         tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
 422         break;
 423     case MO_16:
 424         tcg_gen_ext16u_i64(out, in);
 425         tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
 426         break;
 427     case MO_32:
 428         tcg_gen_deposit_i64(out, in, in, 32, 32);
 429         break;
 430     case MO_64:
 431         tcg_gen_mov_i64(out, in);
 432         break;
 433     default:
 434         g_assert_not_reached();
 435     }
 436 }
 437
 438 /* Select a supported vector type for implementing an operation on SIZE
 439  * bytes.  If OP is 0, assume that the real operation to be performed is
 440  * required by all backends.  Otherwise, make sure than OP can be performed
 441  * on elements of size VECE in the selected type.  Do not select V64 if
 442  * PREFER_I64 is true.  Return 0 if no vector type is selected.
 443  */
 444 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
 445                                   uint32_t size, bool prefer_i64)
 446 {
 447     /*
 448      * Recall that ARM SVE allows vector sizes that are not a
 449      * power of 2, but always a multiple of 16.  The intent is
 450      * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 451      * It is hard to imagine a case in which v256 is supported
 452      * but v128 is not, but check anyway.
 453      * In addition, expand_clr needs to handle a multiple of 8.
 454      */
 455     if (TCG_TARGET_HAS_v256 &&
 456         check_size_impl(size, 32) &&
 457         tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
 458         (!(size & 16) ||
 459          (TCG_TARGET_HAS_v128 &&
 460           tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
 461         (!(size & 8) ||
 462          (TCG_TARGET_HAS_v64 &&
 463           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
 464         return TCG_TYPE_V256;
 465     }
 466     if (TCG_TARGET_HAS_v128 &&
 467         check_size_impl(size, 16) &&
 468         tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
 469         (!(size & 8) ||
 470          (TCG_TARGET_HAS_v64 &&
 471           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
 472         return TCG_TYPE_V128;
 473     }
 474     if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
 475         && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
 476         return TCG_TYPE_V64;
 477     }
 478     return 0;
 479 }
 480
 481 static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
 482                          uint32_t maxsz, TCGv_vec t_vec)
 483 {
 484     uint32_t i = 0;
 485
 486     tcg_debug_assert(oprsz >= 8);
 487
 488     /*
 489      * This may be expand_clr for the tail of an operation, e.g.
 490      * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
 491      * are misaligned wrt the maximum vector size, so do that first.
 492      */
 493     if (dofs & 8) {
 494         tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
 495         i += 8;
 496     }
 497
 498     switch (type) {
 499     case TCG_TYPE_V256:
 500         /*
 501          * Recall that ARM SVE allows vector sizes that are not a
 502          * power of 2, but always a multiple of 16.  The intent is
 503          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 504          */
 505         for (; i + 32 <= oprsz; i += 32) {
 506             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
 507         }
 508         /* fallthru */
 509     case TCG_TYPE_V128:
 510         for (; i + 16 <= oprsz; i += 16) {
 511             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
 512         }
 513         break;
 514     case TCG_TYPE_V64:
 515         for (; i < oprsz; i += 8) {
 516             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
 517         }
 518         break;
 519     default:
 520         g_assert_not_reached();
 521     }
 522
 523     if (oprsz < maxsz) {
 524         expand_clr(dofs + oprsz, maxsz - oprsz);
 525     }
 526 }
 527
 528 /* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 529  * Only one of IN_32 or IN_64 may be set;
 530  * IN_C is used if IN_32 and IN_64 are unset.
 531  */
 532 static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
 533                    uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
 534                    uint64_t in_c)
 535 {
 536     TCGType type;
 537     TCGv_i64 t_64;
 538     TCGv_i32 t_32, t_desc;
 539     TCGv_ptr t_ptr;
 540     uint32_t i;
 541
 542     assert(vece <= (in_32 ? MO_32 : MO_64));
 543     assert(in_32 == NULL || in_64 == NULL);
 544
 545     /* If we're storing 0, expand oprsz to maxsz.  */
 546     if (in_32 == NULL && in_64 == NULL) {
 547         in_c = dup_const(vece, in_c);
 548         if (in_c == 0) {
 549             oprsz = maxsz;
 550             vece = MO_8;
 551         } else if (in_c == dup_const(MO_8, in_c)) {
 552             vece = MO_8;
 553         }
 554     }
 555
 556     /* Implement inline with a vector type, if possible.
 557      * Prefer integer when 64-bit host and no variable dup.
 558      */
 559     type = choose_vector_type(NULL, vece, oprsz,
 560                               (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
 561                                && (in_64 == NULL || vece == MO_64)));
 562     if (type != 0) {
 563         TCGv_vec t_vec = tcg_temp_new_vec(type);
 564
 565         if (in_32) {
 566             tcg_gen_dup_i32_vec(vece, t_vec, in_32);
 567         } else if (in_64) {
 568             tcg_gen_dup_i64_vec(vece, t_vec, in_64);
 569         } else {
 570             tcg_gen_dupi_vec(vece, t_vec, in_c);
 571         }
 572         do_dup_store(type, dofs, oprsz, maxsz, t_vec);
 573         tcg_temp_free_vec(t_vec);
 574         return;
 575     }
 576
 577     /* Otherwise, inline with an integer type, unless "large".  */
 578     if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
 579         t_64 = NULL;
 580         t_32 = NULL;
 581
 582         if (in_32) {
 583             /* We are given a 32-bit variable input.  For a 64-bit host,
 584                use a 64-bit operation unless the 32-bit operation would
 585                be simple enough.  */
 586             if (TCG_TARGET_REG_BITS == 64
 587                 && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
 588                 t_64 = tcg_temp_new_i64();
 589                 tcg_gen_extu_i32_i64(t_64, in_32);
 590                 gen_dup_i64(vece, t_64, t_64);
 591             } else {
 592                 t_32 = tcg_temp_new_i32();
 593                 gen_dup_i32(vece, t_32, in_32);
 594             }
 595         } else if (in_64) {
 596             /* We are given a 64-bit variable input.  */
 597             t_64 = tcg_temp_new_i64();
 598             gen_dup_i64(vece, t_64, in_64);
 599         } else {
 600             /* We are given a constant input.  */
 601             /* For 64-bit hosts, use 64-bit constants for "simple" constants
 602                or when we'd need too many 32-bit stores, or when a 64-bit
 603                constant is really required.  */
 604             if (vece == MO_64
 605                 || (TCG_TARGET_REG_BITS == 64
 606                     && (in_c == 0 || in_c == -1
 607                         || !check_size_impl(oprsz, 4)))) {
 608                 t_64 = tcg_const_i64(in_c);
 609             } else {
 610                 t_32 = tcg_const_i32(in_c);
 611             }
 612         }
 613
 614         /* Implement inline if we picked an implementation size above.  */
 615         if (t_32) {
 616             for (i = 0; i < oprsz; i += 4) {
 617                 tcg_gen_st_i32(t_32, cpu_env, dofs + i);
 618             }
 619             tcg_temp_free_i32(t_32);
 620             goto done;
 621         }
 622         if (t_64) {
 623             for (i = 0; i < oprsz; i += 8) {
 624                 tcg_gen_st_i64(t_64, cpu_env, dofs + i);
 625             }
 626             tcg_temp_free_i64(t_64);
 627             goto done;
 628         }
 629     }
 630
 631     /* Otherwise implement out of line.  */
 632     t_ptr = tcg_temp_new_ptr();
 633     tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
 634
 635     /*
 636      * This may be expand_clr for the tail of an operation, e.g.
 637      * oprsz == 8 && maxsz == 64.  The size of the clear is misaligned
 638      * wrt simd_desc and will assert.  Simply pass all replicated byte
 639      * stores through to memset.
 640      */
 641     if (oprsz == maxsz && vece == MO_8) {
 642         TCGv_ptr t_size = tcg_const_ptr(oprsz);
 643         TCGv_i32 t_val;
 644
 645         if (in_32) {
 646             t_val = in_32;
 647         } else if (in_64) {
 648             t_val = tcg_temp_new_i32();
 649             tcg_gen_extrl_i64_i32(t_val, in_64);
 650         } else {
 651             t_val = tcg_const_i32(in_c);
 652         }
 653         gen_helper_memset(t_ptr, t_ptr, t_val, t_size);
 654
 655         if (!in_32) {
 656             tcg_temp_free_i32(t_val);
 657         }
 658         tcg_temp_free_ptr(t_size);
 659         tcg_temp_free_ptr(t_ptr);
 660         return;
 661     }
 662
 663     t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));
 664
 665     if (vece == MO_64) {
 666         if (in_64) {
 667             gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
 668         } else {
 669             t_64 = tcg_const_i64(in_c);
 670             gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
 671             tcg_temp_free_i64(t_64);
 672         }
 673     } else {
 674         typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
 675         static dup_fn * const fns[3] = {
 676             gen_helper_gvec_dup8,
 677             gen_helper_gvec_dup16,
 678             gen_helper_gvec_dup32
 679         };
 680
 681         if (in_32) {
 682             fns[vece](t_ptr, t_desc, in_32);
 683         } else {
 684             t_32 = tcg_temp_new_i32();
 685             if (in_64) {
 686                 tcg_gen_extrl_i64_i32(t_32, in_64);
 687             } else if (vece == MO_8) {
 688                 tcg_gen_movi_i32(t_32, in_c & 0xff);
 689             } else if (vece == MO_16) {
 690                 tcg_gen_movi_i32(t_32, in_c & 0xffff);
 691             } else {
 692                 tcg_gen_movi_i32(t_32, in_c);
 693             }
 694             fns[vece](t_ptr, t_desc, t_32);
 695             tcg_temp_free_i32(t_32);
 696         }
 697     }
 698
 699     tcg_temp_free_ptr(t_ptr);
 700     tcg_temp_free_i32(t_desc);
 701     return;
 702
 703  done:
 704     if (oprsz < maxsz) {
 705         expand_clr(dofs + oprsz, maxsz - oprsz);
 706     }
 707 }
 708
 709 /* Likewise, but with zero.  */
 710 static void expand_clr(uint32_t dofs, uint32_t maxsz)
 711 {
 712     do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
 713 }
 714
 715 /* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
 716 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 717                          bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
 718 {
 719     TCGv_i32 t0 = tcg_temp_new_i32();
 720     TCGv_i32 t1 = tcg_temp_new_i32();
 721     uint32_t i;
 722
 723     for (i = 0; i < oprsz; i += 4) {
 724         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 725         if (load_dest) {
 726             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
 727         }
 728         fni(t1, t0);
 729         tcg_gen_st_i32(t1, cpu_env, dofs + i);
 730     }
 731     tcg_temp_free_i32(t0);
 732     tcg_temp_free_i32(t1);
 733 }
 734
 735 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 736                           int32_t c, bool load_dest,
 737                           void (*fni)(TCGv_i32, TCGv_i32, int32_t))
 738 {
 739     TCGv_i32 t0 = tcg_temp_new_i32();
 740     TCGv_i32 t1 = tcg_temp_new_i32();
 741     uint32_t i;
 742
 743     for (i = 0; i < oprsz; i += 4) {
 744         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 745         if (load_dest) {
 746             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
 747         }
 748         fni(t1, t0, c);
 749         tcg_gen_st_i32(t1, cpu_env, dofs + i);
 750     }
 751     tcg_temp_free_i32(t0);
 752     tcg_temp_free_i32(t1);
 753 }
 754
 755 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 756                           TCGv_i32 c, bool scalar_first,
 757                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
 758 {
 759     TCGv_i32 t0 = tcg_temp_new_i32();
 760     TCGv_i32 t1 = tcg_temp_new_i32();
 761     uint32_t i;
 762
 763     for (i = 0; i < oprsz; i += 4) {
 764         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 765         if (scalar_first) {
 766             fni(t1, c, t0);
 767         } else {
 768             fni(t1, t0, c);
 769         }
 770         tcg_gen_st_i32(t1, cpu_env, dofs + i);
 771     }
 772     tcg_temp_free_i32(t0);
 773     tcg_temp_free_i32(t1);
 774 }
 775
 776 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
 777 static void expand_3_i32(uint32_t dofs, uint32_t aofs,
 778                          uint32_t bofs, uint32_t oprsz, bool load_dest,
 779                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
 780 {
 781     TCGv_i32 t0 = tcg_temp_new_i32();
 782     TCGv_i32 t1 = tcg_temp_new_i32();
 783     TCGv_i32 t2 = tcg_temp_new_i32();
 784     uint32_t i;
 785
 786     for (i = 0; i < oprsz; i += 4) {
 787         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 788         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
 789         if (load_dest) {
 790             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
 791         }
 792         fni(t2, t0, t1);
 793         tcg_gen_st_i32(t2, cpu_env, dofs + i);
 794     }
 795     tcg_temp_free_i32(t2);
 796     tcg_temp_free_i32(t1);
 797     tcg_temp_free_i32(t0);
 798 }
 799
 800 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 801                           uint32_t oprsz, int32_t c, bool load_dest,
 802                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
 803 {
 804     TCGv_i32 t0 = tcg_temp_new_i32();
 805     TCGv_i32 t1 = tcg_temp_new_i32();
 806     TCGv_i32 t2 = tcg_temp_new_i32();
 807     uint32_t i;
 808
 809     for (i = 0; i < oprsz; i += 4) {
 810         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 811         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
 812         if (load_dest) {
 813             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
 814         }
 815         fni(t2, t0, t1, c);
 816         tcg_gen_st_i32(t2, cpu_env, dofs + i);
 817     }
 818     tcg_temp_free_i32(t0);
 819     tcg_temp_free_i32(t1);
 820     tcg_temp_free_i32(t2);
 821 }
 822
 823 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
 824 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 825                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
 826                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
 827 {
 828     TCGv_i32 t0 = tcg_temp_new_i32();
 829     TCGv_i32 t1 = tcg_temp_new_i32();
 830     TCGv_i32 t2 = tcg_temp_new_i32();
 831     TCGv_i32 t3 = tcg_temp_new_i32();
 832     uint32_t i;
 833
 834     for (i = 0; i < oprsz; i += 4) {
 835         tcg_gen_ld_i32(t1, cpu_env, aofs + i);
 836         tcg_gen_ld_i32(t2, cpu_env, bofs + i);
 837         tcg_gen_ld_i32(t3, cpu_env, cofs + i);
 838         fni(t0, t1, t2, t3);
 839         tcg_gen_st_i32(t0, cpu_env, dofs + i);
 840         if (write_aofs) {
 841             tcg_gen_st_i32(t1, cpu_env, aofs + i);
 842         }
 843     }
 844     tcg_temp_free_i32(t3);
 845     tcg_temp_free_i32(t2);
 846     tcg_temp_free_i32(t1);
 847     tcg_temp_free_i32(t0);
 848 }
 849
 850 /* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
 851 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 852                          bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
 853 {
 854     TCGv_i64 t0 = tcg_temp_new_i64();
 855     TCGv_i64 t1 = tcg_temp_new_i64();
 856     uint32_t i;
 857
 858     for (i = 0; i < oprsz; i += 8) {
 859         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 860         if (load_dest) {
 861             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
 862         }
 863         fni(t1, t0);
 864         tcg_gen_st_i64(t1, cpu_env, dofs + i);
 865     }
 866     tcg_temp_free_i64(t0);
 867     tcg_temp_free_i64(t1);
 868 }
 869
 870 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 871                           int64_t c, bool load_dest,
 872                           void (*fni)(TCGv_i64, TCGv_i64, int64_t))
 873 {
 874     TCGv_i64 t0 = tcg_temp_new_i64();
 875     TCGv_i64 t1 = tcg_temp_new_i64();
 876     uint32_t i;
 877
 878     for (i = 0; i < oprsz; i += 8) {
 879         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 880         if (load_dest) {
 881             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
 882         }
 883         fni(t1, t0, c);
 884         tcg_gen_st_i64(t1, cpu_env, dofs + i);
 885     }
 886     tcg_temp_free_i64(t0);
 887     tcg_temp_free_i64(t1);
 888 }
 889
 890 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 891                           TCGv_i64 c, bool scalar_first,
 892                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
 893 {
 894     TCGv_i64 t0 = tcg_temp_new_i64();
 895     TCGv_i64 t1 = tcg_temp_new_i64();
 896     uint32_t i;
 897
 898     for (i = 0; i < oprsz; i += 8) {
 899         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 900         if (scalar_first) {
 901             fni(t1, c, t0);
 902         } else {
 903             fni(t1, t0, c);
 904         }
 905         tcg_gen_st_i64(t1, cpu_env, dofs + i);
 906     }
 907     tcg_temp_free_i64(t0);
 908     tcg_temp_free_i64(t1);
 909 }
 910
 911 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
 912 static void expand_3_i64(uint32_t dofs, uint32_t aofs,
 913                          uint32_t bofs, uint32_t oprsz, bool load_dest,
 914                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
 915 {
 916     TCGv_i64 t0 = tcg_temp_new_i64();
 917     TCGv_i64 t1 = tcg_temp_new_i64();
 918     TCGv_i64 t2 = tcg_temp_new_i64();
 919     uint32_t i;
 920
 921     for (i = 0; i < oprsz; i += 8) {
 922         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 923         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
 924         if (load_dest) {
 925             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
 926         }
 927         fni(t2, t0, t1);
 928         tcg_gen_st_i64(t2, cpu_env, dofs + i);
 929     }
 930     tcg_temp_free_i64(t2);
 931     tcg_temp_free_i64(t1);
 932     tcg_temp_free_i64(t0);
 933 }
 934
 935 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 936                           uint32_t oprsz, int64_t c, bool load_dest,
 937                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
 938 {
 939     TCGv_i64 t0 = tcg_temp_new_i64();
 940     TCGv_i64 t1 = tcg_temp_new_i64();
 941     TCGv_i64 t2 = tcg_temp_new_i64();
 942     uint32_t i;
 943
 944     for (i = 0; i < oprsz; i += 8) {
 945         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 946         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
 947         if (load_dest) {
 948             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
 949         }
 950         fni(t2, t0, t1, c);
 951         tcg_gen_st_i64(t2, cpu_env, dofs + i);
 952     }
 953     tcg_temp_free_i64(t0);
 954     tcg_temp_free_i64(t1);
 955     tcg_temp_free_i64(t2);
 956 }
 957
 958 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
 959 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 960                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
 961                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
 962 {
 963     TCGv_i64 t0 = tcg_temp_new_i64();
 964     TCGv_i64 t1 = tcg_temp_new_i64();
 965     TCGv_i64 t2 = tcg_temp_new_i64();
 966     TCGv_i64 t3 = tcg_temp_new_i64();
 967     uint32_t i;
 968
 969     for (i = 0; i < oprsz; i += 8) {
 970         tcg_gen_ld_i64(t1, cpu_env, aofs + i);
 971         tcg_gen_ld_i64(t2, cpu_env, bofs + i);
 972         tcg_gen_ld_i64(t3, cpu_env, cofs + i);
 973         fni(t0, t1, t2, t3);
 974         tcg_gen_st_i64(t0, cpu_env, dofs + i);
 975         if (write_aofs) {
 976             tcg_gen_st_i64(t1, cpu_env, aofs + i);
 977         }
 978     }
 979     tcg_temp_free_i64(t3);
 980     tcg_temp_free_i64(t2);
 981     tcg_temp_free_i64(t1);
 982     tcg_temp_free_i64(t0);
 983 }
 984
 985 /* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
 986 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
 987                          uint32_t oprsz, uint32_t tysz, TCGType type,
 988                          bool load_dest,
 989                          void (*fni)(unsigned, TCGv_vec, TCGv_vec))
 990 {
 991     TCGv_vec t0 = tcg_temp_new_vec(type);
 992     TCGv_vec t1 = tcg_temp_new_vec(type);
 993     uint32_t i;
 994
 995     for (i = 0; i < oprsz; i += tysz) {
 996         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
 997         if (load_dest) {
 998             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
 999         }
1000         fni(vece, t1, t0);
1001         tcg_gen_st_vec(t1, cpu_env, dofs + i);
1002     }
1003     tcg_temp_free_vec(t0);
1004     tcg_temp_free_vec(t1);
1005 }
1006
1007 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand
1008    using host vectors.  */
1009 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1010                           uint32_t oprsz, uint32_t tysz, TCGType type,
1011                           int64_t c, bool load_dest,
1012                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
1013 {
1014     TCGv_vec t0 = tcg_temp_new_vec(type);
1015     TCGv_vec t1 = tcg_temp_new_vec(type);
1016     uint32_t i;
1017
1018     for (i = 0; i < oprsz; i += tysz) {
1019         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1020         if (load_dest) {
1021             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
1022         }
1023         fni(vece, t1, t0, c);
1024         tcg_gen_st_vec(t1, cpu_env, dofs + i);
1025     }
1026     tcg_temp_free_vec(t0);
1027     tcg_temp_free_vec(t1);
1028 }
1029
1030 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1031                           uint32_t oprsz, uint32_t tysz, TCGType type,
1032                           TCGv_vec c, bool scalar_first,
1033                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1034 {
1035     TCGv_vec t0 = tcg_temp_new_vec(type);
1036     TCGv_vec t1 = tcg_temp_new_vec(type);
1037     uint32_t i;
1038
1039     for (i = 0; i < oprsz; i += tysz) {
1040         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1041         if (scalar_first) {
1042             fni(vece, t1, c, t0);
1043         } else {
1044             fni(vece, t1, t0, c);
1045         }
1046         tcg_gen_st_vec(t1, cpu_env, dofs + i);
1047     }
1048     tcg_temp_free_vec(t0);
1049     tcg_temp_free_vec(t1);
1050 }
1051
1052 /* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
1053 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1054                          uint32_t bofs, uint32_t oprsz,
1055                          uint32_t tysz, TCGType type, bool load_dest,
1056                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1057 {
1058     TCGv_vec t0 = tcg_temp_new_vec(type);
1059     TCGv_vec t1 = tcg_temp_new_vec(type);
1060     TCGv_vec t2 = tcg_temp_new_vec(type);
1061     uint32_t i;
1062
1063     for (i = 0; i < oprsz; i += tysz) {
1064         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1065         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1066         if (load_dest) {
1067             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1068         }
1069         fni(vece, t2, t0, t1);
1070         tcg_gen_st_vec(t2, cpu_env, dofs + i);
1071     }
1072     tcg_temp_free_vec(t2);
1073     tcg_temp_free_vec(t1);
1074     tcg_temp_free_vec(t0);
1075 }
1076
1077 /*
1078  * Expand OPSZ bytes worth of three-vector operands and an immediate operand
1079  * using host vectors.
1080  */
1081 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1082                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
1083                           TCGType type, int64_t c, bool load_dest,
1084                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
1085                                       int64_t))
1086 {
1087     TCGv_vec t0 = tcg_temp_new_vec(type);
1088     TCGv_vec t1 = tcg_temp_new_vec(type);
1089     TCGv_vec t2 = tcg_temp_new_vec(type);
1090     uint32_t i;
1091
1092     for (i = 0; i < oprsz; i += tysz) {
1093         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1094         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1095         if (load_dest) {
1096             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1097         }
1098         fni(vece, t2, t0, t1, c);
1099         tcg_gen_st_vec(t2, cpu_env, dofs + i);
1100     }
1101     tcg_temp_free_vec(t0);
1102     tcg_temp_free_vec(t1);
1103     tcg_temp_free_vec(t2);
1104 }
1105
1106 /* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
1107 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1108                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1109                          uint32_t tysz, TCGType type, bool write_aofs,
1110                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1111                                      TCGv_vec, TCGv_vec))
1112 {
1113     TCGv_vec t0 = tcg_temp_new_vec(type);
1114     TCGv_vec t1 = tcg_temp_new_vec(type);
1115     TCGv_vec t2 = tcg_temp_new_vec(type);
1116     TCGv_vec t3 = tcg_temp_new_vec(type);
1117     uint32_t i;
1118
1119     for (i = 0; i < oprsz; i += tysz) {
1120         tcg_gen_ld_vec(t1, cpu_env, aofs + i);
1121         tcg_gen_ld_vec(t2, cpu_env, bofs + i);
1122         tcg_gen_ld_vec(t3, cpu_env, cofs + i);
1123         fni(vece, t0, t1, t2, t3);
1124         tcg_gen_st_vec(t0, cpu_env, dofs + i);
1125         if (write_aofs) {
1126             tcg_gen_st_vec(t1, cpu_env, aofs + i);
1127         }
1128     }
1129     tcg_temp_free_vec(t3);
1130     tcg_temp_free_vec(t2);
1131     tcg_temp_free_vec(t1);
1132     tcg_temp_free_vec(t0);
1133 }
1134
1135 /* Expand a vector two-operand operation.  */
1136 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
1137                     uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
1138 {
1139     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1140     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1141     TCGType type;
1142     uint32_t some;
1143
1144     check_size_align(oprsz, maxsz, dofs | aofs);
1145     check_overlap_2(dofs, aofs, maxsz);
1146
1147     type = 0;
1148     if (g->fniv) {
1149         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1150     }
1151     switch (type) {
1152     case TCG_TYPE_V256:
1153         /* Recall that ARM SVE allows vector sizes that are not a
1154          * power of 2, but always a multiple of 16.  The intent is
1155          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1156          */
1157         some = QEMU_ALIGN_DOWN(oprsz, 32);
1158         expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1159                      g->load_dest, g->fniv);
1160         if (some == oprsz) {
1161             break;
1162         }
1163         dofs += some;
1164         aofs += some;
1165         oprsz -= some;
1166         maxsz -= some;
1167         /* fallthru */
1168     case TCG_TYPE_V128:
1169         expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1170                      g->load_dest, g->fniv);
1171         break;
1172     case TCG_TYPE_V64:
1173         expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1174                      g->load_dest, g->fniv);
1175         break;
1176
1177     case 0:
1178         if (g->fni8 && check_size_impl(oprsz, 8)) {
1179             expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
1180         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1181             expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4);
1182         } else {
1183             assert(g->fno != NULL);
1184             tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
1185             oprsz = maxsz;
1186         }
1187         break;
1188
1189     default:
1190         g_assert_not_reached();
1191     }
1192     tcg_swap_vecop_list(hold_list);
1193
1194     if (oprsz < maxsz) {
1195         expand_clr(dofs + oprsz, maxsz - oprsz);
1196     }
1197 }
1198
1199 /* Expand a vector operation with two vectors and an immediate.  */
1200 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1201                      uint32_t maxsz, int64_t c, const GVecGen2i *g)
1202 {
1203     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1204     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1205     TCGType type;
1206     uint32_t some;
1207
1208     check_size_align(oprsz, maxsz, dofs | aofs);
1209     check_overlap_2(dofs, aofs, maxsz);
1210
1211     type = 0;
1212     if (g->fniv) {
1213         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1214     }
1215     switch (type) {
1216     case TCG_TYPE_V256:
1217         /* Recall that ARM SVE allows vector sizes that are not a
1218          * power of 2, but always a multiple of 16.  The intent is
1219          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1220          */
1221         some = QEMU_ALIGN_DOWN(oprsz, 32);
1222         expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1223                       c, g->load_dest, g->fniv);
1224         if (some == oprsz) {
1225             break;
1226         }
1227         dofs += some;
1228         aofs += some;
1229         oprsz -= some;
1230         maxsz -= some;
1231         /* fallthru */
1232     case TCG_TYPE_V128:
1233         expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1234                       c, g->load_dest, g->fniv);
1235         break;
1236     case TCG_TYPE_V64:
1237         expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1238                       c, g->load_dest, g->fniv);
1239         break;
1240
1241     case 0:
1242         if (g->fni8 && check_size_impl(oprsz, 8)) {
1243             expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
1244         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1245             expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
1246         } else {
1247             if (g->fno) {
1248                 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
1249             } else {
1250                 TCGv_i64 tcg_c = tcg_const_i64(c);
1251                 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
1252                                     maxsz, c, g->fnoi);
1253                 tcg_temp_free_i64(tcg_c);
1254             }
1255             oprsz = maxsz;
1256         }
1257         break;
1258
1259     default:
1260         g_assert_not_reached();
1261     }
1262     tcg_swap_vecop_list(hold_list);
1263
1264     if (oprsz < maxsz) {
1265         expand_clr(dofs + oprsz, maxsz - oprsz);
1266     }
1267 }
1268
1269 /* Expand a vector operation with two vectors and a scalar.  */
1270 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1271                      uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
1272 {
1273     TCGType type;
1274
1275     check_size_align(oprsz, maxsz, dofs | aofs);
1276     check_overlap_2(dofs, aofs, maxsz);
1277
1278     type = 0;
1279     if (g->fniv) {
1280         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1281     }
1282     if (type != 0) {
1283         const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1284         const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1285         TCGv_vec t_vec = tcg_temp_new_vec(type);
1286         uint32_t some;
1287
1288         tcg_gen_dup_i64_vec(g->vece, t_vec, c);
1289
1290         switch (type) {
1291         case TCG_TYPE_V256:
1292             /* Recall that ARM SVE allows vector sizes that are not a
1293              * power of 2, but always a multiple of 16.  The intent is
1294              * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1295              */
1296             some = QEMU_ALIGN_DOWN(oprsz, 32);
1297             expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1298                           t_vec, g->scalar_first, g->fniv);
1299             if (some == oprsz) {
1300                 break;
1301             }
1302             dofs += some;
1303             aofs += some;
1304             oprsz -= some;
1305             maxsz -= some;
1306             /* fallthru */
1307
1308         case TCG_TYPE_V128:
1309             expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1310                           t_vec, g->scalar_first, g->fniv);
1311             break;
1312
1313         case TCG_TYPE_V64:
1314             expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1315                           t_vec, g->scalar_first, g->fniv);
1316             break;
1317
1318         default:
1319             g_assert_not_reached();
1320         }
1321         tcg_temp_free_vec(t_vec);
1322         tcg_swap_vecop_list(hold_list);
1323     } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1324         TCGv_i64 t64 = tcg_temp_new_i64();
1325
1326         gen_dup_i64(g->vece, t64, c);
1327         expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
1328         tcg_temp_free_i64(t64);
1329     } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1330         TCGv_i32 t32 = tcg_temp_new_i32();
1331
1332         tcg_gen_extrl_i64_i32(t32, c);
1333         gen_dup_i32(g->vece, t32, t32);
1334         expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
1335         tcg_temp_free_i32(t32);
1336     } else {
1337         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
1338         return;
1339     }
1340
1341     if (oprsz < maxsz) {
1342         expand_clr(dofs + oprsz, maxsz - oprsz);
1343     }
1344 }
1345
1346 /* Expand a vector three-operand operation.  */
1347 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1348                     uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
1349 {
1350     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1351     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1352     TCGType type;
1353     uint32_t some;
1354
1355     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1356     check_overlap_3(dofs, aofs, bofs, maxsz);
1357
1358     type = 0;
1359     if (g->fniv) {
1360         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1361     }
1362     switch (type) {
1363     case TCG_TYPE_V256:
1364         /* Recall that ARM SVE allows vector sizes that are not a
1365          * power of 2, but always a multiple of 16.  The intent is
1366          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1367          */
1368         some = QEMU_ALIGN_DOWN(oprsz, 32);
1369         expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1370                      g->load_dest, g->fniv);
1371         if (some == oprsz) {
1372             break;
1373         }
1374         dofs += some;
1375         aofs += some;
1376         bofs += some;
1377         oprsz -= some;
1378         maxsz -= some;
1379         /* fallthru */
1380     case TCG_TYPE_V128:
1381         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1382                      g->load_dest, g->fniv);
1383         break;
1384     case TCG_TYPE_V64:
1385         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1386                      g->load_dest, g->fniv);
1387         break;
1388
1389     case 0:
1390         if (g->fni8 && check_size_impl(oprsz, 8)) {
1391             expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
1392         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1393             expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
1394         } else {
1395             assert(g->fno != NULL);
1396             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
1397                                maxsz, g->data, g->fno);
1398             oprsz = maxsz;
1399         }
1400         break;
1401
1402     default:
1403         g_assert_not_reached();
1404     }
1405     tcg_swap_vecop_list(hold_list);
1406
1407     if (oprsz < maxsz) {
1408         expand_clr(dofs + oprsz, maxsz - oprsz);
1409     }
1410 }
1411
1412 /* Expand a vector operation with three vectors and an immediate.  */
1413 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1414                      uint32_t oprsz, uint32_t maxsz, int64_t c,
1415                      const GVecGen3i *g)
1416 {
1417     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1418     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1419     TCGType type;
1420     uint32_t some;
1421
1422     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1423     check_overlap_3(dofs, aofs, bofs, maxsz);
1424
1425     type = 0;
1426     if (g->fniv) {
1427         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1428     }
1429     switch (type) {
1430     case TCG_TYPE_V256:
1431         /*
1432          * Recall that ARM SVE allows vector sizes that are not a
1433          * power of 2, but always a multiple of 16.  The intent is
1434          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1435          */
1436         some = QEMU_ALIGN_DOWN(oprsz, 32);
1437         expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1438                       c, g->load_dest, g->fniv);
1439         if (some == oprsz) {
1440             break;
1441         }
1442         dofs += some;
1443         aofs += some;
1444         bofs += some;
1445         oprsz -= some;
1446         maxsz -= some;
1447         /* fallthru */
1448     case TCG_TYPE_V128:
1449         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1450                       c, g->load_dest, g->fniv);
1451         break;
1452     case TCG_TYPE_V64:
1453         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1454                       c, g->load_dest, g->fniv);
1455         break;
1456
1457     case 0:
1458         if (g->fni8 && check_size_impl(oprsz, 8)) {
1459             expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
1460         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1461             expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
1462         } else {
1463             assert(g->fno != NULL);
1464             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
1465             oprsz = maxsz;
1466         }
1467         break;
1468
1469     default:
1470         g_assert_not_reached();
1471     }
1472     tcg_swap_vecop_list(hold_list);
1473
1474     if (oprsz < maxsz) {
1475         expand_clr(dofs + oprsz, maxsz - oprsz);
1476     }
1477 }
1478
1479 /* Expand a vector four-operand operation.  */
1480 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1481                     uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
1482 {
1483     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1484     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1485     TCGType type;
1486     uint32_t some;
1487
1488     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1489     check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1490
1491     type = 0;
1492     if (g->fniv) {
1493         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1494     }
1495     switch (type) {
1496     case TCG_TYPE_V256:
1497         /* Recall that ARM SVE allows vector sizes that are not a
1498          * power of 2, but always a multiple of 16.  The intent is
1499          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1500          */
1501         some = QEMU_ALIGN_DOWN(oprsz, 32);
1502         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
1503                      32, TCG_TYPE_V256, g->write_aofs, g->fniv);
1504         if (some == oprsz) {
1505             break;
1506         }
1507         dofs += some;
1508         aofs += some;
1509         bofs += some;
1510         cofs += some;
1511         oprsz -= some;
1512         maxsz -= some;
1513         /* fallthru */
1514     case TCG_TYPE_V128:
1515         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1516                      16, TCG_TYPE_V128, g->write_aofs, g->fniv);
1517         break;
1518     case TCG_TYPE_V64:
1519         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1520                      8, TCG_TYPE_V64, g->write_aofs, g->fniv);
1521         break;
1522
1523     case 0:
1524         if (g->fni8 && check_size_impl(oprsz, 8)) {
1525             expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
1526                          g->write_aofs, g->fni8);
1527         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1528             expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
1529                          g->write_aofs, g->fni4);
1530         } else {
1531             assert(g->fno != NULL);
1532             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1533                                oprsz, maxsz, g->data, g->fno);
1534             oprsz = maxsz;
1535         }
1536         break;
1537
1538     default:
1539         g_assert_not_reached();
1540     }
1541     tcg_swap_vecop_list(hold_list);
1542
1543     if (oprsz < maxsz) {
1544         expand_clr(dofs + oprsz, maxsz - oprsz);
1545     }
1546 }
1547
1548 /*
1549  * Expand specific vector operations.
1550  */
1551
1552 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
1553 {
1554     tcg_gen_mov_vec(a, b);
1555 }
1556
1557 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1558                       uint32_t oprsz, uint32_t maxsz)
1559 {
1560     static const GVecGen2 g = {
1561         .fni8 = tcg_gen_mov_i64,
1562         .fniv = vec_mov2,
1563         .fno = gen_helper_gvec_mov,
1564         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1565     };
1566     if (dofs != aofs) {
1567         tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1568     } else {
1569         check_size_align(oprsz, maxsz, dofs);
1570         if (oprsz < maxsz) {
1571             expand_clr(dofs + oprsz, maxsz - oprsz);
1572         }
1573     }
1574 }
1575
1576 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
1577                           uint32_t maxsz, TCGv_i32 in)
1578 {
1579     check_size_align(oprsz, maxsz, dofs);
1580     tcg_debug_assert(vece <= MO_32);
1581     do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1582 }
1583
1584 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
1585                           uint32_t maxsz, TCGv_i64 in)
1586 {
1587     check_size_align(oprsz, maxsz, dofs);
1588     tcg_debug_assert(vece <= MO_64);
1589     do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1590 }
1591
1592 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1593                           uint32_t oprsz, uint32_t maxsz)
1594 {
1595     check_size_align(oprsz, maxsz, dofs);
1596     if (vece <= MO_64) {
1597         TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
1598         if (type != 0) {
1599             TCGv_vec t_vec = tcg_temp_new_vec(type);
1600             tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
1601             do_dup_store(type, dofs, oprsz, maxsz, t_vec);
1602             tcg_temp_free_vec(t_vec);
1603         } else if (vece <= MO_32) {
1604             TCGv_i32 in = tcg_temp_new_i32();
1605             switch (vece) {
1606             case MO_8:
1607                 tcg_gen_ld8u_i32(in, cpu_env, aofs);
1608                 break;
1609             case MO_16:
1610                 tcg_gen_ld16u_i32(in, cpu_env, aofs);
1611                 break;
1612             default:
1613                 tcg_gen_ld_i32(in, cpu_env, aofs);
1614                 break;
1615             }
1616             do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1617             tcg_temp_free_i32(in);
1618         } else {
1619             TCGv_i64 in = tcg_temp_new_i64();
1620             tcg_gen_ld_i64(in, cpu_env, aofs);
1621             do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1622             tcg_temp_free_i64(in);
1623         }
1624     } else if (vece == 4) {
1625         /* 128-bit duplicate.  */
1626         int i;
1627
1628         tcg_debug_assert(oprsz >= 16);
1629         if (TCG_TARGET_HAS_v128) {
1630             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1631
1632             tcg_gen_ld_vec(in, cpu_env, aofs);
1633             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1634                 tcg_gen_st_vec(in, cpu_env, dofs + i);
1635             }
1636             tcg_temp_free_vec(in);
1637         } else {
1638             TCGv_i64 in0 = tcg_temp_new_i64();
1639             TCGv_i64 in1 = tcg_temp_new_i64();
1640
1641             tcg_gen_ld_i64(in0, cpu_env, aofs);
1642             tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
1643             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1644                 tcg_gen_st_i64(in0, cpu_env, dofs + i);
1645                 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
1646             }
1647             tcg_temp_free_i64(in0);
1648             tcg_temp_free_i64(in1);
1649         }
1650         if (oprsz < maxsz) {
1651             expand_clr(dofs + oprsz, maxsz - oprsz);
1652         }
1653     } else if (vece == 5) {
1654         /* 256-bit duplicate.  */
1655         int i;
1656
1657         tcg_debug_assert(oprsz >= 32);
1658         tcg_debug_assert(oprsz % 32 == 0);
1659         if (TCG_TARGET_HAS_v256) {
1660             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);
1661
1662             tcg_gen_ld_vec(in, cpu_env, aofs);
1663             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1664                 tcg_gen_st_vec(in, cpu_env, dofs + i);
1665             }
1666             tcg_temp_free_vec(in);
1667         } else if (TCG_TARGET_HAS_v128) {
1668             TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
1669             TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);
1670
1671             tcg_gen_ld_vec(in0, cpu_env, aofs);
1672             tcg_gen_ld_vec(in1, cpu_env, aofs + 16);
1673             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1674                 tcg_gen_st_vec(in0, cpu_env, dofs + i);
1675                 tcg_gen_st_vec(in1, cpu_env, dofs + i + 16);
1676             }
1677             tcg_temp_free_vec(in0);
1678             tcg_temp_free_vec(in1);
1679         } else {
1680             TCGv_i64 in[4];
1681             int j;
1682
1683             for (j = 0; j < 4; ++j) {
1684                 in[j] = tcg_temp_new_i64();
1685                 tcg_gen_ld_i64(in[j], cpu_env, aofs + j * 8);
1686             }
1687             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1688                 for (j = 0; j < 4; ++j) {
1689                     tcg_gen_st_i64(in[j], cpu_env, dofs + i + j * 8);
1690                 }
1691             }
1692             for (j = 0; j < 4; ++j) {
1693                 tcg_temp_free_i64(in[j]);
1694             }
1695         }
1696         if (oprsz < maxsz) {
1697             expand_clr(dofs + oprsz, maxsz - oprsz);
1698         }
1699     } else {
1700         g_assert_not_reached();
1701     }
1702 }
1703
1704 void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
1705                           uint32_t maxsz, uint64_t x)
1706 {
1707     check_size_align(oprsz, maxsz, dofs);
1708     do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x);
1709 }
1710
1711 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1712                       uint32_t oprsz, uint32_t maxsz)
1713 {
1714     static const GVecGen2 g = {
1715         .fni8 = tcg_gen_not_i64,
1716         .fniv = tcg_gen_not_vec,
1717         .fno = gen_helper_gvec_not,
1718         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1719     };
1720     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1721 }
1722
1723 /* Perform a vector addition using normal addition and a mask.  The mask
1724    should be the sign bit of each lane.  This 6-operation form is more
1725    efficient than separate additions when there are 4 or more lanes in
1726    the 64-bit operation.  */
1727 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1728 {
1729     TCGv_i64 t1 = tcg_temp_new_i64();
1730     TCGv_i64 t2 = tcg_temp_new_i64();
1731     TCGv_i64 t3 = tcg_temp_new_i64();
1732
1733     tcg_gen_andc_i64(t1, a, m);
1734     tcg_gen_andc_i64(t2, b, m);
1735     tcg_gen_xor_i64(t3, a, b);
1736     tcg_gen_add_i64(d, t1, t2);
1737     tcg_gen_and_i64(t3, t3, m);
1738     tcg_gen_xor_i64(d, d, t3);
1739
1740     tcg_temp_free_i64(t1);
1741     tcg_temp_free_i64(t2);
1742     tcg_temp_free_i64(t3);
1743 }
1744
1745 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1746 {
1747     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1748     gen_addv_mask(d, a, b, m);
1749     tcg_temp_free_i64(m);
1750 }
1751
1752 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1753 {
1754     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1755     gen_addv_mask(d, a, b, m);
1756     tcg_temp_free_i64(m);
1757 }
1758
1759 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1760 {
1761     TCGv_i64 t1 = tcg_temp_new_i64();
1762     TCGv_i64 t2 = tcg_temp_new_i64();
1763
1764     tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1765     tcg_gen_add_i64(t2, a, b);
1766     tcg_gen_add_i64(t1, t1, b);
1767     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1768
1769     tcg_temp_free_i64(t1);
1770     tcg_temp_free_i64(t2);
1771 }
1772
1773 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1774
1775 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1776                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1777 {
1778     static const GVecGen3 g[4] = {
1779         { .fni8 = tcg_gen_vec_add8_i64,
1780           .fniv = tcg_gen_add_vec,
1781           .fno = gen_helper_gvec_add8,
1782           .opt_opc = vecop_list_add,
1783           .vece = MO_8 },
1784         { .fni8 = tcg_gen_vec_add16_i64,
1785           .fniv = tcg_gen_add_vec,
1786           .fno = gen_helper_gvec_add16,
1787           .opt_opc = vecop_list_add,
1788           .vece = MO_16 },
1789         { .fni4 = tcg_gen_add_i32,
1790           .fniv = tcg_gen_add_vec,
1791           .fno = gen_helper_gvec_add32,
1792           .opt_opc = vecop_list_add,
1793           .vece = MO_32 },
1794         { .fni8 = tcg_gen_add_i64,
1795           .fniv = tcg_gen_add_vec,
1796           .fno = gen_helper_gvec_add64,
1797           .opt_opc = vecop_list_add,
1798           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1799           .vece = MO_64 },
1800     };
1801
1802     tcg_debug_assert(vece <= MO_64);
1803     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1804 }
1805
1806 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1807                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1808 {
1809     static const GVecGen2s g[4] = {
1810         { .fni8 = tcg_gen_vec_add8_i64,
1811           .fniv = tcg_gen_add_vec,
1812           .fno = gen_helper_gvec_adds8,
1813           .opt_opc = vecop_list_add,
1814           .vece = MO_8 },
1815         { .fni8 = tcg_gen_vec_add16_i64,
1816           .fniv = tcg_gen_add_vec,
1817           .fno = gen_helper_gvec_adds16,
1818           .opt_opc = vecop_list_add,
1819           .vece = MO_16 },
1820         { .fni4 = tcg_gen_add_i32,
1821           .fniv = tcg_gen_add_vec,
1822           .fno = gen_helper_gvec_adds32,
1823           .opt_opc = vecop_list_add,
1824           .vece = MO_32 },
1825         { .fni8 = tcg_gen_add_i64,
1826           .fniv = tcg_gen_add_vec,
1827           .fno = gen_helper_gvec_adds64,
1828           .opt_opc = vecop_list_add,
1829           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1830           .vece = MO_64 },
1831     };
1832
1833     tcg_debug_assert(vece <= MO_64);
1834     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1835 }
1836
1837 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
1838                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1839 {
1840     TCGv_i64 tmp = tcg_const_i64(c);
1841     tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
1842     tcg_temp_free_i64(tmp);
1843 }
1844
1845 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
1846
1847 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
1848                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1849 {
1850     static const GVecGen2s g[4] = {
1851         { .fni8 = tcg_gen_vec_sub8_i64,
1852           .fniv = tcg_gen_sub_vec,
1853           .fno = gen_helper_gvec_subs8,
1854           .opt_opc = vecop_list_sub,
1855           .vece = MO_8 },
1856         { .fni8 = tcg_gen_vec_sub16_i64,
1857           .fniv = tcg_gen_sub_vec,
1858           .fno = gen_helper_gvec_subs16,
1859           .opt_opc = vecop_list_sub,
1860           .vece = MO_16 },
1861         { .fni4 = tcg_gen_sub_i32,
1862           .fniv = tcg_gen_sub_vec,
1863           .fno = gen_helper_gvec_subs32,
1864           .opt_opc = vecop_list_sub,
1865           .vece = MO_32 },
1866         { .fni8 = tcg_gen_sub_i64,
1867           .fniv = tcg_gen_sub_vec,
1868           .fno = gen_helper_gvec_subs64,
1869           .opt_opc = vecop_list_sub,
1870           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1871           .vece = MO_64 },
1872     };
1873
1874     tcg_debug_assert(vece <= MO_64);
1875     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1876 }
1877
1878 /* Perform a vector subtraction using normal subtraction and a mask.
1879    Compare gen_addv_mask above.  */
1880 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1881 {
1882     TCGv_i64 t1 = tcg_temp_new_i64();
1883     TCGv_i64 t2 = tcg_temp_new_i64();
1884     TCGv_i64 t3 = tcg_temp_new_i64();
1885
1886     tcg_gen_or_i64(t1, a, m);
1887     tcg_gen_andc_i64(t2, b, m);
1888     tcg_gen_eqv_i64(t3, a, b);
1889     tcg_gen_sub_i64(d, t1, t2);
1890     tcg_gen_and_i64(t3, t3, m);
1891     tcg_gen_xor_i64(d, d, t3);
1892
1893     tcg_temp_free_i64(t1);
1894     tcg_temp_free_i64(t2);
1895     tcg_temp_free_i64(t3);
1896 }
1897
1898 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1899 {
1900     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1901     gen_subv_mask(d, a, b, m);
1902     tcg_temp_free_i64(m);
1903 }
1904
1905 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1906 {
1907     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1908     gen_subv_mask(d, a, b, m);
1909     tcg_temp_free_i64(m);
1910 }
1911
1912 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1913 {
1914     TCGv_i64 t1 = tcg_temp_new_i64();
1915     TCGv_i64 t2 = tcg_temp_new_i64();
1916
1917     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1918     tcg_gen_sub_i64(t2, a, b);
1919     tcg_gen_sub_i64(t1, a, t1);
1920     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1921
1922     tcg_temp_free_i64(t1);
1923     tcg_temp_free_i64(t2);
1924 }
1925
1926 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
1927                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1928 {
1929     static const GVecGen3 g[4] = {
1930         { .fni8 = tcg_gen_vec_sub8_i64,
1931           .fniv = tcg_gen_sub_vec,
1932           .fno = gen_helper_gvec_sub8,
1933           .opt_opc = vecop_list_sub,
1934           .vece = MO_8 },
1935         { .fni8 = tcg_gen_vec_sub16_i64,
1936           .fniv = tcg_gen_sub_vec,
1937           .fno = gen_helper_gvec_sub16,
1938           .opt_opc = vecop_list_sub,
1939           .vece = MO_16 },
1940         { .fni4 = tcg_gen_sub_i32,
1941           .fniv = tcg_gen_sub_vec,
1942           .fno = gen_helper_gvec_sub32,
1943           .opt_opc = vecop_list_sub,
1944           .vece = MO_32 },
1945         { .fni8 = tcg_gen_sub_i64,
1946           .fniv = tcg_gen_sub_vec,
1947           .fno = gen_helper_gvec_sub64,
1948           .opt_opc = vecop_list_sub,
1949           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1950           .vece = MO_64 },
1951     };
1952
1953     tcg_debug_assert(vece <= MO_64);
1954     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1955 }
1956
1957 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
1958
1959 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
1960                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1961 {
1962     static const GVecGen3 g[4] = {
1963         { .fniv = tcg_gen_mul_vec,
1964           .fno = gen_helper_gvec_mul8,
1965           .opt_opc = vecop_list_mul,
1966           .vece = MO_8 },
1967         { .fniv = tcg_gen_mul_vec,
1968           .fno = gen_helper_gvec_mul16,
1969           .opt_opc = vecop_list_mul,
1970           .vece = MO_16 },
1971         { .fni4 = tcg_gen_mul_i32,
1972           .fniv = tcg_gen_mul_vec,
1973           .fno = gen_helper_gvec_mul32,
1974           .opt_opc = vecop_list_mul,
1975           .vece = MO_32 },
1976         { .fni8 = tcg_gen_mul_i64,
1977           .fniv = tcg_gen_mul_vec,
1978           .fno = gen_helper_gvec_mul64,
1979           .opt_opc = vecop_list_mul,
1980           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1981           .vece = MO_64 },
1982     };
1983
1984     tcg_debug_assert(vece <= MO_64);
1985     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1986 }
1987
1988 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
1989                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1990 {
1991     static const GVecGen2s g[4] = {
1992         { .fniv = tcg_gen_mul_vec,
1993           .fno = gen_helper_gvec_muls8,
1994           .opt_opc = vecop_list_mul,
1995           .vece = MO_8 },
1996         { .fniv = tcg_gen_mul_vec,
1997           .fno = gen_helper_gvec_muls16,
1998           .opt_opc = vecop_list_mul,
1999           .vece = MO_16 },
2000         { .fni4 = tcg_gen_mul_i32,
2001           .fniv = tcg_gen_mul_vec,
2002           .fno = gen_helper_gvec_muls32,
2003           .opt_opc = vecop_list_mul,
2004           .vece = MO_32 },
2005         { .fni8 = tcg_gen_mul_i64,
2006           .fniv = tcg_gen_mul_vec,
2007           .fno = gen_helper_gvec_muls64,
2008           .opt_opc = vecop_list_mul,
2009           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2010           .vece = MO_64 },
2011     };
2012
2013     tcg_debug_assert(vece <= MO_64);
2014     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2015 }
2016
2017 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
2018                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2019 {
2020     TCGv_i64 tmp = tcg_const_i64(c);
2021     tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
2022     tcg_temp_free_i64(tmp);
2023 }
2024
2025 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2026                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2027 {
2028     static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
2029     static const GVecGen3 g[4] = {
2030         { .fniv = tcg_gen_ssadd_vec,
2031           .fno = gen_helper_gvec_ssadd8,
2032           .opt_opc = vecop_list,
2033           .vece = MO_8 },
2034         { .fniv = tcg_gen_ssadd_vec,
2035           .fno = gen_helper_gvec_ssadd16,
2036           .opt_opc = vecop_list,
2037           .vece = MO_16 },
2038         { .fniv = tcg_gen_ssadd_vec,
2039           .fno = gen_helper_gvec_ssadd32,
2040           .opt_opc = vecop_list,
2041           .vece = MO_32 },
2042         { .fniv = tcg_gen_ssadd_vec,
2043           .fno = gen_helper_gvec_ssadd64,
2044           .opt_opc = vecop_list,
2045           .vece = MO_64 },
2046     };
2047     tcg_debug_assert(vece <= MO_64);
2048     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2049 }
2050
2051 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
2052                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2053 {
2054     static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
2055     static const GVecGen3 g[4] = {
2056         { .fniv = tcg_gen_sssub_vec,
2057           .fno = gen_helper_gvec_sssub8,
2058           .opt_opc = vecop_list,
2059           .vece = MO_8 },
2060         { .fniv = tcg_gen_sssub_vec,
2061           .fno = gen_helper_gvec_sssub16,
2062           .opt_opc = vecop_list,
2063           .vece = MO_16 },
2064         { .fniv = tcg_gen_sssub_vec,
2065           .fno = gen_helper_gvec_sssub32,
2066           .opt_opc = vecop_list,
2067           .vece = MO_32 },
2068         { .fniv = tcg_gen_sssub_vec,
2069           .fno = gen_helper_gvec_sssub64,
2070           .opt_opc = vecop_list,
2071           .vece = MO_64 },
2072     };
2073     tcg_debug_assert(vece <= MO_64);
2074     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2075 }
2076
2077 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2078 {
2079     TCGv_i32 max = tcg_const_i32(-1);
2080     tcg_gen_add_i32(d, a, b);
2081     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
2082     tcg_temp_free_i32(max);
2083 }
2084
2085 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2086 {
2087     TCGv_i64 max = tcg_const_i64(-1);
2088     tcg_gen_add_i64(d, a, b);
2089     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
2090     tcg_temp_free_i64(max);
2091 }
2092
2093 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2094                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2095 {
2096     static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
2097     static const GVecGen3 g[4] = {
2098         { .fniv = tcg_gen_usadd_vec,
2099           .fno = gen_helper_gvec_usadd8,
2100           .opt_opc = vecop_list,
2101           .vece = MO_8 },
2102         { .fniv = tcg_gen_usadd_vec,
2103           .fno = gen_helper_gvec_usadd16,
2104           .opt_opc = vecop_list,
2105           .vece = MO_16 },
2106         { .fni4 = tcg_gen_usadd_i32,
2107           .fniv = tcg_gen_usadd_vec,
2108           .fno = gen_helper_gvec_usadd32,
2109           .opt_opc = vecop_list,
2110           .vece = MO_32 },
2111         { .fni8 = tcg_gen_usadd_i64,
2112           .fniv = tcg_gen_usadd_vec,
2113           .fno = gen_helper_gvec_usadd64,
2114           .opt_opc = vecop_list,
2115           .vece = MO_64 }
2116     };
2117     tcg_debug_assert(vece <= MO_64);
2118     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2119 }
2120
2121 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2122 {
2123     TCGv_i32 min = tcg_const_i32(0);
2124     tcg_gen_sub_i32(d, a, b);
2125     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
2126     tcg_temp_free_i32(min);
2127 }
2128
2129 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2130 {
2131     TCGv_i64 min = tcg_const_i64(0);
2132     tcg_gen_sub_i64(d, a, b);
2133     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
2134     tcg_temp_free_i64(min);
2135 }
2136
2137 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
2138                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2139 {
2140     static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
2141     static const GVecGen3 g[4] = {
2142         { .fniv = tcg_gen_ussub_vec,
2143           .fno = gen_helper_gvec_ussub8,
2144           .opt_opc = vecop_list,
2145           .vece = MO_8 },
2146         { .fniv = tcg_gen_ussub_vec,
2147           .fno = gen_helper_gvec_ussub16,
2148           .opt_opc = vecop_list,
2149           .vece = MO_16 },
2150         { .fni4 = tcg_gen_ussub_i32,
2151           .fniv = tcg_gen_ussub_vec,
2152           .fno = gen_helper_gvec_ussub32,
2153           .opt_opc = vecop_list,
2154           .vece = MO_32 },
2155         { .fni8 = tcg_gen_ussub_i64,
2156           .fniv = tcg_gen_ussub_vec,
2157           .fno = gen_helper_gvec_ussub64,
2158           .opt_opc = vecop_list,
2159           .vece = MO_64 }
2160     };
2161     tcg_debug_assert(vece <= MO_64);
2162     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2163 }
2164
2165 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
2166                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2167 {
2168     static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
2169     static const GVecGen3 g[4] = {
2170         { .fniv = tcg_gen_smin_vec,
2171           .fno = gen_helper_gvec_smin8,
2172           .opt_opc = vecop_list,
2173           .vece = MO_8 },
2174         { .fniv = tcg_gen_smin_vec,
2175           .fno = gen_helper_gvec_smin16,
2176           .opt_opc = vecop_list,
2177           .vece = MO_16 },
2178         { .fni4 = tcg_gen_smin_i32,
2179           .fniv = tcg_gen_smin_vec,
2180           .fno = gen_helper_gvec_smin32,
2181           .opt_opc = vecop_list,
2182           .vece = MO_32 },
2183         { .fni8 = tcg_gen_smin_i64,
2184           .fniv = tcg_gen_smin_vec,
2185           .fno = gen_helper_gvec_smin64,
2186           .opt_opc = vecop_list,
2187           .vece = MO_64 }
2188     };
2189     tcg_debug_assert(vece <= MO_64);
2190     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2191 }
2192
2193 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2194                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2195 {
2196     static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2197     static const GVecGen3 g[4] = {
2198         { .fniv = tcg_gen_umin_vec,
2199           .fno = gen_helper_gvec_umin8,
2200           .opt_opc = vecop_list,
2201           .vece = MO_8 },
2202         { .fniv = tcg_gen_umin_vec,
2203           .fno = gen_helper_gvec_umin16,
2204           .opt_opc = vecop_list,
2205           .vece = MO_16 },
2206         { .fni4 = tcg_gen_umin_i32,
2207           .fniv = tcg_gen_umin_vec,
2208           .fno = gen_helper_gvec_umin32,
2209           .opt_opc = vecop_list,
2210           .vece = MO_32 },
2211         { .fni8 = tcg_gen_umin_i64,
2212           .fniv = tcg_gen_umin_vec,
2213           .fno = gen_helper_gvec_umin64,
2214           .opt_opc = vecop_list,
2215           .vece = MO_64 }
2216     };
2217     tcg_debug_assert(vece <= MO_64);
2218     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2219 }
2220
2221 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2222                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2223 {
2224     static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2225     static const GVecGen3 g[4] = {
2226         { .fniv = tcg_gen_smax_vec,
2227           .fno = gen_helper_gvec_smax8,
2228           .opt_opc = vecop_list,
2229           .vece = MO_8 },
2230         { .fniv = tcg_gen_smax_vec,
2231           .fno = gen_helper_gvec_smax16,
2232           .opt_opc = vecop_list,
2233           .vece = MO_16 },
2234         { .fni4 = tcg_gen_smax_i32,
2235           .fniv = tcg_gen_smax_vec,
2236           .fno = gen_helper_gvec_smax32,
2237           .opt_opc = vecop_list,
2238           .vece = MO_32 },
2239         { .fni8 = tcg_gen_smax_i64,
2240           .fniv = tcg_gen_smax_vec,
2241           .fno = gen_helper_gvec_smax64,
2242           .opt_opc = vecop_list,
2243           .vece = MO_64 }
2244     };
2245     tcg_debug_assert(vece <= MO_64);
2246     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2247 }
2248
2249 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2250                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2251 {
2252     static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2253     static const GVecGen3 g[4] = {
2254         { .fniv = tcg_gen_umax_vec,
2255           .fno = gen_helper_gvec_umax8,
2256           .opt_opc = vecop_list,
2257           .vece = MO_8 },
2258         { .fniv = tcg_gen_umax_vec,
2259           .fno = gen_helper_gvec_umax16,
2260           .opt_opc = vecop_list,
2261           .vece = MO_16 },
2262         { .fni4 = tcg_gen_umax_i32,
2263           .fniv = tcg_gen_umax_vec,
2264           .fno = gen_helper_gvec_umax32,
2265           .opt_opc = vecop_list,
2266           .vece = MO_32 },
2267         { .fni8 = tcg_gen_umax_i64,
2268           .fniv = tcg_gen_umax_vec,
2269           .fno = gen_helper_gvec_umax64,
2270           .opt_opc = vecop_list,
2271           .vece = MO_64 }
2272     };
2273     tcg_debug_assert(vece <= MO_64);
2274     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2275 }
2276
2277 /* Perform a vector negation using normal negation and a mask.
2278    Compare gen_subv_mask above.  */
2279 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2280 {
2281     TCGv_i64 t2 = tcg_temp_new_i64();
2282     TCGv_i64 t3 = tcg_temp_new_i64();
2283
2284     tcg_gen_andc_i64(t3, m, b);
2285     tcg_gen_andc_i64(t2, b, m);
2286     tcg_gen_sub_i64(d, m, t2);
2287     tcg_gen_xor_i64(d, d, t3);
2288
2289     tcg_temp_free_i64(t2);
2290     tcg_temp_free_i64(t3);
2291 }
2292
2293 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2294 {
2295     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
2296     gen_negv_mask(d, b, m);
2297     tcg_temp_free_i64(m);
2298 }
2299
2300 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2301 {
2302     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
2303     gen_negv_mask(d, b, m);
2304     tcg_temp_free_i64(m);
2305 }
2306
2307 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2308 {
2309     TCGv_i64 t1 = tcg_temp_new_i64();
2310     TCGv_i64 t2 = tcg_temp_new_i64();
2311
2312     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2313     tcg_gen_neg_i64(t2, b);
2314     tcg_gen_neg_i64(t1, t1);
2315     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2316
2317     tcg_temp_free_i64(t1);
2318     tcg_temp_free_i64(t2);
2319 }
2320
2321 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2322                       uint32_t oprsz, uint32_t maxsz)
2323 {
2324     static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2325     static const GVecGen2 g[4] = {
2326         { .fni8 = tcg_gen_vec_neg8_i64,
2327           .fniv = tcg_gen_neg_vec,
2328           .fno = gen_helper_gvec_neg8,
2329           .opt_opc = vecop_list,
2330           .vece = MO_8 },
2331         { .fni8 = tcg_gen_vec_neg16_i64,
2332           .fniv = tcg_gen_neg_vec,
2333           .fno = gen_helper_gvec_neg16,
2334           .opt_opc = vecop_list,
2335           .vece = MO_16 },
2336         { .fni4 = tcg_gen_neg_i32,
2337           .fniv = tcg_gen_neg_vec,
2338           .fno = gen_helper_gvec_neg32,
2339           .opt_opc = vecop_list,
2340           .vece = MO_32 },
2341         { .fni8 = tcg_gen_neg_i64,
2342           .fniv = tcg_gen_neg_vec,
2343           .fno = gen_helper_gvec_neg64,
2344           .opt_opc = vecop_list,
2345           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2346           .vece = MO_64 },
2347     };
2348
2349     tcg_debug_assert(vece <= MO_64);
2350     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2351 }
2352
2353 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2354 {
2355     TCGv_i64 t = tcg_temp_new_i64();
2356     int nbit = 8 << vece;
2357
2358     /* Create -1 for each negative element.  */
2359     tcg_gen_shri_i64(t, b, nbit - 1);
2360     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2361     tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2362
2363     /*
2364      * Invert (via xor -1) and add one.
2365      * Because of the ordering the msb is cleared,
2366      * so we never have carry into the next element.
2367      */
2368     tcg_gen_xor_i64(d, b, t);
2369     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2370     tcg_gen_add_i64(d, d, t);
2371
2372     tcg_temp_free_i64(t);
2373 }
2374
2375 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2376 {
2377     gen_absv_mask(d, b, MO_8);
2378 }
2379
2380 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2381 {
2382     gen_absv_mask(d, b, MO_16);
2383 }
2384
2385 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2386                       uint32_t oprsz, uint32_t maxsz)
2387 {
2388     static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2389     static const GVecGen2 g[4] = {
2390         { .fni8 = tcg_gen_vec_abs8_i64,
2391           .fniv = tcg_gen_abs_vec,
2392           .fno = gen_helper_gvec_abs8,
2393           .opt_opc = vecop_list,
2394           .vece = MO_8 },
2395         { .fni8 = tcg_gen_vec_abs16_i64,
2396           .fniv = tcg_gen_abs_vec,
2397           .fno = gen_helper_gvec_abs16,
2398           .opt_opc = vecop_list,
2399           .vece = MO_16 },
2400         { .fni4 = tcg_gen_abs_i32,
2401           .fniv = tcg_gen_abs_vec,
2402           .fno = gen_helper_gvec_abs32,
2403           .opt_opc = vecop_list,
2404           .vece = MO_32 },
2405         { .fni8 = tcg_gen_abs_i64,
2406           .fniv = tcg_gen_abs_vec,
2407           .fno = gen_helper_gvec_abs64,
2408           .opt_opc = vecop_list,
2409           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2410           .vece = MO_64 },
2411     };
2412
2413     tcg_debug_assert(vece <= MO_64);
2414     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2415 }
2416
2417 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2418                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2419 {
2420     static const GVecGen3 g = {
2421         .fni8 = tcg_gen_and_i64,
2422         .fniv = tcg_gen_and_vec,
2423         .fno = gen_helper_gvec_and,
2424         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2425     };
2426
2427     if (aofs == bofs) {
2428         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2429     } else {
2430         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2431     }
2432 }
2433
2434 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2435                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2436 {
2437     static const GVecGen3 g = {
2438         .fni8 = tcg_gen_or_i64,
2439         .fniv = tcg_gen_or_vec,
2440         .fno = gen_helper_gvec_or,
2441         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2442     };
2443
2444     if (aofs == bofs) {
2445         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2446     } else {
2447         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2448     }
2449 }
2450
2451 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2452                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2453 {
2454     static const GVecGen3 g = {
2455         .fni8 = tcg_gen_xor_i64,
2456         .fniv = tcg_gen_xor_vec,
2457         .fno = gen_helper_gvec_xor,
2458         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2459     };
2460
2461     if (aofs == bofs) {
2462         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2463     } else {
2464         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2465     }
2466 }
2467
2468 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2469                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2470 {
2471     static const GVecGen3 g = {
2472         .fni8 = tcg_gen_andc_i64,
2473         .fniv = tcg_gen_andc_vec,
2474         .fno = gen_helper_gvec_andc,
2475         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2476     };
2477
2478     if (aofs == bofs) {
2479         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2480     } else {
2481         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2482     }
2483 }
2484
2485 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2486                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2487 {
2488     static const GVecGen3 g = {
2489         .fni8 = tcg_gen_orc_i64,
2490         .fniv = tcg_gen_orc_vec,
2491         .fno = gen_helper_gvec_orc,
2492         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2493     };
2494
2495     if (aofs == bofs) {
2496         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2497     } else {
2498         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2499     }
2500 }
2501
2502 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2503                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2504 {
2505     static const GVecGen3 g = {
2506         .fni8 = tcg_gen_nand_i64,
2507         .fniv = tcg_gen_nand_vec,
2508         .fno = gen_helper_gvec_nand,
2509         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2510     };
2511
2512     if (aofs == bofs) {
2513         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2514     } else {
2515         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2516     }
2517 }
2518
2519 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2520                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2521 {
2522     static const GVecGen3 g = {
2523         .fni8 = tcg_gen_nor_i64,
2524         .fniv = tcg_gen_nor_vec,
2525         .fno = gen_helper_gvec_nor,
2526         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2527     };
2528
2529     if (aofs == bofs) {
2530         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2531     } else {
2532         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2533     }
2534 }
2535
2536 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2537                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2538 {
2539     static const GVecGen3 g = {
2540         .fni8 = tcg_gen_eqv_i64,
2541         .fniv = tcg_gen_eqv_vec,
2542         .fno = gen_helper_gvec_eqv,
2543         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2544     };
2545
2546     if (aofs == bofs) {
2547         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2548     } else {
2549         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2550     }
2551 }
2552
2553 static const GVecGen2s gop_ands = {
2554     .fni8 = tcg_gen_and_i64,
2555     .fniv = tcg_gen_and_vec,
2556     .fno = gen_helper_gvec_ands,
2557     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2558     .vece = MO_64
2559 };
2560
2561 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2562                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2563 {
2564     TCGv_i64 tmp = tcg_temp_new_i64();
2565     gen_dup_i64(vece, tmp, c);
2566     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2567     tcg_temp_free_i64(tmp);
2568 }
2569
2570 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2571                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2572 {
2573     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2574     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2575     tcg_temp_free_i64(tmp);
2576 }
2577
2578 static const GVecGen2s gop_xors = {
2579     .fni8 = tcg_gen_xor_i64,
2580     .fniv = tcg_gen_xor_vec,
2581     .fno = gen_helper_gvec_xors,
2582     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2583     .vece = MO_64
2584 };
2585
2586 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2587                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2588 {
2589     TCGv_i64 tmp = tcg_temp_new_i64();
2590     gen_dup_i64(vece, tmp, c);
2591     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2592     tcg_temp_free_i64(tmp);
2593 }
2594
2595 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2596                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2597 {
2598     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2599     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2600     tcg_temp_free_i64(tmp);
2601 }
2602
2603 static const GVecGen2s gop_ors = {
2604     .fni8 = tcg_gen_or_i64,
2605     .fniv = tcg_gen_or_vec,
2606     .fno = gen_helper_gvec_ors,
2607     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2608     .vece = MO_64
2609 };
2610
2611 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2612                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2613 {
2614     TCGv_i64 tmp = tcg_temp_new_i64();
2615     gen_dup_i64(vece, tmp, c);
2616     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2617     tcg_temp_free_i64(tmp);
2618 }
2619
2620 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2621                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2622 {
2623     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2624     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2625     tcg_temp_free_i64(tmp);
2626 }
2627
2628 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2629 {
2630     uint64_t mask = dup_const(MO_8, 0xff << c);
2631     tcg_gen_shli_i64(d, a, c);
2632     tcg_gen_andi_i64(d, d, mask);
2633 }
2634
2635 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2636 {
2637     uint64_t mask = dup_const(MO_16, 0xffff << c);
2638     tcg_gen_shli_i64(d, a, c);
2639     tcg_gen_andi_i64(d, d, mask);
2640 }
2641
2642 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2643                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2644 {
2645     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2646     static const GVecGen2i g[4] = {
2647         { .fni8 = tcg_gen_vec_shl8i_i64,
2648           .fniv = tcg_gen_shli_vec,
2649           .fno = gen_helper_gvec_shl8i,
2650           .opt_opc = vecop_list,
2651           .vece = MO_8 },
2652         { .fni8 = tcg_gen_vec_shl16i_i64,
2653           .fniv = tcg_gen_shli_vec,
2654           .fno = gen_helper_gvec_shl16i,
2655           .opt_opc = vecop_list,
2656           .vece = MO_16 },
2657         { .fni4 = tcg_gen_shli_i32,
2658           .fniv = tcg_gen_shli_vec,
2659           .fno = gen_helper_gvec_shl32i,
2660           .opt_opc = vecop_list,
2661           .vece = MO_32 },
2662         { .fni8 = tcg_gen_shli_i64,
2663           .fniv = tcg_gen_shli_vec,
2664           .fno = gen_helper_gvec_shl64i,
2665           .opt_opc = vecop_list,
2666           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2667           .vece = MO_64 },
2668     };
2669
2670     tcg_debug_assert(vece <= MO_64);
2671     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2672     if (shift == 0) {
2673         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2674     } else {
2675         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2676     }
2677 }
2678
2679 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2680 {
2681     uint64_t mask = dup_const(MO_8, 0xff >> c);
2682     tcg_gen_shri_i64(d, a, c);
2683     tcg_gen_andi_i64(d, d, mask);
2684 }
2685
2686 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2687 {
2688     uint64_t mask = dup_const(MO_16, 0xffff >> c);
2689     tcg_gen_shri_i64(d, a, c);
2690     tcg_gen_andi_i64(d, d, mask);
2691 }
2692
2693 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2694                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2695 {
2696     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
2697     static const GVecGen2i g[4] = {
2698         { .fni8 = tcg_gen_vec_shr8i_i64,
2699           .fniv = tcg_gen_shri_vec,
2700           .fno = gen_helper_gvec_shr8i,
2701           .opt_opc = vecop_list,
2702           .vece = MO_8 },
2703         { .fni8 = tcg_gen_vec_shr16i_i64,
2704           .fniv = tcg_gen_shri_vec,
2705           .fno = gen_helper_gvec_shr16i,
2706           .opt_opc = vecop_list,
2707           .vece = MO_16 },
2708         { .fni4 = tcg_gen_shri_i32,
2709           .fniv = tcg_gen_shri_vec,
2710           .fno = gen_helper_gvec_shr32i,
2711           .opt_opc = vecop_list,
2712           .vece = MO_32 },
2713         { .fni8 = tcg_gen_shri_i64,
2714           .fniv = tcg_gen_shri_vec,
2715           .fno = gen_helper_gvec_shr64i,
2716           .opt_opc = vecop_list,
2717           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2718           .vece = MO_64 },
2719     };
2720
2721     tcg_debug_assert(vece <= MO_64);
2722     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2723     if (shift == 0) {
2724         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2725     } else {
2726         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2727     }
2728 }
2729
2730 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2731 {
2732     uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2733     uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2734     TCGv_i64 s = tcg_temp_new_i64();
2735
2736     tcg_gen_shri_i64(d, a, c);
2737     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2738     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2739     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2740     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2741     tcg_temp_free_i64(s);
2742 }
2743
2744 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2745 {
2746     uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2747     uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2748     TCGv_i64 s = tcg_temp_new_i64();
2749
2750     tcg_gen_shri_i64(d, a, c);
2751     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2752     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2753     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2754     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2755     tcg_temp_free_i64(s);
2756 }
2757
2758 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2759                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2760 {
2761     static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
2762     static const GVecGen2i g[4] = {
2763         { .fni8 = tcg_gen_vec_sar8i_i64,
2764           .fniv = tcg_gen_sari_vec,
2765           .fno = gen_helper_gvec_sar8i,
2766           .opt_opc = vecop_list,
2767           .vece = MO_8 },
2768         { .fni8 = tcg_gen_vec_sar16i_i64,
2769           .fniv = tcg_gen_sari_vec,
2770           .fno = gen_helper_gvec_sar16i,
2771           .opt_opc = vecop_list,
2772           .vece = MO_16 },
2773         { .fni4 = tcg_gen_sari_i32,
2774           .fniv = tcg_gen_sari_vec,
2775           .fno = gen_helper_gvec_sar32i,
2776           .opt_opc = vecop_list,
2777           .vece = MO_32 },
2778         { .fni8 = tcg_gen_sari_i64,
2779           .fniv = tcg_gen_sari_vec,
2780           .fno = gen_helper_gvec_sar64i,
2781           .opt_opc = vecop_list,
2782           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2783           .vece = MO_64 },
2784     };
2785
2786     tcg_debug_assert(vece <= MO_64);
2787     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2788     if (shift == 0) {
2789         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2790     } else {
2791         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2792     }
2793 }
2794
2795 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2796 {
2797     uint64_t mask = dup_const(MO_8, 0xff << c);
2798
2799     tcg_gen_shli_i64(d, a, c);
2800     tcg_gen_shri_i64(a, a, 8 - c);
2801     tcg_gen_andi_i64(d, d, mask);
2802     tcg_gen_andi_i64(a, a, ~mask);
2803     tcg_gen_or_i64(d, d, a);
2804 }
2805
2806 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2807 {
2808     uint64_t mask = dup_const(MO_16, 0xffff << c);
2809
2810     tcg_gen_shli_i64(d, a, c);
2811     tcg_gen_shri_i64(a, a, 16 - c);
2812     tcg_gen_andi_i64(d, d, mask);
2813     tcg_gen_andi_i64(a, a, ~mask);
2814     tcg_gen_or_i64(d, d, a);
2815 }
2816
2817 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
2818                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
2819 {
2820     static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 };
2821     static const GVecGen2i g[4] = {
2822         { .fni8 = tcg_gen_vec_rotl8i_i64,
2823           .fniv = tcg_gen_rotli_vec,
2824           .fno = gen_helper_gvec_rotl8i,
2825           .opt_opc = vecop_list,
2826           .vece = MO_8 },
2827         { .fni8 = tcg_gen_vec_rotl16i_i64,
2828           .fniv = tcg_gen_rotli_vec,
2829           .fno = gen_helper_gvec_rotl16i,
2830           .opt_opc = vecop_list,
2831           .vece = MO_16 },
2832         { .fni4 = tcg_gen_rotli_i32,
2833           .fniv = tcg_gen_rotli_vec,
2834           .fno = gen_helper_gvec_rotl32i,
2835           .opt_opc = vecop_list,
2836           .vece = MO_32 },
2837         { .fni8 = tcg_gen_rotli_i64,
2838           .fniv = tcg_gen_rotli_vec,
2839           .fno = gen_helper_gvec_rotl64i,
2840           .opt_opc = vecop_list,
2841           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2842           .vece = MO_64 },
2843     };
2844
2845     tcg_debug_assert(vece <= MO_64);
2846     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2847     if (shift == 0) {
2848         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2849     } else {
2850         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2851     }
2852 }
2853
2854 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
2855                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
2856 {
2857     tcg_debug_assert(vece <= MO_64);
2858     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2859     tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1),
2860                        oprsz, maxsz);
2861 }
2862
2863 /*
2864  * Specialized generation vector shifts by a non-constant scalar.
2865  */
2866
2867 typedef struct {
2868     void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
2869     void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
2870     void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
2871     void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
2872     gen_helper_gvec_2 *fno[4];
2873     TCGOpcode s_list[2];
2874     TCGOpcode v_list[2];
2875 } GVecGen2sh;
2876
2877 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
2878                            uint32_t oprsz, uint32_t tysz, TCGType type,
2879                            TCGv_i32 shift,
2880                            void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
2881 {
2882     TCGv_vec t0 = tcg_temp_new_vec(type);
2883     uint32_t i;
2884
2885     for (i = 0; i < oprsz; i += tysz) {
2886         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
2887         fni(vece, t0, t0, shift);
2888         tcg_gen_st_vec(t0, cpu_env, dofs + i);
2889     }
2890     tcg_temp_free_vec(t0);
2891 }
2892
2893 static void
2894 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
2895                uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
2896 {
2897     TCGType type;
2898     uint32_t some;
2899
2900     check_size_align(oprsz, maxsz, dofs | aofs);
2901     check_overlap_2(dofs, aofs, maxsz);
2902
2903     /* If the backend has a scalar expansion, great.  */
2904     type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
2905     if (type) {
2906         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2907         switch (type) {
2908         case TCG_TYPE_V256:
2909             some = QEMU_ALIGN_DOWN(oprsz, 32);
2910             expand_2sh_vec(vece, dofs, aofs, some, 32,
2911                            TCG_TYPE_V256, shift, g->fniv_s);
2912             if (some == oprsz) {
2913                 break;
2914             }
2915             dofs += some;
2916             aofs += some;
2917             oprsz -= some;
2918             maxsz -= some;
2919             /* fallthru */
2920         case TCG_TYPE_V128:
2921             expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
2922                            TCG_TYPE_V128, shift, g->fniv_s);
2923             break;
2924         case TCG_TYPE_V64:
2925             expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
2926                            TCG_TYPE_V64, shift, g->fniv_s);
2927             break;
2928         default:
2929             g_assert_not_reached();
2930         }
2931         tcg_swap_vecop_list(hold_list);
2932         goto clear_tail;
2933     }
2934
2935     /* If the backend supports variable vector shifts, also cool.  */
2936     type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
2937     if (type) {
2938         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2939         TCGv_vec v_shift = tcg_temp_new_vec(type);
2940
2941         if (vece == MO_64) {
2942             TCGv_i64 sh64 = tcg_temp_new_i64();
2943             tcg_gen_extu_i32_i64(sh64, shift);
2944             tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
2945             tcg_temp_free_i64(sh64);
2946         } else {
2947             tcg_gen_dup_i32_vec(vece, v_shift, shift);
2948         }
2949
2950         switch (type) {
2951         case TCG_TYPE_V256:
2952             some = QEMU_ALIGN_DOWN(oprsz, 32);
2953             expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
2954                           v_shift, false, g->fniv_v);
2955             if (some == oprsz) {
2956                 break;
2957             }
2958             dofs += some;
2959             aofs += some;
2960             oprsz -= some;
2961             maxsz -= some;
2962             /* fallthru */
2963         case TCG_TYPE_V128:
2964             expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
2965                           v_shift, false, g->fniv_v);
2966             break;
2967         case TCG_TYPE_V64:
2968             expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
2969                           v_shift, false, g->fniv_v);
2970             break;
2971         default:
2972             g_assert_not_reached();
2973         }
2974         tcg_temp_free_vec(v_shift);
2975         tcg_swap_vecop_list(hold_list);
2976         goto clear_tail;
2977     }
2978
2979     /* Otherwise fall back to integral... */
2980     if (vece == MO_32 && check_size_impl(oprsz, 4)) {
2981         expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
2982     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
2983         TCGv_i64 sh64 = tcg_temp_new_i64();
2984         tcg_gen_extu_i32_i64(sh64, shift);
2985         expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
2986         tcg_temp_free_i64(sh64);
2987     } else {
2988         TCGv_ptr a0 = tcg_temp_new_ptr();
2989         TCGv_ptr a1 = tcg_temp_new_ptr();
2990         TCGv_i32 desc = tcg_temp_new_i32();
2991
2992         tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
2993         tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
2994         tcg_gen_addi_ptr(a0, cpu_env, dofs);
2995         tcg_gen_addi_ptr(a1, cpu_env, aofs);
2996
2997         g->fno[vece](a0, a1, desc);
2998
2999         tcg_temp_free_ptr(a0);
3000         tcg_temp_free_ptr(a1);
3001         tcg_temp_free_i32(desc);
3002         return;
3003     }
3004
3005  clear_tail:
3006     if (oprsz < maxsz) {
3007         expand_clr(dofs + oprsz, maxsz - oprsz);
3008     }
3009 }
3010
3011 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
3012                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3013 {
3014     static const GVecGen2sh g = {
3015         .fni4 = tcg_gen_shl_i32,
3016         .fni8 = tcg_gen_shl_i64,
3017         .fniv_s = tcg_gen_shls_vec,
3018         .fniv_v = tcg_gen_shlv_vec,
3019         .fno = {
3020             gen_helper_gvec_shl8i,
3021             gen_helper_gvec_shl16i,
3022             gen_helper_gvec_shl32i,
3023             gen_helper_gvec_shl64i,
3024         },
3025         .s_list = { INDEX_op_shls_vec, 0 },
3026         .v_list = { INDEX_op_shlv_vec, 0 },
3027     };
3028
3029     tcg_debug_assert(vece <= MO_64);
3030     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3031 }
3032
3033 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3034                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3035 {
3036     static const GVecGen2sh g = {
3037         .fni4 = tcg_gen_shr_i32,
3038         .fni8 = tcg_gen_shr_i64,
3039         .fniv_s = tcg_gen_shrs_vec,
3040         .fniv_v = tcg_gen_shrv_vec,
3041         .fno = {
3042             gen_helper_gvec_shr8i,
3043             gen_helper_gvec_shr16i,
3044             gen_helper_gvec_shr32i,
3045             gen_helper_gvec_shr64i,
3046         },
3047         .s_list = { INDEX_op_shrs_vec, 0 },
3048         .v_list = { INDEX_op_shrv_vec, 0 },
3049     };
3050
3051     tcg_debug_assert(vece <= MO_64);
3052     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3053 }
3054
3055 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
3056                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3057 {
3058     static const GVecGen2sh g = {
3059         .fni4 = tcg_gen_sar_i32,
3060         .fni8 = tcg_gen_sar_i64,
3061         .fniv_s = tcg_gen_sars_vec,
3062         .fniv_v = tcg_gen_sarv_vec,
3063         .fno = {
3064             gen_helper_gvec_sar8i,
3065             gen_helper_gvec_sar16i,
3066             gen_helper_gvec_sar32i,
3067             gen_helper_gvec_sar64i,
3068         },
3069         .s_list = { INDEX_op_sars_vec, 0 },
3070         .v_list = { INDEX_op_sarv_vec, 0 },
3071     };
3072
3073     tcg_debug_assert(vece <= MO_64);
3074     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3075 }
3076
3077 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
3078                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3079 {
3080     static const GVecGen2sh g = {
3081         .fni4 = tcg_gen_rotl_i32,
3082         .fni8 = tcg_gen_rotl_i64,
3083         .fniv_s = tcg_gen_rotls_vec,
3084         .fniv_v = tcg_gen_rotlv_vec,
3085         .fno = {
3086             gen_helper_gvec_rotl8i,
3087             gen_helper_gvec_rotl16i,
3088             gen_helper_gvec_rotl32i,
3089             gen_helper_gvec_rotl64i,
3090         },
3091         .s_list = { INDEX_op_rotls_vec, 0 },
3092         .v_list = { INDEX_op_rotlv_vec, 0 },
3093     };
3094
3095     tcg_debug_assert(vece <= MO_64);
3096     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3097 }
3098
3099 /*
3100  * Expand D = A << (B % element bits)
3101  *
3102  * Unlike scalar shifts, where it is easy for the target front end
3103  * to include the modulo as part of the expansion.  If the target
3104  * naturally includes the modulo as part of the operation, great!
3105  * If the target has some other behaviour from out-of-range shifts,
3106  * then it could not use this function anyway, and would need to
3107  * do it's own expansion with custom functions.
3108  */
3109 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
3110                                  TCGv_vec a, TCGv_vec b)
3111 {
3112     TCGv_vec t = tcg_temp_new_vec_matching(d);
3113
3114     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3115     tcg_gen_and_vec(vece, t, t, b);
3116     tcg_gen_shlv_vec(vece, d, a, t);
3117     tcg_temp_free_vec(t);
3118 }
3119
3120 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3121 {
3122     TCGv_i32 t = tcg_temp_new_i32();
3123
3124     tcg_gen_andi_i32(t, b, 31);
3125     tcg_gen_shl_i32(d, a, t);
3126     tcg_temp_free_i32(t);
3127 }
3128
3129 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3130 {
3131     TCGv_i64 t = tcg_temp_new_i64();
3132
3133     tcg_gen_andi_i64(t, b, 63);
3134     tcg_gen_shl_i64(d, a, t);
3135     tcg_temp_free_i64(t);
3136 }
3137
3138 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3139                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3140 {
3141     static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
3142     static const GVecGen3 g[4] = {
3143         { .fniv = tcg_gen_shlv_mod_vec,
3144           .fno = gen_helper_gvec_shl8v,
3145           .opt_opc = vecop_list,
3146           .vece = MO_8 },
3147         { .fniv = tcg_gen_shlv_mod_vec,
3148           .fno = gen_helper_gvec_shl16v,
3149           .opt_opc = vecop_list,
3150           .vece = MO_16 },
3151         { .fni4 = tcg_gen_shl_mod_i32,
3152           .fniv = tcg_gen_shlv_mod_vec,
3153           .fno = gen_helper_gvec_shl32v,
3154           .opt_opc = vecop_list,
3155           .vece = MO_32 },
3156         { .fni8 = tcg_gen_shl_mod_i64,
3157           .fniv = tcg_gen_shlv_mod_vec,
3158           .fno = gen_helper_gvec_shl64v,
3159           .opt_opc = vecop_list,
3160           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3161           .vece = MO_64 },
3162     };
3163
3164     tcg_debug_assert(vece <= MO_64);
3165     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3166 }
3167
3168 /*
3169  * Similarly for logical right shifts.
3170  */
3171
3172 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
3173                                  TCGv_vec a, TCGv_vec b)
3174 {
3175     TCGv_vec t = tcg_temp_new_vec_matching(d);
3176
3177     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3178     tcg_gen_and_vec(vece, t, t, b);
3179     tcg_gen_shrv_vec(vece, d, a, t);
3180     tcg_temp_free_vec(t);
3181 }
3182
3183 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3184 {
3185     TCGv_i32 t = tcg_temp_new_i32();
3186
3187     tcg_gen_andi_i32(t, b, 31);
3188     tcg_gen_shr_i32(d, a, t);
3189     tcg_temp_free_i32(t);
3190 }
3191
3192 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3193 {
3194     TCGv_i64 t = tcg_temp_new_i64();
3195
3196     tcg_gen_andi_i64(t, b, 63);
3197     tcg_gen_shr_i64(d, a, t);
3198     tcg_temp_free_i64(t);
3199 }
3200
3201 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3202                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3203 {
3204     static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
3205     static const GVecGen3 g[4] = {
3206         { .fniv = tcg_gen_shrv_mod_vec,
3207           .fno = gen_helper_gvec_shr8v,
3208           .opt_opc = vecop_list,
3209           .vece = MO_8 },
3210         { .fniv = tcg_gen_shrv_mod_vec,
3211           .fno = gen_helper_gvec_shr16v,
3212           .opt_opc = vecop_list,
3213           .vece = MO_16 },
3214         { .fni4 = tcg_gen_shr_mod_i32,
3215           .fniv = tcg_gen_shrv_mod_vec,
3216           .fno = gen_helper_gvec_shr32v,
3217           .opt_opc = vecop_list,
3218           .vece = MO_32 },
3219         { .fni8 = tcg_gen_shr_mod_i64,
3220           .fniv = tcg_gen_shrv_mod_vec,
3221           .fno = gen_helper_gvec_shr64v,
3222           .opt_opc = vecop_list,
3223           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3224           .vece = MO_64 },
3225     };
3226
3227     tcg_debug_assert(vece <= MO_64);
3228     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3229 }
3230
3231 /*
3232  * Similarly for arithmetic right shifts.
3233  */
3234
3235 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
3236                                  TCGv_vec a, TCGv_vec b)
3237 {
3238     TCGv_vec t = tcg_temp_new_vec_matching(d);
3239
3240     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3241     tcg_gen_and_vec(vece, t, t, b);
3242     tcg_gen_sarv_vec(vece, d, a, t);
3243     tcg_temp_free_vec(t);
3244 }
3245
3246 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3247 {
3248     TCGv_i32 t = tcg_temp_new_i32();
3249
3250     tcg_gen_andi_i32(t, b, 31);
3251     tcg_gen_sar_i32(d, a, t);
3252     tcg_temp_free_i32(t);
3253 }
3254
3255 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3256 {
3257     TCGv_i64 t = tcg_temp_new_i64();
3258
3259     tcg_gen_andi_i64(t, b, 63);
3260     tcg_gen_sar_i64(d, a, t);
3261     tcg_temp_free_i64(t);
3262 }
3263
3264 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3265                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3266 {
3267     static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3268     static const GVecGen3 g[4] = {
3269         { .fniv = tcg_gen_sarv_mod_vec,
3270           .fno = gen_helper_gvec_sar8v,
3271           .opt_opc = vecop_list,
3272           .vece = MO_8 },
3273         { .fniv = tcg_gen_sarv_mod_vec,
3274           .fno = gen_helper_gvec_sar16v,
3275           .opt_opc = vecop_list,
3276           .vece = MO_16 },
3277         { .fni4 = tcg_gen_sar_mod_i32,
3278           .fniv = tcg_gen_sarv_mod_vec,
3279           .fno = gen_helper_gvec_sar32v,
3280           .opt_opc = vecop_list,
3281           .vece = MO_32 },
3282         { .fni8 = tcg_gen_sar_mod_i64,
3283           .fniv = tcg_gen_sarv_mod_vec,
3284           .fno = gen_helper_gvec_sar64v,
3285           .opt_opc = vecop_list,
3286           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3287           .vece = MO_64 },
3288     };
3289
3290     tcg_debug_assert(vece <= MO_64);
3291     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3292 }
3293
3294 /*
3295  * Similarly for rotates.
3296  */
3297
3298 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d,
3299                                   TCGv_vec a, TCGv_vec b)
3300 {
3301     TCGv_vec t = tcg_temp_new_vec_matching(d);
3302
3303     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3304     tcg_gen_and_vec(vece, t, t, b);
3305     tcg_gen_rotlv_vec(vece, d, a, t);
3306     tcg_temp_free_vec(t);
3307 }
3308
3309 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3310 {
3311     TCGv_i32 t = tcg_temp_new_i32();
3312
3313     tcg_gen_andi_i32(t, b, 31);
3314     tcg_gen_rotl_i32(d, a, t);
3315     tcg_temp_free_i32(t);
3316 }
3317
3318 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3319 {
3320     TCGv_i64 t = tcg_temp_new_i64();
3321
3322     tcg_gen_andi_i64(t, b, 63);
3323     tcg_gen_rotl_i64(d, a, t);
3324     tcg_temp_free_i64(t);
3325 }
3326
3327 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3328                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3329 {
3330     static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 };
3331     static const GVecGen3 g[4] = {
3332         { .fniv = tcg_gen_rotlv_mod_vec,
3333           .fno = gen_helper_gvec_rotl8v,
3334           .opt_opc = vecop_list,
3335           .vece = MO_8 },
3336         { .fniv = tcg_gen_rotlv_mod_vec,
3337           .fno = gen_helper_gvec_rotl16v,
3338           .opt_opc = vecop_list,
3339           .vece = MO_16 },
3340         { .fni4 = tcg_gen_rotl_mod_i32,
3341           .fniv = tcg_gen_rotlv_mod_vec,
3342           .fno = gen_helper_gvec_rotl32v,
3343           .opt_opc = vecop_list,
3344           .vece = MO_32 },
3345         { .fni8 = tcg_gen_rotl_mod_i64,
3346           .fniv = tcg_gen_rotlv_mod_vec,
3347           .fno = gen_helper_gvec_rotl64v,
3348           .opt_opc = vecop_list,
3349           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3350           .vece = MO_64 },
3351     };
3352
3353     tcg_debug_assert(vece <= MO_64);
3354     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3355 }
3356
3357 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d,
3358                                   TCGv_vec a, TCGv_vec b)
3359 {
3360     TCGv_vec t = tcg_temp_new_vec_matching(d);
3361
3362     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3363     tcg_gen_and_vec(vece, t, t, b);
3364     tcg_gen_rotrv_vec(vece, d, a, t);
3365     tcg_temp_free_vec(t);
3366 }
3367
3368 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3369 {
3370     TCGv_i32 t = tcg_temp_new_i32();
3371
3372     tcg_gen_andi_i32(t, b, 31);
3373     tcg_gen_rotr_i32(d, a, t);
3374     tcg_temp_free_i32(t);
3375 }
3376
3377 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3378 {
3379     TCGv_i64 t = tcg_temp_new_i64();
3380
3381     tcg_gen_andi_i64(t, b, 63);
3382     tcg_gen_rotr_i64(d, a, t);
3383     tcg_temp_free_i64(t);
3384 }
3385
3386 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3387                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3388 {
3389     static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 };
3390     static const GVecGen3 g[4] = {
3391         { .fniv = tcg_gen_rotrv_mod_vec,
3392           .fno = gen_helper_gvec_rotr8v,
3393           .opt_opc = vecop_list,
3394           .vece = MO_8 },
3395         { .fniv = tcg_gen_rotrv_mod_vec,
3396           .fno = gen_helper_gvec_rotr16v,
3397           .opt_opc = vecop_list,
3398           .vece = MO_16 },
3399         { .fni4 = tcg_gen_rotr_mod_i32,
3400           .fniv = tcg_gen_rotrv_mod_vec,
3401           .fno = gen_helper_gvec_rotr32v,
3402           .opt_opc = vecop_list,
3403           .vece = MO_32 },
3404         { .fni8 = tcg_gen_rotr_mod_i64,
3405           .fniv = tcg_gen_rotrv_mod_vec,
3406           .fno = gen_helper_gvec_rotr64v,
3407           .opt_opc = vecop_list,
3408           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3409           .vece = MO_64 },
3410     };
3411
3412     tcg_debug_assert(vece <= MO_64);
3413     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3414 }
3415
3416 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
3417 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3418                            uint32_t oprsz, TCGCond cond)
3419 {
3420     TCGv_i32 t0 = tcg_temp_new_i32();
3421     TCGv_i32 t1 = tcg_temp_new_i32();
3422     uint32_t i;
3423
3424     for (i = 0; i < oprsz; i += 4) {
3425         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
3426         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
3427         tcg_gen_setcond_i32(cond, t0, t0, t1);
3428         tcg_gen_neg_i32(t0, t0);
3429         tcg_gen_st_i32(t0, cpu_env, dofs + i);
3430     }
3431     tcg_temp_free_i32(t1);
3432     tcg_temp_free_i32(t0);
3433 }
3434
3435 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3436                            uint32_t oprsz, TCGCond cond)
3437 {
3438     TCGv_i64 t0 = tcg_temp_new_i64();
3439     TCGv_i64 t1 = tcg_temp_new_i64();
3440     uint32_t i;
3441
3442     for (i = 0; i < oprsz; i += 8) {
3443         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
3444         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
3445         tcg_gen_setcond_i64(cond, t0, t0, t1);
3446         tcg_gen_neg_i64(t0, t0);
3447         tcg_gen_st_i64(t0, cpu_env, dofs + i);
3448     }
3449     tcg_temp_free_i64(t1);
3450     tcg_temp_free_i64(t0);
3451 }
3452
3453 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3454                            uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3455                            TCGType type, TCGCond cond)
3456 {
3457     TCGv_vec t0 = tcg_temp_new_vec(type);
3458     TCGv_vec t1 = tcg_temp_new_vec(type);
3459     uint32_t i;
3460
3461     for (i = 0; i < oprsz; i += tysz) {
3462         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
3463         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
3464         tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
3465         tcg_gen_st_vec(t0, cpu_env, dofs + i);
3466     }
3467     tcg_temp_free_vec(t1);
3468     tcg_temp_free_vec(t0);
3469 }
3470
3471 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
3472                       uint32_t aofs, uint32_t bofs,
3473                       uint32_t oprsz, uint32_t maxsz)
3474 {
3475     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3476     static gen_helper_gvec_3 * const eq_fn[4] = {
3477         gen_helper_gvec_eq8, gen_helper_gvec_eq16,
3478         gen_helper_gvec_eq32, gen_helper_gvec_eq64
3479     };
3480     static gen_helper_gvec_3 * const ne_fn[4] = {
3481         gen_helper_gvec_ne8, gen_helper_gvec_ne16,
3482         gen_helper_gvec_ne32, gen_helper_gvec_ne64
3483     };
3484     static gen_helper_gvec_3 * const lt_fn[4] = {
3485         gen_helper_gvec_lt8, gen_helper_gvec_lt16,
3486         gen_helper_gvec_lt32, gen_helper_gvec_lt64
3487     };
3488     static gen_helper_gvec_3 * const le_fn[4] = {
3489         gen_helper_gvec_le8, gen_helper_gvec_le16,
3490         gen_helper_gvec_le32, gen_helper_gvec_le64
3491     };
3492     static gen_helper_gvec_3 * const ltu_fn[4] = {
3493         gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
3494         gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
3495     };
3496     static gen_helper_gvec_3 * const leu_fn[4] = {
3497         gen_helper_gvec_leu8, gen_helper_gvec_leu16,
3498         gen_helper_gvec_leu32, gen_helper_gvec_leu64
3499     };
3500     static gen_helper_gvec_3 * const * const fns[16] = {
3501         [TCG_COND_EQ] = eq_fn,
3502         [TCG_COND_NE] = ne_fn,
3503         [TCG_COND_LT] = lt_fn,
3504         [TCG_COND_LE] = le_fn,
3505         [TCG_COND_LTU] = ltu_fn,
3506         [TCG_COND_LEU] = leu_fn,
3507     };
3508
3509     const TCGOpcode *hold_list;
3510     TCGType type;
3511     uint32_t some;
3512
3513     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
3514     check_overlap_3(dofs, aofs, bofs, maxsz);
3515
3516     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3517         do_dup(MO_8, dofs, oprsz, maxsz,
3518                NULL, NULL, -(cond == TCG_COND_ALWAYS));
3519         return;
3520     }
3521
3522     /*
3523      * Implement inline with a vector type, if possible.
3524      * Prefer integer when 64-bit host and 64-bit comparison.
3525      */
3526     hold_list = tcg_swap_vecop_list(cmp_list);
3527     type = choose_vector_type(cmp_list, vece, oprsz,
3528                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3529     switch (type) {
3530     case TCG_TYPE_V256:
3531         /* Recall that ARM SVE allows vector sizes that are not a
3532          * power of 2, but always a multiple of 16.  The intent is
3533          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
3534          */
3535         some = QEMU_ALIGN_DOWN(oprsz, 32);
3536         expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
3537         if (some == oprsz) {
3538             break;
3539         }
3540         dofs += some;
3541         aofs += some;
3542         bofs += some;
3543         oprsz -= some;
3544         maxsz -= some;
3545         /* fallthru */
3546     case TCG_TYPE_V128:
3547         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
3548         break;
3549     case TCG_TYPE_V64:
3550         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
3551         break;
3552
3553     case 0:
3554         if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3555             expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
3556         } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3557             expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
3558         } else {
3559             gen_helper_gvec_3 * const *fn = fns[cond];
3560
3561             if (fn == NULL) {
3562                 uint32_t tmp;
3563                 tmp = aofs, aofs = bofs, bofs = tmp;
3564                 cond = tcg_swap_cond(cond);
3565                 fn = fns[cond];
3566                 assert(fn != NULL);
3567             }
3568             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
3569             oprsz = maxsz;
3570         }
3571         break;
3572
3573     default:
3574         g_assert_not_reached();
3575     }
3576     tcg_swap_vecop_list(hold_list);
3577
3578     if (oprsz < maxsz) {
3579         expand_clr(dofs + oprsz, maxsz - oprsz);
3580     }
3581 }
3582
3583 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
3584 {
3585     TCGv_i64 t = tcg_temp_new_i64();
3586
3587     tcg_gen_and_i64(t, b, a);
3588     tcg_gen_andc_i64(d, c, a);
3589     tcg_gen_or_i64(d, d, t);
3590     tcg_temp_free_i64(t);
3591 }
3592
3593 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
3594                          uint32_t bofs, uint32_t cofs,
3595                          uint32_t oprsz, uint32_t maxsz)
3596 {
3597     static const GVecGen4 g = {
3598         .fni8 = tcg_gen_bitsel_i64,
3599         .fniv = tcg_gen_bitsel_vec,
3600         .fno = gen_helper_gvec_bitsel,
3601     };
3602
3603     tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
3604 }