tcg/tcg-op-gvec.c

   1 /*
   2  * Generic vector operation expansion
   3  *
   4  * Copyright (c) 2018 Linaro
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19
  20 #include "qemu/osdep.h"
  21 #include "tcg/tcg.h"
  22 #include "tcg/tcg-op.h"
  23 #include "tcg/tcg-op-gvec.h"
  24 #include "qemu/main-loop.h"
  25 #include "tcg/tcg-gvec-desc.h"
  26
/*
 * Upper bound that check_size_impl places on the number of operations
 * it accepts for an inline expansion.
 */
#define MAX_UNROLL  4

/*
 * Empty vector-opcode list: a one-element, zero-filled array when
 * CONFIG_DEBUG_TCG is defined, otherwise simply NULL.
 */
#ifdef CONFIG_DEBUG_TCG
static const TCGOpcode vecop_list_empty[1] = { 0 };
#else
#define vecop_list_empty NULL
#endif
  34
  35
  36 /* Verify vector size and alignment rules.  OFS should be the OR of all
  37    of the operand offsets so that we can check them all at once.  */
  38 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
  39 {
  40     uint32_t max_align;
  41
  42     switch (oprsz) {
  43     case 8:
  44     case 16:
  45     case 32:
  46         tcg_debug_assert(oprsz <= maxsz);
  47         break;
  48     default:
  49         tcg_debug_assert(oprsz == maxsz);
  50         break;
  51     }
  52     tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));
  53
  54     max_align = maxsz >= 16 ? 15 : 7;
  55     tcg_debug_assert((maxsz & max_align) == 0);
  56     tcg_debug_assert((ofs & max_align) == 0);
  57 }
  58
  59 /* Verify vector overlap rules for two operands.  */
  60 static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
  61 {
  62     tcg_debug_assert(d == a || d + s <= a || a + s <= d);
  63 }
  64
  65 /* Verify vector overlap rules for three operands.  */
  66 static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
  67 {
  68     check_overlap_2(d, a, s);
  69     check_overlap_2(d, b, s);
  70     check_overlap_2(a, b, s);
  71 }
  72
  73 /* Verify vector overlap rules for four operands.  */
  74 static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
  75                             uint32_t c, uint32_t s)
  76 {
  77     check_overlap_2(d, a, s);
  78     check_overlap_2(d, b, s);
  79     check_overlap_2(d, c, s);
  80     check_overlap_2(a, b, s);
  81     check_overlap_2(a, c, s);
  82     check_overlap_2(b, c, s);
  83 }
  84
  85 /* Create a descriptor from components.  */
  86 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
  87 {
  88     uint32_t desc = 0;
  89
  90     check_size_align(oprsz, maxsz, 0);
  91     tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS));
  92
  93     oprsz = (oprsz / 8) - 1;
  94     maxsz = (maxsz / 8) - 1;
  95
  96     /*
  97      * We have just asserted in check_size_align that either
  98      * oprsz is {8,16,32} or matches maxsz.  Encode the final
  99      * case with '2', as that would otherwise map to 24.
 100      */
 101     if (oprsz == maxsz) {
 102         oprsz = 2;
 103     }
 104
 105     desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
 106     desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
 107     desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
 108
 109     return desc;
 110 }
 111
 112 /* Generate a call to a gvec-style helper with two vector operands.  */
 113 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
 114                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 115                         gen_helper_gvec_2 *fn)
 116 {
 117     TCGv_ptr a0, a1;
 118     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 119
 120     a0 = tcg_temp_new_ptr();
 121     a1 = tcg_temp_new_ptr();
 122
 123     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 124     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 125
 126     fn(a0, a1, desc);
 127
 128     tcg_temp_free_ptr(a0);
 129     tcg_temp_free_ptr(a1);
 130     tcg_temp_free_i32(desc);
 131 }
 132
 133 /* Generate a call to a gvec-style helper with two vector operands
 134    and one scalar operand.  */
 135 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
 136                          uint32_t oprsz, uint32_t maxsz, int32_t data,
 137                          gen_helper_gvec_2i *fn)
 138 {
 139     TCGv_ptr a0, a1;
 140     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 141
 142     a0 = tcg_temp_new_ptr();
 143     a1 = tcg_temp_new_ptr();
 144
 145     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 146     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 147
 148     fn(a0, a1, c, desc);
 149
 150     tcg_temp_free_ptr(a0);
 151     tcg_temp_free_ptr(a1);
 152     tcg_temp_free_i32(desc);
 153 }
 154
 155 /* Generate a call to a gvec-style helper with three vector operands.  */
 156 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 157                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 158                         gen_helper_gvec_3 *fn)
 159 {
 160     TCGv_ptr a0, a1, a2;
 161     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 162
 163     a0 = tcg_temp_new_ptr();
 164     a1 = tcg_temp_new_ptr();
 165     a2 = tcg_temp_new_ptr();
 166
 167     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 168     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 169     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 170
 171     fn(a0, a1, a2, desc);
 172
 173     tcg_temp_free_ptr(a0);
 174     tcg_temp_free_ptr(a1);
 175     tcg_temp_free_ptr(a2);
 176     tcg_temp_free_i32(desc);
 177 }
 178
 179 /* Generate a call to a gvec-style helper with four vector operands.  */
 180 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 181                         uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
 182                         int32_t data, gen_helper_gvec_4 *fn)
 183 {
 184     TCGv_ptr a0, a1, a2, a3;
 185     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 186
 187     a0 = tcg_temp_new_ptr();
 188     a1 = tcg_temp_new_ptr();
 189     a2 = tcg_temp_new_ptr();
 190     a3 = tcg_temp_new_ptr();
 191
 192     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 193     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 194     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 195     tcg_gen_addi_ptr(a3, cpu_env, cofs);
 196
 197     fn(a0, a1, a2, a3, desc);
 198
 199     tcg_temp_free_ptr(a0);
 200     tcg_temp_free_ptr(a1);
 201     tcg_temp_free_ptr(a2);
 202     tcg_temp_free_ptr(a3);
 203     tcg_temp_free_i32(desc);
 204 }
 205
 206 /* Generate a call to a gvec-style helper with five vector operands.  */
 207 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 208                         uint32_t cofs, uint32_t xofs, uint32_t oprsz,
 209                         uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
 210 {
 211     TCGv_ptr a0, a1, a2, a3, a4;
 212     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 213
 214     a0 = tcg_temp_new_ptr();
 215     a1 = tcg_temp_new_ptr();
 216     a2 = tcg_temp_new_ptr();
 217     a3 = tcg_temp_new_ptr();
 218     a4 = tcg_temp_new_ptr();
 219
 220     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 221     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 222     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 223     tcg_gen_addi_ptr(a3, cpu_env, cofs);
 224     tcg_gen_addi_ptr(a4, cpu_env, xofs);
 225
 226     fn(a0, a1, a2, a3, a4, desc);
 227
 228     tcg_temp_free_ptr(a0);
 229     tcg_temp_free_ptr(a1);
 230     tcg_temp_free_ptr(a2);
 231     tcg_temp_free_ptr(a3);
 232     tcg_temp_free_ptr(a4);
 233     tcg_temp_free_i32(desc);
 234 }
 235
 236 /* Generate a call to a gvec-style helper with three vector operands
 237    and an extra pointer operand.  */
 238 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
 239                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
 240                         int32_t data, gen_helper_gvec_2_ptr *fn)
 241 {
 242     TCGv_ptr a0, a1;
 243     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 244
 245     a0 = tcg_temp_new_ptr();
 246     a1 = tcg_temp_new_ptr();
 247
 248     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 249     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 250
 251     fn(a0, a1, ptr, desc);
 252
 253     tcg_temp_free_ptr(a0);
 254     tcg_temp_free_ptr(a1);
 255     tcg_temp_free_i32(desc);
 256 }
 257
 258 /* Generate a call to a gvec-style helper with three vector operands
 259    and an extra pointer operand.  */
 260 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 261                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
 262                         int32_t data, gen_helper_gvec_3_ptr *fn)
 263 {
 264     TCGv_ptr a0, a1, a2;
 265     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 266
 267     a0 = tcg_temp_new_ptr();
 268     a1 = tcg_temp_new_ptr();
 269     a2 = tcg_temp_new_ptr();
 270
 271     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 272     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 273     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 274
 275     fn(a0, a1, a2, ptr, desc);
 276
 277     tcg_temp_free_ptr(a0);
 278     tcg_temp_free_ptr(a1);
 279     tcg_temp_free_ptr(a2);
 280     tcg_temp_free_i32(desc);
 281 }
 282
 283 /* Generate a call to a gvec-style helper with four vector operands
 284    and an extra pointer operand.  */
 285 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 286                         uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
 287                         uint32_t maxsz, int32_t data,
 288                         gen_helper_gvec_4_ptr *fn)
 289 {
 290     TCGv_ptr a0, a1, a2, a3;
 291     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 292
 293     a0 = tcg_temp_new_ptr();
 294     a1 = tcg_temp_new_ptr();
 295     a2 = tcg_temp_new_ptr();
 296     a3 = tcg_temp_new_ptr();
 297
 298     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 299     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 300     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 301     tcg_gen_addi_ptr(a3, cpu_env, cofs);
 302
 303     fn(a0, a1, a2, a3, ptr, desc);
 304
 305     tcg_temp_free_ptr(a0);
 306     tcg_temp_free_ptr(a1);
 307     tcg_temp_free_ptr(a2);
 308     tcg_temp_free_ptr(a3);
 309     tcg_temp_free_i32(desc);
 310 }
 311
 312 /* Generate a call to a gvec-style helper with five vector operands
 313    and an extra pointer operand.  */
 314 void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 315                         uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
 316                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 317                         gen_helper_gvec_5_ptr *fn)
 318 {
 319     TCGv_ptr a0, a1, a2, a3, a4;
 320     TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 321
 322     a0 = tcg_temp_new_ptr();
 323     a1 = tcg_temp_new_ptr();
 324     a2 = tcg_temp_new_ptr();
 325     a3 = tcg_temp_new_ptr();
 326     a4 = tcg_temp_new_ptr();
 327
 328     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 329     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 330     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 331     tcg_gen_addi_ptr(a3, cpu_env, cofs);
 332     tcg_gen_addi_ptr(a4, cpu_env, eofs);
 333
 334     fn(a0, a1, a2, a3, a4, ptr, desc);
 335
 336     tcg_temp_free_ptr(a0);
 337     tcg_temp_free_ptr(a1);
 338     tcg_temp_free_ptr(a2);
 339     tcg_temp_free_ptr(a3);
 340     tcg_temp_free_ptr(a4);
 341     tcg_temp_free_i32(desc);
 342 }
 343
 344 /* Return true if we want to implement something of OPRSZ bytes
 345    in units of LNSZ.  This limits the expansion of inline code.  */
 346 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
 347 {
 348     uint32_t q, r;
 349
 350     if (oprsz < lnsz) {
 351         return false;
 352     }
 353
 354     q = oprsz / lnsz;
 355     r = oprsz % lnsz;
 356     tcg_debug_assert((r & 7) == 0);
 357
 358     if (lnsz < 16) {
 359         /* For sizes below 16, accept no remainder. */
 360         if (r != 0) {
 361             return false;
 362         }
 363     } else {
 364         /*
 365          * Recall that ARM SVE allows vector sizes that are not a
 366          * power of 2, but always a multiple of 16.  The intent is
 367          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 368          * In addition, expand_clr needs to handle a multiple of 8.
 369          * Thus we can handle the tail with one more operation per
 370          * diminishing power of 2.
 371          */
 372         q += ctpop32(r);
 373     }
 374
 375     return q <= MAX_UNROLL;
 376 }
 377
/*
 * Forward declaration: expand_clr is defined after do_dup, which it is
 * built on, yet do_dup_store and do_dup themselves need to call it.
 */
static void expand_clr(uint32_t dofs, uint32_t maxsz);
 379
 380 /* Duplicate C as per VECE.  */
 381 uint64_t (dup_const)(unsigned vece, uint64_t c)
 382 {
 383     switch (vece) {
 384     case MO_8:
 385         return 0x0101010101010101ull * (uint8_t)c;
 386     case MO_16:
 387         return 0x0001000100010001ull * (uint16_t)c;
 388     case MO_32:
 389         return 0x0000000100000001ull * (uint32_t)c;
 390     case MO_64:
 391         return c;
 392     default:
 393         g_assert_not_reached();
 394     }
 395 }
 396
 397 /* Duplicate IN into OUT as per VECE.  */
 398 static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
 399 {
 400     switch (vece) {
 401     case MO_8:
 402         tcg_gen_ext8u_i32(out, in);
 403         tcg_gen_muli_i32(out, out, 0x01010101);
 404         break;
 405     case MO_16:
 406         tcg_gen_deposit_i32(out, in, in, 16, 16);
 407         break;
 408     case MO_32:
 409         tcg_gen_mov_i32(out, in);
 410         break;
 411     default:
 412         g_assert_not_reached();
 413     }
 414 }
 415
 416 static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
 417 {
 418     switch (vece) {
 419     case MO_8:
 420         tcg_gen_ext8u_i64(out, in);
 421         tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
 422         break;
 423     case MO_16:
 424         tcg_gen_ext16u_i64(out, in);
 425         tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
 426         break;
 427     case MO_32:
 428         tcg_gen_deposit_i64(out, in, in, 32, 32);
 429         break;
 430     case MO_64:
 431         tcg_gen_mov_i64(out, in);
 432         break;
 433     default:
 434         g_assert_not_reached();
 435     }
 436 }
 437
 438 /* Select a supported vector type for implementing an operation on SIZE
 439  * bytes.  If OP is 0, assume that the real operation to be performed is
 440  * required by all backends.  Otherwise, make sure than OP can be performed
 441  * on elements of size VECE in the selected type.  Do not select V64 if
 442  * PREFER_I64 is true.  Return 0 if no vector type is selected.
 443  */
 444 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
 445                                   uint32_t size, bool prefer_i64)
 446 {
 447     /*
 448      * Recall that ARM SVE allows vector sizes that are not a
 449      * power of 2, but always a multiple of 16.  The intent is
 450      * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 451      * It is hard to imagine a case in which v256 is supported
 452      * but v128 is not, but check anyway.
 453      * In addition, expand_clr needs to handle a multiple of 8.
 454      */
 455     if (TCG_TARGET_HAS_v256 &&
 456         check_size_impl(size, 32) &&
 457         tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
 458         (!(size & 16) ||
 459          (TCG_TARGET_HAS_v128 &&
 460           tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
 461         (!(size & 8) ||
 462          (TCG_TARGET_HAS_v64 &&
 463           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
 464         return TCG_TYPE_V256;
 465     }
 466     if (TCG_TARGET_HAS_v128 &&
 467         check_size_impl(size, 16) &&
 468         tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
 469         (!(size & 8) ||
 470          (TCG_TARGET_HAS_v64 &&
 471           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
 472         return TCG_TYPE_V128;
 473     }
 474     if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
 475         && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
 476         return TCG_TYPE_V64;
 477     }
 478     return 0;
 479 }
 480
 481 static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
 482                          uint32_t maxsz, TCGv_vec t_vec)
 483 {
 484     uint32_t i = 0;
 485
 486     tcg_debug_assert(oprsz >= 8);
 487
 488     /*
 489      * This may be expand_clr for the tail of an operation, e.g.
 490      * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
 491      * are misaligned wrt the maximum vector size, so do that first.
 492      */
 493     if (dofs & 8) {
 494         tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
 495         i += 8;
 496     }
 497
 498     switch (type) {
 499     case TCG_TYPE_V256:
 500         /*
 501          * Recall that ARM SVE allows vector sizes that are not a
 502          * power of 2, but always a multiple of 16.  The intent is
 503          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 504          */
 505         for (; i + 32 <= oprsz; i += 32) {
 506             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
 507         }
 508         /* fallthru */
 509     case TCG_TYPE_V128:
 510         for (; i + 16 <= oprsz; i += 16) {
 511             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
 512         }
 513         break;
 514     case TCG_TYPE_V64:
 515         for (; i < oprsz; i += 8) {
 516             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
 517         }
 518         break;
 519     default:
 520         g_assert_not_reached();
 521     }
 522
 523     if (oprsz < maxsz) {
 524         expand_clr(dofs + oprsz, maxsz - oprsz);
 525     }
 526 }
 527
/*
 * Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C,
 * each replicated element being of size VECE.
 *
 * Only one of IN_32 or IN_64 may be set (non-NULL);
 * IN_C is used if IN_32 and IN_64 are both unset.
 *
 * Three strategies are tried in order:
 *   1. inline stores of a host vector, if choose_vector_type finds one;
 *   2. inline stores of a 32-bit or 64-bit integer, if check_size_impl
 *      accepts the size;
 *   3. a call to one of the out-of-line gvec_dup helpers.
 * For the two inline strategies the bytes between OPRSZ and MAXSZ are
 * cleared with expand_clr; for the helper, both sizes are passed in the
 * descriptor (presumably the helper clears the tail itself -- confirm
 * against the helper implementation).
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    /* A 32-bit variable input cannot supply a 64-bit element.  */
    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /*
     * Constant input: replicate it to 64 bits up front.
     * If we're storing 0, expand oprsz to maxsz, so that the tail is
     * written by the same stores instead of a separate expand_clr.
     */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
        }
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and no variable dup.
     */
    type = choose_vector_type(NULL, vece, oprsz,
                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                               && (in_64 == NULL || vece == MO_64)));
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            tcg_gen_dupi_vec(vece, t_vec, in_c);
        }
        /* do_dup_store also takes care of clearing up to maxsz.  */
        do_dup_store(type, dofs, oprsz, maxsz, t_vec);
        tcg_temp_free_vec(t_vec);
        return;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        /* Exactly one of t_64 / t_32 is set by the selection below.  */
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_new_i32();
                gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_new_i64();
            gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_const_i64(in_c);
            } else {
                /* in_c is already replicated, so its low half suffices.  */
                t_32 = tcg_const_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        /* The 64-bit helper takes an i64 value; in_32 cannot occur here.  */
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_const_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
            tcg_temp_free_i64(t_64);
        }
    } else {
        /* The narrower helpers all take an i32 value; index them by vece.  */
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else {
            t_32 = tcg_temp_new_i32();
            if (in_64) {
                /* Take the low 32 bits of the 64-bit variable.  */
                tcg_gen_extrl_i64_i32(t_32, in_64);
            } else if (vece == MO_8) {
                tcg_gen_movi_i32(t_32, in_c & 0xff);
            } else if (vece == MO_16) {
                tcg_gen_movi_i32(t_32, in_c & 0xffff);
            } else {
                tcg_gen_movi_i32(t_32, in_c);
            }
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    tcg_temp_free_i32(t_desc);
    return;

    /* Common tail of the inline integer paths: zero up to maxsz.  */
 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
 676
/*
 * Zero MAXSZ bytes starting at DOFS.  Implemented as a do_dup of the
 * constant 0 with OPRSZ equal to MAXSZ and no variable input.
 */
static void expand_clr(uint32_t dofs, uint32_t maxsz)
{
    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
}
 682
 683 /* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
 684 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 685                          bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
 686 {
 687     TCGv_i32 t0 = tcg_temp_new_i32();
 688     TCGv_i32 t1 = tcg_temp_new_i32();
 689     uint32_t i;
 690
 691     for (i = 0; i < oprsz; i += 4) {
 692         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 693         if (load_dest) {
 694             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
 695         }
 696         fni(t1, t0);
 697         tcg_gen_st_i32(t1, cpu_env, dofs + i);
 698     }
 699     tcg_temp_free_i32(t0);
 700     tcg_temp_free_i32(t1);
 701 }
 702
 703 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 704                           int32_t c, bool load_dest,
 705                           void (*fni)(TCGv_i32, TCGv_i32, int32_t))
 706 {
 707     TCGv_i32 t0 = tcg_temp_new_i32();
 708     TCGv_i32 t1 = tcg_temp_new_i32();
 709     uint32_t i;
 710
 711     for (i = 0; i < oprsz; i += 4) {
 712         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 713         if (load_dest) {
 714             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
 715         }
 716         fni(t1, t0, c);
 717         tcg_gen_st_i32(t1, cpu_env, dofs + i);
 718     }
 719     tcg_temp_free_i32(t0);
 720     tcg_temp_free_i32(t1);
 721 }
 722
 723 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 724                           TCGv_i32 c, bool scalar_first,
 725                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
 726 {
 727     TCGv_i32 t0 = tcg_temp_new_i32();
 728     TCGv_i32 t1 = tcg_temp_new_i32();
 729     uint32_t i;
 730
 731     for (i = 0; i < oprsz; i += 4) {
 732         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 733         if (scalar_first) {
 734             fni(t1, c, t0);
 735         } else {
 736             fni(t1, t0, c);
 737         }
 738         tcg_gen_st_i32(t1, cpu_env, dofs + i);
 739     }
 740     tcg_temp_free_i32(t0);
 741     tcg_temp_free_i32(t1);
 742 }
 743
 744 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
 745 static void expand_3_i32(uint32_t dofs, uint32_t aofs,
 746                          uint32_t bofs, uint32_t oprsz, bool load_dest,
 747                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
 748 {
 749     TCGv_i32 t0 = tcg_temp_new_i32();
 750     TCGv_i32 t1 = tcg_temp_new_i32();
 751     TCGv_i32 t2 = tcg_temp_new_i32();
 752     uint32_t i;
 753
 754     for (i = 0; i < oprsz; i += 4) {
 755         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 756         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
 757         if (load_dest) {
 758             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
 759         }
 760         fni(t2, t0, t1);
 761         tcg_gen_st_i32(t2, cpu_env, dofs + i);
 762     }
 763     tcg_temp_free_i32(t2);
 764     tcg_temp_free_i32(t1);
 765     tcg_temp_free_i32(t0);
 766 }
 767
 768 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 769                           uint32_t oprsz, int32_t c, bool load_dest,
 770                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
 771 {
 772     TCGv_i32 t0 = tcg_temp_new_i32();
 773     TCGv_i32 t1 = tcg_temp_new_i32();
 774     TCGv_i32 t2 = tcg_temp_new_i32();
 775     uint32_t i;
 776
 777     for (i = 0; i < oprsz; i += 4) {
 778         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 779         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
 780         if (load_dest) {
 781             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
 782         }
 783         fni(t2, t0, t1, c);
 784         tcg_gen_st_i32(t2, cpu_env, dofs + i);
 785     }
 786     tcg_temp_free_i32(t0);
 787     tcg_temp_free_i32(t1);
 788     tcg_temp_free_i32(t2);
 789 }
 790
 791 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
 792 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 793                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
 794                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
 795 {
 796     TCGv_i32 t0 = tcg_temp_new_i32();
 797     TCGv_i32 t1 = tcg_temp_new_i32();
 798     TCGv_i32 t2 = tcg_temp_new_i32();
 799     TCGv_i32 t3 = tcg_temp_new_i32();
 800     uint32_t i;
 801
 802     for (i = 0; i < oprsz; i += 4) {
 803         tcg_gen_ld_i32(t1, cpu_env, aofs + i);
 804         tcg_gen_ld_i32(t2, cpu_env, bofs + i);
 805         tcg_gen_ld_i32(t3, cpu_env, cofs + i);
 806         fni(t0, t1, t2, t3);
 807         tcg_gen_st_i32(t0, cpu_env, dofs + i);
 808         if (write_aofs) {
 809             tcg_gen_st_i32(t1, cpu_env, aofs + i);
 810         }
 811     }
 812     tcg_temp_free_i32(t3);
 813     tcg_temp_free_i32(t2);
 814     tcg_temp_free_i32(t1);
 815     tcg_temp_free_i32(t0);
 816 }
 817
 818 /* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
 819 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 820                          bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
 821 {
 822     TCGv_i64 t0 = tcg_temp_new_i64();
 823     TCGv_i64 t1 = tcg_temp_new_i64();
 824     uint32_t i;
 825
 826     for (i = 0; i < oprsz; i += 8) {
 827         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 828         if (load_dest) {
 829             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
 830         }
 831         fni(t1, t0);
 832         tcg_gen_st_i64(t1, cpu_env, dofs + i);
 833     }
 834     tcg_temp_free_i64(t0);
 835     tcg_temp_free_i64(t1);
 836 }
 837
 838 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 839                           int64_t c, bool load_dest,
 840                           void (*fni)(TCGv_i64, TCGv_i64, int64_t))
 841 {
 842     TCGv_i64 t0 = tcg_temp_new_i64();
 843     TCGv_i64 t1 = tcg_temp_new_i64();
 844     uint32_t i;
 845
 846     for (i = 0; i < oprsz; i += 8) {
 847         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 848         if (load_dest) {
 849             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
 850         }
 851         fni(t1, t0, c);
 852         tcg_gen_st_i64(t1, cpu_env, dofs + i);
 853     }
 854     tcg_temp_free_i64(t0);
 855     tcg_temp_free_i64(t1);
 856 }
 857
 858 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 859                           TCGv_i64 c, bool scalar_first,
 860                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
 861 {
 862     TCGv_i64 t0 = tcg_temp_new_i64();
 863     TCGv_i64 t1 = tcg_temp_new_i64();
 864     uint32_t i;
 865
 866     for (i = 0; i < oprsz; i += 8) {
 867         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 868         if (scalar_first) {
 869             fni(t1, c, t0);
 870         } else {
 871             fni(t1, t0, c);
 872         }
 873         tcg_gen_st_i64(t1, cpu_env, dofs + i);
 874     }
 875     tcg_temp_free_i64(t0);
 876     tcg_temp_free_i64(t1);
 877 }
 878
 879 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
 880 static void expand_3_i64(uint32_t dofs, uint32_t aofs,
 881                          uint32_t bofs, uint32_t oprsz, bool load_dest,
 882                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
 883 {
 884     TCGv_i64 t0 = tcg_temp_new_i64();
 885     TCGv_i64 t1 = tcg_temp_new_i64();
 886     TCGv_i64 t2 = tcg_temp_new_i64();
 887     uint32_t i;
 888
 889     for (i = 0; i < oprsz; i += 8) {
 890         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 891         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
 892         if (load_dest) {
 893             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
 894         }
 895         fni(t2, t0, t1);
 896         tcg_gen_st_i64(t2, cpu_env, dofs + i);
 897     }
 898     tcg_temp_free_i64(t2);
 899     tcg_temp_free_i64(t1);
 900     tcg_temp_free_i64(t0);
 901 }
 902
 903 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 904                           uint32_t oprsz, int64_t c, bool load_dest,
 905                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
 906 {
 907     TCGv_i64 t0 = tcg_temp_new_i64();
 908     TCGv_i64 t1 = tcg_temp_new_i64();
 909     TCGv_i64 t2 = tcg_temp_new_i64();
 910     uint32_t i;
 911
 912     for (i = 0; i < oprsz; i += 8) {
 913         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 914         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
 915         if (load_dest) {
 916             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
 917         }
 918         fni(t2, t0, t1, c);
 919         tcg_gen_st_i64(t2, cpu_env, dofs + i);
 920     }
 921     tcg_temp_free_i64(t0);
 922     tcg_temp_free_i64(t1);
 923     tcg_temp_free_i64(t2);
 924 }
 925
 926 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
 927 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 928                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
 929                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
 930 {
 931     TCGv_i64 t0 = tcg_temp_new_i64();
 932     TCGv_i64 t1 = tcg_temp_new_i64();
 933     TCGv_i64 t2 = tcg_temp_new_i64();
 934     TCGv_i64 t3 = tcg_temp_new_i64();
 935     uint32_t i;
 936
 937     for (i = 0; i < oprsz; i += 8) {
 938         tcg_gen_ld_i64(t1, cpu_env, aofs + i);
 939         tcg_gen_ld_i64(t2, cpu_env, bofs + i);
 940         tcg_gen_ld_i64(t3, cpu_env, cofs + i);
 941         fni(t0, t1, t2, t3);
 942         tcg_gen_st_i64(t0, cpu_env, dofs + i);
 943         if (write_aofs) {
 944             tcg_gen_st_i64(t1, cpu_env, aofs + i);
 945         }
 946     }
 947     tcg_temp_free_i64(t3);
 948     tcg_temp_free_i64(t2);
 949     tcg_temp_free_i64(t1);
 950     tcg_temp_free_i64(t0);
 951 }
 952
 953 /* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
 954 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
 955                          uint32_t oprsz, uint32_t tysz, TCGType type,
 956                          bool load_dest,
 957                          void (*fni)(unsigned, TCGv_vec, TCGv_vec))
 958 {
 959     TCGv_vec t0 = tcg_temp_new_vec(type);
 960     TCGv_vec t1 = tcg_temp_new_vec(type);
 961     uint32_t i;
 962
 963     for (i = 0; i < oprsz; i += tysz) {
 964         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
 965         if (load_dest) {
 966             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
 967         }
 968         fni(vece, t1, t0);
 969         tcg_gen_st_vec(t1, cpu_env, dofs + i);
 970     }
 971     tcg_temp_free_vec(t0);
 972     tcg_temp_free_vec(t1);
 973 }
 974
 975 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand
 976    using host vectors.  */
 977 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
 978                           uint32_t oprsz, uint32_t tysz, TCGType type,
 979                           int64_t c, bool load_dest,
 980                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
 981 {
 982     TCGv_vec t0 = tcg_temp_new_vec(type);
 983     TCGv_vec t1 = tcg_temp_new_vec(type);
 984     uint32_t i;
 985
 986     for (i = 0; i < oprsz; i += tysz) {
 987         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
 988         if (load_dest) {
 989             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
 990         }
 991         fni(vece, t1, t0, c);
 992         tcg_gen_st_vec(t1, cpu_env, dofs + i);
 993     }
 994     tcg_temp_free_vec(t0);
 995     tcg_temp_free_vec(t1);
 996 }
 997
 998 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
 999                           uint32_t oprsz, uint32_t tysz, TCGType type,
1000                           TCGv_vec c, bool scalar_first,
1001                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1002 {
1003     TCGv_vec t0 = tcg_temp_new_vec(type);
1004     TCGv_vec t1 = tcg_temp_new_vec(type);
1005     uint32_t i;
1006
1007     for (i = 0; i < oprsz; i += tysz) {
1008         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1009         if (scalar_first) {
1010             fni(vece, t1, c, t0);
1011         } else {
1012             fni(vece, t1, t0, c);
1013         }
1014         tcg_gen_st_vec(t1, cpu_env, dofs + i);
1015     }
1016     tcg_temp_free_vec(t0);
1017     tcg_temp_free_vec(t1);
1018 }
1019
1020 /* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
1021 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1022                          uint32_t bofs, uint32_t oprsz,
1023                          uint32_t tysz, TCGType type, bool load_dest,
1024                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1025 {
1026     TCGv_vec t0 = tcg_temp_new_vec(type);
1027     TCGv_vec t1 = tcg_temp_new_vec(type);
1028     TCGv_vec t2 = tcg_temp_new_vec(type);
1029     uint32_t i;
1030
1031     for (i = 0; i < oprsz; i += tysz) {
1032         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1033         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1034         if (load_dest) {
1035             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1036         }
1037         fni(vece, t2, t0, t1);
1038         tcg_gen_st_vec(t2, cpu_env, dofs + i);
1039     }
1040     tcg_temp_free_vec(t2);
1041     tcg_temp_free_vec(t1);
1042     tcg_temp_free_vec(t0);
1043 }
1044
1045 /*
1046  * Expand OPSZ bytes worth of three-vector operands and an immediate operand
1047  * using host vectors.
1048  */
1049 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1050                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
1051                           TCGType type, int64_t c, bool load_dest,
1052                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
1053                                       int64_t))
1054 {
1055     TCGv_vec t0 = tcg_temp_new_vec(type);
1056     TCGv_vec t1 = tcg_temp_new_vec(type);
1057     TCGv_vec t2 = tcg_temp_new_vec(type);
1058     uint32_t i;
1059
1060     for (i = 0; i < oprsz; i += tysz) {
1061         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1062         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1063         if (load_dest) {
1064             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1065         }
1066         fni(vece, t2, t0, t1, c);
1067         tcg_gen_st_vec(t2, cpu_env, dofs + i);
1068     }
1069     tcg_temp_free_vec(t0);
1070     tcg_temp_free_vec(t1);
1071     tcg_temp_free_vec(t2);
1072 }
1073
1074 /* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
1075 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1076                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1077                          uint32_t tysz, TCGType type, bool write_aofs,
1078                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1079                                      TCGv_vec, TCGv_vec))
1080 {
1081     TCGv_vec t0 = tcg_temp_new_vec(type);
1082     TCGv_vec t1 = tcg_temp_new_vec(type);
1083     TCGv_vec t2 = tcg_temp_new_vec(type);
1084     TCGv_vec t3 = tcg_temp_new_vec(type);
1085     uint32_t i;
1086
1087     for (i = 0; i < oprsz; i += tysz) {
1088         tcg_gen_ld_vec(t1, cpu_env, aofs + i);
1089         tcg_gen_ld_vec(t2, cpu_env, bofs + i);
1090         tcg_gen_ld_vec(t3, cpu_env, cofs + i);
1091         fni(vece, t0, t1, t2, t3);
1092         tcg_gen_st_vec(t0, cpu_env, dofs + i);
1093         if (write_aofs) {
1094             tcg_gen_st_vec(t1, cpu_env, aofs + i);
1095         }
1096     }
1097     tcg_temp_free_vec(t3);
1098     tcg_temp_free_vec(t2);
1099     tcg_temp_free_vec(t1);
1100     tcg_temp_free_vec(t0);
1101 }
1102
1103 /* Expand a vector two-operand operation.  */
1104 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
1105                     uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
1106 {
1107     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1108     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1109     TCGType type;
1110     uint32_t some;
1111
1112     check_size_align(oprsz, maxsz, dofs | aofs);
1113     check_overlap_2(dofs, aofs, maxsz);
1114
1115     type = 0;
1116     if (g->fniv) {
1117         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1118     }
1119     switch (type) {
1120     case TCG_TYPE_V256:
1121         /* Recall that ARM SVE allows vector sizes that are not a
1122          * power of 2, but always a multiple of 16.  The intent is
1123          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1124          */
1125         some = QEMU_ALIGN_DOWN(oprsz, 32);
1126         expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1127                      g->load_dest, g->fniv);
1128         if (some == oprsz) {
1129             break;
1130         }
1131         dofs += some;
1132         aofs += some;
1133         oprsz -= some;
1134         maxsz -= some;
1135         /* fallthru */
1136     case TCG_TYPE_V128:
1137         expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1138                      g->load_dest, g->fniv);
1139         break;
1140     case TCG_TYPE_V64:
1141         expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1142                      g->load_dest, g->fniv);
1143         break;
1144
1145     case 0:
1146         if (g->fni8 && check_size_impl(oprsz, 8)) {
1147             expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
1148         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1149             expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4);
1150         } else {
1151             assert(g->fno != NULL);
1152             tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
1153             oprsz = maxsz;
1154         }
1155         break;
1156
1157     default:
1158         g_assert_not_reached();
1159     }
1160     tcg_swap_vecop_list(hold_list);
1161
1162     if (oprsz < maxsz) {
1163         expand_clr(dofs + oprsz, maxsz - oprsz);
1164     }
1165 }
1166
1167 /* Expand a vector operation with two vectors and an immediate.  */
1168 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1169                      uint32_t maxsz, int64_t c, const GVecGen2i *g)
1170 {
1171     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1172     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1173     TCGType type;
1174     uint32_t some;
1175
1176     check_size_align(oprsz, maxsz, dofs | aofs);
1177     check_overlap_2(dofs, aofs, maxsz);
1178
1179     type = 0;
1180     if (g->fniv) {
1181         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1182     }
1183     switch (type) {
1184     case TCG_TYPE_V256:
1185         /* Recall that ARM SVE allows vector sizes that are not a
1186          * power of 2, but always a multiple of 16.  The intent is
1187          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1188          */
1189         some = QEMU_ALIGN_DOWN(oprsz, 32);
1190         expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1191                       c, g->load_dest, g->fniv);
1192         if (some == oprsz) {
1193             break;
1194         }
1195         dofs += some;
1196         aofs += some;
1197         oprsz -= some;
1198         maxsz -= some;
1199         /* fallthru */
1200     case TCG_TYPE_V128:
1201         expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1202                       c, g->load_dest, g->fniv);
1203         break;
1204     case TCG_TYPE_V64:
1205         expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1206                       c, g->load_dest, g->fniv);
1207         break;
1208
1209     case 0:
1210         if (g->fni8 && check_size_impl(oprsz, 8)) {
1211             expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
1212         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1213             expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
1214         } else {
1215             if (g->fno) {
1216                 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
1217             } else {
1218                 TCGv_i64 tcg_c = tcg_const_i64(c);
1219                 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
1220                                     maxsz, c, g->fnoi);
1221                 tcg_temp_free_i64(tcg_c);
1222             }
1223             oprsz = maxsz;
1224         }
1225         break;
1226
1227     default:
1228         g_assert_not_reached();
1229     }
1230     tcg_swap_vecop_list(hold_list);
1231
1232     if (oprsz < maxsz) {
1233         expand_clr(dofs + oprsz, maxsz - oprsz);
1234     }
1235 }
1236
1237 /* Expand a vector operation with two vectors and a scalar.  */
1238 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1239                      uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
1240 {
1241     TCGType type;
1242
1243     check_size_align(oprsz, maxsz, dofs | aofs);
1244     check_overlap_2(dofs, aofs, maxsz);
1245
1246     type = 0;
1247     if (g->fniv) {
1248         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1249     }
1250     if (type != 0) {
1251         const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1252         const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1253         TCGv_vec t_vec = tcg_temp_new_vec(type);
1254         uint32_t some;
1255
1256         tcg_gen_dup_i64_vec(g->vece, t_vec, c);
1257
1258         switch (type) {
1259         case TCG_TYPE_V256:
1260             /* Recall that ARM SVE allows vector sizes that are not a
1261              * power of 2, but always a multiple of 16.  The intent is
1262              * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1263              */
1264             some = QEMU_ALIGN_DOWN(oprsz, 32);
1265             expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1266                           t_vec, g->scalar_first, g->fniv);
1267             if (some == oprsz) {
1268                 break;
1269             }
1270             dofs += some;
1271             aofs += some;
1272             oprsz -= some;
1273             maxsz -= some;
1274             /* fallthru */
1275
1276         case TCG_TYPE_V128:
1277             expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1278                           t_vec, g->scalar_first, g->fniv);
1279             break;
1280
1281         case TCG_TYPE_V64:
1282             expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1283                           t_vec, g->scalar_first, g->fniv);
1284             break;
1285
1286         default:
1287             g_assert_not_reached();
1288         }
1289         tcg_temp_free_vec(t_vec);
1290         tcg_swap_vecop_list(hold_list);
1291     } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1292         TCGv_i64 t64 = tcg_temp_new_i64();
1293
1294         gen_dup_i64(g->vece, t64, c);
1295         expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
1296         tcg_temp_free_i64(t64);
1297     } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1298         TCGv_i32 t32 = tcg_temp_new_i32();
1299
1300         tcg_gen_extrl_i64_i32(t32, c);
1301         gen_dup_i32(g->vece, t32, t32);
1302         expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
1303         tcg_temp_free_i32(t32);
1304     } else {
1305         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
1306         return;
1307     }
1308
1309     if (oprsz < maxsz) {
1310         expand_clr(dofs + oprsz, maxsz - oprsz);
1311     }
1312 }
1313
1314 /* Expand a vector three-operand operation.  */
1315 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1316                     uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
1317 {
1318     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1319     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1320     TCGType type;
1321     uint32_t some;
1322
1323     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1324     check_overlap_3(dofs, aofs, bofs, maxsz);
1325
1326     type = 0;
1327     if (g->fniv) {
1328         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1329     }
1330     switch (type) {
1331     case TCG_TYPE_V256:
1332         /* Recall that ARM SVE allows vector sizes that are not a
1333          * power of 2, but always a multiple of 16.  The intent is
1334          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1335          */
1336         some = QEMU_ALIGN_DOWN(oprsz, 32);
1337         expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1338                      g->load_dest, g->fniv);
1339         if (some == oprsz) {
1340             break;
1341         }
1342         dofs += some;
1343         aofs += some;
1344         bofs += some;
1345         oprsz -= some;
1346         maxsz -= some;
1347         /* fallthru */
1348     case TCG_TYPE_V128:
1349         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1350                      g->load_dest, g->fniv);
1351         break;
1352     case TCG_TYPE_V64:
1353         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1354                      g->load_dest, g->fniv);
1355         break;
1356
1357     case 0:
1358         if (g->fni8 && check_size_impl(oprsz, 8)) {
1359             expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
1360         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1361             expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
1362         } else {
1363             assert(g->fno != NULL);
1364             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
1365                                maxsz, g->data, g->fno);
1366             oprsz = maxsz;
1367         }
1368         break;
1369
1370     default:
1371         g_assert_not_reached();
1372     }
1373     tcg_swap_vecop_list(hold_list);
1374
1375     if (oprsz < maxsz) {
1376         expand_clr(dofs + oprsz, maxsz - oprsz);
1377     }
1378 }
1379
1380 /* Expand a vector operation with three vectors and an immediate.  */
1381 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1382                      uint32_t oprsz, uint32_t maxsz, int64_t c,
1383                      const GVecGen3i *g)
1384 {
1385     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1386     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1387     TCGType type;
1388     uint32_t some;
1389
1390     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1391     check_overlap_3(dofs, aofs, bofs, maxsz);
1392
1393     type = 0;
1394     if (g->fniv) {
1395         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1396     }
1397     switch (type) {
1398     case TCG_TYPE_V256:
1399         /*
1400          * Recall that ARM SVE allows vector sizes that are not a
1401          * power of 2, but always a multiple of 16.  The intent is
1402          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1403          */
1404         some = QEMU_ALIGN_DOWN(oprsz, 32);
1405         expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1406                       c, g->load_dest, g->fniv);
1407         if (some == oprsz) {
1408             break;
1409         }
1410         dofs += some;
1411         aofs += some;
1412         bofs += some;
1413         oprsz -= some;
1414         maxsz -= some;
1415         /* fallthru */
1416     case TCG_TYPE_V128:
1417         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1418                       c, g->load_dest, g->fniv);
1419         break;
1420     case TCG_TYPE_V64:
1421         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1422                       c, g->load_dest, g->fniv);
1423         break;
1424
1425     case 0:
1426         if (g->fni8 && check_size_impl(oprsz, 8)) {
1427             expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
1428         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1429             expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
1430         } else {
1431             assert(g->fno != NULL);
1432             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
1433             oprsz = maxsz;
1434         }
1435         break;
1436
1437     default:
1438         g_assert_not_reached();
1439     }
1440     tcg_swap_vecop_list(hold_list);
1441
1442     if (oprsz < maxsz) {
1443         expand_clr(dofs + oprsz, maxsz - oprsz);
1444     }
1445 }
1446
1447 /* Expand a vector four-operand operation.  */
1448 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1449                     uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
1450 {
1451     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1452     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1453     TCGType type;
1454     uint32_t some;
1455
1456     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1457     check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1458
1459     type = 0;
1460     if (g->fniv) {
1461         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1462     }
1463     switch (type) {
1464     case TCG_TYPE_V256:
1465         /* Recall that ARM SVE allows vector sizes that are not a
1466          * power of 2, but always a multiple of 16.  The intent is
1467          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1468          */
1469         some = QEMU_ALIGN_DOWN(oprsz, 32);
1470         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
1471                      32, TCG_TYPE_V256, g->write_aofs, g->fniv);
1472         if (some == oprsz) {
1473             break;
1474         }
1475         dofs += some;
1476         aofs += some;
1477         bofs += some;
1478         cofs += some;
1479         oprsz -= some;
1480         maxsz -= some;
1481         /* fallthru */
1482     case TCG_TYPE_V128:
1483         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1484                      16, TCG_TYPE_V128, g->write_aofs, g->fniv);
1485         break;
1486     case TCG_TYPE_V64:
1487         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1488                      8, TCG_TYPE_V64, g->write_aofs, g->fniv);
1489         break;
1490
1491     case 0:
1492         if (g->fni8 && check_size_impl(oprsz, 8)) {
1493             expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
1494                          g->write_aofs, g->fni8);
1495         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1496             expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
1497                          g->write_aofs, g->fni4);
1498         } else {
1499             assert(g->fno != NULL);
1500             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1501                                oprsz, maxsz, g->data, g->fno);
1502             oprsz = maxsz;
1503         }
1504         break;
1505
1506     default:
1507         g_assert_not_reached();
1508     }
1509     tcg_swap_vecop_list(hold_list);
1510
1511     if (oprsz < maxsz) {
1512         expand_clr(dofs + oprsz, maxsz - oprsz);
1513     }
1514 }
1515
1516 /*
1517  * Expand specific vector operations.
1518  */
1519
/*
 * Vector move of B into A, with the (unsigned, TCGv_vec, TCGv_vec)
 * signature of a two-operand vector expander so that it can be used as
 * the .fniv member of a GVecGen2.  VECE is accepted but not used.
 */
static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mov_vec(a, b);
}
1524
1525 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1526                       uint32_t oprsz, uint32_t maxsz)
1527 {
1528     static const GVecGen2 g = {
1529         .fni8 = tcg_gen_mov_i64,
1530         .fniv = vec_mov2,
1531         .fno = gen_helper_gvec_mov,
1532         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1533     };
1534     if (dofs != aofs) {
1535         tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1536     } else {
1537         check_size_align(oprsz, maxsz, dofs);
1538         if (oprsz < maxsz) {
1539             expand_clr(dofs + oprsz, maxsz - oprsz);
1540         }
1541     }
1542 }
1543
/*
 * Expand a duplication of the 32-bit value IN into the vector at DOFS,
 * with element size VECE, OPRSZ bytes operated on and MAXSZ the total
 * size of the destination.  The size and alignment of the arguments are
 * checked, VECE is asserted to be no larger than MO_32, and the work is
 * delegated to do_dup with IN as the 32-bit source.
 */
void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i32 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_32);
    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
}
1551
/*
 * Expand a duplication of the 64-bit value IN into the vector at DOFS,
 * with element size VECE, OPRSZ bytes operated on and MAXSZ the total
 * size of the destination.  The size and alignment of the arguments are
 * checked, VECE is asserted to be no larger than MO_64, and the work is
 * delegated to do_dup with IN as the 64-bit source.
 */
void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i64 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_64);
    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
}
1559
1560 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1561                           uint32_t oprsz, uint32_t maxsz)
1562 {
1563     check_size_align(oprsz, maxsz, dofs);
1564     if (vece <= MO_64) {
1565         TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
1566         if (type != 0) {
1567             TCGv_vec t_vec = tcg_temp_new_vec(type);
1568             tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
1569             do_dup_store(type, dofs, oprsz, maxsz, t_vec);
1570             tcg_temp_free_vec(t_vec);
1571         } else if (vece <= MO_32) {
1572             TCGv_i32 in = tcg_temp_new_i32();
1573             switch (vece) {
1574             case MO_8:
1575                 tcg_gen_ld8u_i32(in, cpu_env, aofs);
1576                 break;
1577             case MO_16:
1578                 tcg_gen_ld16u_i32(in, cpu_env, aofs);
1579                 break;
1580             default:
1581                 tcg_gen_ld_i32(in, cpu_env, aofs);
1582                 break;
1583             }
1584             do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1585             tcg_temp_free_i32(in);
1586         } else {
1587             TCGv_i64 in = tcg_temp_new_i64();
1588             tcg_gen_ld_i64(in, cpu_env, aofs);
1589             do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1590             tcg_temp_free_i64(in);
1591         }
1592     } else if (vece == 4) {
1593         /* 128-bit duplicate.  */
1594         int i;
1595
1596         tcg_debug_assert(oprsz >= 16);
1597         if (TCG_TARGET_HAS_v128) {
1598             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1599
1600             tcg_gen_ld_vec(in, cpu_env, aofs);
1601             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1602                 tcg_gen_st_vec(in, cpu_env, dofs + i);
1603             }
1604             tcg_temp_free_vec(in);
1605         } else {
1606             TCGv_i64 in0 = tcg_temp_new_i64();
1607             TCGv_i64 in1 = tcg_temp_new_i64();
1608
1609             tcg_gen_ld_i64(in0, cpu_env, aofs);
1610             tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
1611             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1612                 tcg_gen_st_i64(in0, cpu_env, dofs + i);
1613                 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
1614             }
1615             tcg_temp_free_i64(in0);
1616             tcg_temp_free_i64(in1);
1617         }
1618         if (oprsz < maxsz) {
1619             expand_clr(dofs + oprsz, maxsz - oprsz);
1620         }
1621     } else if (vece == 5) {
1622         /* 256-bit duplicate.  */
1623         int i;
1624
1625         tcg_debug_assert(oprsz >= 32);
1626         tcg_debug_assert(oprsz % 32 == 0);
1627         if (TCG_TARGET_HAS_v256) {
1628             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);
1629
1630             tcg_gen_ld_vec(in, cpu_env, aofs);
1631             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1632                 tcg_gen_st_vec(in, cpu_env, dofs + i);
1633             }
1634             tcg_temp_free_vec(in);
1635         } else if (TCG_TARGET_HAS_v128) {
1636             TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
1637             TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);
1638
1639             tcg_gen_ld_vec(in0, cpu_env, aofs);
1640             tcg_gen_ld_vec(in1, cpu_env, aofs + 16);
1641             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1642                 tcg_gen_st_vec(in0, cpu_env, dofs + i);
1643                 tcg_gen_st_vec(in1, cpu_env, dofs + i + 16);
1644             }
1645             tcg_temp_free_vec(in0);
1646             tcg_temp_free_vec(in1);
1647         } else {
1648             TCGv_i64 in[4];
1649             int j;
1650
1651             for (j = 0; j < 4; ++j) {
1652                 in[j] = tcg_temp_new_i64();
1653                 tcg_gen_ld_i64(in[j], cpu_env, aofs + j * 8);
1654             }
1655             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1656                 for (j = 0; j < 4; ++j) {
1657                     tcg_gen_st_i64(in[j], cpu_env, dofs + i + j * 8);
1658                 }
1659             }
1660             for (j = 0; j < 4; ++j) {
1661                 tcg_temp_free_i64(in[j]);
1662             }
1663         }
1664         if (oprsz < maxsz) {
1665             expand_clr(dofs + oprsz, maxsz - oprsz);
1666         }
1667     } else {
1668         g_assert_not_reached();
1669     }
1670 }
1671
/*
 * Expand a duplication of the constant X into the vector at DOFS, with
 * element size VECE, OPRSZ bytes operated on and MAXSZ the total size of
 * the destination.  After the size and alignment checks the work is
 * delegated to do_dup with no register source, only the constant.
 */
void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, uint64_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x);
}
1678
/*
 * Expand a vector NOT of OPRSZ bytes from AOFS into DOFS, where MAXSZ is
 * the total size of the destination.  VECE is accepted but not used.
 * The operation is described by a GVecGen2 table and handed to
 * tcg_gen_gvec_2, which picks among the listed expanders.
 */
void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_not_i64,
        .fniv = tcg_gen_not_vec,
        .fno = gen_helper_gvec_not,
        /* Only ask for the i64 expansion when host registers are 64-bit.  */
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
}
1690
1691 /* Perform a vector addition using normal addition and a mask.  The mask
1692    should be the sign bit of each lane.  This 6-operation form is more
1693    efficient than separate additions when there are 4 or more lanes in
1694    the 64-bit operation.  */
1695 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1696 {
1697     TCGv_i64 t1 = tcg_temp_new_i64();
1698     TCGv_i64 t2 = tcg_temp_new_i64();
1699     TCGv_i64 t3 = tcg_temp_new_i64();
1700
1701     tcg_gen_andc_i64(t1, a, m);
1702     tcg_gen_andc_i64(t2, b, m);
1703     tcg_gen_xor_i64(t3, a, b);
1704     tcg_gen_add_i64(d, t1, t2);
1705     tcg_gen_and_i64(t3, t3, m);
1706     tcg_gen_xor_i64(d, d, t3);
1707
1708     tcg_temp_free_i64(t1);
1709     tcg_temp_free_i64(t2);
1710     tcg_temp_free_i64(t3);
1711 }
1712
1713 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1714 {
1715     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1716     gen_addv_mask(d, a, b, m);
1717     tcg_temp_free_i64(m);
1718 }
1719
1720 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1721 {
1722     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1723     gen_addv_mask(d, a, b, m);
1724     tcg_temp_free_i64(m);
1725 }
1726
1727 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1728 {
1729     TCGv_i64 t1 = tcg_temp_new_i64();
1730     TCGv_i64 t2 = tcg_temp_new_i64();
1731
1732     tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1733     tcg_gen_add_i64(t2, a, b);
1734     tcg_gen_add_i64(t1, t1, b);
1735     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1736
1737     tcg_temp_free_i64(t1);
1738     tcg_temp_free_i64(t2);
1739 }
1740
1741 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1742
1743 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1744                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1745 {
1746     static const GVecGen3 g[4] = {
1747         { .fni8 = tcg_gen_vec_add8_i64,
1748           .fniv = tcg_gen_add_vec,
1749           .fno = gen_helper_gvec_add8,
1750           .opt_opc = vecop_list_add,
1751           .vece = MO_8 },
1752         { .fni8 = tcg_gen_vec_add16_i64,
1753           .fniv = tcg_gen_add_vec,
1754           .fno = gen_helper_gvec_add16,
1755           .opt_opc = vecop_list_add,
1756           .vece = MO_16 },
1757         { .fni4 = tcg_gen_add_i32,
1758           .fniv = tcg_gen_add_vec,
1759           .fno = gen_helper_gvec_add32,
1760           .opt_opc = vecop_list_add,
1761           .vece = MO_32 },
1762         { .fni8 = tcg_gen_add_i64,
1763           .fniv = tcg_gen_add_vec,
1764           .fno = gen_helper_gvec_add64,
1765           .opt_opc = vecop_list_add,
1766           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1767           .vece = MO_64 },
1768     };
1769
1770     tcg_debug_assert(vece <= MO_64);
1771     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1772 }
1773
1774 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1775                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1776 {
1777     static const GVecGen2s g[4] = {
1778         { .fni8 = tcg_gen_vec_add8_i64,
1779           .fniv = tcg_gen_add_vec,
1780           .fno = gen_helper_gvec_adds8,
1781           .opt_opc = vecop_list_add,
1782           .vece = MO_8 },
1783         { .fni8 = tcg_gen_vec_add16_i64,
1784           .fniv = tcg_gen_add_vec,
1785           .fno = gen_helper_gvec_adds16,
1786           .opt_opc = vecop_list_add,
1787           .vece = MO_16 },
1788         { .fni4 = tcg_gen_add_i32,
1789           .fniv = tcg_gen_add_vec,
1790           .fno = gen_helper_gvec_adds32,
1791           .opt_opc = vecop_list_add,
1792           .vece = MO_32 },
1793         { .fni8 = tcg_gen_add_i64,
1794           .fniv = tcg_gen_add_vec,
1795           .fno = gen_helper_gvec_adds64,
1796           .opt_opc = vecop_list_add,
1797           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1798           .vece = MO_64 },
1799     };
1800
1801     tcg_debug_assert(vece <= MO_64);
1802     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1803 }
1804
1805 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
1806                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1807 {
1808     TCGv_i64 tmp = tcg_const_i64(c);
1809     tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
1810     tcg_temp_free_i64(tmp);
1811 }
1812
1813 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
1814
1815 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
1816                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1817 {
1818     static const GVecGen2s g[4] = {
1819         { .fni8 = tcg_gen_vec_sub8_i64,
1820           .fniv = tcg_gen_sub_vec,
1821           .fno = gen_helper_gvec_subs8,
1822           .opt_opc = vecop_list_sub,
1823           .vece = MO_8 },
1824         { .fni8 = tcg_gen_vec_sub16_i64,
1825           .fniv = tcg_gen_sub_vec,
1826           .fno = gen_helper_gvec_subs16,
1827           .opt_opc = vecop_list_sub,
1828           .vece = MO_16 },
1829         { .fni4 = tcg_gen_sub_i32,
1830           .fniv = tcg_gen_sub_vec,
1831           .fno = gen_helper_gvec_subs32,
1832           .opt_opc = vecop_list_sub,
1833           .vece = MO_32 },
1834         { .fni8 = tcg_gen_sub_i64,
1835           .fniv = tcg_gen_sub_vec,
1836           .fno = gen_helper_gvec_subs64,
1837           .opt_opc = vecop_list_sub,
1838           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1839           .vece = MO_64 },
1840     };
1841
1842     tcg_debug_assert(vece <= MO_64);
1843     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1844 }
1845
1846 /* Perform a vector subtraction using normal subtraction and a mask.
1847    Compare gen_addv_mask above.  */
1848 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1849 {
1850     TCGv_i64 t1 = tcg_temp_new_i64();
1851     TCGv_i64 t2 = tcg_temp_new_i64();
1852     TCGv_i64 t3 = tcg_temp_new_i64();
1853
1854     tcg_gen_or_i64(t1, a, m);
1855     tcg_gen_andc_i64(t2, b, m);
1856     tcg_gen_eqv_i64(t3, a, b);
1857     tcg_gen_sub_i64(d, t1, t2);
1858     tcg_gen_and_i64(t3, t3, m);
1859     tcg_gen_xor_i64(d, d, t3);
1860
1861     tcg_temp_free_i64(t1);
1862     tcg_temp_free_i64(t2);
1863     tcg_temp_free_i64(t3);
1864 }
1865
1866 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1867 {
1868     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1869     gen_subv_mask(d, a, b, m);
1870     tcg_temp_free_i64(m);
1871 }
1872
1873 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1874 {
1875     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1876     gen_subv_mask(d, a, b, m);
1877     tcg_temp_free_i64(m);
1878 }
1879
1880 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1881 {
1882     TCGv_i64 t1 = tcg_temp_new_i64();
1883     TCGv_i64 t2 = tcg_temp_new_i64();
1884
1885     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1886     tcg_gen_sub_i64(t2, a, b);
1887     tcg_gen_sub_i64(t1, a, t1);
1888     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1889
1890     tcg_temp_free_i64(t1);
1891     tcg_temp_free_i64(t2);
1892 }
1893
1894 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
1895                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1896 {
1897     static const GVecGen3 g[4] = {
1898         { .fni8 = tcg_gen_vec_sub8_i64,
1899           .fniv = tcg_gen_sub_vec,
1900           .fno = gen_helper_gvec_sub8,
1901           .opt_opc = vecop_list_sub,
1902           .vece = MO_8 },
1903         { .fni8 = tcg_gen_vec_sub16_i64,
1904           .fniv = tcg_gen_sub_vec,
1905           .fno = gen_helper_gvec_sub16,
1906           .opt_opc = vecop_list_sub,
1907           .vece = MO_16 },
1908         { .fni4 = tcg_gen_sub_i32,
1909           .fniv = tcg_gen_sub_vec,
1910           .fno = gen_helper_gvec_sub32,
1911           .opt_opc = vecop_list_sub,
1912           .vece = MO_32 },
1913         { .fni8 = tcg_gen_sub_i64,
1914           .fniv = tcg_gen_sub_vec,
1915           .fno = gen_helper_gvec_sub64,
1916           .opt_opc = vecop_list_sub,
1917           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1918           .vece = MO_64 },
1919     };
1920
1921     tcg_debug_assert(vece <= MO_64);
1922     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1923 }
1924
1925 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
1926
1927 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
1928                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1929 {
1930     static const GVecGen3 g[4] = {
1931         { .fniv = tcg_gen_mul_vec,
1932           .fno = gen_helper_gvec_mul8,
1933           .opt_opc = vecop_list_mul,
1934           .vece = MO_8 },
1935         { .fniv = tcg_gen_mul_vec,
1936           .fno = gen_helper_gvec_mul16,
1937           .opt_opc = vecop_list_mul,
1938           .vece = MO_16 },
1939         { .fni4 = tcg_gen_mul_i32,
1940           .fniv = tcg_gen_mul_vec,
1941           .fno = gen_helper_gvec_mul32,
1942           .opt_opc = vecop_list_mul,
1943           .vece = MO_32 },
1944         { .fni8 = tcg_gen_mul_i64,
1945           .fniv = tcg_gen_mul_vec,
1946           .fno = gen_helper_gvec_mul64,
1947           .opt_opc = vecop_list_mul,
1948           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1949           .vece = MO_64 },
1950     };
1951
1952     tcg_debug_assert(vece <= MO_64);
1953     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1954 }
1955
1956 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
1957                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1958 {
1959     static const GVecGen2s g[4] = {
1960         { .fniv = tcg_gen_mul_vec,
1961           .fno = gen_helper_gvec_muls8,
1962           .opt_opc = vecop_list_mul,
1963           .vece = MO_8 },
1964         { .fniv = tcg_gen_mul_vec,
1965           .fno = gen_helper_gvec_muls16,
1966           .opt_opc = vecop_list_mul,
1967           .vece = MO_16 },
1968         { .fni4 = tcg_gen_mul_i32,
1969           .fniv = tcg_gen_mul_vec,
1970           .fno = gen_helper_gvec_muls32,
1971           .opt_opc = vecop_list_mul,
1972           .vece = MO_32 },
1973         { .fni8 = tcg_gen_mul_i64,
1974           .fniv = tcg_gen_mul_vec,
1975           .fno = gen_helper_gvec_muls64,
1976           .opt_opc = vecop_list_mul,
1977           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1978           .vece = MO_64 },
1979     };
1980
1981     tcg_debug_assert(vece <= MO_64);
1982     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1983 }
1984
1985 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
1986                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1987 {
1988     TCGv_i64 tmp = tcg_const_i64(c);
1989     tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
1990     tcg_temp_free_i64(tmp);
1991 }
1992
1993 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1994                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1995 {
1996     static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
1997     static const GVecGen3 g[4] = {
1998         { .fniv = tcg_gen_ssadd_vec,
1999           .fno = gen_helper_gvec_ssadd8,
2000           .opt_opc = vecop_list,
2001           .vece = MO_8 },
2002         { .fniv = tcg_gen_ssadd_vec,
2003           .fno = gen_helper_gvec_ssadd16,
2004           .opt_opc = vecop_list,
2005           .vece = MO_16 },
2006         { .fniv = tcg_gen_ssadd_vec,
2007           .fno = gen_helper_gvec_ssadd32,
2008           .opt_opc = vecop_list,
2009           .vece = MO_32 },
2010         { .fniv = tcg_gen_ssadd_vec,
2011           .fno = gen_helper_gvec_ssadd64,
2012           .opt_opc = vecop_list,
2013           .vece = MO_64 },
2014     };
2015     tcg_debug_assert(vece <= MO_64);
2016     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2017 }
2018
2019 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
2020                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2021 {
2022     static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
2023     static const GVecGen3 g[4] = {
2024         { .fniv = tcg_gen_sssub_vec,
2025           .fno = gen_helper_gvec_sssub8,
2026           .opt_opc = vecop_list,
2027           .vece = MO_8 },
2028         { .fniv = tcg_gen_sssub_vec,
2029           .fno = gen_helper_gvec_sssub16,
2030           .opt_opc = vecop_list,
2031           .vece = MO_16 },
2032         { .fniv = tcg_gen_sssub_vec,
2033           .fno = gen_helper_gvec_sssub32,
2034           .opt_opc = vecop_list,
2035           .vece = MO_32 },
2036         { .fniv = tcg_gen_sssub_vec,
2037           .fno = gen_helper_gvec_sssub64,
2038           .opt_opc = vecop_list,
2039           .vece = MO_64 },
2040     };
2041     tcg_debug_assert(vece <= MO_64);
2042     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2043 }
2044
2045 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2046 {
2047     TCGv_i32 max = tcg_const_i32(-1);
2048     tcg_gen_add_i32(d, a, b);
2049     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
2050     tcg_temp_free_i32(max);
2051 }
2052
2053 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2054 {
2055     TCGv_i64 max = tcg_const_i64(-1);
2056     tcg_gen_add_i64(d, a, b);
2057     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
2058     tcg_temp_free_i64(max);
2059 }
2060
2061 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2062                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2063 {
2064     static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
2065     static const GVecGen3 g[4] = {
2066         { .fniv = tcg_gen_usadd_vec,
2067           .fno = gen_helper_gvec_usadd8,
2068           .opt_opc = vecop_list,
2069           .vece = MO_8 },
2070         { .fniv = tcg_gen_usadd_vec,
2071           .fno = gen_helper_gvec_usadd16,
2072           .opt_opc = vecop_list,
2073           .vece = MO_16 },
2074         { .fni4 = tcg_gen_usadd_i32,
2075           .fniv = tcg_gen_usadd_vec,
2076           .fno = gen_helper_gvec_usadd32,
2077           .opt_opc = vecop_list,
2078           .vece = MO_32 },
2079         { .fni8 = tcg_gen_usadd_i64,
2080           .fniv = tcg_gen_usadd_vec,
2081           .fno = gen_helper_gvec_usadd64,
2082           .opt_opc = vecop_list,
2083           .vece = MO_64 }
2084     };
2085     tcg_debug_assert(vece <= MO_64);
2086     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2087 }
2088
2089 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2090 {
2091     TCGv_i32 min = tcg_const_i32(0);
2092     tcg_gen_sub_i32(d, a, b);
2093     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
2094     tcg_temp_free_i32(min);
2095 }
2096
2097 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2098 {
2099     TCGv_i64 min = tcg_const_i64(0);
2100     tcg_gen_sub_i64(d, a, b);
2101     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
2102     tcg_temp_free_i64(min);
2103 }
2104
2105 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
2106                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2107 {
2108     static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
2109     static const GVecGen3 g[4] = {
2110         { .fniv = tcg_gen_ussub_vec,
2111           .fno = gen_helper_gvec_ussub8,
2112           .opt_opc = vecop_list,
2113           .vece = MO_8 },
2114         { .fniv = tcg_gen_ussub_vec,
2115           .fno = gen_helper_gvec_ussub16,
2116           .opt_opc = vecop_list,
2117           .vece = MO_16 },
2118         { .fni4 = tcg_gen_ussub_i32,
2119           .fniv = tcg_gen_ussub_vec,
2120           .fno = gen_helper_gvec_ussub32,
2121           .opt_opc = vecop_list,
2122           .vece = MO_32 },
2123         { .fni8 = tcg_gen_ussub_i64,
2124           .fniv = tcg_gen_ussub_vec,
2125           .fno = gen_helper_gvec_ussub64,
2126           .opt_opc = vecop_list,
2127           .vece = MO_64 }
2128     };
2129     tcg_debug_assert(vece <= MO_64);
2130     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2131 }
2132
2133 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
2134                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2135 {
2136     static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
2137     static const GVecGen3 g[4] = {
2138         { .fniv = tcg_gen_smin_vec,
2139           .fno = gen_helper_gvec_smin8,
2140           .opt_opc = vecop_list,
2141           .vece = MO_8 },
2142         { .fniv = tcg_gen_smin_vec,
2143           .fno = gen_helper_gvec_smin16,
2144           .opt_opc = vecop_list,
2145           .vece = MO_16 },
2146         { .fni4 = tcg_gen_smin_i32,
2147           .fniv = tcg_gen_smin_vec,
2148           .fno = gen_helper_gvec_smin32,
2149           .opt_opc = vecop_list,
2150           .vece = MO_32 },
2151         { .fni8 = tcg_gen_smin_i64,
2152           .fniv = tcg_gen_smin_vec,
2153           .fno = gen_helper_gvec_smin64,
2154           .opt_opc = vecop_list,
2155           .vece = MO_64 }
2156     };
2157     tcg_debug_assert(vece <= MO_64);
2158     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2159 }
2160
2161 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2162                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2163 {
2164     static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2165     static const GVecGen3 g[4] = {
2166         { .fniv = tcg_gen_umin_vec,
2167           .fno = gen_helper_gvec_umin8,
2168           .opt_opc = vecop_list,
2169           .vece = MO_8 },
2170         { .fniv = tcg_gen_umin_vec,
2171           .fno = gen_helper_gvec_umin16,
2172           .opt_opc = vecop_list,
2173           .vece = MO_16 },
2174         { .fni4 = tcg_gen_umin_i32,
2175           .fniv = tcg_gen_umin_vec,
2176           .fno = gen_helper_gvec_umin32,
2177           .opt_opc = vecop_list,
2178           .vece = MO_32 },
2179         { .fni8 = tcg_gen_umin_i64,
2180           .fniv = tcg_gen_umin_vec,
2181           .fno = gen_helper_gvec_umin64,
2182           .opt_opc = vecop_list,
2183           .vece = MO_64 }
2184     };
2185     tcg_debug_assert(vece <= MO_64);
2186     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2187 }
2188
2189 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2190                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2191 {
2192     static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2193     static const GVecGen3 g[4] = {
2194         { .fniv = tcg_gen_smax_vec,
2195           .fno = gen_helper_gvec_smax8,
2196           .opt_opc = vecop_list,
2197           .vece = MO_8 },
2198         { .fniv = tcg_gen_smax_vec,
2199           .fno = gen_helper_gvec_smax16,
2200           .opt_opc = vecop_list,
2201           .vece = MO_16 },
2202         { .fni4 = tcg_gen_smax_i32,
2203           .fniv = tcg_gen_smax_vec,
2204           .fno = gen_helper_gvec_smax32,
2205           .opt_opc = vecop_list,
2206           .vece = MO_32 },
2207         { .fni8 = tcg_gen_smax_i64,
2208           .fniv = tcg_gen_smax_vec,
2209           .fno = gen_helper_gvec_smax64,
2210           .opt_opc = vecop_list,
2211           .vece = MO_64 }
2212     };
2213     tcg_debug_assert(vece <= MO_64);
2214     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2215 }
2216
2217 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2218                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2219 {
2220     static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2221     static const GVecGen3 g[4] = {
2222         { .fniv = tcg_gen_umax_vec,
2223           .fno = gen_helper_gvec_umax8,
2224           .opt_opc = vecop_list,
2225           .vece = MO_8 },
2226         { .fniv = tcg_gen_umax_vec,
2227           .fno = gen_helper_gvec_umax16,
2228           .opt_opc = vecop_list,
2229           .vece = MO_16 },
2230         { .fni4 = tcg_gen_umax_i32,
2231           .fniv = tcg_gen_umax_vec,
2232           .fno = gen_helper_gvec_umax32,
2233           .opt_opc = vecop_list,
2234           .vece = MO_32 },
2235         { .fni8 = tcg_gen_umax_i64,
2236           .fniv = tcg_gen_umax_vec,
2237           .fno = gen_helper_gvec_umax64,
2238           .opt_opc = vecop_list,
2239           .vece = MO_64 }
2240     };
2241     tcg_debug_assert(vece <= MO_64);
2242     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2243 }
2244
2245 /* Perform a vector negation using normal negation and a mask.
2246    Compare gen_subv_mask above.  */
2247 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2248 {
2249     TCGv_i64 t2 = tcg_temp_new_i64();
2250     TCGv_i64 t3 = tcg_temp_new_i64();
2251
2252     tcg_gen_andc_i64(t3, m, b);
2253     tcg_gen_andc_i64(t2, b, m);
2254     tcg_gen_sub_i64(d, m, t2);
2255     tcg_gen_xor_i64(d, d, t3);
2256
2257     tcg_temp_free_i64(t2);
2258     tcg_temp_free_i64(t3);
2259 }
2260
2261 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2262 {
2263     TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
2264     gen_negv_mask(d, b, m);
2265     tcg_temp_free_i64(m);
2266 }
2267
2268 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2269 {
2270     TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
2271     gen_negv_mask(d, b, m);
2272     tcg_temp_free_i64(m);
2273 }
2274
2275 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2276 {
2277     TCGv_i64 t1 = tcg_temp_new_i64();
2278     TCGv_i64 t2 = tcg_temp_new_i64();
2279
2280     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2281     tcg_gen_neg_i64(t2, b);
2282     tcg_gen_neg_i64(t1, t1);
2283     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2284
2285     tcg_temp_free_i64(t1);
2286     tcg_temp_free_i64(t2);
2287 }
2288
2289 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2290                       uint32_t oprsz, uint32_t maxsz)
2291 {
2292     static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2293     static const GVecGen2 g[4] = {
2294         { .fni8 = tcg_gen_vec_neg8_i64,
2295           .fniv = tcg_gen_neg_vec,
2296           .fno = gen_helper_gvec_neg8,
2297           .opt_opc = vecop_list,
2298           .vece = MO_8 },
2299         { .fni8 = tcg_gen_vec_neg16_i64,
2300           .fniv = tcg_gen_neg_vec,
2301           .fno = gen_helper_gvec_neg16,
2302           .opt_opc = vecop_list,
2303           .vece = MO_16 },
2304         { .fni4 = tcg_gen_neg_i32,
2305           .fniv = tcg_gen_neg_vec,
2306           .fno = gen_helper_gvec_neg32,
2307           .opt_opc = vecop_list,
2308           .vece = MO_32 },
2309         { .fni8 = tcg_gen_neg_i64,
2310           .fniv = tcg_gen_neg_vec,
2311           .fno = gen_helper_gvec_neg64,
2312           .opt_opc = vecop_list,
2313           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2314           .vece = MO_64 },
2315     };
2316
2317     tcg_debug_assert(vece <= MO_64);
2318     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2319 }
2320
2321 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2322 {
2323     TCGv_i64 t = tcg_temp_new_i64();
2324     int nbit = 8 << vece;
2325
2326     /* Create -1 for each negative element.  */
2327     tcg_gen_shri_i64(t, b, nbit - 1);
2328     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2329     tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2330
2331     /*
2332      * Invert (via xor -1) and add one.
2333      * Because of the ordering the msb is cleared,
2334      * so we never have carry into the next element.
2335      */
2336     tcg_gen_xor_i64(d, b, t);
2337     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2338     tcg_gen_add_i64(d, d, t);
2339
2340     tcg_temp_free_i64(t);
2341 }
2342
2343 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2344 {
2345     gen_absv_mask(d, b, MO_8);
2346 }
2347
2348 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2349 {
2350     gen_absv_mask(d, b, MO_16);
2351 }
2352
2353 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2354                       uint32_t oprsz, uint32_t maxsz)
2355 {
2356     static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2357     static const GVecGen2 g[4] = {
2358         { .fni8 = tcg_gen_vec_abs8_i64,
2359           .fniv = tcg_gen_abs_vec,
2360           .fno = gen_helper_gvec_abs8,
2361           .opt_opc = vecop_list,
2362           .vece = MO_8 },
2363         { .fni8 = tcg_gen_vec_abs16_i64,
2364           .fniv = tcg_gen_abs_vec,
2365           .fno = gen_helper_gvec_abs16,
2366           .opt_opc = vecop_list,
2367           .vece = MO_16 },
2368         { .fni4 = tcg_gen_abs_i32,
2369           .fniv = tcg_gen_abs_vec,
2370           .fno = gen_helper_gvec_abs32,
2371           .opt_opc = vecop_list,
2372           .vece = MO_32 },
2373         { .fni8 = tcg_gen_abs_i64,
2374           .fniv = tcg_gen_abs_vec,
2375           .fno = gen_helper_gvec_abs64,
2376           .opt_opc = vecop_list,
2377           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2378           .vece = MO_64 },
2379     };
2380
2381     tcg_debug_assert(vece <= MO_64);
2382     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2383 }
2384
2385 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2386                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2387 {
2388     static const GVecGen3 g = {
2389         .fni8 = tcg_gen_and_i64,
2390         .fniv = tcg_gen_and_vec,
2391         .fno = gen_helper_gvec_and,
2392         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2393     };
2394
2395     if (aofs == bofs) {
2396         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2397     } else {
2398         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2399     }
2400 }
2401
2402 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2403                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2404 {
2405     static const GVecGen3 g = {
2406         .fni8 = tcg_gen_or_i64,
2407         .fniv = tcg_gen_or_vec,
2408         .fno = gen_helper_gvec_or,
2409         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2410     };
2411
2412     if (aofs == bofs) {
2413         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2414     } else {
2415         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2416     }
2417 }
2418
2419 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2420                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2421 {
2422     static const GVecGen3 g = {
2423         .fni8 = tcg_gen_xor_i64,
2424         .fniv = tcg_gen_xor_vec,
2425         .fno = gen_helper_gvec_xor,
2426         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2427     };
2428
2429     if (aofs == bofs) {
2430         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2431     } else {
2432         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2433     }
2434 }
2435
2436 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2437                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2438 {
2439     static const GVecGen3 g = {
2440         .fni8 = tcg_gen_andc_i64,
2441         .fniv = tcg_gen_andc_vec,
2442         .fno = gen_helper_gvec_andc,
2443         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2444     };
2445
2446     if (aofs == bofs) {
2447         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2448     } else {
2449         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2450     }
2451 }
2452
2453 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2454                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2455 {
2456     static const GVecGen3 g = {
2457         .fni8 = tcg_gen_orc_i64,
2458         .fniv = tcg_gen_orc_vec,
2459         .fno = gen_helper_gvec_orc,
2460         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2461     };
2462
2463     if (aofs == bofs) {
2464         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2465     } else {
2466         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2467     }
2468 }
2469
2470 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2471                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2472 {
2473     static const GVecGen3 g = {
2474         .fni8 = tcg_gen_nand_i64,
2475         .fniv = tcg_gen_nand_vec,
2476         .fno = gen_helper_gvec_nand,
2477         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2478     };
2479
2480     if (aofs == bofs) {
2481         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2482     } else {
2483         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2484     }
2485 }
2486
2487 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2488                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2489 {
2490     static const GVecGen3 g = {
2491         .fni8 = tcg_gen_nor_i64,
2492         .fniv = tcg_gen_nor_vec,
2493         .fno = gen_helper_gvec_nor,
2494         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2495     };
2496
2497     if (aofs == bofs) {
2498         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2499     } else {
2500         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2501     }
2502 }
2503
2504 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2505                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2506 {
2507     static const GVecGen3 g = {
2508         .fni8 = tcg_gen_eqv_i64,
2509         .fniv = tcg_gen_eqv_vec,
2510         .fno = gen_helper_gvec_eqv,
2511         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2512     };
2513
2514     if (aofs == bofs) {
2515         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2516     } else {
2517         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2518     }
2519 }
2520
2521 static const GVecGen2s gop_ands = {
2522     .fni8 = tcg_gen_and_i64,
2523     .fniv = tcg_gen_and_vec,
2524     .fno = gen_helper_gvec_ands,
2525     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2526     .vece = MO_64
2527 };
2528
2529 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2530                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2531 {
2532     TCGv_i64 tmp = tcg_temp_new_i64();
2533     gen_dup_i64(vece, tmp, c);
2534     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2535     tcg_temp_free_i64(tmp);
2536 }
2537
2538 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2539                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2540 {
2541     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2542     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2543     tcg_temp_free_i64(tmp);
2544 }
2545
2546 static const GVecGen2s gop_xors = {
2547     .fni8 = tcg_gen_xor_i64,
2548     .fniv = tcg_gen_xor_vec,
2549     .fno = gen_helper_gvec_xors,
2550     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2551     .vece = MO_64
2552 };
2553
2554 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2555                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2556 {
2557     TCGv_i64 tmp = tcg_temp_new_i64();
2558     gen_dup_i64(vece, tmp, c);
2559     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2560     tcg_temp_free_i64(tmp);
2561 }
2562
2563 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2564                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2565 {
2566     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2567     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2568     tcg_temp_free_i64(tmp);
2569 }
2570
2571 static const GVecGen2s gop_ors = {
2572     .fni8 = tcg_gen_or_i64,
2573     .fniv = tcg_gen_or_vec,
2574     .fno = gen_helper_gvec_ors,
2575     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2576     .vece = MO_64
2577 };
2578
2579 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2580                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2581 {
2582     TCGv_i64 tmp = tcg_temp_new_i64();
2583     gen_dup_i64(vece, tmp, c);
2584     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2585     tcg_temp_free_i64(tmp);
2586 }
2587
2588 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2589                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2590 {
2591     TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2592     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2593     tcg_temp_free_i64(tmp);
2594 }
2595
2596 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2597 {
2598     uint64_t mask = dup_const(MO_8, 0xff << c);
2599     tcg_gen_shli_i64(d, a, c);
2600     tcg_gen_andi_i64(d, d, mask);
2601 }
2602
2603 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2604 {
2605     uint64_t mask = dup_const(MO_16, 0xffff << c);
2606     tcg_gen_shli_i64(d, a, c);
2607     tcg_gen_andi_i64(d, d, mask);
2608 }
2609
2610 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2611                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2612 {
2613     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2614     static const GVecGen2i g[4] = {
2615         { .fni8 = tcg_gen_vec_shl8i_i64,
2616           .fniv = tcg_gen_shli_vec,
2617           .fno = gen_helper_gvec_shl8i,
2618           .opt_opc = vecop_list,
2619           .vece = MO_8 },
2620         { .fni8 = tcg_gen_vec_shl16i_i64,
2621           .fniv = tcg_gen_shli_vec,
2622           .fno = gen_helper_gvec_shl16i,
2623           .opt_opc = vecop_list,
2624           .vece = MO_16 },
2625         { .fni4 = tcg_gen_shli_i32,
2626           .fniv = tcg_gen_shli_vec,
2627           .fno = gen_helper_gvec_shl32i,
2628           .opt_opc = vecop_list,
2629           .vece = MO_32 },
2630         { .fni8 = tcg_gen_shli_i64,
2631           .fniv = tcg_gen_shli_vec,
2632           .fno = gen_helper_gvec_shl64i,
2633           .opt_opc = vecop_list,
2634           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2635           .vece = MO_64 },
2636     };
2637
2638     tcg_debug_assert(vece <= MO_64);
2639     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2640     if (shift == 0) {
2641         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2642     } else {
2643         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2644     }
2645 }
2646
2647 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2648 {
2649     uint64_t mask = dup_const(MO_8, 0xff >> c);
2650     tcg_gen_shri_i64(d, a, c);
2651     tcg_gen_andi_i64(d, d, mask);
2652 }
2653
2654 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2655 {
2656     uint64_t mask = dup_const(MO_16, 0xffff >> c);
2657     tcg_gen_shri_i64(d, a, c);
2658     tcg_gen_andi_i64(d, d, mask);
2659 }
2660
2661 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2662                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2663 {
2664     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
2665     static const GVecGen2i g[4] = {
2666         { .fni8 = tcg_gen_vec_shr8i_i64,
2667           .fniv = tcg_gen_shri_vec,
2668           .fno = gen_helper_gvec_shr8i,
2669           .opt_opc = vecop_list,
2670           .vece = MO_8 },
2671         { .fni8 = tcg_gen_vec_shr16i_i64,
2672           .fniv = tcg_gen_shri_vec,
2673           .fno = gen_helper_gvec_shr16i,
2674           .opt_opc = vecop_list,
2675           .vece = MO_16 },
2676         { .fni4 = tcg_gen_shri_i32,
2677           .fniv = tcg_gen_shri_vec,
2678           .fno = gen_helper_gvec_shr32i,
2679           .opt_opc = vecop_list,
2680           .vece = MO_32 },
2681         { .fni8 = tcg_gen_shri_i64,
2682           .fniv = tcg_gen_shri_vec,
2683           .fno = gen_helper_gvec_shr64i,
2684           .opt_opc = vecop_list,
2685           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2686           .vece = MO_64 },
2687     };
2688
2689     tcg_debug_assert(vece <= MO_64);
2690     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2691     if (shift == 0) {
2692         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2693     } else {
2694         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2695     }
2696 }
2697
2698 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2699 {
2700     uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2701     uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2702     TCGv_i64 s = tcg_temp_new_i64();
2703
2704     tcg_gen_shri_i64(d, a, c);
2705     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2706     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2707     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2708     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2709     tcg_temp_free_i64(s);
2710 }
2711
2712 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2713 {
2714     uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2715     uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2716     TCGv_i64 s = tcg_temp_new_i64();
2717
2718     tcg_gen_shri_i64(d, a, c);
2719     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2720     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2721     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2722     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2723     tcg_temp_free_i64(s);
2724 }
2725
2726 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2727                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2728 {
2729     static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
2730     static const GVecGen2i g[4] = {
2731         { .fni8 = tcg_gen_vec_sar8i_i64,
2732           .fniv = tcg_gen_sari_vec,
2733           .fno = gen_helper_gvec_sar8i,
2734           .opt_opc = vecop_list,
2735           .vece = MO_8 },
2736         { .fni8 = tcg_gen_vec_sar16i_i64,
2737           .fniv = tcg_gen_sari_vec,
2738           .fno = gen_helper_gvec_sar16i,
2739           .opt_opc = vecop_list,
2740           .vece = MO_16 },
2741         { .fni4 = tcg_gen_sari_i32,
2742           .fniv = tcg_gen_sari_vec,
2743           .fno = gen_helper_gvec_sar32i,
2744           .opt_opc = vecop_list,
2745           .vece = MO_32 },
2746         { .fni8 = tcg_gen_sari_i64,
2747           .fniv = tcg_gen_sari_vec,
2748           .fno = gen_helper_gvec_sar64i,
2749           .opt_opc = vecop_list,
2750           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2751           .vece = MO_64 },
2752     };
2753
2754     tcg_debug_assert(vece <= MO_64);
2755     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2756     if (shift == 0) {
2757         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2758     } else {
2759         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2760     }
2761 }
2762
2763 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2764 {
2765     uint64_t mask = dup_const(MO_8, 0xff << c);
2766
2767     tcg_gen_shli_i64(d, a, c);
2768     tcg_gen_shri_i64(a, a, 8 - c);
2769     tcg_gen_andi_i64(d, d, mask);
2770     tcg_gen_andi_i64(a, a, ~mask);
2771     tcg_gen_or_i64(d, d, a);
2772 }
2773
2774 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2775 {
2776     uint64_t mask = dup_const(MO_16, 0xffff << c);
2777
2778     tcg_gen_shli_i64(d, a, c);
2779     tcg_gen_shri_i64(a, a, 16 - c);
2780     tcg_gen_andi_i64(d, d, mask);
2781     tcg_gen_andi_i64(a, a, ~mask);
2782     tcg_gen_or_i64(d, d, a);
2783 }
2784
2785 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
2786                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
2787 {
2788     static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 };
2789     static const GVecGen2i g[4] = {
2790         { .fni8 = tcg_gen_vec_rotl8i_i64,
2791           .fniv = tcg_gen_rotli_vec,
2792           .fno = gen_helper_gvec_rotl8i,
2793           .opt_opc = vecop_list,
2794           .vece = MO_8 },
2795         { .fni8 = tcg_gen_vec_rotl16i_i64,
2796           .fniv = tcg_gen_rotli_vec,
2797           .fno = gen_helper_gvec_rotl16i,
2798           .opt_opc = vecop_list,
2799           .vece = MO_16 },
2800         { .fni4 = tcg_gen_rotli_i32,
2801           .fniv = tcg_gen_rotli_vec,
2802           .fno = gen_helper_gvec_rotl32i,
2803           .opt_opc = vecop_list,
2804           .vece = MO_32 },
2805         { .fni8 = tcg_gen_rotli_i64,
2806           .fniv = tcg_gen_rotli_vec,
2807           .fno = gen_helper_gvec_rotl64i,
2808           .opt_opc = vecop_list,
2809           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2810           .vece = MO_64 },
2811     };
2812
2813     tcg_debug_assert(vece <= MO_64);
2814     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2815     if (shift == 0) {
2816         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2817     } else {
2818         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2819     }
2820 }
2821
2822 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
2823                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
2824 {
2825     tcg_debug_assert(vece <= MO_64);
2826     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2827     tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1),
2828                        oprsz, maxsz);
2829 }
2830
/*
 * Specialized generation of vector shifts by a non-constant scalar.
 */
2834
2835 typedef struct {
2836     void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
2837     void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
2838     void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
2839     void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
2840     gen_helper_gvec_2 *fno[4];
2841     TCGOpcode s_list[2];
2842     TCGOpcode v_list[2];
2843 } GVecGen2sh;
2844
2845 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
2846                            uint32_t oprsz, uint32_t tysz, TCGType type,
2847                            TCGv_i32 shift,
2848                            void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
2849 {
2850     TCGv_vec t0 = tcg_temp_new_vec(type);
2851     uint32_t i;
2852
2853     for (i = 0; i < oprsz; i += tysz) {
2854         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
2855         fni(vece, t0, t0, shift);
2856         tcg_gen_st_vec(t0, cpu_env, dofs + i);
2857     }
2858     tcg_temp_free_vec(t0);
2859 }
2860
2861 static void
2862 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
2863                uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
2864 {
2865     TCGType type;
2866     uint32_t some;
2867
2868     check_size_align(oprsz, maxsz, dofs | aofs);
2869     check_overlap_2(dofs, aofs, maxsz);
2870
2871     /* If the backend has a scalar expansion, great.  */
2872     type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
2873     if (type) {
2874         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2875         switch (type) {
2876         case TCG_TYPE_V256:
2877             some = QEMU_ALIGN_DOWN(oprsz, 32);
2878             expand_2sh_vec(vece, dofs, aofs, some, 32,
2879                            TCG_TYPE_V256, shift, g->fniv_s);
2880             if (some == oprsz) {
2881                 break;
2882             }
2883             dofs += some;
2884             aofs += some;
2885             oprsz -= some;
2886             maxsz -= some;
2887             /* fallthru */
2888         case TCG_TYPE_V128:
2889             expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
2890                            TCG_TYPE_V128, shift, g->fniv_s);
2891             break;
2892         case TCG_TYPE_V64:
2893             expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
2894                            TCG_TYPE_V64, shift, g->fniv_s);
2895             break;
2896         default:
2897             g_assert_not_reached();
2898         }
2899         tcg_swap_vecop_list(hold_list);
2900         goto clear_tail;
2901     }
2902
2903     /* If the backend supports variable vector shifts, also cool.  */
2904     type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
2905     if (type) {
2906         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2907         TCGv_vec v_shift = tcg_temp_new_vec(type);
2908
2909         if (vece == MO_64) {
2910             TCGv_i64 sh64 = tcg_temp_new_i64();
2911             tcg_gen_extu_i32_i64(sh64, shift);
2912             tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
2913             tcg_temp_free_i64(sh64);
2914         } else {
2915             tcg_gen_dup_i32_vec(vece, v_shift, shift);
2916         }
2917
2918         switch (type) {
2919         case TCG_TYPE_V256:
2920             some = QEMU_ALIGN_DOWN(oprsz, 32);
2921             expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
2922                           v_shift, false, g->fniv_v);
2923             if (some == oprsz) {
2924                 break;
2925             }
2926             dofs += some;
2927             aofs += some;
2928             oprsz -= some;
2929             maxsz -= some;
2930             /* fallthru */
2931         case TCG_TYPE_V128:
2932             expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
2933                           v_shift, false, g->fniv_v);
2934             break;
2935         case TCG_TYPE_V64:
2936             expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
2937                           v_shift, false, g->fniv_v);
2938             break;
2939         default:
2940             g_assert_not_reached();
2941         }
2942         tcg_temp_free_vec(v_shift);
2943         tcg_swap_vecop_list(hold_list);
2944         goto clear_tail;
2945     }
2946
2947     /* Otherwise fall back to integral... */
2948     if (vece == MO_32 && check_size_impl(oprsz, 4)) {
2949         expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
2950     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
2951         TCGv_i64 sh64 = tcg_temp_new_i64();
2952         tcg_gen_extu_i32_i64(sh64, shift);
2953         expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
2954         tcg_temp_free_i64(sh64);
2955     } else {
2956         TCGv_ptr a0 = tcg_temp_new_ptr();
2957         TCGv_ptr a1 = tcg_temp_new_ptr();
2958         TCGv_i32 desc = tcg_temp_new_i32();
2959
2960         tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
2961         tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
2962         tcg_gen_addi_ptr(a0, cpu_env, dofs);
2963         tcg_gen_addi_ptr(a1, cpu_env, aofs);
2964
2965         g->fno[vece](a0, a1, desc);
2966
2967         tcg_temp_free_ptr(a0);
2968         tcg_temp_free_ptr(a1);
2969         tcg_temp_free_i32(desc);
2970         return;
2971     }
2972
2973  clear_tail:
2974     if (oprsz < maxsz) {
2975         expand_clr(dofs + oprsz, maxsz - oprsz);
2976     }
2977 }
2978
2979 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
2980                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2981 {
2982     static const GVecGen2sh g = {
2983         .fni4 = tcg_gen_shl_i32,
2984         .fni8 = tcg_gen_shl_i64,
2985         .fniv_s = tcg_gen_shls_vec,
2986         .fniv_v = tcg_gen_shlv_vec,
2987         .fno = {
2988             gen_helper_gvec_shl8i,
2989             gen_helper_gvec_shl16i,
2990             gen_helper_gvec_shl32i,
2991             gen_helper_gvec_shl64i,
2992         },
2993         .s_list = { INDEX_op_shls_vec, 0 },
2994         .v_list = { INDEX_op_shlv_vec, 0 },
2995     };
2996
2997     tcg_debug_assert(vece <= MO_64);
2998     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
2999 }
3000
3001 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3002                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3003 {
3004     static const GVecGen2sh g = {
3005         .fni4 = tcg_gen_shr_i32,
3006         .fni8 = tcg_gen_shr_i64,
3007         .fniv_s = tcg_gen_shrs_vec,
3008         .fniv_v = tcg_gen_shrv_vec,
3009         .fno = {
3010             gen_helper_gvec_shr8i,
3011             gen_helper_gvec_shr16i,
3012             gen_helper_gvec_shr32i,
3013             gen_helper_gvec_shr64i,
3014         },
3015         .s_list = { INDEX_op_shrs_vec, 0 },
3016         .v_list = { INDEX_op_shrv_vec, 0 },
3017     };
3018
3019     tcg_debug_assert(vece <= MO_64);
3020     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3021 }
3022
3023 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
3024                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3025 {
3026     static const GVecGen2sh g = {
3027         .fni4 = tcg_gen_sar_i32,
3028         .fni8 = tcg_gen_sar_i64,
3029         .fniv_s = tcg_gen_sars_vec,
3030         .fniv_v = tcg_gen_sarv_vec,
3031         .fno = {
3032             gen_helper_gvec_sar8i,
3033             gen_helper_gvec_sar16i,
3034             gen_helper_gvec_sar32i,
3035             gen_helper_gvec_sar64i,
3036         },
3037         .s_list = { INDEX_op_sars_vec, 0 },
3038         .v_list = { INDEX_op_sarv_vec, 0 },
3039     };
3040
3041     tcg_debug_assert(vece <= MO_64);
3042     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3043 }
3044
3045 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
3046                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3047 {
3048     static const GVecGen2sh g = {
3049         .fni4 = tcg_gen_rotl_i32,
3050         .fni8 = tcg_gen_rotl_i64,
3051         .fniv_s = tcg_gen_rotls_vec,
3052         .fniv_v = tcg_gen_rotlv_vec,
3053         .fno = {
3054             gen_helper_gvec_rotl8i,
3055             gen_helper_gvec_rotl16i,
3056             gen_helper_gvec_rotl32i,
3057             gen_helper_gvec_rotl64i,
3058         },
3059         .s_list = { INDEX_op_rotls_vec, 0 },
3060         .v_list = { INDEX_op_rotlv_vec, 0 },
3061     };
3062
3063     tcg_debug_assert(vece <= MO_64);
3064     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3065 }
3066
/*
 * Expand D = A << (B % element bits)
 *
 * Unlike scalar shifts, where it is easy for the target front end
 * to include the modulo as part of the expansion, the modulo is
 * applied here as part of the vector expansion.  If the target
 * naturally includes the modulo as part of the operation, great!
 * If the target has some other behaviour from out-of-range shifts,
 * then it could not use this function anyway, and would need to
 * do its own expansion with custom functions.
 */
3077 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
3078                                  TCGv_vec a, TCGv_vec b)
3079 {
3080     TCGv_vec t = tcg_temp_new_vec_matching(d);
3081
3082     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3083     tcg_gen_and_vec(vece, t, t, b);
3084     tcg_gen_shlv_vec(vece, d, a, t);
3085     tcg_temp_free_vec(t);
3086 }
3087
3088 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3089 {
3090     TCGv_i32 t = tcg_temp_new_i32();
3091
3092     tcg_gen_andi_i32(t, b, 31);
3093     tcg_gen_shl_i32(d, a, t);
3094     tcg_temp_free_i32(t);
3095 }
3096
3097 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3098 {
3099     TCGv_i64 t = tcg_temp_new_i64();
3100
3101     tcg_gen_andi_i64(t, b, 63);
3102     tcg_gen_shl_i64(d, a, t);
3103     tcg_temp_free_i64(t);
3104 }
3105
3106 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3107                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3108 {
3109     static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
3110     static const GVecGen3 g[4] = {
3111         { .fniv = tcg_gen_shlv_mod_vec,
3112           .fno = gen_helper_gvec_shl8v,
3113           .opt_opc = vecop_list,
3114           .vece = MO_8 },
3115         { .fniv = tcg_gen_shlv_mod_vec,
3116           .fno = gen_helper_gvec_shl16v,
3117           .opt_opc = vecop_list,
3118           .vece = MO_16 },
3119         { .fni4 = tcg_gen_shl_mod_i32,
3120           .fniv = tcg_gen_shlv_mod_vec,
3121           .fno = gen_helper_gvec_shl32v,
3122           .opt_opc = vecop_list,
3123           .vece = MO_32 },
3124         { .fni8 = tcg_gen_shl_mod_i64,
3125           .fniv = tcg_gen_shlv_mod_vec,
3126           .fno = gen_helper_gvec_shl64v,
3127           .opt_opc = vecop_list,
3128           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3129           .vece = MO_64 },
3130     };
3131
3132     tcg_debug_assert(vece <= MO_64);
3133     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3134 }
3135
3136 /*
3137  * Similarly for logical right shifts.
3138  */
3139
3140 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
3141                                  TCGv_vec a, TCGv_vec b)
3142 {
3143     TCGv_vec t = tcg_temp_new_vec_matching(d);
3144
3145     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3146     tcg_gen_and_vec(vece, t, t, b);
3147     tcg_gen_shrv_vec(vece, d, a, t);
3148     tcg_temp_free_vec(t);
3149 }
3150
3151 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3152 {
3153     TCGv_i32 t = tcg_temp_new_i32();
3154
3155     tcg_gen_andi_i32(t, b, 31);
3156     tcg_gen_shr_i32(d, a, t);
3157     tcg_temp_free_i32(t);
3158 }
3159
3160 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3161 {
3162     TCGv_i64 t = tcg_temp_new_i64();
3163
3164     tcg_gen_andi_i64(t, b, 63);
3165     tcg_gen_shr_i64(d, a, t);
3166     tcg_temp_free_i64(t);
3167 }
3168
3169 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3170                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3171 {
3172     static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
3173     static const GVecGen3 g[4] = {
3174         { .fniv = tcg_gen_shrv_mod_vec,
3175           .fno = gen_helper_gvec_shr8v,
3176           .opt_opc = vecop_list,
3177           .vece = MO_8 },
3178         { .fniv = tcg_gen_shrv_mod_vec,
3179           .fno = gen_helper_gvec_shr16v,
3180           .opt_opc = vecop_list,
3181           .vece = MO_16 },
3182         { .fni4 = tcg_gen_shr_mod_i32,
3183           .fniv = tcg_gen_shrv_mod_vec,
3184           .fno = gen_helper_gvec_shr32v,
3185           .opt_opc = vecop_list,
3186           .vece = MO_32 },
3187         { .fni8 = tcg_gen_shr_mod_i64,
3188           .fniv = tcg_gen_shrv_mod_vec,
3189           .fno = gen_helper_gvec_shr64v,
3190           .opt_opc = vecop_list,
3191           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3192           .vece = MO_64 },
3193     };
3194
3195     tcg_debug_assert(vece <= MO_64);
3196     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3197 }
3198
3199 /*
3200  * Similarly for arithmetic right shifts.
3201  */
3202
3203 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
3204                                  TCGv_vec a, TCGv_vec b)
3205 {
3206     TCGv_vec t = tcg_temp_new_vec_matching(d);
3207
3208     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3209     tcg_gen_and_vec(vece, t, t, b);
3210     tcg_gen_sarv_vec(vece, d, a, t);
3211     tcg_temp_free_vec(t);
3212 }
3213
3214 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3215 {
3216     TCGv_i32 t = tcg_temp_new_i32();
3217
3218     tcg_gen_andi_i32(t, b, 31);
3219     tcg_gen_sar_i32(d, a, t);
3220     tcg_temp_free_i32(t);
3221 }
3222
3223 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3224 {
3225     TCGv_i64 t = tcg_temp_new_i64();
3226
3227     tcg_gen_andi_i64(t, b, 63);
3228     tcg_gen_sar_i64(d, a, t);
3229     tcg_temp_free_i64(t);
3230 }
3231
3232 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3233                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3234 {
3235     static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3236     static const GVecGen3 g[4] = {
3237         { .fniv = tcg_gen_sarv_mod_vec,
3238           .fno = gen_helper_gvec_sar8v,
3239           .opt_opc = vecop_list,
3240           .vece = MO_8 },
3241         { .fniv = tcg_gen_sarv_mod_vec,
3242           .fno = gen_helper_gvec_sar16v,
3243           .opt_opc = vecop_list,
3244           .vece = MO_16 },
3245         { .fni4 = tcg_gen_sar_mod_i32,
3246           .fniv = tcg_gen_sarv_mod_vec,
3247           .fno = gen_helper_gvec_sar32v,
3248           .opt_opc = vecop_list,
3249           .vece = MO_32 },
3250         { .fni8 = tcg_gen_sar_mod_i64,
3251           .fniv = tcg_gen_sarv_mod_vec,
3252           .fno = gen_helper_gvec_sar64v,
3253           .opt_opc = vecop_list,
3254           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3255           .vece = MO_64 },
3256     };
3257
3258     tcg_debug_assert(vece <= MO_64);
3259     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3260 }
3261
3262 /*
3263  * Similarly for rotates.
3264  */
3265
3266 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d,
3267                                   TCGv_vec a, TCGv_vec b)
3268 {
3269     TCGv_vec t = tcg_temp_new_vec_matching(d);
3270
3271     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3272     tcg_gen_and_vec(vece, t, t, b);
3273     tcg_gen_rotlv_vec(vece, d, a, t);
3274     tcg_temp_free_vec(t);
3275 }
3276
3277 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3278 {
3279     TCGv_i32 t = tcg_temp_new_i32();
3280
3281     tcg_gen_andi_i32(t, b, 31);
3282     tcg_gen_rotl_i32(d, a, t);
3283     tcg_temp_free_i32(t);
3284 }
3285
3286 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3287 {
3288     TCGv_i64 t = tcg_temp_new_i64();
3289
3290     tcg_gen_andi_i64(t, b, 63);
3291     tcg_gen_rotl_i64(d, a, t);
3292     tcg_temp_free_i64(t);
3293 }
3294
3295 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3296                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3297 {
3298     static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 };
3299     static const GVecGen3 g[4] = {
3300         { .fniv = tcg_gen_rotlv_mod_vec,
3301           .fno = gen_helper_gvec_rotl8v,
3302           .opt_opc = vecop_list,
3303           .vece = MO_8 },
3304         { .fniv = tcg_gen_rotlv_mod_vec,
3305           .fno = gen_helper_gvec_rotl16v,
3306           .opt_opc = vecop_list,
3307           .vece = MO_16 },
3308         { .fni4 = tcg_gen_rotl_mod_i32,
3309           .fniv = tcg_gen_rotlv_mod_vec,
3310           .fno = gen_helper_gvec_rotl32v,
3311           .opt_opc = vecop_list,
3312           .vece = MO_32 },
3313         { .fni8 = tcg_gen_rotl_mod_i64,
3314           .fniv = tcg_gen_rotlv_mod_vec,
3315           .fno = gen_helper_gvec_rotl64v,
3316           .opt_opc = vecop_list,
3317           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3318           .vece = MO_64 },
3319     };
3320
3321     tcg_debug_assert(vece <= MO_64);
3322     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3323 }
3324
3325 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d,
3326                                   TCGv_vec a, TCGv_vec b)
3327 {
3328     TCGv_vec t = tcg_temp_new_vec_matching(d);
3329
3330     tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3331     tcg_gen_and_vec(vece, t, t, b);
3332     tcg_gen_rotrv_vec(vece, d, a, t);
3333     tcg_temp_free_vec(t);
3334 }
3335
3336 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3337 {
3338     TCGv_i32 t = tcg_temp_new_i32();
3339
3340     tcg_gen_andi_i32(t, b, 31);
3341     tcg_gen_rotr_i32(d, a, t);
3342     tcg_temp_free_i32(t);
3343 }
3344
3345 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3346 {
3347     TCGv_i64 t = tcg_temp_new_i64();
3348
3349     tcg_gen_andi_i64(t, b, 63);
3350     tcg_gen_rotr_i64(d, a, t);
3351     tcg_temp_free_i64(t);
3352 }
3353
3354 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3355                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3356 {
3357     static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 };
3358     static const GVecGen3 g[4] = {
3359         { .fniv = tcg_gen_rotrv_mod_vec,
3360           .fno = gen_helper_gvec_rotr8v,
3361           .opt_opc = vecop_list,
3362           .vece = MO_8 },
3363         { .fniv = tcg_gen_rotrv_mod_vec,
3364           .fno = gen_helper_gvec_rotr16v,
3365           .opt_opc = vecop_list,
3366           .vece = MO_16 },
3367         { .fni4 = tcg_gen_rotr_mod_i32,
3368           .fniv = tcg_gen_rotrv_mod_vec,
3369           .fno = gen_helper_gvec_rotr32v,
3370           .opt_opc = vecop_list,
3371           .vece = MO_32 },
3372         { .fni8 = tcg_gen_rotr_mod_i64,
3373           .fniv = tcg_gen_rotrv_mod_vec,
3374           .fno = gen_helper_gvec_rotr64v,
3375           .opt_opc = vecop_list,
3376           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3377           .vece = MO_64 },
3378     };
3379
3380     tcg_debug_assert(vece <= MO_64);
3381     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3382 }
3383
3384 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
3385 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3386                            uint32_t oprsz, TCGCond cond)
3387 {
3388     TCGv_i32 t0 = tcg_temp_new_i32();
3389     TCGv_i32 t1 = tcg_temp_new_i32();
3390     uint32_t i;
3391
3392     for (i = 0; i < oprsz; i += 4) {
3393         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
3394         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
3395         tcg_gen_setcond_i32(cond, t0, t0, t1);
3396         tcg_gen_neg_i32(t0, t0);
3397         tcg_gen_st_i32(t0, cpu_env, dofs + i);
3398     }
3399     tcg_temp_free_i32(t1);
3400     tcg_temp_free_i32(t0);
3401 }
3402
3403 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3404                            uint32_t oprsz, TCGCond cond)
3405 {
3406     TCGv_i64 t0 = tcg_temp_new_i64();
3407     TCGv_i64 t1 = tcg_temp_new_i64();
3408     uint32_t i;
3409
3410     for (i = 0; i < oprsz; i += 8) {
3411         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
3412         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
3413         tcg_gen_setcond_i64(cond, t0, t0, t1);
3414         tcg_gen_neg_i64(t0, t0);
3415         tcg_gen_st_i64(t0, cpu_env, dofs + i);
3416     }
3417     tcg_temp_free_i64(t1);
3418     tcg_temp_free_i64(t0);
3419 }
3420
3421 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3422                            uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3423                            TCGType type, TCGCond cond)
3424 {
3425     TCGv_vec t0 = tcg_temp_new_vec(type);
3426     TCGv_vec t1 = tcg_temp_new_vec(type);
3427     uint32_t i;
3428
3429     for (i = 0; i < oprsz; i += tysz) {
3430         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
3431         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
3432         tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
3433         tcg_gen_st_vec(t0, cpu_env, dofs + i);
3434     }
3435     tcg_temp_free_vec(t1);
3436     tcg_temp_free_vec(t0);
3437 }
3438
3439 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
3440                       uint32_t aofs, uint32_t bofs,
3441                       uint32_t oprsz, uint32_t maxsz)
3442 {
3443     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3444     static gen_helper_gvec_3 * const eq_fn[4] = {
3445         gen_helper_gvec_eq8, gen_helper_gvec_eq16,
3446         gen_helper_gvec_eq32, gen_helper_gvec_eq64
3447     };
3448     static gen_helper_gvec_3 * const ne_fn[4] = {
3449         gen_helper_gvec_ne8, gen_helper_gvec_ne16,
3450         gen_helper_gvec_ne32, gen_helper_gvec_ne64
3451     };
3452     static gen_helper_gvec_3 * const lt_fn[4] = {
3453         gen_helper_gvec_lt8, gen_helper_gvec_lt16,
3454         gen_helper_gvec_lt32, gen_helper_gvec_lt64
3455     };
3456     static gen_helper_gvec_3 * const le_fn[4] = {
3457         gen_helper_gvec_le8, gen_helper_gvec_le16,
3458         gen_helper_gvec_le32, gen_helper_gvec_le64
3459     };
3460     static gen_helper_gvec_3 * const ltu_fn[4] = {
3461         gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
3462         gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
3463     };
3464     static gen_helper_gvec_3 * const leu_fn[4] = {
3465         gen_helper_gvec_leu8, gen_helper_gvec_leu16,
3466         gen_helper_gvec_leu32, gen_helper_gvec_leu64
3467     };
3468     static gen_helper_gvec_3 * const * const fns[16] = {
3469         [TCG_COND_EQ] = eq_fn,
3470         [TCG_COND_NE] = ne_fn,
3471         [TCG_COND_LT] = lt_fn,
3472         [TCG_COND_LE] = le_fn,
3473         [TCG_COND_LTU] = ltu_fn,
3474         [TCG_COND_LEU] = leu_fn,
3475     };
3476
3477     const TCGOpcode *hold_list;
3478     TCGType type;
3479     uint32_t some;
3480
3481     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
3482     check_overlap_3(dofs, aofs, bofs, maxsz);
3483
3484     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3485         do_dup(MO_8, dofs, oprsz, maxsz,
3486                NULL, NULL, -(cond == TCG_COND_ALWAYS));
3487         return;
3488     }
3489
3490     /*
3491      * Implement inline with a vector type, if possible.
3492      * Prefer integer when 64-bit host and 64-bit comparison.
3493      */
3494     hold_list = tcg_swap_vecop_list(cmp_list);
3495     type = choose_vector_type(cmp_list, vece, oprsz,
3496                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3497     switch (type) {
3498     case TCG_TYPE_V256:
3499         /* Recall that ARM SVE allows vector sizes that are not a
3500          * power of 2, but always a multiple of 16.  The intent is
3501          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
3502          */
3503         some = QEMU_ALIGN_DOWN(oprsz, 32);
3504         expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
3505         if (some == oprsz) {
3506             break;
3507         }
3508         dofs += some;
3509         aofs += some;
3510         bofs += some;
3511         oprsz -= some;
3512         maxsz -= some;
3513         /* fallthru */
3514     case TCG_TYPE_V128:
3515         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
3516         break;
3517     case TCG_TYPE_V64:
3518         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
3519         break;
3520
3521     case 0:
3522         if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3523             expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
3524         } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3525             expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
3526         } else {
3527             gen_helper_gvec_3 * const *fn = fns[cond];
3528
3529             if (fn == NULL) {
3530                 uint32_t tmp;
3531                 tmp = aofs, aofs = bofs, bofs = tmp;
3532                 cond = tcg_swap_cond(cond);
3533                 fn = fns[cond];
3534                 assert(fn != NULL);
3535             }
3536             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
3537             oprsz = maxsz;
3538         }
3539         break;
3540
3541     default:
3542         g_assert_not_reached();
3543     }
3544     tcg_swap_vecop_list(hold_list);
3545
3546     if (oprsz < maxsz) {
3547         expand_clr(dofs + oprsz, maxsz - oprsz);
3548     }
3549 }
3550
3551 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
3552 {
3553     TCGv_i64 t = tcg_temp_new_i64();
3554
3555     tcg_gen_and_i64(t, b, a);
3556     tcg_gen_andc_i64(d, c, a);
3557     tcg_gen_or_i64(d, d, t);
3558     tcg_temp_free_i64(t);
3559 }
3560
3561 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
3562                          uint32_t bofs, uint32_t cofs,
3563                          uint32_t oprsz, uint32_t maxsz)
3564 {
3565     static const GVecGen4 g = {
3566         .fni8 = tcg_gen_bitsel_i64,
3567         .fniv = tcg_gen_bitsel_vec,
3568         .fno = gen_helper_gvec_bitsel,
3569     };
3570
3571     tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
3572 }