/*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "tcg.h"
#include "tcg-op.h"
#include "tcg-op-gvec.h"
#include "tcg-gvec-desc.h"

#define MAX_UNROLL  4

/* Verify vector size and alignment rules.  OFS should be the OR of all
   of the operand offsets so that we can check them all at once.  */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    uint32_t opr_align = oprsz >= 16 ? 15 : 7;
    uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7;
    tcg_debug_assert(oprsz > 0);
    tcg_debug_assert(oprsz <= maxsz);
    tcg_debug_assert((oprsz & opr_align) == 0);
    tcg_debug_assert((maxsz & max_align) == 0);
    tcg_debug_assert((ofs & max_align) == 0);
}

/* Verify vector overlap rules for two operands.  */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
}

/* Verify vector overlap rules for three operands.  */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(a, b, s);
}

/* Verify vector overlap rules for four operands.  */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(d, c, s);
    check_overlap_2(a, b, s);
    check_overlap_2(a, c, s);
    check_overlap_2(b, c, s);
}

/* Create a descriptor from components.  */
uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
{
    uint32_t desc = 0;

    assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
    assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
    assert(data == sextract32(data, 0, SIMD_DATA_BITS));

    oprsz = (oprsz / 8) - 1;
    maxsz = (maxsz / 8) - 1;
    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);

    return desc;
}
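
/*
 * Example (illustrative): a gvec helper receives the descriptor built
 * above and unpacks it with the accessors from "tcg-gvec-desc.h":
 *
 *     intptr_t oprsz = simd_oprsz(desc);  // bytes to operate on
 *     intptr_t maxsz = simd_maxsz(desc);  // full register size in bytes
 *     int32_t data = simd_data(desc);     // the caller's DATA field
 */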

/* Generate a call to a gvec-style helper with two vector operands.  */
void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and one scalar operand.  */
void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
                         uint32_t oprsz, uint32_t maxsz, int32_t data,
                         gen_helper_gvec_2i *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, c, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands.  */
void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands.  */
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with five vector operands.  */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Return true if we want to implement something of OPRSZ bytes
   in units of LNSZ.  This limits the expansion of inline code.  */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
    uint32_t lnct = oprsz / lnsz;
    return lnct >= 1 && lnct <= MAX_UNROLL;
}
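
/*
 * Example (illustrative): with MAX_UNROLL == 4, an oprsz of 32 bytes in
 * 8-byte units gives 4 iterations and is expanded inline, while an oprsz
 * of 64 bytes in 8-byte units would need 8 iterations and is rejected;
 * the expansion then uses a wider unit if the host has one, or falls
 * back to an out-of-line helper.
 */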

static void expand_clr(uint32_t dofs, uint32_t maxsz);

/* Duplicate C as per VECE.  */
uint64_t (dup_const)(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return 0x0101010101010101ull * (uint8_t)c;
    case MO_16:
        return 0x0001000100010001ull * (uint16_t)c;
    case MO_32:
        return 0x0000000100000001ull * (uint32_t)c;
    case MO_64:
        return c;
    default:
        g_assert_not_reached();
    }
}
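
/*
 * Example (illustrative): dup_const(MO_16, 0x1234) multiplies the
 * zero-extended constant by 0x0001000100010001 and yields
 * 0x1234123412341234, i.e. the value replicated into every 16-bit lane.
 */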

/* Duplicate IN into OUT as per VECE.  */
static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz.  */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
        }
    }

    type = 0;
    if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) {
        type = TCG_TYPE_V256;
    } else if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) {
        type = TCG_TYPE_V128;
    } else if (TCG_TARGET_HAS_v64 && check_size_impl(oprsz, 8)
               /* Prefer integer when 64-bit host and no variable dup.  */
               && !(TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                    && (in_64 == NULL || vece == MO_64))) {
        type = TCG_TYPE_V64;
    }

    /* Implement inline with a vector type, if possible.  */
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            switch (vece) {
            case MO_8:
                tcg_gen_dup8i_vec(t_vec, in_c);
                break;
            case MO_16:
                tcg_gen_dup16i_vec(t_vec, in_c);
                break;
            case MO_32:
                tcg_gen_dup32i_vec(t_vec, in_c);
                break;
            default:
                tcg_gen_dup64i_vec(t_vec, in_c);
                break;
            }
        }

        i = 0;
        if (TCG_TARGET_HAS_v256) {
            for (; i + 32 <= oprsz; i += 32) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
            }
        }
        if (TCG_TARGET_HAS_v128) {
            for (; i + 16 <= oprsz; i += 16) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
            }
        }
        if (TCG_TARGET_HAS_v64) {
            for (; i < oprsz; i += 8) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
            }
        }
        tcg_temp_free_vec(t_vec);
        goto done;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_new_i32();
                gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_new_i64();
            gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_const_i64(in_c);
            } else {
                t_32 = tcg_const_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_const_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
            tcg_temp_free_i64(t_64);
        }
    } else {
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else {
            t_32 = tcg_temp_new_i32();
            if (in_64) {
                tcg_gen_extrl_i64_i32(t_32, in_64);
            } else if (vece == MO_8) {
                tcg_gen_movi_i32(t_32, in_c & 0xff);
            } else if (vece == MO_16) {
                tcg_gen_movi_i32(t_32, in_c & 0xffff);
            } else {
                tcg_gen_movi_i32(t_32, in_c);
            }
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    tcg_temp_free_i32(t_desc);
    return;

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Likewise, but with zero.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz)
{
    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
}

/* Expand OPRSZ bytes worth of two-operand operations using i32 elements.  */
static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
}

static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i32 c, bool scalar_first,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i32 elements.  */
static void expand_3_i32(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPRSZ bytes worth of four-operand operations using i32 elements.  */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using i64 elements.  */
static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
}

static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i64 c, bool scalar_first,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i64 elements.  */
static void expand_3_i64(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPRSZ bytes worth of four-operand operations using i64 elements.  */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using host vectors.  */
static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t oprsz, uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        fni(vece, t0, t0);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
}

/* Expand OPRSZ bytes worth of two-vector operands and an immediate operand
   using host vectors.  */
static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
        }
        fni(vece, t1, t0, c);
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          TCGv_vec c, bool scalar_first,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(vece, t1, c, t0);
        } else {
            fni(vece, t1, t0, c);
        }
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using host vectors.  */
static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/* Expand OPRSZ bytes worth of four-operand operations using host vectors.  */
static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t cofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec,
                                     TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    TCGv_vec t3 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t1, cpu_env, aofs + i);
        tcg_gen_ld_vec(t2, cpu_env, bofs + i);
        tcg_gen_ld_vec(t3, cpu_env, cofs + i);
        fni(vece, t0, t1, t2, t3);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t3);
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/* Expand a vector two-operand operation.  */
void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
{
    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
       Expand with successively smaller host vector sizes.  The intent is
       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
    /* ??? For maxsz > oprsz, the host may be able to use an opr-sized
       operation, zeroing the balance of the register.  We can then
       use a max-sized store to implement the clearing without an extra
       store operation.  This is true for aarch64 and x86_64 hosts.  */

    if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
        if (some == oprsz) {
            goto done;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
    }

    if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
    } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
               && g->fniv && check_size_impl(oprsz, 8)
               && (!g->opc
                   || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        expand_2_i64(dofs, aofs, oprsz, g->fni8);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        expand_2_i32(dofs, aofs, oprsz, g->fni4);
    } else {
        assert(g->fno != NULL);
        tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
        return;
    }

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with two vectors and an immediate.  */
void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, int64_t c, const GVecGen2i *g)
{
    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
       Expand with successively smaller host vector sizes.  The intent is
       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */

    if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            goto done;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
    }

    if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
    } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
               && g->fniv && check_size_impl(oprsz, 8)
               && (!g->opc
                   || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
    } else {
        if (g->fno) {
            tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
        } else {
            TCGv_i64 tcg_c = tcg_const_i64(c);
            tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz, maxsz, c, g->fnoi);
            tcg_temp_free_i64(tcg_c);
        }
        return;
    }

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with two vectors and a scalar.  */
void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
{
    TCGType type;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) {
            type = TCG_TYPE_V256;
        } else if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) {
            type = TCG_TYPE_V128;
        } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
                   && check_size_impl(oprsz, 8)) {
            type = TCG_TYPE_V64;
        }
    }
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        tcg_gen_dup_i64_vec(g->vece, t_vec, c);

        /* Recall that ARM SVE allows vector sizes that are not a power of 2.
           Expand with successively smaller host vector sizes.  The intent is
           that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
        switch (type) {
        case TCG_TYPE_V256:
            {
                uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
                expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                              t_vec, g->scalar_first, g->fniv);
                if (some == oprsz) {
                    break;
                }
                dofs += some;
                aofs += some;
                oprsz -= some;
                maxsz -= some;
            }
            /* fallthru */

        case TCG_TYPE_V128:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                          t_vec, g->scalar_first, g->fniv);
            break;

        case TCG_TYPE_V64:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                          t_vec, g->scalar_first, g->fniv);
            break;

        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(t_vec);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        TCGv_i64 t64 = tcg_temp_new_i64();

        gen_dup_i64(g->vece, t64, c);
        expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
        tcg_temp_free_i64(t64);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        TCGv_i32 t32 = tcg_temp_new_i32();

        tcg_gen_extrl_i64_i32(t32, c);
        gen_dup_i32(g->vece, t32, t32);
        expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
        tcg_temp_free_i32(t32);
    } else {
        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
        return;
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector three-operand operation.  */
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
{
    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
       Expand with successively smaller host vector sizes.  The intent is
       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */

    if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                     g->load_dest, g->fniv);
        if (some == oprsz) {
            goto done;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
    }

    if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                     g->load_dest, g->fniv);
    } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
               && g->fniv && check_size_impl(oprsz, 8)
               && (!g->opc
                   || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                     g->load_dest, g->fniv);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
    } else {
        assert(g->fno != NULL);
        tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, g->data, g->fno);
        return;
    }

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector four-operand operation.  */
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
{
    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
       Expand with successively smaller host vector sizes.  The intent is
       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */

    if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
                     32, TCG_TYPE_V256, g->fniv);
        if (some == oprsz) {
            goto done;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
    }

    if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     16, TCG_TYPE_V128, g->fniv);
    } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
               && g->fniv && check_size_impl(oprsz, 8)
               && (!g->opc
                   || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     8, TCG_TYPE_V64, g->fniv);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        expand_4_i64(dofs, aofs, bofs, cofs, oprsz, g->fni8);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        expand_4_i32(dofs, aofs, bofs, cofs, oprsz, g->fni4);
    } else {
        assert(g->fno != NULL);
        tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                           oprsz, maxsz, g->data, g->fno);
        return;
    }

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/*
 * Expand specific vector operations.
 */

static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mov_vec(a, b);
}

void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_mov_i64,
        .fniv = vec_mov2,
        .fno = gen_helper_gvec_mov,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    if (dofs != aofs) {
        tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
    } else {
        check_size_align(oprsz, maxsz, dofs);
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    }
}

void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i32 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_32);
    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
}

void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i64 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_64);
    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
}

void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t maxsz)
{
    if (vece <= MO_32) {
        TCGv_i32 in = tcg_temp_new_i32();
        switch (vece) {
        case MO_8:
            tcg_gen_ld8u_i32(in, cpu_env, aofs);
            break;
        case MO_16:
            tcg_gen_ld16u_i32(in, cpu_env, aofs);
            break;
        case MO_32:
            tcg_gen_ld_i32(in, cpu_env, aofs);
            break;
        }
        tcg_gen_gvec_dup_i32(vece, dofs, oprsz, maxsz, in);
        tcg_temp_free_i32(in);
    } else if (vece == MO_64) {
        TCGv_i64 in = tcg_temp_new_i64();
        tcg_gen_ld_i64(in, cpu_env, aofs);
        tcg_gen_gvec_dup_i64(MO_64, dofs, oprsz, maxsz, in);
        tcg_temp_free_i64(in);
    } else {
        /* 128-bit duplicate.  */
        /* ??? Dup to 256-bit vector.  */
        int i;

        tcg_debug_assert(vece == 4);
        tcg_debug_assert(oprsz >= 16);
        if (TCG_TARGET_HAS_v128) {
            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);

            tcg_gen_ld_vec(in, cpu_env, aofs);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_vec(in, cpu_env, dofs + i);
            }
            tcg_temp_free_vec(in);
        } else {
            TCGv_i64 in0 = tcg_temp_new_i64();
            TCGv_i64 in1 = tcg_temp_new_i64();

            tcg_gen_ld_i64(in0, cpu_env, aofs);
            tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_i64(in0, cpu_env, dofs + i);
                tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
            }
            tcg_temp_free_i64(in0);
            tcg_temp_free_i64(in1);
        }
    }
}

void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint64_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint32_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint16_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
                        uint32_t maxsz, uint8_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_not_i64,
        .fniv = tcg_gen_not_vec,
        .fno = gen_helper_gvec_not,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
}

/* Perform a vector addition using normal addition and a mask.  The mask
   should be the sign bit of each lane.  This 6-operation form is more
   efficient than separate additions when there are 4 or more lanes in
   the 64-bit operation.  */
static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_andc_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_xor_i64(t3, a, b);
    tcg_gen_add_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}
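
/*
 * A plain C sketch of the identity used above (illustrative only):
 * with the per-lane sign bits masked out, the addition cannot carry
 * across lane boundaries, and the discarded sign-bit sums are restored
 * with an xor, since add and xor agree at the top bit of each lane once
 * the carries below it are accounted for:
 *
 *     uint64_t swar_add8(uint64_t a, uint64_t b)
 *     {
 *         uint64_t m = 0x8080808080808080ull;  // sign bit of each byte
 *         return ((a & ~m) + (b & ~m)) ^ ((a ^ b) & m);
 *     }
 */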

void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, a, ~0xffffffffull);
    tcg_gen_add_i64(t2, a, b);
    tcg_gen_add_i64(t1, t1, b);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add8,
          .opc = INDEX_op_add_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add16,
          .opc = INDEX_op_add_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add32,
          .opc = INDEX_op_add_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add64,
          .opc = INDEX_op_add_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
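
/*
 * Example (illustrative, hypothetical offsets): a target front end that
 * keeps vector registers in its CPU state adds two 16-byte registers of
 * 32-bit elements with
 *
 *     tcg_gen_gvec_add(MO_32, dofs, aofs, bofs, 16, 16);
 *
 * where dofs/aofs/bofs are env-relative offsets such as
 * offsetof(CPUFooState, vreg[n]); CPUFooState is a made-up name here.
 */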

void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds8,
          .opc = INDEX_op_add_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds16,
          .opc = INDEX_op_add_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds32,
          .opc = INDEX_op_add_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds64,
          .opc = INDEX_op_add_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(c);
    tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_sub8_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs8,
          .opc = INDEX_op_sub_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sub16_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs16,
          .opc = INDEX_op_sub_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sub_i32,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs32,
          .opc = INDEX_op_sub_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sub_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs64,
          .opc = INDEX_op_sub_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

/* Perform a vector subtraction using normal subtraction and a mask.
   Compare gen_addv_mask above.  */
static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_or_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_eqv_i64(t3, a, b);
    tcg_gen_sub_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}
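
/*
 * A plain C sketch of the identity used above (illustrative only):
 * presetting the minuend's sign bits and clearing the subtrahend's
 * prevents borrows from crossing lane boundaries; the xor with
 * ~(a ^ b) at the sign bits repairs the per-lane sign of the result:
 *
 *     uint64_t swar_sub8(uint64_t a, uint64_t b)
 *     {
 *         uint64_t m = 0x8080808080808080ull;  // sign bit of each byte
 *         return ((a | m) - (b & ~m)) ^ (~(a ^ b) & m);
 *     }
 */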

void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_subv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_subv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
    tcg_gen_sub_i64(t2, a, b);
    tcg_gen_sub_i64(t1, a, t1);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_sub8_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub8,
          .opc = INDEX_op_sub_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sub16_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub16,
          .opc = INDEX_op_sub_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sub_i32,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub32,
          .opc = INDEX_op_sub_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sub_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub64,
          .opc = INDEX_op_sub_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul8,
          .opc = INDEX_op_mul_vec,
          .vece = MO_8 },
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul16,
          .opc = INDEX_op_mul_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_mul_i32,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul32,
          .opc = INDEX_op_mul_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_mul_i64,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul64,
          .opc = INDEX_op_mul_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls8,
          .opc = INDEX_op_mul_vec,
          .vece = MO_8 },
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls16,
          .opc = INDEX_op_mul_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_mul_i32,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls32,
          .opc = INDEX_op_mul_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_mul_i64,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls64,
          .opc = INDEX_op_mul_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(c);
    tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fno = gen_helper_gvec_ssadd8, .vece = MO_8 },
        { .fno = gen_helper_gvec_ssadd16, .vece = MO_16 },
        { .fno = gen_helper_gvec_ssadd32, .vece = MO_32 },
        { .fno = gen_helper_gvec_ssadd64, .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fno = gen_helper_gvec_sssub8, .vece = MO_8 },
        { .fno = gen_helper_gvec_sssub16, .vece = MO_16 },
        { .fno = gen_helper_gvec_sssub32, .vece = MO_32 },
        { .fno = gen_helper_gvec_sssub64, .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

static void tcg_gen_vec_usadd32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 max = tcg_const_i32(-1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
    tcg_temp_free_i32(max);
}

static void tcg_gen_vec_usadd32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 max = tcg_const_i64(-1);
    tcg_gen_add_i64(d, a, b);
    tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
    tcg_temp_free_i64(max);
}
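
/*
 * The movcond above implements the usual unsigned-saturation test:
 * an unsigned add overflowed exactly when the result is less than
 * either operand, so (illustrative C)
 *
 *     d = a + b;
 *     d = (d < a) ? UINT32_MAX : d;
 *
 * clamps the sum to the maximum representable value.
 */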

void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fno = gen_helper_gvec_usadd8, .vece = MO_8 },
        { .fno = gen_helper_gvec_usadd16, .vece = MO_16 },
        { .fni4 = tcg_gen_vec_usadd32_i32,
          .fno = gen_helper_gvec_usadd32,
          .vece = MO_32 },
        { .fni8 = tcg_gen_vec_usadd32_i64,
          .fno = gen_helper_gvec_usadd64,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

static void tcg_gen_vec_ussub32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 min = tcg_const_i32(0);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
    tcg_temp_free_i32(min);
}

static void tcg_gen_vec_ussub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 min = tcg_const_i64(0);
    tcg_gen_sub_i64(d, a, b);
    tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
    tcg_temp_free_i64(min);
}

void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fno = gen_helper_gvec_ussub8, .vece = MO_8 },
        { .fno = gen_helper_gvec_ussub16, .vece = MO_16 },
        { .fni4 = tcg_gen_vec_ussub32_i32,
          .fno = gen_helper_gvec_ussub32,
          .vece = MO_32 },
        { .fni8 = tcg_gen_vec_ussub32_i64,
          .fno = gen_helper_gvec_ussub64,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

/* Perform a vector negation using normal negation and a mask.
   Compare gen_subv_mask above.  */
static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_andc_i64(t3, m, b);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_sub_i64(d, m, t2);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}
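
/*
 * A plain C sketch of the identity used above (illustrative only),
 * negating each lane of B without borrows crossing lane boundaries:
 *
 *     uint64_t swar_neg8(uint64_t b)
 *     {
 *         uint64_t m = 0x8080808080808080ull;  // sign bit of each byte
 *         return (m - (b & ~m)) ^ (m & ~b);
 *     }
 */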

void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_negv_mask(d, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_negv_mask(d, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
    tcg_gen_neg_i64(t2, b);
    tcg_gen_neg_i64(t1, t1);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g[4] = {
        { .fni8 = tcg_gen_vec_neg8_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg8,
          .opc = INDEX_op_neg_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_neg16_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg16,
          .opc = INDEX_op_neg_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_neg_i32,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg32,
          .opc = INDEX_op_neg_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_neg_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg64,
          .opc = INDEX_op_neg_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_and_i64,
        .fniv = tcg_gen_and_vec,
        .fno = gen_helper_gvec_and,
        .opc = INDEX_op_and_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_or_i64,
        .fniv = tcg_gen_or_vec,
        .fno = gen_helper_gvec_or,
        .opc = INDEX_op_or_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_xor_i64,
        .fniv = tcg_gen_xor_vec,
        .fno = gen_helper_gvec_xor,
        .opc = INDEX_op_xor_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_andc_i64,
        .fniv = tcg_gen_andc_vec,
        .fno = gen_helper_gvec_andc,
        .opc = INDEX_op_andc_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_orc_i64,
        .fniv = tcg_gen_orc_vec,
        .fno = gen_helper_gvec_orc,
        .opc = INDEX_op_orc_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

static const GVecGen2s gop_ands = {
    .fni8 = tcg_gen_and_i64,
    .fniv = tcg_gen_and_vec,
    .fno = gen_helper_gvec_ands,
    .opc = INDEX_op_and_vec,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};

void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();
    gen_dup_i64(vece, tmp, c);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
    tcg_temp_free_i64(tmp);
}

static const GVecGen2s gop_xors = {
    .fni8 = tcg_gen_xor_i64,
    .fniv = tcg_gen_xor_vec,
    .fno = gen_helper_gvec_xors,
    .opc = INDEX_op_xor_vec,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};

void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();
    gen_dup_i64(vece, tmp, c);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
    tcg_temp_free_i64(tmp);
}

static const GVecGen2s gop_ors = {
    .fni8 = tcg_gen_or_i64,
    .fniv = tcg_gen_or_vec,
    .fno = gen_helper_gvec_ors,
    .opc = INDEX_op_or_vec,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};

void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
                      TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();
    gen_dup_i64(vece, tmp, c);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
                      int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_8, 0xff << c);
    tcg_gen_shli_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_16, 0xffff << c);
    tcg_gen_shli_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}
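
/*
 * Example (illustrative): for c == 2 at MO_8, the single 64-bit shift
 * lets bits spill into the neighboring byte; masking with
 * dup_const(MO_8, 0xff << 2) == 0xfcfcfcfcfcfcfcfc discards exactly
 * the bits that crossed a lane boundary.
 */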

void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_shl8i_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl8i,
          .opc = INDEX_op_shli_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_shl16i_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl16i,
          .opc = INDEX_op_shli_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shli_i32,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl32i,
          .opc = INDEX_op_shli_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shli_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl64i,
          .opc = INDEX_op_shli_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}
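
/* A shift count of zero is just a copy, so the expander short-circuits
   to tcg_gen_gvec_mov; the shri and sari expanders below do the same. */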

void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_8, 0xff >> c);
    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_16, 0xffff >> c);
    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_shr8i_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr8i,
          .opc = INDEX_op_shri_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_shr16i_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr16i,
          .opc = INDEX_op_shri_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shri_i32,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr32i,
          .opc = INDEX_op_shri_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shri_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr64i,
          .opc = INDEX_op_shri_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}
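
/* Arithmetic right shift within packed lanes, built from a logical
   shift: after shifting right by c, each lane's sign bit sits c places
   below the lane's top bit.  Isolate it with s_mask, then multiply by
   (2 << c) - 2, a run of c ones shifted up by one; the product paints
   the sign into exactly the c vacated high bits of the lane, without
   carrying across lane boundaries. */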
void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
    uint64_t c_mask = dup_const(MO_8, 0xff >> c);
    TCGv_i64 s = tcg_temp_new_i64();

    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
    tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign */
    tcg_gen_or_i64(d, d, s);         /* include sign extension */
    tcg_temp_free_i64(s);
}

void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
    uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
    TCGv_i64 s = tcg_temp_new_i64();

    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
    tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign */
    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
    tcg_gen_or_i64(d, d, s);         /* include sign extension */
    tcg_temp_free_i64(s);
}

void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_sar8i_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar8i,
          .opc = INDEX_op_sari_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sar16i_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar16i,
          .opc = INDEX_op_sari_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sari_i32,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar32i,
          .opc = INDEX_op_sari_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sari_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar64i,
          .opc = INDEX_op_sari_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}
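
/* The comparison expanders produce the canonical gvec result: an
   element is all-ones (-1) when the condition holds and zero when it
   does not.  The integer versions get there via setcond (0 or 1)
   followed by neg; tcg_gen_cmp_vec already yields -1/0 directly. */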

/* Expand OPSZ bytes worth of three-operand comparisons using i32 elements. */
static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                           uint32_t oprsz, TCGCond cond)
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        tcg_gen_setcond_i32(cond, t0, t0, t1);
        tcg_gen_neg_i32(t0, t0);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                           uint32_t oprsz, TCGCond cond)
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        tcg_gen_setcond_i64(cond, t0, t0, t1);
        tcg_gen_neg_i64(t0, t0);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
                           TCGType type, TCGCond cond)
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
                      uint32_t aofs, uint32_t bofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static gen_helper_gvec_3 * const eq_fn[4] = {
        gen_helper_gvec_eq8, gen_helper_gvec_eq16,
        gen_helper_gvec_eq32, gen_helper_gvec_eq64
    };
    static gen_helper_gvec_3 * const ne_fn[4] = {
        gen_helper_gvec_ne8, gen_helper_gvec_ne16,
        gen_helper_gvec_ne32, gen_helper_gvec_ne64
    };
    static gen_helper_gvec_3 * const lt_fn[4] = {
        gen_helper_gvec_lt8, gen_helper_gvec_lt16,
        gen_helper_gvec_lt32, gen_helper_gvec_lt64
    };
    static gen_helper_gvec_3 * const le_fn[4] = {
        gen_helper_gvec_le8, gen_helper_gvec_le16,
        gen_helper_gvec_le32, gen_helper_gvec_le64
    };
    static gen_helper_gvec_3 * const ltu_fn[4] = {
        gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
        gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
    };
    static gen_helper_gvec_3 * const leu_fn[4] = {
        gen_helper_gvec_leu8, gen_helper_gvec_leu16,
        gen_helper_gvec_leu32, gen_helper_gvec_leu64
    };
    static gen_helper_gvec_3 * const * const fns[16] = {
        [TCG_COND_EQ] = eq_fn,
        [TCG_COND_NE] = ne_fn,
        [TCG_COND_LT] = lt_fn,
        [TCG_COND_LE] = le_fn,
        [TCG_COND_LTU] = ltu_fn,
        [TCG_COND_LEU] = leu_fn,
    };

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);
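
    /* Degenerate conditions yield a constant result: ALWAYS fills the
       destination with all-ones (-1 in every byte), NEVER with zeroes. */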
    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
        do_dup(MO_8, dofs, oprsz, maxsz,
               NULL, NULL, -(cond == TCG_COND_ALWAYS));
        return;
    }

    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
       Expand with successively smaller host vector sizes.  The intent is
       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */

    if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)
        && tcg_can_emit_vec_op(INDEX_op_cmp_vec, TCG_TYPE_V256, vece)) {
        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
        if (some == oprsz) {
            goto done;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
    }

    if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)
        && tcg_can_emit_vec_op(INDEX_op_cmp_vec, TCG_TYPE_V128, vece)) {
        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
    } else if (TCG_TARGET_HAS_v64
               && check_size_impl(oprsz, 8)
               && (TCG_TARGET_REG_BITS == 32 || vece != MO_64)
               && tcg_can_emit_vec_op(INDEX_op_cmp_vec, TCG_TYPE_V64, vece)) {
        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
    } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
        expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
    } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
        expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
    } else {
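        /* Out-of-line helpers exist only for EQ/NE/LT/LE/LTU/LEU; the
           remaining conditions are handled by swapping the two inputs
           and the condition before the call. */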
        gen_helper_gvec_3 * const *fn = fns[cond];

        if (fn == NULL) {
            uint32_t tmp;
            tmp = aofs, aofs = bofs, bofs = tmp;
            cond = tcg_swap_cond(cond);
            fn = fns[cond];
            assert(fn != NULL);
        }
        tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
        return;
    }

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);