target/arm/translate-neon.c

   1 /*
   2  *  ARM translation: AArch32 Neon instructions
   3  *
   4  *  Copyright (c) 2003 Fabrice Bellard
   5  *  Copyright (c) 2005-2007 CodeSourcery
   6  *  Copyright (c) 2007 OpenedHand, Ltd.
   7  *  Copyright (c) 2020 Linaro, Ltd.
   8  *
   9  * This library is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * This library is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  21  */
  22
  23 #include "qemu/osdep.h"
  24 #include "tcg/tcg-op.h"
  25 #include "tcg/tcg-op-gvec.h"
  26 #include "exec/exec-all.h"
  27 #include "exec/gen-icount.h"
  28 #include "translate.h"
  29 #include "translate-a32.h"
  30
  31 /* Include the generated Neon decoder */
  32 #include "decode-neon-dp.c.inc"
  33 #include "decode-neon-ls.c.inc"
  34 #include "decode-neon-shared.c.inc"
  35
  36 static TCGv_ptr vfp_reg_ptr(bool dp, int reg)
  37 {
  38     TCGv_ptr ret = tcg_temp_new_ptr();
  39     tcg_gen_addi_ptr(ret, cpu_env, vfp_reg_offset(dp, reg));
  40     return ret;
  41 }
  42
  43 static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
  44 {
  45     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
  46
  47     switch (mop) {
  48     case MO_UB:
  49         tcg_gen_ld8u_i32(var, cpu_env, offset);
  50         break;
  51     case MO_UW:
  52         tcg_gen_ld16u_i32(var, cpu_env, offset);
  53         break;
  54     case MO_UL:
  55         tcg_gen_ld_i32(var, cpu_env, offset);
  56         break;
  57     default:
  58         g_assert_not_reached();
  59     }
  60 }
  61
  62 static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
  63 {
  64     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
  65
  66     switch (mop) {
  67     case MO_UB:
  68         tcg_gen_ld8u_i64(var, cpu_env, offset);
  69         break;
  70     case MO_UW:
  71         tcg_gen_ld16u_i64(var, cpu_env, offset);
  72         break;
  73     case MO_UL:
  74         tcg_gen_ld32u_i64(var, cpu_env, offset);
  75         break;
  76     case MO_UQ:
  77         tcg_gen_ld_i64(var, cpu_env, offset);
  78         break;
  79     default:
  80         g_assert_not_reached();
  81     }
  82 }
  83
  84 static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
  85 {
  86     long offset = neon_element_offset(reg, ele, size);
  87
  88     switch (size) {
  89     case MO_8:
  90         tcg_gen_st8_i32(var, cpu_env, offset);
  91         break;
  92     case MO_16:
  93         tcg_gen_st16_i32(var, cpu_env, offset);
  94         break;
  95     case MO_32:
  96         tcg_gen_st_i32(var, cpu_env, offset);
  97         break;
  98     default:
  99         g_assert_not_reached();
 100     }
 101 }
 102
 103 static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
 104 {
 105     long offset = neon_element_offset(reg, ele, size);
 106
 107     switch (size) {
 108     case MO_8:
 109         tcg_gen_st8_i64(var, cpu_env, offset);
 110         break;
 111     case MO_16:
 112         tcg_gen_st16_i64(var, cpu_env, offset);
 113         break;
 114     case MO_32:
 115         tcg_gen_st32_i64(var, cpu_env, offset);
 116         break;
 117     case MO_64:
 118         tcg_gen_st_i64(var, cpu_env, offset);
 119         break;
 120     default:
 121         g_assert_not_reached();
 122     }
 123 }
 124
 125 static bool do_neon_ddda(DisasContext *s, int q, int vd, int vn, int vm,
 126                          int data, gen_helper_gvec_4 *fn_gvec)
 127 {
 128     /* UNDEF accesses to D16-D31 if they don't exist. */
 129     if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
 130         return false;
 131     }
 132
 133     /*
 134      * UNDEF accesses to odd registers for each bit of Q.
 135      * Q will be 0b111 for all Q-reg instructions, otherwise
 136      * when we have mixed Q- and D-reg inputs.
 137      */
 138     if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
 139         return false;
 140     }
 141
 142     if (!vfp_access_check(s)) {
 143         return true;
 144     }
 145
 146     int opr_sz = q ? 16 : 8;
 147     tcg_gen_gvec_4_ool(vfp_reg_offset(1, vd),
 148                        vfp_reg_offset(1, vn),
 149                        vfp_reg_offset(1, vm),
 150                        vfp_reg_offset(1, vd),
 151                        opr_sz, opr_sz, data, fn_gvec);
 152     return true;
 153 }
 154
 155 static bool do_neon_ddda_fpst(DisasContext *s, int q, int vd, int vn, int vm,
 156                               int data, ARMFPStatusFlavour fp_flavour,
 157                               gen_helper_gvec_4_ptr *fn_gvec_ptr)
 158 {
 159     /* UNDEF accesses to D16-D31 if they don't exist. */
 160     if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
 161         return false;
 162     }
 163
 164     /*
 165      * UNDEF accesses to odd registers for each bit of Q.
 166      * Q will be 0b111 for all Q-reg instructions, otherwise
 167      * when we have mixed Q- and D-reg inputs.
 168      */
 169     if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
 170         return false;
 171     }
 172
 173     if (!vfp_access_check(s)) {
 174         return true;
 175     }
 176
 177     int opr_sz = q ? 16 : 8;
 178     TCGv_ptr fpst = fpstatus_ptr(fp_flavour);
 179
 180     tcg_gen_gvec_4_ptr(vfp_reg_offset(1, vd),
 181                        vfp_reg_offset(1, vn),
 182                        vfp_reg_offset(1, vm),
 183                        vfp_reg_offset(1, vd),
 184                        fpst, opr_sz, opr_sz, data, fn_gvec_ptr);
 185     tcg_temp_free_ptr(fpst);
 186     return true;
 187 }
 188
 189 static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
 190 {
 191     if (!dc_isar_feature(aa32_vcma, s)) {
 192         return false;
 193     }
 194     if (a->size == MO_16) {
 195         if (!dc_isar_feature(aa32_fp16_arith, s)) {
 196             return false;
 197         }
 198         return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
 199                                  FPST_STD_F16, gen_helper_gvec_fcmlah);
 200     }
 201     return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
 202                              FPST_STD, gen_helper_gvec_fcmlas);
 203 }
 204
 205 static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
 206 {
 207     int opr_sz;
 208     TCGv_ptr fpst;
 209     gen_helper_gvec_3_ptr *fn_gvec_ptr;
 210
 211     if (!dc_isar_feature(aa32_vcma, s)
 212         || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
 213         return false;
 214     }
 215
 216     /* UNDEF accesses to D16-D31 if they don't exist. */
 217     if (!dc_isar_feature(aa32_simd_r32, s) &&
 218         ((a->vd | a->vn | a->vm) & 0x10)) {
 219         return false;
 220     }
 221
 222     if ((a->vn | a->vm | a->vd) & a->q) {
 223         return false;
 224     }
 225
 226     if (!vfp_access_check(s)) {
 227         return true;
 228     }
 229
 230     opr_sz = (1 + a->q) * 8;
 231     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
 232     fn_gvec_ptr = (a->size == MO_16) ?
 233         gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds;
 234     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 235                        vfp_reg_offset(1, a->vn),
 236                        vfp_reg_offset(1, a->vm),
 237                        fpst, opr_sz, opr_sz, a->rot,
 238                        fn_gvec_ptr);
 239     tcg_temp_free_ptr(fpst);
 240     return true;
 241 }
 242
 243 static bool trans_VSDOT(DisasContext *s, arg_VSDOT *a)
 244 {
 245     if (!dc_isar_feature(aa32_dp, s)) {
 246         return false;
 247     }
 248     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 249                         gen_helper_gvec_sdot_b);
 250 }
 251
 252 static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a)
 253 {
 254     if (!dc_isar_feature(aa32_dp, s)) {
 255         return false;
 256     }
 257     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 258                         gen_helper_gvec_udot_b);
 259 }
 260
 261 static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a)
 262 {
 263     if (!dc_isar_feature(aa32_i8mm, s)) {
 264         return false;
 265     }
 266     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 267                         gen_helper_gvec_usdot_b);
 268 }
 269
 270 static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a)
 271 {
 272     if (!dc_isar_feature(aa32_bf16, s)) {
 273         return false;
 274     }
 275     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 276                         gen_helper_gvec_bfdot);
 277 }
 278
 279 static bool trans_VFML(DisasContext *s, arg_VFML *a)
 280 {
 281     int opr_sz;
 282
 283     if (!dc_isar_feature(aa32_fhm, s)) {
 284         return false;
 285     }
 286
 287     /* UNDEF accesses to D16-D31 if they don't exist. */
 288     if (!dc_isar_feature(aa32_simd_r32, s) &&
 289         (a->vd & 0x10)) {
 290         return false;
 291     }
 292
 293     if (a->vd & a->q) {
 294         return false;
 295     }
 296
 297     if (!vfp_access_check(s)) {
 298         return true;
 299     }
 300
 301     opr_sz = (1 + a->q) * 8;
 302     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 303                        vfp_reg_offset(a->q, a->vn),
 304                        vfp_reg_offset(a->q, a->vm),
 305                        cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
 306                        gen_helper_gvec_fmlal_a32);
 307     return true;
 308 }
 309
 310 static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
 311 {
 312     int data = (a->index << 2) | a->rot;
 313
 314     if (!dc_isar_feature(aa32_vcma, s)) {
 315         return false;
 316     }
 317     if (a->size == MO_16) {
 318         if (!dc_isar_feature(aa32_fp16_arith, s)) {
 319             return false;
 320         }
 321         return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
 322                                  FPST_STD_F16, gen_helper_gvec_fcmlah_idx);
 323     }
 324     return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
 325                              FPST_STD, gen_helper_gvec_fcmlas_idx);
 326 }
 327
 328 static bool trans_VSDOT_scalar(DisasContext *s, arg_VSDOT_scalar *a)
 329 {
 330     if (!dc_isar_feature(aa32_dp, s)) {
 331         return false;
 332     }
 333     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 334                         gen_helper_gvec_sdot_idx_b);
 335 }
 336
 337 static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a)
 338 {
 339     if (!dc_isar_feature(aa32_dp, s)) {
 340         return false;
 341     }
 342     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 343                         gen_helper_gvec_udot_idx_b);
 344 }
 345
 346 static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a)
 347 {
 348     if (!dc_isar_feature(aa32_i8mm, s)) {
 349         return false;
 350     }
 351     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 352                         gen_helper_gvec_usdot_idx_b);
 353 }
 354
 355 static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a)
 356 {
 357     if (!dc_isar_feature(aa32_i8mm, s)) {
 358         return false;
 359     }
 360     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 361                         gen_helper_gvec_sudot_idx_b);
 362 }
 363
 364 static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a)
 365 {
 366     if (!dc_isar_feature(aa32_bf16, s)) {
 367         return false;
 368     }
 369     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 370                         gen_helper_gvec_bfdot_idx);
 371 }
 372
 373 static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
 374 {
 375     int opr_sz;
 376
 377     if (!dc_isar_feature(aa32_fhm, s)) {
 378         return false;
 379     }
 380
 381     /* UNDEF accesses to D16-D31 if they don't exist. */
 382     if (!dc_isar_feature(aa32_simd_r32, s) &&
 383         ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
 384         return false;
 385     }
 386
 387     if (a->vd & a->q) {
 388         return false;
 389     }
 390
 391     if (!vfp_access_check(s)) {
 392         return true;
 393     }
 394
 395     opr_sz = (1 + a->q) * 8;
 396     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 397                        vfp_reg_offset(a->q, a->vn),
 398                        vfp_reg_offset(a->q, a->rm),
 399                        cpu_env, opr_sz, opr_sz,
 400                        (a->index << 2) | a->s, /* is_2 == 0 */
 401                        gen_helper_gvec_fmlal_idx_a32);
 402     return true;
 403 }
 404
 405 static struct {
 406     int nregs;
 407     int interleave;
 408     int spacing;
 409 } const neon_ls_element_type[11] = {
 410     {1, 4, 1},
 411     {1, 4, 2},
 412     {4, 1, 1},
 413     {2, 2, 2},
 414     {1, 3, 1},
 415     {1, 3, 2},
 416     {3, 1, 1},
 417     {1, 1, 1},
 418     {1, 2, 1},
 419     {1, 2, 2},
 420     {2, 1, 1}
 421 };
 422
 423 static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
 424                                       int stride)
 425 {
 426     if (rm != 15) {
 427         TCGv_i32 base;
 428
 429         base = load_reg(s, rn);
 430         if (rm == 13) {
 431             tcg_gen_addi_i32(base, base, stride);
 432         } else {
 433             TCGv_i32 index;
 434             index = load_reg(s, rm);
 435             tcg_gen_add_i32(base, base, index);
 436             tcg_temp_free_i32(index);
 437         }
 438         store_reg(s, rn, base);
 439     }
 440 }
 441
 442 static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
 443 {
 444     /* Neon load/store multiple structures */
 445     int nregs, interleave, spacing, reg, n;
 446     MemOp mop, align, endian;
 447     int mmu_idx = get_mem_index(s);
 448     int size = a->size;
 449     TCGv_i64 tmp64;
 450     TCGv_i32 addr;
 451
 452     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 453         return false;
 454     }
 455
 456     /* UNDEF accesses to D16-D31 if they don't exist */
 457     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 458         return false;
 459     }
 460     if (a->itype > 10) {
 461         return false;
 462     }
 463     /* Catch UNDEF cases for bad values of align field */
 464     switch (a->itype & 0xc) {
 465     case 4:
 466         if (a->align >= 2) {
 467             return false;
 468         }
 469         break;
 470     case 8:
 471         if (a->align == 3) {
 472             return false;
 473         }
 474         break;
 475     default:
 476         break;
 477     }
 478     nregs = neon_ls_element_type[a->itype].nregs;
 479     interleave = neon_ls_element_type[a->itype].interleave;
 480     spacing = neon_ls_element_type[a->itype].spacing;
 481     if (size == 3 && (interleave | spacing) != 1) {
 482         return false;
 483     }
 484
 485     if (!vfp_access_check(s)) {
 486         return true;
 487     }
 488
 489     /* For our purposes, bytes are always little-endian.  */
 490     endian = s->be_data;
 491     if (size == 0) {
 492         endian = MO_LE;
 493     }
 494
 495     /* Enforce alignment requested by the instruction */
 496     if (a->align) {
 497         align = pow2_align(a->align + 2); /* 4 ** a->align */
 498     } else {
 499         align = s->align_mem ? MO_ALIGN : 0;
 500     }
 501
 502     /*
 503      * Consecutive little-endian elements from a single register
 504      * can be promoted to a larger little-endian operation.
 505      */
 506     if (interleave == 1 && endian == MO_LE) {
 507         /* Retain any natural alignment. */
 508         if (align == MO_ALIGN) {
 509             align = pow2_align(size);
 510         }
 511         size = 3;
 512     }
 513
 514     tmp64 = tcg_temp_new_i64();
 515     addr = tcg_temp_new_i32();
 516     load_reg_var(s, addr, a->rn);
 517
 518     mop = endian | size | align;
 519     for (reg = 0; reg < nregs; reg++) {
 520         for (n = 0; n < 8 >> size; n++) {
 521             int xs;
 522             for (xs = 0; xs < interleave; xs++) {
 523                 int tt = a->vd + reg + spacing * xs;
 524
 525                 if (a->l) {
 526                     gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop);
 527                     neon_store_element64(tt, n, size, tmp64);
 528                 } else {
 529                     neon_load_element64(tmp64, tt, n, size);
 530                     gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop);
 531                 }
 532                 tcg_gen_addi_i32(addr, addr, 1 << size);
 533
 534                 /* Subsequent memory operations inherit alignment */
 535                 mop &= ~MO_AMASK;
 536             }
 537         }
 538     }
 539     tcg_temp_free_i32(addr);
 540     tcg_temp_free_i64(tmp64);
 541
 542     gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
 543     return true;
 544 }
 545
 546 static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
 547 {
 548     /* Neon load single structure to all lanes */
 549     int reg, stride, vec_size;
 550     int vd = a->vd;
 551     int size = a->size;
 552     int nregs = a->n + 1;
 553     TCGv_i32 addr, tmp;
 554     MemOp mop, align;
 555
 556     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 557         return false;
 558     }
 559
 560     /* UNDEF accesses to D16-D31 if they don't exist */
 561     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 562         return false;
 563     }
 564
 565     align = 0;
 566     if (size == 3) {
 567         if (nregs != 4 || a->a == 0) {
 568             return false;
 569         }
 570         /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
 571         size = MO_32;
 572         align = MO_ALIGN_16;
 573     } else if (a->a) {
 574         switch (nregs) {
 575         case 1:
 576             if (size == 0) {
 577                 return false;
 578             }
 579             align = MO_ALIGN;
 580             break;
 581         case 2:
 582             align = pow2_align(size + 1);
 583             break;
 584         case 3:
 585             return false;
 586         case 4:
 587             align = pow2_align(size + 2);
 588             break;
 589         default:
 590             g_assert_not_reached();
 591         }
 592     }
 593
 594     if (!vfp_access_check(s)) {
 595         return true;
 596     }
 597
 598     /*
 599      * VLD1 to all lanes: T bit indicates how many Dregs to write.
 600      * VLD2/3/4 to all lanes: T bit indicates register stride.
 601      */
 602     stride = a->t ? 2 : 1;
 603     vec_size = nregs == 1 ? stride * 8 : 8;
 604     mop = size | align;
 605     tmp = tcg_temp_new_i32();
 606     addr = tcg_temp_new_i32();
 607     load_reg_var(s, addr, a->rn);
 608     for (reg = 0; reg < nregs; reg++) {
 609         gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop);
 610         if ((vd & 1) && vec_size == 16) {
 611             /*
 612              * We cannot write 16 bytes at once because the
 613              * destination is unaligned.
 614              */
 615             tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
 616                                  8, 8, tmp);
 617             tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1),
 618                              neon_full_reg_offset(vd), 8, 8);
 619         } else {
 620             tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
 621                                  vec_size, vec_size, tmp);
 622         }
 623         tcg_gen_addi_i32(addr, addr, 1 << size);
 624         vd += stride;
 625
 626         /* Subsequent memory operations inherit alignment */
 627         mop &= ~MO_AMASK;
 628     }
 629     tcg_temp_free_i32(tmp);
 630     tcg_temp_free_i32(addr);
 631
 632     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);
 633
 634     return true;
 635 }
 636
 637 static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
 638 {
 639     /* Neon load/store single structure to one lane */
 640     int reg;
 641     int nregs = a->n + 1;
 642     int vd = a->vd;
 643     TCGv_i32 addr, tmp;
 644     MemOp mop;
 645
 646     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 647         return false;
 648     }
 649
 650     /* UNDEF accesses to D16-D31 if they don't exist */
 651     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 652         return false;
 653     }
 654
 655     /* Catch the UNDEF cases. This is unavoidably a bit messy. */
 656     switch (nregs) {
 657     case 1:
 658         if (a->stride != 1) {
 659             return false;
 660         }
 661         if (((a->align & (1 << a->size)) != 0) ||
 662             (a->size == 2 && (a->align == 1 || a->align == 2))) {
 663             return false;
 664         }
 665         break;
 666     case 2:
 667         if (a->size == 2 && (a->align & 2) != 0) {
 668             return false;
 669         }
 670         break;
 671     case 3:
 672         if (a->align != 0) {
 673             return false;
 674         }
 675         break;
 676     case 4:
 677         if (a->size == 2 && a->align == 3) {
 678             return false;
 679         }
 680         break;
 681     default:
 682         g_assert_not_reached();
 683     }
 684     if ((vd + a->stride * (nregs - 1)) > 31) {
 685         /*
 686          * Attempts to write off the end of the register file are
 687          * UNPREDICTABLE; we choose to UNDEF because otherwise we would
 688          * access off the end of the array that holds the register data.
 689          */
 690         return false;
 691     }
 692
 693     if (!vfp_access_check(s)) {
 694         return true;
 695     }
 696
 697     /* Pick up SCTLR settings */
 698     mop = finalize_memop(s, a->size);
 699
 700     if (a->align) {
 701         MemOp align_op;
 702
 703         switch (nregs) {
 704         case 1:
 705             /* For VLD1, use natural alignment. */
 706             align_op = MO_ALIGN;
 707             break;
 708         case 2:
 709             /* For VLD2, use double alignment. */
 710             align_op = pow2_align(a->size + 1);
 711             break;
 712         case 4:
 713             if (a->size == MO_32) {
 714                 /*
 715                  * For VLD4.32, align = 1 is double alignment, align = 2 is
 716                  * quad alignment; align = 3 is rejected above.
 717                  */
 718                 align_op = pow2_align(a->size + a->align);
 719             } else {
 720                 /* For VLD4.8 and VLD.16, we want quad alignment. */
 721                 align_op = pow2_align(a->size + 2);
 722             }
 723             break;
 724         default:
 725             /* For VLD3, the alignment field is zero and rejected above. */
 726             g_assert_not_reached();
 727         }
 728
 729         mop = (mop & ~MO_AMASK) | align_op;
 730     }
 731
 732     tmp = tcg_temp_new_i32();
 733     addr = tcg_temp_new_i32();
 734     load_reg_var(s, addr, a->rn);
 735
 736     for (reg = 0; reg < nregs; reg++) {
 737         if (a->l) {
 738             gen_aa32_ld_internal_i32(s, tmp, addr, get_mem_index(s), mop);
 739             neon_store_element(vd, a->reg_idx, a->size, tmp);
 740         } else { /* Store */
 741             neon_load_element(tmp, vd, a->reg_idx, a->size);
 742             gen_aa32_st_internal_i32(s, tmp, addr, get_mem_index(s), mop);
 743         }
 744         vd += a->stride;
 745         tcg_gen_addi_i32(addr, addr, 1 << a->size);
 746
 747         /* Subsequent memory operations inherit alignment */
 748         mop &= ~MO_AMASK;
 749     }
 750     tcg_temp_free_i32(addr);
 751     tcg_temp_free_i32(tmp);
 752
 753     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);
 754
 755     return true;
 756 }
 757
 758 static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
 759 {
 760     int vec_size = a->q ? 16 : 8;
 761     int rd_ofs = neon_full_reg_offset(a->vd);
 762     int rn_ofs = neon_full_reg_offset(a->vn);
 763     int rm_ofs = neon_full_reg_offset(a->vm);
 764
 765     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 766         return false;
 767     }
 768
 769     /* UNDEF accesses to D16-D31 if they don't exist. */
 770     if (!dc_isar_feature(aa32_simd_r32, s) &&
 771         ((a->vd | a->vn | a->vm) & 0x10)) {
 772         return false;
 773     }
 774
 775     if ((a->vn | a->vm | a->vd) & a->q) {
 776         return false;
 777     }
 778
 779     if (!vfp_access_check(s)) {
 780         return true;
 781     }
 782
 783     fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
 784     return true;
 785 }
 786
 787 #define DO_3SAME(INSN, FUNC)                                            \
 788     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 789     {                                                                   \
 790         return do_3same(s, a, FUNC);                                    \
 791     }
 792
 793 DO_3SAME(VADD, tcg_gen_gvec_add)
 794 DO_3SAME(VSUB, tcg_gen_gvec_sub)
 795 DO_3SAME(VAND, tcg_gen_gvec_and)
 796 DO_3SAME(VBIC, tcg_gen_gvec_andc)
 797 DO_3SAME(VORR, tcg_gen_gvec_or)
 798 DO_3SAME(VORN, tcg_gen_gvec_orc)
 799 DO_3SAME(VEOR, tcg_gen_gvec_xor)
 800 DO_3SAME(VSHL_S, gen_gvec_sshl)
 801 DO_3SAME(VSHL_U, gen_gvec_ushl)
 802 DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
 803 DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
 804 DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
 805 DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)
 806
 807 /* These insns are all gvec_bitsel but with the inputs in various orders. */
 808 #define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
 809     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 810                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 811                                 uint32_t oprsz, uint32_t maxsz)         \
 812     {                                                                   \
 813         tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
 814     }                                                                   \
 815     DO_3SAME(INSN, gen_##INSN##_3s)
 816
 817 DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
 818 DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
 819 DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
 820
 821 #define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
 822     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 823     {                                                                   \
 824         if (a->size == 3) {                                             \
 825             return false;                                               \
 826         }                                                               \
 827         return do_3same(s, a, FUNC);                                    \
 828     }
 829
 830 DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
 831 DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
 832 DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
 833 DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
 834 DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
 835 DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
 836 DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
 837 DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
 838 DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
 839 DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
 840 DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
 841 DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)
 842
 843 #define DO_3SAME_CMP(INSN, COND)                                        \
 844     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 845                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 846                                 uint32_t oprsz, uint32_t maxsz)         \
 847     {                                                                   \
 848         tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
 849     }                                                                   \
 850     DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)
 851
 852 DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
 853 DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
 854 DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
 855 DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
 856 DO_3SAME_CMP(VCEQ, TCG_COND_EQ)
 857
 858 #define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
 859     static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
 860                          uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
 861     {                                                                      \
 862         tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
 863     }
 864
 865 WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)
 866
 867 static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
 868 {
 869     if (a->size != 0) {
 870         return false;
 871     }
 872     return do_3same(s, a, gen_VMUL_p_3s);
 873 }
 874
 875 #define DO_VQRDMLAH(INSN, FUNC)                                         \
 876     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 877     {                                                                   \
 878         if (!dc_isar_feature(aa32_rdm, s)) {                            \
 879             return false;                                               \
 880         }                                                               \
 881         if (a->size != 1 && a->size != 2) {                             \
 882             return false;                                               \
 883         }                                                               \
 884         return do_3same(s, a, FUNC);                                    \
 885     }
 886
 887 DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
 888 DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)
 889
 890 #define DO_SHA1(NAME, FUNC)                                             \
 891     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
 892     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
 893     {                                                                   \
 894         if (!dc_isar_feature(aa32_sha1, s)) {                           \
 895             return false;                                               \
 896         }                                                               \
 897         return do_3same(s, a, gen_##NAME##_3s);                         \
 898     }
 899
 900 DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
 901 DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
 902 DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
 903 DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)
 904
 905 #define DO_SHA2(NAME, FUNC)                                             \
 906     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
 907     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
 908     {                                                                   \
 909         if (!dc_isar_feature(aa32_sha2, s)) {                           \
 910             return false;                                               \
 911         }                                                               \
 912         return do_3same(s, a, gen_##NAME##_3s);                         \
 913     }
 914
 915 DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
 916 DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
 917 DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)
 918
 919 #define DO_3SAME_64(INSN, FUNC)                                         \
 920     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 921                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 922                                 uint32_t oprsz, uint32_t maxsz)         \
 923     {                                                                   \
 924         static const GVecGen3 op = { .fni8 = FUNC };                    \
 925         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op);      \
 926     }                                                                   \
 927     DO_3SAME(INSN, gen_##INSN##_3s)
 928
 929 #define DO_3SAME_64_ENV(INSN, FUNC)                                     \
 930     static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)    \
 931     {                                                                   \
 932         FUNC(d, cpu_env, n, m);                                         \
 933     }                                                                   \
 934     DO_3SAME_64(INSN, gen_##INSN##_elt)
 935
 936 DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
 937 DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
 938 DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
 939 DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
 940 DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
 941 DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)
 942
 943 #define DO_3SAME_32(INSN, FUNC)                                         \
 944     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 945                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 946                                 uint32_t oprsz, uint32_t maxsz)         \
 947     {                                                                   \
 948         static const GVecGen3 ops[4] = {                                \
 949             { .fni4 = gen_helper_neon_##FUNC##8 },                      \
 950             { .fni4 = gen_helper_neon_##FUNC##16 },                     \
 951             { .fni4 = gen_helper_neon_##FUNC##32 },                     \
 952             { 0 },                                                      \
 953         };                                                              \
 954         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
 955     }                                                                   \
 956     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 957     {                                                                   \
 958         if (a->size > 2) {                                              \
 959             return false;                                               \
 960         }                                                               \
 961         return do_3same(s, a, gen_##INSN##_3s);                         \
 962     }
 963
 964 /*
 965  * Some helper functions need to be passed the cpu_env. In order
 966  * to use those with the gvec APIs like tcg_gen_gvec_3() we need
 967  * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
 968  * and which call a NeonGenTwoOpEnvFn().
 969  */
 970 #define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
 971     static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)            \
 972     {                                                                   \
 973         FUNC(d, cpu_env, n, m);                                         \
 974     }
 975
 976 #define DO_3SAME_32_ENV(INSN, FUNC)                                     \
 977     WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8);        \
 978     WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16);      \
 979     WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32);      \
 980     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 981                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 982                                 uint32_t oprsz, uint32_t maxsz)         \
 983     {                                                                   \
 984         static const GVecGen3 ops[4] = {                                \
 985             { .fni4 = gen_##INSN##_tramp8 },                            \
 986             { .fni4 = gen_##INSN##_tramp16 },                           \
 987             { .fni4 = gen_##INSN##_tramp32 },                           \
 988             { 0 },                                                      \
 989         };                                                              \
 990         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
 991     }                                                                   \
 992     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 993     {                                                                   \
 994         if (a->size > 2) {                                              \
 995             return false;                                               \
 996         }                                                               \
 997         return do_3same(s, a, gen_##INSN##_3s);                         \
 998     }
 999
/*
 * 3-same insns built with DO_3SAME_32: the second argument is the
 * helper-name stem to which 8/16/32 is appended, and the helpers are
 * used directly as .fni4 callbacks.
 */
DO_3SAME_32(VHADD_S, hadd_s)
DO_3SAME_32(VHADD_U, hadd_u)
DO_3SAME_32(VHSUB_S, hsub_s)
DO_3SAME_32(VHSUB_U, hsub_u)
DO_3SAME_32(VRHADD_S, rhadd_s)
DO_3SAME_32(VRHADD_U, rhadd_u)
DO_3SAME_32(VRSHL_S, rshl_s)
DO_3SAME_32(VRSHL_U, rshl_u)

/*
 * 3-same insns built with DO_3SAME_32_ENV: as above, but the helpers
 * are reached through trampolines that pass cpu_env.
 */
DO_3SAME_32_ENV(VQSHL_S, qshl_s)
DO_3SAME_32_ENV(VQSHL_U, qshl_u)
DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)
1013
1014 static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
1015 {
1016     /* Operations handled pairwise 32 bits at a time */
1017     TCGv_i32 tmp, tmp2, tmp3;
1018
1019     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1020         return false;
1021     }
1022
1023     /* UNDEF accesses to D16-D31 if they don't exist. */
1024     if (!dc_isar_feature(aa32_simd_r32, s) &&
1025         ((a->vd | a->vn | a->vm) & 0x10)) {
1026         return false;
1027     }
1028
1029     if (a->size == 3) {
1030         return false;
1031     }
1032
1033     if (!vfp_access_check(s)) {
1034         return true;
1035     }
1036
1037     assert(a->q == 0); /* enforced by decode patterns */
1038
1039     /*
1040      * Note that we have to be careful not to clobber the source operands
1041      * in the "vm == vd" case by storing the result of the first pass too
1042      * early. Since Q is 0 there are always just two passes, so instead
1043      * of a complicated loop over each pass we just unroll.
1044      */
1045     tmp = tcg_temp_new_i32();
1046     tmp2 = tcg_temp_new_i32();
1047     tmp3 = tcg_temp_new_i32();
1048
1049     read_neon_element32(tmp, a->vn, 0, MO_32);
1050     read_neon_element32(tmp2, a->vn, 1, MO_32);
1051     fn(tmp, tmp, tmp2);
1052
1053     read_neon_element32(tmp3, a->vm, 0, MO_32);
1054     read_neon_element32(tmp2, a->vm, 1, MO_32);
1055     fn(tmp3, tmp3, tmp2);
1056
1057     write_neon_element32(tmp, a->vd, 0, MO_32);
1058     write_neon_element32(tmp3, a->vd, 1, MO_32);
1059
1060     tcg_temp_free_i32(tmp);
1061     tcg_temp_free_i32(tmp2);
1062     tcg_temp_free_i32(tmp3);
1063     return true;
1064 }
1065
/*
 * Define trans_<INSN>_3s for a pairwise integer insn: pick the
 * gen_helper_neon_<func>{8,16,32} generator matching a->size and let
 * do_3same_pair() do the work. Sizes above 2 are not handled.
 */
#define DO_3SAME_PAIR(INSN, func)                                       \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        static NeonGenTwoOpFn * const by_size[] = {                     \
            gen_helper_neon_##func##8,                                  \
            gen_helper_neon_##func##16,                                 \
            gen_helper_neon_##func##32,                                 \
        };                                                              \
        return a->size <= 2 && do_3same_pair(s, a, by_size[a->size]);   \
    }
1079
/*
 * 32-bit pairwise ops end up the same as the elementwise versions:
 * do_3same_pair() already passes the two adjacent 32-bit elements as
 * the operands of fn, so the gen_helper_neon_<func>32 names that
 * DO_3SAME_PAIR pastes together are aliased to the plain TCG ops.
 */
#define gen_helper_neon_pmax_s32  tcg_gen_smax_i32
#define gen_helper_neon_pmax_u32  tcg_gen_umax_i32
#define gen_helper_neon_pmin_s32  tcg_gen_smin_i32
#define gen_helper_neon_pmin_u32  tcg_gen_umin_i32
#define gen_helper_neon_padd_u32  tcg_gen_add_i32

/* Pairwise integer insns; these must follow the aliases above. */
DO_3SAME_PAIR(VPMAX_S, pmax_s)
DO_3SAME_PAIR(VPMIN_S, pmin_s)
DO_3SAME_PAIR(VPMAX_U, pmax_u)
DO_3SAME_PAIR(VPMIN_U, pmin_u)
DO_3SAME_PAIR(VPADD, padd_u)
1092
1093 #define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
1094     WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16);    \
1095     WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32);    \
1096     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
1097                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
1098                                 uint32_t oprsz, uint32_t maxsz)         \
1099     {                                                                   \
1100         static const GVecGen3 ops[2] = {                                \
1101             { .fni4 = gen_##INSN##_tramp16 },                           \
1102             { .fni4 = gen_##INSN##_tramp32 },                           \
1103         };                                                              \
1104         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
1105     }                                                                   \
1106     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
1107     {                                                                   \
1108         if (a->size != 1 && a->size != 2) {                             \
1109             return false;                                               \
1110         }                                                               \
1111         return do_3same(s, a, gen_##INSN##_3s);                         \
1112     }
1113
1114 DO_3SAME_VQDMULH(VQDMULH, qdmulh)
1115 DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)
1116
/*
 * Define WRAPNAME with the gvec 3-operand expander signature. It
 * obtains the float status pointer selected by FPST, emits the
 * out-of-line helper FUNC through tcg_gen_gvec_3_ptr() with a data
 * value of 0, and then frees the pointer temporary.
 */
#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC)                              \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rn_ofs, uint32_t rm_ofs,              \
                         uint32_t oprsz, uint32_t maxsz)                \
    {                                                                   \
        TCGv_ptr fp_status = fpstatus_ptr(FPST);                        \
                                                                        \
        tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fp_status,           \
                           oprsz, maxsz, 0, FUNC);                      \
        tcg_temp_free_ptr(fp_status);                                   \
    }
1127
1128 #define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC)                                 \
1129     WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC)                 \
1130     WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC)             \
1131     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
1132     {                                                                   \
1133         if (a->size == MO_16) {                                         \
1134             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
1135                 return false;                                           \
1136             }                                                           \
1137             return do_3same(s, a, gen_##INSN##_fp16_3s);                \
1138         }                                                               \
1139         return do_3same(s, a, gen_##INSN##_fp32_3s);                    \
1140     }
1141
1142
1143 DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
1144 DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
1145 DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
1146 DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
1147 DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
1148 DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
1149 DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
1150 DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
1151 DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
1152 DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
1153 DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
1154 DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
1155 DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
1156 DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
1157 DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
1158 DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
1159 DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)
1160
/*
 * gvec wrappers for VMAXNM and VMINNM. These are created directly
 * rather than through DO_3S_FP_GVEC because the trans functions for
 * these two insns are written by hand with an additional
 * ARM_FEATURE_V8 check.
 */
WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)
1165
1166 static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
1167 {
1168     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1169         return false;
1170     }
1171
1172     if (a->size == MO_16) {
1173         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1174             return false;
1175         }
1176         return do_3same(s, a, gen_VMAXNM_fp16_3s);
1177     }
1178     return do_3same(s, a, gen_VMAXNM_fp32_3s);
1179 }
1180
1181 static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
1182 {
1183     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1184         return false;
1185     }
1186
1187     if (a->size == MO_16) {
1188         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1189             return false;
1190         }
1191         return do_3same(s, a, gen_VMINNM_fp16_3s);
1192     }
1193     return do_3same(s, a, gen_VMINNM_fp32_3s);
1194 }
1195
1196 static bool do_3same_fp_pair(DisasContext *s, arg_3same *a,
1197                              gen_helper_gvec_3_ptr *fn)
1198 {
1199     /* FP pairwise operations */
1200     TCGv_ptr fpstatus;
1201
1202     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1203         return false;
1204     }
1205
1206     /* UNDEF accesses to D16-D31 if they don't exist. */
1207     if (!dc_isar_feature(aa32_simd_r32, s) &&
1208         ((a->vd | a->vn | a->vm) & 0x10)) {
1209         return false;
1210     }
1211
1212     if (!vfp_access_check(s)) {
1213         return true;
1214     }
1215
1216     assert(a->q == 0); /* enforced by decode patterns */
1217
1218
1219     fpstatus = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1220     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
1221                        vfp_reg_offset(1, a->vn),
1222                        vfp_reg_offset(1, a->vm),
1223                        fpstatus, 8, 8, 0, fn);
1224     tcg_temp_free_ptr(fpstatus);
1225
1226     return true;
1227 }
1228
1229 /*
1230  * For all the functions using this macro, size == 1 means fp16,
1231  * which is an architecture extension we don't implement yet.
1232  */
1233 #define DO_3S_FP_PAIR(INSN,FUNC)                                    \
1234     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
1235     {                                                               \
1236         if (a->size == MO_16) {                                     \
1237             if (!dc_isar_feature(aa32_fp16_arith, s)) {             \
1238                 return false;                                       \
1239             }                                                       \
1240             return do_3same_fp_pair(s, a, FUNC##h);                 \
1241         }                                                           \
1242         return do_3same_fp_pair(s, a, FUNC##s);                     \
1243     }
1244
1245 DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd)
1246 DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax)
1247 DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin)
1248
1249 static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
1250 {
1251     /* Handle a 2-reg-shift insn which can be vectorized. */
1252     int vec_size = a->q ? 16 : 8;
1253     int rd_ofs = neon_full_reg_offset(a->vd);
1254     int rm_ofs = neon_full_reg_offset(a->vm);
1255
1256     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1257         return false;
1258     }
1259
1260     /* UNDEF accesses to D16-D31 if they don't exist. */
1261     if (!dc_isar_feature(aa32_simd_r32, s) &&
1262         ((a->vd | a->vm) & 0x10)) {
1263         return false;
1264     }
1265
1266     if ((a->vm | a->vd) & a->q) {
1267         return false;
1268     }
1269
1270     if (!vfp_access_check(s)) {
1271         return true;
1272     }
1273
1274     fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
1275     return true;
1276 }
1277
1278 #define DO_2SH(INSN, FUNC)                                              \
1279     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1280     {                                                                   \
1281         return do_vector_2sh(s, a, FUNC);                               \
1282     }                                                                   \
1283
1284 DO_2SH(VSHL, tcg_gen_gvec_shli)
1285 DO_2SH(VSLI, gen_gvec_sli)
1286 DO_2SH(VSRI, gen_gvec_sri)
1287 DO_2SH(VSRA_S, gen_gvec_ssra)
1288 DO_2SH(VSRA_U, gen_gvec_usra)
1289 DO_2SH(VRSHR_S, gen_gvec_srshr)
1290 DO_2SH(VRSHR_U, gen_gvec_urshr)
1291 DO_2SH(VRSRA_S, gen_gvec_srsra)
1292 DO_2SH(VRSRA_U, gen_gvec_ursra)
1293
1294 static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
1295 {
1296     /* Signed shift out of range results in all-sign-bits */
1297     a->shift = MIN(a->shift, (8 << a->size) - 1);
1298     return do_vector_2sh(s, a, tcg_gen_gvec_sari);
1299 }
1300
1301 static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
1302                             int64_t shift, uint32_t oprsz, uint32_t maxsz)
1303 {
1304     tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
1305 }
1306
1307 static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
1308 {
1309     /* Shift out of range is architecturally valid and results in zero. */
1310     if (a->shift >= (8 << a->size)) {
1311         return do_vector_2sh(s, a, gen_zero_rd_2sh);
1312     } else {
1313         return do_vector_2sh(s, a, tcg_gen_gvec_shri);
1314     }
1315 }
1316
1317 static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
1318                              NeonGenTwo64OpEnvFn *fn)
1319 {
1320     /*
1321      * 2-reg-and-shift operations, size == 3 case, where the
1322      * function needs to be passed cpu_env.
1323      */
1324     TCGv_i64 constimm;
1325     int pass;
1326
1327     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1328         return false;
1329     }
1330
1331     /* UNDEF accesses to D16-D31 if they don't exist. */
1332     if (!dc_isar_feature(aa32_simd_r32, s) &&
1333         ((a->vd | a->vm) & 0x10)) {
1334         return false;
1335     }
1336
1337     if ((a->vm | a->vd) & a->q) {
1338         return false;
1339     }
1340
1341     if (!vfp_access_check(s)) {
1342         return true;
1343     }
1344
1345     /*
1346      * To avoid excessive duplication of ops we implement shift
1347      * by immediate using the variable shift operations.
1348      */
1349     constimm = tcg_constant_i64(dup_const(a->size, a->shift));
1350
1351     for (pass = 0; pass < a->q + 1; pass++) {
1352         TCGv_i64 tmp = tcg_temp_new_i64();
1353
1354         read_neon_element64(tmp, a->vm, pass, MO_64);
1355         fn(tmp, cpu_env, tmp, constimm);
1356         write_neon_element64(tmp, a->vd, pass, MO_64);
1357         tcg_temp_free_i64(tmp);
1358     }
1359     return true;
1360 }
1361
1362 static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
1363                              NeonGenTwoOpEnvFn *fn)
1364 {
1365     /*
1366      * 2-reg-and-shift operations, size < 3 case, where the
1367      * helper needs to be passed cpu_env.
1368      */
1369     TCGv_i32 constimm, tmp;
1370     int pass;
1371
1372     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1373         return false;
1374     }
1375
1376     /* UNDEF accesses to D16-D31 if they don't exist. */
1377     if (!dc_isar_feature(aa32_simd_r32, s) &&
1378         ((a->vd | a->vm) & 0x10)) {
1379         return false;
1380     }
1381
1382     if ((a->vm | a->vd) & a->q) {
1383         return false;
1384     }
1385
1386     if (!vfp_access_check(s)) {
1387         return true;
1388     }
1389
1390     /*
1391      * To avoid excessive duplication of ops we implement shift
1392      * by immediate using the variable shift operations.
1393      */
1394     constimm = tcg_constant_i32(dup_const(a->size, a->shift));
1395     tmp = tcg_temp_new_i32();
1396
1397     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1398         read_neon_element32(tmp, a->vm, pass, MO_32);
1399         fn(tmp, cpu_env, tmp, constimm);
1400         write_neon_element32(tmp, a->vd, pass, MO_32);
1401     }
1402     tcg_temp_free_i32(tmp);
1403     return true;
1404 }
1405
1406 #define DO_2SHIFT_ENV(INSN, FUNC)                                       \
1407     static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
1408     {                                                                   \
1409         return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
1410     }                                                                   \
1411     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1412     {                                                                   \
1413         static NeonGenTwoOpEnvFn * const fns[] = {                      \
1414             gen_helper_neon_##FUNC##8,                                  \
1415             gen_helper_neon_##FUNC##16,                                 \
1416             gen_helper_neon_##FUNC##32,                                 \
1417         };                                                              \
1418         assert(a->size < ARRAY_SIZE(fns));                              \
1419         return do_2shift_env_32(s, a, fns[a->size]);                    \
1420     }
1421
1422 DO_2SHIFT_ENV(VQSHLU, qshlu_s)
1423 DO_2SHIFT_ENV(VQSHL_U, qshl_u)
1424 DO_2SHIFT_ENV(VQSHL_S, qshl_s)
1425
1426 static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
1427                                 NeonGenTwo64OpFn *shiftfn,
1428                                 NeonGenNarrowEnvFn *narrowfn)
1429 {
1430     /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
1431     TCGv_i64 constimm, rm1, rm2;
1432     TCGv_i32 rd;
1433
1434     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1435         return false;
1436     }
1437
1438     /* UNDEF accesses to D16-D31 if they don't exist. */
1439     if (!dc_isar_feature(aa32_simd_r32, s) &&
1440         ((a->vd | a->vm) & 0x10)) {
1441         return false;
1442     }
1443
1444     if (a->vm & 1) {
1445         return false;
1446     }
1447
1448     if (!vfp_access_check(s)) {
1449         return true;
1450     }
1451
1452     /*
1453      * This is always a right shift, and the shiftfn is always a
1454      * left-shift helper, which thus needs the negated shift count.
1455      */
1456     constimm = tcg_constant_i64(-a->shift);
1457     rm1 = tcg_temp_new_i64();
1458     rm2 = tcg_temp_new_i64();
1459     rd = tcg_temp_new_i32();
1460
1461     /* Load both inputs first to avoid potential overwrite if rm == rd */
1462     read_neon_element64(rm1, a->vm, 0, MO_64);
1463     read_neon_element64(rm2, a->vm, 1, MO_64);
1464
1465     shiftfn(rm1, rm1, constimm);
1466     narrowfn(rd, cpu_env, rm1);
1467     write_neon_element32(rd, a->vd, 0, MO_32);
1468
1469     shiftfn(rm2, rm2, constimm);
1470     narrowfn(rd, cpu_env, rm2);
1471     write_neon_element32(rd, a->vd, 1, MO_32);
1472
1473     tcg_temp_free_i32(rd);
1474     tcg_temp_free_i64(rm1);
1475     tcg_temp_free_i64(rm2);
1476
1477     return true;
1478 }
1479
1480 static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
1481                                 NeonGenTwoOpFn *shiftfn,
1482                                 NeonGenNarrowEnvFn *narrowfn)
1483 {
1484     /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
1485     TCGv_i32 constimm, rm1, rm2, rm3, rm4;
1486     TCGv_i64 rtmp;
1487     uint32_t imm;
1488
1489     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1490         return false;
1491     }
1492
1493     /* UNDEF accesses to D16-D31 if they don't exist. */
1494     if (!dc_isar_feature(aa32_simd_r32, s) &&
1495         ((a->vd | a->vm) & 0x10)) {
1496         return false;
1497     }
1498
1499     if (a->vm & 1) {
1500         return false;
1501     }
1502
1503     if (!vfp_access_check(s)) {
1504         return true;
1505     }
1506
1507     /*
1508      * This is always a right shift, and the shiftfn is always a
1509      * left-shift helper, which thus needs the negated shift count
1510      * duplicated into each lane of the immediate value.
1511      */
1512     if (a->size == 1) {
1513         imm = (uint16_t)(-a->shift);
1514         imm |= imm << 16;
1515     } else {
1516         /* size == 2 */
1517         imm = -a->shift;
1518     }
1519     constimm = tcg_constant_i32(imm);
1520
1521     /* Load all inputs first to avoid potential overwrite */
1522     rm1 = tcg_temp_new_i32();
1523     rm2 = tcg_temp_new_i32();
1524     rm3 = tcg_temp_new_i32();
1525     rm4 = tcg_temp_new_i32();
1526     read_neon_element32(rm1, a->vm, 0, MO_32);
1527     read_neon_element32(rm2, a->vm, 1, MO_32);
1528     read_neon_element32(rm3, a->vm, 2, MO_32);
1529     read_neon_element32(rm4, a->vm, 3, MO_32);
1530     rtmp = tcg_temp_new_i64();
1531
1532     shiftfn(rm1, rm1, constimm);
1533     shiftfn(rm2, rm2, constimm);
1534
1535     tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
1536     tcg_temp_free_i32(rm2);
1537
1538     narrowfn(rm1, cpu_env, rtmp);
1539     write_neon_element32(rm1, a->vd, 0, MO_32);
1540     tcg_temp_free_i32(rm1);
1541
1542     shiftfn(rm3, rm3, constimm);
1543     shiftfn(rm4, rm4, constimm);
1544
1545     tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
1546     tcg_temp_free_i32(rm4);
1547
1548     narrowfn(rm3, cpu_env, rtmp);
1549     tcg_temp_free_i64(rtmp);
1550     write_neon_element32(rm3, a->vd, 1, MO_32);
1551     tcg_temp_free_i32(rm3);
1552     return true;
1553 }
1554
/*
 * Define trans_<INSN>_2sh for a narrowing shift. SHIFTFN and NARROWFN
 * are passed through unchanged as the shiftfn and narrowfn arguments:
 * DO_2SN_64 uses do_2shift_narrow_64(), DO_2SN_32 uses
 * do_2shift_narrow_32().
 */
#define DO_2SN_64(INSN, SHIFTFN, NARROWFN)                              \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_2shift_narrow_64(s, a, SHIFTFN, NARROWFN);            \
    }
#define DO_2SN_32(INSN, SHIFTFN, NARROWFN)                              \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_2shift_narrow_32(s, a, SHIFTFN, NARROWFN);            \
    }
1565
/*
 * Adapters used as the narrowfn argument of the DO_2SN_* macros. The
 * env parameter is accepted but never used by any of the three.
 */

/* Narrow a 64-bit value to 32 bits by extracting its low half. */
static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    tcg_gen_extrl_i64_i32(dest, src);
}

/* Forward to gen_helper_neon_narrow_u16(), dropping env. */
static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    gen_helper_neon_narrow_u16(dest, src);
}

/* Forward to gen_helper_neon_narrow_u8(), dropping env. */
static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    gen_helper_neon_narrow_u8(dest, src);
}
1580
/*
 * Narrowing-shift insns. Each line pairs the generator passed as
 * shiftfn with the function passed as narrowfn. The _64 forms go
 * through do_2shift_narrow_64(); the _32 and _16 forms go through
 * do_2shift_narrow_32().
 */

/* VSHRN */
DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)

/* VRSHRN */
DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)

/* VQSHRUN */
DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)

/* VQRSHRUN */
DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)

/* VQSHRN, signed */
DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)

/* VQRSHRN, signed */
DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)

/* VQSHRN, unsigned */
DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)

/* VQRSHRN, unsigned */
DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1611
1612 static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
1613                          NeonGenWidenFn *widenfn, bool u)
1614 {
1615     TCGv_i64 tmp;
1616     TCGv_i32 rm0, rm1;
1617     uint64_t widen_mask = 0;
1618
1619     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1620         return false;
1621     }
1622
1623     /* UNDEF accesses to D16-D31 if they don't exist. */
1624     if (!dc_isar_feature(aa32_simd_r32, s) &&
1625         ((a->vd | a->vm) & 0x10)) {
1626         return false;
1627     }
1628
1629     if (a->vd & 1) {
1630         return false;
1631     }
1632
1633     if (!vfp_access_check(s)) {
1634         return true;
1635     }
1636
1637     /*
1638      * This is a widen-and-shift operation. The shift is always less
1639      * than the width of the source type, so after widening the input
1640      * vector we can simply shift the whole 64-bit widened register,
1641      * and then clear the potential overflow bits resulting from left
1642      * bits of the narrow input appearing as right bits of the left
1643      * neighbour narrow input. Calculate a mask of bits to clear.
1644      */
1645     if ((a->shift != 0) && (a->size < 2 || u)) {
1646         int esize = 8 << a->size;
1647         widen_mask = MAKE_64BIT_MASK(0, esize);
1648         widen_mask >>= esize - a->shift;
1649         widen_mask = dup_const(a->size + 1, widen_mask);
1650     }
1651
1652     rm0 = tcg_temp_new_i32();
1653     rm1 = tcg_temp_new_i32();
1654     read_neon_element32(rm0, a->vm, 0, MO_32);
1655     read_neon_element32(rm1, a->vm, 1, MO_32);
1656     tmp = tcg_temp_new_i64();
1657
1658     widenfn(tmp, rm0);
1659     tcg_temp_free_i32(rm0);
1660     if (a->shift != 0) {
1661         tcg_gen_shli_i64(tmp, tmp, a->shift);
1662         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1663     }
1664     write_neon_element64(tmp, a->vd, 0, MO_64);
1665
1666     widenfn(tmp, rm1);
1667     tcg_temp_free_i32(rm1);
1668     if (a->shift != 0) {
1669         tcg_gen_shli_i64(tmp, tmp, a->shift);
1670         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1671     }
1672     write_neon_element64(tmp, a->vd, 1, MO_64);
1673     tcg_temp_free_i64(tmp);
1674     return true;
1675 }
1676
1677 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1678 {
1679     static NeonGenWidenFn * const widenfn[] = {
1680         gen_helper_neon_widen_s8,
1681         gen_helper_neon_widen_s16,
1682         tcg_gen_ext_i32_i64,
1683     };
1684     return do_vshll_2sh(s, a, widenfn[a->size], false);
1685 }
1686
1687 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1688 {
1689     static NeonGenWidenFn * const widenfn[] = {
1690         gen_helper_neon_widen_u8,
1691         gen_helper_neon_widen_u16,
1692         tcg_gen_extu_i32_i64,
1693     };
1694     return do_vshll_2sh(s, a, widenfn[a->size], true);
1695 }
1696
1697 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1698                       gen_helper_gvec_2_ptr *fn)
1699 {
1700     /* FP operations in 2-reg-and-shift group */
1701     int vec_size = a->q ? 16 : 8;
1702     int rd_ofs = neon_full_reg_offset(a->vd);
1703     int rm_ofs = neon_full_reg_offset(a->vm);
1704     TCGv_ptr fpst;
1705
1706     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1707         return false;
1708     }
1709
1710     if (a->size == MO_16) {
1711         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1712             return false;
1713         }
1714     }
1715
1716     /* UNDEF accesses to D16-D31 if they don't exist. */
1717     if (!dc_isar_feature(aa32_simd_r32, s) &&
1718         ((a->vd | a->vm) & 0x10)) {
1719         return false;
1720     }
1721
1722     if ((a->vm | a->vd) & a->q) {
1723         return false;
1724     }
1725
1726     if (!vfp_access_check(s)) {
1727         return true;
1728     }
1729
1730     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1731     tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
1732     tcg_temp_free_ptr(fpst);
1733     return true;
1734 }
1735
/*
 * Define trans_<INSN>_2sh() as a thin wrapper that forwards to
 * do_fp_2sh() with FUNC as the out-of-line gvec helper.
 */
#define DO_FP_2SH(INSN, FUNC)                                           \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_fp_2sh(s, a, FUNC);                                   \
    }

/* VCVT forms using the gvec_vcvt_{sf,uf,fs,fu} helpers */
DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)

/* VCVT forms using the gvec_vcvt_{sh,uh,hs,hu} helpers */
DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
1751
1752 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1753                         GVecGen2iFn *fn)
1754 {
1755     uint64_t imm;
1756     int reg_ofs, vec_size;
1757
1758     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1759         return false;
1760     }
1761
1762     /* UNDEF accesses to D16-D31 if they don't exist. */
1763     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1764         return false;
1765     }
1766
1767     if (a->vd & a->q) {
1768         return false;
1769     }
1770
1771     if (!vfp_access_check(s)) {
1772         return true;
1773     }
1774
1775     reg_ofs = neon_full_reg_offset(a->vd);
1776     vec_size = a->q ? 16 : 8;
1777     imm = asimd_imm_const(a->imm, a->cmode, a->op);
1778
1779     fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1780     return true;
1781 }
1782
/*
 * VMOV (immediate) expander with the GVecGen2iFn signature expected by
 * do_1reg_imm(): duplicate the 64-bit constant @c across the destination.
 * @vece and @aofs are ignored; the duplication is always done with MO_64.
 */
static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
                        int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
}
1788
1789 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1790 {
1791     /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1792     GVecGen2iFn *fn;
1793
1794     if ((a->cmode & 1) && a->cmode < 12) {
1795         /* for op=1, the imm will be inverted, so BIC becomes AND. */
1796         fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1797     } else {
1798         /* There is one unallocated cmode/op combination in this space */
1799         if (a->cmode == 15 && a->op == 1) {
1800             return false;
1801         }
1802         fn = gen_VMOV_1r;
1803     }
1804     return do_1reg_imm(s, a, fn);
1805 }
1806
1807 static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1808                            NeonGenWidenFn *widenfn,
1809                            NeonGenTwo64OpFn *opfn,
1810                            int src1_mop, int src2_mop)
1811 {
1812     /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VAADW/VSUBW) */
1813     TCGv_i64 rn0_64, rn1_64, rm_64;
1814
1815     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1816         return false;
1817     }
1818
1819     /* UNDEF accesses to D16-D31 if they don't exist. */
1820     if (!dc_isar_feature(aa32_simd_r32, s) &&
1821         ((a->vd | a->vn | a->vm) & 0x10)) {
1822         return false;
1823     }
1824
1825     if (!opfn) {
1826         /* size == 3 case, which is an entirely different insn group */
1827         return false;
1828     }
1829
1830     if ((a->vd & 1) || (src1_mop == MO_UQ && (a->vn & 1))) {
1831         return false;
1832     }
1833
1834     if (!vfp_access_check(s)) {
1835         return true;
1836     }
1837
1838     rn0_64 = tcg_temp_new_i64();
1839     rn1_64 = tcg_temp_new_i64();
1840     rm_64 = tcg_temp_new_i64();
1841
1842     if (src1_mop >= 0) {
1843         read_neon_element64(rn0_64, a->vn, 0, src1_mop);
1844     } else {
1845         TCGv_i32 tmp = tcg_temp_new_i32();
1846         read_neon_element32(tmp, a->vn, 0, MO_32);
1847         widenfn(rn0_64, tmp);
1848         tcg_temp_free_i32(tmp);
1849     }
1850     if (src2_mop >= 0) {
1851         read_neon_element64(rm_64, a->vm, 0, src2_mop);
1852     } else {
1853         TCGv_i32 tmp = tcg_temp_new_i32();
1854         read_neon_element32(tmp, a->vm, 0, MO_32);
1855         widenfn(rm_64, tmp);
1856         tcg_temp_free_i32(tmp);
1857     }
1858
1859     opfn(rn0_64, rn0_64, rm_64);
1860
1861     /*
1862      * Load second pass inputs before storing the first pass result, to
1863      * avoid incorrect results if a narrow input overlaps with the result.
1864      */
1865     if (src1_mop >= 0) {
1866         read_neon_element64(rn1_64, a->vn, 1, src1_mop);
1867     } else {
1868         TCGv_i32 tmp = tcg_temp_new_i32();
1869         read_neon_element32(tmp, a->vn, 1, MO_32);
1870         widenfn(rn1_64, tmp);
1871         tcg_temp_free_i32(tmp);
1872     }
1873     if (src2_mop >= 0) {
1874         read_neon_element64(rm_64, a->vm, 1, src2_mop);
1875     } else {
1876         TCGv_i32 tmp = tcg_temp_new_i32();
1877         read_neon_element32(tmp, a->vm, 1, MO_32);
1878         widenfn(rm_64, tmp);
1879         tcg_temp_free_i32(tmp);
1880     }
1881
1882     write_neon_element64(rn0_64, a->vd, 0, MO_64);
1883
1884     opfn(rn1_64, rn1_64, rm_64);
1885     write_neon_element64(rn1_64, a->vd, 1, MO_64);
1886
1887     tcg_temp_free_i64(rn0_64);
1888     tcg_temp_free_i64(rn1_64);
1889     tcg_temp_free_i64(rm_64);
1890
1891     return true;
1892 }
1893
1894 #define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN)                        \
1895     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1896     {                                                                   \
1897         static NeonGenWidenFn * const widenfn[] = {                     \
1898             gen_helper_neon_widen_##S##8,                               \
1899             gen_helper_neon_widen_##S##16,                              \
1900             NULL, NULL,                                                 \
1901         };                                                              \
1902         static NeonGenTwo64OpFn * const addfn[] = {                     \
1903             gen_helper_neon_##OP##l_u16,                                \
1904             gen_helper_neon_##OP##l_u32,                                \
1905             tcg_gen_##OP##_i64,                                         \
1906             NULL,                                                       \
1907         };                                                              \
1908         int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1;          \
1909         return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size],   \
1910                               SRC1WIDE ? MO_UQ : narrow_mop,             \
1911                               narrow_mop);                              \
1912     }
1913
1914 DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN)
1915 DO_PREWIDEN(VADDL_U, u, add, false, 0)
1916 DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN)
1917 DO_PREWIDEN(VSUBL_U, u, sub, false, 0)
1918 DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN)
1919 DO_PREWIDEN(VADDW_U, u, add, true, 0)
1920 DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN)
1921 DO_PREWIDEN(VSUBW_U, u, sub, true, 0)
1922
1923 static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
1924                          NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
1925 {
1926     /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
1927     TCGv_i64 rn_64, rm_64;
1928     TCGv_i32 rd0, rd1;
1929
1930     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1931         return false;
1932     }
1933
1934     /* UNDEF accesses to D16-D31 if they don't exist. */
1935     if (!dc_isar_feature(aa32_simd_r32, s) &&
1936         ((a->vd | a->vn | a->vm) & 0x10)) {
1937         return false;
1938     }
1939
1940     if (!opfn || !narrowfn) {
1941         /* size == 3 case, which is an entirely different insn group */
1942         return false;
1943     }
1944
1945     if ((a->vn | a->vm) & 1) {
1946         return false;
1947     }
1948
1949     if (!vfp_access_check(s)) {
1950         return true;
1951     }
1952
1953     rn_64 = tcg_temp_new_i64();
1954     rm_64 = tcg_temp_new_i64();
1955     rd0 = tcg_temp_new_i32();
1956     rd1 = tcg_temp_new_i32();
1957
1958     read_neon_element64(rn_64, a->vn, 0, MO_64);
1959     read_neon_element64(rm_64, a->vm, 0, MO_64);
1960
1961     opfn(rn_64, rn_64, rm_64);
1962
1963     narrowfn(rd0, rn_64);
1964
1965     read_neon_element64(rn_64, a->vn, 1, MO_64);
1966     read_neon_element64(rm_64, a->vm, 1, MO_64);
1967
1968     opfn(rn_64, rn_64, rm_64);
1969
1970     narrowfn(rd1, rn_64);
1971
1972     write_neon_element32(rd0, a->vd, 0, MO_32);
1973     write_neon_element32(rd1, a->vd, 1, MO_32);
1974
1975     tcg_temp_free_i32(rd0);
1976     tcg_temp_free_i32(rd1);
1977     tcg_temp_free_i64(rn_64);
1978     tcg_temp_free_i64(rm_64);
1979
1980     return true;
1981 }
1982
1983 #define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
1984     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1985     {                                                                   \
1986         static NeonGenTwo64OpFn * const addfn[] = {                     \
1987             gen_helper_neon_##OP##l_u16,                                \
1988             gen_helper_neon_##OP##l_u32,                                \
1989             tcg_gen_##OP##_i64,                                         \
1990             NULL,                                                       \
1991         };                                                              \
1992         static NeonGenNarrowFn * const narrowfn[] = {                   \
1993             gen_helper_neon_##NARROWTYPE##_high_u8,                     \
1994             gen_helper_neon_##NARROWTYPE##_high_u16,                    \
1995             EXTOP,                                                      \
1996             NULL,                                                       \
1997         };                                                              \
1998         return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]);   \
1999     }
2000
2001 static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
2002 {
2003     tcg_gen_addi_i64(rn, rn, 1u << 31);
2004     tcg_gen_extrh_i64_i32(rd, rn);
2005 }
2006
2007 DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
2008 DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
2009 DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
2010 DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
2011
2012 static bool do_long_3d(DisasContext *s, arg_3diff *a,
2013                        NeonGenTwoOpWidenFn *opfn,
2014                        NeonGenTwo64OpFn *accfn)
2015 {
2016     /*
2017      * 3-regs different lengths, long operations.
2018      * These perform an operation on two inputs that returns a double-width
2019      * result, and then possibly perform an accumulation operation of
2020      * that result into the double-width destination.
2021      */
2022     TCGv_i64 rd0, rd1, tmp;
2023     TCGv_i32 rn, rm;
2024
2025     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2026         return false;
2027     }
2028
2029     /* UNDEF accesses to D16-D31 if they don't exist. */
2030     if (!dc_isar_feature(aa32_simd_r32, s) &&
2031         ((a->vd | a->vn | a->vm) & 0x10)) {
2032         return false;
2033     }
2034
2035     if (!opfn) {
2036         /* size == 3 case, which is an entirely different insn group */
2037         return false;
2038     }
2039
2040     if (a->vd & 1) {
2041         return false;
2042     }
2043
2044     if (!vfp_access_check(s)) {
2045         return true;
2046     }
2047
2048     rd0 = tcg_temp_new_i64();
2049     rd1 = tcg_temp_new_i64();
2050
2051     rn = tcg_temp_new_i32();
2052     rm = tcg_temp_new_i32();
2053     read_neon_element32(rn, a->vn, 0, MO_32);
2054     read_neon_element32(rm, a->vm, 0, MO_32);
2055     opfn(rd0, rn, rm);
2056
2057     read_neon_element32(rn, a->vn, 1, MO_32);
2058     read_neon_element32(rm, a->vm, 1, MO_32);
2059     opfn(rd1, rn, rm);
2060     tcg_temp_free_i32(rn);
2061     tcg_temp_free_i32(rm);
2062
2063     /* Don't store results until after all loads: they might overlap */
2064     if (accfn) {
2065         tmp = tcg_temp_new_i64();
2066         read_neon_element64(tmp, a->vd, 0, MO_64);
2067         accfn(rd0, tmp, rd0);
2068         read_neon_element64(tmp, a->vd, 1, MO_64);
2069         accfn(rd1, tmp, rd1);
2070         tcg_temp_free_i64(tmp);
2071     }
2072
2073     write_neon_element64(rd0, a->vd, 0, MO_64);
2074     write_neon_element64(rd1, a->vd, 1, MO_64);
2075     tcg_temp_free_i64(rd0);
2076     tcg_temp_free_i64(rd1);
2077
2078     return true;
2079 }
2080
2081 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2082 {
2083     static NeonGenTwoOpWidenFn * const opfn[] = {
2084         gen_helper_neon_abdl_s16,
2085         gen_helper_neon_abdl_s32,
2086         gen_helper_neon_abdl_s64,
2087         NULL,
2088     };
2089
2090     return do_long_3d(s, a, opfn[a->size], NULL);
2091 }
2092
2093 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2094 {
2095     static NeonGenTwoOpWidenFn * const opfn[] = {
2096         gen_helper_neon_abdl_u16,
2097         gen_helper_neon_abdl_u32,
2098         gen_helper_neon_abdl_u64,
2099         NULL,
2100     };
2101
2102     return do_long_3d(s, a, opfn[a->size], NULL);
2103 }
2104
2105 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2106 {
2107     static NeonGenTwoOpWidenFn * const opfn[] = {
2108         gen_helper_neon_abdl_s16,
2109         gen_helper_neon_abdl_s32,
2110         gen_helper_neon_abdl_s64,
2111         NULL,
2112     };
2113     static NeonGenTwo64OpFn * const addfn[] = {
2114         gen_helper_neon_addl_u16,
2115         gen_helper_neon_addl_u32,
2116         tcg_gen_add_i64,
2117         NULL,
2118     };
2119
2120     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2121 }
2122
2123 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2124 {
2125     static NeonGenTwoOpWidenFn * const opfn[] = {
2126         gen_helper_neon_abdl_u16,
2127         gen_helper_neon_abdl_u32,
2128         gen_helper_neon_abdl_u64,
2129         NULL,
2130     };
2131     static NeonGenTwo64OpFn * const addfn[] = {
2132         gen_helper_neon_addl_u16,
2133         gen_helper_neon_addl_u32,
2134         tcg_gen_add_i64,
2135         NULL,
2136     };
2137
2138     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2139 }
2140
2141 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2142 {
2143     TCGv_i32 lo = tcg_temp_new_i32();
2144     TCGv_i32 hi = tcg_temp_new_i32();
2145
2146     tcg_gen_muls2_i32(lo, hi, rn, rm);
2147     tcg_gen_concat_i32_i64(rd, lo, hi);
2148
2149     tcg_temp_free_i32(lo);
2150     tcg_temp_free_i32(hi);
2151 }
2152
2153 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2154 {
2155     TCGv_i32 lo = tcg_temp_new_i32();
2156     TCGv_i32 hi = tcg_temp_new_i32();
2157
2158     tcg_gen_mulu2_i32(lo, hi, rn, rm);
2159     tcg_gen_concat_i32_i64(rd, lo, hi);
2160
2161     tcg_temp_free_i32(lo);
2162     tcg_temp_free_i32(hi);
2163 }
2164
2165 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2166 {
2167     static NeonGenTwoOpWidenFn * const opfn[] = {
2168         gen_helper_neon_mull_s8,
2169         gen_helper_neon_mull_s16,
2170         gen_mull_s32,
2171         NULL,
2172     };
2173
2174     return do_long_3d(s, a, opfn[a->size], NULL);
2175 }
2176
2177 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2178 {
2179     static NeonGenTwoOpWidenFn * const opfn[] = {
2180         gen_helper_neon_mull_u8,
2181         gen_helper_neon_mull_u16,
2182         gen_mull_u32,
2183         NULL,
2184     };
2185
2186     return do_long_3d(s, a, opfn[a->size], NULL);
2187 }
2188
2189 #define DO_VMLAL(INSN,MULL,ACC)                                         \
2190     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2191     {                                                                   \
2192         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2193             gen_helper_neon_##MULL##8,                                  \
2194             gen_helper_neon_##MULL##16,                                 \
2195             gen_##MULL##32,                                             \
2196             NULL,                                                       \
2197         };                                                              \
2198         static NeonGenTwo64OpFn * const accfn[] = {                     \
2199             gen_helper_neon_##ACC##l_u16,                               \
2200             gen_helper_neon_##ACC##l_u32,                               \
2201             tcg_gen_##ACC##_i64,                                        \
2202             NULL,                                                       \
2203         };                                                              \
2204         return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
2205     }
2206
2207 DO_VMLAL(VMLAL_S,mull_s,add)
2208 DO_VMLAL(VMLAL_U,mull_u,add)
2209 DO_VMLAL(VMLSL_S,mull_s,sub)
2210 DO_VMLAL(VMLSL_U,mull_u,sub)
2211
/*
 * VQDMULL step for 16-bit elements: form the widening signed product
 * with gen_helper_neon_mull_s16, then double it by adding the product to
 * itself with the saturating add helper (which is passed cpu_env).
 */
static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
{
    gen_helper_neon_mull_s16(rd, rn, rm);
    gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
}
2217
/*
 * VQDMULL step for 32-bit elements: form the widening signed product
 * with gen_mull_s32, then double it by adding the product to itself with
 * the saturating add helper (which is passed cpu_env).
 */
static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
{
    gen_mull_s32(rd, rn, rm);
    gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
}
2223
2224 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2225 {
2226     static NeonGenTwoOpWidenFn * const opfn[] = {
2227         NULL,
2228         gen_VQDMULL_16,
2229         gen_VQDMULL_32,
2230         NULL,
2231     };
2232
2233     return do_long_3d(s, a, opfn[a->size], NULL);
2234 }
2235
/*
 * VQDMLAL accumulate step for the size == 1 table slot: saturating add
 * of @rn and @rm into @rd. This wrapper supplies cpu_env so that the
 * helper fits the NeonGenTwo64OpFn signature.
 */
static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
{
    gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
}
2240
/*
 * VQDMLAL accumulate step for the size == 2 table slot: saturating add
 * of @rn and @rm into @rd. This wrapper supplies cpu_env so that the
 * helper fits the NeonGenTwo64OpFn signature.
 */
static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
{
    gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
}
2245
2246 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2247 {
2248     static NeonGenTwoOpWidenFn * const opfn[] = {
2249         NULL,
2250         gen_VQDMULL_16,
2251         gen_VQDMULL_32,
2252         NULL,
2253     };
2254     static NeonGenTwo64OpFn * const accfn[] = {
2255         NULL,
2256         gen_VQDMLAL_acc_16,
2257         gen_VQDMLAL_acc_32,
2258         NULL,
2259     };
2260
2261     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2262 }
2263
/*
 * VQDMLSL accumulate step for the size == 1 table slot: negate @rm with
 * gen_helper_neon_negl_u32 and then do a saturating add of @rn and the
 * negated value into @rd. Note that @rm is overwritten with its negation.
 */
static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
{
    gen_helper_neon_negl_u32(rm, rm);
    gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
}
2269
/*
 * VQDMLSL accumulate step for the size == 2 table slot: negate the
 * 64-bit @rm and then do a saturating add of @rn and the negated value
 * into @rd. Note that @rm is overwritten with its negation.
 */
static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
{
    tcg_gen_neg_i64(rm, rm);
    gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
}
2275
2276 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2277 {
2278     static NeonGenTwoOpWidenFn * const opfn[] = {
2279         NULL,
2280         gen_VQDMULL_16,
2281         gen_VQDMULL_32,
2282         NULL,
2283     };
2284     static NeonGenTwo64OpFn * const accfn[] = {
2285         NULL,
2286         gen_VQDMLSL_acc_16,
2287         gen_VQDMLSL_acc_32,
2288         NULL,
2289     };
2290
2291     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2292 }
2293
2294 static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2295 {
2296     gen_helper_gvec_3 *fn_gvec;
2297
2298     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2299         return false;
2300     }
2301
2302     /* UNDEF accesses to D16-D31 if they don't exist. */
2303     if (!dc_isar_feature(aa32_simd_r32, s) &&
2304         ((a->vd | a->vn | a->vm) & 0x10)) {
2305         return false;
2306     }
2307
2308     if (a->vd & 1) {
2309         return false;
2310     }
2311
2312     switch (a->size) {
2313     case 0:
2314         fn_gvec = gen_helper_neon_pmull_h;
2315         break;
2316     case 2:
2317         if (!dc_isar_feature(aa32_pmull, s)) {
2318             return false;
2319         }
2320         fn_gvec = gen_helper_gvec_pmull_q;
2321         break;
2322     default:
2323         return false;
2324     }
2325
2326     if (!vfp_access_check(s)) {
2327         return true;
2328     }
2329
2330     tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd),
2331                        neon_full_reg_offset(a->vn),
2332                        neon_full_reg_offset(a->vm),
2333                        16, 16, 0, fn_gvec);
2334     return true;
2335 }
2336
2337 static void gen_neon_dup_low16(TCGv_i32 var)
2338 {
2339     TCGv_i32 tmp = tcg_temp_new_i32();
2340     tcg_gen_ext16u_i32(var, var);
2341     tcg_gen_shli_i32(tmp, var, 16);
2342     tcg_gen_or_i32(var, var, tmp);
2343     tcg_temp_free_i32(tmp);
2344 }
2345
2346 static void gen_neon_dup_high16(TCGv_i32 var)
2347 {
2348     TCGv_i32 tmp = tcg_temp_new_i32();
2349     tcg_gen_andi_i32(var, var, 0xffff0000);
2350     tcg_gen_shri_i32(tmp, var, 16);
2351     tcg_gen_or_i32(var, var, tmp);
2352     tcg_temp_free_i32(tmp);
2353 }
2354
2355 static inline TCGv_i32 neon_get_scalar(int size, int reg)
2356 {
2357     TCGv_i32 tmp = tcg_temp_new_i32();
2358     if (size == MO_16) {
2359         read_neon_element32(tmp, reg & 7, reg >> 4, MO_32);
2360         if (reg & 8) {
2361             gen_neon_dup_high16(tmp);
2362         } else {
2363             gen_neon_dup_low16(tmp);
2364         }
2365     } else {
2366         read_neon_element32(tmp, reg & 15, reg >> 4, MO_32);
2367     }
2368     return tmp;
2369 }
2370
2371 static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2372                        NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2373 {
2374     /*
2375      * Two registers and a scalar: perform an operation between
2376      * the input elements and the scalar, and then possibly
2377      * perform an accumulation operation of that result into the
2378      * destination.
2379      */
2380     TCGv_i32 scalar, tmp;
2381     int pass;
2382
2383     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2384         return false;
2385     }
2386
2387     /* UNDEF accesses to D16-D31 if they don't exist. */
2388     if (!dc_isar_feature(aa32_simd_r32, s) &&
2389         ((a->vd | a->vn | a->vm) & 0x10)) {
2390         return false;
2391     }
2392
2393     if (!opfn) {
2394         /* Bad size (including size == 3, which is a different insn group) */
2395         return false;
2396     }
2397
2398     if (a->q && ((a->vd | a->vn) & 1)) {
2399         return false;
2400     }
2401
2402     if (!vfp_access_check(s)) {
2403         return true;
2404     }
2405
2406     scalar = neon_get_scalar(a->size, a->vm);
2407     tmp = tcg_temp_new_i32();
2408
2409     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2410         read_neon_element32(tmp, a->vn, pass, MO_32);
2411         opfn(tmp, tmp, scalar);
2412         if (accfn) {
2413             TCGv_i32 rd = tcg_temp_new_i32();
2414             read_neon_element32(rd, a->vd, pass, MO_32);
2415             accfn(tmp, rd, tmp);
2416             tcg_temp_free_i32(rd);
2417         }
2418         write_neon_element32(tmp, a->vd, pass, MO_32);
2419     }
2420     tcg_temp_free_i32(tmp);
2421     tcg_temp_free_i32(scalar);
2422     return true;
2423 }
2424
2425 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2426 {
2427     static NeonGenTwoOpFn * const opfn[] = {
2428         NULL,
2429         gen_helper_neon_mul_u16,
2430         tcg_gen_mul_i32,
2431         NULL,
2432     };
2433
2434     return do_2scalar(s, a, opfn[a->size], NULL);
2435 }
2436
2437 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2438 {
2439     static NeonGenTwoOpFn * const opfn[] = {
2440         NULL,
2441         gen_helper_neon_mul_u16,
2442         tcg_gen_mul_i32,
2443         NULL,
2444     };
2445     static NeonGenTwoOpFn * const accfn[] = {
2446         NULL,
2447         gen_helper_neon_add_u16,
2448         tcg_gen_add_i32,
2449         NULL,
2450     };
2451
2452     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2453 }
2454
2455 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2456 {
2457     static NeonGenTwoOpFn * const opfn[] = {
2458         NULL,
2459         gen_helper_neon_mul_u16,
2460         tcg_gen_mul_i32,
2461         NULL,
2462     };
2463     static NeonGenTwoOpFn * const accfn[] = {
2464         NULL,
2465         gen_helper_neon_sub_u16,
2466         tcg_gen_sub_i32,
2467         NULL,
2468     };
2469
2470     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2471 }
2472
2473 static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
2474                               gen_helper_gvec_3_ptr *fn)
2475 {
2476     /* Two registers and a scalar, using gvec */
2477     int vec_size = a->q ? 16 : 8;
2478     int rd_ofs = neon_full_reg_offset(a->vd);
2479     int rn_ofs = neon_full_reg_offset(a->vn);
2480     int rm_ofs;
2481     int idx;
2482     TCGv_ptr fpstatus;
2483
2484     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2485         return false;
2486     }
2487
2488     /* UNDEF accesses to D16-D31 if they don't exist. */
2489     if (!dc_isar_feature(aa32_simd_r32, s) &&
2490         ((a->vd | a->vn | a->vm) & 0x10)) {
2491         return false;
2492     }
2493
2494     if (!fn) {
2495         /* Bad size (including size == 3, which is a different insn group) */
2496         return false;
2497     }
2498
2499     if (a->q && ((a->vd | a->vn) & 1)) {
2500         return false;
2501     }
2502
2503     if (!vfp_access_check(s)) {
2504         return true;
2505     }
2506
2507     /* a->vm is M:Vm, which encodes both register and index */
2508     idx = extract32(a->vm, a->size + 2, 2);
2509     a->vm = extract32(a->vm, 0, a->size + 2);
2510     rm_ofs = neon_full_reg_offset(a->vm);
2511
2512     fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
2513     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
2514                        vec_size, vec_size, idx, fn);
2515     tcg_temp_free_ptr(fpstatus);
2516     return true;
2517 }
2518
2519 #define DO_VMUL_F_2sc(NAME, FUNC)                                       \
2520     static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a)   \
2521     {                                                                   \
2522         static gen_helper_gvec_3_ptr * const opfn[] = {                 \
2523             NULL,                                                       \
2524             gen_helper_##FUNC##_h,                                      \
2525             gen_helper_##FUNC##_s,                                      \
2526             NULL,                                                       \
2527         };                                                              \
2528         if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
2529             return false;                                               \
2530         }                                                               \
2531         return do_2scalar_fp_vec(s, a, opfn[a->size]);                  \
2532     }
2533
2534 DO_VMUL_F_2sc(VMUL, gvec_fmul_idx)
2535 DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx)
2536 DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx)
2537
/*
 * NOTE(review): WRAP_ENV_FN is defined outside this excerpt. The
 * gen_VQ* names it produces are used below as NeonGenTwoOpFn table
 * entries, so it presumably wraps an env-taking helper in a function
 * without the env argument -- confirm against the macro definition.
 */
WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2542
2543 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2544 {
2545     static NeonGenTwoOpFn * const opfn[] = {
2546         NULL,
2547         gen_VQDMULH_16,
2548         gen_VQDMULH_32,
2549         NULL,
2550     };
2551
2552     return do_2scalar(s, a, opfn[a->size], NULL);
2553 }
2554
2555 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2556 {
2557     static NeonGenTwoOpFn * const opfn[] = {
2558         NULL,
2559         gen_VQRDMULH_16,
2560         gen_VQRDMULH_32,
2561         NULL,
2562     };
2563
2564     return do_2scalar(s, a, opfn[a->size], NULL);
2565 }
2566
2567 static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2568                             NeonGenThreeOpEnvFn *opfn)
2569 {
2570     /*
2571      * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2572      * performs a kind of fused op-then-accumulate using a helper
2573      * function that takes all of rd, rn and the scalar at once.
2574      */
2575     TCGv_i32 scalar, rn, rd;
2576     int pass;
2577
2578     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2579         return false;
2580     }
2581
2582     if (!dc_isar_feature(aa32_rdm, s)) {
2583         return false;
2584     }
2585
2586     /* UNDEF accesses to D16-D31 if they don't exist. */
2587     if (!dc_isar_feature(aa32_simd_r32, s) &&
2588         ((a->vd | a->vn | a->vm) & 0x10)) {
2589         return false;
2590     }
2591
2592     if (!opfn) {
2593         /* Bad size (including size == 3, which is a different insn group) */
2594         return false;
2595     }
2596
2597     if (a->q && ((a->vd | a->vn) & 1)) {
2598         return false;
2599     }
2600
2601     if (!vfp_access_check(s)) {
2602         return true;
2603     }
2604
2605     scalar = neon_get_scalar(a->size, a->vm);
2606     rn = tcg_temp_new_i32();
2607     rd = tcg_temp_new_i32();
2608
2609     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2610         read_neon_element32(rn, a->vn, pass, MO_32);
2611         read_neon_element32(rd, a->vd, pass, MO_32);
2612         opfn(rd, cpu_env, rn, scalar, rd);
2613         write_neon_element32(rd, a->vd, pass, MO_32);
2614     }
2615     tcg_temp_free_i32(rn);
2616     tcg_temp_free_i32(rd);
2617     tcg_temp_free_i32(scalar);
2618
2619     return true;
2620 }
2621
2622 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2623 {
2624     static NeonGenThreeOpEnvFn *opfn[] = {
2625         NULL,
2626         gen_helper_neon_qrdmlah_s16,
2627         gen_helper_neon_qrdmlah_s32,
2628         NULL,
2629     };
2630     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2631 }
2632
2633 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2634 {
2635     static NeonGenThreeOpEnvFn *opfn[] = {
2636         NULL,
2637         gen_helper_neon_qrdmlsh_s16,
2638         gen_helper_neon_qrdmlsh_s32,
2639         NULL,
2640     };
2641     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2642 }
2643
2644 static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2645                             NeonGenTwoOpWidenFn *opfn,
2646                             NeonGenTwo64OpFn *accfn)
2647 {
2648     /*
2649      * Two registers and a scalar, long operations: perform an
2650      * operation on the input elements and the scalar which produces
2651      * a double-width result, and then possibly perform an accumulation
2652      * operation of that result into the destination.
2653      */
2654     TCGv_i32 scalar, rn;
2655     TCGv_i64 rn0_64, rn1_64;
2656
2657     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2658         return false;
2659     }
2660
2661     /* UNDEF accesses to D16-D31 if they don't exist. */
2662     if (!dc_isar_feature(aa32_simd_r32, s) &&
2663         ((a->vd | a->vn | a->vm) & 0x10)) {
2664         return false;
2665     }
2666
2667     if (!opfn) {
2668         /* Bad size (including size == 3, which is a different insn group) */
2669         return false;
2670     }
2671
2672     if (a->vd & 1) {
2673         return false;
2674     }
2675
2676     if (!vfp_access_check(s)) {
2677         return true;
2678     }
2679
2680     scalar = neon_get_scalar(a->size, a->vm);
2681
2682     /* Load all inputs before writing any outputs, in case of overlap */
2683     rn = tcg_temp_new_i32();
2684     read_neon_element32(rn, a->vn, 0, MO_32);
2685     rn0_64 = tcg_temp_new_i64();
2686     opfn(rn0_64, rn, scalar);
2687
2688     read_neon_element32(rn, a->vn, 1, MO_32);
2689     rn1_64 = tcg_temp_new_i64();
2690     opfn(rn1_64, rn, scalar);
2691     tcg_temp_free_i32(rn);
2692     tcg_temp_free_i32(scalar);
2693
2694     if (accfn) {
2695         TCGv_i64 t64 = tcg_temp_new_i64();
2696         read_neon_element64(t64, a->vd, 0, MO_64);
2697         accfn(rn0_64, t64, rn0_64);
2698         read_neon_element64(t64, a->vd, 1, MO_64);
2699         accfn(rn1_64, t64, rn1_64);
2700         tcg_temp_free_i64(t64);
2701     }
2702
2703     write_neon_element64(rn0_64, a->vd, 0, MO_64);
2704     write_neon_element64(rn1_64, a->vd, 1, MO_64);
2705     tcg_temp_free_i64(rn0_64);
2706     tcg_temp_free_i64(rn1_64);
2707     return true;
2708 }
2709
2710 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2711 {
2712     static NeonGenTwoOpWidenFn * const opfn[] = {
2713         NULL,
2714         gen_helper_neon_mull_s16,
2715         gen_mull_s32,
2716         NULL,
2717     };
2718
2719     return do_2scalar_long(s, a, opfn[a->size], NULL);
2720 }
2721
2722 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2723 {
2724     static NeonGenTwoOpWidenFn * const opfn[] = {
2725         NULL,
2726         gen_helper_neon_mull_u16,
2727         gen_mull_u32,
2728         NULL,
2729     };
2730
2731     return do_2scalar_long(s, a, opfn[a->size], NULL);
2732 }
2733
/*
 * Define trans_<INSN>_2sc for a widening multiply-accumulate by scalar.
 *
 * MULL is the name stem of the widening multiply: size 1 uses
 * gen_helper_neon_<MULL>16 and size 2 uses gen_<MULL>32.
 * ACC is the name stem of the accumulate step: size 1 uses
 * gen_helper_neon_<ACC>l_u32 and size 2 uses tcg_gen_<ACC>_i64.
 * Sizes 0 and 3 have NULL entries; do_2scalar_long returns false
 * when it is handed a NULL opfn.
 */
#define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
    static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
    {                                                                   \
        static NeonGenTwoOpWidenFn * const opfn[] = {                   \
            NULL,                                                       \
            gen_helper_neon_##MULL##16,                                 \
            gen_##MULL##32,                                             \
            NULL,                                                       \
        };                                                              \
        static NeonGenTwo64OpFn * const accfn[] = {                     \
            NULL,                                                       \
            gen_helper_neon_##ACC##l_u32,                               \
            tcg_gen_##ACC##_i64,                                        \
            NULL,                                                       \
        };                                                              \
        return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
    }

/*
 * The signed and unsigned variants differ only in the multiply; the
 * add/sub accumulate functions are the same for both.
 */
DO_VMLAL_2SC(VMLAL_S, mull_s, add)
DO_VMLAL_2SC(VMLAL_U, mull_u, add)
DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2756
2757 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2758 {
2759     static NeonGenTwoOpWidenFn * const opfn[] = {
2760         NULL,
2761         gen_VQDMULL_16,
2762         gen_VQDMULL_32,
2763         NULL,
2764     };
2765
2766     return do_2scalar_long(s, a, opfn[a->size], NULL);
2767 }
2768
2769 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2770 {
2771     static NeonGenTwoOpWidenFn * const opfn[] = {
2772         NULL,
2773         gen_VQDMULL_16,
2774         gen_VQDMULL_32,
2775         NULL,
2776     };
2777     static NeonGenTwo64OpFn * const accfn[] = {
2778         NULL,
2779         gen_VQDMLAL_acc_16,
2780         gen_VQDMLAL_acc_32,
2781         NULL,
2782     };
2783
2784     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2785 }
2786
2787 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2788 {
2789     static NeonGenTwoOpWidenFn * const opfn[] = {
2790         NULL,
2791         gen_VQDMULL_16,
2792         gen_VQDMULL_32,
2793         NULL,
2794     };
2795     static NeonGenTwo64OpFn * const accfn[] = {
2796         NULL,
2797         gen_VQDMLSL_acc_16,
2798         gen_VQDMLSL_acc_32,
2799         NULL,
2800     };
2801
2802     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2803 }
2804
2805 static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2806 {
2807     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2808         return false;
2809     }
2810
2811     /* UNDEF accesses to D16-D31 if they don't exist. */
2812     if (!dc_isar_feature(aa32_simd_r32, s) &&
2813         ((a->vd | a->vn | a->vm) & 0x10)) {
2814         return false;
2815     }
2816
2817     if ((a->vn | a->vm | a->vd) & a->q) {
2818         return false;
2819     }
2820
2821     if (a->imm > 7 && !a->q) {
2822         return false;
2823     }
2824
2825     if (!vfp_access_check(s)) {
2826         return true;
2827     }
2828
2829     if (!a->q) {
2830         /* Extract 64 bits from <Vm:Vn> */
2831         TCGv_i64 left, right, dest;
2832
2833         left = tcg_temp_new_i64();
2834         right = tcg_temp_new_i64();
2835         dest = tcg_temp_new_i64();
2836
2837         read_neon_element64(right, a->vn, 0, MO_64);
2838         read_neon_element64(left, a->vm, 0, MO_64);
2839         tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2840         write_neon_element64(dest, a->vd, 0, MO_64);
2841
2842         tcg_temp_free_i64(left);
2843         tcg_temp_free_i64(right);
2844         tcg_temp_free_i64(dest);
2845     } else {
2846         /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2847         TCGv_i64 left, middle, right, destleft, destright;
2848
2849         left = tcg_temp_new_i64();
2850         middle = tcg_temp_new_i64();
2851         right = tcg_temp_new_i64();
2852         destleft = tcg_temp_new_i64();
2853         destright = tcg_temp_new_i64();
2854
2855         if (a->imm < 8) {
2856             read_neon_element64(right, a->vn, 0, MO_64);
2857             read_neon_element64(middle, a->vn, 1, MO_64);
2858             tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2859             read_neon_element64(left, a->vm, 0, MO_64);
2860             tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2861         } else {
2862             read_neon_element64(right, a->vn, 1, MO_64);
2863             read_neon_element64(middle, a->vm, 0, MO_64);
2864             tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2865             read_neon_element64(left, a->vm, 1, MO_64);
2866             tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2867         }
2868
2869         write_neon_element64(destright, a->vd, 0, MO_64);
2870         write_neon_element64(destleft, a->vd, 1, MO_64);
2871
2872         tcg_temp_free_i64(destright);
2873         tcg_temp_free_i64(destleft);
2874         tcg_temp_free_i64(right);
2875         tcg_temp_free_i64(middle);
2876         tcg_temp_free_i64(left);
2877     }
2878     return true;
2879 }
2880
2881 static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2882 {
2883     TCGv_i64 val, def;
2884     TCGv_i32 desc;
2885
2886     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2887         return false;
2888     }
2889
2890     /* UNDEF accesses to D16-D31 if they don't exist. */
2891     if (!dc_isar_feature(aa32_simd_r32, s) &&
2892         ((a->vd | a->vn | a->vm) & 0x10)) {
2893         return false;
2894     }
2895
2896     if ((a->vn + a->len + 1) > 32) {
2897         /*
2898          * This is UNPREDICTABLE; we choose to UNDEF to avoid the
2899          * helper function running off the end of the register file.
2900          */
2901         return false;
2902     }
2903
2904     if (!vfp_access_check(s)) {
2905         return true;
2906     }
2907
2908     desc = tcg_constant_i32((a->vn << 2) | a->len);
2909     def = tcg_temp_new_i64();
2910     if (a->op) {
2911         read_neon_element64(def, a->vd, 0, MO_64);
2912     } else {
2913         tcg_gen_movi_i64(def, 0);
2914     }
2915     val = tcg_temp_new_i64();
2916     read_neon_element64(val, a->vm, 0, MO_64);
2917
2918     gen_helper_neon_tbl(val, cpu_env, desc, val, def);
2919     write_neon_element64(val, a->vd, 0, MO_64);
2920
2921     tcg_temp_free_i64(def);
2922     tcg_temp_free_i64(val);
2923     return true;
2924 }
2925
2926 static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
2927 {
2928     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2929         return false;
2930     }
2931
2932     /* UNDEF accesses to D16-D31 if they don't exist. */
2933     if (!dc_isar_feature(aa32_simd_r32, s) &&
2934         ((a->vd | a->vm) & 0x10)) {
2935         return false;
2936     }
2937
2938     if (a->vd & a->q) {
2939         return false;
2940     }
2941
2942     if (!vfp_access_check(s)) {
2943         return true;
2944     }
2945
2946     tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd),
2947                          neon_element_offset(a->vm, a->index, a->size),
2948                          a->q ? 16 : 8, a->q ? 16 : 8);
2949     return true;
2950 }
2951
2952 static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
2953 {
2954     int pass, half;
2955     TCGv_i32 tmp[2];
2956
2957     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2958         return false;
2959     }
2960
2961     /* UNDEF accesses to D16-D31 if they don't exist. */
2962     if (!dc_isar_feature(aa32_simd_r32, s) &&
2963         ((a->vd | a->vm) & 0x10)) {
2964         return false;
2965     }
2966
2967     if ((a->vd | a->vm) & a->q) {
2968         return false;
2969     }
2970
2971     if (a->size == 3) {
2972         return false;
2973     }
2974
2975     if (!vfp_access_check(s)) {
2976         return true;
2977     }
2978
2979     tmp[0] = tcg_temp_new_i32();
2980     tmp[1] = tcg_temp_new_i32();
2981
2982     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
2983         for (half = 0; half < 2; half++) {
2984             read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32);
2985             switch (a->size) {
2986             case 0:
2987                 tcg_gen_bswap32_i32(tmp[half], tmp[half]);
2988                 break;
2989             case 1:
2990                 gen_swap_half(tmp[half], tmp[half]);
2991                 break;
2992             case 2:
2993                 break;
2994             default:
2995                 g_assert_not_reached();
2996             }
2997         }
2998         write_neon_element32(tmp[1], a->vd, pass * 2, MO_32);
2999         write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32);
3000     }
3001
3002     tcg_temp_free_i32(tmp[0]);
3003     tcg_temp_free_i32(tmp[1]);
3004     return true;
3005 }
3006
3007 static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
3008                               NeonGenWidenFn *widenfn,
3009                               NeonGenTwo64OpFn *opfn,
3010                               NeonGenTwo64OpFn *accfn)
3011 {
3012     /*
3013      * Pairwise long operations: widen both halves of the pair,
3014      * combine the pairs with the opfn, and then possibly accumulate
3015      * into the destination with the accfn.
3016      */
3017     int pass;
3018
3019     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3020         return false;
3021     }
3022
3023     /* UNDEF accesses to D16-D31 if they don't exist. */
3024     if (!dc_isar_feature(aa32_simd_r32, s) &&
3025         ((a->vd | a->vm) & 0x10)) {
3026         return false;
3027     }
3028
3029     if ((a->vd | a->vm) & a->q) {
3030         return false;
3031     }
3032
3033     if (!widenfn) {
3034         return false;
3035     }
3036
3037     if (!vfp_access_check(s)) {
3038         return true;
3039     }
3040
3041     for (pass = 0; pass < a->q + 1; pass++) {
3042         TCGv_i32 tmp;
3043         TCGv_i64 rm0_64, rm1_64, rd_64;
3044
3045         rm0_64 = tcg_temp_new_i64();
3046         rm1_64 = tcg_temp_new_i64();
3047         rd_64 = tcg_temp_new_i64();
3048
3049         tmp = tcg_temp_new_i32();
3050         read_neon_element32(tmp, a->vm, pass * 2, MO_32);
3051         widenfn(rm0_64, tmp);
3052         read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32);
3053         widenfn(rm1_64, tmp);
3054         tcg_temp_free_i32(tmp);
3055
3056         opfn(rd_64, rm0_64, rm1_64);
3057         tcg_temp_free_i64(rm0_64);
3058         tcg_temp_free_i64(rm1_64);
3059
3060         if (accfn) {
3061             TCGv_i64 tmp64 = tcg_temp_new_i64();
3062             read_neon_element64(tmp64, a->vd, pass, MO_64);
3063             accfn(rd_64, tmp64, rd_64);
3064             tcg_temp_free_i64(tmp64);
3065         }
3066         write_neon_element64(rd_64, a->vd, pass, MO_64);
3067         tcg_temp_free_i64(rd_64);
3068     }
3069     return true;
3070 }
3071
3072 static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
3073 {
3074     static NeonGenWidenFn * const widenfn[] = {
3075         gen_helper_neon_widen_s8,
3076         gen_helper_neon_widen_s16,
3077         tcg_gen_ext_i32_i64,
3078         NULL,
3079     };
3080     static NeonGenTwo64OpFn * const opfn[] = {
3081         gen_helper_neon_paddl_u16,
3082         gen_helper_neon_paddl_u32,
3083         tcg_gen_add_i64,
3084         NULL,
3085     };
3086
3087     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3088 }
3089
3090 static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
3091 {
3092     static NeonGenWidenFn * const widenfn[] = {
3093         gen_helper_neon_widen_u8,
3094         gen_helper_neon_widen_u16,
3095         tcg_gen_extu_i32_i64,
3096         NULL,
3097     };
3098     static NeonGenTwo64OpFn * const opfn[] = {
3099         gen_helper_neon_paddl_u16,
3100         gen_helper_neon_paddl_u32,
3101         tcg_gen_add_i64,
3102         NULL,
3103     };
3104
3105     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3106 }
3107
3108 static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
3109 {
3110     static NeonGenWidenFn * const widenfn[] = {
3111         gen_helper_neon_widen_s8,
3112         gen_helper_neon_widen_s16,
3113         tcg_gen_ext_i32_i64,
3114         NULL,
3115     };
3116     static NeonGenTwo64OpFn * const opfn[] = {
3117         gen_helper_neon_paddl_u16,
3118         gen_helper_neon_paddl_u32,
3119         tcg_gen_add_i64,
3120         NULL,
3121     };
3122     static NeonGenTwo64OpFn * const accfn[] = {
3123         gen_helper_neon_addl_u16,
3124         gen_helper_neon_addl_u32,
3125         tcg_gen_add_i64,
3126         NULL,
3127     };
3128
3129     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3130                              accfn[a->size]);
3131 }
3132
3133 static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
3134 {
3135     static NeonGenWidenFn * const widenfn[] = {
3136         gen_helper_neon_widen_u8,
3137         gen_helper_neon_widen_u16,
3138         tcg_gen_extu_i32_i64,
3139         NULL,
3140     };
3141     static NeonGenTwo64OpFn * const opfn[] = {
3142         gen_helper_neon_paddl_u16,
3143         gen_helper_neon_paddl_u32,
3144         tcg_gen_add_i64,
3145         NULL,
3146     };
3147     static NeonGenTwo64OpFn * const accfn[] = {
3148         gen_helper_neon_addl_u16,
3149         gen_helper_neon_addl_u32,
3150         tcg_gen_add_i64,
3151         NULL,
3152     };
3153
3154     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3155                              accfn[a->size]);
3156 }
3157
3158 typedef void ZipFn(TCGv_ptr, TCGv_ptr);
3159
3160 static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
3161                        ZipFn *fn)
3162 {
3163     TCGv_ptr pd, pm;
3164
3165     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3166         return false;
3167     }
3168
3169     /* UNDEF accesses to D16-D31 if they don't exist. */
3170     if (!dc_isar_feature(aa32_simd_r32, s) &&
3171         ((a->vd | a->vm) & 0x10)) {
3172         return false;
3173     }
3174
3175     if ((a->vd | a->vm) & a->q) {
3176         return false;
3177     }
3178
3179     if (!fn) {
3180         /* Bad size or size/q combination */
3181         return false;
3182     }
3183
3184     if (!vfp_access_check(s)) {
3185         return true;
3186     }
3187
3188     pd = vfp_reg_ptr(true, a->vd);
3189     pm = vfp_reg_ptr(true, a->vm);
3190     fn(pd, pm);
3191     tcg_temp_free_ptr(pd);
3192     tcg_temp_free_ptr(pm);
3193     return true;
3194 }
3195
3196 static bool trans_VUZP(DisasContext *s, arg_2misc *a)
3197 {
3198     static ZipFn * const fn[2][4] = {
3199         {
3200             gen_helper_neon_unzip8,
3201             gen_helper_neon_unzip16,
3202             NULL,
3203             NULL,
3204         }, {
3205             gen_helper_neon_qunzip8,
3206             gen_helper_neon_qunzip16,
3207             gen_helper_neon_qunzip32,
3208             NULL,
3209         }
3210     };
3211     return do_zip_uzp(s, a, fn[a->q][a->size]);
3212 }
3213
3214 static bool trans_VZIP(DisasContext *s, arg_2misc *a)
3215 {
3216     static ZipFn * const fn[2][4] = {
3217         {
3218             gen_helper_neon_zip8,
3219             gen_helper_neon_zip16,
3220             NULL,
3221             NULL,
3222         }, {
3223             gen_helper_neon_qzip8,
3224             gen_helper_neon_qzip16,
3225             gen_helper_neon_qzip32,
3226             NULL,
3227         }
3228     };
3229     return do_zip_uzp(s, a, fn[a->q][a->size]);
3230 }
3231
3232 static bool do_vmovn(DisasContext *s, arg_2misc *a,
3233                      NeonGenNarrowEnvFn *narrowfn)
3234 {
3235     TCGv_i64 rm;
3236     TCGv_i32 rd0, rd1;
3237
3238     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3239         return false;
3240     }
3241
3242     /* UNDEF accesses to D16-D31 if they don't exist. */
3243     if (!dc_isar_feature(aa32_simd_r32, s) &&
3244         ((a->vd | a->vm) & 0x10)) {
3245         return false;
3246     }
3247
3248     if (a->vm & 1) {
3249         return false;
3250     }
3251
3252     if (!narrowfn) {
3253         return false;
3254     }
3255
3256     if (!vfp_access_check(s)) {
3257         return true;
3258     }
3259
3260     rm = tcg_temp_new_i64();
3261     rd0 = tcg_temp_new_i32();
3262     rd1 = tcg_temp_new_i32();
3263
3264     read_neon_element64(rm, a->vm, 0, MO_64);
3265     narrowfn(rd0, cpu_env, rm);
3266     read_neon_element64(rm, a->vm, 1, MO_64);
3267     narrowfn(rd1, cpu_env, rm);
3268     write_neon_element32(rd0, a->vd, 0, MO_32);
3269     write_neon_element32(rd1, a->vd, 1, MO_32);
3270     tcg_temp_free_i32(rd0);
3271     tcg_temp_free_i32(rd1);
3272     tcg_temp_free_i64(rm);
3273     return true;
3274 }
3275
/*
 * Define trans_<INSN> for a narrowing move.  FUNC is a name stem: 8, 16
 * or 32 is appended to it to form the narrowing function for size 0, 1
 * or 2 respectively.  Size 3 has a NULL entry; do_vmovn returns false
 * when it is handed a NULL narrowfn.
 */
#define DO_VMOVN(INSN, FUNC)                                    \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        static NeonGenNarrowEnvFn * const narrowfn[] = {        \
            FUNC##8,                                            \
            FUNC##16,                                           \
            FUNC##32,                                           \
            NULL,                                               \
        };                                                      \
        return do_vmovn(s, a, narrowfn[a->size]);               \
    }

/* Plain, saturating-to-unsigned, and signed/unsigned saturating forms */
DO_VMOVN(VMOVN, gen_neon_narrow_u)
DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)
3292
3293 static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
3294 {
3295     TCGv_i32 rm0, rm1;
3296     TCGv_i64 rd;
3297     static NeonGenWidenFn * const widenfns[] = {
3298         gen_helper_neon_widen_u8,
3299         gen_helper_neon_widen_u16,
3300         tcg_gen_extu_i32_i64,
3301         NULL,
3302     };
3303     NeonGenWidenFn *widenfn = widenfns[a->size];
3304
3305     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3306         return false;
3307     }
3308
3309     /* UNDEF accesses to D16-D31 if they don't exist. */
3310     if (!dc_isar_feature(aa32_simd_r32, s) &&
3311         ((a->vd | a->vm) & 0x10)) {
3312         return false;
3313     }
3314
3315     if (a->vd & 1) {
3316         return false;
3317     }
3318
3319     if (!widenfn) {
3320         return false;
3321     }
3322
3323     if (!vfp_access_check(s)) {
3324         return true;
3325     }
3326
3327     rd = tcg_temp_new_i64();
3328     rm0 = tcg_temp_new_i32();
3329     rm1 = tcg_temp_new_i32();
3330
3331     read_neon_element32(rm0, a->vm, 0, MO_32);
3332     read_neon_element32(rm1, a->vm, 1, MO_32);
3333
3334     widenfn(rd, rm0);
3335     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3336     write_neon_element64(rd, a->vd, 0, MO_64);
3337     widenfn(rd, rm1);
3338     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3339     write_neon_element64(rd, a->vd, 1, MO_64);
3340
3341     tcg_temp_free_i64(rd);
3342     tcg_temp_free_i32(rm0);
3343     tcg_temp_free_i32(rm1);
3344     return true;
3345 }
3346
3347 static bool trans_VCVT_B16_F32(DisasContext *s, arg_2misc *a)
3348 {
3349     TCGv_ptr fpst;
3350     TCGv_i64 tmp;
3351     TCGv_i32 dst0, dst1;
3352
3353     if (!dc_isar_feature(aa32_bf16, s)) {
3354         return false;
3355     }
3356
3357     /* UNDEF accesses to D16-D31 if they don't exist. */
3358     if (!dc_isar_feature(aa32_simd_r32, s) &&
3359         ((a->vd | a->vm) & 0x10)) {
3360         return false;
3361     }
3362
3363     if ((a->vm & 1) || (a->size != 1)) {
3364         return false;
3365     }
3366
3367     if (!vfp_access_check(s)) {
3368         return true;
3369     }
3370
3371     fpst = fpstatus_ptr(FPST_STD);
3372     tmp = tcg_temp_new_i64();
3373     dst0 = tcg_temp_new_i32();
3374     dst1 = tcg_temp_new_i32();
3375
3376     read_neon_element64(tmp, a->vm, 0, MO_64);
3377     gen_helper_bfcvt_pair(dst0, tmp, fpst);
3378
3379     read_neon_element64(tmp, a->vm, 1, MO_64);
3380     gen_helper_bfcvt_pair(dst1, tmp, fpst);
3381
3382     write_neon_element32(dst0, a->vd, 0, MO_32);
3383     write_neon_element32(dst1, a->vd, 1, MO_32);
3384
3385     tcg_temp_free_i64(tmp);
3386     tcg_temp_free_i32(dst0);
3387     tcg_temp_free_i32(dst1);
3388     tcg_temp_free_ptr(fpst);
3389     return true;
3390 }
3391
/*
 * VCVT (F32 to F16): convert the four 32-bit elements starting at Vm
 * with the f32_to_f16 helper and pack the results pairwise into the
 * two 32-bit words of Vd (even-numbered source element in the low
 * 16 bits, odd-numbered one in the high 16 bits).
 *
 * Returns false if any of the decode checks fails, true otherwise.
 */
static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
{
    TCGv_ptr fpst;
    TCGv_i32 ahp, tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
        !dc_isar_feature(aa32_fp16_spconv, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* Vm must have bit 0 clear and the size field must be 1. */
    if ((a->vm & 1) || (a->size != 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fpst = fpstatus_ptr(FPST_STD);
    ahp = get_ahp_flag();
    /* Source elements 0 and 1 -> tmp2 = (cvt(elt1) << 16) | cvt(elt0) */
    tmp = tcg_temp_new_i32();
    read_neon_element32(tmp, a->vm, 0, MO_32);
    gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
    tmp2 = tcg_temp_new_i32();
    read_neon_element32(tmp2, a->vm, 1, MO_32);
    gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
    tcg_gen_shli_i32(tmp2, tmp2, 16);
    tcg_gen_or_i32(tmp2, tmp2, tmp);
    /* tmp is now free to be reused for source element 2 */
    read_neon_element32(tmp, a->vm, 2, MO_32);
    gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
    tmp3 = tcg_temp_new_i32();
    /*
     * The last source element is read before the first write to Vd, so
     * every input has been loaded by the time any output is stored.
     */
    read_neon_element32(tmp3, a->vm, 3, MO_32);
    write_neon_element32(tmp2, a->vd, 0, MO_32);
    tcg_temp_free_i32(tmp2);
    /* Source elements 2 and 3 -> tmp3 = (cvt(elt3) << 16) | cvt(elt2) */
    gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
    tcg_gen_shli_i32(tmp3, tmp3, 16);
    tcg_gen_or_i32(tmp3, tmp3, tmp);
    write_neon_element32(tmp3, a->vd, 1, MO_32);
    tcg_temp_free_i32(tmp3);
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(ahp);
    tcg_temp_free_ptr(fpst);

    return true;
}
3443
/*
 * VCVT (F16 to F32): split each of the two 32-bit words of Vm into its
 * low and high 16 bits, convert each half with the f16_to_f32 helper,
 * and store the four results as 32-bit elements 0..3 starting at Vd.
 *
 * Returns false if any of the decode checks fails, true otherwise.
 */
static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
{
    TCGv_ptr fpst;
    TCGv_i32 ahp, tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
        !dc_isar_feature(aa32_fp16_spconv, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* Vd must have bit 0 clear and the size field must be 1. */
    if ((a->vd & 1) || (a->size != 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fpst = fpstatus_ptr(FPST_STD);
    ahp = get_ahp_flag();
    tmp3 = tcg_temp_new_i32();
    tmp2 = tcg_temp_new_i32();
    tmp = tcg_temp_new_i32();
    /* Read both source words up front, before any write to Vd. */
    read_neon_element32(tmp, a->vm, 0, MO_32);
    read_neon_element32(tmp2, a->vm, 1, MO_32);
    /* Low half of source word 0 -> destination element 0 */
    tcg_gen_ext16u_i32(tmp3, tmp);
    gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
    write_neon_element32(tmp3, a->vd, 0, MO_32);
    /* High half of source word 0 -> destination element 1 */
    tcg_gen_shri_i32(tmp, tmp, 16);
    gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
    write_neon_element32(tmp, a->vd, 1, MO_32);
    tcg_temp_free_i32(tmp);
    /* Low half of source word 1 -> destination element 2 */
    tcg_gen_ext16u_i32(tmp3, tmp2);
    gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
    write_neon_element32(tmp3, a->vd, 2, MO_32);
    tcg_temp_free_i32(tmp3);
    /* High half of source word 1 -> destination element 3 */
    tcg_gen_shri_i32(tmp2, tmp2, 16);
    gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
    write_neon_element32(tmp2, a->vd, 3, MO_32);
    tcg_temp_free_i32(tmp2);
    tcg_temp_free_i32(ahp);
    tcg_temp_free_ptr(fpst);

    return true;
}
3495
3496 static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
3497 {
3498     int vec_size = a->q ? 16 : 8;
3499     int rd_ofs = neon_full_reg_offset(a->vd);
3500     int rm_ofs = neon_full_reg_offset(a->vm);
3501
3502     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3503         return false;
3504     }
3505
3506     /* UNDEF accesses to D16-D31 if they don't exist. */
3507     if (!dc_isar_feature(aa32_simd_r32, s) &&
3508         ((a->vd | a->vm) & 0x10)) {
3509         return false;
3510     }
3511
3512     if (a->size == 3) {
3513         return false;
3514     }
3515
3516     if ((a->vd | a->vm) & a->q) {
3517         return false;
3518     }
3519
3520     if (!vfp_access_check(s)) {
3521         return true;
3522     }
3523
3524     fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);
3525
3526     return true;
3527 }
3528
/*
 * Define trans_<INSN> as a thin wrapper that passes the gvec expander
 * FN to do_2misc_vec, which performs all the decode checks.
 */
#define DO_2MISC_VEC(INSN, FN)                                  \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        return do_2misc_vec(s, a, FN);                          \
    }

/* Negate, absolute value, and the five compare-against-zero insns */
DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
DO_2MISC_VEC(VCLT0, gen_gvec_clt0)
3542
3543 static bool trans_VMVN(DisasContext *s, arg_2misc *a)
3544 {
3545     if (a->size != 0) {
3546         return false;
3547     }
3548     return do_2misc_vec(s, a, tcg_gen_gvec_not);
3549 }
3550
/*
 * Define WRAPNAME with the (vece, rd_ofs, rm_ofs, oprsz, maxsz)
 * argument list that do_2misc_vec uses to call its fn, forwarding to
 * tcg_gen_gvec_3_ool().  rd_ofs is passed as both the first and the
 * second offset, rm_ofs as the third; vece is ignored.  DATA and FUNC
 * are passed through unchanged.
 */
#define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA)                          \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rm_ofs, uint32_t oprsz,               \
                         uint32_t maxsz)                                \
    {                                                                   \
        tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz,        \
                           DATA, FUNC);                                 \
    }

/*
 * As WRAP_2M_3_OOL_FN, but forwarding to tcg_gen_gvec_2_ool() with
 * rd_ofs and rm_ofs each passed once; vece is ignored.
 */
#define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA)                          \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rm_ofs, uint32_t oprsz,               \
                         uint32_t maxsz)                                \
    {                                                                   \
        tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC);   \
    }

/*
 * AESE/AESD share one helper and differ only in DATA (0 vs 1), as do
 * AESMC/AESIMC.
 * NOTE(review): presumably DATA == 1 selects the decrypt/inverse
 * variant inside the helper - confirm against the helper code.
 */
WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1)
WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1)
WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)
3575
/*
 * Define trans_<INSN> for a crypto 2-reg-misc instruction. The instruction
 * is accepted only when the size field equals SIZE and the ISA feature
 * FEATURE is present; otherwise the function returns false. Accepted
 * encodings are expanded by do_2misc_vec() using gen_<INSN>.
 */
#define DO_2M_CRYPTO(INSN, FEATURE, SIZE)                       \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        if (a->size == (SIZE) && dc_isar_feature(FEATURE, s)) { \
            return do_2misc_vec(s, a, gen_##INSN);              \
        }                                                       \
        return false;                                           \
    }
3584
/*
 * trans_* functions for the crypto 2-reg-misc instructions.
 * The AES group is gated on aa32_aes and requires size == 0; the SHA1
 * instructions are gated on aa32_sha1 and SHA256SU0 on aa32_sha2, all
 * three requiring size == 2.
 */
DO_2M_CRYPTO(AESE, aa32_aes, 0)
DO_2M_CRYPTO(AESD, aa32_aes, 0)
DO_2M_CRYPTO(AESMC, aa32_aes, 0)
DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)
3592
3593 static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
3594 {
3595     TCGv_i32 tmp;
3596     int pass;
3597
3598     /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
3599     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3600         return false;
3601     }
3602
3603     /* UNDEF accesses to D16-D31 if they don't exist. */
3604     if (!dc_isar_feature(aa32_simd_r32, s) &&
3605         ((a->vd | a->vm) & 0x10)) {
3606         return false;
3607     }
3608
3609     if (!fn) {
3610         return false;
3611     }
3612
3613     if ((a->vd | a->vm) & a->q) {
3614         return false;
3615     }
3616
3617     if (!vfp_access_check(s)) {
3618         return true;
3619     }
3620
3621     tmp = tcg_temp_new_i32();
3622     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3623         read_neon_element32(tmp, a->vm, pass, MO_32);
3624         fn(tmp, tmp);
3625         write_neon_element32(tmp, a->vd, pass, MO_32);
3626     }
3627     tcg_temp_free_i32(tmp);
3628
3629     return true;
3630 }
3631
3632 static bool trans_VREV32(DisasContext *s, arg_2misc *a)
3633 {
3634     static NeonGenOneOpFn * const fn[] = {
3635         tcg_gen_bswap32_i32,
3636         gen_swap_half,
3637         NULL,
3638         NULL,
3639     };
3640     return do_2misc(s, a, fn[a->size]);
3641 }
3642
3643 static bool trans_VREV16(DisasContext *s, arg_2misc *a)
3644 {
3645     if (a->size != 0) {
3646         return false;
3647     }
3648     return do_2misc(s, a, gen_rev16);
3649 }
3650
3651 static bool trans_VCLS(DisasContext *s, arg_2misc *a)
3652 {
3653     static NeonGenOneOpFn * const fn[] = {
3654         gen_helper_neon_cls_s8,
3655         gen_helper_neon_cls_s16,
3656         gen_helper_neon_cls_s32,
3657         NULL,
3658     };
3659     return do_2misc(s, a, fn[a->size]);
3660 }
3661
3662 static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm)
3663 {
3664     tcg_gen_clzi_i32(rd, rm, 32);
3665 }
3666
3667 static bool trans_VCLZ(DisasContext *s, arg_2misc *a)
3668 {
3669     static NeonGenOneOpFn * const fn[] = {
3670         gen_helper_neon_clz_u8,
3671         gen_helper_neon_clz_u16,
3672         do_VCLZ_32,
3673         NULL,
3674     };
3675     return do_2misc(s, a, fn[a->size]);
3676 }
3677
3678 static bool trans_VCNT(DisasContext *s, arg_2misc *a)
3679 {
3680     if (a->size != 0) {
3681         return false;
3682     }
3683     return do_2misc(s, a, gen_helper_neon_cnt_u8);
3684 }
3685
3686 static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3687                        uint32_t oprsz, uint32_t maxsz)
3688 {
3689     tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs,
3690                       vece == MO_16 ? 0x7fff : 0x7fffffff,
3691                       oprsz, maxsz);
3692 }
3693
3694 static bool trans_VABS_F(DisasContext *s, arg_2misc *a)
3695 {
3696     if (a->size == MO_16) {
3697         if (!dc_isar_feature(aa32_fp16_arith, s)) {
3698             return false;
3699         }
3700     } else if (a->size != MO_32) {
3701         return false;
3702     }
3703     return do_2misc_vec(s, a, gen_VABS_F);
3704 }
3705
3706 static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3707                        uint32_t oprsz, uint32_t maxsz)
3708 {
3709     tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs,
3710                       vece == MO_16 ? 0x8000 : 0x80000000,
3711                       oprsz, maxsz);
3712 }
3713
3714 static bool trans_VNEG_F(DisasContext *s, arg_2misc *a)
3715 {
3716     if (a->size == MO_16) {
3717         if (!dc_isar_feature(aa32_fp16_arith, s)) {
3718             return false;
3719         }
3720     } else if (a->size != MO_32) {
3721         return false;
3722     }
3723     return do_2misc_vec(s, a, gen_VNEG_F);
3724 }
3725
3726 static bool trans_VRECPE(DisasContext *s, arg_2misc *a)
3727 {
3728     if (a->size != 2) {
3729         return false;
3730     }
3731     return do_2misc(s, a, gen_helper_recpe_u32);
3732 }
3733
3734 static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a)
3735 {
3736     if (a->size != 2) {
3737         return false;
3738     }
3739     return do_2misc(s, a, gen_helper_rsqrte_u32);
3740 }
3741
/*
 * Define WRAPNAME, a two-argument (destination, source) generator that
 * calls FUNC with cpu_env inserted between the two operands. This adapts
 * helpers that need the CPU environment to the plain two-operand form.
 */
#define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \
    static void WRAPNAME(TCGv_i32 dst, TCGv_i32 src)    \
    {                                                   \
        FUNC(dst, cpu_env, src);                        \
    }
3747
/*
 * Two-operand wrappers around the qabs/qneg helpers, one per element
 * width (s8, s16, s32); each passes cpu_env through to its helper.
 */
WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8)
WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16)
WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32)
WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8)
WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16)
WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32)
3754
3755 static bool trans_VQABS(DisasContext *s, arg_2misc *a)
3756 {
3757     static NeonGenOneOpFn * const fn[] = {
3758         gen_VQABS_s8,
3759         gen_VQABS_s16,
3760         gen_VQABS_s32,
3761         NULL,
3762     };
3763     return do_2misc(s, a, fn[a->size]);
3764 }
3765
3766 static bool trans_VQNEG(DisasContext *s, arg_2misc *a)
3767 {
3768     static NeonGenOneOpFn * const fn[] = {
3769         gen_VQNEG_s8,
3770         gen_VQNEG_s16,
3771         gen_VQNEG_s32,
3772         NULL,
3773     };
3774     return do_2misc(s, a, fn[a->size]);
3775 }
3776
/*
 * Define gen_<INSN> and trans_<INSN> for a floating-point 2-reg-misc
 * instruction implemented by out-of-line gvec helpers.
 *
 * gen_<INSN> picks HFUNC for MO_16 elements and SFUNC for MO_32 elements,
 * obtains the matching float status pointer (FPST_STD_F16 for MO_16,
 * FPST_STD otherwise) and calls tcg_gen_gvec_2_ptr() with data 0.
 *
 * trans_<INSN> accepts MO_32 unconditionally and MO_16 only when
 * aa32_fp16_arith is present; any other size returns false.
 */
#define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC)                             \
    static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
                           uint32_t rm_ofs,                             \
                           uint32_t oprsz, uint32_t maxsz)              \
    {                                                                   \
        static gen_helper_gvec_2_ptr * const by_esz[4] = {              \
            [MO_16] = HFUNC,                                            \
            [MO_32] = SFUNC,                                            \
        };                                                              \
        TCGv_ptr status =                                               \
            fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD);      \
                                                                        \
        tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, status, oprsz, maxsz, 0,     \
                           by_esz[vece]);                               \
        tcg_temp_free_ptr(status);                                      \
    }                                                                   \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
    {                                                                   \
        switch (a->size) {                                              \
        case MO_16:                                                     \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
                return false;                                           \
            }                                                           \
            break;                                                      \
        case MO_32:                                                     \
            break;                                                      \
        default:                                                        \
            return false;                                               \
        }                                                               \
        return do_2misc_vec(s, a, gen_##INSN);                          \
    }
3802
/*
 * Floating-point 2-reg-misc instructions expanded through
 * DO_2MISC_FP_VEC. On each line the second argument is the helper used
 * for MO_16 elements and the third the helper used for MO_32 elements.
 */
DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s)
DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s)
DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s)
DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s)
DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s)
DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s)
DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s)
DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos)
DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos)
DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs)
DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs)
3814
3815 DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s)
3816
3817 static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
3818 {
3819     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
3820         return false;
3821     }
3822     return trans_VRINTX_impl(s, a);
3823 }
3824
/*
 * Define gen_<INSN> and trans_<INSN> for a floating-point 2-reg-misc
 * instruction that takes an explicit rounding mode.
 *
 * OP is a helper-name stem: gen_helper_gvec_<OP>h is used for MO_16
 * elements and gen_helper_gvec_<OP>s for MO_32 elements. RMODE is
 * converted with arm_rmode_to_sf() and passed to the helper as the gvec
 * data argument. The float status pointer is FPST_STD_F16 for MO_16
 * elements and FPST_STD otherwise.
 *
 * trans_<INSN> returns false unless ARM_FEATURE_V8 is present and the
 * size is MO_32, or MO_16 with aa32_fp16_arith.
 */
#define DO_VEC_RMODE(INSN, RMODE, OP)                                   \
    static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
                           uint32_t rm_ofs,                             \
                           uint32_t oprsz, uint32_t maxsz)              \
    {                                                                   \
        static gen_helper_gvec_2_ptr * const fns[4] = {                 \
            NULL,                                                       \
            gen_helper_gvec_##OP##h,                                    \
            gen_helper_gvec_##OP##s,                                    \
            NULL,                                                       \
        };                                                              \
        TCGv_ptr fpst;                                                  \
        /* Use the named element size, as DO_2MISC_FP_VEC does */       \
        fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD);   \
        tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz,          \
                           arm_rmode_to_sf(RMODE), fns[vece]);          \
        tcg_temp_free_ptr(fpst);                                        \
    }                                                                   \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
    {                                                                   \
        if (!arm_dc_feature(s, ARM_FEATURE_V8)) {                       \
            return false;                                               \
        }                                                               \
        if (a->size == MO_16) {                                         \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
                return false;                                           \
            }                                                           \
        } else if (a->size != MO_32) {                                  \
            return false;                                               \
        }                                                               \
        return do_2misc_vec(s, a, gen_##INSN);                          \
    }
3856
/*
 * VCVT{A,N,P,M}{U,S}: each pair shares one rounding mode and differs only
 * in whether the vcvt_rm_u or vcvt_rm_s helper stem is used.
 */
DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u)
DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s)
DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u)
DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s)
DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u)
DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s)
DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u)
DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s)

/*
 * VRINT{N,A,Z,M,P}: all share the vrint_rm_ helper stem and differ only
 * in the rounding mode passed to it.
 */
DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_)
DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_)
DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_)
DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_)
DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_)
3871
3872 static bool trans_VSWP(DisasContext *s, arg_2misc *a)
3873 {
3874     TCGv_i64 rm, rd;
3875     int pass;
3876
3877     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3878         return false;
3879     }
3880
3881     /* UNDEF accesses to D16-D31 if they don't exist. */
3882     if (!dc_isar_feature(aa32_simd_r32, s) &&
3883         ((a->vd | a->vm) & 0x10)) {
3884         return false;
3885     }
3886
3887     if (a->size != 0) {
3888         return false;
3889     }
3890
3891     if ((a->vd | a->vm) & a->q) {
3892         return false;
3893     }
3894
3895     if (!vfp_access_check(s)) {
3896         return true;
3897     }
3898
3899     rm = tcg_temp_new_i64();
3900     rd = tcg_temp_new_i64();
3901     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3902         read_neon_element64(rm, a->vm, pass, MO_64);
3903         read_neon_element64(rd, a->vd, pass, MO_64);
3904         write_neon_element64(rm, a->vd, pass, MO_64);
3905         write_neon_element64(rd, a->vm, pass, MO_64);
3906     }
3907     tcg_temp_free_i64(rm);
3908     tcg_temp_free_i64(rd);
3909
3910     return true;
3911 }
3912 static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1)
3913 {
3914     TCGv_i32 rd, tmp;
3915
3916     rd = tcg_temp_new_i32();
3917     tmp = tcg_temp_new_i32();
3918
3919     tcg_gen_shli_i32(rd, t0, 8);
3920     tcg_gen_andi_i32(rd, rd, 0xff00ff00);
3921     tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
3922     tcg_gen_or_i32(rd, rd, tmp);
3923
3924     tcg_gen_shri_i32(t1, t1, 8);
3925     tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
3926     tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
3927     tcg_gen_or_i32(t1, t1, tmp);
3928     tcg_gen_mov_i32(t0, rd);
3929
3930     tcg_temp_free_i32(tmp);
3931     tcg_temp_free_i32(rd);
3932 }
3933
3934 static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1)
3935 {
3936     TCGv_i32 rd, tmp;
3937
3938     rd = tcg_temp_new_i32();
3939     tmp = tcg_temp_new_i32();
3940
3941     tcg_gen_shli_i32(rd, t0, 16);
3942     tcg_gen_andi_i32(tmp, t1, 0xffff);
3943     tcg_gen_or_i32(rd, rd, tmp);
3944     tcg_gen_shri_i32(t1, t1, 16);
3945     tcg_gen_andi_i32(tmp, t0, 0xffff0000);
3946     tcg_gen_or_i32(t1, t1, tmp);
3947     tcg_gen_mov_i32(t0, rd);
3948
3949     tcg_temp_free_i32(tmp);
3950     tcg_temp_free_i32(rd);
3951 }
3952
3953 static bool trans_VTRN(DisasContext *s, arg_2misc *a)
3954 {
3955     TCGv_i32 tmp, tmp2;
3956     int pass;
3957
3958     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3959         return false;
3960     }
3961
3962     /* UNDEF accesses to D16-D31 if they don't exist. */
3963     if (!dc_isar_feature(aa32_simd_r32, s) &&
3964         ((a->vd | a->vm) & 0x10)) {
3965         return false;
3966     }
3967
3968     if ((a->vd | a->vm) & a->q) {
3969         return false;
3970     }
3971
3972     if (a->size == 3) {
3973         return false;
3974     }
3975
3976     if (!vfp_access_check(s)) {
3977         return true;
3978     }
3979
3980     tmp = tcg_temp_new_i32();
3981     tmp2 = tcg_temp_new_i32();
3982     if (a->size == MO_32) {
3983         for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) {
3984             read_neon_element32(tmp, a->vm, pass, MO_32);
3985             read_neon_element32(tmp2, a->vd, pass + 1, MO_32);
3986             write_neon_element32(tmp2, a->vm, pass, MO_32);
3987             write_neon_element32(tmp, a->vd, pass + 1, MO_32);
3988         }
3989     } else {
3990         for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3991             read_neon_element32(tmp, a->vm, pass, MO_32);
3992             read_neon_element32(tmp2, a->vd, pass, MO_32);
3993             if (a->size == MO_8) {
3994                 gen_neon_trn_u8(tmp, tmp2);
3995             } else {
3996                 gen_neon_trn_u16(tmp, tmp2);
3997             }
3998             write_neon_element32(tmp2, a->vm, pass, MO_32);
3999             write_neon_element32(tmp, a->vd, pass, MO_32);
4000         }
4001     }
4002     tcg_temp_free_i32(tmp);
4003     tcg_temp_free_i32(tmp2);
4004     return true;
4005 }
4006
4007 static bool trans_VSMMLA(DisasContext *s, arg_VSMMLA *a)
4008 {
4009     if (!dc_isar_feature(aa32_i8mm, s)) {
4010         return false;
4011     }
4012     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4013                         gen_helper_gvec_smmla_b);
4014 }
4015
4016 static bool trans_VUMMLA(DisasContext *s, arg_VUMMLA *a)
4017 {
4018     if (!dc_isar_feature(aa32_i8mm, s)) {
4019         return false;
4020     }
4021     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4022                         gen_helper_gvec_ummla_b);
4023 }
4024
4025 static bool trans_VUSMMLA(DisasContext *s, arg_VUSMMLA *a)
4026 {
4027     if (!dc_isar_feature(aa32_i8mm, s)) {
4028         return false;
4029     }
4030     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4031                         gen_helper_gvec_usmmla_b);
4032 }
4033
4034 static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a)
4035 {
4036     if (!dc_isar_feature(aa32_bf16, s)) {
4037         return false;
4038     }
4039     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4040                         gen_helper_gvec_bfmmla);
4041 }
4042
4043 static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a)
4044 {
4045     if (!dc_isar_feature(aa32_bf16, s)) {
4046         return false;
4047     }
4048     return do_neon_ddda_fpst(s, 7, a->vd, a->vn, a->vm, a->q, FPST_STD,
4049                              gen_helper_gvec_bfmlal);
4050 }
4051
4052 static bool trans_VFMA_b16_scal(DisasContext *s, arg_VFMA_b16_scal *a)
4053 {
4054     if (!dc_isar_feature(aa32_bf16, s)) {
4055         return false;
4056     }
4057     return do_neon_ddda_fpst(s, 6, a->vd, a->vn, a->vm,
4058                              (a->index << 1) | a->q, FPST_STD,
4059                              gen_helper_gvec_bfmlal_idx);
4060 }