target/arm/translate-neon.c

   1 /*
   2  *  ARM translation: AArch32 Neon instructions
   3  *
   4  *  Copyright (c) 2003 Fabrice Bellard
   5  *  Copyright (c) 2005-2007 CodeSourcery
   6  *  Copyright (c) 2007 OpenedHand, Ltd.
   7  *  Copyright (c) 2020 Linaro, Ltd.
   8  *
   9  * This library is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * This library is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  21  */
  22
  23 #include "qemu/osdep.h"
  24 #include "tcg/tcg-op.h"
  25 #include "tcg/tcg-op-gvec.h"
  26 #include "exec/exec-all.h"
  27 #include "exec/gen-icount.h"
  28 #include "translate.h"
  29 #include "translate-a32.h"
  30
  31 /* Include the generated Neon decoder */
  32 #include "decode-neon-dp.c.inc"
  33 #include "decode-neon-ls.c.inc"
  34 #include "decode-neon-shared.c.inc"
  35
  36 static TCGv_ptr vfp_reg_ptr(bool dp, int reg)
  37 {
  38     TCGv_ptr ret = tcg_temp_new_ptr();
  39     tcg_gen_addi_ptr(ret, cpu_env, vfp_reg_offset(dp, reg));
  40     return ret;
  41 }
  42
  43 static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
  44 {
  45     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
  46
  47     switch (mop) {
  48     case MO_UB:
  49         tcg_gen_ld8u_i32(var, cpu_env, offset);
  50         break;
  51     case MO_UW:
  52         tcg_gen_ld16u_i32(var, cpu_env, offset);
  53         break;
  54     case MO_UL:
  55         tcg_gen_ld_i32(var, cpu_env, offset);
  56         break;
  57     default:
  58         g_assert_not_reached();
  59     }
  60 }
  61
  62 static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
  63 {
  64     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
  65
  66     switch (mop) {
  67     case MO_UB:
  68         tcg_gen_ld8u_i64(var, cpu_env, offset);
  69         break;
  70     case MO_UW:
  71         tcg_gen_ld16u_i64(var, cpu_env, offset);
  72         break;
  73     case MO_UL:
  74         tcg_gen_ld32u_i64(var, cpu_env, offset);
  75         break;
  76     case MO_Q:
  77         tcg_gen_ld_i64(var, cpu_env, offset);
  78         break;
  79     default:
  80         g_assert_not_reached();
  81     }
  82 }
  83
  84 static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
  85 {
  86     long offset = neon_element_offset(reg, ele, size);
  87
  88     switch (size) {
  89     case MO_8:
  90         tcg_gen_st8_i32(var, cpu_env, offset);
  91         break;
  92     case MO_16:
  93         tcg_gen_st16_i32(var, cpu_env, offset);
  94         break;
  95     case MO_32:
  96         tcg_gen_st_i32(var, cpu_env, offset);
  97         break;
  98     default:
  99         g_assert_not_reached();
 100     }
 101 }
 102
 103 static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
 104 {
 105     long offset = neon_element_offset(reg, ele, size);
 106
 107     switch (size) {
 108     case MO_8:
 109         tcg_gen_st8_i64(var, cpu_env, offset);
 110         break;
 111     case MO_16:
 112         tcg_gen_st16_i64(var, cpu_env, offset);
 113         break;
 114     case MO_32:
 115         tcg_gen_st32_i64(var, cpu_env, offset);
 116         break;
 117     case MO_64:
 118         tcg_gen_st_i64(var, cpu_env, offset);
 119         break;
 120     default:
 121         g_assert_not_reached();
 122     }
 123 }
 124
 125 static bool do_neon_ddda(DisasContext *s, int q, int vd, int vn, int vm,
 126                          int data, gen_helper_gvec_4 *fn_gvec)
 127 {
 128     /* UNDEF accesses to D16-D31 if they don't exist. */
 129     if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
 130         return false;
 131     }
 132
 133     /*
 134      * UNDEF accesses to odd registers for each bit of Q.
 135      * Q will be 0b111 for all Q-reg instructions, otherwise
 136      * when we have mixed Q- and D-reg inputs.
 137      */
 138     if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
 139         return false;
 140     }
 141
 142     if (!vfp_access_check(s)) {
 143         return true;
 144     }
 145
 146     int opr_sz = q ? 16 : 8;
 147     tcg_gen_gvec_4_ool(vfp_reg_offset(1, vd),
 148                        vfp_reg_offset(1, vn),
 149                        vfp_reg_offset(1, vm),
 150                        vfp_reg_offset(1, vd),
 151                        opr_sz, opr_sz, data, fn_gvec);
 152     return true;
 153 }
 154
 155 static bool do_neon_ddda_fpst(DisasContext *s, int q, int vd, int vn, int vm,
 156                               int data, ARMFPStatusFlavour fp_flavour,
 157                               gen_helper_gvec_4_ptr *fn_gvec_ptr)
 158 {
 159     /* UNDEF accesses to D16-D31 if they don't exist. */
 160     if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
 161         return false;
 162     }
 163
 164     /*
 165      * UNDEF accesses to odd registers for each bit of Q.
 166      * Q will be 0b111 for all Q-reg instructions, otherwise
 167      * when we have mixed Q- and D-reg inputs.
 168      */
 169     if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
 170         return false;
 171     }
 172
 173     if (!vfp_access_check(s)) {
 174         return true;
 175     }
 176
 177     int opr_sz = q ? 16 : 8;
 178     TCGv_ptr fpst = fpstatus_ptr(fp_flavour);
 179
 180     tcg_gen_gvec_4_ptr(vfp_reg_offset(1, vd),
 181                        vfp_reg_offset(1, vn),
 182                        vfp_reg_offset(1, vm),
 183                        vfp_reg_offset(1, vd),
 184                        fpst, opr_sz, opr_sz, data, fn_gvec_ptr);
 185     tcg_temp_free_ptr(fpst);
 186     return true;
 187 }
 188
 189 static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
 190 {
 191     if (!dc_isar_feature(aa32_vcma, s)) {
 192         return false;
 193     }
 194     if (a->size == MO_16) {
 195         if (!dc_isar_feature(aa32_fp16_arith, s)) {
 196             return false;
 197         }
 198         return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
 199                                  FPST_STD_F16, gen_helper_gvec_fcmlah);
 200     }
 201     return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
 202                              FPST_STD, gen_helper_gvec_fcmlas);
 203 }
 204
 205 static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
 206 {
 207     int opr_sz;
 208     TCGv_ptr fpst;
 209     gen_helper_gvec_3_ptr *fn_gvec_ptr;
 210
 211     if (!dc_isar_feature(aa32_vcma, s)
 212         || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
 213         return false;
 214     }
 215
 216     /* UNDEF accesses to D16-D31 if they don't exist. */
 217     if (!dc_isar_feature(aa32_simd_r32, s) &&
 218         ((a->vd | a->vn | a->vm) & 0x10)) {
 219         return false;
 220     }
 221
 222     if ((a->vn | a->vm | a->vd) & a->q) {
 223         return false;
 224     }
 225
 226     if (!vfp_access_check(s)) {
 227         return true;
 228     }
 229
 230     opr_sz = (1 + a->q) * 8;
 231     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
 232     fn_gvec_ptr = (a->size == MO_16) ?
 233         gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds;
 234     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 235                        vfp_reg_offset(1, a->vn),
 236                        vfp_reg_offset(1, a->vm),
 237                        fpst, opr_sz, opr_sz, a->rot,
 238                        fn_gvec_ptr);
 239     tcg_temp_free_ptr(fpst);
 240     return true;
 241 }
 242
 243 static bool trans_VSDOT(DisasContext *s, arg_VSDOT *a)
 244 {
 245     if (!dc_isar_feature(aa32_dp, s)) {
 246         return false;
 247     }
 248     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 249                         gen_helper_gvec_sdot_b);
 250 }
 251
 252 static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a)
 253 {
 254     if (!dc_isar_feature(aa32_dp, s)) {
 255         return false;
 256     }
 257     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 258                         gen_helper_gvec_udot_b);
 259 }
 260
 261 static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a)
 262 {
 263     if (!dc_isar_feature(aa32_i8mm, s)) {
 264         return false;
 265     }
 266     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 267                         gen_helper_gvec_usdot_b);
 268 }
 269
 270 static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a)
 271 {
 272     if (!dc_isar_feature(aa32_bf16, s)) {
 273         return false;
 274     }
 275     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 276                         gen_helper_gvec_bfdot);
 277 }
 278
 279 static bool trans_VFML(DisasContext *s, arg_VFML *a)
 280 {
 281     int opr_sz;
 282
 283     if (!dc_isar_feature(aa32_fhm, s)) {
 284         return false;
 285     }
 286
 287     /* UNDEF accesses to D16-D31 if they don't exist. */
 288     if (!dc_isar_feature(aa32_simd_r32, s) &&
 289         (a->vd & 0x10)) {
 290         return false;
 291     }
 292
 293     if (a->vd & a->q) {
 294         return false;
 295     }
 296
 297     if (!vfp_access_check(s)) {
 298         return true;
 299     }
 300
 301     opr_sz = (1 + a->q) * 8;
 302     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 303                        vfp_reg_offset(a->q, a->vn),
 304                        vfp_reg_offset(a->q, a->vm),
 305                        cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
 306                        gen_helper_gvec_fmlal_a32);
 307     return true;
 308 }
 309
 310 static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
 311 {
 312     int data = (a->index << 2) | a->rot;
 313
 314     if (!dc_isar_feature(aa32_vcma, s)) {
 315         return false;
 316     }
 317     if (a->size == MO_16) {
 318         if (!dc_isar_feature(aa32_fp16_arith, s)) {
 319             return false;
 320         }
 321         return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
 322                                  FPST_STD_F16, gen_helper_gvec_fcmlah_idx);
 323     }
 324     return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
 325                              FPST_STD, gen_helper_gvec_fcmlas_idx);
 326 }
 327
 328 static bool trans_VSDOT_scalar(DisasContext *s, arg_VSDOT_scalar *a)
 329 {
 330     if (!dc_isar_feature(aa32_dp, s)) {
 331         return false;
 332     }
 333     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 334                         gen_helper_gvec_sdot_idx_b);
 335 }
 336
 337 static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a)
 338 {
 339     if (!dc_isar_feature(aa32_dp, s)) {
 340         return false;
 341     }
 342     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 343                         gen_helper_gvec_udot_idx_b);
 344 }
 345
 346 static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a)
 347 {
 348     if (!dc_isar_feature(aa32_i8mm, s)) {
 349         return false;
 350     }
 351     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 352                         gen_helper_gvec_usdot_idx_b);
 353 }
 354
 355 static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a)
 356 {
 357     if (!dc_isar_feature(aa32_i8mm, s)) {
 358         return false;
 359     }
 360     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 361                         gen_helper_gvec_sudot_idx_b);
 362 }
 363
 364 static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a)
 365 {
 366     if (!dc_isar_feature(aa32_bf16, s)) {
 367         return false;
 368     }
 369     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 370                         gen_helper_gvec_bfdot_idx);
 371 }
 372
 373 static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
 374 {
 375     int opr_sz;
 376
 377     if (!dc_isar_feature(aa32_fhm, s)) {
 378         return false;
 379     }
 380
 381     /* UNDEF accesses to D16-D31 if they don't exist. */
 382     if (!dc_isar_feature(aa32_simd_r32, s) &&
 383         ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
 384         return false;
 385     }
 386
 387     if (a->vd & a->q) {
 388         return false;
 389     }
 390
 391     if (!vfp_access_check(s)) {
 392         return true;
 393     }
 394
 395     opr_sz = (1 + a->q) * 8;
 396     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 397                        vfp_reg_offset(a->q, a->vn),
 398                        vfp_reg_offset(a->q, a->rm),
 399                        cpu_env, opr_sz, opr_sz,
 400                        (a->index << 2) | a->s, /* is_2 == 0 */
 401                        gen_helper_gvec_fmlal_idx_a32);
 402     return true;
 403 }
 404
 405 static struct {
 406     int nregs;
 407     int interleave;
 408     int spacing;
 409 } const neon_ls_element_type[11] = {
 410     {1, 4, 1},
 411     {1, 4, 2},
 412     {4, 1, 1},
 413     {2, 2, 2},
 414     {1, 3, 1},
 415     {1, 3, 2},
 416     {3, 1, 1},
 417     {1, 1, 1},
 418     {1, 2, 1},
 419     {1, 2, 2},
 420     {2, 1, 1}
 421 };
 422
 423 static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
 424                                       int stride)
 425 {
 426     if (rm != 15) {
 427         TCGv_i32 base;
 428
 429         base = load_reg(s, rn);
 430         if (rm == 13) {
 431             tcg_gen_addi_i32(base, base, stride);
 432         } else {
 433             TCGv_i32 index;
 434             index = load_reg(s, rm);
 435             tcg_gen_add_i32(base, base, index);
 436             tcg_temp_free_i32(index);
 437         }
 438         store_reg(s, rn, base);
 439     }
 440 }
 441
 442 static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
 443 {
 444     /* Neon load/store multiple structures */
 445     int nregs, interleave, spacing, reg, n;
 446     MemOp mop, align, endian;
 447     int mmu_idx = get_mem_index(s);
 448     int size = a->size;
 449     TCGv_i64 tmp64;
 450     TCGv_i32 addr, tmp;
 451
 452     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 453         return false;
 454     }
 455
 456     /* UNDEF accesses to D16-D31 if they don't exist */
 457     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 458         return false;
 459     }
 460     if (a->itype > 10) {
 461         return false;
 462     }
 463     /* Catch UNDEF cases for bad values of align field */
 464     switch (a->itype & 0xc) {
 465     case 4:
 466         if (a->align >= 2) {
 467             return false;
 468         }
 469         break;
 470     case 8:
 471         if (a->align == 3) {
 472             return false;
 473         }
 474         break;
 475     default:
 476         break;
 477     }
 478     nregs = neon_ls_element_type[a->itype].nregs;
 479     interleave = neon_ls_element_type[a->itype].interleave;
 480     spacing = neon_ls_element_type[a->itype].spacing;
 481     if (size == 3 && (interleave | spacing) != 1) {
 482         return false;
 483     }
 484
 485     if (!vfp_access_check(s)) {
 486         return true;
 487     }
 488
 489     /* For our purposes, bytes are always little-endian.  */
 490     endian = s->be_data;
 491     if (size == 0) {
 492         endian = MO_LE;
 493     }
 494
 495     /* Enforce alignment requested by the instruction */
 496     if (a->align) {
 497         align = pow2_align(a->align + 2); /* 4 ** a->align */
 498     } else {
 499         align = s->align_mem ? MO_ALIGN : 0;
 500     }
 501
 502     /*
 503      * Consecutive little-endian elements from a single register
 504      * can be promoted to a larger little-endian operation.
 505      */
 506     if (interleave == 1 && endian == MO_LE) {
 507         /* Retain any natural alignment. */
 508         if (align == MO_ALIGN) {
 509             align = pow2_align(size);
 510         }
 511         size = 3;
 512     }
 513
 514     tmp64 = tcg_temp_new_i64();
 515     addr = tcg_temp_new_i32();
 516     tmp = tcg_const_i32(1 << size);
 517     load_reg_var(s, addr, a->rn);
 518
 519     mop = endian | size | align;
 520     for (reg = 0; reg < nregs; reg++) {
 521         for (n = 0; n < 8 >> size; n++) {
 522             int xs;
 523             for (xs = 0; xs < interleave; xs++) {
 524                 int tt = a->vd + reg + spacing * xs;
 525
 526                 if (a->l) {
 527                     gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop);
 528                     neon_store_element64(tt, n, size, tmp64);
 529                 } else {
 530                     neon_load_element64(tmp64, tt, n, size);
 531                     gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop);
 532                 }
 533                 tcg_gen_add_i32(addr, addr, tmp);
 534
 535                 /* Subsequent memory operations inherit alignment */
 536                 mop &= ~MO_AMASK;
 537             }
 538         }
 539     }
 540     tcg_temp_free_i32(addr);
 541     tcg_temp_free_i32(tmp);
 542     tcg_temp_free_i64(tmp64);
 543
 544     gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
 545     return true;
 546 }
 547
 548 static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
 549 {
 550     /* Neon load single structure to all lanes */
 551     int reg, stride, vec_size;
 552     int vd = a->vd;
 553     int size = a->size;
 554     int nregs = a->n + 1;
 555     TCGv_i32 addr, tmp;
 556     MemOp mop, align;
 557
 558     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 559         return false;
 560     }
 561
 562     /* UNDEF accesses to D16-D31 if they don't exist */
 563     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 564         return false;
 565     }
 566
 567     align = 0;
 568     if (size == 3) {
 569         if (nregs != 4 || a->a == 0) {
 570             return false;
 571         }
 572         /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
 573         size = MO_32;
 574         align = MO_ALIGN_16;
 575     } else if (a->a) {
 576         switch (nregs) {
 577         case 1:
 578             if (size == 0) {
 579                 return false;
 580             }
 581             align = MO_ALIGN;
 582             break;
 583         case 2:
 584             align = pow2_align(size + 1);
 585             break;
 586         case 3:
 587             return false;
 588         case 4:
 589             align = pow2_align(size + 2);
 590             break;
 591         default:
 592             g_assert_not_reached();
 593         }
 594     }
 595
 596     if (!vfp_access_check(s)) {
 597         return true;
 598     }
 599
 600     /*
 601      * VLD1 to all lanes: T bit indicates how many Dregs to write.
 602      * VLD2/3/4 to all lanes: T bit indicates register stride.
 603      */
 604     stride = a->t ? 2 : 1;
 605     vec_size = nregs == 1 ? stride * 8 : 8;
 606     mop = size | align;
 607     tmp = tcg_temp_new_i32();
 608     addr = tcg_temp_new_i32();
 609     load_reg_var(s, addr, a->rn);
 610     for (reg = 0; reg < nregs; reg++) {
 611         gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop);
 612         if ((vd & 1) && vec_size == 16) {
 613             /*
 614              * We cannot write 16 bytes at once because the
 615              * destination is unaligned.
 616              */
 617             tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
 618                                  8, 8, tmp);
 619             tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1),
 620                              neon_full_reg_offset(vd), 8, 8);
 621         } else {
 622             tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
 623                                  vec_size, vec_size, tmp);
 624         }
 625         tcg_gen_addi_i32(addr, addr, 1 << size);
 626         vd += stride;
 627
 628         /* Subsequent memory operations inherit alignment */
 629         mop &= ~MO_AMASK;
 630     }
 631     tcg_temp_free_i32(tmp);
 632     tcg_temp_free_i32(addr);
 633
 634     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);
 635
 636     return true;
 637 }
 638
 639 static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
 640 {
 641     /* Neon load/store single structure to one lane */
 642     int reg;
 643     int nregs = a->n + 1;
 644     int vd = a->vd;
 645     TCGv_i32 addr, tmp;
 646     MemOp mop;
 647
 648     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 649         return false;
 650     }
 651
 652     /* UNDEF accesses to D16-D31 if they don't exist */
 653     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 654         return false;
 655     }
 656
 657     /* Catch the UNDEF cases. This is unavoidably a bit messy. */
 658     switch (nregs) {
 659     case 1:
 660         if (((a->align & (1 << a->size)) != 0) ||
 661             (a->size == 2 && (a->align == 1 || a->align == 2))) {
 662             return false;
 663         }
 664         break;
 665     case 3:
 666         if ((a->align & 1) != 0) {
 667             return false;
 668         }
 669         /* fall through */
 670     case 2:
 671         if (a->size == 2 && (a->align & 2) != 0) {
 672             return false;
 673         }
 674         break;
 675     case 4:
 676         if (a->size == 2 && a->align == 3) {
 677             return false;
 678         }
 679         break;
 680     default:
 681         abort();
 682     }
 683     if ((vd + a->stride * (nregs - 1)) > 31) {
 684         /*
 685          * Attempts to write off the end of the register file are
 686          * UNPREDICTABLE; we choose to UNDEF because otherwise we would
 687          * access off the end of the array that holds the register data.
 688          */
 689         return false;
 690     }
 691
 692     if (!vfp_access_check(s)) {
 693         return true;
 694     }
 695
 696     /* Pick up SCTLR settings */
 697     mop = finalize_memop(s, a->size);
 698
 699     if (a->align) {
 700         MemOp align_op;
 701
 702         switch (nregs) {
 703         case 1:
 704             /* For VLD1, use natural alignment. */
 705             align_op = MO_ALIGN;
 706             break;
 707         case 2:
 708             /* For VLD2, use double alignment. */
 709             align_op = pow2_align(a->size + 1);
 710             break;
 711         case 4:
 712             if (a->size == MO_32) {
 713                 /*
 714                  * For VLD4.32, align = 1 is double alignment, align = 2 is
 715                  * quad alignment; align = 3 is rejected above.
 716                  */
 717                 align_op = pow2_align(a->size + a->align);
 718             } else {
 719                 /* For VLD4.8 and VLD.16, we want quad alignment. */
 720                 align_op = pow2_align(a->size + 2);
 721             }
 722             break;
 723         default:
 724             /* For VLD3, the alignment field is zero and rejected above. */
 725             g_assert_not_reached();
 726         }
 727
 728         mop = (mop & ~MO_AMASK) | align_op;
 729     }
 730
 731     tmp = tcg_temp_new_i32();
 732     addr = tcg_temp_new_i32();
 733     load_reg_var(s, addr, a->rn);
 734
 735     for (reg = 0; reg < nregs; reg++) {
 736         if (a->l) {
 737             gen_aa32_ld_internal_i32(s, tmp, addr, get_mem_index(s), mop);
 738             neon_store_element(vd, a->reg_idx, a->size, tmp);
 739         } else { /* Store */
 740             neon_load_element(tmp, vd, a->reg_idx, a->size);
 741             gen_aa32_st_internal_i32(s, tmp, addr, get_mem_index(s), mop);
 742         }
 743         vd += a->stride;
 744         tcg_gen_addi_i32(addr, addr, 1 << a->size);
 745
 746         /* Subsequent memory operations inherit alignment */
 747         mop &= ~MO_AMASK;
 748     }
 749     tcg_temp_free_i32(addr);
 750     tcg_temp_free_i32(tmp);
 751
 752     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);
 753
 754     return true;
 755 }
 756
 757 static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
 758 {
 759     int vec_size = a->q ? 16 : 8;
 760     int rd_ofs = neon_full_reg_offset(a->vd);
 761     int rn_ofs = neon_full_reg_offset(a->vn);
 762     int rm_ofs = neon_full_reg_offset(a->vm);
 763
 764     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 765         return false;
 766     }
 767
 768     /* UNDEF accesses to D16-D31 if they don't exist. */
 769     if (!dc_isar_feature(aa32_simd_r32, s) &&
 770         ((a->vd | a->vn | a->vm) & 0x10)) {
 771         return false;
 772     }
 773
 774     if ((a->vn | a->vm | a->vd) & a->q) {
 775         return false;
 776     }
 777
 778     if (!vfp_access_check(s)) {
 779         return true;
 780     }
 781
 782     fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
 783     return true;
 784 }
 785
 786 #define DO_3SAME(INSN, FUNC)                                            \
 787     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 788     {                                                                   \
 789         return do_3same(s, a, FUNC);                                    \
 790     }
 791
 792 DO_3SAME(VADD, tcg_gen_gvec_add)
 793 DO_3SAME(VSUB, tcg_gen_gvec_sub)
 794 DO_3SAME(VAND, tcg_gen_gvec_and)
 795 DO_3SAME(VBIC, tcg_gen_gvec_andc)
 796 DO_3SAME(VORR, tcg_gen_gvec_or)
 797 DO_3SAME(VORN, tcg_gen_gvec_orc)
 798 DO_3SAME(VEOR, tcg_gen_gvec_xor)
 799 DO_3SAME(VSHL_S, gen_gvec_sshl)
 800 DO_3SAME(VSHL_U, gen_gvec_ushl)
 801 DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
 802 DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
 803 DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
 804 DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)
 805
 806 /* These insns are all gvec_bitsel but with the inputs in various orders. */
 807 #define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
 808     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 809                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 810                                 uint32_t oprsz, uint32_t maxsz)         \
 811     {                                                                   \
 812         tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
 813     }                                                                   \
 814     DO_3SAME(INSN, gen_##INSN##_3s)
 815
 816 DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
 817 DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
 818 DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
 819
 820 #define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
 821     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 822     {                                                                   \
 823         if (a->size == 3) {                                             \
 824             return false;                                               \
 825         }                                                               \
 826         return do_3same(s, a, FUNC);                                    \
 827     }
 828
 829 DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
 830 DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
 831 DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
 832 DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
 833 DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
 834 DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
 835 DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
 836 DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
 837 DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
 838 DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
 839 DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
 840 DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)
 841
 842 #define DO_3SAME_CMP(INSN, COND)                                        \
 843     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 844                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 845                                 uint32_t oprsz, uint32_t maxsz)         \
 846     {                                                                   \
 847         tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
 848     }                                                                   \
 849     DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)
 850
 851 DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
 852 DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
 853 DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
 854 DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
 855 DO_3SAME_CMP(VCEQ, TCG_COND_EQ)
 856
 857 #define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
 858     static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
 859                          uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
 860     {                                                                      \
 861         tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
 862     }
 863
 864 WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)
 865
 866 static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
 867 {
 868     if (a->size != 0) {
 869         return false;
 870     }
 871     return do_3same(s, a, gen_VMUL_p_3s);
 872 }
 873
 874 #define DO_VQRDMLAH(INSN, FUNC)                                         \
 875     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 876     {                                                                   \
 877         if (!dc_isar_feature(aa32_rdm, s)) {                            \
 878             return false;                                               \
 879         }                                                               \
 880         if (a->size != 1 && a->size != 2) {                             \
 881             return false;                                               \
 882         }                                                               \
 883         return do_3same(s, a, FUNC);                                    \
 884     }
 885
 886 DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
 887 DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)
 888
 889 #define DO_SHA1(NAME, FUNC)                                             \
 890     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
 891     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
 892     {                                                                   \
 893         if (!dc_isar_feature(aa32_sha1, s)) {                           \
 894             return false;                                               \
 895         }                                                               \
 896         return do_3same(s, a, gen_##NAME##_3s);                         \
 897     }
 898
 899 DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
 900 DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
 901 DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
 902 DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)
 903
 904 #define DO_SHA2(NAME, FUNC)                                             \
 905     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
 906     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
 907     {                                                                   \
 908         if (!dc_isar_feature(aa32_sha2, s)) {                           \
 909             return false;                                               \
 910         }                                                               \
 911         return do_3same(s, a, gen_##NAME##_3s);                         \
 912     }
 913
/* SHA2 insns: each line defines trans_<NAME>_3s() around the named helper. */
DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)
 917
 918 #define DO_3SAME_64(INSN, FUNC)                                         \
 919     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 920                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 921                                 uint32_t oprsz, uint32_t maxsz)         \
 922     {                                                                   \
 923         static const GVecGen3 op = { .fni8 = FUNC };                    \
 924         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op);      \
 925     }                                                                   \
 926     DO_3SAME(INSN, gen_##INSN##_3s)
 927
 928 #define DO_3SAME_64_ENV(INSN, FUNC)                                     \
 929     static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)    \
 930     {                                                                   \
 931         FUNC(d, cpu_env, n, m);                                         \
 932     }                                                                   \
 933     DO_3SAME_64(INSN, gen_##INSN##_elt)
 934
/*
 * 64-bit element forms. The rshl helpers are usable directly; the
 * qshl/qrshl helpers go through DO_3SAME_64_ENV because they are
 * called with cpu_env as an extra argument.
 */
DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)
 941
 942 #define DO_3SAME_32(INSN, FUNC)                                         \
 943     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 944                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 945                                 uint32_t oprsz, uint32_t maxsz)         \
 946     {                                                                   \
 947         static const GVecGen3 ops[4] = {                                \
 948             { .fni4 = gen_helper_neon_##FUNC##8 },                      \
 949             { .fni4 = gen_helper_neon_##FUNC##16 },                     \
 950             { .fni4 = gen_helper_neon_##FUNC##32 },                     \
 951             { 0 },                                                      \
 952         };                                                              \
 953         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
 954     }                                                                   \
 955     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 956     {                                                                   \
 957         if (a->size > 2) {                                              \
 958             return false;                                               \
 959         }                                                               \
 960         return do_3same(s, a, gen_##INSN##_3s);                         \
 961     }
 962
 963 /*
 964  * Some helper functions need to be passed the cpu_env. In order
 965  * to use those with the gvec APIs like tcg_gen_gvec_3() we need
 966  * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
 967  * and which call a NeonGenTwoOpEnvFn().
 968  */
 969 #define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
 970     static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)            \
 971     {                                                                   \
 972         FUNC(d, cpu_env, n, m);                                         \
 973     }
 974
 975 #define DO_3SAME_32_ENV(INSN, FUNC)                                     \
 976     WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8);        \
 977     WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16);      \
 978     WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32);      \
 979     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 980                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 981                                 uint32_t oprsz, uint32_t maxsz)         \
 982     {                                                                   \
 983         static const GVecGen3 ops[4] = {                                \
 984             { .fni4 = gen_##INSN##_tramp8 },                            \
 985             { .fni4 = gen_##INSN##_tramp16 },                           \
 986             { .fni4 = gen_##INSN##_tramp32 },                           \
 987             { 0 },                                                      \
 988         };                                                              \
 989         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
 990     }                                                                   \
 991     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 992     {                                                                   \
 993         if (a->size > 2) {                                              \
 994             return false;                                               \
 995         }                                                               \
 996         return do_3same(s, a, gen_##INSN##_3s);                         \
 997     }
 998
/*
 * Insns built from gen_helper_neon_<FUNC>{8,16,32}. The second macro
 * argument is the helper name stem; the element width is appended by
 * the macro.
 */
DO_3SAME_32(VHADD_S, hadd_s)
DO_3SAME_32(VHADD_U, hadd_u)
DO_3SAME_32(VHSUB_S, hsub_s)
DO_3SAME_32(VHSUB_U, hsub_u)
DO_3SAME_32(VRHADD_S, rhadd_s)
DO_3SAME_32(VRHADD_U, rhadd_u)
DO_3SAME_32(VRSHL_S, rshl_s)
DO_3SAME_32(VRSHL_U, rshl_u)

/* Same pattern, for helpers that are additionally passed cpu_env. */
DO_3SAME_32_ENV(VQSHL_S, qshl_s)
DO_3SAME_32_ENV(VQSHL_U, qshl_u)
DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)
1012
1013 static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
1014 {
1015     /* Operations handled pairwise 32 bits at a time */
1016     TCGv_i32 tmp, tmp2, tmp3;
1017
1018     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1019         return false;
1020     }
1021
1022     /* UNDEF accesses to D16-D31 if they don't exist. */
1023     if (!dc_isar_feature(aa32_simd_r32, s) &&
1024         ((a->vd | a->vn | a->vm) & 0x10)) {
1025         return false;
1026     }
1027
1028     if (a->size == 3) {
1029         return false;
1030     }
1031
1032     if (!vfp_access_check(s)) {
1033         return true;
1034     }
1035
1036     assert(a->q == 0); /* enforced by decode patterns */
1037
1038     /*
1039      * Note that we have to be careful not to clobber the source operands
1040      * in the "vm == vd" case by storing the result of the first pass too
1041      * early. Since Q is 0 there are always just two passes, so instead
1042      * of a complicated loop over each pass we just unroll.
1043      */
1044     tmp = tcg_temp_new_i32();
1045     tmp2 = tcg_temp_new_i32();
1046     tmp3 = tcg_temp_new_i32();
1047
1048     read_neon_element32(tmp, a->vn, 0, MO_32);
1049     read_neon_element32(tmp2, a->vn, 1, MO_32);
1050     fn(tmp, tmp, tmp2);
1051
1052     read_neon_element32(tmp3, a->vm, 0, MO_32);
1053     read_neon_element32(tmp2, a->vm, 1, MO_32);
1054     fn(tmp3, tmp3, tmp2);
1055
1056     write_neon_element32(tmp, a->vd, 0, MO_32);
1057     write_neon_element32(tmp3, a->vd, 1, MO_32);
1058
1059     tcg_temp_free_i32(tmp);
1060     tcg_temp_free_i32(tmp2);
1061     tcg_temp_free_i32(tmp3);
1062     return true;
1063 }
1064
/*
 * Define trans_<INSN>_3s() for an integer pairwise insn. The per-size
 * function gen_helper_neon_<func>{8,16,32} is looked up by a->size and
 * given to do_3same_pair(); size > 2 returns false before the table
 * is indexed.
 */
#define DO_3SAME_PAIR(INSN, func)                                       \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        static NeonGenTwoOpFn * const pair_fns[] = {                    \
            gen_helper_neon_##func##8,                                  \
            gen_helper_neon_##func##16,                                 \
            gen_helper_neon_##func##32,                                 \
        };                                                              \
        return a->size <= 2 &&                                          \
               do_3same_pair(s, a, pair_fns[a->size]);                  \
    }
1078
/*
 * 32-bit pairwise ops end up the same as the elementwise versions.
 * DO_3SAME_PAIR builds the name gen_helper_neon_<func>32 by token
 * pasting, so alias those names to the corresponding TCG ops rather
 * than providing dedicated helpers.
 */
#define gen_helper_neon_pmax_s32  tcg_gen_smax_i32
#define gen_helper_neon_pmax_u32  tcg_gen_umax_i32
#define gen_helper_neon_pmin_s32  tcg_gen_smin_i32
#define gen_helper_neon_pmin_u32  tcg_gen_umin_i32
#define gen_helper_neon_padd_u32  tcg_gen_add_i32
1085
/* Integer pairwise max/min/add: (insn, helper name stem). */
DO_3SAME_PAIR(VPMAX_S, pmax_s)
DO_3SAME_PAIR(VPMIN_S, pmin_s)
DO_3SAME_PAIR(VPMAX_U, pmax_u)
DO_3SAME_PAIR(VPMIN_U, pmin_u)
DO_3SAME_PAIR(VPADD, padd_u)
1091
/*
 * Define gen_<INSN>_3s() and trans_<INSN>_3s() for an insn implemented
 * by gen_helper_neon_<FUNC>_s16 / _s32, both of which take cpu_env and
 * are therefore wrapped in WRAP_ENV_FN trampolines.
 *
 * The translator only accepts size 1 and 2, so the expander table has
 * just two entries and is indexed by vece - 1.
 *
 * WRAP_ENV_FN expands to a complete function definition, so no ';'
 * follows it (an extra ';' at file scope is not valid ISO C).
 */
#define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16)     \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32)     \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[2] = {                                \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }
1112
/* (insn, helper name stem); the macro appends _s16 / _s32 to the stem. */
DO_3SAME_VQDMULH(VQDMULH, qdmulh)
DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)
1115
/*
 * Define WRAPNAME as a gvec expander for a floating-point helper:
 * it obtains the float-status pointer selected by FPST, passes it
 * with FUNC to tcg_gen_gvec_3_ptr() (data argument 0), and then
 * releases the pointer temporary. The vece argument is not consulted.
 */
#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC)                              \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rn_ofs, uint32_t rm_ofs,              \
                         uint32_t oprsz, uint32_t maxsz)                \
    {                                                                   \
        TCGv_ptr fp_status = fpstatus_ptr(FPST);                        \
                                                                        \
        tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fp_status,           \
                           oprsz, maxsz, 0, FUNC);                      \
        tcg_temp_free_ptr(fp_status);                                   \
    }
1126
/*
 * Define trans_<INSN>_fp_3s() for a 3-reg-same FP insn with a gvec
 * helper per precision. SFUNC is wrapped with FPST_STD and HFUNC with
 * FPST_STD_F16. size == MO_16 selects the fp16 wrapper, which is only
 * allowed when the aa32_fp16_arith feature is present; every other
 * size uses the fp32 wrapper.
 */
#define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC)                                 \
    WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC)                 \
    WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC)             \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
    {                                                                   \
        if (a->size != MO_16) {                                         \
            return do_3same(s, a, gen_##INSN##_fp32_3s);                \
        }                                                               \
        if (!dc_isar_feature(aa32_fp16_arith, s)) {                     \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_fp16_3s);                    \
    }
1140
1141
/* 3-reg-same FP insns: (insn, fp32 gvec helper, fp16 gvec helper). */
DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)
1159
/*
 * Only the gvec wrappers are generated here for VMAXNM/VMINNM; their
 * trans functions are written out by hand because they also check
 * ARM_FEATURE_V8.
 */
WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)
1164
1165 static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
1166 {
1167     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1168         return false;
1169     }
1170
1171     if (a->size == MO_16) {
1172         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1173             return false;
1174         }
1175         return do_3same(s, a, gen_VMAXNM_fp16_3s);
1176     }
1177     return do_3same(s, a, gen_VMAXNM_fp32_3s);
1178 }
1179
1180 static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
1181 {
1182     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1183         return false;
1184     }
1185
1186     if (a->size == MO_16) {
1187         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1188             return false;
1189         }
1190         return do_3same(s, a, gen_VMINNM_fp16_3s);
1191     }
1192     return do_3same(s, a, gen_VMINNM_fp32_3s);
1193 }
1194
1195 static bool do_3same_fp_pair(DisasContext *s, arg_3same *a,
1196                              gen_helper_gvec_3_ptr *fn)
1197 {
1198     /* FP pairwise operations */
1199     TCGv_ptr fpstatus;
1200
1201     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1202         return false;
1203     }
1204
1205     /* UNDEF accesses to D16-D31 if they don't exist. */
1206     if (!dc_isar_feature(aa32_simd_r32, s) &&
1207         ((a->vd | a->vn | a->vm) & 0x10)) {
1208         return false;
1209     }
1210
1211     if (!vfp_access_check(s)) {
1212         return true;
1213     }
1214
1215     assert(a->q == 0); /* enforced by decode patterns */
1216
1217
1218     fpstatus = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1219     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
1220                        vfp_reg_offset(1, a->vn),
1221                        vfp_reg_offset(1, a->vm),
1222                        fpstatus, 8, 8, 0, fn);
1223     tcg_temp_free_ptr(fpstatus);
1224
1225     return true;
1226 }
1227
/*
 * Define trans_<INSN>_fp_3s() for an FP pairwise insn. FUNC is a
 * helper name prefix: for size == MO_16 (fp16) the helper FUNC##h is
 * used, and the insn is rejected unless the aa32_fp16_arith feature
 * is present; for any other size the helper FUNC##s is used.
 */
#define DO_3S_FP_PAIR(INSN,FUNC)                                    \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
    {                                                               \
        if (a->size == MO_16) {                                     \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {             \
                return false;                                       \
            }                                                       \
            return do_3same_fp_pair(s, a, FUNC##h);                 \
        }                                                           \
        return do_3same_fp_pair(s, a, FUNC##s);                     \
    }
1243
/* FP pairwise insns: (insn, helper prefix to which 'h' or 's' is appended). */
DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd)
DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax)
DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin)
1247
1248 static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
1249 {
1250     /* Handle a 2-reg-shift insn which can be vectorized. */
1251     int vec_size = a->q ? 16 : 8;
1252     int rd_ofs = neon_full_reg_offset(a->vd);
1253     int rm_ofs = neon_full_reg_offset(a->vm);
1254
1255     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1256         return false;
1257     }
1258
1259     /* UNDEF accesses to D16-D31 if they don't exist. */
1260     if (!dc_isar_feature(aa32_simd_r32, s) &&
1261         ((a->vd | a->vm) & 0x10)) {
1262         return false;
1263     }
1264
1265     if ((a->vm | a->vd) & a->q) {
1266         return false;
1267     }
1268
1269     if (!vfp_access_check(s)) {
1270         return true;
1271     }
1272
1273     fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
1274     return true;
1275 }
1276
/*
 * Define trans_<INSN>_2sh() for a 2-reg-and-shift insn whose whole
 * operation is carried out by FUNC, a gvec expander that is passed
 * straight to do_vector_2sh().
 */
#define DO_2SH(INSN, FUNC)                                              \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_vector_2sh(s, a, FUNC);                               \
    }
1282
/* Shift-by-immediate insns that map directly onto one gvec expander. */
DO_2SH(VSHL, tcg_gen_gvec_shli)
DO_2SH(VSLI, gen_gvec_sli)
DO_2SH(VSRI, gen_gvec_sri)
DO_2SH(VSRA_S, gen_gvec_ssra)
DO_2SH(VSRA_U, gen_gvec_usra)
DO_2SH(VRSHR_S, gen_gvec_srshr)
DO_2SH(VRSHR_U, gen_gvec_urshr)
DO_2SH(VRSRA_S, gen_gvec_srsra)
DO_2SH(VRSRA_U, gen_gvec_ursra)
1292
1293 static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
1294 {
1295     /* Signed shift out of range results in all-sign-bits */
1296     a->shift = MIN(a->shift, (8 << a->size) - 1);
1297     return do_vector_2sh(s, a, tcg_gen_gvec_sari);
1298 }
1299
/*
 * Expander with the same parameter list as the shift expanders passed
 * to do_vector_2sh(), but which ignores rm_ofs and shift and simply
 * fills the destination with the immediate 0.
 */
static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                            int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
}
1305
1306 static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
1307 {
1308     /* Shift out of range is architecturally valid and results in zero. */
1309     if (a->shift >= (8 << a->size)) {
1310         return do_vector_2sh(s, a, gen_zero_rd_2sh);
1311     } else {
1312         return do_vector_2sh(s, a, tcg_gen_gvec_shri);
1313     }
1314 }
1315
1316 static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
1317                              NeonGenTwo64OpEnvFn *fn)
1318 {
1319     /*
1320      * 2-reg-and-shift operations, size == 3 case, where the
1321      * function needs to be passed cpu_env.
1322      */
1323     TCGv_i64 constimm;
1324     int pass;
1325
1326     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1327         return false;
1328     }
1329
1330     /* UNDEF accesses to D16-D31 if they don't exist. */
1331     if (!dc_isar_feature(aa32_simd_r32, s) &&
1332         ((a->vd | a->vm) & 0x10)) {
1333         return false;
1334     }
1335
1336     if ((a->vm | a->vd) & a->q) {
1337         return false;
1338     }
1339
1340     if (!vfp_access_check(s)) {
1341         return true;
1342     }
1343
1344     /*
1345      * To avoid excessive duplication of ops we implement shift
1346      * by immediate using the variable shift operations.
1347      */
1348     constimm = tcg_const_i64(dup_const(a->size, a->shift));
1349
1350     for (pass = 0; pass < a->q + 1; pass++) {
1351         TCGv_i64 tmp = tcg_temp_new_i64();
1352
1353         read_neon_element64(tmp, a->vm, pass, MO_64);
1354         fn(tmp, cpu_env, tmp, constimm);
1355         write_neon_element64(tmp, a->vd, pass, MO_64);
1356         tcg_temp_free_i64(tmp);
1357     }
1358     tcg_temp_free_i64(constimm);
1359     return true;
1360 }
1361
1362 static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
1363                              NeonGenTwoOpEnvFn *fn)
1364 {
1365     /*
1366      * 2-reg-and-shift operations, size < 3 case, where the
1367      * helper needs to be passed cpu_env.
1368      */
1369     TCGv_i32 constimm, tmp;
1370     int pass;
1371
1372     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1373         return false;
1374     }
1375
1376     /* UNDEF accesses to D16-D31 if they don't exist. */
1377     if (!dc_isar_feature(aa32_simd_r32, s) &&
1378         ((a->vd | a->vm) & 0x10)) {
1379         return false;
1380     }
1381
1382     if ((a->vm | a->vd) & a->q) {
1383         return false;
1384     }
1385
1386     if (!vfp_access_check(s)) {
1387         return true;
1388     }
1389
1390     /*
1391      * To avoid excessive duplication of ops we implement shift
1392      * by immediate using the variable shift operations.
1393      */
1394     constimm = tcg_const_i32(dup_const(a->size, a->shift));
1395     tmp = tcg_temp_new_i32();
1396
1397     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1398         read_neon_element32(tmp, a->vm, pass, MO_32);
1399         fn(tmp, cpu_env, tmp, constimm);
1400         write_neon_element32(tmp, a->vd, pass, MO_32);
1401     }
1402     tcg_temp_free_i32(tmp);
1403     tcg_temp_free_i32(constimm);
1404     return true;
1405 }
1406
/*
 * Define the two translators for a 2-reg-and-shift insn implemented
 * by helpers that take cpu_env:
 *  - trans_<INSN>_64_2sh() uses gen_helper_neon_<FUNC>64 through
 *    do_2shift_env_64();
 *  - trans_<INSN>_2sh() picks gen_helper_neon_<FUNC>{8,16,32} by
 *    a->size (asserted to be within the table) and uses
 *    do_2shift_env_32().
 */
#define DO_2SHIFT_ENV(INSN, FUNC)                                       \
    static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
    {                                                                   \
        return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
    }                                                                   \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        static NeonGenTwoOpEnvFn * const env_fns[] = {                  \
            gen_helper_neon_##FUNC##8,                                  \
            gen_helper_neon_##FUNC##16,                                 \
            gen_helper_neon_##FUNC##32,                                 \
        };                                                              \
                                                                        \
        assert(a->size < ARRAY_SIZE(env_fns));                          \
        return do_2shift_env_32(s, a, env_fns[a->size]);                \
    }
1422
/* (insn, helper name stem); the macro appends 8/16/32/64 to the stem. */
DO_2SHIFT_ENV(VQSHLU, qshlu_s)
DO_2SHIFT_ENV(VQSHL_U, qshl_u)
DO_2SHIFT_ENV(VQSHL_S, qshl_s)
1426
1427 static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
1428                                 NeonGenTwo64OpFn *shiftfn,
1429                                 NeonGenNarrowEnvFn *narrowfn)
1430 {
1431     /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
1432     TCGv_i64 constimm, rm1, rm2;
1433     TCGv_i32 rd;
1434
1435     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1436         return false;
1437     }
1438
1439     /* UNDEF accesses to D16-D31 if they don't exist. */
1440     if (!dc_isar_feature(aa32_simd_r32, s) &&
1441         ((a->vd | a->vm) & 0x10)) {
1442         return false;
1443     }
1444
1445     if (a->vm & 1) {
1446         return false;
1447     }
1448
1449     if (!vfp_access_check(s)) {
1450         return true;
1451     }
1452
1453     /*
1454      * This is always a right shift, and the shiftfn is always a
1455      * left-shift helper, which thus needs the negated shift count.
1456      */
1457     constimm = tcg_const_i64(-a->shift);
1458     rm1 = tcg_temp_new_i64();
1459     rm2 = tcg_temp_new_i64();
1460     rd = tcg_temp_new_i32();
1461
1462     /* Load both inputs first to avoid potential overwrite if rm == rd */
1463     read_neon_element64(rm1, a->vm, 0, MO_64);
1464     read_neon_element64(rm2, a->vm, 1, MO_64);
1465
1466     shiftfn(rm1, rm1, constimm);
1467     narrowfn(rd, cpu_env, rm1);
1468     write_neon_element32(rd, a->vd, 0, MO_32);
1469
1470     shiftfn(rm2, rm2, constimm);
1471     narrowfn(rd, cpu_env, rm2);
1472     write_neon_element32(rd, a->vd, 1, MO_32);
1473
1474     tcg_temp_free_i32(rd);
1475     tcg_temp_free_i64(rm1);
1476     tcg_temp_free_i64(rm2);
1477     tcg_temp_free_i64(constimm);
1478
1479     return true;
1480 }
1481
1482 static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
1483                                 NeonGenTwoOpFn *shiftfn,
1484                                 NeonGenNarrowEnvFn *narrowfn)
1485 {
1486     /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
1487     TCGv_i32 constimm, rm1, rm2, rm3, rm4;
1488     TCGv_i64 rtmp;
1489     uint32_t imm;
1490
1491     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1492         return false;
1493     }
1494
1495     /* UNDEF accesses to D16-D31 if they don't exist. */
1496     if (!dc_isar_feature(aa32_simd_r32, s) &&
1497         ((a->vd | a->vm) & 0x10)) {
1498         return false;
1499     }
1500
1501     if (a->vm & 1) {
1502         return false;
1503     }
1504
1505     if (!vfp_access_check(s)) {
1506         return true;
1507     }
1508
1509     /*
1510      * This is always a right shift, and the shiftfn is always a
1511      * left-shift helper, which thus needs the negated shift count
1512      * duplicated into each lane of the immediate value.
1513      */
1514     if (a->size == 1) {
1515         imm = (uint16_t)(-a->shift);
1516         imm |= imm << 16;
1517     } else {
1518         /* size == 2 */
1519         imm = -a->shift;
1520     }
1521     constimm = tcg_const_i32(imm);
1522
1523     /* Load all inputs first to avoid potential overwrite */
1524     rm1 = tcg_temp_new_i32();
1525     rm2 = tcg_temp_new_i32();
1526     rm3 = tcg_temp_new_i32();
1527     rm4 = tcg_temp_new_i32();
1528     read_neon_element32(rm1, a->vm, 0, MO_32);
1529     read_neon_element32(rm2, a->vm, 1, MO_32);
1530     read_neon_element32(rm3, a->vm, 2, MO_32);
1531     read_neon_element32(rm4, a->vm, 3, MO_32);
1532     rtmp = tcg_temp_new_i64();
1533
1534     shiftfn(rm1, rm1, constimm);
1535     shiftfn(rm2, rm2, constimm);
1536
1537     tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
1538     tcg_temp_free_i32(rm2);
1539
1540     narrowfn(rm1, cpu_env, rtmp);
1541     write_neon_element32(rm1, a->vd, 0, MO_32);
1542     tcg_temp_free_i32(rm1);
1543
1544     shiftfn(rm3, rm3, constimm);
1545     shiftfn(rm4, rm4, constimm);
1546     tcg_temp_free_i32(constimm);
1547
1548     tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
1549     tcg_temp_free_i32(rm4);
1550
1551     narrowfn(rm3, cpu_env, rtmp);
1552     tcg_temp_free_i64(rtmp);
1553     write_neon_element32(rm3, a->vd, 1, MO_32);
1554     tcg_temp_free_i32(rm3);
1555     return true;
1556 }
1557
/*
 * Define trans_<INSN>_2sh() as a call to do_2shift_narrow_64() with
 * FUNC as the shift function and NARROWFUNC as the narrowing function.
 */
#define DO_2SN_64(INSN, FUNC, NARROWFUNC)                               \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC);             \
    }
/*
 * Define trans_<INSN>_2sh() as a call to do_2shift_narrow_32() with
 * FUNC as the shift function and NARROWFUNC as the narrowing function.
 */
#define DO_2SN_32(INSN, FUNC, NARROWFUNC)                               \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);             \
    }
1568
/*
 * Narrowing function usable as the narrowfn argument of the
 * do_2shift_narrow_*() functions: env is accepted but unused, and the
 * result is produced by tcg_gen_extrl_i64_i32() from src.
 */
static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    tcg_gen_extrl_i64_i32(dest, src);
}
1573
/*
 * Narrowing function usable as the narrowfn argument of the
 * do_2shift_narrow_*() functions: env is accepted but unused, and the
 * work is done by gen_helper_neon_narrow_u16(), which takes no env.
 */
static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    gen_helper_neon_narrow_u16(dest, src);
}
1578
/*
 * Narrowing function usable as the narrowfn argument of the
 * do_2shift_narrow_*() functions: env is accepted but unused, and the
 * work is done by gen_helper_neon_narrow_u8(), which takes no env.
 */
static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    gen_helper_neon_narrow_u8(dest, src);
}
1583
/*
 * Narrowing shifts: (insn, shift function, narrowing function).
 * The _64 variants go through do_2shift_narrow_64(), the _32 and _16
 * variants through do_2shift_narrow_32().
 */
DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)

DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)

DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)

DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)

DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)

DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)

DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)

DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1614
1615 static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
1616                          NeonGenWidenFn *widenfn, bool u)
1617 {
1618     TCGv_i64 tmp;
1619     TCGv_i32 rm0, rm1;
1620     uint64_t widen_mask = 0;
1621
1622     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1623         return false;
1624     }
1625
1626     /* UNDEF accesses to D16-D31 if they don't exist. */
1627     if (!dc_isar_feature(aa32_simd_r32, s) &&
1628         ((a->vd | a->vm) & 0x10)) {
1629         return false;
1630     }
1631
1632     if (a->vd & 1) {
1633         return false;
1634     }
1635
1636     if (!vfp_access_check(s)) {
1637         return true;
1638     }
1639
1640     /*
1641      * This is a widen-and-shift operation. The shift is always less
1642      * than the width of the source type, so after widening the input
1643      * vector we can simply shift the whole 64-bit widened register,
1644      * and then clear the potential overflow bits resulting from left
1645      * bits of the narrow input appearing as right bits of the left
1646      * neighbour narrow input. Calculate a mask of bits to clear.
1647      */
1648     if ((a->shift != 0) && (a->size < 2 || u)) {
1649         int esize = 8 << a->size;
1650         widen_mask = MAKE_64BIT_MASK(0, esize);
1651         widen_mask >>= esize - a->shift;
1652         widen_mask = dup_const(a->size + 1, widen_mask);
1653     }
1654
1655     rm0 = tcg_temp_new_i32();
1656     rm1 = tcg_temp_new_i32();
1657     read_neon_element32(rm0, a->vm, 0, MO_32);
1658     read_neon_element32(rm1, a->vm, 1, MO_32);
1659     tmp = tcg_temp_new_i64();
1660
1661     widenfn(tmp, rm0);
1662     tcg_temp_free_i32(rm0);
1663     if (a->shift != 0) {
1664         tcg_gen_shli_i64(tmp, tmp, a->shift);
1665         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1666     }
1667     write_neon_element64(tmp, a->vd, 0, MO_64);
1668
1669     widenfn(tmp, rm1);
1670     tcg_temp_free_i32(rm1);
1671     if (a->shift != 0) {
1672         tcg_gen_shli_i64(tmp, tmp, a->shift);
1673         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1674     }
1675     write_neon_element64(tmp, a->vd, 1, MO_64);
1676     tcg_temp_free_i64(tmp);
1677     return true;
1678 }
1679
1680 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1681 {
1682     static NeonGenWidenFn * const widenfn[] = {
1683         gen_helper_neon_widen_s8,
1684         gen_helper_neon_widen_s16,
1685         tcg_gen_ext_i32_i64,
1686     };
1687     return do_vshll_2sh(s, a, widenfn[a->size], false);
1688 }
1689
1690 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1691 {
1692     static NeonGenWidenFn * const widenfn[] = {
1693         gen_helper_neon_widen_u8,
1694         gen_helper_neon_widen_u16,
1695         tcg_gen_extu_i32_i64,
1696     };
1697     return do_vshll_2sh(s, a, widenfn[a->size], true);
1698 }
1699
1700 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1701                       gen_helper_gvec_2_ptr *fn)
1702 {
1703     /* FP operations in 2-reg-and-shift group */
1704     int vec_size = a->q ? 16 : 8;
1705     int rd_ofs = neon_full_reg_offset(a->vd);
1706     int rm_ofs = neon_full_reg_offset(a->vm);
1707     TCGv_ptr fpst;
1708
1709     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1710         return false;
1711     }
1712
1713     if (a->size == MO_16) {
1714         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1715             return false;
1716         }
1717     }
1718
1719     /* UNDEF accesses to D16-D31 if they don't exist. */
1720     if (!dc_isar_feature(aa32_simd_r32, s) &&
1721         ((a->vd | a->vm) & 0x10)) {
1722         return false;
1723     }
1724
1725     if ((a->vm | a->vd) & a->q) {
1726         return false;
1727     }
1728
1729     if (!vfp_access_check(s)) {
1730         return true;
1731     }
1732
1733     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1734     tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
1735     tcg_temp_free_ptr(fpst);
1736     return true;
1737 }
1738
1739 #define DO_FP_2SH(INSN, FUNC)                                           \
1740     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1741     {                                                                   \
1742         return do_fp_2sh(s, a, FUNC);                                   \
1743     }
1744
1745 DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
1746 DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
1747 DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
1748 DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)
1749
1750 DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
1751 DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
1752 DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
1753 DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
1754
1755 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1756                         GVecGen2iFn *fn)
1757 {
1758     uint64_t imm;
1759     int reg_ofs, vec_size;
1760
1761     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1762         return false;
1763     }
1764
1765     /* UNDEF accesses to D16-D31 if they don't exist. */
1766     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1767         return false;
1768     }
1769
1770     if (a->vd & a->q) {
1771         return false;
1772     }
1773
1774     if (!vfp_access_check(s)) {
1775         return true;
1776     }
1777
1778     reg_ofs = neon_full_reg_offset(a->vd);
1779     vec_size = a->q ? 16 : 8;
1780     imm = asimd_imm_const(a->imm, a->cmode, a->op);
1781
1782     fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1783     return true;
1784 }
1785
1786 static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
1787                         int64_t c, uint32_t oprsz, uint32_t maxsz)
1788 {
1789     tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
1790 }
1791
1792 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1793 {
1794     /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1795     GVecGen2iFn *fn;
1796
1797     if ((a->cmode & 1) && a->cmode < 12) {
1798         /* for op=1, the imm will be inverted, so BIC becomes AND. */
1799         fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1800     } else {
1801         /* There is one unallocated cmode/op combination in this space */
1802         if (a->cmode == 15 && a->op == 1) {
1803             return false;
1804         }
1805         fn = gen_VMOV_1r;
1806     }
1807     return do_1reg_imm(s, a, fn);
1808 }
1809
1810 static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1811                            NeonGenWidenFn *widenfn,
1812                            NeonGenTwo64OpFn *opfn,
1813                            int src1_mop, int src2_mop)
1814 {
1815     /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VAADW/VSUBW) */
1816     TCGv_i64 rn0_64, rn1_64, rm_64;
1817
1818     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1819         return false;
1820     }
1821
1822     /* UNDEF accesses to D16-D31 if they don't exist. */
1823     if (!dc_isar_feature(aa32_simd_r32, s) &&
1824         ((a->vd | a->vn | a->vm) & 0x10)) {
1825         return false;
1826     }
1827
1828     if (!opfn) {
1829         /* size == 3 case, which is an entirely different insn group */
1830         return false;
1831     }
1832
1833     if ((a->vd & 1) || (src1_mop == MO_Q && (a->vn & 1))) {
1834         return false;
1835     }
1836
1837     if (!vfp_access_check(s)) {
1838         return true;
1839     }
1840
1841     rn0_64 = tcg_temp_new_i64();
1842     rn1_64 = tcg_temp_new_i64();
1843     rm_64 = tcg_temp_new_i64();
1844
1845     if (src1_mop >= 0) {
1846         read_neon_element64(rn0_64, a->vn, 0, src1_mop);
1847     } else {
1848         TCGv_i32 tmp = tcg_temp_new_i32();
1849         read_neon_element32(tmp, a->vn, 0, MO_32);
1850         widenfn(rn0_64, tmp);
1851         tcg_temp_free_i32(tmp);
1852     }
1853     if (src2_mop >= 0) {
1854         read_neon_element64(rm_64, a->vm, 0, src2_mop);
1855     } else {
1856         TCGv_i32 tmp = tcg_temp_new_i32();
1857         read_neon_element32(tmp, a->vm, 0, MO_32);
1858         widenfn(rm_64, tmp);
1859         tcg_temp_free_i32(tmp);
1860     }
1861
1862     opfn(rn0_64, rn0_64, rm_64);
1863
1864     /*
1865      * Load second pass inputs before storing the first pass result, to
1866      * avoid incorrect results if a narrow input overlaps with the result.
1867      */
1868     if (src1_mop >= 0) {
1869         read_neon_element64(rn1_64, a->vn, 1, src1_mop);
1870     } else {
1871         TCGv_i32 tmp = tcg_temp_new_i32();
1872         read_neon_element32(tmp, a->vn, 1, MO_32);
1873         widenfn(rn1_64, tmp);
1874         tcg_temp_free_i32(tmp);
1875     }
1876     if (src2_mop >= 0) {
1877         read_neon_element64(rm_64, a->vm, 1, src2_mop);
1878     } else {
1879         TCGv_i32 tmp = tcg_temp_new_i32();
1880         read_neon_element32(tmp, a->vm, 1, MO_32);
1881         widenfn(rm_64, tmp);
1882         tcg_temp_free_i32(tmp);
1883     }
1884
1885     write_neon_element64(rn0_64, a->vd, 0, MO_64);
1886
1887     opfn(rn1_64, rn1_64, rm_64);
1888     write_neon_element64(rn1_64, a->vd, 1, MO_64);
1889
1890     tcg_temp_free_i64(rn0_64);
1891     tcg_temp_free_i64(rn1_64);
1892     tcg_temp_free_i64(rm_64);
1893
1894     return true;
1895 }
1896
1897 #define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN)                        \
1898     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1899     {                                                                   \
1900         static NeonGenWidenFn * const widenfn[] = {                     \
1901             gen_helper_neon_widen_##S##8,                               \
1902             gen_helper_neon_widen_##S##16,                              \
1903             NULL, NULL,                                                 \
1904         };                                                              \
1905         static NeonGenTwo64OpFn * const addfn[] = {                     \
1906             gen_helper_neon_##OP##l_u16,                                \
1907             gen_helper_neon_##OP##l_u32,                                \
1908             tcg_gen_##OP##_i64,                                         \
1909             NULL,                                                       \
1910         };                                                              \
1911         int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1;          \
1912         return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size],   \
1913                               SRC1WIDE ? MO_Q : narrow_mop,             \
1914                               narrow_mop);                              \
1915     }
1916
1917 DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN)
1918 DO_PREWIDEN(VADDL_U, u, add, false, 0)
1919 DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN)
1920 DO_PREWIDEN(VSUBL_U, u, sub, false, 0)
1921 DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN)
1922 DO_PREWIDEN(VADDW_U, u, add, true, 0)
1923 DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN)
1924 DO_PREWIDEN(VSUBW_U, u, sub, true, 0)
1925
1926 static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
1927                          NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
1928 {
1929     /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
1930     TCGv_i64 rn_64, rm_64;
1931     TCGv_i32 rd0, rd1;
1932
1933     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1934         return false;
1935     }
1936
1937     /* UNDEF accesses to D16-D31 if they don't exist. */
1938     if (!dc_isar_feature(aa32_simd_r32, s) &&
1939         ((a->vd | a->vn | a->vm) & 0x10)) {
1940         return false;
1941     }
1942
1943     if (!opfn || !narrowfn) {
1944         /* size == 3 case, which is an entirely different insn group */
1945         return false;
1946     }
1947
1948     if ((a->vn | a->vm) & 1) {
1949         return false;
1950     }
1951
1952     if (!vfp_access_check(s)) {
1953         return true;
1954     }
1955
1956     rn_64 = tcg_temp_new_i64();
1957     rm_64 = tcg_temp_new_i64();
1958     rd0 = tcg_temp_new_i32();
1959     rd1 = tcg_temp_new_i32();
1960
1961     read_neon_element64(rn_64, a->vn, 0, MO_64);
1962     read_neon_element64(rm_64, a->vm, 0, MO_64);
1963
1964     opfn(rn_64, rn_64, rm_64);
1965
1966     narrowfn(rd0, rn_64);
1967
1968     read_neon_element64(rn_64, a->vn, 1, MO_64);
1969     read_neon_element64(rm_64, a->vm, 1, MO_64);
1970
1971     opfn(rn_64, rn_64, rm_64);
1972
1973     narrowfn(rd1, rn_64);
1974
1975     write_neon_element32(rd0, a->vd, 0, MO_32);
1976     write_neon_element32(rd1, a->vd, 1, MO_32);
1977
1978     tcg_temp_free_i32(rd0);
1979     tcg_temp_free_i32(rd1);
1980     tcg_temp_free_i64(rn_64);
1981     tcg_temp_free_i64(rm_64);
1982
1983     return true;
1984 }
1985
1986 #define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
1987     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1988     {                                                                   \
1989         static NeonGenTwo64OpFn * const addfn[] = {                     \
1990             gen_helper_neon_##OP##l_u16,                                \
1991             gen_helper_neon_##OP##l_u32,                                \
1992             tcg_gen_##OP##_i64,                                         \
1993             NULL,                                                       \
1994         };                                                              \
1995         static NeonGenNarrowFn * const narrowfn[] = {                   \
1996             gen_helper_neon_##NARROWTYPE##_high_u8,                     \
1997             gen_helper_neon_##NARROWTYPE##_high_u16,                    \
1998             EXTOP,                                                      \
1999             NULL,                                                       \
2000         };                                                              \
2001         return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]);   \
2002     }
2003
2004 static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
2005 {
2006     tcg_gen_addi_i64(rn, rn, 1u << 31);
2007     tcg_gen_extrh_i64_i32(rd, rn);
2008 }
2009
2010 DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
2011 DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
2012 DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
2013 DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
2014
2015 static bool do_long_3d(DisasContext *s, arg_3diff *a,
2016                        NeonGenTwoOpWidenFn *opfn,
2017                        NeonGenTwo64OpFn *accfn)
2018 {
2019     /*
2020      * 3-regs different lengths, long operations.
2021      * These perform an operation on two inputs that returns a double-width
2022      * result, and then possibly perform an accumulation operation of
2023      * that result into the double-width destination.
2024      */
2025     TCGv_i64 rd0, rd1, tmp;
2026     TCGv_i32 rn, rm;
2027
2028     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2029         return false;
2030     }
2031
2032     /* UNDEF accesses to D16-D31 if they don't exist. */
2033     if (!dc_isar_feature(aa32_simd_r32, s) &&
2034         ((a->vd | a->vn | a->vm) & 0x10)) {
2035         return false;
2036     }
2037
2038     if (!opfn) {
2039         /* size == 3 case, which is an entirely different insn group */
2040         return false;
2041     }
2042
2043     if (a->vd & 1) {
2044         return false;
2045     }
2046
2047     if (!vfp_access_check(s)) {
2048         return true;
2049     }
2050
2051     rd0 = tcg_temp_new_i64();
2052     rd1 = tcg_temp_new_i64();
2053
2054     rn = tcg_temp_new_i32();
2055     rm = tcg_temp_new_i32();
2056     read_neon_element32(rn, a->vn, 0, MO_32);
2057     read_neon_element32(rm, a->vm, 0, MO_32);
2058     opfn(rd0, rn, rm);
2059
2060     read_neon_element32(rn, a->vn, 1, MO_32);
2061     read_neon_element32(rm, a->vm, 1, MO_32);
2062     opfn(rd1, rn, rm);
2063     tcg_temp_free_i32(rn);
2064     tcg_temp_free_i32(rm);
2065
2066     /* Don't store results until after all loads: they might overlap */
2067     if (accfn) {
2068         tmp = tcg_temp_new_i64();
2069         read_neon_element64(tmp, a->vd, 0, MO_64);
2070         accfn(rd0, tmp, rd0);
2071         read_neon_element64(tmp, a->vd, 1, MO_64);
2072         accfn(rd1, tmp, rd1);
2073         tcg_temp_free_i64(tmp);
2074     }
2075
2076     write_neon_element64(rd0, a->vd, 0, MO_64);
2077     write_neon_element64(rd1, a->vd, 1, MO_64);
2078     tcg_temp_free_i64(rd0);
2079     tcg_temp_free_i64(rd1);
2080
2081     return true;
2082 }
2083
2084 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2085 {
2086     static NeonGenTwoOpWidenFn * const opfn[] = {
2087         gen_helper_neon_abdl_s16,
2088         gen_helper_neon_abdl_s32,
2089         gen_helper_neon_abdl_s64,
2090         NULL,
2091     };
2092
2093     return do_long_3d(s, a, opfn[a->size], NULL);
2094 }
2095
2096 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2097 {
2098     static NeonGenTwoOpWidenFn * const opfn[] = {
2099         gen_helper_neon_abdl_u16,
2100         gen_helper_neon_abdl_u32,
2101         gen_helper_neon_abdl_u64,
2102         NULL,
2103     };
2104
2105     return do_long_3d(s, a, opfn[a->size], NULL);
2106 }
2107
2108 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2109 {
2110     static NeonGenTwoOpWidenFn * const opfn[] = {
2111         gen_helper_neon_abdl_s16,
2112         gen_helper_neon_abdl_s32,
2113         gen_helper_neon_abdl_s64,
2114         NULL,
2115     };
2116     static NeonGenTwo64OpFn * const addfn[] = {
2117         gen_helper_neon_addl_u16,
2118         gen_helper_neon_addl_u32,
2119         tcg_gen_add_i64,
2120         NULL,
2121     };
2122
2123     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2124 }
2125
2126 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2127 {
2128     static NeonGenTwoOpWidenFn * const opfn[] = {
2129         gen_helper_neon_abdl_u16,
2130         gen_helper_neon_abdl_u32,
2131         gen_helper_neon_abdl_u64,
2132         NULL,
2133     };
2134     static NeonGenTwo64OpFn * const addfn[] = {
2135         gen_helper_neon_addl_u16,
2136         gen_helper_neon_addl_u32,
2137         tcg_gen_add_i64,
2138         NULL,
2139     };
2140
2141     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2142 }
2143
2144 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2145 {
2146     TCGv_i32 lo = tcg_temp_new_i32();
2147     TCGv_i32 hi = tcg_temp_new_i32();
2148
2149     tcg_gen_muls2_i32(lo, hi, rn, rm);
2150     tcg_gen_concat_i32_i64(rd, lo, hi);
2151
2152     tcg_temp_free_i32(lo);
2153     tcg_temp_free_i32(hi);
2154 }
2155
2156 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2157 {
2158     TCGv_i32 lo = tcg_temp_new_i32();
2159     TCGv_i32 hi = tcg_temp_new_i32();
2160
2161     tcg_gen_mulu2_i32(lo, hi, rn, rm);
2162     tcg_gen_concat_i32_i64(rd, lo, hi);
2163
2164     tcg_temp_free_i32(lo);
2165     tcg_temp_free_i32(hi);
2166 }
2167
2168 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2169 {
2170     static NeonGenTwoOpWidenFn * const opfn[] = {
2171         gen_helper_neon_mull_s8,
2172         gen_helper_neon_mull_s16,
2173         gen_mull_s32,
2174         NULL,
2175     };
2176
2177     return do_long_3d(s, a, opfn[a->size], NULL);
2178 }
2179
2180 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2181 {
2182     static NeonGenTwoOpWidenFn * const opfn[] = {
2183         gen_helper_neon_mull_u8,
2184         gen_helper_neon_mull_u16,
2185         gen_mull_u32,
2186         NULL,
2187     };
2188
2189     return do_long_3d(s, a, opfn[a->size], NULL);
2190 }
2191
2192 #define DO_VMLAL(INSN,MULL,ACC)                                         \
2193     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2194     {                                                                   \
2195         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2196             gen_helper_neon_##MULL##8,                                  \
2197             gen_helper_neon_##MULL##16,                                 \
2198             gen_##MULL##32,                                             \
2199             NULL,                                                       \
2200         };                                                              \
2201         static NeonGenTwo64OpFn * const accfn[] = {                     \
2202             gen_helper_neon_##ACC##l_u16,                               \
2203             gen_helper_neon_##ACC##l_u32,                               \
2204             tcg_gen_##ACC##_i64,                                        \
2205             NULL,                                                       \
2206         };                                                              \
2207         return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
2208     }
2209
2210 DO_VMLAL(VMLAL_S,mull_s,add)
2211 DO_VMLAL(VMLAL_U,mull_u,add)
2212 DO_VMLAL(VMLSL_S,mull_s,sub)
2213 DO_VMLAL(VMLSL_U,mull_u,sub)
2214
2215 static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2216 {
2217     gen_helper_neon_mull_s16(rd, rn, rm);
2218     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
2219 }
2220
2221 static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2222 {
2223     gen_mull_s32(rd, rn, rm);
2224     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
2225 }
2226
2227 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2228 {
2229     static NeonGenTwoOpWidenFn * const opfn[] = {
2230         NULL,
2231         gen_VQDMULL_16,
2232         gen_VQDMULL_32,
2233         NULL,
2234     };
2235
2236     return do_long_3d(s, a, opfn[a->size], NULL);
2237 }
2238
2239 static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2240 {
2241     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2242 }
2243
2244 static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2245 {
2246     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2247 }
2248
2249 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2250 {
2251     static NeonGenTwoOpWidenFn * const opfn[] = {
2252         NULL,
2253         gen_VQDMULL_16,
2254         gen_VQDMULL_32,
2255         NULL,
2256     };
2257     static NeonGenTwo64OpFn * const accfn[] = {
2258         NULL,
2259         gen_VQDMLAL_acc_16,
2260         gen_VQDMLAL_acc_32,
2261         NULL,
2262     };
2263
2264     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2265 }
2266
2267 static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2268 {
2269     gen_helper_neon_negl_u32(rm, rm);
2270     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2271 }
2272
2273 static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2274 {
2275     tcg_gen_neg_i64(rm, rm);
2276     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2277 }
2278
2279 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2280 {
2281     static NeonGenTwoOpWidenFn * const opfn[] = {
2282         NULL,
2283         gen_VQDMULL_16,
2284         gen_VQDMULL_32,
2285         NULL,
2286     };
2287     static NeonGenTwo64OpFn * const accfn[] = {
2288         NULL,
2289         gen_VQDMLSL_acc_16,
2290         gen_VQDMLSL_acc_32,
2291         NULL,
2292     };
2293
2294     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2295 }
2296
2297 static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2298 {
2299     gen_helper_gvec_3 *fn_gvec;
2300
2301     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2302         return false;
2303     }
2304
2305     /* UNDEF accesses to D16-D31 if they don't exist. */
2306     if (!dc_isar_feature(aa32_simd_r32, s) &&
2307         ((a->vd | a->vn | a->vm) & 0x10)) {
2308         return false;
2309     }
2310
2311     if (a->vd & 1) {
2312         return false;
2313     }
2314
2315     switch (a->size) {
2316     case 0:
2317         fn_gvec = gen_helper_neon_pmull_h;
2318         break;
2319     case 2:
2320         if (!dc_isar_feature(aa32_pmull, s)) {
2321             return false;
2322         }
2323         fn_gvec = gen_helper_gvec_pmull_q;
2324         break;
2325     default:
2326         return false;
2327     }
2328
2329     if (!vfp_access_check(s)) {
2330         return true;
2331     }
2332
2333     tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd),
2334                        neon_full_reg_offset(a->vn),
2335                        neon_full_reg_offset(a->vm),
2336                        16, 16, 0, fn_gvec);
2337     return true;
2338 }
2339
2340 static void gen_neon_dup_low16(TCGv_i32 var)
2341 {
2342     TCGv_i32 tmp = tcg_temp_new_i32();
2343     tcg_gen_ext16u_i32(var, var);
2344     tcg_gen_shli_i32(tmp, var, 16);
2345     tcg_gen_or_i32(var, var, tmp);
2346     tcg_temp_free_i32(tmp);
2347 }
2348
2349 static void gen_neon_dup_high16(TCGv_i32 var)
2350 {
2351     TCGv_i32 tmp = tcg_temp_new_i32();
2352     tcg_gen_andi_i32(var, var, 0xffff0000);
2353     tcg_gen_shri_i32(tmp, var, 16);
2354     tcg_gen_or_i32(var, var, tmp);
2355     tcg_temp_free_i32(tmp);
2356 }
2357
2358 static inline TCGv_i32 neon_get_scalar(int size, int reg)
2359 {
2360     TCGv_i32 tmp = tcg_temp_new_i32();
2361     if (size == MO_16) {
2362         read_neon_element32(tmp, reg & 7, reg >> 4, MO_32);
2363         if (reg & 8) {
2364             gen_neon_dup_high16(tmp);
2365         } else {
2366             gen_neon_dup_low16(tmp);
2367         }
2368     } else {
2369         read_neon_element32(tmp, reg & 15, reg >> 4, MO_32);
2370     }
2371     return tmp;
2372 }
2373
2374 static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2375                        NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2376 {
2377     /*
2378      * Two registers and a scalar: perform an operation between
2379      * the input elements and the scalar, and then possibly
2380      * perform an accumulation operation of that result into the
2381      * destination.
2382      */
2383     TCGv_i32 scalar, tmp;
2384     int pass;
2385
2386     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2387         return false;
2388     }
2389
2390     /* UNDEF accesses to D16-D31 if they don't exist. */
2391     if (!dc_isar_feature(aa32_simd_r32, s) &&
2392         ((a->vd | a->vn | a->vm) & 0x10)) {
2393         return false;
2394     }
2395
2396     if (!opfn) {
2397         /* Bad size (including size == 3, which is a different insn group) */
2398         return false;
2399     }
2400
2401     if (a->q && ((a->vd | a->vn) & 1)) {
2402         return false;
2403     }
2404
2405     if (!vfp_access_check(s)) {
2406         return true;
2407     }
2408
2409     scalar = neon_get_scalar(a->size, a->vm);
2410     tmp = tcg_temp_new_i32();
2411
2412     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2413         read_neon_element32(tmp, a->vn, pass, MO_32);
2414         opfn(tmp, tmp, scalar);
2415         if (accfn) {
2416             TCGv_i32 rd = tcg_temp_new_i32();
2417             read_neon_element32(rd, a->vd, pass, MO_32);
2418             accfn(tmp, rd, tmp);
2419             tcg_temp_free_i32(rd);
2420         }
2421         write_neon_element32(tmp, a->vd, pass, MO_32);
2422     }
2423     tcg_temp_free_i32(tmp);
2424     tcg_temp_free_i32(scalar);
2425     return true;
2426 }
2427
2428 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2429 {
2430     static NeonGenTwoOpFn * const opfn[] = {
2431         NULL,
2432         gen_helper_neon_mul_u16,
2433         tcg_gen_mul_i32,
2434         NULL,
2435     };
2436
2437     return do_2scalar(s, a, opfn[a->size], NULL);
2438 }
2439
2440 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2441 {
2442     static NeonGenTwoOpFn * const opfn[] = {
2443         NULL,
2444         gen_helper_neon_mul_u16,
2445         tcg_gen_mul_i32,
2446         NULL,
2447     };
2448     static NeonGenTwoOpFn * const accfn[] = {
2449         NULL,
2450         gen_helper_neon_add_u16,
2451         tcg_gen_add_i32,
2452         NULL,
2453     };
2454
2455     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2456 }
2457
2458 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2459 {
2460     static NeonGenTwoOpFn * const opfn[] = {
2461         NULL,
2462         gen_helper_neon_mul_u16,
2463         tcg_gen_mul_i32,
2464         NULL,
2465     };
2466     static NeonGenTwoOpFn * const accfn[] = {
2467         NULL,
2468         gen_helper_neon_sub_u16,
2469         tcg_gen_sub_i32,
2470         NULL,
2471     };
2472
2473     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2474 }
2475
2476 static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
2477                               gen_helper_gvec_3_ptr *fn)
2478 {
2479     /* Two registers and a scalar, using gvec */
2480     int vec_size = a->q ? 16 : 8;
2481     int rd_ofs = neon_full_reg_offset(a->vd);
2482     int rn_ofs = neon_full_reg_offset(a->vn);
2483     int rm_ofs;
2484     int idx;
2485     TCGv_ptr fpstatus;
2486
2487     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2488         return false;
2489     }
2490
2491     /* UNDEF accesses to D16-D31 if they don't exist. */
2492     if (!dc_isar_feature(aa32_simd_r32, s) &&
2493         ((a->vd | a->vn | a->vm) & 0x10)) {
2494         return false;
2495     }
2496
2497     if (!fn) {
2498         /* Bad size (including size == 3, which is a different insn group) */
2499         return false;
2500     }
2501
2502     if (a->q && ((a->vd | a->vn) & 1)) {
2503         return false;
2504     }
2505
2506     if (!vfp_access_check(s)) {
2507         return true;
2508     }
2509
2510     /* a->vm is M:Vm, which encodes both register and index */
2511     idx = extract32(a->vm, a->size + 2, 2);
2512     a->vm = extract32(a->vm, 0, a->size + 2);
2513     rm_ofs = neon_full_reg_offset(a->vm);
2514
2515     fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
2516     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
2517                        vec_size, vec_size, idx, fn);
2518     tcg_temp_free_ptr(fpstatus);
2519     return true;
2520 }
2521
2522 #define DO_VMUL_F_2sc(NAME, FUNC)                                       \
2523     static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a)   \
2524     {                                                                   \
2525         static gen_helper_gvec_3_ptr * const opfn[] = {                 \
2526             NULL,                                                       \
2527             gen_helper_##FUNC##_h,                                      \
2528             gen_helper_##FUNC##_s,                                      \
2529             NULL,                                                       \
2530         };                                                              \
2531         if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
2532             return false;                                               \
2533         }                                                               \
2534         return do_2scalar_fp_vec(s, a, opfn[a->size]);                  \
2535     }
2536
2537 DO_VMUL_F_2sc(VMUL, gvec_fmul_idx)
2538 DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx)
2539 DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx)
2540
2541 WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
2542 WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
2543 WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
2544 WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2545
2546 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2547 {
2548     static NeonGenTwoOpFn * const opfn[] = {
2549         NULL,
2550         gen_VQDMULH_16,
2551         gen_VQDMULH_32,
2552         NULL,
2553     };
2554
2555     return do_2scalar(s, a, opfn[a->size], NULL);
2556 }
2557
2558 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2559 {
2560     static NeonGenTwoOpFn * const opfn[] = {
2561         NULL,
2562         gen_VQRDMULH_16,
2563         gen_VQRDMULH_32,
2564         NULL,
2565     };
2566
2567     return do_2scalar(s, a, opfn[a->size], NULL);
2568 }
2569
2570 static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2571                             NeonGenThreeOpEnvFn *opfn)
2572 {
2573     /*
2574      * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2575      * performs a kind of fused op-then-accumulate using a helper
2576      * function that takes all of rd, rn and the scalar at once.
2577      */
2578     TCGv_i32 scalar, rn, rd;
2579     int pass;
2580
2581     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2582         return false;
2583     }
2584
2585     if (!dc_isar_feature(aa32_rdm, s)) {
2586         return false;
2587     }
2588
2589     /* UNDEF accesses to D16-D31 if they don't exist. */
2590     if (!dc_isar_feature(aa32_simd_r32, s) &&
2591         ((a->vd | a->vn | a->vm) & 0x10)) {
2592         return false;
2593     }
2594
2595     if (!opfn) {
2596         /* Bad size (including size == 3, which is a different insn group) */
2597         return false;
2598     }
2599
2600     if (a->q && ((a->vd | a->vn) & 1)) {
2601         return false;
2602     }
2603
2604     if (!vfp_access_check(s)) {
2605         return true;
2606     }
2607
2608     scalar = neon_get_scalar(a->size, a->vm);
2609     rn = tcg_temp_new_i32();
2610     rd = tcg_temp_new_i32();
2611
2612     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2613         read_neon_element32(rn, a->vn, pass, MO_32);
2614         read_neon_element32(rd, a->vd, pass, MO_32);
2615         opfn(rd, cpu_env, rn, scalar, rd);
2616         write_neon_element32(rd, a->vd, pass, MO_32);
2617     }
2618     tcg_temp_free_i32(rn);
2619     tcg_temp_free_i32(rd);
2620     tcg_temp_free_i32(scalar);
2621
2622     return true;
2623 }
2624
2625 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2626 {
2627     static NeonGenThreeOpEnvFn *opfn[] = {
2628         NULL,
2629         gen_helper_neon_qrdmlah_s16,
2630         gen_helper_neon_qrdmlah_s32,
2631         NULL,
2632     };
2633     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2634 }
2635
2636 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2637 {
2638     static NeonGenThreeOpEnvFn *opfn[] = {
2639         NULL,
2640         gen_helper_neon_qrdmlsh_s16,
2641         gen_helper_neon_qrdmlsh_s32,
2642         NULL,
2643     };
2644     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2645 }
2646
2647 static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2648                             NeonGenTwoOpWidenFn *opfn,
2649                             NeonGenTwo64OpFn *accfn)
2650 {
2651     /*
2652      * Two registers and a scalar, long operations: perform an
2653      * operation on the input elements and the scalar which produces
2654      * a double-width result, and then possibly perform an accumulation
2655      * operation of that result into the destination.
2656      */
2657     TCGv_i32 scalar, rn;
2658     TCGv_i64 rn0_64, rn1_64;
2659
2660     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2661         return false;
2662     }
2663
2664     /* UNDEF accesses to D16-D31 if they don't exist. */
2665     if (!dc_isar_feature(aa32_simd_r32, s) &&
2666         ((a->vd | a->vn | a->vm) & 0x10)) {
2667         return false;
2668     }
2669
2670     if (!opfn) {
2671         /* Bad size (including size == 3, which is a different insn group) */
2672         return false;
2673     }
2674
2675     if (a->vd & 1) {
2676         return false;
2677     }
2678
2679     if (!vfp_access_check(s)) {
2680         return true;
2681     }
2682
2683     scalar = neon_get_scalar(a->size, a->vm);
2684
2685     /* Load all inputs before writing any outputs, in case of overlap */
2686     rn = tcg_temp_new_i32();
2687     read_neon_element32(rn, a->vn, 0, MO_32);
2688     rn0_64 = tcg_temp_new_i64();
2689     opfn(rn0_64, rn, scalar);
2690
2691     read_neon_element32(rn, a->vn, 1, MO_32);
2692     rn1_64 = tcg_temp_new_i64();
2693     opfn(rn1_64, rn, scalar);
2694     tcg_temp_free_i32(rn);
2695     tcg_temp_free_i32(scalar);
2696
2697     if (accfn) {
2698         TCGv_i64 t64 = tcg_temp_new_i64();
2699         read_neon_element64(t64, a->vd, 0, MO_64);
2700         accfn(rn0_64, t64, rn0_64);
2701         read_neon_element64(t64, a->vd, 1, MO_64);
2702         accfn(rn1_64, t64, rn1_64);
2703         tcg_temp_free_i64(t64);
2704     }
2705
2706     write_neon_element64(rn0_64, a->vd, 0, MO_64);
2707     write_neon_element64(rn1_64, a->vd, 1, MO_64);
2708     tcg_temp_free_i64(rn0_64);
2709     tcg_temp_free_i64(rn1_64);
2710     return true;
2711 }
2712
2713 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2714 {
2715     static NeonGenTwoOpWidenFn * const opfn[] = {
2716         NULL,
2717         gen_helper_neon_mull_s16,
2718         gen_mull_s32,
2719         NULL,
2720     };
2721
2722     return do_2scalar_long(s, a, opfn[a->size], NULL);
2723 }
2724
2725 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2726 {
2727     static NeonGenTwoOpWidenFn * const opfn[] = {
2728         NULL,
2729         gen_helper_neon_mull_u16,
2730         gen_mull_u32,
2731         NULL,
2732     };
2733
2734     return do_2scalar_long(s, a, opfn[a->size], NULL);
2735 }
2736
2737 #define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
2738     static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
2739     {                                                                   \
2740         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2741             NULL,                                                       \
2742             gen_helper_neon_##MULL##16,                                 \
2743             gen_##MULL##32,                                             \
2744             NULL,                                                       \
2745         };                                                              \
2746         static NeonGenTwo64OpFn * const accfn[] = {                     \
2747             NULL,                                                       \
2748             gen_helper_neon_##ACC##l_u32,                               \
2749             tcg_gen_##ACC##_i64,                                        \
2750             NULL,                                                       \
2751         };                                                              \
2752         return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
2753     }
2754
2755 DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2756 DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2757 DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2758 DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2759
2760 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2761 {
2762     static NeonGenTwoOpWidenFn * const opfn[] = {
2763         NULL,
2764         gen_VQDMULL_16,
2765         gen_VQDMULL_32,
2766         NULL,
2767     };
2768
2769     return do_2scalar_long(s, a, opfn[a->size], NULL);
2770 }
2771
2772 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2773 {
2774     static NeonGenTwoOpWidenFn * const opfn[] = {
2775         NULL,
2776         gen_VQDMULL_16,
2777         gen_VQDMULL_32,
2778         NULL,
2779     };
2780     static NeonGenTwo64OpFn * const accfn[] = {
2781         NULL,
2782         gen_VQDMLAL_acc_16,
2783         gen_VQDMLAL_acc_32,
2784         NULL,
2785     };
2786
2787     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2788 }
2789
2790 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2791 {
2792     static NeonGenTwoOpWidenFn * const opfn[] = {
2793         NULL,
2794         gen_VQDMULL_16,
2795         gen_VQDMULL_32,
2796         NULL,
2797     };
2798     static NeonGenTwo64OpFn * const accfn[] = {
2799         NULL,
2800         gen_VQDMLSL_acc_16,
2801         gen_VQDMLSL_acc_32,
2802         NULL,
2803     };
2804
2805     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2806 }
2807
2808 static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2809 {
2810     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2811         return false;
2812     }
2813
2814     /* UNDEF accesses to D16-D31 if they don't exist. */
2815     if (!dc_isar_feature(aa32_simd_r32, s) &&
2816         ((a->vd | a->vn | a->vm) & 0x10)) {
2817         return false;
2818     }
2819
2820     if ((a->vn | a->vm | a->vd) & a->q) {
2821         return false;
2822     }
2823
2824     if (a->imm > 7 && !a->q) {
2825         return false;
2826     }
2827
2828     if (!vfp_access_check(s)) {
2829         return true;
2830     }
2831
2832     if (!a->q) {
2833         /* Extract 64 bits from <Vm:Vn> */
2834         TCGv_i64 left, right, dest;
2835
2836         left = tcg_temp_new_i64();
2837         right = tcg_temp_new_i64();
2838         dest = tcg_temp_new_i64();
2839
2840         read_neon_element64(right, a->vn, 0, MO_64);
2841         read_neon_element64(left, a->vm, 0, MO_64);
2842         tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2843         write_neon_element64(dest, a->vd, 0, MO_64);
2844
2845         tcg_temp_free_i64(left);
2846         tcg_temp_free_i64(right);
2847         tcg_temp_free_i64(dest);
2848     } else {
2849         /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2850         TCGv_i64 left, middle, right, destleft, destright;
2851
2852         left = tcg_temp_new_i64();
2853         middle = tcg_temp_new_i64();
2854         right = tcg_temp_new_i64();
2855         destleft = tcg_temp_new_i64();
2856         destright = tcg_temp_new_i64();
2857
2858         if (a->imm < 8) {
2859             read_neon_element64(right, a->vn, 0, MO_64);
2860             read_neon_element64(middle, a->vn, 1, MO_64);
2861             tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2862             read_neon_element64(left, a->vm, 0, MO_64);
2863             tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2864         } else {
2865             read_neon_element64(right, a->vn, 1, MO_64);
2866             read_neon_element64(middle, a->vm, 0, MO_64);
2867             tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2868             read_neon_element64(left, a->vm, 1, MO_64);
2869             tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2870         }
2871
2872         write_neon_element64(destright, a->vd, 0, MO_64);
2873         write_neon_element64(destleft, a->vd, 1, MO_64);
2874
2875         tcg_temp_free_i64(destright);
2876         tcg_temp_free_i64(destleft);
2877         tcg_temp_free_i64(right);
2878         tcg_temp_free_i64(middle);
2879         tcg_temp_free_i64(left);
2880     }
2881     return true;
2882 }
2883
2884 static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2885 {
2886     TCGv_i64 val, def;
2887     TCGv_i32 desc;
2888
2889     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2890         return false;
2891     }
2892
2893     /* UNDEF accesses to D16-D31 if they don't exist. */
2894     if (!dc_isar_feature(aa32_simd_r32, s) &&
2895         ((a->vd | a->vn | a->vm) & 0x10)) {
2896         return false;
2897     }
2898
2899     if ((a->vn + a->len + 1) > 32) {
2900         /*
2901          * This is UNPREDICTABLE; we choose to UNDEF to avoid the
2902          * helper function running off the end of the register file.
2903          */
2904         return false;
2905     }
2906
2907     if (!vfp_access_check(s)) {
2908         return true;
2909     }
2910
2911     desc = tcg_const_i32((a->vn << 2) | a->len);
2912     def = tcg_temp_new_i64();
2913     if (a->op) {
2914         read_neon_element64(def, a->vd, 0, MO_64);
2915     } else {
2916         tcg_gen_movi_i64(def, 0);
2917     }
2918     val = tcg_temp_new_i64();
2919     read_neon_element64(val, a->vm, 0, MO_64);
2920
2921     gen_helper_neon_tbl(val, cpu_env, desc, val, def);
2922     write_neon_element64(val, a->vd, 0, MO_64);
2923
2924     tcg_temp_free_i64(def);
2925     tcg_temp_free_i64(val);
2926     tcg_temp_free_i32(desc);
2927     return true;
2928 }
2929
2930 static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
2931 {
2932     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2933         return false;
2934     }
2935
2936     /* UNDEF accesses to D16-D31 if they don't exist. */
2937     if (!dc_isar_feature(aa32_simd_r32, s) &&
2938         ((a->vd | a->vm) & 0x10)) {
2939         return false;
2940     }
2941
2942     if (a->vd & a->q) {
2943         return false;
2944     }
2945
2946     if (!vfp_access_check(s)) {
2947         return true;
2948     }
2949
2950     tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd),
2951                          neon_element_offset(a->vm, a->index, a->size),
2952                          a->q ? 16 : 8, a->q ? 16 : 8);
2953     return true;
2954 }
2955
2956 static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
2957 {
2958     int pass, half;
2959     TCGv_i32 tmp[2];
2960
2961     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2962         return false;
2963     }
2964
2965     /* UNDEF accesses to D16-D31 if they don't exist. */
2966     if (!dc_isar_feature(aa32_simd_r32, s) &&
2967         ((a->vd | a->vm) & 0x10)) {
2968         return false;
2969     }
2970
2971     if ((a->vd | a->vm) & a->q) {
2972         return false;
2973     }
2974
2975     if (a->size == 3) {
2976         return false;
2977     }
2978
2979     if (!vfp_access_check(s)) {
2980         return true;
2981     }
2982
2983     tmp[0] = tcg_temp_new_i32();
2984     tmp[1] = tcg_temp_new_i32();
2985
2986     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
2987         for (half = 0; half < 2; half++) {
2988             read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32);
2989             switch (a->size) {
2990             case 0:
2991                 tcg_gen_bswap32_i32(tmp[half], tmp[half]);
2992                 break;
2993             case 1:
2994                 gen_swap_half(tmp[half], tmp[half]);
2995                 break;
2996             case 2:
2997                 break;
2998             default:
2999                 g_assert_not_reached();
3000             }
3001         }
3002         write_neon_element32(tmp[1], a->vd, pass * 2, MO_32);
3003         write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32);
3004     }
3005
3006     tcg_temp_free_i32(tmp[0]);
3007     tcg_temp_free_i32(tmp[1]);
3008     return true;
3009 }
3010
3011 static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
3012                               NeonGenWidenFn *widenfn,
3013                               NeonGenTwo64OpFn *opfn,
3014                               NeonGenTwo64OpFn *accfn)
3015 {
3016     /*
3017      * Pairwise long operations: widen both halves of the pair,
3018      * combine the pairs with the opfn, and then possibly accumulate
3019      * into the destination with the accfn.
3020      */
3021     int pass;
3022
3023     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3024         return false;
3025     }
3026
3027     /* UNDEF accesses to D16-D31 if they don't exist. */
3028     if (!dc_isar_feature(aa32_simd_r32, s) &&
3029         ((a->vd | a->vm) & 0x10)) {
3030         return false;
3031     }
3032
3033     if ((a->vd | a->vm) & a->q) {
3034         return false;
3035     }
3036
3037     if (!widenfn) {
3038         return false;
3039     }
3040
3041     if (!vfp_access_check(s)) {
3042         return true;
3043     }
3044
3045     for (pass = 0; pass < a->q + 1; pass++) {
3046         TCGv_i32 tmp;
3047         TCGv_i64 rm0_64, rm1_64, rd_64;
3048
3049         rm0_64 = tcg_temp_new_i64();
3050         rm1_64 = tcg_temp_new_i64();
3051         rd_64 = tcg_temp_new_i64();
3052
3053         tmp = tcg_temp_new_i32();
3054         read_neon_element32(tmp, a->vm, pass * 2, MO_32);
3055         widenfn(rm0_64, tmp);
3056         read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32);
3057         widenfn(rm1_64, tmp);
3058         tcg_temp_free_i32(tmp);
3059
3060         opfn(rd_64, rm0_64, rm1_64);
3061         tcg_temp_free_i64(rm0_64);
3062         tcg_temp_free_i64(rm1_64);
3063
3064         if (accfn) {
3065             TCGv_i64 tmp64 = tcg_temp_new_i64();
3066             read_neon_element64(tmp64, a->vd, pass, MO_64);
3067             accfn(rd_64, tmp64, rd_64);
3068             tcg_temp_free_i64(tmp64);
3069         }
3070         write_neon_element64(rd_64, a->vd, pass, MO_64);
3071         tcg_temp_free_i64(rd_64);
3072     }
3073     return true;
3074 }
3075
3076 static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
3077 {
3078     static NeonGenWidenFn * const widenfn[] = {
3079         gen_helper_neon_widen_s8,
3080         gen_helper_neon_widen_s16,
3081         tcg_gen_ext_i32_i64,
3082         NULL,
3083     };
3084     static NeonGenTwo64OpFn * const opfn[] = {
3085         gen_helper_neon_paddl_u16,
3086         gen_helper_neon_paddl_u32,
3087         tcg_gen_add_i64,
3088         NULL,
3089     };
3090
3091     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3092 }
3093
3094 static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
3095 {
3096     static NeonGenWidenFn * const widenfn[] = {
3097         gen_helper_neon_widen_u8,
3098         gen_helper_neon_widen_u16,
3099         tcg_gen_extu_i32_i64,
3100         NULL,
3101     };
3102     static NeonGenTwo64OpFn * const opfn[] = {
3103         gen_helper_neon_paddl_u16,
3104         gen_helper_neon_paddl_u32,
3105         tcg_gen_add_i64,
3106         NULL,
3107     };
3108
3109     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3110 }
3111
3112 static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
3113 {
3114     static NeonGenWidenFn * const widenfn[] = {
3115         gen_helper_neon_widen_s8,
3116         gen_helper_neon_widen_s16,
3117         tcg_gen_ext_i32_i64,
3118         NULL,
3119     };
3120     static NeonGenTwo64OpFn * const opfn[] = {
3121         gen_helper_neon_paddl_u16,
3122         gen_helper_neon_paddl_u32,
3123         tcg_gen_add_i64,
3124         NULL,
3125     };
3126     static NeonGenTwo64OpFn * const accfn[] = {
3127         gen_helper_neon_addl_u16,
3128         gen_helper_neon_addl_u32,
3129         tcg_gen_add_i64,
3130         NULL,
3131     };
3132
3133     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3134                              accfn[a->size]);
3135 }
3136
3137 static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
3138 {
3139     static NeonGenWidenFn * const widenfn[] = {
3140         gen_helper_neon_widen_u8,
3141         gen_helper_neon_widen_u16,
3142         tcg_gen_extu_i32_i64,
3143         NULL,
3144     };
3145     static NeonGenTwo64OpFn * const opfn[] = {
3146         gen_helper_neon_paddl_u16,
3147         gen_helper_neon_paddl_u32,
3148         tcg_gen_add_i64,
3149         NULL,
3150     };
3151     static NeonGenTwo64OpFn * const accfn[] = {
3152         gen_helper_neon_addl_u16,
3153         gen_helper_neon_addl_u32,
3154         tcg_gen_add_i64,
3155         NULL,
3156     };
3157
3158     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3159                              accfn[a->size]);
3160 }
3161
3162 typedef void ZipFn(TCGv_ptr, TCGv_ptr);
3163
3164 static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
3165                        ZipFn *fn)
3166 {
3167     TCGv_ptr pd, pm;
3168
3169     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3170         return false;
3171     }
3172
3173     /* UNDEF accesses to D16-D31 if they don't exist. */
3174     if (!dc_isar_feature(aa32_simd_r32, s) &&
3175         ((a->vd | a->vm) & 0x10)) {
3176         return false;
3177     }
3178
3179     if ((a->vd | a->vm) & a->q) {
3180         return false;
3181     }
3182
3183     if (!fn) {
3184         /* Bad size or size/q combination */
3185         return false;
3186     }
3187
3188     if (!vfp_access_check(s)) {
3189         return true;
3190     }
3191
3192     pd = vfp_reg_ptr(true, a->vd);
3193     pm = vfp_reg_ptr(true, a->vm);
3194     fn(pd, pm);
3195     tcg_temp_free_ptr(pd);
3196     tcg_temp_free_ptr(pm);
3197     return true;
3198 }
3199
3200 static bool trans_VUZP(DisasContext *s, arg_2misc *a)
3201 {
3202     static ZipFn * const fn[2][4] = {
3203         {
3204             gen_helper_neon_unzip8,
3205             gen_helper_neon_unzip16,
3206             NULL,
3207             NULL,
3208         }, {
3209             gen_helper_neon_qunzip8,
3210             gen_helper_neon_qunzip16,
3211             gen_helper_neon_qunzip32,
3212             NULL,
3213         }
3214     };
3215     return do_zip_uzp(s, a, fn[a->q][a->size]);
3216 }
3217
3218 static bool trans_VZIP(DisasContext *s, arg_2misc *a)
3219 {
3220     static ZipFn * const fn[2][4] = {
3221         {
3222             gen_helper_neon_zip8,
3223             gen_helper_neon_zip16,
3224             NULL,
3225             NULL,
3226         }, {
3227             gen_helper_neon_qzip8,
3228             gen_helper_neon_qzip16,
3229             gen_helper_neon_qzip32,
3230             NULL,
3231         }
3232     };
3233     return do_zip_uzp(s, a, fn[a->q][a->size]);
3234 }
3235
3236 static bool do_vmovn(DisasContext *s, arg_2misc *a,
3237                      NeonGenNarrowEnvFn *narrowfn)
3238 {
3239     TCGv_i64 rm;
3240     TCGv_i32 rd0, rd1;
3241
3242     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3243         return false;
3244     }
3245
3246     /* UNDEF accesses to D16-D31 if they don't exist. */
3247     if (!dc_isar_feature(aa32_simd_r32, s) &&
3248         ((a->vd | a->vm) & 0x10)) {
3249         return false;
3250     }
3251
3252     if (a->vm & 1) {
3253         return false;
3254     }
3255
3256     if (!narrowfn) {
3257         return false;
3258     }
3259
3260     if (!vfp_access_check(s)) {
3261         return true;
3262     }
3263
3264     rm = tcg_temp_new_i64();
3265     rd0 = tcg_temp_new_i32();
3266     rd1 = tcg_temp_new_i32();
3267
3268     read_neon_element64(rm, a->vm, 0, MO_64);
3269     narrowfn(rd0, cpu_env, rm);
3270     read_neon_element64(rm, a->vm, 1, MO_64);
3271     narrowfn(rd1, cpu_env, rm);
3272     write_neon_element32(rd0, a->vd, 0, MO_32);
3273     write_neon_element32(rd1, a->vd, 1, MO_32);
3274     tcg_temp_free_i32(rd0);
3275     tcg_temp_free_i32(rd1);
3276     tcg_temp_free_i64(rm);
3277     return true;
3278 }
3279
3280 #define DO_VMOVN(INSN, FUNC)                                    \
3281     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3282     {                                                           \
3283         static NeonGenNarrowEnvFn * const narrowfn[] = {        \
3284             FUNC##8,                                            \
3285             FUNC##16,                                           \
3286             FUNC##32,                                           \
3287             NULL,                                               \
3288         };                                                      \
3289         return do_vmovn(s, a, narrowfn[a->size]);               \
3290     }
3291
3292 DO_VMOVN(VMOVN, gen_neon_narrow_u)
3293 DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
3294 DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
3295 DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)
3296
3297 static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
3298 {
3299     TCGv_i32 rm0, rm1;
3300     TCGv_i64 rd;
3301     static NeonGenWidenFn * const widenfns[] = {
3302         gen_helper_neon_widen_u8,
3303         gen_helper_neon_widen_u16,
3304         tcg_gen_extu_i32_i64,
3305         NULL,
3306     };
3307     NeonGenWidenFn *widenfn = widenfns[a->size];
3308
3309     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3310         return false;
3311     }
3312
3313     /* UNDEF accesses to D16-D31 if they don't exist. */
3314     if (!dc_isar_feature(aa32_simd_r32, s) &&
3315         ((a->vd | a->vm) & 0x10)) {
3316         return false;
3317     }
3318
3319     if (a->vd & 1) {
3320         return false;
3321     }
3322
3323     if (!widenfn) {
3324         return false;
3325     }
3326
3327     if (!vfp_access_check(s)) {
3328         return true;
3329     }
3330
3331     rd = tcg_temp_new_i64();
3332     rm0 = tcg_temp_new_i32();
3333     rm1 = tcg_temp_new_i32();
3334
3335     read_neon_element32(rm0, a->vm, 0, MO_32);
3336     read_neon_element32(rm1, a->vm, 1, MO_32);
3337
3338     widenfn(rd, rm0);
3339     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3340     write_neon_element64(rd, a->vd, 0, MO_64);
3341     widenfn(rd, rm1);
3342     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3343     write_neon_element64(rd, a->vd, 1, MO_64);
3344
3345     tcg_temp_free_i64(rd);
3346     tcg_temp_free_i32(rm0);
3347     tcg_temp_free_i32(rm1);
3348     return true;
3349 }
3350
3351 static bool trans_VCVT_B16_F32(DisasContext *s, arg_2misc *a)
3352 {
3353     TCGv_ptr fpst;
3354     TCGv_i64 tmp;
3355     TCGv_i32 dst0, dst1;
3356
3357     if (!dc_isar_feature(aa32_bf16, s)) {
3358         return false;
3359     }
3360
3361     /* UNDEF accesses to D16-D31 if they don't exist. */
3362     if (!dc_isar_feature(aa32_simd_r32, s) &&
3363         ((a->vd | a->vm) & 0x10)) {
3364         return false;
3365     }
3366
3367     if ((a->vm & 1) || (a->size != 1)) {
3368         return false;
3369     }
3370
3371     if (!vfp_access_check(s)) {
3372         return true;
3373     }
3374
3375     fpst = fpstatus_ptr(FPST_STD);
3376     tmp = tcg_temp_new_i64();
3377     dst0 = tcg_temp_new_i32();
3378     dst1 = tcg_temp_new_i32();
3379
3380     read_neon_element64(tmp, a->vm, 0, MO_64);
3381     gen_helper_bfcvt_pair(dst0, tmp, fpst);
3382
3383     read_neon_element64(tmp, a->vm, 1, MO_64);
3384     gen_helper_bfcvt_pair(dst1, tmp, fpst);
3385
3386     write_neon_element32(dst0, a->vd, 0, MO_32);
3387     write_neon_element32(dst1, a->vd, 1, MO_32);
3388
3389     tcg_temp_free_i64(tmp);
3390     tcg_temp_free_i32(dst0);
3391     tcg_temp_free_i32(dst1);
3392     tcg_temp_free_ptr(fpst);
3393     return true;
3394 }
3395
3396 static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
3397 {
3398     TCGv_ptr fpst;
3399     TCGv_i32 ahp, tmp, tmp2, tmp3;
3400
3401     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3402         !dc_isar_feature(aa32_fp16_spconv, s)) {
3403         return false;
3404     }
3405
3406     /* UNDEF accesses to D16-D31 if they don't exist. */
3407     if (!dc_isar_feature(aa32_simd_r32, s) &&
3408         ((a->vd | a->vm) & 0x10)) {
3409         return false;
3410     }
3411
3412     if ((a->vm & 1) || (a->size != 1)) {
3413         return false;
3414     }
3415
3416     if (!vfp_access_check(s)) {
3417         return true;
3418     }
3419
3420     fpst = fpstatus_ptr(FPST_STD);
3421     ahp = get_ahp_flag();
3422     tmp = tcg_temp_new_i32();
3423     read_neon_element32(tmp, a->vm, 0, MO_32);
3424     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3425     tmp2 = tcg_temp_new_i32();
3426     read_neon_element32(tmp2, a->vm, 1, MO_32);
3427     gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
3428     tcg_gen_shli_i32(tmp2, tmp2, 16);
3429     tcg_gen_or_i32(tmp2, tmp2, tmp);
3430     read_neon_element32(tmp, a->vm, 2, MO_32);
3431     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3432     tmp3 = tcg_temp_new_i32();
3433     read_neon_element32(tmp3, a->vm, 3, MO_32);
3434     write_neon_element32(tmp2, a->vd, 0, MO_32);
3435     tcg_temp_free_i32(tmp2);
3436     gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
3437     tcg_gen_shli_i32(tmp3, tmp3, 16);
3438     tcg_gen_or_i32(tmp3, tmp3, tmp);
3439     write_neon_element32(tmp3, a->vd, 1, MO_32);
3440     tcg_temp_free_i32(tmp3);
3441     tcg_temp_free_i32(tmp);
3442     tcg_temp_free_i32(ahp);
3443     tcg_temp_free_ptr(fpst);
3444
3445     return true;
3446 }
3447
3448 static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
3449 {
3450     TCGv_ptr fpst;
3451     TCGv_i32 ahp, tmp, tmp2, tmp3;
3452
3453     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3454         !dc_isar_feature(aa32_fp16_spconv, s)) {
3455         return false;
3456     }
3457
3458     /* UNDEF accesses to D16-D31 if they don't exist. */
3459     if (!dc_isar_feature(aa32_simd_r32, s) &&
3460         ((a->vd | a->vm) & 0x10)) {
3461         return false;
3462     }
3463
3464     if ((a->vd & 1) || (a->size != 1)) {
3465         return false;
3466     }
3467
3468     if (!vfp_access_check(s)) {
3469         return true;
3470     }
3471
3472     fpst = fpstatus_ptr(FPST_STD);
3473     ahp = get_ahp_flag();
3474     tmp3 = tcg_temp_new_i32();
3475     tmp2 = tcg_temp_new_i32();
3476     tmp = tcg_temp_new_i32();
3477     read_neon_element32(tmp, a->vm, 0, MO_32);
3478     read_neon_element32(tmp2, a->vm, 1, MO_32);
3479     tcg_gen_ext16u_i32(tmp3, tmp);
3480     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3481     write_neon_element32(tmp3, a->vd, 0, MO_32);
3482     tcg_gen_shri_i32(tmp, tmp, 16);
3483     gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
3484     write_neon_element32(tmp, a->vd, 1, MO_32);
3485     tcg_temp_free_i32(tmp);
3486     tcg_gen_ext16u_i32(tmp3, tmp2);
3487     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3488     write_neon_element32(tmp3, a->vd, 2, MO_32);
3489     tcg_temp_free_i32(tmp3);
3490     tcg_gen_shri_i32(tmp2, tmp2, 16);
3491     gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
3492     write_neon_element32(tmp2, a->vd, 3, MO_32);
3493     tcg_temp_free_i32(tmp2);
3494     tcg_temp_free_i32(ahp);
3495     tcg_temp_free_ptr(fpst);
3496
3497     return true;
3498 }
3499
3500 static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
3501 {
3502     int vec_size = a->q ? 16 : 8;
3503     int rd_ofs = neon_full_reg_offset(a->vd);
3504     int rm_ofs = neon_full_reg_offset(a->vm);
3505
3506     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3507         return false;
3508     }
3509
3510     /* UNDEF accesses to D16-D31 if they don't exist. */
3511     if (!dc_isar_feature(aa32_simd_r32, s) &&
3512         ((a->vd | a->vm) & 0x10)) {
3513         return false;
3514     }
3515
3516     if (a->size == 3) {
3517         return false;
3518     }
3519
3520     if ((a->vd | a->vm) & a->q) {
3521         return false;
3522     }
3523
3524     if (!vfp_access_check(s)) {
3525         return true;
3526     }
3527
3528     fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);
3529
3530     return true;
3531 }
3532
3533 #define DO_2MISC_VEC(INSN, FN)                                  \
3534     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3535     {                                                           \
3536         return do_2misc_vec(s, a, FN);                          \
3537     }
3538
3539 DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
3540 DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
3541 DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
3542 DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
3543 DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
3544 DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
3545 DO_2MISC_VEC(VCLT0, gen_gvec_clt0)
3546
3547 static bool trans_VMVN(DisasContext *s, arg_2misc *a)
3548 {
3549     if (a->size != 0) {
3550         return false;
3551     }
3552     return do_2misc_vec(s, a, tcg_gen_gvec_not);
3553 }
3554
3555 #define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3556     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3557                          uint32_t rm_ofs, uint32_t oprsz,               \
3558                          uint32_t maxsz)                                \
3559     {                                                                   \
3560         tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz,        \
3561                            DATA, FUNC);                                 \
3562     }
3563
3564 #define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3565     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3566                          uint32_t rm_ofs, uint32_t oprsz,               \
3567                          uint32_t maxsz)                                \
3568     {                                                                   \
3569         tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC);   \
3570     }
3571
3572 WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
3573 WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1)
3574 WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
3575 WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1)
3576 WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
3577 WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
3578 WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)
3579
3580 #define DO_2M_CRYPTO(INSN, FEATURE, SIZE)                       \
3581     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3582     {                                                           \
3583         if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) {  \
3584             return false;                                       \
3585         }                                                       \
3586         return do_2misc_vec(s, a, gen_##INSN);                  \
3587     }
3588
3589 DO_2M_CRYPTO(AESE, aa32_aes, 0)
3590 DO_2M_CRYPTO(AESD, aa32_aes, 0)
3591 DO_2M_CRYPTO(AESMC, aa32_aes, 0)
3592 DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
3593 DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
3594 DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
3595 DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)
3596
3597 static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
3598 {
3599     TCGv_i32 tmp;
3600     int pass;
3601
3602     /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
3603     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3604         return false;
3605     }
3606
3607     /* UNDEF accesses to D16-D31 if they don't exist. */
3608     if (!dc_isar_feature(aa32_simd_r32, s) &&
3609         ((a->vd | a->vm) & 0x10)) {
3610         return false;
3611     }
3612
3613     if (!fn) {
3614         return false;
3615     }
3616
3617     if ((a->vd | a->vm) & a->q) {
3618         return false;
3619     }
3620
3621     if (!vfp_access_check(s)) {
3622         return true;
3623     }
3624
3625     tmp = tcg_temp_new_i32();
3626     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3627         read_neon_element32(tmp, a->vm, pass, MO_32);
3628         fn(tmp, tmp);
3629         write_neon_element32(tmp, a->vd, pass, MO_32);
3630     }
3631     tcg_temp_free_i32(tmp);
3632
3633     return true;
3634 }
3635
3636 static bool trans_VREV32(DisasContext *s, arg_2misc *a)
3637 {
3638     static NeonGenOneOpFn * const fn[] = {
3639         tcg_gen_bswap32_i32,
3640         gen_swap_half,
3641         NULL,
3642         NULL,
3643     };
3644     return do_2misc(s, a, fn[a->size]);
3645 }
3646
3647 static bool trans_VREV16(DisasContext *s, arg_2misc *a)
3648 {
3649     if (a->size != 0) {
3650         return false;
3651     }
3652     return do_2misc(s, a, gen_rev16);
3653 }
3654
3655 static bool trans_VCLS(DisasContext *s, arg_2misc *a)
3656 {
3657     static NeonGenOneOpFn * const fn[] = {
3658         gen_helper_neon_cls_s8,
3659         gen_helper_neon_cls_s16,
3660         gen_helper_neon_cls_s32,
3661         NULL,
3662     };
3663     return do_2misc(s, a, fn[a->size]);
3664 }
3665
3666 static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm)
3667 {
3668     tcg_gen_clzi_i32(rd, rm, 32);
3669 }
3670
3671 static bool trans_VCLZ(DisasContext *s, arg_2misc *a)
3672 {
3673     static NeonGenOneOpFn * const fn[] = {
3674         gen_helper_neon_clz_u8,
3675         gen_helper_neon_clz_u16,
3676         do_VCLZ_32,
3677         NULL,
3678     };
3679     return do_2misc(s, a, fn[a->size]);
3680 }
3681
3682 static bool trans_VCNT(DisasContext *s, arg_2misc *a)
3683 {
3684     if (a->size != 0) {
3685         return false;
3686     }
3687     return do_2misc(s, a, gen_helper_neon_cnt_u8);
3688 }
3689
3690 static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3691                        uint32_t oprsz, uint32_t maxsz)
3692 {
3693     tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs,
3694                       vece == MO_16 ? 0x7fff : 0x7fffffff,
3695                       oprsz, maxsz);
3696 }
3697
3698 static bool trans_VABS_F(DisasContext *s, arg_2misc *a)
3699 {
3700     if (a->size == MO_16) {
3701         if (!dc_isar_feature(aa32_fp16_arith, s)) {
3702             return false;
3703         }
3704     } else if (a->size != MO_32) {
3705         return false;
3706     }
3707     return do_2misc_vec(s, a, gen_VABS_F);
3708 }
3709
3710 static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3711                        uint32_t oprsz, uint32_t maxsz)
3712 {
3713     tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs,
3714                       vece == MO_16 ? 0x8000 : 0x80000000,
3715                       oprsz, maxsz);
3716 }
3717
3718 static bool trans_VNEG_F(DisasContext *s, arg_2misc *a)
3719 {
3720     if (a->size == MO_16) {
3721         if (!dc_isar_feature(aa32_fp16_arith, s)) {
3722             return false;
3723         }
3724     } else if (a->size != MO_32) {
3725         return false;
3726     }
3727     return do_2misc_vec(s, a, gen_VNEG_F);
3728 }
3729
3730 static bool trans_VRECPE(DisasContext *s, arg_2misc *a)
3731 {
3732     if (a->size != 2) {
3733         return false;
3734     }
3735     return do_2misc(s, a, gen_helper_recpe_u32);
3736 }
3737
3738 static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a)
3739 {
3740     if (a->size != 2) {
3741         return false;
3742     }
3743     return do_2misc(s, a, gen_helper_rsqrte_u32);
3744 }
3745
3746 #define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \
3747     static void WRAPNAME(TCGv_i32 d, TCGv_i32 m)        \
3748     {                                                   \
3749         FUNC(d, cpu_env, m);                            \
3750     }
3751
3752 WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8)
3753 WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16)
3754 WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32)
3755 WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8)
3756 WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16)
3757 WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32)
3758
3759 static bool trans_VQABS(DisasContext *s, arg_2misc *a)
3760 {
3761     static NeonGenOneOpFn * const fn[] = {
3762         gen_VQABS_s8,
3763         gen_VQABS_s16,
3764         gen_VQABS_s32,
3765         NULL,
3766     };
3767     return do_2misc(s, a, fn[a->size]);
3768 }
3769
3770 static bool trans_VQNEG(DisasContext *s, arg_2misc *a)
3771 {
3772     static NeonGenOneOpFn * const fn[] = {
3773         gen_VQNEG_s8,
3774         gen_VQNEG_s16,
3775         gen_VQNEG_s32,
3776         NULL,
3777     };
3778     return do_2misc(s, a, fn[a->size]);
3779 }
3780
3781 #define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC)                             \
3782     static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
3783                            uint32_t rm_ofs,                             \
3784                            uint32_t oprsz, uint32_t maxsz)              \
3785     {                                                                   \
3786         static gen_helper_gvec_2_ptr * const fns[4] = {                 \
3787             NULL, HFUNC, SFUNC, NULL,                                   \
3788         };                                                              \
3789         TCGv_ptr fpst;                                                  \
3790         fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD);   \
3791         tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0,       \
3792                            fns[vece]);                                  \
3793         tcg_temp_free_ptr(fpst);                                        \
3794     }                                                                   \
3795     static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
3796     {                                                                   \
3797         if (a->size == MO_16) {                                         \
3798             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
3799                 return false;                                           \
3800             }                                                           \
3801         } else if (a->size != MO_32) {                                  \
3802             return false;                                               \
3803         }                                                               \
3804         return do_2misc_vec(s, a, gen_##INSN);                          \
3805     }
3806
3807 DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s)
3808 DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s)
3809 DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s)
3810 DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s)
3811 DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s)
3812 DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s)
3813 DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s)
3814 DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos)
3815 DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos)
3816 DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs)
3817 DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs)
3818
3819 DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s)
3820
3821 static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
3822 {
3823     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
3824         return false;
3825     }
3826     return trans_VRINTX_impl(s, a);
3827 }
3828
3829 #define DO_VEC_RMODE(INSN, RMODE, OP)                                   \
3830     static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
3831                            uint32_t rm_ofs,                             \
3832                            uint32_t oprsz, uint32_t maxsz)              \
3833     {                                                                   \
3834         static gen_helper_gvec_2_ptr * const fns[4] = {                 \
3835             NULL,                                                       \
3836             gen_helper_gvec_##OP##h,                                    \
3837             gen_helper_gvec_##OP##s,                                    \
3838             NULL,                                                       \
3839         };                                                              \
3840         TCGv_ptr fpst;                                                  \
3841         fpst = fpstatus_ptr(vece == 1 ? FPST_STD_F16 : FPST_STD);       \
3842         tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz,          \
3843                            arm_rmode_to_sf(RMODE), fns[vece]);          \
3844         tcg_temp_free_ptr(fpst);                                        \
3845     }                                                                   \
3846     static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
3847     {                                                                   \
3848         if (!arm_dc_feature(s, ARM_FEATURE_V8)) {                       \
3849             return false;                                               \
3850         }                                                               \
3851         if (a->size == MO_16) {                                         \
3852             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
3853                 return false;                                           \
3854             }                                                           \
3855         } else if (a->size != MO_32) {                                  \
3856             return false;                                               \
3857         }                                                               \
3858         return do_2misc_vec(s, a, gen_##INSN);                          \
3859     }
3860
3861 DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u)
3862 DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s)
3863 DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u)
3864 DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s)
3865 DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u)
3866 DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s)
3867 DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u)
3868 DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s)
3869
3870 DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_)
3871 DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_)
3872 DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_)
3873 DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_)
3874 DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_)
3875
3876 static bool trans_VSWP(DisasContext *s, arg_2misc *a)
3877 {
3878     TCGv_i64 rm, rd;
3879     int pass;
3880
3881     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3882         return false;
3883     }
3884
3885     /* UNDEF accesses to D16-D31 if they don't exist. */
3886     if (!dc_isar_feature(aa32_simd_r32, s) &&
3887         ((a->vd | a->vm) & 0x10)) {
3888         return false;
3889     }
3890
3891     if (a->size != 0) {
3892         return false;
3893     }
3894
3895     if ((a->vd | a->vm) & a->q) {
3896         return false;
3897     }
3898
3899     if (!vfp_access_check(s)) {
3900         return true;
3901     }
3902
3903     rm = tcg_temp_new_i64();
3904     rd = tcg_temp_new_i64();
3905     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3906         read_neon_element64(rm, a->vm, pass, MO_64);
3907         read_neon_element64(rd, a->vd, pass, MO_64);
3908         write_neon_element64(rm, a->vd, pass, MO_64);
3909         write_neon_element64(rd, a->vm, pass, MO_64);
3910     }
3911     tcg_temp_free_i64(rm);
3912     tcg_temp_free_i64(rd);
3913
3914     return true;
3915 }
3916 static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1)
3917 {
3918     TCGv_i32 rd, tmp;
3919
3920     rd = tcg_temp_new_i32();
3921     tmp = tcg_temp_new_i32();
3922
3923     tcg_gen_shli_i32(rd, t0, 8);
3924     tcg_gen_andi_i32(rd, rd, 0xff00ff00);
3925     tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
3926     tcg_gen_or_i32(rd, rd, tmp);
3927
3928     tcg_gen_shri_i32(t1, t1, 8);
3929     tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
3930     tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
3931     tcg_gen_or_i32(t1, t1, tmp);
3932     tcg_gen_mov_i32(t0, rd);
3933
3934     tcg_temp_free_i32(tmp);
3935     tcg_temp_free_i32(rd);
3936 }
3937
3938 static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1)
3939 {
3940     TCGv_i32 rd, tmp;
3941
3942     rd = tcg_temp_new_i32();
3943     tmp = tcg_temp_new_i32();
3944
3945     tcg_gen_shli_i32(rd, t0, 16);
3946     tcg_gen_andi_i32(tmp, t1, 0xffff);
3947     tcg_gen_or_i32(rd, rd, tmp);
3948     tcg_gen_shri_i32(t1, t1, 16);
3949     tcg_gen_andi_i32(tmp, t0, 0xffff0000);
3950     tcg_gen_or_i32(t1, t1, tmp);
3951     tcg_gen_mov_i32(t0, rd);
3952
3953     tcg_temp_free_i32(tmp);
3954     tcg_temp_free_i32(rd);
3955 }
3956
3957 static bool trans_VTRN(DisasContext *s, arg_2misc *a)
3958 {
3959     TCGv_i32 tmp, tmp2;
3960     int pass;
3961
3962     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3963         return false;
3964     }
3965
3966     /* UNDEF accesses to D16-D31 if they don't exist. */
3967     if (!dc_isar_feature(aa32_simd_r32, s) &&
3968         ((a->vd | a->vm) & 0x10)) {
3969         return false;
3970     }
3971
3972     if ((a->vd | a->vm) & a->q) {
3973         return false;
3974     }
3975
3976     if (a->size == 3) {
3977         return false;
3978     }
3979
3980     if (!vfp_access_check(s)) {
3981         return true;
3982     }
3983
3984     tmp = tcg_temp_new_i32();
3985     tmp2 = tcg_temp_new_i32();
3986     if (a->size == MO_32) {
3987         for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) {
3988             read_neon_element32(tmp, a->vm, pass, MO_32);
3989             read_neon_element32(tmp2, a->vd, pass + 1, MO_32);
3990             write_neon_element32(tmp2, a->vm, pass, MO_32);
3991             write_neon_element32(tmp, a->vd, pass + 1, MO_32);
3992         }
3993     } else {
3994         for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3995             read_neon_element32(tmp, a->vm, pass, MO_32);
3996             read_neon_element32(tmp2, a->vd, pass, MO_32);
3997             if (a->size == MO_8) {
3998                 gen_neon_trn_u8(tmp, tmp2);
3999             } else {
4000                 gen_neon_trn_u16(tmp, tmp2);
4001             }
4002             write_neon_element32(tmp2, a->vm, pass, MO_32);
4003             write_neon_element32(tmp, a->vd, pass, MO_32);
4004         }
4005     }
4006     tcg_temp_free_i32(tmp);
4007     tcg_temp_free_i32(tmp2);
4008     return true;
4009 }
4010
4011 static bool trans_VSMMLA(DisasContext *s, arg_VSMMLA *a)
4012 {
4013     if (!dc_isar_feature(aa32_i8mm, s)) {
4014         return false;
4015     }
4016     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4017                         gen_helper_gvec_smmla_b);
4018 }
4019
4020 static bool trans_VUMMLA(DisasContext *s, arg_VUMMLA *a)
4021 {
4022     if (!dc_isar_feature(aa32_i8mm, s)) {
4023         return false;
4024     }
4025     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4026                         gen_helper_gvec_ummla_b);
4027 }
4028
4029 static bool trans_VUSMMLA(DisasContext *s, arg_VUSMMLA *a)
4030 {
4031     if (!dc_isar_feature(aa32_i8mm, s)) {
4032         return false;
4033     }
4034     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4035                         gen_helper_gvec_usmmla_b);
4036 }
4037
4038 static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a)
4039 {
4040     if (!dc_isar_feature(aa32_bf16, s)) {
4041         return false;
4042     }
4043     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4044                         gen_helper_gvec_bfmmla);
4045 }
4046
4047 static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a)
4048 {
4049     if (!dc_isar_feature(aa32_bf16, s)) {
4050         return false;
4051     }
4052     return do_neon_ddda_fpst(s, 7, a->vd, a->vn, a->vm, a->q, FPST_STD,
4053                              gen_helper_gvec_bfmlal);
4054 }
4055
4056 static bool trans_VFMA_b16_scal(DisasContext *s, arg_VFMA_b16_scal *a)
4057 {
4058     if (!dc_isar_feature(aa32_bf16, s)) {
4059         return false;
4060     }
4061     return do_neon_ddda_fpst(s, 6, a->vd, a->vn, a->vm,
4062                              (a->index << 1) | a->q, FPST_STD,
4063                              gen_helper_gvec_bfmlal_idx);
4064 }