target/arm: Implement bfloat widening fma (indexed)
[qemu/ar7.git] / target/arm/translate-neon.c
blob 633fef3bf76d46b5efcc06297b00a573c64cda4f
1 /*
2 * ARM translation: AArch32 Neon instructions
4 * Copyright (c) 2003 Fabrice Bellard
5 * Copyright (c) 2005-2007 CodeSourcery
6 * Copyright (c) 2007 OpenedHand, Ltd.
7 * Copyright (c) 2020 Linaro, Ltd.
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
23 #include "qemu/osdep.h"
24 #include "tcg/tcg-op.h"
25 #include "tcg/tcg-op-gvec.h"
26 #include "exec/exec-all.h"
27 #include "exec/gen-icount.h"
28 #include "translate.h"
29 #include "translate-a32.h"
31 static inline int plus1(DisasContext *s, int x)
33 return x + 1;
36 static inline int rsub_64(DisasContext *s, int x)
38 return 64 - x;
41 static inline int rsub_32(DisasContext *s, int x)
43 return 32 - x;
45 static inline int rsub_16(DisasContext *s, int x)
47 return 16 - x;
49 static inline int rsub_8(DisasContext *s, int x)
51 return 8 - x;
54 static inline int neon_3same_fp_size(DisasContext *s, int x)
56 /* Convert 0==fp32, 1==fp16 into a MO_* value */
57 return MO_32 - x;
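/*
 * Editorial note (not in the original file): the rsub_* helpers are
 * decode !functions that recover a right-shift amount from an encoded
 * immediate holding "esize - shift".  A sketch, assuming a VSHR.S32
 * with shift 3: the 5-bit field encodes 32 - 3 = 29, and rsub_32 maps
 * it back as 32 - 29 = 3.  Likewise neon_3same_fp_size(s, 0) == MO_32
 * and neon_3same_fp_size(s, 1) == MO_16.
 */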
60 /* Include the generated Neon decoder */
61 #include "decode-neon-dp.c.inc"
62 #include "decode-neon-ls.c.inc"
63 #include "decode-neon-shared.c.inc"
65 static TCGv_ptr vfp_reg_ptr(bool dp, int reg)
67 TCGv_ptr ret = tcg_temp_new_ptr();
68 tcg_gen_addi_ptr(ret, cpu_env, vfp_reg_offset(dp, reg));
69 return ret;
72 static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
74 long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
76 switch (mop) {
77 case MO_UB:
78 tcg_gen_ld8u_i32(var, cpu_env, offset);
79 break;
80 case MO_UW:
81 tcg_gen_ld16u_i32(var, cpu_env, offset);
82 break;
83 case MO_UL:
84 tcg_gen_ld_i32(var, cpu_env, offset);
85 break;
86 default:
87 g_assert_not_reached();
91 static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
93 long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
95 switch (mop) {
96 case MO_UB:
97 tcg_gen_ld8u_i64(var, cpu_env, offset);
98 break;
99 case MO_UW:
100 tcg_gen_ld16u_i64(var, cpu_env, offset);
101 break;
102 case MO_UL:
103 tcg_gen_ld32u_i64(var, cpu_env, offset);
104 break;
105 case MO_Q:
106 tcg_gen_ld_i64(var, cpu_env, offset);
107 break;
108 default:
109 g_assert_not_reached();
113 static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
115 long offset = neon_element_offset(reg, ele, size);
117 switch (size) {
118 case MO_8:
119 tcg_gen_st8_i32(var, cpu_env, offset);
120 break;
121 case MO_16:
122 tcg_gen_st16_i32(var, cpu_env, offset);
123 break;
124 case MO_32:
125 tcg_gen_st_i32(var, cpu_env, offset);
126 break;
127 default:
128 g_assert_not_reached();
132 static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
134 long offset = neon_element_offset(reg, ele, size);
136 switch (size) {
137 case MO_8:
138 tcg_gen_st8_i64(var, cpu_env, offset);
139 break;
140 case MO_16:
141 tcg_gen_st16_i64(var, cpu_env, offset);
142 break;
143 case MO_32:
144 tcg_gen_st32_i64(var, cpu_env, offset);
145 break;
146 case MO_64:
147 tcg_gen_st_i64(var, cpu_env, offset);
148 break;
149 default:
150 g_assert_not_reached();
154 static bool do_neon_ddda(DisasContext *s, int q, int vd, int vn, int vm,
155 int data, gen_helper_gvec_4 *fn_gvec)
157 /* UNDEF accesses to D16-D31 if they don't exist. */
158 if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
159 return false;
163 * UNDEF accesses to odd registers for each set bit of Q.
164 * Q will be 0b111 for all-Q-reg instructions; other values
165 * indicate mixed Q- and D-reg inputs.
167 if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
168 return false;
171 if (!vfp_access_check(s)) {
172 return true;
175 int opr_sz = q ? 16 : 8;
176 tcg_gen_gvec_4_ool(vfp_reg_offset(1, vd),
177 vfp_reg_offset(1, vn),
178 vfp_reg_offset(1, vm),
179 vfp_reg_offset(1, vd),
180 opr_sz, opr_sz, data, fn_gvec);
181 return true;
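/*
 * Worked example (editorial): for an all-Q-reg insn the caller passes
 * q == 0b111, so any odd vd/vn/vm UNDEFs -- e.g. vd = 3 gives
 * ((3 & 1) * 4) & 7 != 0.  With q == 0b110 (the scalar forms below)
 * only vd and vn must name Q registers, so an odd vm is still accepted.
 */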
184 static bool do_neon_ddda_fpst(DisasContext *s, int q, int vd, int vn, int vm,
185 int data, ARMFPStatusFlavour fp_flavour,
186 gen_helper_gvec_4_ptr *fn_gvec_ptr)
188 /* UNDEF accesses to D16-D31 if they don't exist. */
189 if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
190 return false;
194 * UNDEF accesses to odd registers for each set bit of Q.
195 * Q will be 0b111 for all-Q-reg instructions; other values
196 * indicate mixed Q- and D-reg inputs.
198 if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
199 return false;
202 if (!vfp_access_check(s)) {
203 return true;
206 int opr_sz = q ? 16 : 8;
207 TCGv_ptr fpst = fpstatus_ptr(fp_flavour);
209 tcg_gen_gvec_4_ptr(vfp_reg_offset(1, vd),
210 vfp_reg_offset(1, vn),
211 vfp_reg_offset(1, vm),
212 vfp_reg_offset(1, vd),
213 fpst, opr_sz, opr_sz, data, fn_gvec_ptr);
214 tcg_temp_free_ptr(fpst);
215 return true;
218 static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
220 if (!dc_isar_feature(aa32_vcma, s)) {
221 return false;
223 if (a->size == MO_16) {
224 if (!dc_isar_feature(aa32_fp16_arith, s)) {
225 return false;
227 return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
228 FPST_STD_F16, gen_helper_gvec_fcmlah);
230 return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
231 FPST_STD, gen_helper_gvec_fcmlas);
234 static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
236 int opr_sz;
237 TCGv_ptr fpst;
238 gen_helper_gvec_3_ptr *fn_gvec_ptr;
240 if (!dc_isar_feature(aa32_vcma, s)
241 || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
242 return false;
245 /* UNDEF accesses to D16-D31 if they don't exist. */
246 if (!dc_isar_feature(aa32_simd_r32, s) &&
247 ((a->vd | a->vn | a->vm) & 0x10)) {
248 return false;
251 if ((a->vn | a->vm | a->vd) & a->q) {
252 return false;
255 if (!vfp_access_check(s)) {
256 return true;
259 opr_sz = (1 + a->q) * 8;
260 fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
261 fn_gvec_ptr = (a->size == MO_16) ?
262 gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds;
263 tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
264 vfp_reg_offset(1, a->vn),
265 vfp_reg_offset(1, a->vm),
266 fpst, opr_sz, opr_sz, a->rot,
267 fn_gvec_ptr);
268 tcg_temp_free_ptr(fpst);
269 return true;
272 static bool trans_VSDOT(DisasContext *s, arg_VSDOT *a)
274 if (!dc_isar_feature(aa32_dp, s)) {
275 return false;
277 return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
278 gen_helper_gvec_sdot_b);
281 static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a)
283 if (!dc_isar_feature(aa32_dp, s)) {
284 return false;
286 return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
287 gen_helper_gvec_udot_b);
290 static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a)
292 if (!dc_isar_feature(aa32_i8mm, s)) {
293 return false;
295 return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
296 gen_helper_gvec_usdot_b);
299 static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a)
301 if (!dc_isar_feature(aa32_bf16, s)) {
302 return false;
304 return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
305 gen_helper_gvec_bfdot);
308 static bool trans_VFML(DisasContext *s, arg_VFML *a)
310 int opr_sz;
312 if (!dc_isar_feature(aa32_fhm, s)) {
313 return false;
316 /* UNDEF accesses to D16-D31 if they don't exist. */
317 if (!dc_isar_feature(aa32_simd_r32, s) &&
318 (a->vd & 0x10)) {
319 return false;
322 if (a->vd & a->q) {
323 return false;
326 if (!vfp_access_check(s)) {
327 return true;
330 opr_sz = (1 + a->q) * 8;
331 tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
332 vfp_reg_offset(a->q, a->vn),
333 vfp_reg_offset(a->q, a->vm),
334 cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
335 gen_helper_gvec_fmlal_a32);
336 return true;
339 static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
341 int data = (a->index << 2) | a->rot;
343 if (!dc_isar_feature(aa32_vcma, s)) {
344 return false;
346 if (a->size == MO_16) {
347 if (!dc_isar_feature(aa32_fp16_arith, s)) {
348 return false;
350 return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
351 FPST_STD_F16, gen_helper_gvec_fcmlah_idx);
353 return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
354 FPST_STD, gen_helper_gvec_fcmlas_idx);
357 static bool trans_VSDOT_scalar(DisasContext *s, arg_VSDOT_scalar *a)
359 if (!dc_isar_feature(aa32_dp, s)) {
360 return false;
362 return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
363 gen_helper_gvec_sdot_idx_b);
366 static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a)
368 if (!dc_isar_feature(aa32_dp, s)) {
369 return false;
371 return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
372 gen_helper_gvec_udot_idx_b);
375 static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a)
377 if (!dc_isar_feature(aa32_i8mm, s)) {
378 return false;
380 return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
381 gen_helper_gvec_usdot_idx_b);
384 static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a)
386 if (!dc_isar_feature(aa32_i8mm, s)) {
387 return false;
389 return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
390 gen_helper_gvec_sudot_idx_b);
393 static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a)
395 if (!dc_isar_feature(aa32_bf16, s)) {
396 return false;
398 return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
399 gen_helper_gvec_bfdot_idx);
402 static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
404 int opr_sz;
406 if (!dc_isar_feature(aa32_fhm, s)) {
407 return false;
410 /* UNDEF accesses to D16-D31 if they don't exist. */
411 if (!dc_isar_feature(aa32_simd_r32, s) &&
412 ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
413 return false;
416 if (a->vd & a->q) {
417 return false;
420 if (!vfp_access_check(s)) {
421 return true;
424 opr_sz = (1 + a->q) * 8;
425 tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
426 vfp_reg_offset(a->q, a->vn),
427 vfp_reg_offset(a->q, a->rm),
428 cpu_env, opr_sz, opr_sz,
429 (a->index << 2) | a->s, /* is_2 == 0 */
430 gen_helper_gvec_fmlal_idx_a32);
431 return true;
434 static struct {
435 int nregs;
436 int interleave;
437 int spacing;
438 } const neon_ls_element_type[11] = {
439 {1, 4, 1},
440 {1, 4, 2},
441 {4, 1, 1},
442 {2, 2, 2},
443 {1, 3, 1},
444 {1, 3, 2},
445 {3, 1, 1},
446 {1, 1, 1},
447 {1, 2, 1},
448 {1, 2, 2},
449 {2, 1, 1}
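/*
 * Editorial note: the table is indexed by the insn "itype" field and
 * each entry is {nregs, interleave, spacing}.  For instance (assuming
 * the usual A32 encoding), itype 7 is VLD1/VST1 with one register
 * {1, 1, 1}, itype 2 is VLD1 with four registers {4, 1, 1}, and
 * itype 0 is VLD4 {1, 4, 1}: one pass over four interleaved elements.
 */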
452 static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
453 int stride)
455 if (rm != 15) {
456 TCGv_i32 base;
458 base = load_reg(s, rn);
459 if (rm == 13) {
460 tcg_gen_addi_i32(base, base, stride);
461 } else {
462 TCGv_i32 index;
463 index = load_reg(s, rm);
464 tcg_gen_add_i32(base, base, index);
465 tcg_temp_free_i32(index);
467 store_reg(s, rn, base);
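/*
 * Illustrative sketch (editorial): rm == 15 means no writeback,
 * rm == 13 selects post-increment by the transfer size (e.g.
 * VLD1.8 {d0-d3}, [r2]! adds 32 to r2), and any other rm adds that
 * register's value instead.
 */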
471 static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
473 /* Neon load/store multiple structures */
474 int nregs, interleave, spacing, reg, n;
475 MemOp mop, align, endian;
476 int mmu_idx = get_mem_index(s);
477 int size = a->size;
478 TCGv_i64 tmp64;
479 TCGv_i32 addr, tmp;
481 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
482 return false;
485 /* UNDEF accesses to D16-D31 if they don't exist */
486 if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
487 return false;
489 if (a->itype > 10) {
490 return false;
492 /* Catch UNDEF cases for bad values of align field */
493 switch (a->itype & 0xc) {
494 case 4:
495 if (a->align >= 2) {
496 return false;
498 break;
499 case 8:
500 if (a->align == 3) {
501 return false;
503 break;
504 default:
505 break;
507 nregs = neon_ls_element_type[a->itype].nregs;
508 interleave = neon_ls_element_type[a->itype].interleave;
509 spacing = neon_ls_element_type[a->itype].spacing;
510 if (size == 3 && (interleave | spacing) != 1) {
511 return false;
514 if (!vfp_access_check(s)) {
515 return true;
518 /* For our purposes, bytes are always little-endian. */
519 endian = s->be_data;
520 if (size == 0) {
521 endian = MO_LE;
524 /* Enforce alignment requested by the instruction */
525 if (a->align) {
526 align = pow2_align(a->align + 2); /* 4 ** a->align */
527 } else {
528 align = s->align_mem ? MO_ALIGN : 0;
532 * Consecutive little-endian elements from a single register
533 * can be promoted to a larger little-endian operation.
535 if (interleave == 1 && endian == MO_LE) {
536 /* Retain any natural alignment. */
537 if (align == MO_ALIGN) {
538 align = pow2_align(size);
540 size = 3;
543 tmp64 = tcg_temp_new_i64();
544 addr = tcg_temp_new_i32();
545 tmp = tcg_const_i32(1 << size);
546 load_reg_var(s, addr, a->rn);
548 mop = endian | size | align;
549 for (reg = 0; reg < nregs; reg++) {
550 for (n = 0; n < 8 >> size; n++) {
551 int xs;
552 for (xs = 0; xs < interleave; xs++) {
553 int tt = a->vd + reg + spacing * xs;
555 if (a->l) {
556 gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop);
557 neon_store_element64(tt, n, size, tmp64);
558 } else {
559 neon_load_element64(tmp64, tt, n, size);
560 gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop);
562 tcg_gen_add_i32(addr, addr, tmp);
564 /* Subsequent memory operations inherit alignment */
565 mop &= ~MO_AMASK;
569 tcg_temp_free_i32(addr);
570 tcg_temp_free_i32(tmp);
571 tcg_temp_free_i64(tmp64);
573 gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
574 return true;
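/*
 * Example of the little-endian promotion above (editorial note): for
 * VLD1.8 {d0}, [r0], interleave == 1 and size == 0 forces MO_LE, so
 * the eight byte elements are loaded as one 64-bit access (size
 * promoted to 3) rather than eight one-byte accesses.
 */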
577 static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
579 /* Neon load single structure to all lanes */
580 int reg, stride, vec_size;
581 int vd = a->vd;
582 int size = a->size;
583 int nregs = a->n + 1;
584 TCGv_i32 addr, tmp;
585 MemOp mop, align;
587 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
588 return false;
591 /* UNDEF accesses to D16-D31 if they don't exist */
592 if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
593 return false;
596 align = 0;
597 if (size == 3) {
598 if (nregs != 4 || a->a == 0) {
599 return false;
601 /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
602 size = MO_32;
603 align = MO_ALIGN_16;
604 } else if (a->a) {
605 switch (nregs) {
606 case 1:
607 if (size == 0) {
608 return false;
610 align = MO_ALIGN;
611 break;
612 case 2:
613 align = pow2_align(size + 1);
614 break;
615 case 3:
616 return false;
617 case 4:
618 align = pow2_align(size + 2);
619 break;
620 default:
621 g_assert_not_reached();
625 if (!vfp_access_check(s)) {
626 return true;
630 * VLD1 to all lanes: T bit indicates how many Dregs to write.
631 * VLD2/3/4 to all lanes: T bit indicates register stride.
633 stride = a->t ? 2 : 1;
634 vec_size = nregs == 1 ? stride * 8 : 8;
635 mop = size | align;
636 tmp = tcg_temp_new_i32();
637 addr = tcg_temp_new_i32();
638 load_reg_var(s, addr, a->rn);
639 for (reg = 0; reg < nregs; reg++) {
640 gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop);
641 if ((vd & 1) && vec_size == 16) {
643 * We cannot write 16 bytes at once because the
644 * destination is unaligned.
646 tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
647 8, 8, tmp);
648 tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1),
649 neon_full_reg_offset(vd), 8, 8);
650 } else {
651 tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
652 vec_size, vec_size, tmp);
654 tcg_gen_addi_i32(addr, addr, 1 << size);
655 vd += stride;
657 /* Subsequent memory operations inherit alignment */
658 mop &= ~MO_AMASK;
660 tcg_temp_free_i32(tmp);
661 tcg_temp_free_i32(addr);
663 gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);
665 return true;
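/*
 * Editorial example: VLD1.32 {d3[], d4[]}, [r0] (T bit set, nregs == 1)
 * replicates one 32-bit element into two D registers.  Since d3 is an
 * odd register, the 16-byte write is split into two 8-byte dups as in
 * the unaligned-destination path above.
 */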
668 static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
670 /* Neon load/store single structure to one lane */
671 int reg;
672 int nregs = a->n + 1;
673 int vd = a->vd;
674 TCGv_i32 addr, tmp;
675 MemOp mop;
677 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
678 return false;
681 /* UNDEF accesses to D16-D31 if they don't exist */
682 if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
683 return false;
686 /* Catch the UNDEF cases. This is unavoidably a bit messy. */
687 switch (nregs) {
688 case 1:
689 if (((a->align & (1 << a->size)) != 0) ||
690 (a->size == 2 && (a->align == 1 || a->align == 2))) {
691 return false;
693 break;
694 case 3:
695 if ((a->align & 1) != 0) {
696 return false;
698 /* fall through */
699 case 2:
700 if (a->size == 2 && (a->align & 2) != 0) {
701 return false;
703 break;
704 case 4:
705 if (a->size == 2 && a->align == 3) {
706 return false;
708 break;
709 default:
710 abort();
712 if ((vd + a->stride * (nregs - 1)) > 31) {
714 * Attempts to write off the end of the register file are
715 * UNPREDICTABLE; we choose to UNDEF because otherwise we would
716 * access off the end of the array that holds the register data.
718 return false;
721 if (!vfp_access_check(s)) {
722 return true;
725 /* Pick up SCTLR settings */
726 mop = finalize_memop(s, a->size);
728 if (a->align) {
729 MemOp align_op;
731 switch (nregs) {
732 case 1:
733 /* For VLD1, use natural alignment. */
734 align_op = MO_ALIGN;
735 break;
736 case 2:
737 /* For VLD2, use double alignment. */
738 align_op = pow2_align(a->size + 1);
739 break;
740 case 4:
741 if (a->size == MO_32) {
743 * For VLD4.32, align = 1 is double alignment, align = 2 is
744 * quad alignment; align = 3 is rejected above.
746 align_op = pow2_align(a->size + a->align);
747 } else {
748                 /* For VLD4.8 and VLD4.16, we want quad alignment. */
749 align_op = pow2_align(a->size + 2);
751 break;
752 default:
753 /* For VLD3, the alignment field is zero and rejected above. */
754 g_assert_not_reached();
757 mop = (mop & ~MO_AMASK) | align_op;
760 tmp = tcg_temp_new_i32();
761 addr = tcg_temp_new_i32();
762 load_reg_var(s, addr, a->rn);
764 for (reg = 0; reg < nregs; reg++) {
765 if (a->l) {
766 gen_aa32_ld_internal_i32(s, tmp, addr, get_mem_index(s), mop);
767 neon_store_element(vd, a->reg_idx, a->size, tmp);
768 } else { /* Store */
769 neon_load_element(tmp, vd, a->reg_idx, a->size);
770 gen_aa32_st_internal_i32(s, tmp, addr, get_mem_index(s), mop);
772 vd += a->stride;
773 tcg_gen_addi_i32(addr, addr, 1 << a->size);
775 /* Subsequent memory operations inherit alignment */
776 mop &= ~MO_AMASK;
778 tcg_temp_free_i32(addr);
779 tcg_temp_free_i32(tmp);
781 gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);
783 return true;
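/*
 * Editorial example: VLD2.16 {d0[2], d1[2]}, [r0] issues two 16-bit
 * loads, writing lane 2 of d0 and d1; with the align bit set, the
 * first access demands 4-byte (double-width) alignment and later ones
 * drop it via mop &= ~MO_AMASK.
 */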
786 static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
788 int vec_size = a->q ? 16 : 8;
789 int rd_ofs = neon_full_reg_offset(a->vd);
790 int rn_ofs = neon_full_reg_offset(a->vn);
791 int rm_ofs = neon_full_reg_offset(a->vm);
793 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
794 return false;
797 /* UNDEF accesses to D16-D31 if they don't exist. */
798 if (!dc_isar_feature(aa32_simd_r32, s) &&
799 ((a->vd | a->vn | a->vm) & 0x10)) {
800 return false;
803 if ((a->vn | a->vm | a->vd) & a->q) {
804 return false;
807 if (!vfp_access_check(s)) {
808 return true;
811 fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
812 return true;
815 #define DO_3SAME(INSN, FUNC) \
816 static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
818 return do_3same(s, a, FUNC); \
821 DO_3SAME(VADD, tcg_gen_gvec_add)
822 DO_3SAME(VSUB, tcg_gen_gvec_sub)
823 DO_3SAME(VAND, tcg_gen_gvec_and)
824 DO_3SAME(VBIC, tcg_gen_gvec_andc)
825 DO_3SAME(VORR, tcg_gen_gvec_or)
826 DO_3SAME(VORN, tcg_gen_gvec_orc)
827 DO_3SAME(VEOR, tcg_gen_gvec_xor)
828 DO_3SAME(VSHL_S, gen_gvec_sshl)
829 DO_3SAME(VSHL_U, gen_gvec_ushl)
830 DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
831 DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
832 DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
833 DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)
835 /* These insns are all gvec_bitsel but with the inputs in various orders. */
836 #define DO_3SAME_BITSEL(INSN, O1, O2, O3) \
837 static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
838 uint32_t rn_ofs, uint32_t rm_ofs, \
839 uint32_t oprsz, uint32_t maxsz) \
841 tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz); \
843 DO_3SAME(INSN, gen_##INSN##_3s)
845 DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
846 DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
847 DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
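/*
 * Editorial note: VBSL/VBIT/VBIF are the same bitwise select with the
 * operands in different roles.  E.g. DO_3SAME_BITSEL(VBSL, ...) makes
 * rd the selector, computing rd = (rn & rd) | (rm & ~rd).
 */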
849 #define DO_3SAME_NO_SZ_3(INSN, FUNC) \
850 static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
852 if (a->size == 3) { \
853 return false; \
855 return do_3same(s, a, FUNC); \
858 DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
859 DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
860 DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
861 DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
862 DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
863 DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
864 DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
865 DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
866 DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
867 DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
868 DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
869 DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)
871 #define DO_3SAME_CMP(INSN, COND) \
872 static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
873 uint32_t rn_ofs, uint32_t rm_ofs, \
874 uint32_t oprsz, uint32_t maxsz) \
876 tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
878 DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)
880 DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
881 DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
882 DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
883 DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
884 DO_3SAME_CMP(VCEQ, TCG_COND_EQ)
886 #define WRAP_OOL_FN(WRAPNAME, FUNC) \
887 static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, \
888 uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz) \
890 tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
893 WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)
895 static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
897 if (a->size != 0) {
898 return false;
900 return do_3same(s, a, gen_VMUL_p_3s);
903 #define DO_VQRDMLAH(INSN, FUNC) \
904 static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
906 if (!dc_isar_feature(aa32_rdm, s)) { \
907 return false; \
909 if (a->size != 1 && a->size != 2) { \
910 return false; \
912 return do_3same(s, a, FUNC); \
915 DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
916 DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)
918 #define DO_SHA1(NAME, FUNC) \
919 WRAP_OOL_FN(gen_##NAME##_3s, FUNC) \
920 static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a) \
922 if (!dc_isar_feature(aa32_sha1, s)) { \
923 return false; \
925 return do_3same(s, a, gen_##NAME##_3s); \
928 DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
929 DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
930 DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
931 DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)
933 #define DO_SHA2(NAME, FUNC) \
934 WRAP_OOL_FN(gen_##NAME##_3s, FUNC) \
935 static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a) \
937 if (!dc_isar_feature(aa32_sha2, s)) { \
938 return false; \
940 return do_3same(s, a, gen_##NAME##_3s); \
943 DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
944 DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
945 DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)
947 #define DO_3SAME_64(INSN, FUNC) \
948 static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
949 uint32_t rn_ofs, uint32_t rm_ofs, \
950 uint32_t oprsz, uint32_t maxsz) \
952 static const GVecGen3 op = { .fni8 = FUNC }; \
953 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op); \
955 DO_3SAME(INSN, gen_##INSN##_3s)
957 #define DO_3SAME_64_ENV(INSN, FUNC) \
958 static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m) \
960 FUNC(d, cpu_env, n, m); \
962 DO_3SAME_64(INSN, gen_##INSN##_elt)
964 DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
965 DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
966 DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
967 DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
968 DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
969 DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)
971 #define DO_3SAME_32(INSN, FUNC) \
972 static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
973 uint32_t rn_ofs, uint32_t rm_ofs, \
974 uint32_t oprsz, uint32_t maxsz) \
976 static const GVecGen3 ops[4] = { \
977 { .fni4 = gen_helper_neon_##FUNC##8 }, \
978 { .fni4 = gen_helper_neon_##FUNC##16 }, \
979 { .fni4 = gen_helper_neon_##FUNC##32 }, \
980 { 0 }, \
981 }; \
982 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
984 static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
986 if (a->size > 2) { \
987 return false; \
989 return do_3same(s, a, gen_##INSN##_3s); \
993 * Some helper functions need to be passed the cpu_env. In order
994 * to use those with the gvec APIs like tcg_gen_gvec_3() we need
995 * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
996 * and which call a NeonGenTwoOpEnvFn().
998 #define WRAP_ENV_FN(WRAPNAME, FUNC) \
999 static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m) \
1001 FUNC(d, cpu_env, n, m); \
1004 #define DO_3SAME_32_ENV(INSN, FUNC) \
1005 WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8); \
1006 WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16); \
1007 WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32); \
1008 static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
1009 uint32_t rn_ofs, uint32_t rm_ofs, \
1010 uint32_t oprsz, uint32_t maxsz) \
1012 static const GVecGen3 ops[4] = { \
1013 { .fni4 = gen_##INSN##_tramp8 }, \
1014 { .fni4 = gen_##INSN##_tramp16 }, \
1015 { .fni4 = gen_##INSN##_tramp32 }, \
1016 { 0 }, \
1017 }; \
1018 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
1020 static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
1022 if (a->size > 2) { \
1023 return false; \
1025 return do_3same(s, a, gen_##INSN##_3s); \
1028 DO_3SAME_32(VHADD_S, hadd_s)
1029 DO_3SAME_32(VHADD_U, hadd_u)
1030 DO_3SAME_32(VHSUB_S, hsub_s)
1031 DO_3SAME_32(VHSUB_U, hsub_u)
1032 DO_3SAME_32(VRHADD_S, rhadd_s)
1033 DO_3SAME_32(VRHADD_U, rhadd_u)
1034 DO_3SAME_32(VRSHL_S, rshl_s)
1035 DO_3SAME_32(VRSHL_U, rshl_u)
1037 DO_3SAME_32_ENV(VQSHL_S, qshl_s)
1038 DO_3SAME_32_ENV(VQSHL_U, qshl_u)
1039 DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
1040 DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)
1042 static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
1044 /* Operations handled pairwise 32 bits at a time */
1045 TCGv_i32 tmp, tmp2, tmp3;
1047 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1048 return false;
1051 /* UNDEF accesses to D16-D31 if they don't exist. */
1052 if (!dc_isar_feature(aa32_simd_r32, s) &&
1053 ((a->vd | a->vn | a->vm) & 0x10)) {
1054 return false;
1057 if (a->size == 3) {
1058 return false;
1061 if (!vfp_access_check(s)) {
1062 return true;
1065 assert(a->q == 0); /* enforced by decode patterns */
1068 * Note that we have to be careful not to clobber the source operands
1069 * in the "vm == vd" case by storing the result of the first pass too
1070 * early. Since Q is 0 there are always just two passes, so instead
1071 * of a complicated loop over each pass we just unroll.
1073 tmp = tcg_temp_new_i32();
1074 tmp2 = tcg_temp_new_i32();
1075 tmp3 = tcg_temp_new_i32();
1077 read_neon_element32(tmp, a->vn, 0, MO_32);
1078 read_neon_element32(tmp2, a->vn, 1, MO_32);
1079 fn(tmp, tmp, tmp2);
1081 read_neon_element32(tmp3, a->vm, 0, MO_32);
1082 read_neon_element32(tmp2, a->vm, 1, MO_32);
1083 fn(tmp3, tmp3, tmp2);
1085 write_neon_element32(tmp, a->vd, 0, MO_32);
1086 write_neon_element32(tmp3, a->vd, 1, MO_32);
1088 tcg_temp_free_i32(tmp);
1089 tcg_temp_free_i32(tmp2);
1090 tcg_temp_free_i32(tmp3);
1091 return true;
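/*
 * Worked example (editorial): VPADD.I32 d0, d1, d2 computes
 * d0[0] = d1[0] + d1[1] and d0[1] = d2[0] + d2[1]; the unrolled passes
 * above keep both results in temps, so d0 == d2 is handled safely.
 */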
1094 #define DO_3SAME_PAIR(INSN, func) \
1095 static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
1097 static NeonGenTwoOpFn * const fns[] = { \
1098 gen_helper_neon_##func##8, \
1099 gen_helper_neon_##func##16, \
1100 gen_helper_neon_##func##32, \
1101 }; \
1102 if (a->size > 2) { \
1103 return false; \
1105 return do_3same_pair(s, a, fns[a->size]); \
1108 /* 32-bit pairwise ops end up the same as the elementwise versions. */
1109 #define gen_helper_neon_pmax_s32 tcg_gen_smax_i32
1110 #define gen_helper_neon_pmax_u32 tcg_gen_umax_i32
1111 #define gen_helper_neon_pmin_s32 tcg_gen_smin_i32
1112 #define gen_helper_neon_pmin_u32 tcg_gen_umin_i32
1113 #define gen_helper_neon_padd_u32 tcg_gen_add_i32
1115 DO_3SAME_PAIR(VPMAX_S, pmax_s)
1116 DO_3SAME_PAIR(VPMIN_S, pmin_s)
1117 DO_3SAME_PAIR(VPMAX_U, pmax_u)
1118 DO_3SAME_PAIR(VPMIN_U, pmin_u)
1119 DO_3SAME_PAIR(VPADD, padd_u)
1121 #define DO_3SAME_VQDMULH(INSN, FUNC) \
1122 WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16); \
1123 WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32); \
1124 static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
1125 uint32_t rn_ofs, uint32_t rm_ofs, \
1126 uint32_t oprsz, uint32_t maxsz) \
1128 static const GVecGen3 ops[2] = { \
1129 { .fni4 = gen_##INSN##_tramp16 }, \
1130 { .fni4 = gen_##INSN##_tramp32 }, \
1131 }; \
1132 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
1134 static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
1136 if (a->size != 1 && a->size != 2) { \
1137 return false; \
1139 return do_3same(s, a, gen_##INSN##_3s); \
1142 DO_3SAME_VQDMULH(VQDMULH, qdmulh)
1143 DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)
1145 #define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC) \
1146 static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \
1147 uint32_t rn_ofs, uint32_t rm_ofs, \
1148 uint32_t oprsz, uint32_t maxsz) \
1150 TCGv_ptr fpst = fpstatus_ptr(FPST); \
1151 tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst, \
1152 oprsz, maxsz, 0, FUNC); \
1153 tcg_temp_free_ptr(fpst); \
1156 #define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC) \
1157 WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC) \
1158 WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC) \
1159 static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
1161 if (a->size == MO_16) { \
1162 if (!dc_isar_feature(aa32_fp16_arith, s)) { \
1163 return false; \
1165 return do_3same(s, a, gen_##INSN##_fp16_3s); \
1167 return do_3same(s, a, gen_##INSN##_fp32_3s); \
1171 DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
1172 DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
1173 DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
1174 DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
1175 DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
1176 DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
1177 DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
1178 DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
1179 DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
1180 DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
1181 DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
1182 DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
1183 DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
1184 DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
1185 DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
1186 DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
1187 DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)
1189 WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
1190 WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
1191 WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
1192 WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)
1194 static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
1196 if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1197 return false;
1200 if (a->size == MO_16) {
1201 if (!dc_isar_feature(aa32_fp16_arith, s)) {
1202 return false;
1204 return do_3same(s, a, gen_VMAXNM_fp16_3s);
1206 return do_3same(s, a, gen_VMAXNM_fp32_3s);
1209 static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
1211 if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1212 return false;
1215 if (a->size == MO_16) {
1216 if (!dc_isar_feature(aa32_fp16_arith, s)) {
1217 return false;
1219 return do_3same(s, a, gen_VMINNM_fp16_3s);
1221 return do_3same(s, a, gen_VMINNM_fp32_3s);
1224 static bool do_3same_fp_pair(DisasContext *s, arg_3same *a,
1225 gen_helper_gvec_3_ptr *fn)
1227 /* FP pairwise operations */
1228 TCGv_ptr fpstatus;
1230 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1231 return false;
1234 /* UNDEF accesses to D16-D31 if they don't exist. */
1235 if (!dc_isar_feature(aa32_simd_r32, s) &&
1236 ((a->vd | a->vn | a->vm) & 0x10)) {
1237 return false;
1240 if (!vfp_access_check(s)) {
1241 return true;
1244 assert(a->q == 0); /* enforced by decode patterns */
1247 fpstatus = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1248 tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
1249 vfp_reg_offset(1, a->vn),
1250 vfp_reg_offset(1, a->vm),
1251 fpstatus, 8, 8, 0, fn);
1252 tcg_temp_free_ptr(fpstatus);
1254 return true;
1258  * For all the functions using this macro, size == 1 means fp16;
1259  * the fp16 case is gated on the fp16 arithmetic extension below.
1261 #define DO_3S_FP_PAIR(INSN,FUNC) \
1262 static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
1264 if (a->size == MO_16) { \
1265 if (!dc_isar_feature(aa32_fp16_arith, s)) { \
1266 return false; \
1268 return do_3same_fp_pair(s, a, FUNC##h); \
1270 return do_3same_fp_pair(s, a, FUNC##s); \
1273 DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd)
1274 DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax)
1275 DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin)
1277 static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
1279 /* Handle a 2-reg-shift insn which can be vectorized. */
1280 int vec_size = a->q ? 16 : 8;
1281 int rd_ofs = neon_full_reg_offset(a->vd);
1282 int rm_ofs = neon_full_reg_offset(a->vm);
1284 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1285 return false;
1288 /* UNDEF accesses to D16-D31 if they don't exist. */
1289 if (!dc_isar_feature(aa32_simd_r32, s) &&
1290 ((a->vd | a->vm) & 0x10)) {
1291 return false;
1294 if ((a->vm | a->vd) & a->q) {
1295 return false;
1298 if (!vfp_access_check(s)) {
1299 return true;
1302 fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
1303 return true;
1306 #define DO_2SH(INSN, FUNC) \
1307 static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
1309 return do_vector_2sh(s, a, FUNC); \
1312 DO_2SH(VSHL, tcg_gen_gvec_shli)
1313 DO_2SH(VSLI, gen_gvec_sli)
1314 DO_2SH(VSRI, gen_gvec_sri)
1315 DO_2SH(VSRA_S, gen_gvec_ssra)
1316 DO_2SH(VSRA_U, gen_gvec_usra)
1317 DO_2SH(VRSHR_S, gen_gvec_srshr)
1318 DO_2SH(VRSHR_U, gen_gvec_urshr)
1319 DO_2SH(VRSRA_S, gen_gvec_srsra)
1320 DO_2SH(VRSRA_U, gen_gvec_ursra)
1322 static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
1324 /* Signed shift out of range results in all-sign-bits */
1325 a->shift = MIN(a->shift, (8 << a->size) - 1);
1326 return do_vector_2sh(s, a, tcg_gen_gvec_sari);
1329 static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
1330 int64_t shift, uint32_t oprsz, uint32_t maxsz)
1332 tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
1335 static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
1337 /* Shift out of range is architecturally valid and results in zero. */
1338 if (a->shift >= (8 << a->size)) {
1339 return do_vector_2sh(s, a, gen_zero_rd_2sh);
1340 } else {
1341 return do_vector_2sh(s, a, tcg_gen_gvec_shri);
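/*
 * Editorial note on the out-of-range cases: VSHR.S8 #8 clamps to a
 * shift of 7, so every lane becomes copies of its sign bit, while
 * VSHR.U8 #8 simply writes zeroes via gen_zero_rd_2sh.
 */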
1345 static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
1346 NeonGenTwo64OpEnvFn *fn)
1349 * 2-reg-and-shift operations, size == 3 case, where the
1350 * function needs to be passed cpu_env.
1352 TCGv_i64 constimm;
1353 int pass;
1355 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1356 return false;
1359 /* UNDEF accesses to D16-D31 if they don't exist. */
1360 if (!dc_isar_feature(aa32_simd_r32, s) &&
1361 ((a->vd | a->vm) & 0x10)) {
1362 return false;
1365 if ((a->vm | a->vd) & a->q) {
1366 return false;
1369 if (!vfp_access_check(s)) {
1370 return true;
1374 * To avoid excessive duplication of ops we implement shift
1375 * by immediate using the variable shift operations.
1377 constimm = tcg_const_i64(dup_const(a->size, a->shift));
1379 for (pass = 0; pass < a->q + 1; pass++) {
1380 TCGv_i64 tmp = tcg_temp_new_i64();
1382 read_neon_element64(tmp, a->vm, pass, MO_64);
1383 fn(tmp, cpu_env, tmp, constimm);
1384 write_neon_element64(tmp, a->vd, pass, MO_64);
1385 tcg_temp_free_i64(tmp);
1387 tcg_temp_free_i64(constimm);
1388 return true;
1391 static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
1392 NeonGenTwoOpEnvFn *fn)
1395 * 2-reg-and-shift operations, size < 3 case, where the
1396 * helper needs to be passed cpu_env.
1398 TCGv_i32 constimm, tmp;
1399 int pass;
1401 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1402 return false;
1405 /* UNDEF accesses to D16-D31 if they don't exist. */
1406 if (!dc_isar_feature(aa32_simd_r32, s) &&
1407 ((a->vd | a->vm) & 0x10)) {
1408 return false;
1411 if ((a->vm | a->vd) & a->q) {
1412 return false;
1415 if (!vfp_access_check(s)) {
1416 return true;
1420 * To avoid excessive duplication of ops we implement shift
1421 * by immediate using the variable shift operations.
1423 constimm = tcg_const_i32(dup_const(a->size, a->shift));
1424 tmp = tcg_temp_new_i32();
1426 for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1427 read_neon_element32(tmp, a->vm, pass, MO_32);
1428 fn(tmp, cpu_env, tmp, constimm);
1429 write_neon_element32(tmp, a->vd, pass, MO_32);
1431 tcg_temp_free_i32(tmp);
1432 tcg_temp_free_i32(constimm);
1433 return true;
1436 #define DO_2SHIFT_ENV(INSN, FUNC) \
1437 static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
1439 return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64); \
1441 static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
1443 static NeonGenTwoOpEnvFn * const fns[] = { \
1444 gen_helper_neon_##FUNC##8, \
1445 gen_helper_neon_##FUNC##16, \
1446 gen_helper_neon_##FUNC##32, \
1447 }; \
1448 assert(a->size < ARRAY_SIZE(fns)); \
1449 return do_2shift_env_32(s, a, fns[a->size]); \
1452 DO_2SHIFT_ENV(VQSHLU, qshlu_s)
1453 DO_2SHIFT_ENV(VQSHL_U, qshl_u)
1454 DO_2SHIFT_ENV(VQSHL_S, qshl_s)
1456 static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
1457 NeonGenTwo64OpFn *shiftfn,
1458 NeonGenNarrowEnvFn *narrowfn)
1460 /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
1461 TCGv_i64 constimm, rm1, rm2;
1462 TCGv_i32 rd;
1464 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1465 return false;
1468 /* UNDEF accesses to D16-D31 if they don't exist. */
1469 if (!dc_isar_feature(aa32_simd_r32, s) &&
1470 ((a->vd | a->vm) & 0x10)) {
1471 return false;
1474 if (a->vm & 1) {
1475 return false;
1478 if (!vfp_access_check(s)) {
1479 return true;
1483 * This is always a right shift, and the shiftfn is always a
1484 * left-shift helper, which thus needs the negated shift count.
1486 constimm = tcg_const_i64(-a->shift);
1487 rm1 = tcg_temp_new_i64();
1488 rm2 = tcg_temp_new_i64();
1489 rd = tcg_temp_new_i32();
1491 /* Load both inputs first to avoid potential overwrite if rm == rd */
1492 read_neon_element64(rm1, a->vm, 0, MO_64);
1493 read_neon_element64(rm2, a->vm, 1, MO_64);
1495 shiftfn(rm1, rm1, constimm);
1496 narrowfn(rd, cpu_env, rm1);
1497 write_neon_element32(rd, a->vd, 0, MO_32);
1499 shiftfn(rm2, rm2, constimm);
1500 narrowfn(rd, cpu_env, rm2);
1501 write_neon_element32(rd, a->vd, 1, MO_32);
1503 tcg_temp_free_i32(rd);
1504 tcg_temp_free_i64(rm1);
1505 tcg_temp_free_i64(rm2);
1506 tcg_temp_free_i64(constimm);
1508 return true;
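/*
 * Editorial example: VSHRN.I64 d0, q1, #8 negates the count, so
 * gen_ushl_i64(rm, rm, -8) performs the logical right shift by 8
 * before each 64->32 narrowing step above.
 */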
1511 static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
1512 NeonGenTwoOpFn *shiftfn,
1513 NeonGenNarrowEnvFn *narrowfn)
1515 /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
1516 TCGv_i32 constimm, rm1, rm2, rm3, rm4;
1517 TCGv_i64 rtmp;
1518 uint32_t imm;
1520 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1521 return false;
1524 /* UNDEF accesses to D16-D31 if they don't exist. */
1525 if (!dc_isar_feature(aa32_simd_r32, s) &&
1526 ((a->vd | a->vm) & 0x10)) {
1527 return false;
1530 if (a->vm & 1) {
1531 return false;
1534 if (!vfp_access_check(s)) {
1535 return true;
1539 * This is always a right shift, and the shiftfn is always a
1540 * left-shift helper, which thus needs the negated shift count
1541 * duplicated into each lane of the immediate value.
1543 if (a->size == 1) {
1544 imm = (uint16_t)(-a->shift);
1545 imm |= imm << 16;
1546 } else {
1547 /* size == 2 */
1548 imm = -a->shift;
1550 constimm = tcg_const_i32(imm);
1552 /* Load all inputs first to avoid potential overwrite */
1553 rm1 = tcg_temp_new_i32();
1554 rm2 = tcg_temp_new_i32();
1555 rm3 = tcg_temp_new_i32();
1556 rm4 = tcg_temp_new_i32();
1557 read_neon_element32(rm1, a->vm, 0, MO_32);
1558 read_neon_element32(rm2, a->vm, 1, MO_32);
1559 read_neon_element32(rm3, a->vm, 2, MO_32);
1560 read_neon_element32(rm4, a->vm, 3, MO_32);
1561 rtmp = tcg_temp_new_i64();
1563 shiftfn(rm1, rm1, constimm);
1564 shiftfn(rm2, rm2, constimm);
1566 tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
1567 tcg_temp_free_i32(rm2);
1569 narrowfn(rm1, cpu_env, rtmp);
1570 write_neon_element32(rm1, a->vd, 0, MO_32);
1571 tcg_temp_free_i32(rm1);
1573 shiftfn(rm3, rm3, constimm);
1574 shiftfn(rm4, rm4, constimm);
1575 tcg_temp_free_i32(constimm);
1577 tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
1578 tcg_temp_free_i32(rm4);
1580 narrowfn(rm3, cpu_env, rtmp);
1581 tcg_temp_free_i64(rtmp);
1582 write_neon_element32(rm3, a->vd, 1, MO_32);
1583 tcg_temp_free_i32(rm3);
1584 return true;
1587 #define DO_2SN_64(INSN, FUNC, NARROWFUNC) \
1588 static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
1590 return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC); \
1592 #define DO_2SN_32(INSN, FUNC, NARROWFUNC) \
1593 static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
1595 return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC); \
1598 static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1600 tcg_gen_extrl_i64_i32(dest, src);
1603 static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1605 gen_helper_neon_narrow_u16(dest, src);
1608 static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1610 gen_helper_neon_narrow_u8(dest, src);
1613 DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
1614 DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
1615 DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)
1617 DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
1618 DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
1619 DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)
1621 DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
1622 DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
1623 DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)
1625 DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
1626 DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
1627 DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
1628 DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
1629 DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
1630 DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)
1632 DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
1633 DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
1634 DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)
1636 DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
1637 DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
1638 DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)
1640 DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
1641 DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
1642 DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1644 static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
1645 NeonGenWidenFn *widenfn, bool u)
1647 TCGv_i64 tmp;
1648 TCGv_i32 rm0, rm1;
1649 uint64_t widen_mask = 0;
1651 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1652 return false;
1655 /* UNDEF accesses to D16-D31 if they don't exist. */
1656 if (!dc_isar_feature(aa32_simd_r32, s) &&
1657 ((a->vd | a->vm) & 0x10)) {
1658 return false;
1661 if (a->vd & 1) {
1662 return false;
1665 if (!vfp_access_check(s)) {
1666 return true;
1670 * This is a widen-and-shift operation. The shift is always less
1671 * than the width of the source type, so after widening the input
1672 * vector we can simply shift the whole 64-bit widened register,
1673 * and then clear the potential overflow bits resulting from left
1674 * bits of the narrow input appearing as right bits of the left
1675 * neighbour narrow input. Calculate a mask of bits to clear.
1677 if ((a->shift != 0) && (a->size < 2 || u)) {
1678 int esize = 8 << a->size;
1679 widen_mask = MAKE_64BIT_MASK(0, esize);
1680 widen_mask >>= esize - a->shift;
1681 widen_mask = dup_const(a->size + 1, widen_mask);
1684 rm0 = tcg_temp_new_i32();
1685 rm1 = tcg_temp_new_i32();
1686 read_neon_element32(rm0, a->vm, 0, MO_32);
1687 read_neon_element32(rm1, a->vm, 1, MO_32);
1688 tmp = tcg_temp_new_i64();
1690 widenfn(tmp, rm0);
1691 tcg_temp_free_i32(rm0);
1692 if (a->shift != 0) {
1693 tcg_gen_shli_i64(tmp, tmp, a->shift);
1694 tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1696 write_neon_element64(tmp, a->vd, 0, MO_64);
1698 widenfn(tmp, rm1);
1699 tcg_temp_free_i32(rm1);
1700 if (a->shift != 0) {
1701 tcg_gen_shli_i64(tmp, tmp, a->shift);
1702 tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1704 write_neon_element64(tmp, a->vd, 1, MO_64);
1705 tcg_temp_free_i64(tmp);
1706 return true;
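/*
 * Worked example (editorial): VSHLL.S8 q0, d1, #3 widens eight s8
 * lanes to s16 and shifts the whole 64-bit value left by 3; the top
 * three sign-extension bits of each lane spill into its neighbour's
 * low bits, so widen_mask = dup_const(MO_16, 0x07) and the spill is
 * cleared with tcg_gen_andi_i64(tmp, tmp, ~widen_mask).
 */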
1709 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1711 static NeonGenWidenFn * const widenfn[] = {
1712 gen_helper_neon_widen_s8,
1713 gen_helper_neon_widen_s16,
1714 tcg_gen_ext_i32_i64,
1716 return do_vshll_2sh(s, a, widenfn[a->size], false);
1719 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1721 static NeonGenWidenFn * const widenfn[] = {
1722 gen_helper_neon_widen_u8,
1723 gen_helper_neon_widen_u16,
1724 tcg_gen_extu_i32_i64,
1726 return do_vshll_2sh(s, a, widenfn[a->size], true);
1729 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1730 gen_helper_gvec_2_ptr *fn)
1732 /* FP operations in 2-reg-and-shift group */
1733 int vec_size = a->q ? 16 : 8;
1734 int rd_ofs = neon_full_reg_offset(a->vd);
1735 int rm_ofs = neon_full_reg_offset(a->vm);
1736 TCGv_ptr fpst;
1738 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1739 return false;
1742 if (a->size == MO_16) {
1743 if (!dc_isar_feature(aa32_fp16_arith, s)) {
1744 return false;
1748 /* UNDEF accesses to D16-D31 if they don't exist. */
1749 if (!dc_isar_feature(aa32_simd_r32, s) &&
1750 ((a->vd | a->vm) & 0x10)) {
1751 return false;
1754 if ((a->vm | a->vd) & a->q) {
1755 return false;
1758 if (!vfp_access_check(s)) {
1759 return true;
1762 fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1763 tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
1764 tcg_temp_free_ptr(fpst);
1765 return true;
1768 #define DO_FP_2SH(INSN, FUNC) \
1769 static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
1771 return do_fp_2sh(s, a, FUNC); \
1774 DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
1775 DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
1776 DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
1777 DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)
1779 DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
1780 DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
1781 DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
1782 DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
1784 static uint64_t asimd_imm_const(uint32_t imm, int cmode, int op)
1787 * Expand the encoded constant.
1788 * Note that cmode = 2,3,4,5,6,7,10,11,12,13 imm=0 is UNPREDICTABLE.
1789 * We choose to not special-case this and will behave as if a
1790 * valid constant encoding of 0 had been given.
1791 * cmode = 15 op = 1 must UNDEF; we assume decode has handled that.
1793 switch (cmode) {
1794 case 0: case 1:
1795 /* no-op */
1796 break;
1797 case 2: case 3:
1798 imm <<= 8;
1799 break;
1800 case 4: case 5:
1801 imm <<= 16;
1802 break;
1803 case 6: case 7:
1804 imm <<= 24;
1805 break;
1806 case 8: case 9:
1807 imm |= imm << 16;
1808 break;
1809 case 10: case 11:
1810 imm = (imm << 8) | (imm << 24);
1811 break;
1812 case 12:
1813 imm = (imm << 8) | 0xff;
1814 break;
1815 case 13:
1816 imm = (imm << 16) | 0xffff;
1817 break;
1818 case 14:
1819 if (op) {
1821 * This is the only case where the top and bottom 32 bits
1822 * of the encoded constant differ.
1824 uint64_t imm64 = 0;
1825 int n;
1827 for (n = 0; n < 8; n++) {
1828 if (imm & (1 << n)) {
1829 imm64 |= (0xffULL << (n * 8));
1832 return imm64;
1834 imm |= (imm << 8) | (imm << 16) | (imm << 24);
1835 break;
1836 case 15:
1837 imm = ((imm & 0x80) << 24) | ((imm & 0x3f) << 19)
1838 | ((imm & 0x40) ? (0x1f << 25) : (1 << 30));
1839 break;
1841 if (op) {
1842 imm = ~imm;
1844 return dup_const(MO_32, imm);
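/*
 * Editorial examples of the expansion above: cmode 12 with imm 0xab
 * gives 0x0000abff replicated into both 32-bit halves; cmode 14 with
 * op set expands imm 0x81 bytewise to 0xff000000000000ff (bits 0 and 7
 * each select a 0xff byte); and op == 1 inverts the other 32-bit forms
 * before replication.
 */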
1847 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1848 GVecGen2iFn *fn)
1850 uint64_t imm;
1851 int reg_ofs, vec_size;
1853 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1854 return false;
1857 /* UNDEF accesses to D16-D31 if they don't exist. */
1858 if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1859 return false;
1862 if (a->vd & a->q) {
1863 return false;
1866 if (!vfp_access_check(s)) {
1867 return true;
1870 reg_ofs = neon_full_reg_offset(a->vd);
1871 vec_size = a->q ? 16 : 8;
1872 imm = asimd_imm_const(a->imm, a->cmode, a->op);
1874 fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1875 return true;
1878 static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
1879 int64_t c, uint32_t oprsz, uint32_t maxsz)
1881 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
1884 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1886 /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1887 GVecGen2iFn *fn;
1889 if ((a->cmode & 1) && a->cmode < 12) {
1890 /* for op=1, the imm will be inverted, so BIC becomes AND. */
1891 fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1892 } else {
1893 /* There is one unallocated cmode/op combination in this space */
1894 if (a->cmode == 15 && a->op == 1) {
1895 return false;
1897 fn = gen_VMOV_1r;
1899 return do_1reg_imm(s, a, fn);
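/*
 * Editorial sketch: odd cmode values below 12 with op == 0 are VORR
 * (tcg_gen_gvec_ori); the same cmodes with op == 1 are VBIC, done as
 * AND because asimd_imm_const has already inverted the immediate.
 * Everything else is a plain VMOV/VMVN materialised by gen_VMOV_1r.
 */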
1902 static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1903 NeonGenWidenFn *widenfn,
1904 NeonGenTwo64OpFn *opfn,
1905 int src1_mop, int src2_mop)
1907     /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VADDW/VSUBW) */
1908 TCGv_i64 rn0_64, rn1_64, rm_64;
1910 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1911 return false;
1914 /* UNDEF accesses to D16-D31 if they don't exist. */
1915 if (!dc_isar_feature(aa32_simd_r32, s) &&
1916 ((a->vd | a->vn | a->vm) & 0x10)) {
1917 return false;
1920 if (!opfn) {
1921 /* size == 3 case, which is an entirely different insn group */
1922 return false;
1925 if ((a->vd & 1) || (src1_mop == MO_Q && (a->vn & 1))) {
1926 return false;
1929 if (!vfp_access_check(s)) {
1930 return true;
1933 rn0_64 = tcg_temp_new_i64();
1934 rn1_64 = tcg_temp_new_i64();
1935 rm_64 = tcg_temp_new_i64();
1937 if (src1_mop >= 0) {
1938 read_neon_element64(rn0_64, a->vn, 0, src1_mop);
1939 } else {
1940 TCGv_i32 tmp = tcg_temp_new_i32();
1941 read_neon_element32(tmp, a->vn, 0, MO_32);
1942 widenfn(rn0_64, tmp);
1943 tcg_temp_free_i32(tmp);
1945 if (src2_mop >= 0) {
1946 read_neon_element64(rm_64, a->vm, 0, src2_mop);
1947 } else {
1948 TCGv_i32 tmp = tcg_temp_new_i32();
1949 read_neon_element32(tmp, a->vm, 0, MO_32);
1950 widenfn(rm_64, tmp);
1951 tcg_temp_free_i32(tmp);
1954 opfn(rn0_64, rn0_64, rm_64);
1957 * Load second pass inputs before storing the first pass result, to
1958 * avoid incorrect results if a narrow input overlaps with the result.
1960 if (src1_mop >= 0) {
1961 read_neon_element64(rn1_64, a->vn, 1, src1_mop);
1962 } else {
1963 TCGv_i32 tmp = tcg_temp_new_i32();
1964 read_neon_element32(tmp, a->vn, 1, MO_32);
1965 widenfn(rn1_64, tmp);
1966 tcg_temp_free_i32(tmp);
1968 if (src2_mop >= 0) {
1969 read_neon_element64(rm_64, a->vm, 1, src2_mop);
1970 } else {
1971 TCGv_i32 tmp = tcg_temp_new_i32();
1972 read_neon_element32(tmp, a->vm, 1, MO_32);
1973 widenfn(rm_64, tmp);
1974 tcg_temp_free_i32(tmp);
1977 write_neon_element64(rn0_64, a->vd, 0, MO_64);
1979 opfn(rn1_64, rn1_64, rm_64);
1980 write_neon_element64(rn1_64, a->vd, 1, MO_64);
1982 tcg_temp_free_i64(rn0_64);
1983 tcg_temp_free_i64(rn1_64);
1984 tcg_temp_free_i64(rm_64);
1986 return true;
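/*
 * Editorial example: VADDW.S16 q0, q1, d2 passes src1_mop == MO_Q (the
 * first operand is already wide) and widens only d2's elements with
 * gen_helper_neon_widen_s16, whereas VADDL.S16 widens both inputs.
 */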
1989 #define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN) \
1990 static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a) \
1992 static NeonGenWidenFn * const widenfn[] = { \
1993 gen_helper_neon_widen_##S##8, \
1994 gen_helper_neon_widen_##S##16, \
1995 NULL, NULL, \
1996 }; \
1997 static NeonGenTwo64OpFn * const addfn[] = { \
1998 gen_helper_neon_##OP##l_u16, \
1999 gen_helper_neon_##OP##l_u32, \
2000 tcg_gen_##OP##_i64, \
2001 NULL, \
2002 }; \
2003 int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1; \
2004 return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size], \
2005 SRC1WIDE ? MO_Q : narrow_mop, \
2006 narrow_mop); \
2009 DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN)
2010 DO_PREWIDEN(VADDL_U, u, add, false, 0)
2011 DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN)
2012 DO_PREWIDEN(VSUBL_U, u, sub, false, 0)
2013 DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN)
2014 DO_PREWIDEN(VADDW_U, u, add, true, 0)
2015 DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN)
2016 DO_PREWIDEN(VSUBW_U, u, sub, true, 0)
2018 static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
2019 NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
2021 /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
2022 TCGv_i64 rn_64, rm_64;
2023 TCGv_i32 rd0, rd1;
2025 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2026 return false;
2029 /* UNDEF accesses to D16-D31 if they don't exist. */
2030 if (!dc_isar_feature(aa32_simd_r32, s) &&
2031 ((a->vd | a->vn | a->vm) & 0x10)) {
2032 return false;
2035 if (!opfn || !narrowfn) {
2036 /* size == 3 case, which is an entirely different insn group */
2037 return false;
2040 if ((a->vn | a->vm) & 1) {
2041 return false;
2044 if (!vfp_access_check(s)) {
2045 return true;
2048 rn_64 = tcg_temp_new_i64();
2049 rm_64 = tcg_temp_new_i64();
2050 rd0 = tcg_temp_new_i32();
2051 rd1 = tcg_temp_new_i32();
2053 read_neon_element64(rn_64, a->vn, 0, MO_64);
2054 read_neon_element64(rm_64, a->vm, 0, MO_64);
2056 opfn(rn_64, rn_64, rm_64);
2058 narrowfn(rd0, rn_64);
2060 read_neon_element64(rn_64, a->vn, 1, MO_64);
2061 read_neon_element64(rm_64, a->vm, 1, MO_64);
2063 opfn(rn_64, rn_64, rm_64);
2065 narrowfn(rd1, rn_64);
2067 write_neon_element32(rd0, a->vd, 0, MO_32);
2068 write_neon_element32(rd1, a->vd, 1, MO_32);
2070 tcg_temp_free_i32(rd0);
2071 tcg_temp_free_i32(rd1);
2072 tcg_temp_free_i64(rn_64);
2073 tcg_temp_free_i64(rm_64);
2075 return true;
2078 #define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP) \
2079 static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a) \
2081 static NeonGenTwo64OpFn * const addfn[] = { \
2082 gen_helper_neon_##OP##l_u16, \
2083 gen_helper_neon_##OP##l_u32, \
2084 tcg_gen_##OP##_i64, \
2085 NULL, \
2086 }; \
2087 static NeonGenNarrowFn * const narrowfn[] = { \
2088 gen_helper_neon_##NARROWTYPE##_high_u8, \
2089 gen_helper_neon_##NARROWTYPE##_high_u16, \
2090 EXTOP, \
2091 NULL, \
2092 }; \
2093 return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]); \
2096 static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
2098 tcg_gen_addi_i64(rn, rn, 1u << 31);
2099 tcg_gen_extrh_i64_i32(rd, rn);
2102 DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
2103 DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
2104 DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
2105 DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
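/*
 * Editorial note: VADDHN keeps the high half of each double-width sum
 * (tcg_gen_extrh_i64_i32 in the 64-bit case), while VRADDHN first adds
 * a rounding constant, e.g. 1 << 31 in gen_narrow_round_high_u32.
 */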
2107 static bool do_long_3d(DisasContext *s, arg_3diff *a,
2108 NeonGenTwoOpWidenFn *opfn,
2109 NeonGenTwo64OpFn *accfn)
2111 /*
2112 * 3-regs different lengths, long operations.
2113 * These perform an operation on two inputs that returns a double-width
2114 * result, and then possibly perform an accumulation operation of
2115 * that result into the double-width destination.
2116 */
2117 TCGv_i64 rd0, rd1, tmp;
2118 TCGv_i32 rn, rm;
2120 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2121 return false;
2124 /* UNDEF accesses to D16-D31 if they don't exist. */
2125 if (!dc_isar_feature(aa32_simd_r32, s) &&
2126 ((a->vd | a->vn | a->vm) & 0x10)) {
2127 return false;
2130 if (!opfn) {
2131 /* size == 3 case, which is an entirely different insn group */
2132 return false;
2135 if (a->vd & 1) {
2136 return false;
2139 if (!vfp_access_check(s)) {
2140 return true;
2143 rd0 = tcg_temp_new_i64();
2144 rd1 = tcg_temp_new_i64();
2146 rn = tcg_temp_new_i32();
2147 rm = tcg_temp_new_i32();
2148 read_neon_element32(rn, a->vn, 0, MO_32);
2149 read_neon_element32(rm, a->vm, 0, MO_32);
2150 opfn(rd0, rn, rm);
2152 read_neon_element32(rn, a->vn, 1, MO_32);
2153 read_neon_element32(rm, a->vm, 1, MO_32);
2154 opfn(rd1, rn, rm);
2155 tcg_temp_free_i32(rn);
2156 tcg_temp_free_i32(rm);
2158 /* Don't store results until after all loads: they might overlap */
2159 if (accfn) {
2160 tmp = tcg_temp_new_i64();
2161 read_neon_element64(tmp, a->vd, 0, MO_64);
2162 accfn(rd0, tmp, rd0);
2163 read_neon_element64(tmp, a->vd, 1, MO_64);
2164 accfn(rd1, tmp, rd1);
2165 tcg_temp_free_i64(tmp);
2168 write_neon_element64(rd0, a->vd, 0, MO_64);
2169 write_neon_element64(rd1, a->vd, 1, MO_64);
2170 tcg_temp_free_i64(rd0);
2171 tcg_temp_free_i64(rd1);
2173 return true;
2176 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2178 static NeonGenTwoOpWidenFn * const opfn[] = {
2179 gen_helper_neon_abdl_s16,
2180 gen_helper_neon_abdl_s32,
2181 gen_helper_neon_abdl_s64,
2182 NULL,
2185 return do_long_3d(s, a, opfn[a->size], NULL);
2188 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2190 static NeonGenTwoOpWidenFn * const opfn[] = {
2191 gen_helper_neon_abdl_u16,
2192 gen_helper_neon_abdl_u32,
2193 gen_helper_neon_abdl_u64,
2194 NULL,
2197 return do_long_3d(s, a, opfn[a->size], NULL);
2200 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2202 static NeonGenTwoOpWidenFn * const opfn[] = {
2203 gen_helper_neon_abdl_s16,
2204 gen_helper_neon_abdl_s32,
2205 gen_helper_neon_abdl_s64,
2206 NULL,
2208 static NeonGenTwo64OpFn * const addfn[] = {
2209 gen_helper_neon_addl_u16,
2210 gen_helper_neon_addl_u32,
2211 tcg_gen_add_i64,
2212 NULL,
2215 return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2218 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2220 static NeonGenTwoOpWidenFn * const opfn[] = {
2221 gen_helper_neon_abdl_u16,
2222 gen_helper_neon_abdl_u32,
2223 gen_helper_neon_abdl_u64,
2224 NULL,
2226 static NeonGenTwo64OpFn * const addfn[] = {
2227 gen_helper_neon_addl_u16,
2228 gen_helper_neon_addl_u32,
2229 tcg_gen_add_i64,
2230 NULL,
2233 return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2236 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2238 TCGv_i32 lo = tcg_temp_new_i32();
2239 TCGv_i32 hi = tcg_temp_new_i32();
2241 tcg_gen_muls2_i32(lo, hi, rn, rm);
2242 tcg_gen_concat_i32_i64(rd, lo, hi);
2244 tcg_temp_free_i32(lo);
2245 tcg_temp_free_i32(hi);
2248 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2250 TCGv_i32 lo = tcg_temp_new_i32();
2251 TCGv_i32 hi = tcg_temp_new_i32();
2253 tcg_gen_mulu2_i32(lo, hi, rn, rm);
2254 tcg_gen_concat_i32_i64(rd, lo, hi);
2256 tcg_temp_free_i32(lo);
2257 tcg_temp_free_i32(hi);
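/*
 * tcg_gen_muls2_i32/mulu2_i32 produce the low and high 32 bits of the
 * product separately; concatenating them gives the full 64-bit result,
 * e.g. 0xffffffff * 0xffffffff = 0xfffffffe_00000001 in the unsigned
 * case.
 */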
2260 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2262 static NeonGenTwoOpWidenFn * const opfn[] = {
2263 gen_helper_neon_mull_s8,
2264 gen_helper_neon_mull_s16,
2265 gen_mull_s32,
2266 NULL,
2269 return do_long_3d(s, a, opfn[a->size], NULL);
2272 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2274 static NeonGenTwoOpWidenFn * const opfn[] = {
2275 gen_helper_neon_mull_u8,
2276 gen_helper_neon_mull_u16,
2277 gen_mull_u32,
2278 NULL,
2281 return do_long_3d(s, a, opfn[a->size], NULL);
2284 #define DO_VMLAL(INSN,MULL,ACC) \
2285 static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a) \
2287 static NeonGenTwoOpWidenFn * const opfn[] = { \
2288 gen_helper_neon_##MULL##8, \
2289 gen_helper_neon_##MULL##16, \
2290 gen_##MULL##32, \
2291 NULL, \
2292 }; \
2293 static NeonGenTwo64OpFn * const accfn[] = { \
2294 gen_helper_neon_##ACC##l_u16, \
2295 gen_helper_neon_##ACC##l_u32, \
2296 tcg_gen_##ACC##_i64, \
2297 NULL, \
2298 }; \
2299 return do_long_3d(s, a, opfn[a->size], accfn[a->size]); \
2302 DO_VMLAL(VMLAL_S,mull_s,add)
2303 DO_VMLAL(VMLAL_U,mull_u,add)
2304 DO_VMLAL(VMLSL_S,mull_s,sub)
2305 DO_VMLAL(VMLSL_U,mull_u,sub)
2307 static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2309 gen_helper_neon_mull_s16(rd, rn, rm);
2310 gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
2313 static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2315 gen_mull_s32(rd, rn, rm);
2316 gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
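/*
 * The doubling in VQDMULL is done as a saturating add of the product to
 * itself, which handles the single overflowing case for free: in the
 * 16-bit form, (-32768 * -32768) * 2 would be 0x80000000, which
 * saturates to 0x7fffffff and sets QC.
 */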
2319 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2321 static NeonGenTwoOpWidenFn * const opfn[] = {
2322 NULL,
2323 gen_VQDMULL_16,
2324 gen_VQDMULL_32,
2325 NULL,
2328 return do_long_3d(s, a, opfn[a->size], NULL);
2331 static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2333 gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2336 static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2338 gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2341 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2343 static NeonGenTwoOpWidenFn * const opfn[] = {
2344 NULL,
2345 gen_VQDMULL_16,
2346 gen_VQDMULL_32,
2347 NULL,
2349 static NeonGenTwo64OpFn * const accfn[] = {
2350 NULL,
2351 gen_VQDMLAL_acc_16,
2352 gen_VQDMLAL_acc_32,
2353 NULL,
2356 return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2359 static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2361 gen_helper_neon_negl_u32(rm, rm);
2362 gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2365 static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2367 tcg_gen_neg_i64(rm, rm);
2368 gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2371 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2373 static NeonGenTwoOpWidenFn * const opfn[] = {
2374 NULL,
2375 gen_VQDMULL_16,
2376 gen_VQDMULL_32,
2377 NULL,
2379 static NeonGenTwo64OpFn * const accfn[] = {
2380 NULL,
2381 gen_VQDMLSL_acc_16,
2382 gen_VQDMLSL_acc_32,
2383 NULL,
2386 return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2389 static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2391 gen_helper_gvec_3 *fn_gvec;
2393 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2394 return false;
2397 /* UNDEF accesses to D16-D31 if they don't exist. */
2398 if (!dc_isar_feature(aa32_simd_r32, s) &&
2399 ((a->vd | a->vn | a->vm) & 0x10)) {
2400 return false;
2403 if (a->vd & 1) {
2404 return false;
2407 switch (a->size) {
2408 case 0:
2409 fn_gvec = gen_helper_neon_pmull_h;
2410 break;
2411 case 2:
2412 if (!dc_isar_feature(aa32_pmull, s)) {
2413 return false;
2415 fn_gvec = gen_helper_gvec_pmull_q;
2416 break;
2417 default:
2418 return false;
2421 if (!vfp_access_check(s)) {
2422 return true;
2425 tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd),
2426 neon_full_reg_offset(a->vn),
2427 neon_full_reg_offset(a->vm),
2428 16, 16, 0, fn_gvec);
2429 return true;
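/*
 * Polynomial VMULL: size 0 is the baseline 8x8->16 VMULL.P8, while
 * size 2 is the 64x64->128 VMULL.P64, gated on the separate aa32_pmull
 * feature bit from the crypto extensions.
 */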
2432 static void gen_neon_dup_low16(TCGv_i32 var)
2434 TCGv_i32 tmp = tcg_temp_new_i32();
2435 tcg_gen_ext16u_i32(var, var);
2436 tcg_gen_shli_i32(tmp, var, 16);
2437 tcg_gen_or_i32(var, var, tmp);
2438 tcg_temp_free_i32(tmp);
2441 static void gen_neon_dup_high16(TCGv_i32 var)
2443 TCGv_i32 tmp = tcg_temp_new_i32();
2444 tcg_gen_andi_i32(var, var, 0xffff0000);
2445 tcg_gen_shri_i32(tmp, var, 16);
2446 tcg_gen_or_i32(var, var, tmp);
2447 tcg_temp_free_i32(tmp);
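/*
 * For illustration: with var = 0xaabbccdd, gen_neon_dup_low16() leaves
 * 0xccddccdd and gen_neon_dup_high16() leaves 0xaabbaabb, i.e. one
 * 16-bit scalar replicated into both halves of the 32-bit value.
 */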
2450 static inline TCGv_i32 neon_get_scalar(int size, int reg)
2452 TCGv_i32 tmp = tcg_temp_new_i32();
2453 if (size == MO_16) {
2454 read_neon_element32(tmp, reg & 7, reg >> 4, MO_32);
2455 if (reg & 8) {
2456 gen_neon_dup_high16(tmp);
2457 } else {
2458 gen_neon_dup_low16(tmp);
2460 } else {
2461 read_neon_element32(tmp, reg & 15, reg >> 4, MO_32);
2463 return tmp;
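/*
 * Note the scalar encoding: for MO_16 the register is Dm<2:0> and the
 * lane number is bits [4:3] of 'reg', decomposed above as bit 4 picking
 * the 32-bit element and bit 3 picking its high or low half. For MO_32
 * the register is reg<3:0> and the lane is just bit 4.
 */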
2466 static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2467 NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2469 /*
2470 * Two registers and a scalar: perform an operation between
2471 * the input elements and the scalar, and then possibly
2472 * perform an accumulation operation of that result into the
2473 * destination.
2474 */
2475 TCGv_i32 scalar, tmp;
2476 int pass;
2478 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2479 return false;
2482 /* UNDEF accesses to D16-D31 if they don't exist. */
2483 if (!dc_isar_feature(aa32_simd_r32, s) &&
2484 ((a->vd | a->vn | a->vm) & 0x10)) {
2485 return false;
2488 if (!opfn) {
2489 /* Bad size (including size == 3, which is a different insn group) */
2490 return false;
2493 if (a->q && ((a->vd | a->vn) & 1)) {
2494 return false;
2497 if (!vfp_access_check(s)) {
2498 return true;
2501 scalar = neon_get_scalar(a->size, a->vm);
2502 tmp = tcg_temp_new_i32();
2504 for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2505 read_neon_element32(tmp, a->vn, pass, MO_32);
2506 opfn(tmp, tmp, scalar);
2507 if (accfn) {
2508 TCGv_i32 rd = tcg_temp_new_i32();
2509 read_neon_element32(rd, a->vd, pass, MO_32);
2510 accfn(tmp, rd, tmp);
2511 tcg_temp_free_i32(rd);
2513 write_neon_element32(tmp, a->vd, pass, MO_32);
2515 tcg_temp_free_i32(tmp);
2516 tcg_temp_free_i32(scalar);
2517 return true;
2520 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2522 static NeonGenTwoOpFn * const opfn[] = {
2523 NULL,
2524 gen_helper_neon_mul_u16,
2525 tcg_gen_mul_i32,
2526 NULL,
2529 return do_2scalar(s, a, opfn[a->size], NULL);
2532 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2534 static NeonGenTwoOpFn * const opfn[] = {
2535 NULL,
2536 gen_helper_neon_mul_u16,
2537 tcg_gen_mul_i32,
2538 NULL,
2540 static NeonGenTwoOpFn * const accfn[] = {
2541 NULL,
2542 gen_helper_neon_add_u16,
2543 tcg_gen_add_i32,
2544 NULL,
2547 return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2550 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2552 static NeonGenTwoOpFn * const opfn[] = {
2553 NULL,
2554 gen_helper_neon_mul_u16,
2555 tcg_gen_mul_i32,
2556 NULL,
2558 static NeonGenTwoOpFn * const accfn[] = {
2559 NULL,
2560 gen_helper_neon_sub_u16,
2561 tcg_gen_sub_i32,
2562 NULL,
2565 return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2568 static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
2569 gen_helper_gvec_3_ptr *fn)
2571 /* Two registers and a scalar, using gvec */
2572 int vec_size = a->q ? 16 : 8;
2573 int rd_ofs = neon_full_reg_offset(a->vd);
2574 int rn_ofs = neon_full_reg_offset(a->vn);
2575 int rm_ofs;
2576 int idx;
2577 TCGv_ptr fpstatus;
2579 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2580 return false;
2583 /* UNDEF accesses to D16-D31 if they don't exist. */
2584 if (!dc_isar_feature(aa32_simd_r32, s) &&
2585 ((a->vd | a->vn | a->vm) & 0x10)) {
2586 return false;
2589 if (!fn) {
2590 /* Bad size (including size == 3, which is a different insn group) */
2591 return false;
2594 if (a->q && ((a->vd | a->vn) & 1)) {
2595 return false;
2598 if (!vfp_access_check(s)) {
2599 return true;
2602 /* a->vm is M:Vm, which encodes both register and index */
2603 idx = extract32(a->vm, a->size + 2, 2);
2604 a->vm = extract32(a->vm, 0, a->size + 2);
2605 rm_ofs = neon_full_reg_offset(a->vm);
2607 fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
2608 tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
2609 vec_size, vec_size, idx, fn);
2610 tcg_temp_free_ptr(fpstatus);
2611 return true;
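/*
 * Worked example: with a->size == MO_16 and a->vm == 0b10110,
 * idx = extract32(0b10110, 3, 2) = 2 and a->vm becomes 0b110, so the
 * scalar is the third 16-bit lane of D6.
 */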
2614 #define DO_VMUL_F_2sc(NAME, FUNC) \
2615 static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a) \
2617 static gen_helper_gvec_3_ptr * const opfn[] = { \
2618 NULL, \
2619 gen_helper_##FUNC##_h, \
2620 gen_helper_##FUNC##_s, \
2621 NULL, \
2622 }; \
2623 if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
2624 return false; \
2626 return do_2scalar_fp_vec(s, a, opfn[a->size]); \
2629 DO_VMUL_F_2sc(VMUL, gvec_fmul_idx)
2630 DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx)
2631 DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx)
2633 WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
2634 WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
2635 WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
2636 WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2638 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2640 static NeonGenTwoOpFn * const opfn[] = {
2641 NULL,
2642 gen_VQDMULH_16,
2643 gen_VQDMULH_32,
2644 NULL,
2647 return do_2scalar(s, a, opfn[a->size], NULL);
2650 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2652 static NeonGenTwoOpFn * const opfn[] = {
2653 NULL,
2654 gen_VQRDMULH_16,
2655 gen_VQRDMULH_32,
2656 NULL,
2659 return do_2scalar(s, a, opfn[a->size], NULL);
2662 static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2663 NeonGenThreeOpEnvFn *opfn)
2665 /*
2666 * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2667 * performs a kind of fused op-then-accumulate using a helper
2668 * function that takes all of rd, rn and the scalar at once.
2669 */
2670 TCGv_i32 scalar, rn, rd;
2671 int pass;
2673 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2674 return false;
2677 if (!dc_isar_feature(aa32_rdm, s)) {
2678 return false;
2681 /* UNDEF accesses to D16-D31 if they don't exist. */
2682 if (!dc_isar_feature(aa32_simd_r32, s) &&
2683 ((a->vd | a->vn | a->vm) & 0x10)) {
2684 return false;
2687 if (!opfn) {
2688 /* Bad size (including size == 3, which is a different insn group) */
2689 return false;
2692 if (a->q && ((a->vd | a->vn) & 1)) {
2693 return false;
2696 if (!vfp_access_check(s)) {
2697 return true;
2700 scalar = neon_get_scalar(a->size, a->vm);
2701 rn = tcg_temp_new_i32();
2702 rd = tcg_temp_new_i32();
2704 for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2705 read_neon_element32(rn, a->vn, pass, MO_32);
2706 read_neon_element32(rd, a->vd, pass, MO_32);
2707 opfn(rd, cpu_env, rn, scalar, rd);
2708 write_neon_element32(rd, a->vd, pass, MO_32);
2710 tcg_temp_free_i32(rn);
2711 tcg_temp_free_i32(rd);
2712 tcg_temp_free_i32(scalar);
2714 return true;
2717 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2719 static NeonGenThreeOpEnvFn *opfn[] = {
2720 NULL,
2721 gen_helper_neon_qrdmlah_s16,
2722 gen_helper_neon_qrdmlah_s32,
2723 NULL,
2725 return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2728 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2730 static NeonGenThreeOpEnvFn *opfn[] = {
2731 NULL,
2732 gen_helper_neon_qrdmlsh_s16,
2733 gen_helper_neon_qrdmlsh_s32,
2734 NULL,
2736 return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2739 static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2740 NeonGenTwoOpWidenFn *opfn,
2741 NeonGenTwo64OpFn *accfn)
2743 /*
2744 * Two registers and a scalar, long operations: perform an
2745 * operation on the input elements and the scalar which produces
2746 * a double-width result, and then possibly perform an accumulation
2747 * operation of that result into the destination.
2748 */
2749 TCGv_i32 scalar, rn;
2750 TCGv_i64 rn0_64, rn1_64;
2752 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2753 return false;
2756 /* UNDEF accesses to D16-D31 if they don't exist. */
2757 if (!dc_isar_feature(aa32_simd_r32, s) &&
2758 ((a->vd | a->vn | a->vm) & 0x10)) {
2759 return false;
2762 if (!opfn) {
2763 /* Bad size (including size == 3, which is a different insn group) */
2764 return false;
2767 if (a->vd & 1) {
2768 return false;
2771 if (!vfp_access_check(s)) {
2772 return true;
2775 scalar = neon_get_scalar(a->size, a->vm);
2777 /* Load all inputs before writing any outputs, in case of overlap */
2778 rn = tcg_temp_new_i32();
2779 read_neon_element32(rn, a->vn, 0, MO_32);
2780 rn0_64 = tcg_temp_new_i64();
2781 opfn(rn0_64, rn, scalar);
2783 read_neon_element32(rn, a->vn, 1, MO_32);
2784 rn1_64 = tcg_temp_new_i64();
2785 opfn(rn1_64, rn, scalar);
2786 tcg_temp_free_i32(rn);
2787 tcg_temp_free_i32(scalar);
2789 if (accfn) {
2790 TCGv_i64 t64 = tcg_temp_new_i64();
2791 read_neon_element64(t64, a->vd, 0, MO_64);
2792 accfn(rn0_64, t64, rn0_64);
2793 read_neon_element64(t64, a->vd, 1, MO_64);
2794 accfn(rn1_64, t64, rn1_64);
2795 tcg_temp_free_i64(t64);
2798 write_neon_element64(rn0_64, a->vd, 0, MO_64);
2799 write_neon_element64(rn1_64, a->vd, 1, MO_64);
2800 tcg_temp_free_i64(rn0_64);
2801 tcg_temp_free_i64(rn1_64);
2802 return true;
2805 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2807 static NeonGenTwoOpWidenFn * const opfn[] = {
2808 NULL,
2809 gen_helper_neon_mull_s16,
2810 gen_mull_s32,
2811 NULL,
2814 return do_2scalar_long(s, a, opfn[a->size], NULL);
2817 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2819 static NeonGenTwoOpWidenFn * const opfn[] = {
2820 NULL,
2821 gen_helper_neon_mull_u16,
2822 gen_mull_u32,
2823 NULL,
2826 return do_2scalar_long(s, a, opfn[a->size], NULL);
2829 #define DO_VMLAL_2SC(INSN, MULL, ACC) \
2830 static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a) \
2832 static NeonGenTwoOpWidenFn * const opfn[] = { \
2833 NULL, \
2834 gen_helper_neon_##MULL##16, \
2835 gen_##MULL##32, \
2836 NULL, \
2837 }; \
2838 static NeonGenTwo64OpFn * const accfn[] = { \
2839 NULL, \
2840 gen_helper_neon_##ACC##l_u32, \
2841 tcg_gen_##ACC##_i64, \
2842 NULL, \
2843 }; \
2844 return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]); \
2847 DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2848 DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2849 DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2850 DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2852 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2854 static NeonGenTwoOpWidenFn * const opfn[] = {
2855 NULL,
2856 gen_VQDMULL_16,
2857 gen_VQDMULL_32,
2858 NULL,
2861 return do_2scalar_long(s, a, opfn[a->size], NULL);
2864 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2866 static NeonGenTwoOpWidenFn * const opfn[] = {
2867 NULL,
2868 gen_VQDMULL_16,
2869 gen_VQDMULL_32,
2870 NULL,
2872 static NeonGenTwo64OpFn * const accfn[] = {
2873 NULL,
2874 gen_VQDMLAL_acc_16,
2875 gen_VQDMLAL_acc_32,
2876 NULL,
2879 return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2882 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2884 static NeonGenTwoOpWidenFn * const opfn[] = {
2885 NULL,
2886 gen_VQDMULL_16,
2887 gen_VQDMULL_32,
2888 NULL,
2890 static NeonGenTwo64OpFn * const accfn[] = {
2891 NULL,
2892 gen_VQDMLSL_acc_16,
2893 gen_VQDMLSL_acc_32,
2894 NULL,
2897 return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2900 static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2902 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2903 return false;
2906 /* UNDEF accesses to D16-D31 if they don't exist. */
2907 if (!dc_isar_feature(aa32_simd_r32, s) &&
2908 ((a->vd | a->vn | a->vm) & 0x10)) {
2909 return false;
2912 if ((a->vn | a->vm | a->vd) & a->q) {
2913 return false;
2916 if (a->imm > 7 && !a->q) {
2917 return false;
2920 if (!vfp_access_check(s)) {
2921 return true;
2924 if (!a->q) {
2925 /* Extract 64 bits from <Vm:Vn> */
2926 TCGv_i64 left, right, dest;
2928 left = tcg_temp_new_i64();
2929 right = tcg_temp_new_i64();
2930 dest = tcg_temp_new_i64();
2932 read_neon_element64(right, a->vn, 0, MO_64);
2933 read_neon_element64(left, a->vm, 0, MO_64);
2934 tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2935 write_neon_element64(dest, a->vd, 0, MO_64);
2937 tcg_temp_free_i64(left);
2938 tcg_temp_free_i64(right);
2939 tcg_temp_free_i64(dest);
2940 } else {
2941 /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2942 TCGv_i64 left, middle, right, destleft, destright;
2944 left = tcg_temp_new_i64();
2945 middle = tcg_temp_new_i64();
2946 right = tcg_temp_new_i64();
2947 destleft = tcg_temp_new_i64();
2948 destright = tcg_temp_new_i64();
2950 if (a->imm < 8) {
2951 read_neon_element64(right, a->vn, 0, MO_64);
2952 read_neon_element64(middle, a->vn, 1, MO_64);
2953 tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2954 read_neon_element64(left, a->vm, 0, MO_64);
2955 tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2956 } else {
2957 read_neon_element64(right, a->vn, 1, MO_64);
2958 read_neon_element64(middle, a->vm, 0, MO_64);
2959 tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2960 read_neon_element64(left, a->vm, 1, MO_64);
2961 tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2964 write_neon_element64(destright, a->vd, 0, MO_64);
2965 write_neon_element64(destleft, a->vd, 1, MO_64);
2967 tcg_temp_free_i64(destright);
2968 tcg_temp_free_i64(destleft);
2969 tcg_temp_free_i64(right);
2970 tcg_temp_free_i64(middle);
2971 tcg_temp_free_i64(left);
2973 return true;
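/*
 * For the 64-bit case above: tcg_gen_extract2_i64(dest, right, left,
 * imm * 8) computes (right >> (imm * 8)) | (left << (64 - imm * 8)), so
 * for VEXT.8 with imm == 3 the result is bytes 3..7 of Vn followed by
 * bytes 0..2 of Vm, matching the architectural byte extraction from
 * <Vm:Vn>.
 */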
2976 static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2978 TCGv_i64 val, def;
2979 TCGv_i32 desc;
2981 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2982 return false;
2985 /* UNDEF accesses to D16-D31 if they don't exist. */
2986 if (!dc_isar_feature(aa32_simd_r32, s) &&
2987 ((a->vd | a->vn | a->vm) & 0x10)) {
2988 return false;
2991 if ((a->vn + a->len + 1) > 32) {
2992 /*
2993 * This is UNPREDICTABLE; we choose to UNDEF to avoid the
2994 * helper function running off the end of the register file.
2995 */
2996 return false;
2999 if (!vfp_access_check(s)) {
3000 return true;
3003 desc = tcg_const_i32((a->vn << 2) | a->len);
3004 def = tcg_temp_new_i64();
3005 if (a->op) {
3006 read_neon_element64(def, a->vd, 0, MO_64);
3007 } else {
3008 tcg_gen_movi_i64(def, 0);
3010 val = tcg_temp_new_i64();
3011 read_neon_element64(val, a->vm, 0, MO_64);
3013 gen_helper_neon_tbl(val, cpu_env, desc, val, def);
3014 write_neon_element64(val, a->vd, 0, MO_64);
3016 tcg_temp_free_i64(def);
3017 tcg_temp_free_i64(val);
3018 tcg_temp_free_i32(desc);
3019 return true;
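/*
 * The desc word packs the table base register and (number of table
 * registers - 1) for the helper; a->op distinguishes VTBX, where
 * out-of-range indices keep the old destination bytes loaded into 'def'
 * above, from VTBL, where they produce zero.
 */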
3022 static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
3024 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3025 return false;
3028 /* UNDEF accesses to D16-D31 if they don't exist. */
3029 if (!dc_isar_feature(aa32_simd_r32, s) &&
3030 ((a->vd | a->vm) & 0x10)) {
3031 return false;
3034 if (a->vd & a->q) {
3035 return false;
3038 if (!vfp_access_check(s)) {
3039 return true;
3042 tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd),
3043 neon_element_offset(a->vm, a->index, a->size),
3044 a->q ? 16 : 8, a->q ? 16 : 8);
3045 return true;
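/*
 * tcg_gen_gvec_dup_mem() replicates the selected element straight from
 * its offset in CPUState across the destination vector, so no TCG
 * temporary is needed for VDUP (scalar).
 */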
3048 static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
3050 int pass, half;
3051 TCGv_i32 tmp[2];
3053 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3054 return false;
3057 /* UNDEF accesses to D16-D31 if they don't exist. */
3058 if (!dc_isar_feature(aa32_simd_r32, s) &&
3059 ((a->vd | a->vm) & 0x10)) {
3060 return false;
3063 if ((a->vd | a->vm) & a->q) {
3064 return false;
3067 if (a->size == 3) {
3068 return false;
3071 if (!vfp_access_check(s)) {
3072 return true;
3075 tmp[0] = tcg_temp_new_i32();
3076 tmp[1] = tcg_temp_new_i32();
3078 for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3079 for (half = 0; half < 2; half++) {
3080 read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32);
3081 switch (a->size) {
3082 case 0:
3083 tcg_gen_bswap32_i32(tmp[half], tmp[half]);
3084 break;
3085 case 1:
3086 gen_swap_half(tmp[half], tmp[half]);
3087 break;
3088 case 2:
3089 break;
3090 default:
3091 g_assert_not_reached();
3094 write_neon_element32(tmp[1], a->vd, pass * 2, MO_32);
3095 write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32);
3098 tcg_temp_free_i32(tmp[0]);
3099 tcg_temp_free_i32(tmp[1]);
3100 return true;
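/*
 * VREV64 is implemented as a swap of the two 32-bit halves of each
 * 64-bit element (tmp[1] is written back to the even slot, tmp[0] to the
 * odd one), plus a byte or halfword swap inside each 32-bit word for the
 * smaller element sizes; size 2 needs no inner swap.
 */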
3103 static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
3104 NeonGenWidenFn *widenfn,
3105 NeonGenTwo64OpFn *opfn,
3106 NeonGenTwo64OpFn *accfn)
3108 /*
3109 * Pairwise long operations: widen both halves of the pair,
3110 * combine the pairs with the opfn, and then possibly accumulate
3111 * into the destination with the accfn.
3112 */
3113 int pass;
3115 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3116 return false;
3119 /* UNDEF accesses to D16-D31 if they don't exist. */
3120 if (!dc_isar_feature(aa32_simd_r32, s) &&
3121 ((a->vd | a->vm) & 0x10)) {
3122 return false;
3125 if ((a->vd | a->vm) & a->q) {
3126 return false;
3129 if (!widenfn) {
3130 return false;
3133 if (!vfp_access_check(s)) {
3134 return true;
3137 for (pass = 0; pass < a->q + 1; pass++) {
3138 TCGv_i32 tmp;
3139 TCGv_i64 rm0_64, rm1_64, rd_64;
3141 rm0_64 = tcg_temp_new_i64();
3142 rm1_64 = tcg_temp_new_i64();
3143 rd_64 = tcg_temp_new_i64();
3145 tmp = tcg_temp_new_i32();
3146 read_neon_element32(tmp, a->vm, pass * 2, MO_32);
3147 widenfn(rm0_64, tmp);
3148 read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32);
3149 widenfn(rm1_64, tmp);
3150 tcg_temp_free_i32(tmp);
3152 opfn(rd_64, rm0_64, rm1_64);
3153 tcg_temp_free_i64(rm0_64);
3154 tcg_temp_free_i64(rm1_64);
3156 if (accfn) {
3157 TCGv_i64 tmp64 = tcg_temp_new_i64();
3158 read_neon_element64(tmp64, a->vd, pass, MO_64);
3159 accfn(rd_64, tmp64, rd_64);
3160 tcg_temp_free_i64(tmp64);
3162 write_neon_element64(rd_64, a->vd, pass, MO_64);
3163 tcg_temp_free_i64(rd_64);
3165 return true;
3168 static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
3170 static NeonGenWidenFn * const widenfn[] = {
3171 gen_helper_neon_widen_s8,
3172 gen_helper_neon_widen_s16,
3173 tcg_gen_ext_i32_i64,
3174 NULL,
3176 static NeonGenTwo64OpFn * const opfn[] = {
3177 gen_helper_neon_paddl_u16,
3178 gen_helper_neon_paddl_u32,
3179 tcg_gen_add_i64,
3180 NULL,
3183 return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3186 static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
3188 static NeonGenWidenFn * const widenfn[] = {
3189 gen_helper_neon_widen_u8,
3190 gen_helper_neon_widen_u16,
3191 tcg_gen_extu_i32_i64,
3192 NULL,
3194 static NeonGenTwo64OpFn * const opfn[] = {
3195 gen_helper_neon_paddl_u16,
3196 gen_helper_neon_paddl_u32,
3197 tcg_gen_add_i64,
3198 NULL,
3201 return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3204 static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
3206 static NeonGenWidenFn * const widenfn[] = {
3207 gen_helper_neon_widen_s8,
3208 gen_helper_neon_widen_s16,
3209 tcg_gen_ext_i32_i64,
3210 NULL,
3212 static NeonGenTwo64OpFn * const opfn[] = {
3213 gen_helper_neon_paddl_u16,
3214 gen_helper_neon_paddl_u32,
3215 tcg_gen_add_i64,
3216 NULL,
3218 static NeonGenTwo64OpFn * const accfn[] = {
3219 gen_helper_neon_addl_u16,
3220 gen_helper_neon_addl_u32,
3221 tcg_gen_add_i64,
3222 NULL,
3225 return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3226 accfn[a->size]);
3229 static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
3231 static NeonGenWidenFn * const widenfn[] = {
3232 gen_helper_neon_widen_u8,
3233 gen_helper_neon_widen_u16,
3234 tcg_gen_extu_i32_i64,
3235 NULL,
3237 static NeonGenTwo64OpFn * const opfn[] = {
3238 gen_helper_neon_paddl_u16,
3239 gen_helper_neon_paddl_u32,
3240 tcg_gen_add_i64,
3241 NULL,
3243 static NeonGenTwo64OpFn * const accfn[] = {
3244 gen_helper_neon_addl_u16,
3245 gen_helper_neon_addl_u32,
3246 tcg_gen_add_i64,
3247 NULL,
3250 return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3251 accfn[a->size]);
3254 typedef void ZipFn(TCGv_ptr, TCGv_ptr);
3256 static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
3257 ZipFn *fn)
3259 TCGv_ptr pd, pm;
3261 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3262 return false;
3265 /* UNDEF accesses to D16-D31 if they don't exist. */
3266 if (!dc_isar_feature(aa32_simd_r32, s) &&
3267 ((a->vd | a->vm) & 0x10)) {
3268 return false;
3271 if ((a->vd | a->vm) & a->q) {
3272 return false;
3275 if (!fn) {
3276 /* Bad size or size/q combination */
3277 return false;
3280 if (!vfp_access_check(s)) {
3281 return true;
3284 pd = vfp_reg_ptr(true, a->vd);
3285 pm = vfp_reg_ptr(true, a->vm);
3286 fn(pd, pm);
3287 tcg_temp_free_ptr(pd);
3288 tcg_temp_free_ptr(pm);
3289 return true;
3292 static bool trans_VUZP(DisasContext *s, arg_2misc *a)
3294 static ZipFn * const fn[2][4] = {
3296 gen_helper_neon_unzip8,
3297 gen_helper_neon_unzip16,
3298 NULL,
3299 NULL,
3300 }, {
3301 gen_helper_neon_qunzip8,
3302 gen_helper_neon_qunzip16,
3303 gen_helper_neon_qunzip32,
3304 NULL,
3307 return do_zip_uzp(s, a, fn[a->q][a->size]);
3310 static bool trans_VZIP(DisasContext *s, arg_2misc *a)
3312 static ZipFn * const fn[2][4] = {
3314 gen_helper_neon_zip8,
3315 gen_helper_neon_zip16,
3316 NULL,
3317 NULL,
3318 }, {
3319 gen_helper_neon_qzip8,
3320 gen_helper_neon_qzip16,
3321 gen_helper_neon_qzip32,
3322 NULL,
3325 return do_zip_uzp(s, a, fn[a->q][a->size]);
3328 static bool do_vmovn(DisasContext *s, arg_2misc *a,
3329 NeonGenNarrowEnvFn *narrowfn)
3331 TCGv_i64 rm;
3332 TCGv_i32 rd0, rd1;
3334 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3335 return false;
3338 /* UNDEF accesses to D16-D31 if they don't exist. */
3339 if (!dc_isar_feature(aa32_simd_r32, s) &&
3340 ((a->vd | a->vm) & 0x10)) {
3341 return false;
3344 if (a->vm & 1) {
3345 return false;
3348 if (!narrowfn) {
3349 return false;
3352 if (!vfp_access_check(s)) {
3353 return true;
3356 rm = tcg_temp_new_i64();
3357 rd0 = tcg_temp_new_i32();
3358 rd1 = tcg_temp_new_i32();
3360 read_neon_element64(rm, a->vm, 0, MO_64);
3361 narrowfn(rd0, cpu_env, rm);
3362 read_neon_element64(rm, a->vm, 1, MO_64);
3363 narrowfn(rd1, cpu_env, rm);
3364 write_neon_element32(rd0, a->vd, 0, MO_32);
3365 write_neon_element32(rd1, a->vd, 1, MO_32);
3366 tcg_temp_free_i32(rd0);
3367 tcg_temp_free_i32(rd1);
3368 tcg_temp_free_i64(rm);
3369 return true;
3372 #define DO_VMOVN(INSN, FUNC) \
3373 static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
3375 static NeonGenNarrowEnvFn * const narrowfn[] = { \
3376 FUNC##8, \
3377 FUNC##16, \
3378 FUNC##32, \
3379 NULL, \
3380 }; \
3381 return do_vmovn(s, a, narrowfn[a->size]); \
3384 DO_VMOVN(VMOVN, gen_neon_narrow_u)
3385 DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
3386 DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
3387 DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)
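/*
 * The plain VMOVN is a truncating narrow; it shares the
 * NeonGenNarrowEnvFn signature, but only the saturating VQMOVN/VQMOVUN
 * variants really need cpu_env, so their helpers can set QC when a
 * value does not fit. VQMOVUN ("unarrow") narrows a signed input with
 * unsigned saturation.
 */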
3389 static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
3391 TCGv_i32 rm0, rm1;
3392 TCGv_i64 rd;
3393 static NeonGenWidenFn * const widenfns[] = {
3394 gen_helper_neon_widen_u8,
3395 gen_helper_neon_widen_u16,
3396 tcg_gen_extu_i32_i64,
3397 NULL,
3399 NeonGenWidenFn *widenfn = widenfns[a->size];
3401 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3402 return false;
3405 /* UNDEF accesses to D16-D31 if they don't exist. */
3406 if (!dc_isar_feature(aa32_simd_r32, s) &&
3407 ((a->vd | a->vm) & 0x10)) {
3408 return false;
3411 if (a->vd & 1) {
3412 return false;
3415 if (!widenfn) {
3416 return false;
3419 if (!vfp_access_check(s)) {
3420 return true;
3423 rd = tcg_temp_new_i64();
3424 rm0 = tcg_temp_new_i32();
3425 rm1 = tcg_temp_new_i32();
3427 read_neon_element32(rm0, a->vm, 0, MO_32);
3428 read_neon_element32(rm1, a->vm, 1, MO_32);
3430 widenfn(rd, rm0);
3431 tcg_gen_shli_i64(rd, rd, 8 << a->size);
3432 write_neon_element64(rd, a->vd, 0, MO_64);
3433 widenfn(rd, rm1);
3434 tcg_gen_shli_i64(rd, rd, 8 << a->size);
3435 write_neon_element64(rd, a->vd, 1, MO_64);
3437 tcg_temp_free_i64(rd);
3438 tcg_temp_free_i32(rm0);
3439 tcg_temp_free_i32(rm1);
3440 return true;
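/*
 * This is the 2-reg-misc VSHLL encoding, whose shift count is always the
 * input element width (8 << a->size): widen u8->u16 and shift by 8,
 * u16->u32 by 16, u32->u64 by 32.
 */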
3443 static bool trans_VCVT_B16_F32(DisasContext *s, arg_2misc *a)
3445 TCGv_ptr fpst;
3446 TCGv_i64 tmp;
3447 TCGv_i32 dst0, dst1;
3449 if (!dc_isar_feature(aa32_bf16, s)) {
3450 return false;
3453 /* UNDEF accesses to D16-D31 if they don't exist. */
3454 if (!dc_isar_feature(aa32_simd_r32, s) &&
3455 ((a->vd | a->vm) & 0x10)) {
3456 return false;
3459 if ((a->vm & 1) || (a->size != 1)) {
3460 return false;
3463 if (!vfp_access_check(s)) {
3464 return true;
3467 fpst = fpstatus_ptr(FPST_STD);
3468 tmp = tcg_temp_new_i64();
3469 dst0 = tcg_temp_new_i32();
3470 dst1 = tcg_temp_new_i32();
3472 read_neon_element64(tmp, a->vm, 0, MO_64);
3473 gen_helper_bfcvt_pair(dst0, tmp, fpst);
3475 read_neon_element64(tmp, a->vm, 1, MO_64);
3476 gen_helper_bfcvt_pair(dst1, tmp, fpst);
3478 write_neon_element32(dst0, a->vd, 0, MO_32);
3479 write_neon_element32(dst1, a->vd, 1, MO_32);
3481 tcg_temp_free_i64(tmp);
3482 tcg_temp_free_i32(dst0);
3483 tcg_temp_free_i32(dst1);
3484 tcg_temp_free_ptr(fpst);
3485 return true;
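/*
 * As its name and the 64-bit-in/32-bit-out shape suggest,
 * gen_helper_bfcvt_pair converts a pair of f32 values to two bfloat16
 * results packed into one 32-bit word, so the four f32 lanes of Qm
 * narrow into the four bf16 lanes of Dd.
 */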
3488 static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
3490 TCGv_ptr fpst;
3491 TCGv_i32 ahp, tmp, tmp2, tmp3;
3493 if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3494 !dc_isar_feature(aa32_fp16_spconv, s)) {
3495 return false;
3498 /* UNDEF accesses to D16-D31 if they don't exist. */
3499 if (!dc_isar_feature(aa32_simd_r32, s) &&
3500 ((a->vd | a->vm) & 0x10)) {
3501 return false;
3504 if ((a->vm & 1) || (a->size != 1)) {
3505 return false;
3508 if (!vfp_access_check(s)) {
3509 return true;
3512 fpst = fpstatus_ptr(FPST_STD);
3513 ahp = get_ahp_flag();
3514 tmp = tcg_temp_new_i32();
3515 read_neon_element32(tmp, a->vm, 0, MO_32);
3516 gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3517 tmp2 = tcg_temp_new_i32();
3518 read_neon_element32(tmp2, a->vm, 1, MO_32);
3519 gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
3520 tcg_gen_shli_i32(tmp2, tmp2, 16);
3521 tcg_gen_or_i32(tmp2, tmp2, tmp);
3522 read_neon_element32(tmp, a->vm, 2, MO_32);
3523 gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3524 tmp3 = tcg_temp_new_i32();
3525 read_neon_element32(tmp3, a->vm, 3, MO_32);
3526 write_neon_element32(tmp2, a->vd, 0, MO_32);
3527 tcg_temp_free_i32(tmp2);
3528 gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
3529 tcg_gen_shli_i32(tmp3, tmp3, 16);
3530 tcg_gen_or_i32(tmp3, tmp3, tmp);
3531 write_neon_element32(tmp3, a->vd, 1, MO_32);
3532 tcg_temp_free_i32(tmp3);
3533 tcg_temp_free_i32(tmp);
3534 tcg_temp_free_i32(ahp);
3535 tcg_temp_free_ptr(fpst);
3537 return true;
3540 static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
3542 TCGv_ptr fpst;
3543 TCGv_i32 ahp, tmp, tmp2, tmp3;
3545 if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3546 !dc_isar_feature(aa32_fp16_spconv, s)) {
3547 return false;
3550 /* UNDEF accesses to D16-D31 if they don't exist. */
3551 if (!dc_isar_feature(aa32_simd_r32, s) &&
3552 ((a->vd | a->vm) & 0x10)) {
3553 return false;
3556 if ((a->vd & 1) || (a->size != 1)) {
3557 return false;
3560 if (!vfp_access_check(s)) {
3561 return true;
3564 fpst = fpstatus_ptr(FPST_STD);
3565 ahp = get_ahp_flag();
3566 tmp3 = tcg_temp_new_i32();
3567 tmp2 = tcg_temp_new_i32();
3568 tmp = tcg_temp_new_i32();
3569 read_neon_element32(tmp, a->vm, 0, MO_32);
3570 read_neon_element32(tmp2, a->vm, 1, MO_32);
3571 tcg_gen_ext16u_i32(tmp3, tmp);
3572 gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3573 write_neon_element32(tmp3, a->vd, 0, MO_32);
3574 tcg_gen_shri_i32(tmp, tmp, 16);
3575 gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
3576 write_neon_element32(tmp, a->vd, 1, MO_32);
3577 tcg_temp_free_i32(tmp);
3578 tcg_gen_ext16u_i32(tmp3, tmp2);
3579 gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3580 write_neon_element32(tmp3, a->vd, 2, MO_32);
3581 tcg_temp_free_i32(tmp3);
3582 tcg_gen_shri_i32(tmp2, tmp2, 16);
3583 gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
3584 write_neon_element32(tmp2, a->vd, 3, MO_32);
3585 tcg_temp_free_i32(tmp2);
3586 tcg_temp_free_i32(ahp);
3587 tcg_temp_free_ptr(fpst);
3589 return true;
3592 static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
3594 int vec_size = a->q ? 16 : 8;
3595 int rd_ofs = neon_full_reg_offset(a->vd);
3596 int rm_ofs = neon_full_reg_offset(a->vm);
3598 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3599 return false;
3602 /* UNDEF accesses to D16-D31 if they don't exist. */
3603 if (!dc_isar_feature(aa32_simd_r32, s) &&
3604 ((a->vd | a->vm) & 0x10)) {
3605 return false;
3608 if (a->size == 3) {
3609 return false;
3612 if ((a->vd | a->vm) & a->q) {
3613 return false;
3616 if (!vfp_access_check(s)) {
3617 return true;
3620 fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);
3622 return true;
3625 #define DO_2MISC_VEC(INSN, FN) \
3626 static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
3628 return do_2misc_vec(s, a, FN); \
3631 DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
3632 DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
3633 DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
3634 DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
3635 DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
3636 DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
3637 DO_2MISC_VEC(VCLT0, gen_gvec_clt0)
3639 static bool trans_VMVN(DisasContext *s, arg_2misc *a)
3641 if (a->size != 0) {
3642 return false;
3644 return do_2misc_vec(s, a, tcg_gen_gvec_not);
3647 #define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA) \
3648 static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \
3649 uint32_t rm_ofs, uint32_t oprsz, \
3650 uint32_t maxsz) \
3652 tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz, \
3653 DATA, FUNC); \
3656 #define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA) \
3657 static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \
3658 uint32_t rm_ofs, uint32_t oprsz, \
3659 uint32_t maxsz) \
3661 tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC); \
3664 WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
3665 WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1)
3666 WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
3667 WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1)
3668 WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
3669 WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
3670 WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)
3672 #define DO_2M_CRYPTO(INSN, FEATURE, SIZE) \
3673 static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
3675 if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) { \
3676 return false; \
3678 return do_2misc_vec(s, a, gen_##INSN); \
3681 DO_2M_CRYPTO(AESE, aa32_aes, 0)
3682 DO_2M_CRYPTO(AESD, aa32_aes, 0)
3683 DO_2M_CRYPTO(AESMC, aa32_aes, 0)
3684 DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
3685 DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
3686 DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
3687 DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)
3689 static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
3691 TCGv_i32 tmp;
3692 int pass;
3694 /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
3695 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3696 return false;
3699 /* UNDEF accesses to D16-D31 if they don't exist. */
3700 if (!dc_isar_feature(aa32_simd_r32, s) &&
3701 ((a->vd | a->vm) & 0x10)) {
3702 return false;
3705 if (!fn) {
3706 return false;
3709 if ((a->vd | a->vm) & a->q) {
3710 return false;
3713 if (!vfp_access_check(s)) {
3714 return true;
3717 tmp = tcg_temp_new_i32();
3718 for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3719 read_neon_element32(tmp, a->vm, pass, MO_32);
3720 fn(tmp, tmp);
3721 write_neon_element32(tmp, a->vd, pass, MO_32);
3723 tcg_temp_free_i32(tmp);
3725 return true;
3728 static bool trans_VREV32(DisasContext *s, arg_2misc *a)
3730 static NeonGenOneOpFn * const fn[] = {
3731 tcg_gen_bswap32_i32,
3732 gen_swap_half,
3733 NULL,
3734 NULL,
3736 return do_2misc(s, a, fn[a->size]);
3739 static bool trans_VREV16(DisasContext *s, arg_2misc *a)
3741 if (a->size != 0) {
3742 return false;
3744 return do_2misc(s, a, gen_rev16);
3747 static bool trans_VCLS(DisasContext *s, arg_2misc *a)
3749 static NeonGenOneOpFn * const fn[] = {
3750 gen_helper_neon_cls_s8,
3751 gen_helper_neon_cls_s16,
3752 gen_helper_neon_cls_s32,
3753 NULL,
3755 return do_2misc(s, a, fn[a->size]);
3758 static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm)
3760 tcg_gen_clzi_i32(rd, rm, 32);
3763 static bool trans_VCLZ(DisasContext *s, arg_2misc *a)
3765 static NeonGenOneOpFn * const fn[] = {
3766 gen_helper_neon_clz_u8,
3767 gen_helper_neon_clz_u16,
3768 do_VCLZ_32,
3769 NULL,
3771 return do_2misc(s, a, fn[a->size]);
3774 static bool trans_VCNT(DisasContext *s, arg_2misc *a)
3776 if (a->size != 0) {
3777 return false;
3779 return do_2misc(s, a, gen_helper_neon_cnt_u8);
3782 static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3783 uint32_t oprsz, uint32_t maxsz)
3785 tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs,
3786 vece == MO_16 ? 0x7fff : 0x7fffffff,
3787 oprsz, maxsz);
3790 static bool trans_VABS_F(DisasContext *s, arg_2misc *a)
3792 if (a->size == MO_16) {
3793 if (!dc_isar_feature(aa32_fp16_arith, s)) {
3794 return false;
3796 } else if (a->size != MO_32) {
3797 return false;
3799 return do_2misc_vec(s, a, gen_VABS_F);
3802 static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3803 uint32_t oprsz, uint32_t maxsz)
3805 tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs,
3806 vece == MO_16 ? 0x8000 : 0x80000000,
3807 oprsz, maxsz);
3810 static bool trans_VNEG_F(DisasContext *s, arg_2misc *a)
3812 if (a->size == MO_16) {
3813 if (!dc_isar_feature(aa32_fp16_arith, s)) {
3814 return false;
3816 } else if (a->size != MO_32) {
3817 return false;
3819 return do_2misc_vec(s, a, gen_VNEG_F);
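/*
 * Float abs/neg need no fp status: clearing or flipping the sign bit
 * with the masks above is exact, e.g. 0xc0490fdb (-pi as an f32) &
 * 0x7fffffff gives 0x40490fdb (+pi).
 */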
3822 static bool trans_VRECPE(DisasContext *s, arg_2misc *a)
3824 if (a->size != 2) {
3825 return false;
3827 return do_2misc(s, a, gen_helper_recpe_u32);
3830 static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a)
3832 if (a->size != 2) {
3833 return false;
3835 return do_2misc(s, a, gen_helper_rsqrte_u32);
3838 #define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \
3839 static void WRAPNAME(TCGv_i32 d, TCGv_i32 m) \
3841 FUNC(d, cpu_env, m); \
3844 WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8)
3845 WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16)
3846 WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32)
3847 WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8)
3848 WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16)
3849 WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32)
3851 static bool trans_VQABS(DisasContext *s, arg_2misc *a)
3853 static NeonGenOneOpFn * const fn[] = {
3854 gen_VQABS_s8,
3855 gen_VQABS_s16,
3856 gen_VQABS_s32,
3857 NULL,
3859 return do_2misc(s, a, fn[a->size]);
3862 static bool trans_VQNEG(DisasContext *s, arg_2misc *a)
3864 static NeonGenOneOpFn * const fn[] = {
3865 gen_VQNEG_s8,
3866 gen_VQNEG_s16,
3867 gen_VQNEG_s32,
3868 NULL,
3870 return do_2misc(s, a, fn[a->size]);
3873 #define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC) \
3874 static void gen_##INSN(unsigned vece, uint32_t rd_ofs, \
3875 uint32_t rm_ofs, \
3876 uint32_t oprsz, uint32_t maxsz) \
3878 static gen_helper_gvec_2_ptr * const fns[4] = { \
3879 NULL, HFUNC, SFUNC, NULL, \
3880 }; \
3881 TCGv_ptr fpst; \
3882 fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD); \
3883 tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0, \
3884 fns[vece]); \
3885 tcg_temp_free_ptr(fpst); \
3887 static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
3889 if (a->size == MO_16) { \
3890 if (!dc_isar_feature(aa32_fp16_arith, s)) { \
3891 return false; \
3893 } else if (a->size != MO_32) { \
3894 return false; \
3896 return do_2misc_vec(s, a, gen_##INSN); \
3899 DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s)
3900 DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s)
3901 DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s)
3902 DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s)
3903 DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s)
3904 DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s)
3905 DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s)
3906 DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos)
3907 DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos)
3908 DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs)
3909 DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs)
3911 DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s)
3913 static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
3915 if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
3916 return false;
3918 return trans_VRINTX_impl(s, a);
3921 #define DO_VEC_RMODE(INSN, RMODE, OP) \
3922 static void gen_##INSN(unsigned vece, uint32_t rd_ofs, \
3923 uint32_t rm_ofs, \
3924 uint32_t oprsz, uint32_t maxsz) \
3926 static gen_helper_gvec_2_ptr * const fns[4] = { \
3927 NULL, \
3928 gen_helper_gvec_##OP##h, \
3929 gen_helper_gvec_##OP##s, \
3930 NULL, \
3931 }; \
3932 TCGv_ptr fpst; \
3933 fpst = fpstatus_ptr(vece == 1 ? FPST_STD_F16 : FPST_STD); \
3934 tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, \
3935 arm_rmode_to_sf(RMODE), fns[vece]); \
3936 tcg_temp_free_ptr(fpst); \
3938 static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
3940 if (!arm_dc_feature(s, ARM_FEATURE_V8)) { \
3941 return false; \
3943 if (a->size == MO_16) { \
3944 if (!dc_isar_feature(aa32_fp16_arith, s)) { \
3945 return false; \
3947 } else if (a->size != MO_32) { \
3948 return false; \
3950 return do_2misc_vec(s, a, gen_##INSN); \
3953 DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u)
3954 DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s)
3955 DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u)
3956 DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s)
3957 DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u)
3958 DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s)
3959 DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u)
3960 DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s)
3962 DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_)
3963 DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_)
3964 DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_)
3965 DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_)
3966 DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_)
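/*
 * arm_rmode_to_sf() translates the FPROUNDING_* value into a softfloat
 * rounding-mode constant, passed to the helper via the gvec data word;
 * the helper is then expected to install that mode around the
 * conversion or rounding operation. So e.g. VCVTA* converts with
 * ties-away and VRINTZ rounds toward zero, regardless of FPSCR.RMode.
 */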
3968 static bool trans_VSWP(DisasContext *s, arg_2misc *a)
3970 TCGv_i64 rm, rd;
3971 int pass;
3973 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3974 return false;
3977 /* UNDEF accesses to D16-D31 if they don't exist. */
3978 if (!dc_isar_feature(aa32_simd_r32, s) &&
3979 ((a->vd | a->vm) & 0x10)) {
3980 return false;
3983 if (a->size != 0) {
3984 return false;
3987 if ((a->vd | a->vm) & a->q) {
3988 return false;
3991 if (!vfp_access_check(s)) {
3992 return true;
3995 rm = tcg_temp_new_i64();
3996 rd = tcg_temp_new_i64();
3997 for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3998 read_neon_element64(rm, a->vm, pass, MO_64);
3999 read_neon_element64(rd, a->vd, pass, MO_64);
4000 write_neon_element64(rm, a->vd, pass, MO_64);
4001 write_neon_element64(rd, a->vm, pass, MO_64);
4003 tcg_temp_free_i64(rm);
4004 tcg_temp_free_i64(rd);
4006 return true;
4008 static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1)
4010 TCGv_i32 rd, tmp;
4012 rd = tcg_temp_new_i32();
4013 tmp = tcg_temp_new_i32();
4015 tcg_gen_shli_i32(rd, t0, 8);
4016 tcg_gen_andi_i32(rd, rd, 0xff00ff00);
4017 tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
4018 tcg_gen_or_i32(rd, rd, tmp);
4020 tcg_gen_shri_i32(t1, t1, 8);
4021 tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
4022 tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
4023 tcg_gen_or_i32(t1, t1, tmp);
4024 tcg_gen_mov_i32(t0, rd);
4026 tcg_temp_free_i32(tmp);
4027 tcg_temp_free_i32(rd);
4030 static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1)
4032 TCGv_i32 rd, tmp;
4034 rd = tcg_temp_new_i32();
4035 tmp = tcg_temp_new_i32();
4037 tcg_gen_shli_i32(rd, t0, 16);
4038 tcg_gen_andi_i32(tmp, t1, 0xffff);
4039 tcg_gen_or_i32(rd, rd, tmp);
4040 tcg_gen_shri_i32(t1, t1, 16);
4041 tcg_gen_andi_i32(tmp, t0, 0xffff0000);
4042 tcg_gen_or_i32(t1, t1, tmp);
4043 tcg_gen_mov_i32(t0, rd);
4045 tcg_temp_free_i32(tmp);
4046 tcg_temp_free_i32(rd);
4049 static bool trans_VTRN(DisasContext *s, arg_2misc *a)
4051 TCGv_i32 tmp, tmp2;
4052 int pass;
4054 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
4055 return false;
4058 /* UNDEF accesses to D16-D31 if they don't exist. */
4059 if (!dc_isar_feature(aa32_simd_r32, s) &&
4060 ((a->vd | a->vm) & 0x10)) {
4061 return false;
4064 if ((a->vd | a->vm) & a->q) {
4065 return false;
4068 if (a->size == 3) {
4069 return false;
4072 if (!vfp_access_check(s)) {
4073 return true;
4076 tmp = tcg_temp_new_i32();
4077 tmp2 = tcg_temp_new_i32();
4078 if (a->size == MO_32) {
4079 for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) {
4080 read_neon_element32(tmp, a->vm, pass, MO_32);
4081 read_neon_element32(tmp2, a->vd, pass + 1, MO_32);
4082 write_neon_element32(tmp2, a->vm, pass, MO_32);
4083 write_neon_element32(tmp, a->vd, pass + 1, MO_32);
4085 } else {
4086 for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
4087 read_neon_element32(tmp, a->vm, pass, MO_32);
4088 read_neon_element32(tmp2, a->vd, pass, MO_32);
4089 if (a->size == MO_8) {
4090 gen_neon_trn_u8(tmp, tmp2);
4091 } else {
4092 gen_neon_trn_u16(tmp, tmp2);
4094 write_neon_element32(tmp2, a->vm, pass, MO_32);
4095 write_neon_element32(tmp, a->vd, pass, MO_32);
4098 tcg_temp_free_i32(tmp);
4099 tcg_temp_free_i32(tmp2);
4100 return true;
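/*
 * VTRN treats the pair of registers as 2x2 matrices of elements and
 * transposes them: for size 2 the loop above just exchanges Vm[2i] with
 * Vd[2i+1], while gen_neon_trn_u8/u16 perform the same exchange on the
 * sub-word lanes within each 32-bit chunk.
 */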
4103 static bool trans_VSMMLA(DisasContext *s, arg_VSMMLA *a)
4105 if (!dc_isar_feature(aa32_i8mm, s)) {
4106 return false;
4108 return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4109 gen_helper_gvec_smmla_b);
4112 static bool trans_VUMMLA(DisasContext *s, arg_VUMMLA *a)
4114 if (!dc_isar_feature(aa32_i8mm, s)) {
4115 return false;
4117 return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4118 gen_helper_gvec_ummla_b);
4121 static bool trans_VUSMMLA(DisasContext *s, arg_VUSMMLA *a)
4123 if (!dc_isar_feature(aa32_i8mm, s)) {
4124 return false;
4126 return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4127 gen_helper_gvec_usmmla_b);
4130 static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a)
4132 if (!dc_isar_feature(aa32_bf16, s)) {
4133 return false;
4135 return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4136 gen_helper_gvec_bfmmla);
4139 static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a)
4141 if (!dc_isar_feature(aa32_bf16, s)) {
4142 return false;
4144 return do_neon_ddda_fpst(s, 7, a->vd, a->vn, a->vm, a->q, FPST_STD,
4145 gen_helper_gvec_bfmlal);
4148 static bool trans_VFMA_b16_scal(DisasContext *s, arg_VFMA_b16_scal *a)
4150 if (!dc_isar_feature(aa32_bf16, s)) {
4151 return false;
4153 return do_neon_ddda_fpst(s, 6, a->vd, a->vn, a->vm,
4154 (a->index << 1) | a->q, FPST_STD,
4155 gen_helper_gvec_bfmlal_idx);
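/*
 * The indexed form packs (a->index << 1) | a->q into the gvec data word,
 * where the plain VFMA_b16 above passes just a->q; presumably
 * gen_helper_gvec_bfmlal_idx splits this back out to select the
 * bfloat16 scalar lane.
 */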