target/arm/translate-neon.c

   1 /*
   2  *  ARM translation: AArch32 Neon instructions
   3  *
   4  *  Copyright (c) 2003 Fabrice Bellard
   5  *  Copyright (c) 2005-2007 CodeSourcery
   6  *  Copyright (c) 2007 OpenedHand, Ltd.
   7  *  Copyright (c) 2020 Linaro, Ltd.
   8  *
   9  * This library is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * This library is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  21  */
  22
  23 #include "qemu/osdep.h"
  24 #include "tcg/tcg-op.h"
  25 #include "tcg/tcg-op-gvec.h"
  26 #include "exec/exec-all.h"
  27 #include "exec/gen-icount.h"
  28 #include "translate.h"
  29 #include "translate-a32.h"
  30
  31 /* Include the generated Neon decoder */
  32 #include "decode-neon-dp.c.inc"
  33 #include "decode-neon-ls.c.inc"
  34 #include "decode-neon-shared.c.inc"
  35
  36 static TCGv_ptr vfp_reg_ptr(bool dp, int reg)
  37 {
  38     TCGv_ptr ret = tcg_temp_new_ptr();
  39     tcg_gen_addi_ptr(ret, cpu_env, vfp_reg_offset(dp, reg));
  40     return ret;
  41 }
  42
  43 static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
  44 {
  45     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
  46
  47     switch (mop) {
  48     case MO_UB:
  49         tcg_gen_ld8u_i32(var, cpu_env, offset);
  50         break;
  51     case MO_UW:
  52         tcg_gen_ld16u_i32(var, cpu_env, offset);
  53         break;
  54     case MO_UL:
  55         tcg_gen_ld_i32(var, cpu_env, offset);
  56         break;
  57     default:
  58         g_assert_not_reached();
  59     }
  60 }
  61
  62 static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
  63 {
  64     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
  65
  66     switch (mop) {
  67     case MO_UB:
  68         tcg_gen_ld8u_i64(var, cpu_env, offset);
  69         break;
  70     case MO_UW:
  71         tcg_gen_ld16u_i64(var, cpu_env, offset);
  72         break;
  73     case MO_UL:
  74         tcg_gen_ld32u_i64(var, cpu_env, offset);
  75         break;
  76     case MO_UQ:
  77         tcg_gen_ld_i64(var, cpu_env, offset);
  78         break;
  79     default:
  80         g_assert_not_reached();
  81     }
  82 }
  83
  84 static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
  85 {
  86     long offset = neon_element_offset(reg, ele, size);
  87
  88     switch (size) {
  89     case MO_8:
  90         tcg_gen_st8_i32(var, cpu_env, offset);
  91         break;
  92     case MO_16:
  93         tcg_gen_st16_i32(var, cpu_env, offset);
  94         break;
  95     case MO_32:
  96         tcg_gen_st_i32(var, cpu_env, offset);
  97         break;
  98     default:
  99         g_assert_not_reached();
 100     }
 101 }
 102
 103 static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
 104 {
 105     long offset = neon_element_offset(reg, ele, size);
 106
 107     switch (size) {
 108     case MO_8:
 109         tcg_gen_st8_i64(var, cpu_env, offset);
 110         break;
 111     case MO_16:
 112         tcg_gen_st16_i64(var, cpu_env, offset);
 113         break;
 114     case MO_32:
 115         tcg_gen_st32_i64(var, cpu_env, offset);
 116         break;
 117     case MO_64:
 118         tcg_gen_st_i64(var, cpu_env, offset);
 119         break;
 120     default:
 121         g_assert_not_reached();
 122     }
 123 }
 124
 125 static bool do_neon_ddda(DisasContext *s, int q, int vd, int vn, int vm,
 126                          int data, gen_helper_gvec_4 *fn_gvec)
 127 {
 128     /* UNDEF accesses to D16-D31 if they don't exist. */
 129     if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
 130         return false;
 131     }
 132
 133     /*
 134      * UNDEF accesses to odd registers for each bit of Q.
 135      * Q will be 0b111 for all Q-reg instructions, otherwise
 136      * when we have mixed Q- and D-reg inputs.
 137      */
 138     if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
 139         return false;
 140     }
 141
 142     if (!vfp_access_check(s)) {
 143         return true;
 144     }
 145
 146     int opr_sz = q ? 16 : 8;
 147     tcg_gen_gvec_4_ool(vfp_reg_offset(1, vd),
 148                        vfp_reg_offset(1, vn),
 149                        vfp_reg_offset(1, vm),
 150                        vfp_reg_offset(1, vd),
 151                        opr_sz, opr_sz, data, fn_gvec);
 152     return true;
 153 }
 154
 155 static bool do_neon_ddda_fpst(DisasContext *s, int q, int vd, int vn, int vm,
 156                               int data, ARMFPStatusFlavour fp_flavour,
 157                               gen_helper_gvec_4_ptr *fn_gvec_ptr)
 158 {
 159     /* UNDEF accesses to D16-D31 if they don't exist. */
 160     if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
 161         return false;
 162     }
 163
 164     /*
 165      * UNDEF accesses to odd registers for each bit of Q.
 166      * Q will be 0b111 for all Q-reg instructions, otherwise
 167      * when we have mixed Q- and D-reg inputs.
 168      */
 169     if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
 170         return false;
 171     }
 172
 173     if (!vfp_access_check(s)) {
 174         return true;
 175     }
 176
 177     int opr_sz = q ? 16 : 8;
 178     TCGv_ptr fpst = fpstatus_ptr(fp_flavour);
 179
 180     tcg_gen_gvec_4_ptr(vfp_reg_offset(1, vd),
 181                        vfp_reg_offset(1, vn),
 182                        vfp_reg_offset(1, vm),
 183                        vfp_reg_offset(1, vd),
 184                        fpst, opr_sz, opr_sz, data, fn_gvec_ptr);
 185     tcg_temp_free_ptr(fpst);
 186     return true;
 187 }
 188
 189 static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
 190 {
 191     if (!dc_isar_feature(aa32_vcma, s)) {
 192         return false;
 193     }
 194     if (a->size == MO_16) {
 195         if (!dc_isar_feature(aa32_fp16_arith, s)) {
 196             return false;
 197         }
 198         return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
 199                                  FPST_STD_F16, gen_helper_gvec_fcmlah);
 200     }
 201     return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
 202                              FPST_STD, gen_helper_gvec_fcmlas);
 203 }
 204
 205 static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
 206 {
 207     int opr_sz;
 208     TCGv_ptr fpst;
 209     gen_helper_gvec_3_ptr *fn_gvec_ptr;
 210
 211     if (!dc_isar_feature(aa32_vcma, s)
 212         || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
 213         return false;
 214     }
 215
 216     /* UNDEF accesses to D16-D31 if they don't exist. */
 217     if (!dc_isar_feature(aa32_simd_r32, s) &&
 218         ((a->vd | a->vn | a->vm) & 0x10)) {
 219         return false;
 220     }
 221
 222     if ((a->vn | a->vm | a->vd) & a->q) {
 223         return false;
 224     }
 225
 226     if (!vfp_access_check(s)) {
 227         return true;
 228     }
 229
 230     opr_sz = (1 + a->q) * 8;
 231     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
 232     fn_gvec_ptr = (a->size == MO_16) ?
 233         gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds;
 234     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 235                        vfp_reg_offset(1, a->vn),
 236                        vfp_reg_offset(1, a->vm),
 237                        fpst, opr_sz, opr_sz, a->rot,
 238                        fn_gvec_ptr);
 239     tcg_temp_free_ptr(fpst);
 240     return true;
 241 }
 242
 243 static bool trans_VSDOT(DisasContext *s, arg_VSDOT *a)
 244 {
 245     if (!dc_isar_feature(aa32_dp, s)) {
 246         return false;
 247     }
 248     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 249                         gen_helper_gvec_sdot_b);
 250 }
 251
 252 static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a)
 253 {
 254     if (!dc_isar_feature(aa32_dp, s)) {
 255         return false;
 256     }
 257     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 258                         gen_helper_gvec_udot_b);
 259 }
 260
 261 static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a)
 262 {
 263     if (!dc_isar_feature(aa32_i8mm, s)) {
 264         return false;
 265     }
 266     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 267                         gen_helper_gvec_usdot_b);
 268 }
 269
 270 static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a)
 271 {
 272     if (!dc_isar_feature(aa32_bf16, s)) {
 273         return false;
 274     }
 275     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 276                         gen_helper_gvec_bfdot);
 277 }
 278
 279 static bool trans_VFML(DisasContext *s, arg_VFML *a)
 280 {
 281     int opr_sz;
 282
 283     if (!dc_isar_feature(aa32_fhm, s)) {
 284         return false;
 285     }
 286
 287     /* UNDEF accesses to D16-D31 if they don't exist. */
 288     if (!dc_isar_feature(aa32_simd_r32, s) &&
 289         (a->vd & 0x10)) {
 290         return false;
 291     }
 292
 293     if (a->vd & a->q) {
 294         return false;
 295     }
 296
 297     if (!vfp_access_check(s)) {
 298         return true;
 299     }
 300
 301     opr_sz = (1 + a->q) * 8;
 302     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 303                        vfp_reg_offset(a->q, a->vn),
 304                        vfp_reg_offset(a->q, a->vm),
 305                        cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
 306                        gen_helper_gvec_fmlal_a32);
 307     return true;
 308 }
 309
 310 static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
 311 {
 312     int data = (a->index << 2) | a->rot;
 313
 314     if (!dc_isar_feature(aa32_vcma, s)) {
 315         return false;
 316     }
 317     if (a->size == MO_16) {
 318         if (!dc_isar_feature(aa32_fp16_arith, s)) {
 319             return false;
 320         }
 321         return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
 322                                  FPST_STD_F16, gen_helper_gvec_fcmlah_idx);
 323     }
 324     return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
 325                              FPST_STD, gen_helper_gvec_fcmlas_idx);
 326 }
 327
 328 static bool trans_VSDOT_scalar(DisasContext *s, arg_VSDOT_scalar *a)
 329 {
 330     if (!dc_isar_feature(aa32_dp, s)) {
 331         return false;
 332     }
 333     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 334                         gen_helper_gvec_sdot_idx_b);
 335 }
 336
 337 static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a)
 338 {
 339     if (!dc_isar_feature(aa32_dp, s)) {
 340         return false;
 341     }
 342     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 343                         gen_helper_gvec_udot_idx_b);
 344 }
 345
 346 static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a)
 347 {
 348     if (!dc_isar_feature(aa32_i8mm, s)) {
 349         return false;
 350     }
 351     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 352                         gen_helper_gvec_usdot_idx_b);
 353 }
 354
 355 static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a)
 356 {
 357     if (!dc_isar_feature(aa32_i8mm, s)) {
 358         return false;
 359     }
 360     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 361                         gen_helper_gvec_sudot_idx_b);
 362 }
 363
 364 static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a)
 365 {
 366     if (!dc_isar_feature(aa32_bf16, s)) {
 367         return false;
 368     }
 369     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 370                         gen_helper_gvec_bfdot_idx);
 371 }
 372
 373 static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
 374 {
 375     int opr_sz;
 376
 377     if (!dc_isar_feature(aa32_fhm, s)) {
 378         return false;
 379     }
 380
 381     /* UNDEF accesses to D16-D31 if they don't exist. */
 382     if (!dc_isar_feature(aa32_simd_r32, s) &&
 383         ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
 384         return false;
 385     }
 386
 387     if (a->vd & a->q) {
 388         return false;
 389     }
 390
 391     if (!vfp_access_check(s)) {
 392         return true;
 393     }
 394
 395     opr_sz = (1 + a->q) * 8;
 396     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 397                        vfp_reg_offset(a->q, a->vn),
 398                        vfp_reg_offset(a->q, a->rm),
 399                        cpu_env, opr_sz, opr_sz,
 400                        (a->index << 2) | a->s, /* is_2 == 0 */
 401                        gen_helper_gvec_fmlal_idx_a32);
 402     return true;
 403 }
 404
 405 static struct {
 406     int nregs;
 407     int interleave;
 408     int spacing;
 409 } const neon_ls_element_type[11] = {
 410     {1, 4, 1},
 411     {1, 4, 2},
 412     {4, 1, 1},
 413     {2, 2, 2},
 414     {1, 3, 1},
 415     {1, 3, 2},
 416     {3, 1, 1},
 417     {1, 1, 1},
 418     {1, 2, 1},
 419     {1, 2, 2},
 420     {2, 1, 1}
 421 };
 422
 423 static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
 424                                       int stride)
 425 {
 426     if (rm != 15) {
 427         TCGv_i32 base;
 428
 429         base = load_reg(s, rn);
 430         if (rm == 13) {
 431             tcg_gen_addi_i32(base, base, stride);
 432         } else {
 433             TCGv_i32 index;
 434             index = load_reg(s, rm);
 435             tcg_gen_add_i32(base, base, index);
 436             tcg_temp_free_i32(index);
 437         }
 438         store_reg(s, rn, base);
 439     }
 440 }
 441
 442 static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
 443 {
 444     /* Neon load/store multiple structures */
 445     int nregs, interleave, spacing, reg, n;
 446     MemOp mop, align, endian;
 447     int mmu_idx = get_mem_index(s);
 448     int size = a->size;
 449     TCGv_i64 tmp64;
 450     TCGv_i32 addr;
 451
 452     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 453         return false;
 454     }
 455
 456     /* UNDEF accesses to D16-D31 if they don't exist */
 457     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 458         return false;
 459     }
 460     if (a->itype > 10) {
 461         return false;
 462     }
 463     /* Catch UNDEF cases for bad values of align field */
 464     switch (a->itype & 0xc) {
 465     case 4:
 466         if (a->align >= 2) {
 467             return false;
 468         }
 469         break;
 470     case 8:
 471         if (a->align == 3) {
 472             return false;
 473         }
 474         break;
 475     default:
 476         break;
 477     }
 478     nregs = neon_ls_element_type[a->itype].nregs;
 479     interleave = neon_ls_element_type[a->itype].interleave;
 480     spacing = neon_ls_element_type[a->itype].spacing;
 481     if (size == 3 && (interleave | spacing) != 1) {
 482         return false;
 483     }
 484
 485     if (!vfp_access_check(s)) {
 486         return true;
 487     }
 488
 489     /* For our purposes, bytes are always little-endian.  */
 490     endian = s->be_data;
 491     if (size == 0) {
 492         endian = MO_LE;
 493     }
 494
 495     /* Enforce alignment requested by the instruction */
 496     if (a->align) {
 497         align = pow2_align(a->align + 2); /* 4 ** a->align */
 498     } else {
 499         align = s->align_mem ? MO_ALIGN : 0;
 500     }
 501
 502     /*
 503      * Consecutive little-endian elements from a single register
 504      * can be promoted to a larger little-endian operation.
 505      */
 506     if (interleave == 1 && endian == MO_LE) {
 507         /* Retain any natural alignment. */
 508         if (align == MO_ALIGN) {
 509             align = pow2_align(size);
 510         }
 511         size = 3;
 512     }
 513
 514     tmp64 = tcg_temp_new_i64();
 515     addr = tcg_temp_new_i32();
 516     load_reg_var(s, addr, a->rn);
 517
 518     mop = endian | size | align;
 519     for (reg = 0; reg < nregs; reg++) {
 520         for (n = 0; n < 8 >> size; n++) {
 521             int xs;
 522             for (xs = 0; xs < interleave; xs++) {
 523                 int tt = a->vd + reg + spacing * xs;
 524
 525                 if (a->l) {
 526                     gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop);
 527                     neon_store_element64(tt, n, size, tmp64);
 528                 } else {
 529                     neon_load_element64(tmp64, tt, n, size);
 530                     gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop);
 531                 }
 532                 tcg_gen_addi_i32(addr, addr, 1 << size);
 533
 534                 /* Subsequent memory operations inherit alignment */
 535                 mop &= ~MO_AMASK;
 536             }
 537         }
 538     }
 539     tcg_temp_free_i32(addr);
 540     tcg_temp_free_i64(tmp64);
 541
 542     gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
 543     return true;
 544 }
 545
 546 static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
 547 {
 548     /* Neon load single structure to all lanes */
 549     int reg, stride, vec_size;
 550     int vd = a->vd;
 551     int size = a->size;
 552     int nregs = a->n + 1;
 553     TCGv_i32 addr, tmp;
 554     MemOp mop, align;
 555
 556     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 557         return false;
 558     }
 559
 560     /* UNDEF accesses to D16-D31 if they don't exist */
 561     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 562         return false;
 563     }
 564
 565     align = 0;
 566     if (size == 3) {
 567         if (nregs != 4 || a->a == 0) {
 568             return false;
 569         }
 570         /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
 571         size = MO_32;
 572         align = MO_ALIGN_16;
 573     } else if (a->a) {
 574         switch (nregs) {
 575         case 1:
 576             if (size == 0) {
 577                 return false;
 578             }
 579             align = MO_ALIGN;
 580             break;
 581         case 2:
 582             align = pow2_align(size + 1);
 583             break;
 584         case 3:
 585             return false;
 586         case 4:
 587             if (size == 2) {
 588                 align = pow2_align(3);
 589             } else {
 590                 align = pow2_align(size + 2);
 591             }
 592             break;
 593         default:
 594             g_assert_not_reached();
 595         }
 596     }
 597
 598     if (!vfp_access_check(s)) {
 599         return true;
 600     }
 601
 602     /*
 603      * VLD1 to all lanes: T bit indicates how many Dregs to write.
 604      * VLD2/3/4 to all lanes: T bit indicates register stride.
 605      */
 606     stride = a->t ? 2 : 1;
 607     vec_size = nregs == 1 ? stride * 8 : 8;
 608     mop = size | align;
 609     tmp = tcg_temp_new_i32();
 610     addr = tcg_temp_new_i32();
 611     load_reg_var(s, addr, a->rn);
 612     for (reg = 0; reg < nregs; reg++) {
 613         gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop);
 614         if ((vd & 1) && vec_size == 16) {
 615             /*
 616              * We cannot write 16 bytes at once because the
 617              * destination is unaligned.
 618              */
 619             tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
 620                                  8, 8, tmp);
 621             tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1),
 622                              neon_full_reg_offset(vd), 8, 8);
 623         } else {
 624             tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
 625                                  vec_size, vec_size, tmp);
 626         }
 627         tcg_gen_addi_i32(addr, addr, 1 << size);
 628         vd += stride;
 629
 630         /* Subsequent memory operations inherit alignment */
 631         mop &= ~MO_AMASK;
 632     }
 633     tcg_temp_free_i32(tmp);
 634     tcg_temp_free_i32(addr);
 635
 636     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);
 637
 638     return true;
 639 }
 640
 641 static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
 642 {
 643     /* Neon load/store single structure to one lane */
 644     int reg;
 645     int nregs = a->n + 1;
 646     int vd = a->vd;
 647     TCGv_i32 addr, tmp;
 648     MemOp mop;
 649
 650     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 651         return false;
 652     }
 653
 654     /* UNDEF accesses to D16-D31 if they don't exist */
 655     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 656         return false;
 657     }
 658
 659     /* Catch the UNDEF cases. This is unavoidably a bit messy. */
 660     switch (nregs) {
 661     case 1:
 662         if (a->stride != 1) {
 663             return false;
 664         }
 665         if (((a->align & (1 << a->size)) != 0) ||
 666             (a->size == 2 && (a->align == 1 || a->align == 2))) {
 667             return false;
 668         }
 669         break;
 670     case 2:
 671         if (a->size == 2 && (a->align & 2) != 0) {
 672             return false;
 673         }
 674         break;
 675     case 3:
 676         if (a->align != 0) {
 677             return false;
 678         }
 679         break;
 680     case 4:
 681         if (a->size == 2 && a->align == 3) {
 682             return false;
 683         }
 684         break;
 685     default:
 686         g_assert_not_reached();
 687     }
 688     if ((vd + a->stride * (nregs - 1)) > 31) {
 689         /*
 690          * Attempts to write off the end of the register file are
 691          * UNPREDICTABLE; we choose to UNDEF because otherwise we would
 692          * access off the end of the array that holds the register data.
 693          */
 694         return false;
 695     }
 696
 697     if (!vfp_access_check(s)) {
 698         return true;
 699     }
 700
 701     /* Pick up SCTLR settings */
 702     mop = finalize_memop(s, a->size);
 703
 704     if (a->align) {
 705         MemOp align_op;
 706
 707         switch (nregs) {
 708         case 1:
 709             /* For VLD1, use natural alignment. */
 710             align_op = MO_ALIGN;
 711             break;
 712         case 2:
 713             /* For VLD2, use double alignment. */
 714             align_op = pow2_align(a->size + 1);
 715             break;
 716         case 4:
 717             if (a->size == MO_32) {
 718                 /*
 719                  * For VLD4.32, align = 1 is double alignment, align = 2 is
 720                  * quad alignment; align = 3 is rejected above.
 721                  */
 722                 align_op = pow2_align(a->size + a->align);
 723             } else {
 724                 /* For VLD4.8 and VLD.16, we want quad alignment. */
 725                 align_op = pow2_align(a->size + 2);
 726             }
 727             break;
 728         default:
 729             /* For VLD3, the alignment field is zero and rejected above. */
 730             g_assert_not_reached();
 731         }
 732
 733         mop = (mop & ~MO_AMASK) | align_op;
 734     }
 735
 736     tmp = tcg_temp_new_i32();
 737     addr = tcg_temp_new_i32();
 738     load_reg_var(s, addr, a->rn);
 739
 740     for (reg = 0; reg < nregs; reg++) {
 741         if (a->l) {
 742             gen_aa32_ld_internal_i32(s, tmp, addr, get_mem_index(s), mop);
 743             neon_store_element(vd, a->reg_idx, a->size, tmp);
 744         } else { /* Store */
 745             neon_load_element(tmp, vd, a->reg_idx, a->size);
 746             gen_aa32_st_internal_i32(s, tmp, addr, get_mem_index(s), mop);
 747         }
 748         vd += a->stride;
 749         tcg_gen_addi_i32(addr, addr, 1 << a->size);
 750
 751         /* Subsequent memory operations inherit alignment */
 752         mop &= ~MO_AMASK;
 753     }
 754     tcg_temp_free_i32(addr);
 755     tcg_temp_free_i32(tmp);
 756
 757     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);
 758
 759     return true;
 760 }
 761
 762 static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
 763 {
 764     int vec_size = a->q ? 16 : 8;
 765     int rd_ofs = neon_full_reg_offset(a->vd);
 766     int rn_ofs = neon_full_reg_offset(a->vn);
 767     int rm_ofs = neon_full_reg_offset(a->vm);
 768
 769     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 770         return false;
 771     }
 772
 773     /* UNDEF accesses to D16-D31 if they don't exist. */
 774     if (!dc_isar_feature(aa32_simd_r32, s) &&
 775         ((a->vd | a->vn | a->vm) & 0x10)) {
 776         return false;
 777     }
 778
 779     if ((a->vn | a->vm | a->vd) & a->q) {
 780         return false;
 781     }
 782
 783     if (!vfp_access_check(s)) {
 784         return true;
 785     }
 786
 787     fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
 788     return true;
 789 }
 790
 791 #define DO_3SAME(INSN, FUNC)                                            \
 792     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 793     {                                                                   \
 794         return do_3same(s, a, FUNC);                                    \
 795     }
 796
 797 DO_3SAME(VADD, tcg_gen_gvec_add)
 798 DO_3SAME(VSUB, tcg_gen_gvec_sub)
 799 DO_3SAME(VAND, tcg_gen_gvec_and)
 800 DO_3SAME(VBIC, tcg_gen_gvec_andc)
 801 DO_3SAME(VORR, tcg_gen_gvec_or)
 802 DO_3SAME(VORN, tcg_gen_gvec_orc)
 803 DO_3SAME(VEOR, tcg_gen_gvec_xor)
 804 DO_3SAME(VSHL_S, gen_gvec_sshl)
 805 DO_3SAME(VSHL_U, gen_gvec_ushl)
 806 DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
 807 DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
 808 DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
 809 DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)
 810
 811 /* These insns are all gvec_bitsel but with the inputs in various orders. */
 812 #define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
 813     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 814                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 815                                 uint32_t oprsz, uint32_t maxsz)         \
 816     {                                                                   \
 817         tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
 818     }                                                                   \
 819     DO_3SAME(INSN, gen_##INSN##_3s)
 820
 821 DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
 822 DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
 823 DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
 824
 825 #define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
 826     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 827     {                                                                   \
 828         if (a->size == 3) {                                             \
 829             return false;                                               \
 830         }                                                               \
 831         return do_3same(s, a, FUNC);                                    \
 832     }
 833
 834 DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
 835 DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
 836 DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
 837 DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
 838 DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
 839 DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
 840 DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
 841 DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
 842 DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
 843 DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
 844 DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
 845 DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)
 846
 847 #define DO_3SAME_CMP(INSN, COND)                                        \
 848     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 849                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 850                                 uint32_t oprsz, uint32_t maxsz)         \
 851     {                                                                   \
 852         tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
 853     }                                                                   \
 854     DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)
 855
 856 DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
 857 DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
 858 DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
 859 DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
 860 DO_3SAME_CMP(VCEQ, TCG_COND_EQ)
 861
 862 #define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
 863     static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
 864                          uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
 865     {                                                                      \
 866         tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
 867     }
 868
 869 WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)
 870
 871 static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
 872 {
 873     if (a->size != 0) {
 874         return false;
 875     }
 876     return do_3same(s, a, gen_VMUL_p_3s);
 877 }
 878
 879 #define DO_VQRDMLAH(INSN, FUNC)                                         \
 880     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 881     {                                                                   \
 882         if (!dc_isar_feature(aa32_rdm, s)) {                            \
 883             return false;                                               \
 884         }                                                               \
 885         if (a->size != 1 && a->size != 2) {                             \
 886             return false;                                               \
 887         }                                                               \
 888         return do_3same(s, a, FUNC);                                    \
 889     }
 890
 891 DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
 892 DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)
 893
 894 #define DO_SHA1(NAME, FUNC)                                             \
 895     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
 896     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
 897     {                                                                   \
 898         if (!dc_isar_feature(aa32_sha1, s)) {                           \
 899             return false;                                               \
 900         }                                                               \
 901         return do_3same(s, a, gen_##NAME##_3s);                         \
 902     }
 903
 904 DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
 905 DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
 906 DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
 907 DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)
 908
 909 #define DO_SHA2(NAME, FUNC)                                             \
 910     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
 911     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
 912     {                                                                   \
 913         if (!dc_isar_feature(aa32_sha2, s)) {                           \
 914             return false;                                               \
 915         }                                                               \
 916         return do_3same(s, a, gen_##NAME##_3s);                         \
 917     }
 918
 919 DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
 920 DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
 921 DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)
 922
 /*
  * 3-same operation on 64-bit elements.  Defines gen_<INSN>_3s(), a
  * gvec-style expander that ignores vece and always expands through
  * the 64-bit hook FNI8, then passes it to DO_3SAME() to create the
  * trans function.
  */
 #define DO_3SAME_64(INSN, FNI8)                                         \
     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
                                 uint32_t oprsz, uint32_t maxsz)         \
     {                                                                   \
         static const GVecGen3 expand64 = {                              \
             .fni8 = FNI8,                                               \
         };                                                              \
                                                                         \
         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz,            \
                        &expand64);                                      \
     }                                                                   \
     DO_3SAME(INSN, gen_##INSN##_3s)
 932
 933 #define DO_3SAME_64_ENV(INSN, FUNC)                                     \
 934     static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)    \
 935     {                                                                   \
 936         FUNC(d, cpu_env, n, m);                                         \
 937     }                                                                   \
 938     DO_3SAME_64(INSN, gen_##INSN##_elt)
 939
 940 DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
 941 DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
 942 DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
 943 DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
 944 DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
 945 DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)
 946
 /*
  * Integer 3-same ops whose helper is selected by element size and is
  * called without cpu_env.  HNAME is pasted between gen_helper_neon_
  * and the 8/16/32 suffix.  The table slot for size 3 is left zeroed;
  * trans_<INSN>_3s() returns false for size > 2 before do_3same() is
  * reached.
  */
 #define DO_3SAME_32(INSN, HNAME)                                        \
     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
                                 uint32_t oprsz, uint32_t maxsz)         \
     {                                                                   \
         static const GVecGen3 by_size[4] = {                            \
             { .fni4 = gen_helper_neon_##HNAME##8 },                     \
             { .fni4 = gen_helper_neon_##HNAME##16 },                    \
             { .fni4 = gen_helper_neon_##HNAME##32 },                    \
             { 0 },                                                      \
         };                                                              \
         const GVecGen3 *expander = &by_size[vece];                      \
                                                                         \
         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, expander); \
     }                                                                   \
     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
     {                                                                   \
         return a->size <= 2 && do_3same(s, a, gen_##INSN##_3s);         \
     }
 967
 /*
  * Adaptor for helpers that take cpu_env as an extra argument.  The
  * gvec APIs such as tcg_gen_gvec_3() are given a NeonGenTwoOpFn(),
  * i.e. a function of just (d, n, m), so WRAP_ENV_FN() defines a
  * static function WRAPNAME of that shape which forwards to FUNC (a
  * NeonGenTwoOpEnvFn()) with cpu_env inserted as the second argument.
  */
 #define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
     static void WRAPNAME(TCGv_i32 dst, TCGv_i32 op1, TCGv_i32 op2)      \
     {                                                                   \
         FUNC(dst, cpu_env, op1, op2);                                   \
     }
 979
 /*
  * Integer 3-same ops whose per-size helper needs cpu_env.  For each of
  * the 8/16/32-bit helpers a WRAP_ENV_FN() trampoline is generated so
  * it can be used as a .fni4 hook; gen_<INSN>_3s() picks the trampoline
  * by vece and trans_<INSN>_3s() returns false for size > 2 (the zeroed
  * fourth table slot is never used).
  *
  * WRAP_ENV_FN() expands to a complete function definition, so it must
  * not be followed by ';' -- that would leave an empty declaration at
  * file scope, which ISO C does not allow.
  */
 #define DO_3SAME_32_ENV(INSN, FUNC)                                     \
     WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8)         \
     WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16)       \
     WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32)       \
     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
                                 uint32_t oprsz, uint32_t maxsz)         \
     {                                                                   \
         static const GVecGen3 ops[4] = {                                \
             { .fni4 = gen_##INSN##_tramp8 },                            \
             { .fni4 = gen_##INSN##_tramp16 },                           \
             { .fni4 = gen_##INSN##_tramp32 },                           \
             { 0 },                                                      \
         };                                                              \
         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
     }                                                                   \
     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
     {                                                                   \
         if (a->size > 2) {                                              \
             return false;                                               \
         }                                                               \
         return do_3same(s, a, gen_##INSN##_3s);                         \
     }
1003
1004 DO_3SAME_32(VHADD_S, hadd_s)
1005 DO_3SAME_32(VHADD_U, hadd_u)
1006 DO_3SAME_32(VHSUB_S, hsub_s)
1007 DO_3SAME_32(VHSUB_U, hsub_u)
1008 DO_3SAME_32(VRHADD_S, rhadd_s)
1009 DO_3SAME_32(VRHADD_U, rhadd_u)
1010 DO_3SAME_32(VRSHL_S, rshl_s)
1011 DO_3SAME_32(VRSHL_U, rshl_u)
1012
1013 DO_3SAME_32_ENV(VQSHL_S, qshl_s)
1014 DO_3SAME_32_ENV(VQSHL_U, qshl_u)
1015 DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
1016 DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)
1017
1018 static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
1019 {
1020     /* Operations handled pairwise 32 bits at a time */
1021     TCGv_i32 tmp, tmp2, tmp3;
1022
1023     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1024         return false;
1025     }
1026
1027     /* UNDEF accesses to D16-D31 if they don't exist. */
1028     if (!dc_isar_feature(aa32_simd_r32, s) &&
1029         ((a->vd | a->vn | a->vm) & 0x10)) {
1030         return false;
1031     }
1032
1033     if (a->size == 3) {
1034         return false;
1035     }
1036
1037     if (!vfp_access_check(s)) {
1038         return true;
1039     }
1040
1041     assert(a->q == 0); /* enforced by decode patterns */
1042
1043     /*
1044      * Note that we have to be careful not to clobber the source operands
1045      * in the "vm == vd" case by storing the result of the first pass too
1046      * early. Since Q is 0 there are always just two passes, so instead
1047      * of a complicated loop over each pass we just unroll.
1048      */
1049     tmp = tcg_temp_new_i32();
1050     tmp2 = tcg_temp_new_i32();
1051     tmp3 = tcg_temp_new_i32();
1052
1053     read_neon_element32(tmp, a->vn, 0, MO_32);
1054     read_neon_element32(tmp2, a->vn, 1, MO_32);
1055     fn(tmp, tmp, tmp2);
1056
1057     read_neon_element32(tmp3, a->vm, 0, MO_32);
1058     read_neon_element32(tmp2, a->vm, 1, MO_32);
1059     fn(tmp3, tmp3, tmp2);
1060
1061     write_neon_element32(tmp, a->vd, 0, MO_32);
1062     write_neon_element32(tmp3, a->vd, 1, MO_32);
1063
1064     tcg_temp_free_i32(tmp);
1065     tcg_temp_free_i32(tmp2);
1066     tcg_temp_free_i32(tmp3);
1067     return true;
1068 }
1069
/*
 * Define trans_<INSN>_3s() for an integer pairwise op.  The helper is
 * chosen by a->size from gen_helper_neon_<func>{8,16,32}; size > 2
 * returns false without indexing the table.
 */
#define DO_3SAME_PAIR(INSN, func)                                       \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        static NeonGenTwoOpFn * const by_size[] = {                     \
            gen_helper_neon_##func##8,                                  \
            gen_helper_neon_##func##16,                                 \
            gen_helper_neon_##func##32,                                 \
        };                                                              \
                                                                        \
        return a->size <= 2 && do_3same_pair(s, a, by_size[a->size]);   \
    }
1083
1084 /* 32-bit pairwise ops end up the same as the elementwise versions.  */
1085 #define gen_helper_neon_pmax_s32  tcg_gen_smax_i32
1086 #define gen_helper_neon_pmax_u32  tcg_gen_umax_i32
1087 #define gen_helper_neon_pmin_s32  tcg_gen_smin_i32
1088 #define gen_helper_neon_pmin_u32  tcg_gen_umin_i32
1089 #define gen_helper_neon_padd_u32  tcg_gen_add_i32
1090
1091 DO_3SAME_PAIR(VPMAX_S, pmax_s)
1092 DO_3SAME_PAIR(VPMIN_S, pmin_s)
1093 DO_3SAME_PAIR(VPMAX_U, pmax_u)
1094 DO_3SAME_PAIR(VPMIN_U, pmin_u)
1095 DO_3SAME_PAIR(VPADD, padd_u)
1096
1097 #define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
1098     WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16);    \
1099     WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32);    \
1100     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
1101                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
1102                                 uint32_t oprsz, uint32_t maxsz)         \
1103     {                                                                   \
1104         static const GVecGen3 ops[2] = {                                \
1105             { .fni4 = gen_##INSN##_tramp16 },                           \
1106             { .fni4 = gen_##INSN##_tramp32 },                           \
1107         };                                                              \
1108         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
1109     }                                                                   \
1110     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
1111     {                                                                   \
1112         if (a->size != 1 && a->size != 2) {                             \
1113             return false;                                               \
1114         }                                                               \
1115         return do_3same(s, a, gen_##INSN##_3s);                         \
1116     }
1117
1118 DO_3SAME_VQDMULH(VQDMULH, qdmulh)
1119 DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)
1120
/*
 * Define WRAPNAME, a gvec-style expander (vece, three offsets, oprsz,
 * maxsz) around the out-of-line FP helper FUNC.  It obtains the float
 * status pointer for flavour FPST, passes it to tcg_gen_gvec_3_ptr()
 * with a data argument of 0, and frees the pointer afterwards.  vece
 * is not used.
 */
#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC)                              \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rn_ofs, uint32_t rm_ofs,              \
                         uint32_t oprsz, uint32_t maxsz)                \
    {                                                                   \
        TCGv_ptr status = fpstatus_ptr(FPST);                           \
                                                                        \
        tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, status,              \
                           oprsz, maxsz, 0, FUNC);                      \
        tcg_temp_free_ptr(status);                                      \
    }
1131
1132 #define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC)                                 \
1133     WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC)                 \
1134     WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC)             \
1135     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
1136     {                                                                   \
1137         if (a->size == MO_16) {                                         \
1138             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
1139                 return false;                                           \
1140             }                                                           \
1141             return do_3same(s, a, gen_##INSN##_fp16_3s);                \
1142         }                                                               \
1143         return do_3same(s, a, gen_##INSN##_fp32_3s);                    \
1144     }
1145
1146
1147 DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
1148 DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
1149 DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
1150 DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
1151 DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
1152 DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
1153 DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
1154 DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
1155 DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
1156 DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
1157 DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
1158 DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
1159 DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
1160 DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
1161 DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
1162 DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
1163 DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)
1164
1165 WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
1166 WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
1167 WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
1168 WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)
1169
1170 static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
1171 {
1172     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1173         return false;
1174     }
1175
1176     if (a->size == MO_16) {
1177         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1178             return false;
1179         }
1180         return do_3same(s, a, gen_VMAXNM_fp16_3s);
1181     }
1182     return do_3same(s, a, gen_VMAXNM_fp32_3s);
1183 }
1184
1185 static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
1186 {
1187     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1188         return false;
1189     }
1190
1191     if (a->size == MO_16) {
1192         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1193             return false;
1194         }
1195         return do_3same(s, a, gen_VMINNM_fp16_3s);
1196     }
1197     return do_3same(s, a, gen_VMINNM_fp32_3s);
1198 }
1199
1200 static bool do_3same_fp_pair(DisasContext *s, arg_3same *a,
1201                              gen_helper_gvec_3_ptr *fn)
1202 {
1203     /* FP pairwise operations */
1204     TCGv_ptr fpstatus;
1205
1206     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1207         return false;
1208     }
1209
1210     /* UNDEF accesses to D16-D31 if they don't exist. */
1211     if (!dc_isar_feature(aa32_simd_r32, s) &&
1212         ((a->vd | a->vn | a->vm) & 0x10)) {
1213         return false;
1214     }
1215
1216     if (!vfp_access_check(s)) {
1217         return true;
1218     }
1219
1220     assert(a->q == 0); /* enforced by decode patterns */
1221
1222
1223     fpstatus = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1224     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
1225                        vfp_reg_offset(1, a->vn),
1226                        vfp_reg_offset(1, a->vm),
1227                        fpstatus, 8, 8, 0, fn);
1228     tcg_temp_free_ptr(fpstatus);
1229
1230     return true;
1231 }
1232
1233 /*
1234  * For all the functions using this macro, size == 1 means fp16,
1235  * which is an architecture extension we don't implement yet.
1236  */
1237 #define DO_3S_FP_PAIR(INSN,FUNC)                                    \
1238     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
1239     {                                                               \
1240         if (a->size == MO_16) {                                     \
1241             if (!dc_isar_feature(aa32_fp16_arith, s)) {             \
1242                 return false;                                       \
1243             }                                                       \
1244             return do_3same_fp_pair(s, a, FUNC##h);                 \
1245         }                                                           \
1246         return do_3same_fp_pair(s, a, FUNC##s);                     \
1247     }
1248
1249 DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd)
1250 DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax)
1251 DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin)
1252
1253 static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
1254 {
1255     /* Handle a 2-reg-shift insn which can be vectorized. */
1256     int vec_size = a->q ? 16 : 8;
1257     int rd_ofs = neon_full_reg_offset(a->vd);
1258     int rm_ofs = neon_full_reg_offset(a->vm);
1259
1260     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1261         return false;
1262     }
1263
1264     /* UNDEF accesses to D16-D31 if they don't exist. */
1265     if (!dc_isar_feature(aa32_simd_r32, s) &&
1266         ((a->vd | a->vm) & 0x10)) {
1267         return false;
1268     }
1269
1270     if ((a->vm | a->vd) & a->q) {
1271         return false;
1272     }
1273
1274     if (!vfp_access_check(s)) {
1275         return true;
1276     }
1277
1278     fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
1279     return true;
1280 }
1281
1282 #define DO_2SH(INSN, FUNC)                                              \
1283     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1284     {                                                                   \
1285         return do_vector_2sh(s, a, FUNC);                               \
1286     }                                                                   \
1287
1288 DO_2SH(VSHL, tcg_gen_gvec_shli)
1289 DO_2SH(VSLI, gen_gvec_sli)
1290 DO_2SH(VSRI, gen_gvec_sri)
1291 DO_2SH(VSRA_S, gen_gvec_ssra)
1292 DO_2SH(VSRA_U, gen_gvec_usra)
1293 DO_2SH(VRSHR_S, gen_gvec_srshr)
1294 DO_2SH(VRSHR_U, gen_gvec_urshr)
1295 DO_2SH(VRSRA_S, gen_gvec_srsra)
1296 DO_2SH(VRSRA_U, gen_gvec_ursra)
1297
1298 static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
1299 {
1300     /* Signed shift out of range results in all-sign-bits */
1301     a->shift = MIN(a->shift, (8 << a->size) - 1);
1302     return do_vector_2sh(s, a, tcg_gen_gvec_sari);
1303 }
1304
1305 static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
1306                             int64_t shift, uint32_t oprsz, uint32_t maxsz)
1307 {
1308     tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
1309 }
1310
1311 static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
1312 {
1313     /* Shift out of range is architecturally valid and results in zero. */
1314     if (a->shift >= (8 << a->size)) {
1315         return do_vector_2sh(s, a, gen_zero_rd_2sh);
1316     } else {
1317         return do_vector_2sh(s, a, tcg_gen_gvec_shri);
1318     }
1319 }
1320
1321 static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
1322                              NeonGenTwo64OpEnvFn *fn)
1323 {
1324     /*
1325      * 2-reg-and-shift operations, size == 3 case, where the
1326      * function needs to be passed cpu_env.
1327      */
1328     TCGv_i64 constimm;
1329     int pass;
1330
1331     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1332         return false;
1333     }
1334
1335     /* UNDEF accesses to D16-D31 if they don't exist. */
1336     if (!dc_isar_feature(aa32_simd_r32, s) &&
1337         ((a->vd | a->vm) & 0x10)) {
1338         return false;
1339     }
1340
1341     if ((a->vm | a->vd) & a->q) {
1342         return false;
1343     }
1344
1345     if (!vfp_access_check(s)) {
1346         return true;
1347     }
1348
1349     /*
1350      * To avoid excessive duplication of ops we implement shift
1351      * by immediate using the variable shift operations.
1352      */
1353     constimm = tcg_constant_i64(dup_const(a->size, a->shift));
1354
1355     for (pass = 0; pass < a->q + 1; pass++) {
1356         TCGv_i64 tmp = tcg_temp_new_i64();
1357
1358         read_neon_element64(tmp, a->vm, pass, MO_64);
1359         fn(tmp, cpu_env, tmp, constimm);
1360         write_neon_element64(tmp, a->vd, pass, MO_64);
1361         tcg_temp_free_i64(tmp);
1362     }
1363     return true;
1364 }
1365
1366 static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
1367                              NeonGenTwoOpEnvFn *fn)
1368 {
1369     /*
1370      * 2-reg-and-shift operations, size < 3 case, where the
1371      * helper needs to be passed cpu_env.
1372      */
1373     TCGv_i32 constimm, tmp;
1374     int pass;
1375
1376     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1377         return false;
1378     }
1379
1380     /* UNDEF accesses to D16-D31 if they don't exist. */
1381     if (!dc_isar_feature(aa32_simd_r32, s) &&
1382         ((a->vd | a->vm) & 0x10)) {
1383         return false;
1384     }
1385
1386     if ((a->vm | a->vd) & a->q) {
1387         return false;
1388     }
1389
1390     if (!vfp_access_check(s)) {
1391         return true;
1392     }
1393
1394     /*
1395      * To avoid excessive duplication of ops we implement shift
1396      * by immediate using the variable shift operations.
1397      */
1398     constimm = tcg_constant_i32(dup_const(a->size, a->shift));
1399     tmp = tcg_temp_new_i32();
1400
1401     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1402         read_neon_element32(tmp, a->vm, pass, MO_32);
1403         fn(tmp, cpu_env, tmp, constimm);
1404         write_neon_element32(tmp, a->vd, pass, MO_32);
1405     }
1406     tcg_temp_free_i32(tmp);
1407     return true;
1408 }
1409
1410 #define DO_2SHIFT_ENV(INSN, FUNC)                                       \
1411     static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
1412     {                                                                   \
1413         return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
1414     }                                                                   \
1415     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1416     {                                                                   \
1417         static NeonGenTwoOpEnvFn * const fns[] = {                      \
1418             gen_helper_neon_##FUNC##8,                                  \
1419             gen_helper_neon_##FUNC##16,                                 \
1420             gen_helper_neon_##FUNC##32,                                 \
1421         };                                                              \
1422         assert(a->size < ARRAY_SIZE(fns));                              \
1423         return do_2shift_env_32(s, a, fns[a->size]);                    \
1424     }
1425
1426 DO_2SHIFT_ENV(VQSHLU, qshlu_s)
1427 DO_2SHIFT_ENV(VQSHL_U, qshl_u)
1428 DO_2SHIFT_ENV(VQSHL_S, qshl_s)
1429
1430 static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
1431                                 NeonGenTwo64OpFn *shiftfn,
1432                                 NeonGenNarrowEnvFn *narrowfn)
1433 {
1434     /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
1435     TCGv_i64 constimm, rm1, rm2;
1436     TCGv_i32 rd;
1437
1438     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1439         return false;
1440     }
1441
1442     /* UNDEF accesses to D16-D31 if they don't exist. */
1443     if (!dc_isar_feature(aa32_simd_r32, s) &&
1444         ((a->vd | a->vm) & 0x10)) {
1445         return false;
1446     }
1447
1448     if (a->vm & 1) {
1449         return false;
1450     }
1451
1452     if (!vfp_access_check(s)) {
1453         return true;
1454     }
1455
1456     /*
1457      * This is always a right shift, and the shiftfn is always a
1458      * left-shift helper, which thus needs the negated shift count.
1459      */
1460     constimm = tcg_constant_i64(-a->shift);
1461     rm1 = tcg_temp_new_i64();
1462     rm2 = tcg_temp_new_i64();
1463     rd = tcg_temp_new_i32();
1464
1465     /* Load both inputs first to avoid potential overwrite if rm == rd */
1466     read_neon_element64(rm1, a->vm, 0, MO_64);
1467     read_neon_element64(rm2, a->vm, 1, MO_64);
1468
1469     shiftfn(rm1, rm1, constimm);
1470     narrowfn(rd, cpu_env, rm1);
1471     write_neon_element32(rd, a->vd, 0, MO_32);
1472
1473     shiftfn(rm2, rm2, constimm);
1474     narrowfn(rd, cpu_env, rm2);
1475     write_neon_element32(rd, a->vd, 1, MO_32);
1476
1477     tcg_temp_free_i32(rd);
1478     tcg_temp_free_i64(rm1);
1479     tcg_temp_free_i64(rm2);
1480
1481     return true;
1482 }
1483
1484 static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
1485                                 NeonGenTwoOpFn *shiftfn,
1486                                 NeonGenNarrowEnvFn *narrowfn)
1487 {
1488     /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
1489     TCGv_i32 constimm, rm1, rm2, rm3, rm4;
1490     TCGv_i64 rtmp;
1491     uint32_t imm;
1492
1493     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1494         return false;
1495     }
1496
1497     /* UNDEF accesses to D16-D31 if they don't exist. */
1498     if (!dc_isar_feature(aa32_simd_r32, s) &&
1499         ((a->vd | a->vm) & 0x10)) {
1500         return false;
1501     }
1502
1503     if (a->vm & 1) {
1504         return false;
1505     }
1506
1507     if (!vfp_access_check(s)) {
1508         return true;
1509     }
1510
1511     /*
1512      * This is always a right shift, and the shiftfn is always a
1513      * left-shift helper, which thus needs the negated shift count
1514      * duplicated into each lane of the immediate value.
1515      */
1516     if (a->size == 1) {
1517         imm = (uint16_t)(-a->shift);
1518         imm |= imm << 16;
1519     } else {
1520         /* size == 2 */
1521         imm = -a->shift;
1522     }
1523     constimm = tcg_constant_i32(imm);
1524
1525     /* Load all inputs first to avoid potential overwrite */
1526     rm1 = tcg_temp_new_i32();
1527     rm2 = tcg_temp_new_i32();
1528     rm3 = tcg_temp_new_i32();
1529     rm4 = tcg_temp_new_i32();
1530     read_neon_element32(rm1, a->vm, 0, MO_32);
1531     read_neon_element32(rm2, a->vm, 1, MO_32);
1532     read_neon_element32(rm3, a->vm, 2, MO_32);
1533     read_neon_element32(rm4, a->vm, 3, MO_32);
1534     rtmp = tcg_temp_new_i64();
1535
1536     shiftfn(rm1, rm1, constimm);
1537     shiftfn(rm2, rm2, constimm);
1538
1539     tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
1540     tcg_temp_free_i32(rm2);
1541
1542     narrowfn(rm1, cpu_env, rtmp);
1543     write_neon_element32(rm1, a->vd, 0, MO_32);
1544     tcg_temp_free_i32(rm1);
1545
1546     shiftfn(rm3, rm3, constimm);
1547     shiftfn(rm4, rm4, constimm);
1548
1549     tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
1550     tcg_temp_free_i32(rm4);
1551
1552     narrowfn(rm3, cpu_env, rtmp);
1553     tcg_temp_free_i64(rtmp);
1554     write_neon_element32(rm3, a->vd, 1, MO_32);
1555     tcg_temp_free_i32(rm3);
1556     return true;
1557 }
1558
/*
 * Define trans_<INSN>_2sh() for a narrowing shift.  DO_2SN_64() is for
 * the size == 3 case and goes through do_2shift_narrow_64();
 * DO_2SN_32() is for size < 3 and goes through do_2shift_narrow_32().
 * SHIFTFN and NARROWFN are passed through unchanged.
 */
#define DO_2SN_64(INSN, SHIFTFN, NARROWFN)                              \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_2shift_narrow_64(s, a, SHIFTFN, NARROWFN);            \
    }

#define DO_2SN_32(INSN, SHIFTFN, NARROWFN)                              \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_2shift_narrow_32(s, a, SHIFTFN, NARROWFN);            \
    }
1569
1570 static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1571 {
1572     tcg_gen_extrl_i64_i32(dest, src);
1573 }
1574
1575 static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1576 {
1577     gen_helper_neon_narrow_u16(dest, src);
1578 }
1579
1580 static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1581 {
1582     gen_helper_neon_narrow_u8(dest, src);
1583 }
1584
1585 DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
1586 DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
1587 DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)
1588
1589 DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
1590 DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
1591 DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)
1592
1593 DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
1594 DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
1595 DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)
1596
1597 DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
1598 DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
1599 DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
1600 DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
1601 DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
1602 DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)
1603
1604 DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
1605 DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
1606 DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)
1607
1608 DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
1609 DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
1610 DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)
1611
1612 DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
1613 DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
1614 DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1615
1616 static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
1617                          NeonGenWidenFn *widenfn, bool u)
1618 {
1619     TCGv_i64 tmp;
1620     TCGv_i32 rm0, rm1;
1621     uint64_t widen_mask = 0;
1622
1623     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1624         return false;
1625     }
1626
1627     /* UNDEF accesses to D16-D31 if they don't exist. */
1628     if (!dc_isar_feature(aa32_simd_r32, s) &&
1629         ((a->vd | a->vm) & 0x10)) {
1630         return false;
1631     }
1632
1633     if (a->vd & 1) {
1634         return false;
1635     }
1636
1637     if (!vfp_access_check(s)) {
1638         return true;
1639     }
1640
1641     /*
1642      * This is a widen-and-shift operation. The shift is always less
1643      * than the width of the source type, so after widening the input
1644      * vector we can simply shift the whole 64-bit widened register,
1645      * and then clear the potential overflow bits resulting from left
1646      * bits of the narrow input appearing as right bits of the left
1647      * neighbour narrow input. Calculate a mask of bits to clear.
1648      */
1649     if ((a->shift != 0) && (a->size < 2 || u)) {
1650         int esize = 8 << a->size;
1651         widen_mask = MAKE_64BIT_MASK(0, esize);
1652         widen_mask >>= esize - a->shift;
1653         widen_mask = dup_const(a->size + 1, widen_mask);
1654     }
1655
1656     rm0 = tcg_temp_new_i32();
1657     rm1 = tcg_temp_new_i32();
1658     read_neon_element32(rm0, a->vm, 0, MO_32);
1659     read_neon_element32(rm1, a->vm, 1, MO_32);
1660     tmp = tcg_temp_new_i64();
1661
1662     widenfn(tmp, rm0);
1663     tcg_temp_free_i32(rm0);
1664     if (a->shift != 0) {
1665         tcg_gen_shli_i64(tmp, tmp, a->shift);
1666         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1667     }
1668     write_neon_element64(tmp, a->vd, 0, MO_64);
1669
1670     widenfn(tmp, rm1);
1671     tcg_temp_free_i32(rm1);
1672     if (a->shift != 0) {
1673         tcg_gen_shli_i64(tmp, tmp, a->shift);
1674         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1675     }
1676     write_neon_element64(tmp, a->vd, 1, MO_64);
1677     tcg_temp_free_i64(tmp);
1678     return true;
1679 }
1680
1681 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1682 {
1683     static NeonGenWidenFn * const widenfn[] = {
1684         gen_helper_neon_widen_s8,
1685         gen_helper_neon_widen_s16,
1686         tcg_gen_ext_i32_i64,
1687     };
1688     return do_vshll_2sh(s, a, widenfn[a->size], false);
1689 }
1690
1691 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1692 {
1693     static NeonGenWidenFn * const widenfn[] = {
1694         gen_helper_neon_widen_u8,
1695         gen_helper_neon_widen_u16,
1696         tcg_gen_extu_i32_i64,
1697     };
1698     return do_vshll_2sh(s, a, widenfn[a->size], true);
1699 }
1700
1701 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1702                       gen_helper_gvec_2_ptr *fn)
1703 {
1704     /* FP operations in 2-reg-and-shift group */
1705     int vec_size = a->q ? 16 : 8;
1706     int rd_ofs = neon_full_reg_offset(a->vd);
1707     int rm_ofs = neon_full_reg_offset(a->vm);
1708     TCGv_ptr fpst;
1709
1710     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1711         return false;
1712     }
1713
1714     if (a->size == MO_16) {
1715         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1716             return false;
1717         }
1718     }
1719
1720     /* UNDEF accesses to D16-D31 if they don't exist. */
1721     if (!dc_isar_feature(aa32_simd_r32, s) &&
1722         ((a->vd | a->vm) & 0x10)) {
1723         return false;
1724     }
1725
1726     if ((a->vm | a->vd) & a->q) {
1727         return false;
1728     }
1729
1730     if (!vfp_access_check(s)) {
1731         return true;
1732     }
1733
1734     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1735     tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
1736     tcg_temp_free_ptr(fpst);
1737     return true;
1738 }
1739
1740 #define DO_FP_2SH(INSN, FUNC)                                           \
1741     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1742     {                                                                   \
1743         return do_fp_2sh(s, a, FUNC);                                   \
1744     }
1745
1746 DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
1747 DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
1748 DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
1749 DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)
1750
1751 DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
1752 DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
1753 DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
1754 DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
1755
1756 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1757                         GVecGen2iFn *fn)
1758 {
1759     uint64_t imm;
1760     int reg_ofs, vec_size;
1761
1762     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1763         return false;
1764     }
1765
1766     /* UNDEF accesses to D16-D31 if they don't exist. */
1767     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1768         return false;
1769     }
1770
1771     if (a->vd & a->q) {
1772         return false;
1773     }
1774
1775     if (!vfp_access_check(s)) {
1776         return true;
1777     }
1778
1779     reg_ofs = neon_full_reg_offset(a->vd);
1780     vec_size = a->q ? 16 : 8;
1781     imm = asimd_imm_const(a->imm, a->cmode, a->op);
1782
1783     fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1784     return true;
1785 }
1786
1787 static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
1788                         int64_t c, uint32_t oprsz, uint32_t maxsz)
1789 {
1790     tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
1791 }
1792
1793 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1794 {
1795     /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1796     GVecGen2iFn *fn;
1797
1798     if ((a->cmode & 1) && a->cmode < 12) {
1799         /* for op=1, the imm will be inverted, so BIC becomes AND. */
1800         fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1801     } else {
1802         /* There is one unallocated cmode/op combination in this space */
1803         if (a->cmode == 15 && a->op == 1) {
1804             return false;
1805         }
1806         fn = gen_VMOV_1r;
1807     }
1808     return do_1reg_imm(s, a, fn);
1809 }
1810
1811 static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1812                            NeonGenWidenFn *widenfn,
1813                            NeonGenTwo64OpFn *opfn,
1814                            int src1_mop, int src2_mop)
1815 {
1816     /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VAADW/VSUBW) */
1817     TCGv_i64 rn0_64, rn1_64, rm_64;
1818
1819     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1820         return false;
1821     }
1822
1823     /* UNDEF accesses to D16-D31 if they don't exist. */
1824     if (!dc_isar_feature(aa32_simd_r32, s) &&
1825         ((a->vd | a->vn | a->vm) & 0x10)) {
1826         return false;
1827     }
1828
1829     if (!opfn) {
1830         /* size == 3 case, which is an entirely different insn group */
1831         return false;
1832     }
1833
1834     if ((a->vd & 1) || (src1_mop == MO_UQ && (a->vn & 1))) {
1835         return false;
1836     }
1837
1838     if (!vfp_access_check(s)) {
1839         return true;
1840     }
1841
1842     rn0_64 = tcg_temp_new_i64();
1843     rn1_64 = tcg_temp_new_i64();
1844     rm_64 = tcg_temp_new_i64();
1845
1846     if (src1_mop >= 0) {
1847         read_neon_element64(rn0_64, a->vn, 0, src1_mop);
1848     } else {
1849         TCGv_i32 tmp = tcg_temp_new_i32();
1850         read_neon_element32(tmp, a->vn, 0, MO_32);
1851         widenfn(rn0_64, tmp);
1852         tcg_temp_free_i32(tmp);
1853     }
1854     if (src2_mop >= 0) {
1855         read_neon_element64(rm_64, a->vm, 0, src2_mop);
1856     } else {
1857         TCGv_i32 tmp = tcg_temp_new_i32();
1858         read_neon_element32(tmp, a->vm, 0, MO_32);
1859         widenfn(rm_64, tmp);
1860         tcg_temp_free_i32(tmp);
1861     }
1862
1863     opfn(rn0_64, rn0_64, rm_64);
1864
1865     /*
1866      * Load second pass inputs before storing the first pass result, to
1867      * avoid incorrect results if a narrow input overlaps with the result.
1868      */
1869     if (src1_mop >= 0) {
1870         read_neon_element64(rn1_64, a->vn, 1, src1_mop);
1871     } else {
1872         TCGv_i32 tmp = tcg_temp_new_i32();
1873         read_neon_element32(tmp, a->vn, 1, MO_32);
1874         widenfn(rn1_64, tmp);
1875         tcg_temp_free_i32(tmp);
1876     }
1877     if (src2_mop >= 0) {
1878         read_neon_element64(rm_64, a->vm, 1, src2_mop);
1879     } else {
1880         TCGv_i32 tmp = tcg_temp_new_i32();
1881         read_neon_element32(tmp, a->vm, 1, MO_32);
1882         widenfn(rm_64, tmp);
1883         tcg_temp_free_i32(tmp);
1884     }
1885
1886     write_neon_element64(rn0_64, a->vd, 0, MO_64);
1887
1888     opfn(rn1_64, rn1_64, rm_64);
1889     write_neon_element64(rn1_64, a->vd, 1, MO_64);
1890
1891     tcg_temp_free_i64(rn0_64);
1892     tcg_temp_free_i64(rn1_64);
1893     tcg_temp_free_i64(rm_64);
1894
1895     return true;
1896 }
1897
1898 #define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN)                        \
1899     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1900     {                                                                   \
1901         static NeonGenWidenFn * const widenfn[] = {                     \
1902             gen_helper_neon_widen_##S##8,                               \
1903             gen_helper_neon_widen_##S##16,                              \
1904             NULL, NULL,                                                 \
1905         };                                                              \
1906         static NeonGenTwo64OpFn * const addfn[] = {                     \
1907             gen_helper_neon_##OP##l_u16,                                \
1908             gen_helper_neon_##OP##l_u32,                                \
1909             tcg_gen_##OP##_i64,                                         \
1910             NULL,                                                       \
1911         };                                                              \
1912         int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1;          \
1913         return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size],   \
1914                               SRC1WIDE ? MO_UQ : narrow_mop,             \
1915                               narrow_mop);                              \
1916     }
1917
1918 DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN)
1919 DO_PREWIDEN(VADDL_U, u, add, false, 0)
1920 DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN)
1921 DO_PREWIDEN(VSUBL_U, u, sub, false, 0)
1922 DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN)
1923 DO_PREWIDEN(VADDW_U, u, add, true, 0)
1924 DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN)
1925 DO_PREWIDEN(VSUBW_U, u, sub, true, 0)
1926
1927 static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
1928                          NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
1929 {
1930     /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
1931     TCGv_i64 rn_64, rm_64;
1932     TCGv_i32 rd0, rd1;
1933
1934     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1935         return false;
1936     }
1937
1938     /* UNDEF accesses to D16-D31 if they don't exist. */
1939     if (!dc_isar_feature(aa32_simd_r32, s) &&
1940         ((a->vd | a->vn | a->vm) & 0x10)) {
1941         return false;
1942     }
1943
1944     if (!opfn || !narrowfn) {
1945         /* size == 3 case, which is an entirely different insn group */
1946         return false;
1947     }
1948
1949     if ((a->vn | a->vm) & 1) {
1950         return false;
1951     }
1952
1953     if (!vfp_access_check(s)) {
1954         return true;
1955     }
1956
1957     rn_64 = tcg_temp_new_i64();
1958     rm_64 = tcg_temp_new_i64();
1959     rd0 = tcg_temp_new_i32();
1960     rd1 = tcg_temp_new_i32();
1961
1962     read_neon_element64(rn_64, a->vn, 0, MO_64);
1963     read_neon_element64(rm_64, a->vm, 0, MO_64);
1964
1965     opfn(rn_64, rn_64, rm_64);
1966
1967     narrowfn(rd0, rn_64);
1968
1969     read_neon_element64(rn_64, a->vn, 1, MO_64);
1970     read_neon_element64(rm_64, a->vm, 1, MO_64);
1971
1972     opfn(rn_64, rn_64, rm_64);
1973
1974     narrowfn(rd1, rn_64);
1975
1976     write_neon_element32(rd0, a->vd, 0, MO_32);
1977     write_neon_element32(rd1, a->vd, 1, MO_32);
1978
1979     tcg_temp_free_i32(rd0);
1980     tcg_temp_free_i32(rd1);
1981     tcg_temp_free_i64(rn_64);
1982     tcg_temp_free_i64(rm_64);
1983
1984     return true;
1985 }
1986
1987 #define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
1988     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1989     {                                                                   \
1990         static NeonGenTwo64OpFn * const addfn[] = {                     \
1991             gen_helper_neon_##OP##l_u16,                                \
1992             gen_helper_neon_##OP##l_u32,                                \
1993             tcg_gen_##OP##_i64,                                         \
1994             NULL,                                                       \
1995         };                                                              \
1996         static NeonGenNarrowFn * const narrowfn[] = {                   \
1997             gen_helper_neon_##NARROWTYPE##_high_u8,                     \
1998             gen_helper_neon_##NARROWTYPE##_high_u16,                    \
1999             EXTOP,                                                      \
2000             NULL,                                                       \
2001         };                                                              \
2002         return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]);   \
2003     }
2004
2005 static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
2006 {
2007     tcg_gen_addi_i64(rn, rn, 1u << 31);
2008     tcg_gen_extrh_i64_i32(rd, rn);
2009 }
2010
2011 DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
2012 DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
2013 DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
2014 DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
2015
2016 static bool do_long_3d(DisasContext *s, arg_3diff *a,
2017                        NeonGenTwoOpWidenFn *opfn,
2018                        NeonGenTwo64OpFn *accfn)
2019 {
2020     /*
2021      * 3-regs different lengths, long operations.
2022      * These perform an operation on two inputs that returns a double-width
2023      * result, and then possibly perform an accumulation operation of
2024      * that result into the double-width destination.
2025      */
2026     TCGv_i64 rd0, rd1, tmp;
2027     TCGv_i32 rn, rm;
2028
2029     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2030         return false;
2031     }
2032
2033     /* UNDEF accesses to D16-D31 if they don't exist. */
2034     if (!dc_isar_feature(aa32_simd_r32, s) &&
2035         ((a->vd | a->vn | a->vm) & 0x10)) {
2036         return false;
2037     }
2038
2039     if (!opfn) {
2040         /* size == 3 case, which is an entirely different insn group */
2041         return false;
2042     }
2043
2044     if (a->vd & 1) {
2045         return false;
2046     }
2047
2048     if (!vfp_access_check(s)) {
2049         return true;
2050     }
2051
2052     rd0 = tcg_temp_new_i64();
2053     rd1 = tcg_temp_new_i64();
2054
2055     rn = tcg_temp_new_i32();
2056     rm = tcg_temp_new_i32();
2057     read_neon_element32(rn, a->vn, 0, MO_32);
2058     read_neon_element32(rm, a->vm, 0, MO_32);
2059     opfn(rd0, rn, rm);
2060
2061     read_neon_element32(rn, a->vn, 1, MO_32);
2062     read_neon_element32(rm, a->vm, 1, MO_32);
2063     opfn(rd1, rn, rm);
2064     tcg_temp_free_i32(rn);
2065     tcg_temp_free_i32(rm);
2066
2067     /* Don't store results until after all loads: they might overlap */
2068     if (accfn) {
2069         tmp = tcg_temp_new_i64();
2070         read_neon_element64(tmp, a->vd, 0, MO_64);
2071         accfn(rd0, tmp, rd0);
2072         read_neon_element64(tmp, a->vd, 1, MO_64);
2073         accfn(rd1, tmp, rd1);
2074         tcg_temp_free_i64(tmp);
2075     }
2076
2077     write_neon_element64(rd0, a->vd, 0, MO_64);
2078     write_neon_element64(rd1, a->vd, 1, MO_64);
2079     tcg_temp_free_i64(rd0);
2080     tcg_temp_free_i64(rd1);
2081
2082     return true;
2083 }
2084
2085 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2086 {
2087     static NeonGenTwoOpWidenFn * const opfn[] = {
2088         gen_helper_neon_abdl_s16,
2089         gen_helper_neon_abdl_s32,
2090         gen_helper_neon_abdl_s64,
2091         NULL,
2092     };
2093
2094     return do_long_3d(s, a, opfn[a->size], NULL);
2095 }
2096
2097 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2098 {
2099     static NeonGenTwoOpWidenFn * const opfn[] = {
2100         gen_helper_neon_abdl_u16,
2101         gen_helper_neon_abdl_u32,
2102         gen_helper_neon_abdl_u64,
2103         NULL,
2104     };
2105
2106     return do_long_3d(s, a, opfn[a->size], NULL);
2107 }
2108
2109 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2110 {
2111     static NeonGenTwoOpWidenFn * const opfn[] = {
2112         gen_helper_neon_abdl_s16,
2113         gen_helper_neon_abdl_s32,
2114         gen_helper_neon_abdl_s64,
2115         NULL,
2116     };
2117     static NeonGenTwo64OpFn * const addfn[] = {
2118         gen_helper_neon_addl_u16,
2119         gen_helper_neon_addl_u32,
2120         tcg_gen_add_i64,
2121         NULL,
2122     };
2123
2124     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2125 }
2126
2127 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2128 {
2129     static NeonGenTwoOpWidenFn * const opfn[] = {
2130         gen_helper_neon_abdl_u16,
2131         gen_helper_neon_abdl_u32,
2132         gen_helper_neon_abdl_u64,
2133         NULL,
2134     };
2135     static NeonGenTwo64OpFn * const addfn[] = {
2136         gen_helper_neon_addl_u16,
2137         gen_helper_neon_addl_u32,
2138         tcg_gen_add_i64,
2139         NULL,
2140     };
2141
2142     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2143 }
2144
2145 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2146 {
2147     TCGv_i32 lo = tcg_temp_new_i32();
2148     TCGv_i32 hi = tcg_temp_new_i32();
2149
2150     tcg_gen_muls2_i32(lo, hi, rn, rm);
2151     tcg_gen_concat_i32_i64(rd, lo, hi);
2152
2153     tcg_temp_free_i32(lo);
2154     tcg_temp_free_i32(hi);
2155 }
2156
2157 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2158 {
2159     TCGv_i32 lo = tcg_temp_new_i32();
2160     TCGv_i32 hi = tcg_temp_new_i32();
2161
2162     tcg_gen_mulu2_i32(lo, hi, rn, rm);
2163     tcg_gen_concat_i32_i64(rd, lo, hi);
2164
2165     tcg_temp_free_i32(lo);
2166     tcg_temp_free_i32(hi);
2167 }
2168
2169 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2170 {
2171     static NeonGenTwoOpWidenFn * const opfn[] = {
2172         gen_helper_neon_mull_s8,
2173         gen_helper_neon_mull_s16,
2174         gen_mull_s32,
2175         NULL,
2176     };
2177
2178     return do_long_3d(s, a, opfn[a->size], NULL);
2179 }
2180
2181 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2182 {
2183     static NeonGenTwoOpWidenFn * const opfn[] = {
2184         gen_helper_neon_mull_u8,
2185         gen_helper_neon_mull_u16,
2186         gen_mull_u32,
2187         NULL,
2188     };
2189
2190     return do_long_3d(s, a, opfn[a->size], NULL);
2191 }
2192
2193 #define DO_VMLAL(INSN,MULL,ACC)                                         \
2194     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2195     {                                                                   \
2196         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2197             gen_helper_neon_##MULL##8,                                  \
2198             gen_helper_neon_##MULL##16,                                 \
2199             gen_##MULL##32,                                             \
2200             NULL,                                                       \
2201         };                                                              \
2202         static NeonGenTwo64OpFn * const accfn[] = {                     \
2203             gen_helper_neon_##ACC##l_u16,                               \
2204             gen_helper_neon_##ACC##l_u32,                               \
2205             tcg_gen_##ACC##_i64,                                        \
2206             NULL,                                                       \
2207         };                                                              \
2208         return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
2209     }
2210
2211 DO_VMLAL(VMLAL_S,mull_s,add)
2212 DO_VMLAL(VMLAL_U,mull_u,add)
2213 DO_VMLAL(VMLSL_S,mull_s,sub)
2214 DO_VMLAL(VMLSL_U,mull_u,sub)
2215
2216 static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2217 {
2218     gen_helper_neon_mull_s16(rd, rn, rm);
2219     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
2220 }
2221
2222 static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2223 {
2224     gen_mull_s32(rd, rn, rm);
2225     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
2226 }
2227
2228 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2229 {
2230     static NeonGenTwoOpWidenFn * const opfn[] = {
2231         NULL,
2232         gen_VQDMULL_16,
2233         gen_VQDMULL_32,
2234         NULL,
2235     };
2236
2237     return do_long_3d(s, a, opfn[a->size], NULL);
2238 }
2239
2240 static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2241 {
2242     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2243 }
2244
2245 static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2246 {
2247     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2248 }
2249
2250 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2251 {
2252     static NeonGenTwoOpWidenFn * const opfn[] = {
2253         NULL,
2254         gen_VQDMULL_16,
2255         gen_VQDMULL_32,
2256         NULL,
2257     };
2258     static NeonGenTwo64OpFn * const accfn[] = {
2259         NULL,
2260         gen_VQDMLAL_acc_16,
2261         gen_VQDMLAL_acc_32,
2262         NULL,
2263     };
2264
2265     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2266 }
2267
2268 static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2269 {
2270     gen_helper_neon_negl_u32(rm, rm);
2271     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2272 }
2273
2274 static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2275 {
2276     tcg_gen_neg_i64(rm, rm);
2277     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2278 }
2279
2280 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2281 {
2282     static NeonGenTwoOpWidenFn * const opfn[] = {
2283         NULL,
2284         gen_VQDMULL_16,
2285         gen_VQDMULL_32,
2286         NULL,
2287     };
2288     static NeonGenTwo64OpFn * const accfn[] = {
2289         NULL,
2290         gen_VQDMLSL_acc_16,
2291         gen_VQDMLSL_acc_32,
2292         NULL,
2293     };
2294
2295     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2296 }
2297
2298 static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2299 {
2300     gen_helper_gvec_3 *fn_gvec;
2301
2302     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2303         return false;
2304     }
2305
2306     /* UNDEF accesses to D16-D31 if they don't exist. */
2307     if (!dc_isar_feature(aa32_simd_r32, s) &&
2308         ((a->vd | a->vn | a->vm) & 0x10)) {
2309         return false;
2310     }
2311
2312     if (a->vd & 1) {
2313         return false;
2314     }
2315
2316     switch (a->size) {
2317     case 0:
2318         fn_gvec = gen_helper_neon_pmull_h;
2319         break;
2320     case 2:
2321         if (!dc_isar_feature(aa32_pmull, s)) {
2322             return false;
2323         }
2324         fn_gvec = gen_helper_gvec_pmull_q;
2325         break;
2326     default:
2327         return false;
2328     }
2329
2330     if (!vfp_access_check(s)) {
2331         return true;
2332     }
2333
2334     tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd),
2335                        neon_full_reg_offset(a->vn),
2336                        neon_full_reg_offset(a->vm),
2337                        16, 16, 0, fn_gvec);
2338     return true;
2339 }
2340
2341 static void gen_neon_dup_low16(TCGv_i32 var)
2342 {
2343     TCGv_i32 tmp = tcg_temp_new_i32();
2344     tcg_gen_ext16u_i32(var, var);
2345     tcg_gen_shli_i32(tmp, var, 16);
2346     tcg_gen_or_i32(var, var, tmp);
2347     tcg_temp_free_i32(tmp);
2348 }
2349
2350 static void gen_neon_dup_high16(TCGv_i32 var)
2351 {
2352     TCGv_i32 tmp = tcg_temp_new_i32();
2353     tcg_gen_andi_i32(var, var, 0xffff0000);
2354     tcg_gen_shri_i32(tmp, var, 16);
2355     tcg_gen_or_i32(var, var, tmp);
2356     tcg_temp_free_i32(tmp);
2357 }
2358
2359 static inline TCGv_i32 neon_get_scalar(int size, int reg)
2360 {
2361     TCGv_i32 tmp = tcg_temp_new_i32();
2362     if (size == MO_16) {
2363         read_neon_element32(tmp, reg & 7, reg >> 4, MO_32);
2364         if (reg & 8) {
2365             gen_neon_dup_high16(tmp);
2366         } else {
2367             gen_neon_dup_low16(tmp);
2368         }
2369     } else {
2370         read_neon_element32(tmp, reg & 15, reg >> 4, MO_32);
2371     }
2372     return tmp;
2373 }
2374
2375 static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2376                        NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2377 {
2378     /*
2379      * Two registers and a scalar: perform an operation between
2380      * the input elements and the scalar, and then possibly
2381      * perform an accumulation operation of that result into the
2382      * destination.
2383      */
2384     TCGv_i32 scalar, tmp;
2385     int pass;
2386
2387     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2388         return false;
2389     }
2390
2391     /* UNDEF accesses to D16-D31 if they don't exist. */
2392     if (!dc_isar_feature(aa32_simd_r32, s) &&
2393         ((a->vd | a->vn | a->vm) & 0x10)) {
2394         return false;
2395     }
2396
2397     if (!opfn) {
2398         /* Bad size (including size == 3, which is a different insn group) */
2399         return false;
2400     }
2401
2402     if (a->q && ((a->vd | a->vn) & 1)) {
2403         return false;
2404     }
2405
2406     if (!vfp_access_check(s)) {
2407         return true;
2408     }
2409
2410     scalar = neon_get_scalar(a->size, a->vm);
2411     tmp = tcg_temp_new_i32();
2412
2413     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2414         read_neon_element32(tmp, a->vn, pass, MO_32);
2415         opfn(tmp, tmp, scalar);
2416         if (accfn) {
2417             TCGv_i32 rd = tcg_temp_new_i32();
2418             read_neon_element32(rd, a->vd, pass, MO_32);
2419             accfn(tmp, rd, tmp);
2420             tcg_temp_free_i32(rd);
2421         }
2422         write_neon_element32(tmp, a->vd, pass, MO_32);
2423     }
2424     tcg_temp_free_i32(tmp);
2425     tcg_temp_free_i32(scalar);
2426     return true;
2427 }
2428
2429 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2430 {
2431     static NeonGenTwoOpFn * const opfn[] = {
2432         NULL,
2433         gen_helper_neon_mul_u16,
2434         tcg_gen_mul_i32,
2435         NULL,
2436     };
2437
2438     return do_2scalar(s, a, opfn[a->size], NULL);
2439 }
2440
2441 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2442 {
2443     static NeonGenTwoOpFn * const opfn[] = {
2444         NULL,
2445         gen_helper_neon_mul_u16,
2446         tcg_gen_mul_i32,
2447         NULL,
2448     };
2449     static NeonGenTwoOpFn * const accfn[] = {
2450         NULL,
2451         gen_helper_neon_add_u16,
2452         tcg_gen_add_i32,
2453         NULL,
2454     };
2455
2456     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2457 }
2458
2459 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2460 {
2461     static NeonGenTwoOpFn * const opfn[] = {
2462         NULL,
2463         gen_helper_neon_mul_u16,
2464         tcg_gen_mul_i32,
2465         NULL,
2466     };
2467     static NeonGenTwoOpFn * const accfn[] = {
2468         NULL,
2469         gen_helper_neon_sub_u16,
2470         tcg_gen_sub_i32,
2471         NULL,
2472     };
2473
2474     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2475 }
2476
2477 static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
2478                               gen_helper_gvec_3_ptr *fn)
2479 {
2480     /* Two registers and a scalar, using gvec */
2481     int vec_size = a->q ? 16 : 8;
2482     int rd_ofs = neon_full_reg_offset(a->vd);
2483     int rn_ofs = neon_full_reg_offset(a->vn);
2484     int rm_ofs;
2485     int idx;
2486     TCGv_ptr fpstatus;
2487
2488     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2489         return false;
2490     }
2491
2492     /* UNDEF accesses to D16-D31 if they don't exist. */
2493     if (!dc_isar_feature(aa32_simd_r32, s) &&
2494         ((a->vd | a->vn | a->vm) & 0x10)) {
2495         return false;
2496     }
2497
2498     if (!fn) {
2499         /* Bad size (including size == 3, which is a different insn group) */
2500         return false;
2501     }
2502
2503     if (a->q && ((a->vd | a->vn) & 1)) {
2504         return false;
2505     }
2506
2507     if (!vfp_access_check(s)) {
2508         return true;
2509     }
2510
2511     /* a->vm is M:Vm, which encodes both register and index */
2512     idx = extract32(a->vm, a->size + 2, 2);
2513     a->vm = extract32(a->vm, 0, a->size + 2);
2514     rm_ofs = neon_full_reg_offset(a->vm);
2515
2516     fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
2517     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
2518                        vec_size, vec_size, idx, fn);
2519     tcg_temp_free_ptr(fpstatus);
2520     return true;
2521 }
2522
2523 #define DO_VMUL_F_2sc(NAME, FUNC)                                       \
2524     static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a)   \
2525     {                                                                   \
2526         static gen_helper_gvec_3_ptr * const opfn[] = {                 \
2527             NULL,                                                       \
2528             gen_helper_##FUNC##_h,                                      \
2529             gen_helper_##FUNC##_s,                                      \
2530             NULL,                                                       \
2531         };                                                              \
2532         if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
2533             return false;                                               \
2534         }                                                               \
2535         return do_2scalar_fp_vec(s, a, opfn[a->size]);                  \
2536     }
2537
2538 DO_VMUL_F_2sc(VMUL, gvec_fmul_idx)
2539 DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx)
2540 DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx)
2541
2542 WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
2543 WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
2544 WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
2545 WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2546
2547 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2548 {
2549     static NeonGenTwoOpFn * const opfn[] = {
2550         NULL,
2551         gen_VQDMULH_16,
2552         gen_VQDMULH_32,
2553         NULL,
2554     };
2555
2556     return do_2scalar(s, a, opfn[a->size], NULL);
2557 }
2558
2559 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2560 {
2561     static NeonGenTwoOpFn * const opfn[] = {
2562         NULL,
2563         gen_VQRDMULH_16,
2564         gen_VQRDMULH_32,
2565         NULL,
2566     };
2567
2568     return do_2scalar(s, a, opfn[a->size], NULL);
2569 }
2570
2571 static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2572                             NeonGenThreeOpEnvFn *opfn)
2573 {
2574     /*
2575      * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2576      * performs a kind of fused op-then-accumulate using a helper
2577      * function that takes all of rd, rn and the scalar at once.
2578      */
2579     TCGv_i32 scalar, rn, rd;
2580     int pass;
2581
2582     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2583         return false;
2584     }
2585
2586     if (!dc_isar_feature(aa32_rdm, s)) {
2587         return false;
2588     }
2589
2590     /* UNDEF accesses to D16-D31 if they don't exist. */
2591     if (!dc_isar_feature(aa32_simd_r32, s) &&
2592         ((a->vd | a->vn | a->vm) & 0x10)) {
2593         return false;
2594     }
2595
2596     if (!opfn) {
2597         /* Bad size (including size == 3, which is a different insn group) */
2598         return false;
2599     }
2600
2601     if (a->q && ((a->vd | a->vn) & 1)) {
2602         return false;
2603     }
2604
2605     if (!vfp_access_check(s)) {
2606         return true;
2607     }
2608
2609     scalar = neon_get_scalar(a->size, a->vm);
2610     rn = tcg_temp_new_i32();
2611     rd = tcg_temp_new_i32();
2612
2613     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2614         read_neon_element32(rn, a->vn, pass, MO_32);
2615         read_neon_element32(rd, a->vd, pass, MO_32);
2616         opfn(rd, cpu_env, rn, scalar, rd);
2617         write_neon_element32(rd, a->vd, pass, MO_32);
2618     }
2619     tcg_temp_free_i32(rn);
2620     tcg_temp_free_i32(rd);
2621     tcg_temp_free_i32(scalar);
2622
2623     return true;
2624 }
2625
2626 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2627 {
2628     static NeonGenThreeOpEnvFn *opfn[] = {
2629         NULL,
2630         gen_helper_neon_qrdmlah_s16,
2631         gen_helper_neon_qrdmlah_s32,
2632         NULL,
2633     };
2634     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2635 }
2636
2637 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2638 {
2639     static NeonGenThreeOpEnvFn *opfn[] = {
2640         NULL,
2641         gen_helper_neon_qrdmlsh_s16,
2642         gen_helper_neon_qrdmlsh_s32,
2643         NULL,
2644     };
2645     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2646 }
2647
2648 static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2649                             NeonGenTwoOpWidenFn *opfn,
2650                             NeonGenTwo64OpFn *accfn)
2651 {
2652     /*
2653      * Two registers and a scalar, long operations: perform an
2654      * operation on the input elements and the scalar which produces
2655      * a double-width result, and then possibly perform an accumulation
2656      * operation of that result into the destination.
2657      */
2658     TCGv_i32 scalar, rn;
2659     TCGv_i64 rn0_64, rn1_64;
2660
2661     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2662         return false;
2663     }
2664
2665     /* UNDEF accesses to D16-D31 if they don't exist. */
2666     if (!dc_isar_feature(aa32_simd_r32, s) &&
2667         ((a->vd | a->vn | a->vm) & 0x10)) {
2668         return false;
2669     }
2670
2671     if (!opfn) {
2672         /* Bad size (including size == 3, which is a different insn group) */
2673         return false;
2674     }
2675
2676     if (a->vd & 1) {
2677         return false;
2678     }
2679
2680     if (!vfp_access_check(s)) {
2681         return true;
2682     }
2683
2684     scalar = neon_get_scalar(a->size, a->vm);
2685
2686     /* Load all inputs before writing any outputs, in case of overlap */
2687     rn = tcg_temp_new_i32();
2688     read_neon_element32(rn, a->vn, 0, MO_32);
2689     rn0_64 = tcg_temp_new_i64();
2690     opfn(rn0_64, rn, scalar);
2691
2692     read_neon_element32(rn, a->vn, 1, MO_32);
2693     rn1_64 = tcg_temp_new_i64();
2694     opfn(rn1_64, rn, scalar);
2695     tcg_temp_free_i32(rn);
2696     tcg_temp_free_i32(scalar);
2697
2698     if (accfn) {
2699         TCGv_i64 t64 = tcg_temp_new_i64();
2700         read_neon_element64(t64, a->vd, 0, MO_64);
2701         accfn(rn0_64, t64, rn0_64);
2702         read_neon_element64(t64, a->vd, 1, MO_64);
2703         accfn(rn1_64, t64, rn1_64);
2704         tcg_temp_free_i64(t64);
2705     }
2706
2707     write_neon_element64(rn0_64, a->vd, 0, MO_64);
2708     write_neon_element64(rn1_64, a->vd, 1, MO_64);
2709     tcg_temp_free_i64(rn0_64);
2710     tcg_temp_free_i64(rn1_64);
2711     return true;
2712 }
2713
2714 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2715 {
2716     static NeonGenTwoOpWidenFn * const opfn[] = {
2717         NULL,
2718         gen_helper_neon_mull_s16,
2719         gen_mull_s32,
2720         NULL,
2721     };
2722
2723     return do_2scalar_long(s, a, opfn[a->size], NULL);
2724 }
2725
2726 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2727 {
2728     static NeonGenTwoOpWidenFn * const opfn[] = {
2729         NULL,
2730         gen_helper_neon_mull_u16,
2731         gen_mull_u32,
2732         NULL,
2733     };
2734
2735     return do_2scalar_long(s, a, opfn[a->size], NULL);
2736 }
2737
2738 #define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
2739     static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
2740     {                                                                   \
2741         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2742             NULL,                                                       \
2743             gen_helper_neon_##MULL##16,                                 \
2744             gen_##MULL##32,                                             \
2745             NULL,                                                       \
2746         };                                                              \
2747         static NeonGenTwo64OpFn * const accfn[] = {                     \
2748             NULL,                                                       \
2749             gen_helper_neon_##ACC##l_u32,                               \
2750             tcg_gen_##ACC##_i64,                                        \
2751             NULL,                                                       \
2752         };                                                              \
2753         return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
2754     }
2755
2756 DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2757 DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2758 DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2759 DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2760
2761 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2762 {
2763     static NeonGenTwoOpWidenFn * const opfn[] = {
2764         NULL,
2765         gen_VQDMULL_16,
2766         gen_VQDMULL_32,
2767         NULL,
2768     };
2769
2770     return do_2scalar_long(s, a, opfn[a->size], NULL);
2771 }
2772
2773 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2774 {
2775     static NeonGenTwoOpWidenFn * const opfn[] = {
2776         NULL,
2777         gen_VQDMULL_16,
2778         gen_VQDMULL_32,
2779         NULL,
2780     };
2781     static NeonGenTwo64OpFn * const accfn[] = {
2782         NULL,
2783         gen_VQDMLAL_acc_16,
2784         gen_VQDMLAL_acc_32,
2785         NULL,
2786     };
2787
2788     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2789 }
2790
2791 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2792 {
2793     static NeonGenTwoOpWidenFn * const opfn[] = {
2794         NULL,
2795         gen_VQDMULL_16,
2796         gen_VQDMULL_32,
2797         NULL,
2798     };
2799     static NeonGenTwo64OpFn * const accfn[] = {
2800         NULL,
2801         gen_VQDMLSL_acc_16,
2802         gen_VQDMLSL_acc_32,
2803         NULL,
2804     };
2805
2806     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2807 }
2808
2809 static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2810 {
2811     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2812         return false;
2813     }
2814
2815     /* UNDEF accesses to D16-D31 if they don't exist. */
2816     if (!dc_isar_feature(aa32_simd_r32, s) &&
2817         ((a->vd | a->vn | a->vm) & 0x10)) {
2818         return false;
2819     }
2820
2821     if ((a->vn | a->vm | a->vd) & a->q) {
2822         return false;
2823     }
2824
2825     if (a->imm > 7 && !a->q) {
2826         return false;
2827     }
2828
2829     if (!vfp_access_check(s)) {
2830         return true;
2831     }
2832
2833     if (!a->q) {
2834         /* Extract 64 bits from <Vm:Vn> */
2835         TCGv_i64 left, right, dest;
2836
2837         left = tcg_temp_new_i64();
2838         right = tcg_temp_new_i64();
2839         dest = tcg_temp_new_i64();
2840
2841         read_neon_element64(right, a->vn, 0, MO_64);
2842         read_neon_element64(left, a->vm, 0, MO_64);
2843         tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2844         write_neon_element64(dest, a->vd, 0, MO_64);
2845
2846         tcg_temp_free_i64(left);
2847         tcg_temp_free_i64(right);
2848         tcg_temp_free_i64(dest);
2849     } else {
2850         /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2851         TCGv_i64 left, middle, right, destleft, destright;
2852
2853         left = tcg_temp_new_i64();
2854         middle = tcg_temp_new_i64();
2855         right = tcg_temp_new_i64();
2856         destleft = tcg_temp_new_i64();
2857         destright = tcg_temp_new_i64();
2858
2859         if (a->imm < 8) {
2860             read_neon_element64(right, a->vn, 0, MO_64);
2861             read_neon_element64(middle, a->vn, 1, MO_64);
2862             tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2863             read_neon_element64(left, a->vm, 0, MO_64);
2864             tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2865         } else {
2866             read_neon_element64(right, a->vn, 1, MO_64);
2867             read_neon_element64(middle, a->vm, 0, MO_64);
2868             tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2869             read_neon_element64(left, a->vm, 1, MO_64);
2870             tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2871         }
2872
2873         write_neon_element64(destright, a->vd, 0, MO_64);
2874         write_neon_element64(destleft, a->vd, 1, MO_64);
2875
2876         tcg_temp_free_i64(destright);
2877         tcg_temp_free_i64(destleft);
2878         tcg_temp_free_i64(right);
2879         tcg_temp_free_i64(middle);
2880         tcg_temp_free_i64(left);
2881     }
2882     return true;
2883 }
2884
2885 static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2886 {
2887     TCGv_i64 val, def;
2888     TCGv_i32 desc;
2889
2890     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2891         return false;
2892     }
2893
2894     /* UNDEF accesses to D16-D31 if they don't exist. */
2895     if (!dc_isar_feature(aa32_simd_r32, s) &&
2896         ((a->vd | a->vn | a->vm) & 0x10)) {
2897         return false;
2898     }
2899
2900     if ((a->vn + a->len + 1) > 32) {
2901         /*
2902          * This is UNPREDICTABLE; we choose to UNDEF to avoid the
2903          * helper function running off the end of the register file.
2904          */
2905         return false;
2906     }
2907
2908     if (!vfp_access_check(s)) {
2909         return true;
2910     }
2911
2912     desc = tcg_constant_i32((a->vn << 2) | a->len);
2913     def = tcg_temp_new_i64();
2914     if (a->op) {
2915         read_neon_element64(def, a->vd, 0, MO_64);
2916     } else {
2917         tcg_gen_movi_i64(def, 0);
2918     }
2919     val = tcg_temp_new_i64();
2920     read_neon_element64(val, a->vm, 0, MO_64);
2921
2922     gen_helper_neon_tbl(val, cpu_env, desc, val, def);
2923     write_neon_element64(val, a->vd, 0, MO_64);
2924
2925     tcg_temp_free_i64(def);
2926     tcg_temp_free_i64(val);
2927     return true;
2928 }
2929
2930 static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
2931 {
2932     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2933         return false;
2934     }
2935
2936     /* UNDEF accesses to D16-D31 if they don't exist. */
2937     if (!dc_isar_feature(aa32_simd_r32, s) &&
2938         ((a->vd | a->vm) & 0x10)) {
2939         return false;
2940     }
2941
2942     if (a->vd & a->q) {
2943         return false;
2944     }
2945
2946     if (!vfp_access_check(s)) {
2947         return true;
2948     }
2949
2950     tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd),
2951                          neon_element_offset(a->vm, a->index, a->size),
2952                          a->q ? 16 : 8, a->q ? 16 : 8);
2953     return true;
2954 }
2955
2956 static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
2957 {
2958     int pass, half;
2959     TCGv_i32 tmp[2];
2960
2961     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2962         return false;
2963     }
2964
2965     /* UNDEF accesses to D16-D31 if they don't exist. */
2966     if (!dc_isar_feature(aa32_simd_r32, s) &&
2967         ((a->vd | a->vm) & 0x10)) {
2968         return false;
2969     }
2970
2971     if ((a->vd | a->vm) & a->q) {
2972         return false;
2973     }
2974
2975     if (a->size == 3) {
2976         return false;
2977     }
2978
2979     if (!vfp_access_check(s)) {
2980         return true;
2981     }
2982
2983     tmp[0] = tcg_temp_new_i32();
2984     tmp[1] = tcg_temp_new_i32();
2985
2986     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
2987         for (half = 0; half < 2; half++) {
2988             read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32);
2989             switch (a->size) {
2990             case 0:
2991                 tcg_gen_bswap32_i32(tmp[half], tmp[half]);
2992                 break;
2993             case 1:
2994                 gen_swap_half(tmp[half], tmp[half]);
2995                 break;
2996             case 2:
2997                 break;
2998             default:
2999                 g_assert_not_reached();
3000             }
3001         }
3002         write_neon_element32(tmp[1], a->vd, pass * 2, MO_32);
3003         write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32);
3004     }
3005
3006     tcg_temp_free_i32(tmp[0]);
3007     tcg_temp_free_i32(tmp[1]);
3008     return true;
3009 }
3010
3011 static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
3012                               NeonGenWidenFn *widenfn,
3013                               NeonGenTwo64OpFn *opfn,
3014                               NeonGenTwo64OpFn *accfn)
3015 {
3016     /*
3017      * Pairwise long operations: widen both halves of the pair,
3018      * combine the pairs with the opfn, and then possibly accumulate
3019      * into the destination with the accfn.
3020      */
3021     int pass;
3022
3023     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3024         return false;
3025     }
3026
3027     /* UNDEF accesses to D16-D31 if they don't exist. */
3028     if (!dc_isar_feature(aa32_simd_r32, s) &&
3029         ((a->vd | a->vm) & 0x10)) {
3030         return false;
3031     }
3032
3033     if ((a->vd | a->vm) & a->q) {
3034         return false;
3035     }
3036
3037     if (!widenfn) {
3038         return false;
3039     }
3040
3041     if (!vfp_access_check(s)) {
3042         return true;
3043     }
3044
3045     for (pass = 0; pass < a->q + 1; pass++) {
3046         TCGv_i32 tmp;
3047         TCGv_i64 rm0_64, rm1_64, rd_64;
3048
3049         rm0_64 = tcg_temp_new_i64();
3050         rm1_64 = tcg_temp_new_i64();
3051         rd_64 = tcg_temp_new_i64();
3052
3053         tmp = tcg_temp_new_i32();
3054         read_neon_element32(tmp, a->vm, pass * 2, MO_32);
3055         widenfn(rm0_64, tmp);
3056         read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32);
3057         widenfn(rm1_64, tmp);
3058         tcg_temp_free_i32(tmp);
3059
3060         opfn(rd_64, rm0_64, rm1_64);
3061         tcg_temp_free_i64(rm0_64);
3062         tcg_temp_free_i64(rm1_64);
3063
3064         if (accfn) {
3065             TCGv_i64 tmp64 = tcg_temp_new_i64();
3066             read_neon_element64(tmp64, a->vd, pass, MO_64);
3067             accfn(rd_64, tmp64, rd_64);
3068             tcg_temp_free_i64(tmp64);
3069         }
3070         write_neon_element64(rd_64, a->vd, pass, MO_64);
3071         tcg_temp_free_i64(rd_64);
3072     }
3073     return true;
3074 }
3075
3076 static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
3077 {
3078     static NeonGenWidenFn * const widenfn[] = {
3079         gen_helper_neon_widen_s8,
3080         gen_helper_neon_widen_s16,
3081         tcg_gen_ext_i32_i64,
3082         NULL,
3083     };
3084     static NeonGenTwo64OpFn * const opfn[] = {
3085         gen_helper_neon_paddl_u16,
3086         gen_helper_neon_paddl_u32,
3087         tcg_gen_add_i64,
3088         NULL,
3089     };
3090
3091     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3092 }
3093
3094 static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
3095 {
3096     static NeonGenWidenFn * const widenfn[] = {
3097         gen_helper_neon_widen_u8,
3098         gen_helper_neon_widen_u16,
3099         tcg_gen_extu_i32_i64,
3100         NULL,
3101     };
3102     static NeonGenTwo64OpFn * const opfn[] = {
3103         gen_helper_neon_paddl_u16,
3104         gen_helper_neon_paddl_u32,
3105         tcg_gen_add_i64,
3106         NULL,
3107     };
3108
3109     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3110 }
3111
3112 static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
3113 {
3114     static NeonGenWidenFn * const widenfn[] = {
3115         gen_helper_neon_widen_s8,
3116         gen_helper_neon_widen_s16,
3117         tcg_gen_ext_i32_i64,
3118         NULL,
3119     };
3120     static NeonGenTwo64OpFn * const opfn[] = {
3121         gen_helper_neon_paddl_u16,
3122         gen_helper_neon_paddl_u32,
3123         tcg_gen_add_i64,
3124         NULL,
3125     };
3126     static NeonGenTwo64OpFn * const accfn[] = {
3127         gen_helper_neon_addl_u16,
3128         gen_helper_neon_addl_u32,
3129         tcg_gen_add_i64,
3130         NULL,
3131     };
3132
3133     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3134                              accfn[a->size]);
3135 }
3136
3137 static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
3138 {
3139     static NeonGenWidenFn * const widenfn[] = {
3140         gen_helper_neon_widen_u8,
3141         gen_helper_neon_widen_u16,
3142         tcg_gen_extu_i32_i64,
3143         NULL,
3144     };
3145     static NeonGenTwo64OpFn * const opfn[] = {
3146         gen_helper_neon_paddl_u16,
3147         gen_helper_neon_paddl_u32,
3148         tcg_gen_add_i64,
3149         NULL,
3150     };
3151     static NeonGenTwo64OpFn * const accfn[] = {
3152         gen_helper_neon_addl_u16,
3153         gen_helper_neon_addl_u32,
3154         tcg_gen_add_i64,
3155         NULL,
3156     };
3157
3158     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3159                              accfn[a->size]);
3160 }
3161
3162 typedef void ZipFn(TCGv_ptr, TCGv_ptr);
3163
3164 static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
3165                        ZipFn *fn)
3166 {
3167     TCGv_ptr pd, pm;
3168
3169     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3170         return false;
3171     }
3172
3173     /* UNDEF accesses to D16-D31 if they don't exist. */
3174     if (!dc_isar_feature(aa32_simd_r32, s) &&
3175         ((a->vd | a->vm) & 0x10)) {
3176         return false;
3177     }
3178
3179     if ((a->vd | a->vm) & a->q) {
3180         return false;
3181     }
3182
3183     if (!fn) {
3184         /* Bad size or size/q combination */
3185         return false;
3186     }
3187
3188     if (!vfp_access_check(s)) {
3189         return true;
3190     }
3191
3192     pd = vfp_reg_ptr(true, a->vd);
3193     pm = vfp_reg_ptr(true, a->vm);
3194     fn(pd, pm);
3195     tcg_temp_free_ptr(pd);
3196     tcg_temp_free_ptr(pm);
3197     return true;
3198 }
3199
3200 static bool trans_VUZP(DisasContext *s, arg_2misc *a)
3201 {
3202     static ZipFn * const fn[2][4] = {
3203         {
3204             gen_helper_neon_unzip8,
3205             gen_helper_neon_unzip16,
3206             NULL,
3207             NULL,
3208         }, {
3209             gen_helper_neon_qunzip8,
3210             gen_helper_neon_qunzip16,
3211             gen_helper_neon_qunzip32,
3212             NULL,
3213         }
3214     };
3215     return do_zip_uzp(s, a, fn[a->q][a->size]);
3216 }
3217
3218 static bool trans_VZIP(DisasContext *s, arg_2misc *a)
3219 {
3220     static ZipFn * const fn[2][4] = {
3221         {
3222             gen_helper_neon_zip8,
3223             gen_helper_neon_zip16,
3224             NULL,
3225             NULL,
3226         }, {
3227             gen_helper_neon_qzip8,
3228             gen_helper_neon_qzip16,
3229             gen_helper_neon_qzip32,
3230             NULL,
3231         }
3232     };
3233     return do_zip_uzp(s, a, fn[a->q][a->size]);
3234 }
3235
3236 static bool do_vmovn(DisasContext *s, arg_2misc *a,
3237                      NeonGenNarrowEnvFn *narrowfn)
3238 {
3239     TCGv_i64 rm;
3240     TCGv_i32 rd0, rd1;
3241
3242     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3243         return false;
3244     }
3245
3246     /* UNDEF accesses to D16-D31 if they don't exist. */
3247     if (!dc_isar_feature(aa32_simd_r32, s) &&
3248         ((a->vd | a->vm) & 0x10)) {
3249         return false;
3250     }
3251
3252     if (a->vm & 1) {
3253         return false;
3254     }
3255
3256     if (!narrowfn) {
3257         return false;
3258     }
3259
3260     if (!vfp_access_check(s)) {
3261         return true;
3262     }
3263
3264     rm = tcg_temp_new_i64();
3265     rd0 = tcg_temp_new_i32();
3266     rd1 = tcg_temp_new_i32();
3267
3268     read_neon_element64(rm, a->vm, 0, MO_64);
3269     narrowfn(rd0, cpu_env, rm);
3270     read_neon_element64(rm, a->vm, 1, MO_64);
3271     narrowfn(rd1, cpu_env, rm);
3272     write_neon_element32(rd0, a->vd, 0, MO_32);
3273     write_neon_element32(rd1, a->vd, 1, MO_32);
3274     tcg_temp_free_i32(rd0);
3275     tcg_temp_free_i32(rd1);
3276     tcg_temp_free_i64(rm);
3277     return true;
3278 }
3279
3280 #define DO_VMOVN(INSN, FUNC)                                    \
3281     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3282     {                                                           \
3283         static NeonGenNarrowEnvFn * const narrowfn[] = {        \
3284             FUNC##8,                                            \
3285             FUNC##16,                                           \
3286             FUNC##32,                                           \
3287             NULL,                                               \
3288         };                                                      \
3289         return do_vmovn(s, a, narrowfn[a->size]);               \
3290     }
3291
3292 DO_VMOVN(VMOVN, gen_neon_narrow_u)
3293 DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
3294 DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
3295 DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)
3296
3297 static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
3298 {
3299     TCGv_i32 rm0, rm1;
3300     TCGv_i64 rd;
3301     static NeonGenWidenFn * const widenfns[] = {
3302         gen_helper_neon_widen_u8,
3303         gen_helper_neon_widen_u16,
3304         tcg_gen_extu_i32_i64,
3305         NULL,
3306     };
3307     NeonGenWidenFn *widenfn = widenfns[a->size];
3308
3309     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3310         return false;
3311     }
3312
3313     /* UNDEF accesses to D16-D31 if they don't exist. */
3314     if (!dc_isar_feature(aa32_simd_r32, s) &&
3315         ((a->vd | a->vm) & 0x10)) {
3316         return false;
3317     }
3318
3319     if (a->vd & 1) {
3320         return false;
3321     }
3322
3323     if (!widenfn) {
3324         return false;
3325     }
3326
3327     if (!vfp_access_check(s)) {
3328         return true;
3329     }
3330
3331     rd = tcg_temp_new_i64();
3332     rm0 = tcg_temp_new_i32();
3333     rm1 = tcg_temp_new_i32();
3334
3335     read_neon_element32(rm0, a->vm, 0, MO_32);
3336     read_neon_element32(rm1, a->vm, 1, MO_32);
3337
3338     widenfn(rd, rm0);
3339     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3340     write_neon_element64(rd, a->vd, 0, MO_64);
3341     widenfn(rd, rm1);
3342     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3343     write_neon_element64(rd, a->vd, 1, MO_64);
3344
3345     tcg_temp_free_i64(rd);
3346     tcg_temp_free_i32(rm0);
3347     tcg_temp_free_i32(rm1);
3348     return true;
3349 }
3350
3351 static bool trans_VCVT_B16_F32(DisasContext *s, arg_2misc *a)
3352 {
3353     TCGv_ptr fpst;
3354     TCGv_i64 tmp;
3355     TCGv_i32 dst0, dst1;
3356
3357     if (!dc_isar_feature(aa32_bf16, s)) {
3358         return false;
3359     }
3360
3361     /* UNDEF accesses to D16-D31 if they don't exist. */
3362     if (!dc_isar_feature(aa32_simd_r32, s) &&
3363         ((a->vd | a->vm) & 0x10)) {
3364         return false;
3365     }
3366
3367     if ((a->vm & 1) || (a->size != 1)) {
3368         return false;
3369     }
3370
3371     if (!vfp_access_check(s)) {
3372         return true;
3373     }
3374
3375     fpst = fpstatus_ptr(FPST_STD);
3376     tmp = tcg_temp_new_i64();
3377     dst0 = tcg_temp_new_i32();
3378     dst1 = tcg_temp_new_i32();
3379
3380     read_neon_element64(tmp, a->vm, 0, MO_64);
3381     gen_helper_bfcvt_pair(dst0, tmp, fpst);
3382
3383     read_neon_element64(tmp, a->vm, 1, MO_64);
3384     gen_helper_bfcvt_pair(dst1, tmp, fpst);
3385
3386     write_neon_element32(dst0, a->vd, 0, MO_32);
3387     write_neon_element32(dst1, a->vd, 1, MO_32);
3388
3389     tcg_temp_free_i64(tmp);
3390     tcg_temp_free_i32(dst0);
3391     tcg_temp_free_i32(dst1);
3392     tcg_temp_free_ptr(fpst);
3393     return true;
3394 }
3395
3396 static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
3397 {
3398     TCGv_ptr fpst;
3399     TCGv_i32 ahp, tmp, tmp2, tmp3;
3400
3401     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3402         !dc_isar_feature(aa32_fp16_spconv, s)) {
3403         return false;
3404     }
3405
3406     /* UNDEF accesses to D16-D31 if they don't exist. */
3407     if (!dc_isar_feature(aa32_simd_r32, s) &&
3408         ((a->vd | a->vm) & 0x10)) {
3409         return false;
3410     }
3411
3412     if ((a->vm & 1) || (a->size != 1)) {
3413         return false;
3414     }
3415
3416     if (!vfp_access_check(s)) {
3417         return true;
3418     }
3419
3420     fpst = fpstatus_ptr(FPST_STD);
3421     ahp = get_ahp_flag();
3422     tmp = tcg_temp_new_i32();
3423     read_neon_element32(tmp, a->vm, 0, MO_32);
3424     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3425     tmp2 = tcg_temp_new_i32();
3426     read_neon_element32(tmp2, a->vm, 1, MO_32);
3427     gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
3428     tcg_gen_shli_i32(tmp2, tmp2, 16);
3429     tcg_gen_or_i32(tmp2, tmp2, tmp);
3430     read_neon_element32(tmp, a->vm, 2, MO_32);
3431     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3432     tmp3 = tcg_temp_new_i32();
3433     read_neon_element32(tmp3, a->vm, 3, MO_32);
3434     write_neon_element32(tmp2, a->vd, 0, MO_32);
3435     tcg_temp_free_i32(tmp2);
3436     gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
3437     tcg_gen_shli_i32(tmp3, tmp3, 16);
3438     tcg_gen_or_i32(tmp3, tmp3, tmp);
3439     write_neon_element32(tmp3, a->vd, 1, MO_32);
3440     tcg_temp_free_i32(tmp3);
3441     tcg_temp_free_i32(tmp);
3442     tcg_temp_free_i32(ahp);
3443     tcg_temp_free_ptr(fpst);
3444
3445     return true;
3446 }
3447
3448 static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
3449 {
3450     TCGv_ptr fpst;
3451     TCGv_i32 ahp, tmp, tmp2, tmp3;
3452
3453     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3454         !dc_isar_feature(aa32_fp16_spconv, s)) {
3455         return false;
3456     }
3457
3458     /* UNDEF accesses to D16-D31 if they don't exist. */
3459     if (!dc_isar_feature(aa32_simd_r32, s) &&
3460         ((a->vd | a->vm) & 0x10)) {
3461         return false;
3462     }
3463
3464     if ((a->vd & 1) || (a->size != 1)) {
3465         return false;
3466     }
3467
3468     if (!vfp_access_check(s)) {
3469         return true;
3470     }
3471
3472     fpst = fpstatus_ptr(FPST_STD);
3473     ahp = get_ahp_flag();
3474     tmp3 = tcg_temp_new_i32();
3475     tmp2 = tcg_temp_new_i32();
3476     tmp = tcg_temp_new_i32();
3477     read_neon_element32(tmp, a->vm, 0, MO_32);
3478     read_neon_element32(tmp2, a->vm, 1, MO_32);
3479     tcg_gen_ext16u_i32(tmp3, tmp);
3480     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3481     write_neon_element32(tmp3, a->vd, 0, MO_32);
3482     tcg_gen_shri_i32(tmp, tmp, 16);
3483     gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
3484     write_neon_element32(tmp, a->vd, 1, MO_32);
3485     tcg_temp_free_i32(tmp);
3486     tcg_gen_ext16u_i32(tmp3, tmp2);
3487     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3488     write_neon_element32(tmp3, a->vd, 2, MO_32);
3489     tcg_temp_free_i32(tmp3);
3490     tcg_gen_shri_i32(tmp2, tmp2, 16);
3491     gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
3492     write_neon_element32(tmp2, a->vd, 3, MO_32);
3493     tcg_temp_free_i32(tmp2);
3494     tcg_temp_free_i32(ahp);
3495     tcg_temp_free_ptr(fpst);
3496
3497     return true;
3498 }
3499
3500 static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
3501 {
3502     int vec_size = a->q ? 16 : 8;
3503     int rd_ofs = neon_full_reg_offset(a->vd);
3504     int rm_ofs = neon_full_reg_offset(a->vm);
3505
3506     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3507         return false;
3508     }
3509
3510     /* UNDEF accesses to D16-D31 if they don't exist. */
3511     if (!dc_isar_feature(aa32_simd_r32, s) &&
3512         ((a->vd | a->vm) & 0x10)) {
3513         return false;
3514     }
3515
3516     if (a->size == 3) {
3517         return false;
3518     }
3519
3520     if ((a->vd | a->vm) & a->q) {
3521         return false;
3522     }
3523
3524     if (!vfp_access_check(s)) {
3525         return true;
3526     }
3527
3528     fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);
3529
3530     return true;
3531 }
3532
3533 #define DO_2MISC_VEC(INSN, FN)                                  \
3534     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3535     {                                                           \
3536         return do_2misc_vec(s, a, FN);                          \
3537     }
3538
3539 DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
3540 DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
3541 DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
3542 DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
3543 DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
3544 DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
3545 DO_2MISC_VEC(VCLT0, gen_gvec_clt0)
3546
3547 static bool trans_VMVN(DisasContext *s, arg_2misc *a)
3548 {
3549     if (a->size != 0) {
3550         return false;
3551     }
3552     return do_2misc_vec(s, a, tcg_gen_gvec_not);
3553 }
3554
3555 #define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3556     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3557                          uint32_t rm_ofs, uint32_t oprsz,               \
3558                          uint32_t maxsz)                                \
3559     {                                                                   \
3560         tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz,        \
3561                            DATA, FUNC);                                 \
3562     }
3563
3564 #define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3565     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3566                          uint32_t rm_ofs, uint32_t oprsz,               \
3567                          uint32_t maxsz)                                \
3568     {                                                                   \
3569         tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC);   \
3570     }
3571
3572 WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
3573 WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1)
3574 WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
3575 WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1)
3576 WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
3577 WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
3578 WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)
3579
3580 #define DO_2M_CRYPTO(INSN, FEATURE, SIZE)                       \
3581     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3582     {                                                           \
3583         if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) {  \
3584             return false;                                       \
3585         }                                                       \
3586         return do_2misc_vec(s, a, gen_##INSN);                  \
3587     }
3588
3589 DO_2M_CRYPTO(AESE, aa32_aes, 0)
3590 DO_2M_CRYPTO(AESD, aa32_aes, 0)
3591 DO_2M_CRYPTO(AESMC, aa32_aes, 0)
3592 DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
3593 DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
3594 DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
3595 DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)
3596
3597 static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
3598 {
3599     TCGv_i32 tmp;
3600     int pass;
3601
3602     /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
3603     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3604         return false;
3605     }
3606
3607     /* UNDEF accesses to D16-D31 if they don't exist. */
3608     if (!dc_isar_feature(aa32_simd_r32, s) &&
3609         ((a->vd | a->vm) & 0x10)) {
3610         return false;
3611     }
3612
3613     if (!fn) {
3614         return false;
3615     }
3616
3617     if ((a->vd | a->vm) & a->q) {
3618         return false;
3619     }
3620
3621     if (!vfp_access_check(s)) {
3622         return true;
3623     }
3624
3625     tmp = tcg_temp_new_i32();
3626     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3627         read_neon_element32(tmp, a->vm, pass, MO_32);
3628         fn(tmp, tmp);
3629         write_neon_element32(tmp, a->vd, pass, MO_32);
3630     }
3631     tcg_temp_free_i32(tmp);
3632
3633     return true;
3634 }
3635
3636 static bool trans_VREV32(DisasContext *s, arg_2misc *a)
3637 {
3638     static NeonGenOneOpFn * const fn[] = {
3639         tcg_gen_bswap32_i32,
3640         gen_swap_half,
3641         NULL,
3642         NULL,
3643     };
3644     return do_2misc(s, a, fn[a->size]);
3645 }
3646
3647 static bool trans_VREV16(DisasContext *s, arg_2misc *a)
3648 {
3649     if (a->size != 0) {
3650         return false;
3651     }
3652     return do_2misc(s, a, gen_rev16);
3653 }
3654
3655 static bool trans_VCLS(DisasContext *s, arg_2misc *a)
3656 {
3657     static NeonGenOneOpFn * const fn[] = {
3658         gen_helper_neon_cls_s8,
3659         gen_helper_neon_cls_s16,
3660         gen_helper_neon_cls_s32,
3661         NULL,
3662     };
3663     return do_2misc(s, a, fn[a->size]);
3664 }
3665
3666 static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm)
3667 {
3668     tcg_gen_clzi_i32(rd, rm, 32);
3669 }
3670
3671 static bool trans_VCLZ(DisasContext *s, arg_2misc *a)
3672 {
3673     static NeonGenOneOpFn * const fn[] = {
3674         gen_helper_neon_clz_u8,
3675         gen_helper_neon_clz_u16,
3676         do_VCLZ_32,
3677         NULL,
3678     };
3679     return do_2misc(s, a, fn[a->size]);
3680 }
3681
3682 static bool trans_VCNT(DisasContext *s, arg_2misc *a)
3683 {
3684     if (a->size != 0) {
3685         return false;
3686     }
3687     return do_2misc(s, a, gen_helper_neon_cnt_u8);
3688 }
3689
3690 static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3691                        uint32_t oprsz, uint32_t maxsz)
3692 {
3693     tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs,
3694                       vece == MO_16 ? 0x7fff : 0x7fffffff,
3695                       oprsz, maxsz);
3696 }
3697
3698 static bool trans_VABS_F(DisasContext *s, arg_2misc *a)
3699 {
3700     if (a->size == MO_16) {
3701         if (!dc_isar_feature(aa32_fp16_arith, s)) {
3702             return false;
3703         }
3704     } else if (a->size != MO_32) {
3705         return false;
3706     }
3707     return do_2misc_vec(s, a, gen_VABS_F);
3708 }
3709
3710 static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3711                        uint32_t oprsz, uint32_t maxsz)
3712 {
3713     tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs,
3714                       vece == MO_16 ? 0x8000 : 0x80000000,
3715                       oprsz, maxsz);
3716 }
3717
3718 static bool trans_VNEG_F(DisasContext *s, arg_2misc *a)
3719 {
3720     if (a->size == MO_16) {
3721         if (!dc_isar_feature(aa32_fp16_arith, s)) {
3722             return false;
3723         }
3724     } else if (a->size != MO_32) {
3725         return false;
3726     }
3727     return do_2misc_vec(s, a, gen_VNEG_F);
3728 }
3729
3730 static bool trans_VRECPE(DisasContext *s, arg_2misc *a)
3731 {
3732     if (a->size != 2) {
3733         return false;
3734     }
3735     return do_2misc(s, a, gen_helper_recpe_u32);
3736 }
3737
3738 static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a)
3739 {
3740     if (a->size != 2) {
3741         return false;
3742     }
3743     return do_2misc(s, a, gen_helper_rsqrte_u32);
3744 }
3745
3746 #define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \
3747     static void WRAPNAME(TCGv_i32 d, TCGv_i32 m)        \
3748     {                                                   \
3749         FUNC(d, cpu_env, m);                            \
3750     }
3751
3752 WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8)
3753 WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16)
3754 WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32)
3755 WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8)
3756 WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16)
3757 WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32)
3758
3759 static bool trans_VQABS(DisasContext *s, arg_2misc *a)
3760 {
3761     static NeonGenOneOpFn * const fn[] = {
3762         gen_VQABS_s8,
3763         gen_VQABS_s16,
3764         gen_VQABS_s32,
3765         NULL,
3766     };
3767     return do_2misc(s, a, fn[a->size]);
3768 }
3769
3770 static bool trans_VQNEG(DisasContext *s, arg_2misc *a)
3771 {
3772     static NeonGenOneOpFn * const fn[] = {
3773         gen_VQNEG_s8,
3774         gen_VQNEG_s16,
3775         gen_VQNEG_s32,
3776         NULL,
3777     };
3778     return do_2misc(s, a, fn[a->size]);
3779 }
3780
3781 #define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC)                             \
3782     static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
3783                            uint32_t rm_ofs,                             \
3784                            uint32_t oprsz, uint32_t maxsz)              \
3785     {                                                                   \
3786         static gen_helper_gvec_2_ptr * const fns[4] = {                 \
3787             NULL, HFUNC, SFUNC, NULL,                                   \
3788         };                                                              \
3789         TCGv_ptr fpst;                                                  \
3790         fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD);   \
3791         tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0,       \
3792                            fns[vece]);                                  \
3793         tcg_temp_free_ptr(fpst);                                        \
3794     }                                                                   \
3795     static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
3796     {                                                                   \
3797         if (a->size == MO_16) {                                         \
3798             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
3799                 return false;                                           \
3800             }                                                           \
3801         } else if (a->size != MO_32) {                                  \
3802             return false;                                               \
3803         }                                                               \
3804         return do_2misc_vec(s, a, gen_##INSN);                          \
3805     }
3806
3807 DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s)
3808 DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s)
3809 DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s)
3810 DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s)
3811 DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s)
3812 DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s)
3813 DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s)
3814 DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos)
3815 DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos)
3816 DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs)
3817 DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs)
3818
3819 DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s)
3820
3821 static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
3822 {
3823     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
3824         return false;
3825     }
3826     return trans_VRINTX_impl(s, a);
3827 }
3828
3829 #define DO_VEC_RMODE(INSN, RMODE, OP)                                   \
3830     static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
3831                            uint32_t rm_ofs,                             \
3832                            uint32_t oprsz, uint32_t maxsz)              \
3833     {                                                                   \
3834         static gen_helper_gvec_2_ptr * const fns[4] = {                 \
3835             NULL,                                                       \
3836             gen_helper_gvec_##OP##h,                                    \
3837             gen_helper_gvec_##OP##s,                                    \
3838             NULL,                                                       \
3839         };                                                              \
3840         TCGv_ptr fpst;                                                  \
3841         fpst = fpstatus_ptr(vece == 1 ? FPST_STD_F16 : FPST_STD);       \
3842         tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz,          \
3843                            arm_rmode_to_sf(RMODE), fns[vece]);          \
3844         tcg_temp_free_ptr(fpst);                                        \
3845     }                                                                   \
3846     static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
3847     {                                                                   \
3848         if (!arm_dc_feature(s, ARM_FEATURE_V8)) {                       \
3849             return false;                                               \
3850         }                                                               \
3851         if (a->size == MO_16) {                                         \
3852             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
3853                 return false;                                           \
3854             }                                                           \
3855         } else if (a->size != MO_32) {                                  \
3856             return false;                                               \
3857         }                                                               \
3858         return do_2misc_vec(s, a, gen_##INSN);                          \
3859     }
3860
3861 DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u)
3862 DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s)
3863 DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u)
3864 DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s)
3865 DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u)
3866 DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s)
3867 DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u)
3868 DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s)
3869
3870 DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_)
3871 DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_)
3872 DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_)
3873 DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_)
3874 DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_)
3875
3876 static bool trans_VSWP(DisasContext *s, arg_2misc *a)
3877 {
3878     TCGv_i64 rm, rd;
3879     int pass;
3880
3881     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3882         return false;
3883     }
3884
3885     /* UNDEF accesses to D16-D31 if they don't exist. */
3886     if (!dc_isar_feature(aa32_simd_r32, s) &&
3887         ((a->vd | a->vm) & 0x10)) {
3888         return false;
3889     }
3890
3891     if (a->size != 0) {
3892         return false;
3893     }
3894
3895     if ((a->vd | a->vm) & a->q) {
3896         return false;
3897     }
3898
3899     if (!vfp_access_check(s)) {
3900         return true;
3901     }
3902
3903     rm = tcg_temp_new_i64();
3904     rd = tcg_temp_new_i64();
3905     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3906         read_neon_element64(rm, a->vm, pass, MO_64);
3907         read_neon_element64(rd, a->vd, pass, MO_64);
3908         write_neon_element64(rm, a->vd, pass, MO_64);
3909         write_neon_element64(rd, a->vm, pass, MO_64);
3910     }
3911     tcg_temp_free_i64(rm);
3912     tcg_temp_free_i64(rd);
3913
3914     return true;
3915 }
3916 static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1)
3917 {
3918     TCGv_i32 rd, tmp;
3919
3920     rd = tcg_temp_new_i32();
3921     tmp = tcg_temp_new_i32();
3922
3923     tcg_gen_shli_i32(rd, t0, 8);
3924     tcg_gen_andi_i32(rd, rd, 0xff00ff00);
3925     tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
3926     tcg_gen_or_i32(rd, rd, tmp);
3927
3928     tcg_gen_shri_i32(t1, t1, 8);
3929     tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
3930     tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
3931     tcg_gen_or_i32(t1, t1, tmp);
3932     tcg_gen_mov_i32(t0, rd);
3933
3934     tcg_temp_free_i32(tmp);
3935     tcg_temp_free_i32(rd);
3936 }
3937
3938 static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1)
3939 {
3940     TCGv_i32 rd, tmp;
3941
3942     rd = tcg_temp_new_i32();
3943     tmp = tcg_temp_new_i32();
3944
3945     tcg_gen_shli_i32(rd, t0, 16);
3946     tcg_gen_andi_i32(tmp, t1, 0xffff);
3947     tcg_gen_or_i32(rd, rd, tmp);
3948     tcg_gen_shri_i32(t1, t1, 16);
3949     tcg_gen_andi_i32(tmp, t0, 0xffff0000);
3950     tcg_gen_or_i32(t1, t1, tmp);
3951     tcg_gen_mov_i32(t0, rd);
3952
3953     tcg_temp_free_i32(tmp);
3954     tcg_temp_free_i32(rd);
3955 }
3956
3957 static bool trans_VTRN(DisasContext *s, arg_2misc *a)
3958 {
3959     TCGv_i32 tmp, tmp2;
3960     int pass;
3961
3962     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3963         return false;
3964     }
3965
3966     /* UNDEF accesses to D16-D31 if they don't exist. */
3967     if (!dc_isar_feature(aa32_simd_r32, s) &&
3968         ((a->vd | a->vm) & 0x10)) {
3969         return false;
3970     }
3971
3972     if ((a->vd | a->vm) & a->q) {
3973         return false;
3974     }
3975
3976     if (a->size == 3) {
3977         return false;
3978     }
3979
3980     if (!vfp_access_check(s)) {
3981         return true;
3982     }
3983
3984     tmp = tcg_temp_new_i32();
3985     tmp2 = tcg_temp_new_i32();
3986     if (a->size == MO_32) {
3987         for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) {
3988             read_neon_element32(tmp, a->vm, pass, MO_32);
3989             read_neon_element32(tmp2, a->vd, pass + 1, MO_32);
3990             write_neon_element32(tmp2, a->vm, pass, MO_32);
3991             write_neon_element32(tmp, a->vd, pass + 1, MO_32);
3992         }
3993     } else {
3994         for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3995             read_neon_element32(tmp, a->vm, pass, MO_32);
3996             read_neon_element32(tmp2, a->vd, pass, MO_32);
3997             if (a->size == MO_8) {
3998                 gen_neon_trn_u8(tmp, tmp2);
3999             } else {
4000                 gen_neon_trn_u16(tmp, tmp2);
4001             }
4002             write_neon_element32(tmp2, a->vm, pass, MO_32);
4003             write_neon_element32(tmp, a->vd, pass, MO_32);
4004         }
4005     }
4006     tcg_temp_free_i32(tmp);
4007     tcg_temp_free_i32(tmp2);
4008     return true;
4009 }
4010
4011 static bool trans_VSMMLA(DisasContext *s, arg_VSMMLA *a)
4012 {
4013     if (!dc_isar_feature(aa32_i8mm, s)) {
4014         return false;
4015     }
4016     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4017                         gen_helper_gvec_smmla_b);
4018 }
4019
4020 static bool trans_VUMMLA(DisasContext *s, arg_VUMMLA *a)
4021 {
4022     if (!dc_isar_feature(aa32_i8mm, s)) {
4023         return false;
4024     }
4025     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4026                         gen_helper_gvec_ummla_b);
4027 }
4028
4029 static bool trans_VUSMMLA(DisasContext *s, arg_VUSMMLA *a)
4030 {
4031     if (!dc_isar_feature(aa32_i8mm, s)) {
4032         return false;
4033     }
4034     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4035                         gen_helper_gvec_usmmla_b);
4036 }
4037
4038 static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a)
4039 {
4040     if (!dc_isar_feature(aa32_bf16, s)) {
4041         return false;
4042     }
4043     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4044                         gen_helper_gvec_bfmmla);
4045 }
4046
4047 static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a)
4048 {
4049     if (!dc_isar_feature(aa32_bf16, s)) {
4050         return false;
4051     }
4052     return do_neon_ddda_fpst(s, 7, a->vd, a->vn, a->vm, a->q, FPST_STD,
4053                              gen_helper_gvec_bfmlal);
4054 }
4055
4056 static bool trans_VFMA_b16_scal(DisasContext *s, arg_VFMA_b16_scal *a)
4057 {
4058     if (!dc_isar_feature(aa32_bf16, s)) {
4059         return false;
4060     }
4061     return do_neon_ddda_fpst(s, 6, a->vd, a->vn, a->vm,
4062                              (a->index << 1) | a->q, FPST_STD,
4063                              gen_helper_gvec_bfmlal_idx);
4064 }