target/arm/translate-neon.c

   1 /*
   2  *  ARM translation: AArch32 Neon instructions
   3  *
   4  *  Copyright (c) 2003 Fabrice Bellard
   5  *  Copyright (c) 2005-2007 CodeSourcery
   6  *  Copyright (c) 2007 OpenedHand, Ltd.
   7  *  Copyright (c) 2020 Linaro, Ltd.
   8  *
   9  * This library is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * This library is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  21  */
  22
  23 #include "qemu/osdep.h"
  24 #include "tcg/tcg-op.h"
  25 #include "tcg/tcg-op-gvec.h"
  26 #include "exec/exec-all.h"
  27 #include "exec/gen-icount.h"
  28 #include "translate.h"
  29 #include "translate-a32.h"
  30
  31 static inline int plus1(DisasContext *s, int x)
  32 {
  33     return x + 1;
  34 }
  35
  36 static inline int rsub_64(DisasContext *s, int x)
  37 {
  38     return 64 - x;
  39 }
  40
  41 static inline int rsub_32(DisasContext *s, int x)
  42 {
  43     return 32 - x;
  44 }
  45 static inline int rsub_16(DisasContext *s, int x)
  46 {
  47     return 16 - x;
  48 }
  49 static inline int rsub_8(DisasContext *s, int x)
  50 {
  51     return 8 - x;
  52 }
  53
  54 static inline int neon_3same_fp_size(DisasContext *s, int x)
  55 {
  56     /* Convert 0==fp32, 1==fp16 into a MO_* value */
  57     return MO_32 - x;
  58 }
  59
  60 /* Include the generated Neon decoder */
  61 #include "decode-neon-dp.c.inc"
  62 #include "decode-neon-ls.c.inc"
  63 #include "decode-neon-shared.c.inc"
  64
  65 static TCGv_ptr vfp_reg_ptr(bool dp, int reg)
  66 {
  67     TCGv_ptr ret = tcg_temp_new_ptr();
  68     tcg_gen_addi_ptr(ret, cpu_env, vfp_reg_offset(dp, reg));
  69     return ret;
  70 }
  71
  72 static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
  73 {
  74     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
  75
  76     switch (mop) {
  77     case MO_UB:
  78         tcg_gen_ld8u_i32(var, cpu_env, offset);
  79         break;
  80     case MO_UW:
  81         tcg_gen_ld16u_i32(var, cpu_env, offset);
  82         break;
  83     case MO_UL:
  84         tcg_gen_ld_i32(var, cpu_env, offset);
  85         break;
  86     default:
  87         g_assert_not_reached();
  88     }
  89 }
  90
  91 static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
  92 {
  93     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
  94
  95     switch (mop) {
  96     case MO_UB:
  97         tcg_gen_ld8u_i64(var, cpu_env, offset);
  98         break;
  99     case MO_UW:
 100         tcg_gen_ld16u_i64(var, cpu_env, offset);
 101         break;
 102     case MO_UL:
 103         tcg_gen_ld32u_i64(var, cpu_env, offset);
 104         break;
 105     case MO_Q:
 106         tcg_gen_ld_i64(var, cpu_env, offset);
 107         break;
 108     default:
 109         g_assert_not_reached();
 110     }
 111 }
 112
 113 static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
 114 {
 115     long offset = neon_element_offset(reg, ele, size);
 116
 117     switch (size) {
 118     case MO_8:
 119         tcg_gen_st8_i32(var, cpu_env, offset);
 120         break;
 121     case MO_16:
 122         tcg_gen_st16_i32(var, cpu_env, offset);
 123         break;
 124     case MO_32:
 125         tcg_gen_st_i32(var, cpu_env, offset);
 126         break;
 127     default:
 128         g_assert_not_reached();
 129     }
 130 }
 131
 132 static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
 133 {
 134     long offset = neon_element_offset(reg, ele, size);
 135
 136     switch (size) {
 137     case MO_8:
 138         tcg_gen_st8_i64(var, cpu_env, offset);
 139         break;
 140     case MO_16:
 141         tcg_gen_st16_i64(var, cpu_env, offset);
 142         break;
 143     case MO_32:
 144         tcg_gen_st32_i64(var, cpu_env, offset);
 145         break;
 146     case MO_64:
 147         tcg_gen_st_i64(var, cpu_env, offset);
 148         break;
 149     default:
 150         g_assert_not_reached();
 151     }
 152 }
 153
 154 static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
 155 {
 156     int opr_sz;
 157     TCGv_ptr fpst;
 158     gen_helper_gvec_4_ptr *fn_gvec_ptr;
 159
 160     if (!dc_isar_feature(aa32_vcma, s)
 161         || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
 162         return false;
 163     }
 164
 165     /* UNDEF accesses to D16-D31 if they don't exist. */
 166     if (!dc_isar_feature(aa32_simd_r32, s) &&
 167         ((a->vd | a->vn | a->vm) & 0x10)) {
 168         return false;
 169     }
 170
 171     if ((a->vn | a->vm | a->vd) & a->q) {
 172         return false;
 173     }
 174
 175     if (!vfp_access_check(s)) {
 176         return true;
 177     }
 178
 179     opr_sz = (1 + a->q) * 8;
 180     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
 181     fn_gvec_ptr = (a->size == MO_16) ?
 182         gen_helper_gvec_fcmlah : gen_helper_gvec_fcmlas;
 183     tcg_gen_gvec_4_ptr(vfp_reg_offset(1, a->vd),
 184                        vfp_reg_offset(1, a->vn),
 185                        vfp_reg_offset(1, a->vm),
 186                        vfp_reg_offset(1, a->vd),
 187                        fpst, opr_sz, opr_sz, a->rot,
 188                        fn_gvec_ptr);
 189     tcg_temp_free_ptr(fpst);
 190     return true;
 191 }
 192
 193 static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
 194 {
 195     int opr_sz;
 196     TCGv_ptr fpst;
 197     gen_helper_gvec_3_ptr *fn_gvec_ptr;
 198
 199     if (!dc_isar_feature(aa32_vcma, s)
 200         || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
 201         return false;
 202     }
 203
 204     /* UNDEF accesses to D16-D31 if they don't exist. */
 205     if (!dc_isar_feature(aa32_simd_r32, s) &&
 206         ((a->vd | a->vn | a->vm) & 0x10)) {
 207         return false;
 208     }
 209
 210     if ((a->vn | a->vm | a->vd) & a->q) {
 211         return false;
 212     }
 213
 214     if (!vfp_access_check(s)) {
 215         return true;
 216     }
 217
 218     opr_sz = (1 + a->q) * 8;
 219     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
 220     fn_gvec_ptr = (a->size == MO_16) ?
 221         gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds;
 222     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 223                        vfp_reg_offset(1, a->vn),
 224                        vfp_reg_offset(1, a->vm),
 225                        fpst, opr_sz, opr_sz, a->rot,
 226                        fn_gvec_ptr);
 227     tcg_temp_free_ptr(fpst);
 228     return true;
 229 }
 230
 231 static bool trans_VDOT(DisasContext *s, arg_VDOT *a)
 232 {
 233     int opr_sz;
 234     gen_helper_gvec_4 *fn_gvec;
 235
 236     if (!dc_isar_feature(aa32_dp, s)) {
 237         return false;
 238     }
 239
 240     /* UNDEF accesses to D16-D31 if they don't exist. */
 241     if (!dc_isar_feature(aa32_simd_r32, s) &&
 242         ((a->vd | a->vn | a->vm) & 0x10)) {
 243         return false;
 244     }
 245
 246     if ((a->vn | a->vm | a->vd) & a->q) {
 247         return false;
 248     }
 249
 250     if (!vfp_access_check(s)) {
 251         return true;
 252     }
 253
 254     opr_sz = (1 + a->q) * 8;
 255     fn_gvec = a->u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b;
 256     tcg_gen_gvec_4_ool(vfp_reg_offset(1, a->vd),
 257                        vfp_reg_offset(1, a->vn),
 258                        vfp_reg_offset(1, a->vm),
 259                        vfp_reg_offset(1, a->vd),
 260                        opr_sz, opr_sz, 0, fn_gvec);
 261     return true;
 262 }
 263
 264 static bool trans_VFML(DisasContext *s, arg_VFML *a)
 265 {
 266     int opr_sz;
 267
 268     if (!dc_isar_feature(aa32_fhm, s)) {
 269         return false;
 270     }
 271
 272     /* UNDEF accesses to D16-D31 if they don't exist. */
 273     if (!dc_isar_feature(aa32_simd_r32, s) &&
 274         (a->vd & 0x10)) {
 275         return false;
 276     }
 277
 278     if (a->vd & a->q) {
 279         return false;
 280     }
 281
 282     if (!vfp_access_check(s)) {
 283         return true;
 284     }
 285
 286     opr_sz = (1 + a->q) * 8;
 287     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 288                        vfp_reg_offset(a->q, a->vn),
 289                        vfp_reg_offset(a->q, a->vm),
 290                        cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
 291                        gen_helper_gvec_fmlal_a32);
 292     return true;
 293 }
 294
 295 static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
 296 {
 297     gen_helper_gvec_4_ptr *fn_gvec_ptr;
 298     int opr_sz;
 299     TCGv_ptr fpst;
 300
 301     if (!dc_isar_feature(aa32_vcma, s)) {
 302         return false;
 303     }
 304     if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) {
 305         return false;
 306     }
 307
 308     /* UNDEF accesses to D16-D31 if they don't exist. */
 309     if (!dc_isar_feature(aa32_simd_r32, s) &&
 310         ((a->vd | a->vn | a->vm) & 0x10)) {
 311         return false;
 312     }
 313
 314     if ((a->vd | a->vn) & a->q) {
 315         return false;
 316     }
 317
 318     if (!vfp_access_check(s)) {
 319         return true;
 320     }
 321
 322     fn_gvec_ptr = (a->size == MO_16) ?
 323         gen_helper_gvec_fcmlah_idx : gen_helper_gvec_fcmlas_idx;
 324     opr_sz = (1 + a->q) * 8;
 325     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
 326     tcg_gen_gvec_4_ptr(vfp_reg_offset(1, a->vd),
 327                        vfp_reg_offset(1, a->vn),
 328                        vfp_reg_offset(1, a->vm),
 329                        vfp_reg_offset(1, a->vd),
 330                        fpst, opr_sz, opr_sz,
 331                        (a->index << 2) | a->rot, fn_gvec_ptr);
 332     tcg_temp_free_ptr(fpst);
 333     return true;
 334 }
 335
 336 static bool trans_VDOT_scalar(DisasContext *s, arg_VDOT_scalar *a)
 337 {
 338     gen_helper_gvec_4 *fn_gvec;
 339     int opr_sz;
 340     TCGv_ptr fpst;
 341
 342     if (!dc_isar_feature(aa32_dp, s)) {
 343         return false;
 344     }
 345
 346     /* UNDEF accesses to D16-D31 if they don't exist. */
 347     if (!dc_isar_feature(aa32_simd_r32, s) &&
 348         ((a->vd | a->vn) & 0x10)) {
 349         return false;
 350     }
 351
 352     if ((a->vd | a->vn) & a->q) {
 353         return false;
 354     }
 355
 356     if (!vfp_access_check(s)) {
 357         return true;
 358     }
 359
 360     fn_gvec = a->u ? gen_helper_gvec_udot_idx_b : gen_helper_gvec_sdot_idx_b;
 361     opr_sz = (1 + a->q) * 8;
 362     fpst = fpstatus_ptr(FPST_STD);
 363     tcg_gen_gvec_4_ool(vfp_reg_offset(1, a->vd),
 364                        vfp_reg_offset(1, a->vn),
 365                        vfp_reg_offset(1, a->rm),
 366                        vfp_reg_offset(1, a->vd),
 367                        opr_sz, opr_sz, a->index, fn_gvec);
 368     tcg_temp_free_ptr(fpst);
 369     return true;
 370 }
 371
 372 static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
 373 {
 374     int opr_sz;
 375
 376     if (!dc_isar_feature(aa32_fhm, s)) {
 377         return false;
 378     }
 379
 380     /* UNDEF accesses to D16-D31 if they don't exist. */
 381     if (!dc_isar_feature(aa32_simd_r32, s) &&
 382         ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
 383         return false;
 384     }
 385
 386     if (a->vd & a->q) {
 387         return false;
 388     }
 389
 390     if (!vfp_access_check(s)) {
 391         return true;
 392     }
 393
 394     opr_sz = (1 + a->q) * 8;
 395     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 396                        vfp_reg_offset(a->q, a->vn),
 397                        vfp_reg_offset(a->q, a->rm),
 398                        cpu_env, opr_sz, opr_sz,
 399                        (a->index << 2) | a->s, /* is_2 == 0 */
 400                        gen_helper_gvec_fmlal_idx_a32);
 401     return true;
 402 }
 403
 404 static struct {
 405     int nregs;
 406     int interleave;
 407     int spacing;
 408 } const neon_ls_element_type[11] = {
 409     {1, 4, 1},
 410     {1, 4, 2},
 411     {4, 1, 1},
 412     {2, 2, 2},
 413     {1, 3, 1},
 414     {1, 3, 2},
 415     {3, 1, 1},
 416     {1, 1, 1},
 417     {1, 2, 1},
 418     {1, 2, 2},
 419     {2, 1, 1}
 420 };
 421
 422 static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
 423                                       int stride)
 424 {
 425     if (rm != 15) {
 426         TCGv_i32 base;
 427
 428         base = load_reg(s, rn);
 429         if (rm == 13) {
 430             tcg_gen_addi_i32(base, base, stride);
 431         } else {
 432             TCGv_i32 index;
 433             index = load_reg(s, rm);
 434             tcg_gen_add_i32(base, base, index);
 435             tcg_temp_free_i32(index);
 436         }
 437         store_reg(s, rn, base);
 438     }
 439 }
 440
 441 static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
 442 {
 443     /* Neon load/store multiple structures */
 444     int nregs, interleave, spacing, reg, n;
 445     MemOp mop, align, endian;
 446     int mmu_idx = get_mem_index(s);
 447     int size = a->size;
 448     TCGv_i64 tmp64;
 449     TCGv_i32 addr, tmp;
 450
 451     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 452         return false;
 453     }
 454
 455     /* UNDEF accesses to D16-D31 if they don't exist */
 456     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 457         return false;
 458     }
 459     if (a->itype > 10) {
 460         return false;
 461     }
 462     /* Catch UNDEF cases for bad values of align field */
 463     switch (a->itype & 0xc) {
 464     case 4:
 465         if (a->align >= 2) {
 466             return false;
 467         }
 468         break;
 469     case 8:
 470         if (a->align == 3) {
 471             return false;
 472         }
 473         break;
 474     default:
 475         break;
 476     }
 477     nregs = neon_ls_element_type[a->itype].nregs;
 478     interleave = neon_ls_element_type[a->itype].interleave;
 479     spacing = neon_ls_element_type[a->itype].spacing;
 480     if (size == 3 && (interleave | spacing) != 1) {
 481         return false;
 482     }
 483
 484     if (!vfp_access_check(s)) {
 485         return true;
 486     }
 487
 488     /* For our purposes, bytes are always little-endian.  */
 489     endian = s->be_data;
 490     if (size == 0) {
 491         endian = MO_LE;
 492     }
 493
 494     /* Enforce alignment requested by the instruction */
 495     if (a->align) {
 496         align = pow2_align(a->align + 2); /* 4 ** a->align */
 497     } else {
 498         align = s->align_mem ? MO_ALIGN : 0;
 499     }
 500
 501     /*
 502      * Consecutive little-endian elements from a single register
 503      * can be promoted to a larger little-endian operation.
 504      */
 505     if (interleave == 1 && endian == MO_LE) {
 506         /* Retain any natural alignment. */
 507         if (align == MO_ALIGN) {
 508             align = pow2_align(size);
 509         }
 510         size = 3;
 511     }
 512
 513     tmp64 = tcg_temp_new_i64();
 514     addr = tcg_temp_new_i32();
 515     tmp = tcg_const_i32(1 << size);
 516     load_reg_var(s, addr, a->rn);
 517
 518     mop = endian | size | align;
 519     for (reg = 0; reg < nregs; reg++) {
 520         for (n = 0; n < 8 >> size; n++) {
 521             int xs;
 522             for (xs = 0; xs < interleave; xs++) {
 523                 int tt = a->vd + reg + spacing * xs;
 524
 525                 if (a->l) {
 526                     gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop);
 527                     neon_store_element64(tt, n, size, tmp64);
 528                 } else {
 529                     neon_load_element64(tmp64, tt, n, size);
 530                     gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop);
 531                 }
 532                 tcg_gen_add_i32(addr, addr, tmp);
 533
 534                 /* Subsequent memory operations inherit alignment */
 535                 mop &= ~MO_AMASK;
 536             }
 537         }
 538     }
 539     tcg_temp_free_i32(addr);
 540     tcg_temp_free_i32(tmp);
 541     tcg_temp_free_i64(tmp64);
 542
 543     gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
 544     return true;
 545 }
 546
 547 static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
 548 {
 549     /* Neon load single structure to all lanes */
 550     int reg, stride, vec_size;
 551     int vd = a->vd;
 552     int size = a->size;
 553     int nregs = a->n + 1;
 554     TCGv_i32 addr, tmp;
 555     MemOp mop, align;
 556
 557     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 558         return false;
 559     }
 560
 561     /* UNDEF accesses to D16-D31 if they don't exist */
 562     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 563         return false;
 564     }
 565
 566     align = 0;
 567     if (size == 3) {
 568         if (nregs != 4 || a->a == 0) {
 569             return false;
 570         }
 571         /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
 572         size = MO_32;
 573         align = MO_ALIGN_16;
 574     } else if (a->a) {
 575         switch (nregs) {
 576         case 1:
 577             if (size == 0) {
 578                 return false;
 579             }
 580             align = MO_ALIGN;
 581             break;
 582         case 2:
 583             align = pow2_align(size + 1);
 584             break;
 585         case 3:
 586             return false;
 587         case 4:
 588             align = pow2_align(size + 2);
 589             break;
 590         default:
 591             g_assert_not_reached();
 592         }
 593     }
 594
 595     if (!vfp_access_check(s)) {
 596         return true;
 597     }
 598
 599     /*
 600      * VLD1 to all lanes: T bit indicates how many Dregs to write.
 601      * VLD2/3/4 to all lanes: T bit indicates register stride.
 602      */
 603     stride = a->t ? 2 : 1;
 604     vec_size = nregs == 1 ? stride * 8 : 8;
 605     mop = size | align;
 606     tmp = tcg_temp_new_i32();
 607     addr = tcg_temp_new_i32();
 608     load_reg_var(s, addr, a->rn);
 609     for (reg = 0; reg < nregs; reg++) {
 610         gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop);
 611         if ((vd & 1) && vec_size == 16) {
 612             /*
 613              * We cannot write 16 bytes at once because the
 614              * destination is unaligned.
 615              */
 616             tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
 617                                  8, 8, tmp);
 618             tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1),
 619                              neon_full_reg_offset(vd), 8, 8);
 620         } else {
 621             tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
 622                                  vec_size, vec_size, tmp);
 623         }
 624         tcg_gen_addi_i32(addr, addr, 1 << size);
 625         vd += stride;
 626
 627         /* Subsequent memory operations inherit alignment */
 628         mop &= ~MO_AMASK;
 629     }
 630     tcg_temp_free_i32(tmp);
 631     tcg_temp_free_i32(addr);
 632
 633     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);
 634
 635     return true;
 636 }
 637
 638 static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
 639 {
 640     /* Neon load/store single structure to one lane */
 641     int reg;
 642     int nregs = a->n + 1;
 643     int vd = a->vd;
 644     TCGv_i32 addr, tmp;
 645     MemOp mop;
 646
 647     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 648         return false;
 649     }
 650
 651     /* UNDEF accesses to D16-D31 if they don't exist */
 652     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 653         return false;
 654     }
 655
 656     /* Catch the UNDEF cases. This is unavoidably a bit messy. */
 657     switch (nregs) {
 658     case 1:
 659         if (((a->align & (1 << a->size)) != 0) ||
 660             (a->size == 2 && (a->align == 1 || a->align == 2))) {
 661             return false;
 662         }
 663         break;
 664     case 3:
 665         if ((a->align & 1) != 0) {
 666             return false;
 667         }
 668         /* fall through */
 669     case 2:
 670         if (a->size == 2 && (a->align & 2) != 0) {
 671             return false;
 672         }
 673         break;
 674     case 4:
 675         if (a->size == 2 && a->align == 3) {
 676             return false;
 677         }
 678         break;
 679     default:
 680         abort();
 681     }
 682     if ((vd + a->stride * (nregs - 1)) > 31) {
 683         /*
 684          * Attempts to write off the end of the register file are
 685          * UNPREDICTABLE; we choose to UNDEF because otherwise we would
 686          * access off the end of the array that holds the register data.
 687          */
 688         return false;
 689     }
 690
 691     if (!vfp_access_check(s)) {
 692         return true;
 693     }
 694
 695     /* Pick up SCTLR settings */
 696     mop = finalize_memop(s, a->size);
 697
 698     if (a->align) {
 699         MemOp align_op;
 700
 701         switch (nregs) {
 702         case 1:
 703             /* For VLD1, use natural alignment. */
 704             align_op = MO_ALIGN;
 705             break;
 706         case 2:
 707             /* For VLD2, use double alignment. */
 708             align_op = pow2_align(a->size + 1);
 709             break;
 710         case 4:
 711             if (a->size == MO_32) {
 712                 /*
 713                  * For VLD4.32, align = 1 is double alignment, align = 2 is
 714                  * quad alignment; align = 3 is rejected above.
 715                  */
 716                 align_op = pow2_align(a->size + a->align);
 717             } else {
 718                 /* For VLD4.8 and VLD.16, we want quad alignment. */
 719                 align_op = pow2_align(a->size + 2);
 720             }
 721             break;
 722         default:
 723             /* For VLD3, the alignment field is zero and rejected above. */
 724             g_assert_not_reached();
 725         }
 726
 727         mop = (mop & ~MO_AMASK) | align_op;
 728     }
 729
 730     tmp = tcg_temp_new_i32();
 731     addr = tcg_temp_new_i32();
 732     load_reg_var(s, addr, a->rn);
 733
 734     for (reg = 0; reg < nregs; reg++) {
 735         if (a->l) {
 736             gen_aa32_ld_internal_i32(s, tmp, addr, get_mem_index(s), mop);
 737             neon_store_element(vd, a->reg_idx, a->size, tmp);
 738         } else { /* Store */
 739             neon_load_element(tmp, vd, a->reg_idx, a->size);
 740             gen_aa32_st_internal_i32(s, tmp, addr, get_mem_index(s), mop);
 741         }
 742         vd += a->stride;
 743         tcg_gen_addi_i32(addr, addr, 1 << a->size);
 744
 745         /* Subsequent memory operations inherit alignment */
 746         mop &= ~MO_AMASK;
 747     }
 748     tcg_temp_free_i32(addr);
 749     tcg_temp_free_i32(tmp);
 750
 751     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);
 752
 753     return true;
 754 }
 755
 756 static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
 757 {
 758     int vec_size = a->q ? 16 : 8;
 759     int rd_ofs = neon_full_reg_offset(a->vd);
 760     int rn_ofs = neon_full_reg_offset(a->vn);
 761     int rm_ofs = neon_full_reg_offset(a->vm);
 762
 763     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 764         return false;
 765     }
 766
 767     /* UNDEF accesses to D16-D31 if they don't exist. */
 768     if (!dc_isar_feature(aa32_simd_r32, s) &&
 769         ((a->vd | a->vn | a->vm) & 0x10)) {
 770         return false;
 771     }
 772
 773     if ((a->vn | a->vm | a->vd) & a->q) {
 774         return false;
 775     }
 776
 777     if (!vfp_access_check(s)) {
 778         return true;
 779     }
 780
 781     fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
 782     return true;
 783 }
 784
 785 #define DO_3SAME(INSN, FUNC)                                            \
 786     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 787     {                                                                   \
 788         return do_3same(s, a, FUNC);                                    \
 789     }
 790
 791 DO_3SAME(VADD, tcg_gen_gvec_add)
 792 DO_3SAME(VSUB, tcg_gen_gvec_sub)
 793 DO_3SAME(VAND, tcg_gen_gvec_and)
 794 DO_3SAME(VBIC, tcg_gen_gvec_andc)
 795 DO_3SAME(VORR, tcg_gen_gvec_or)
 796 DO_3SAME(VORN, tcg_gen_gvec_orc)
 797 DO_3SAME(VEOR, tcg_gen_gvec_xor)
 798 DO_3SAME(VSHL_S, gen_gvec_sshl)
 799 DO_3SAME(VSHL_U, gen_gvec_ushl)
 800 DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
 801 DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
 802 DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
 803 DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)
 804
 805 /* These insns are all gvec_bitsel but with the inputs in various orders. */
 806 #define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
 807     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 808                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 809                                 uint32_t oprsz, uint32_t maxsz)         \
 810     {                                                                   \
 811         tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
 812     }                                                                   \
 813     DO_3SAME(INSN, gen_##INSN##_3s)
 814
 815 DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
 816 DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
 817 DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
 818
 819 #define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
 820     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 821     {                                                                   \
 822         if (a->size == 3) {                                             \
 823             return false;                                               \
 824         }                                                               \
 825         return do_3same(s, a, FUNC);                                    \
 826     }
 827
 828 DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
 829 DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
 830 DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
 831 DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
 832 DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
 833 DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
 834 DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
 835 DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
 836 DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
 837 DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
 838 DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
 839 DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)
 840
 841 #define DO_3SAME_CMP(INSN, COND)                                        \
 842     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 843                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 844                                 uint32_t oprsz, uint32_t maxsz)         \
 845     {                                                                   \
 846         tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
 847     }                                                                   \
 848     DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)
 849
 850 DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
 851 DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
 852 DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
 853 DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
 854 DO_3SAME_CMP(VCEQ, TCG_COND_EQ)
 855
 856 #define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
 857     static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
 858                          uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
 859     {                                                                      \
 860         tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
 861     }
 862
 863 WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)
 864
 865 static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
 866 {
 867     if (a->size != 0) {
 868         return false;
 869     }
 870     return do_3same(s, a, gen_VMUL_p_3s);
 871 }
 872
 873 #define DO_VQRDMLAH(INSN, FUNC)                                         \
 874     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 875     {                                                                   \
 876         if (!dc_isar_feature(aa32_rdm, s)) {                            \
 877             return false;                                               \
 878         }                                                               \
 879         if (a->size != 1 && a->size != 2) {                             \
 880             return false;                                               \
 881         }                                                               \
 882         return do_3same(s, a, FUNC);                                    \
 883     }
 884
 885 DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
 886 DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)
 887
 888 #define DO_SHA1(NAME, FUNC)                                             \
 889     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
 890     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
 891     {                                                                   \
 892         if (!dc_isar_feature(aa32_sha1, s)) {                           \
 893             return false;                                               \
 894         }                                                               \
 895         return do_3same(s, a, gen_##NAME##_3s);                         \
 896     }
 897
 898 DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
 899 DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
 900 DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
 901 DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)
 902
 903 #define DO_SHA2(NAME, FUNC)                                             \
 904     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
 905     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
 906     {                                                                   \
 907         if (!dc_isar_feature(aa32_sha2, s)) {                           \
 908             return false;                                               \
 909         }                                                               \
 910         return do_3same(s, a, gen_##NAME##_3s);                         \
 911     }
 912
 913 DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
 914 DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
 915 DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)
 916
 917 #define DO_3SAME_64(INSN, FUNC)                                         \
 918     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 919                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 920                                 uint32_t oprsz, uint32_t maxsz)         \
 921     {                                                                   \
 922         static const GVecGen3 op = { .fni8 = FUNC };                    \
 923         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op);      \
 924     }                                                                   \
 925     DO_3SAME(INSN, gen_##INSN##_3s)
 926
 927 #define DO_3SAME_64_ENV(INSN, FUNC)                                     \
 928     static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)    \
 929     {                                                                   \
 930         FUNC(d, cpu_env, n, m);                                         \
 931     }                                                                   \
 932     DO_3SAME_64(INSN, gen_##INSN##_elt)
 933
 934 DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
 935 DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
 936 DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
 937 DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
 938 DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
 939 DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)
 940
 941 #define DO_3SAME_32(INSN, FUNC)                                         \
 942     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 943                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 944                                 uint32_t oprsz, uint32_t maxsz)         \
 945     {                                                                   \
 946         static const GVecGen3 ops[4] = {                                \
 947             { .fni4 = gen_helper_neon_##FUNC##8 },                      \
 948             { .fni4 = gen_helper_neon_##FUNC##16 },                     \
 949             { .fni4 = gen_helper_neon_##FUNC##32 },                     \
 950             { 0 },                                                      \
 951         };                                                              \
 952         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
 953     }                                                                   \
 954     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 955     {                                                                   \
 956         if (a->size > 2) {                                              \
 957             return false;                                               \
 958         }                                                               \
 959         return do_3same(s, a, gen_##INSN##_3s);                         \
 960     }
 961
 962 /*
 963  * Some helper functions need to be passed the cpu_env. In order
 964  * to use those with the gvec APIs like tcg_gen_gvec_3() we need
 965  * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
 966  * and which call a NeonGenTwoOpEnvFn().
 967  */
 968 #define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
 969     static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)            \
 970     {                                                                   \
 971         FUNC(d, cpu_env, n, m);                                         \
 972     }
 973
 974 #define DO_3SAME_32_ENV(INSN, FUNC)                                     \
 975     WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8);        \
 976     WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16);      \
 977     WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32);      \
 978     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 979                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 980                                 uint32_t oprsz, uint32_t maxsz)         \
 981     {                                                                   \
 982         static const GVecGen3 ops[4] = {                                \
 983             { .fni4 = gen_##INSN##_tramp8 },                            \
 984             { .fni4 = gen_##INSN##_tramp16 },                           \
 985             { .fni4 = gen_##INSN##_tramp32 },                           \
 986             { 0 },                                                      \
 987         };                                                              \
 988         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
 989     }                                                                   \
 990     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 991     {                                                                   \
 992         if (a->size > 2) {                                              \
 993             return false;                                               \
 994         }                                                               \
 995         return do_3same(s, a, gen_##INSN##_3s);                         \
 996     }
 997
 998 DO_3SAME_32(VHADD_S, hadd_s)
 999 DO_3SAME_32(VHADD_U, hadd_u)
1000 DO_3SAME_32(VHSUB_S, hsub_s)
1001 DO_3SAME_32(VHSUB_U, hsub_u)
1002 DO_3SAME_32(VRHADD_S, rhadd_s)
1003 DO_3SAME_32(VRHADD_U, rhadd_u)
1004 DO_3SAME_32(VRSHL_S, rshl_s)
1005 DO_3SAME_32(VRSHL_U, rshl_u)
1006
1007 DO_3SAME_32_ENV(VQSHL_S, qshl_s)
1008 DO_3SAME_32_ENV(VQSHL_U, qshl_u)
1009 DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
1010 DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)
1011
1012 static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
1013 {
1014     /* Operations handled pairwise 32 bits at a time */
1015     TCGv_i32 tmp, tmp2, tmp3;
1016
1017     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1018         return false;
1019     }
1020
1021     /* UNDEF accesses to D16-D31 if they don't exist. */
1022     if (!dc_isar_feature(aa32_simd_r32, s) &&
1023         ((a->vd | a->vn | a->vm) & 0x10)) {
1024         return false;
1025     }
1026
1027     if (a->size == 3) {
1028         return false;
1029     }
1030
1031     if (!vfp_access_check(s)) {
1032         return true;
1033     }
1034
1035     assert(a->q == 0); /* enforced by decode patterns */
1036
1037     /*
1038      * Note that we have to be careful not to clobber the source operands
1039      * in the "vm == vd" case by storing the result of the first pass too
1040      * early. Since Q is 0 there are always just two passes, so instead
1041      * of a complicated loop over each pass we just unroll.
1042      */
1043     tmp = tcg_temp_new_i32();
1044     tmp2 = tcg_temp_new_i32();
1045     tmp3 = tcg_temp_new_i32();
1046
1047     read_neon_element32(tmp, a->vn, 0, MO_32);
1048     read_neon_element32(tmp2, a->vn, 1, MO_32);
1049     fn(tmp, tmp, tmp2);
1050
1051     read_neon_element32(tmp3, a->vm, 0, MO_32);
1052     read_neon_element32(tmp2, a->vm, 1, MO_32);
1053     fn(tmp3, tmp3, tmp2);
1054
1055     write_neon_element32(tmp, a->vd, 0, MO_32);
1056     write_neon_element32(tmp3, a->vd, 1, MO_32);
1057
1058     tcg_temp_free_i32(tmp);
1059     tcg_temp_free_i32(tmp2);
1060     tcg_temp_free_i32(tmp3);
1061     return true;
1062 }
1063
1064 #define DO_3SAME_PAIR(INSN, func)                                       \
1065     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
1066     {                                                                   \
1067         static NeonGenTwoOpFn * const fns[] = {                         \
1068             gen_helper_neon_##func##8,                                  \
1069             gen_helper_neon_##func##16,                                 \
1070             gen_helper_neon_##func##32,                                 \
1071         };                                                              \
1072         if (a->size > 2) {                                              \
1073             return false;                                               \
1074         }                                                               \
1075         return do_3same_pair(s, a, fns[a->size]);                       \
1076     }
1077
1078 /* 32-bit pairwise ops end up the same as the elementwise versions.  */
1079 #define gen_helper_neon_pmax_s32  tcg_gen_smax_i32
1080 #define gen_helper_neon_pmax_u32  tcg_gen_umax_i32
1081 #define gen_helper_neon_pmin_s32  tcg_gen_smin_i32
1082 #define gen_helper_neon_pmin_u32  tcg_gen_umin_i32
1083 #define gen_helper_neon_padd_u32  tcg_gen_add_i32
1084
1085 DO_3SAME_PAIR(VPMAX_S, pmax_s)
1086 DO_3SAME_PAIR(VPMIN_S, pmin_s)
1087 DO_3SAME_PAIR(VPMAX_U, pmax_u)
1088 DO_3SAME_PAIR(VPMIN_U, pmin_u)
1089 DO_3SAME_PAIR(VPADD, padd_u)
1090
1091 #define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
1092     WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16);    \
1093     WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32);    \
1094     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
1095                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
1096                                 uint32_t oprsz, uint32_t maxsz)         \
1097     {                                                                   \
1098         static const GVecGen3 ops[2] = {                                \
1099             { .fni4 = gen_##INSN##_tramp16 },                           \
1100             { .fni4 = gen_##INSN##_tramp32 },                           \
1101         };                                                              \
1102         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
1103     }                                                                   \
1104     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
1105     {                                                                   \
1106         if (a->size != 1 && a->size != 2) {                             \
1107             return false;                                               \
1108         }                                                               \
1109         return do_3same(s, a, gen_##INSN##_3s);                         \
1110     }
1111
1112 DO_3SAME_VQDMULH(VQDMULH, qdmulh)
1113 DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)
1114
1115 #define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC)                              \
1116     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
1117                          uint32_t rn_ofs, uint32_t rm_ofs,              \
1118                          uint32_t oprsz, uint32_t maxsz)                \
1119     {                                                                   \
1120         TCGv_ptr fpst = fpstatus_ptr(FPST);                             \
1121         tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst,                \
1122                            oprsz, maxsz, 0, FUNC);                      \
1123         tcg_temp_free_ptr(fpst);                                        \
1124     }
1125
1126 #define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC)                                 \
1127     WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC)                 \
1128     WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC)             \
1129     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
1130     {                                                                   \
1131         if (a->size == MO_16) {                                         \
1132             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
1133                 return false;                                           \
1134             }                                                           \
1135             return do_3same(s, a, gen_##INSN##_fp16_3s);                \
1136         }                                                               \
1137         return do_3same(s, a, gen_##INSN##_fp32_3s);                    \
1138     }
1139
1140
1141 DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
1142 DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
1143 DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
1144 DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
1145 DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
1146 DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
1147 DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
1148 DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
1149 DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
1150 DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
1151 DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
1152 DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
1153 DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
1154 DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
1155 DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
1156 DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
1157 DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)
1158
1159 WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
1160 WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
1161 WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
1162 WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)
1163
1164 static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
1165 {
1166     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1167         return false;
1168     }
1169
1170     if (a->size == MO_16) {
1171         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1172             return false;
1173         }
1174         return do_3same(s, a, gen_VMAXNM_fp16_3s);
1175     }
1176     return do_3same(s, a, gen_VMAXNM_fp32_3s);
1177 }
1178
1179 static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
1180 {
1181     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1182         return false;
1183     }
1184
1185     if (a->size == MO_16) {
1186         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1187             return false;
1188         }
1189         return do_3same(s, a, gen_VMINNM_fp16_3s);
1190     }
1191     return do_3same(s, a, gen_VMINNM_fp32_3s);
1192 }
1193
1194 static bool do_3same_fp_pair(DisasContext *s, arg_3same *a,
1195                              gen_helper_gvec_3_ptr *fn)
1196 {
1197     /* FP pairwise operations */
1198     TCGv_ptr fpstatus;
1199
1200     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1201         return false;
1202     }
1203
1204     /* UNDEF accesses to D16-D31 if they don't exist. */
1205     if (!dc_isar_feature(aa32_simd_r32, s) &&
1206         ((a->vd | a->vn | a->vm) & 0x10)) {
1207         return false;
1208     }
1209
1210     if (!vfp_access_check(s)) {
1211         return true;
1212     }
1213
1214     assert(a->q == 0); /* enforced by decode patterns */
1215
1216
1217     fpstatus = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1218     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
1219                        vfp_reg_offset(1, a->vn),
1220                        vfp_reg_offset(1, a->vm),
1221                        fpstatus, 8, 8, 0, fn);
1222     tcg_temp_free_ptr(fpstatus);
1223
1224     return true;
1225 }
1226
1227 /*
1228  * For all the functions using this macro, size == 1 means fp16,
1229  * which is an architecture extension we don't implement yet.
1230  */
1231 #define DO_3S_FP_PAIR(INSN,FUNC)                                    \
1232     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
1233     {                                                               \
1234         if (a->size == MO_16) {                                     \
1235             if (!dc_isar_feature(aa32_fp16_arith, s)) {             \
1236                 return false;                                       \
1237             }                                                       \
1238             return do_3same_fp_pair(s, a, FUNC##h);                 \
1239         }                                                           \
1240         return do_3same_fp_pair(s, a, FUNC##s);                     \
1241     }
1242
1243 DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd)
1244 DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax)
1245 DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin)
1246
1247 static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
1248 {
1249     /* Handle a 2-reg-shift insn which can be vectorized. */
1250     int vec_size = a->q ? 16 : 8;
1251     int rd_ofs = neon_full_reg_offset(a->vd);
1252     int rm_ofs = neon_full_reg_offset(a->vm);
1253
1254     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1255         return false;
1256     }
1257
1258     /* UNDEF accesses to D16-D31 if they don't exist. */
1259     if (!dc_isar_feature(aa32_simd_r32, s) &&
1260         ((a->vd | a->vm) & 0x10)) {
1261         return false;
1262     }
1263
1264     if ((a->vm | a->vd) & a->q) {
1265         return false;
1266     }
1267
1268     if (!vfp_access_check(s)) {
1269         return true;
1270     }
1271
1272     fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
1273     return true;
1274 }
1275
1276 #define DO_2SH(INSN, FUNC)                                              \
1277     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1278     {                                                                   \
1279         return do_vector_2sh(s, a, FUNC);                               \
1280     }                                                                   \
1281
1282 DO_2SH(VSHL, tcg_gen_gvec_shli)
1283 DO_2SH(VSLI, gen_gvec_sli)
1284 DO_2SH(VSRI, gen_gvec_sri)
1285 DO_2SH(VSRA_S, gen_gvec_ssra)
1286 DO_2SH(VSRA_U, gen_gvec_usra)
1287 DO_2SH(VRSHR_S, gen_gvec_srshr)
1288 DO_2SH(VRSHR_U, gen_gvec_urshr)
1289 DO_2SH(VRSRA_S, gen_gvec_srsra)
1290 DO_2SH(VRSRA_U, gen_gvec_ursra)
1291
1292 static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
1293 {
1294     /* Signed shift out of range results in all-sign-bits */
1295     a->shift = MIN(a->shift, (8 << a->size) - 1);
1296     return do_vector_2sh(s, a, tcg_gen_gvec_sari);
1297 }
1298
1299 static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
1300                             int64_t shift, uint32_t oprsz, uint32_t maxsz)
1301 {
1302     tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
1303 }
1304
1305 static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
1306 {
1307     /* Shift out of range is architecturally valid and results in zero. */
1308     if (a->shift >= (8 << a->size)) {
1309         return do_vector_2sh(s, a, gen_zero_rd_2sh);
1310     } else {
1311         return do_vector_2sh(s, a, tcg_gen_gvec_shri);
1312     }
1313 }
1314
1315 static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
1316                              NeonGenTwo64OpEnvFn *fn)
1317 {
1318     /*
1319      * 2-reg-and-shift operations, size == 3 case, where the
1320      * function needs to be passed cpu_env.
1321      */
1322     TCGv_i64 constimm;
1323     int pass;
1324
1325     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1326         return false;
1327     }
1328
1329     /* UNDEF accesses to D16-D31 if they don't exist. */
1330     if (!dc_isar_feature(aa32_simd_r32, s) &&
1331         ((a->vd | a->vm) & 0x10)) {
1332         return false;
1333     }
1334
1335     if ((a->vm | a->vd) & a->q) {
1336         return false;
1337     }
1338
1339     if (!vfp_access_check(s)) {
1340         return true;
1341     }
1342
1343     /*
1344      * To avoid excessive duplication of ops we implement shift
1345      * by immediate using the variable shift operations.
1346      */
1347     constimm = tcg_const_i64(dup_const(a->size, a->shift));
1348
1349     for (pass = 0; pass < a->q + 1; pass++) {
1350         TCGv_i64 tmp = tcg_temp_new_i64();
1351
1352         read_neon_element64(tmp, a->vm, pass, MO_64);
1353         fn(tmp, cpu_env, tmp, constimm);
1354         write_neon_element64(tmp, a->vd, pass, MO_64);
1355         tcg_temp_free_i64(tmp);
1356     }
1357     tcg_temp_free_i64(constimm);
1358     return true;
1359 }
1360
1361 static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
1362                              NeonGenTwoOpEnvFn *fn)
1363 {
1364     /*
1365      * 2-reg-and-shift operations, size < 3 case, where the
1366      * helper needs to be passed cpu_env.
1367      */
1368     TCGv_i32 constimm, tmp;
1369     int pass;
1370
1371     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1372         return false;
1373     }
1374
1375     /* UNDEF accesses to D16-D31 if they don't exist. */
1376     if (!dc_isar_feature(aa32_simd_r32, s) &&
1377         ((a->vd | a->vm) & 0x10)) {
1378         return false;
1379     }
1380
1381     if ((a->vm | a->vd) & a->q) {
1382         return false;
1383     }
1384
1385     if (!vfp_access_check(s)) {
1386         return true;
1387     }
1388
1389     /*
1390      * To avoid excessive duplication of ops we implement shift
1391      * by immediate using the variable shift operations.
1392      */
1393     constimm = tcg_const_i32(dup_const(a->size, a->shift));
1394     tmp = tcg_temp_new_i32();
1395
1396     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1397         read_neon_element32(tmp, a->vm, pass, MO_32);
1398         fn(tmp, cpu_env, tmp, constimm);
1399         write_neon_element32(tmp, a->vd, pass, MO_32);
1400     }
1401     tcg_temp_free_i32(tmp);
1402     tcg_temp_free_i32(constimm);
1403     return true;
1404 }
1405
1406 #define DO_2SHIFT_ENV(INSN, FUNC)                                       \
1407     static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
1408     {                                                                   \
1409         return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
1410     }                                                                   \
1411     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1412     {                                                                   \
1413         static NeonGenTwoOpEnvFn * const fns[] = {                      \
1414             gen_helper_neon_##FUNC##8,                                  \
1415             gen_helper_neon_##FUNC##16,                                 \
1416             gen_helper_neon_##FUNC##32,                                 \
1417         };                                                              \
1418         assert(a->size < ARRAY_SIZE(fns));                              \
1419         return do_2shift_env_32(s, a, fns[a->size]);                    \
1420     }
1421
1422 DO_2SHIFT_ENV(VQSHLU, qshlu_s)
1423 DO_2SHIFT_ENV(VQSHL_U, qshl_u)
1424 DO_2SHIFT_ENV(VQSHL_S, qshl_s)
1425
1426 static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
1427                                 NeonGenTwo64OpFn *shiftfn,
1428                                 NeonGenNarrowEnvFn *narrowfn)
1429 {
1430     /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
1431     TCGv_i64 constimm, rm1, rm2;
1432     TCGv_i32 rd;
1433
1434     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1435         return false;
1436     }
1437
1438     /* UNDEF accesses to D16-D31 if they don't exist. */
1439     if (!dc_isar_feature(aa32_simd_r32, s) &&
1440         ((a->vd | a->vm) & 0x10)) {
1441         return false;
1442     }
1443
1444     if (a->vm & 1) {
1445         return false;
1446     }
1447
1448     if (!vfp_access_check(s)) {
1449         return true;
1450     }
1451
1452     /*
1453      * This is always a right shift, and the shiftfn is always a
1454      * left-shift helper, which thus needs the negated shift count.
1455      */
1456     constimm = tcg_const_i64(-a->shift);
1457     rm1 = tcg_temp_new_i64();
1458     rm2 = tcg_temp_new_i64();
1459     rd = tcg_temp_new_i32();
1460
1461     /* Load both inputs first to avoid potential overwrite if rm == rd */
1462     read_neon_element64(rm1, a->vm, 0, MO_64);
1463     read_neon_element64(rm2, a->vm, 1, MO_64);
1464
1465     shiftfn(rm1, rm1, constimm);
1466     narrowfn(rd, cpu_env, rm1);
1467     write_neon_element32(rd, a->vd, 0, MO_32);
1468
1469     shiftfn(rm2, rm2, constimm);
1470     narrowfn(rd, cpu_env, rm2);
1471     write_neon_element32(rd, a->vd, 1, MO_32);
1472
1473     tcg_temp_free_i32(rd);
1474     tcg_temp_free_i64(rm1);
1475     tcg_temp_free_i64(rm2);
1476     tcg_temp_free_i64(constimm);
1477
1478     return true;
1479 }
1480
1481 static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
1482                                 NeonGenTwoOpFn *shiftfn,
1483                                 NeonGenNarrowEnvFn *narrowfn)
1484 {
1485     /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
1486     TCGv_i32 constimm, rm1, rm2, rm3, rm4;
1487     TCGv_i64 rtmp;
1488     uint32_t imm;
1489
1490     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1491         return false;
1492     }
1493
1494     /* UNDEF accesses to D16-D31 if they don't exist. */
1495     if (!dc_isar_feature(aa32_simd_r32, s) &&
1496         ((a->vd | a->vm) & 0x10)) {
1497         return false;
1498     }
1499
1500     if (a->vm & 1) {
1501         return false;
1502     }
1503
1504     if (!vfp_access_check(s)) {
1505         return true;
1506     }
1507
1508     /*
1509      * This is always a right shift, and the shiftfn is always a
1510      * left-shift helper, which thus needs the negated shift count
1511      * duplicated into each lane of the immediate value.
1512      */
1513     if (a->size == 1) {
1514         imm = (uint16_t)(-a->shift);
1515         imm |= imm << 16;
1516     } else {
1517         /* size == 2 */
1518         imm = -a->shift;
1519     }
1520     constimm = tcg_const_i32(imm);
1521
1522     /* Load all inputs first to avoid potential overwrite */
1523     rm1 = tcg_temp_new_i32();
1524     rm2 = tcg_temp_new_i32();
1525     rm3 = tcg_temp_new_i32();
1526     rm4 = tcg_temp_new_i32();
1527     read_neon_element32(rm1, a->vm, 0, MO_32);
1528     read_neon_element32(rm2, a->vm, 1, MO_32);
1529     read_neon_element32(rm3, a->vm, 2, MO_32);
1530     read_neon_element32(rm4, a->vm, 3, MO_32);
1531     rtmp = tcg_temp_new_i64();
1532
1533     shiftfn(rm1, rm1, constimm);
1534     shiftfn(rm2, rm2, constimm);
1535
1536     tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
1537     tcg_temp_free_i32(rm2);
1538
1539     narrowfn(rm1, cpu_env, rtmp);
1540     write_neon_element32(rm1, a->vd, 0, MO_32);
1541     tcg_temp_free_i32(rm1);
1542
1543     shiftfn(rm3, rm3, constimm);
1544     shiftfn(rm4, rm4, constimm);
1545     tcg_temp_free_i32(constimm);
1546
1547     tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
1548     tcg_temp_free_i32(rm4);
1549
1550     narrowfn(rm3, cpu_env, rtmp);
1551     tcg_temp_free_i64(rtmp);
1552     write_neon_element32(rm3, a->vd, 1, MO_32);
1553     tcg_temp_free_i32(rm3);
1554     return true;
1555 }
1556
1557 #define DO_2SN_64(INSN, FUNC, NARROWFUNC)                               \
1558     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1559     {                                                                   \
1560         return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC);             \
1561     }
1562 #define DO_2SN_32(INSN, FUNC, NARROWFUNC)                               \
1563     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1564     {                                                                   \
1565         return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);             \
1566     }
1567
1568 static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1569 {
1570     tcg_gen_extrl_i64_i32(dest, src);
1571 }
1572
1573 static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1574 {
1575     gen_helper_neon_narrow_u16(dest, src);
1576 }
1577
1578 static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1579 {
1580     gen_helper_neon_narrow_u8(dest, src);
1581 }
1582
1583 DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
1584 DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
1585 DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)
1586
1587 DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
1588 DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
1589 DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)
1590
1591 DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
1592 DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
1593 DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)
1594
1595 DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
1596 DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
1597 DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
1598 DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
1599 DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
1600 DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)
1601
1602 DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
1603 DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
1604 DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)
1605
1606 DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
1607 DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
1608 DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)
1609
1610 DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
1611 DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
1612 DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1613
1614 static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
1615                          NeonGenWidenFn *widenfn, bool u)
1616 {
1617     TCGv_i64 tmp;
1618     TCGv_i32 rm0, rm1;
1619     uint64_t widen_mask = 0;
1620
1621     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1622         return false;
1623     }
1624
1625     /* UNDEF accesses to D16-D31 if they don't exist. */
1626     if (!dc_isar_feature(aa32_simd_r32, s) &&
1627         ((a->vd | a->vm) & 0x10)) {
1628         return false;
1629     }
1630
1631     if (a->vd & 1) {
1632         return false;
1633     }
1634
1635     if (!vfp_access_check(s)) {
1636         return true;
1637     }
1638
1639     /*
1640      * This is a widen-and-shift operation. The shift is always less
1641      * than the width of the source type, so after widening the input
1642      * vector we can simply shift the whole 64-bit widened register,
1643      * and then clear the potential overflow bits resulting from left
1644      * bits of the narrow input appearing as right bits of the left
1645      * neighbour narrow input. Calculate a mask of bits to clear.
1646      */
1647     if ((a->shift != 0) && (a->size < 2 || u)) {
1648         int esize = 8 << a->size;
1649         widen_mask = MAKE_64BIT_MASK(0, esize);
1650         widen_mask >>= esize - a->shift;
1651         widen_mask = dup_const(a->size + 1, widen_mask);
1652     }
1653
1654     rm0 = tcg_temp_new_i32();
1655     rm1 = tcg_temp_new_i32();
1656     read_neon_element32(rm0, a->vm, 0, MO_32);
1657     read_neon_element32(rm1, a->vm, 1, MO_32);
1658     tmp = tcg_temp_new_i64();
1659
1660     widenfn(tmp, rm0);
1661     tcg_temp_free_i32(rm0);
1662     if (a->shift != 0) {
1663         tcg_gen_shli_i64(tmp, tmp, a->shift);
1664         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1665     }
1666     write_neon_element64(tmp, a->vd, 0, MO_64);
1667
1668     widenfn(tmp, rm1);
1669     tcg_temp_free_i32(rm1);
1670     if (a->shift != 0) {
1671         tcg_gen_shli_i64(tmp, tmp, a->shift);
1672         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1673     }
1674     write_neon_element64(tmp, a->vd, 1, MO_64);
1675     tcg_temp_free_i64(tmp);
1676     return true;
1677 }
1678
1679 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1680 {
1681     static NeonGenWidenFn * const widenfn[] = {
1682         gen_helper_neon_widen_s8,
1683         gen_helper_neon_widen_s16,
1684         tcg_gen_ext_i32_i64,
1685     };
1686     return do_vshll_2sh(s, a, widenfn[a->size], false);
1687 }
1688
1689 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1690 {
1691     static NeonGenWidenFn * const widenfn[] = {
1692         gen_helper_neon_widen_u8,
1693         gen_helper_neon_widen_u16,
1694         tcg_gen_extu_i32_i64,
1695     };
1696     return do_vshll_2sh(s, a, widenfn[a->size], true);
1697 }
1698
1699 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1700                       gen_helper_gvec_2_ptr *fn)
1701 {
1702     /* FP operations in 2-reg-and-shift group */
1703     int vec_size = a->q ? 16 : 8;
1704     int rd_ofs = neon_full_reg_offset(a->vd);
1705     int rm_ofs = neon_full_reg_offset(a->vm);
1706     TCGv_ptr fpst;
1707
1708     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1709         return false;
1710     }
1711
1712     if (a->size == MO_16) {
1713         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1714             return false;
1715         }
1716     }
1717
1718     /* UNDEF accesses to D16-D31 if they don't exist. */
1719     if (!dc_isar_feature(aa32_simd_r32, s) &&
1720         ((a->vd | a->vm) & 0x10)) {
1721         return false;
1722     }
1723
1724     if ((a->vm | a->vd) & a->q) {
1725         return false;
1726     }
1727
1728     if (!vfp_access_check(s)) {
1729         return true;
1730     }
1731
1732     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1733     tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
1734     tcg_temp_free_ptr(fpst);
1735     return true;
1736 }
1737
1738 #define DO_FP_2SH(INSN, FUNC)                                           \
1739     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1740     {                                                                   \
1741         return do_fp_2sh(s, a, FUNC);                                   \
1742     }
1743
1744 DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
1745 DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
1746 DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
1747 DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)
1748
1749 DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
1750 DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
1751 DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
1752 DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
1753
1754 static uint64_t asimd_imm_const(uint32_t imm, int cmode, int op)
1755 {
1756     /*
1757      * Expand the encoded constant.
1758      * Note that cmode = 2,3,4,5,6,7,10,11,12,13 imm=0 is UNPREDICTABLE.
1759      * We choose to not special-case this and will behave as if a
1760      * valid constant encoding of 0 had been given.
1761      * cmode = 15 op = 1 must UNDEF; we assume decode has handled that.
1762      */
1763     switch (cmode) {
1764     case 0: case 1:
1765         /* no-op */
1766         break;
1767     case 2: case 3:
1768         imm <<= 8;
1769         break;
1770     case 4: case 5:
1771         imm <<= 16;
1772         break;
1773     case 6: case 7:
1774         imm <<= 24;
1775         break;
1776     case 8: case 9:
1777         imm |= imm << 16;
1778         break;
1779     case 10: case 11:
1780         imm = (imm << 8) | (imm << 24);
1781         break;
1782     case 12:
1783         imm = (imm << 8) | 0xff;
1784         break;
1785     case 13:
1786         imm = (imm << 16) | 0xffff;
1787         break;
1788     case 14:
1789         if (op) {
1790             /*
1791              * This is the only case where the top and bottom 32 bits
1792              * of the encoded constant differ.
1793              */
1794             uint64_t imm64 = 0;
1795             int n;
1796
1797             for (n = 0; n < 8; n++) {
1798                 if (imm & (1 << n)) {
1799                     imm64 |= (0xffULL << (n * 8));
1800                 }
1801             }
1802             return imm64;
1803         }
1804         imm |= (imm << 8) | (imm << 16) | (imm << 24);
1805         break;
1806     case 15:
1807         imm = ((imm & 0x80) << 24) | ((imm & 0x3f) << 19)
1808             | ((imm & 0x40) ? (0x1f << 25) : (1 << 30));
1809         break;
1810     }
1811     if (op) {
1812         imm = ~imm;
1813     }
1814     return dup_const(MO_32, imm);
1815 }
1816
1817 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1818                         GVecGen2iFn *fn)
1819 {
1820     uint64_t imm;
1821     int reg_ofs, vec_size;
1822
1823     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1824         return false;
1825     }
1826
1827     /* UNDEF accesses to D16-D31 if they don't exist. */
1828     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1829         return false;
1830     }
1831
1832     if (a->vd & a->q) {
1833         return false;
1834     }
1835
1836     if (!vfp_access_check(s)) {
1837         return true;
1838     }
1839
1840     reg_ofs = neon_full_reg_offset(a->vd);
1841     vec_size = a->q ? 16 : 8;
1842     imm = asimd_imm_const(a->imm, a->cmode, a->op);
1843
1844     fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1845     return true;
1846 }
1847
1848 static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
1849                         int64_t c, uint32_t oprsz, uint32_t maxsz)
1850 {
1851     tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
1852 }
1853
1854 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1855 {
1856     /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1857     GVecGen2iFn *fn;
1858
1859     if ((a->cmode & 1) && a->cmode < 12) {
1860         /* for op=1, the imm will be inverted, so BIC becomes AND. */
1861         fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1862     } else {
1863         /* There is one unallocated cmode/op combination in this space */
1864         if (a->cmode == 15 && a->op == 1) {
1865             return false;
1866         }
1867         fn = gen_VMOV_1r;
1868     }
1869     return do_1reg_imm(s, a, fn);
1870 }
1871
1872 static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1873                            NeonGenWidenFn *widenfn,
1874                            NeonGenTwo64OpFn *opfn,
1875                            int src1_mop, int src2_mop)
1876 {
1877     /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VAADW/VSUBW) */
1878     TCGv_i64 rn0_64, rn1_64, rm_64;
1879
1880     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1881         return false;
1882     }
1883
1884     /* UNDEF accesses to D16-D31 if they don't exist. */
1885     if (!dc_isar_feature(aa32_simd_r32, s) &&
1886         ((a->vd | a->vn | a->vm) & 0x10)) {
1887         return false;
1888     }
1889
1890     if (!opfn) {
1891         /* size == 3 case, which is an entirely different insn group */
1892         return false;
1893     }
1894
1895     if ((a->vd & 1) || (src1_mop == MO_Q && (a->vn & 1))) {
1896         return false;
1897     }
1898
1899     if (!vfp_access_check(s)) {
1900         return true;
1901     }
1902
1903     rn0_64 = tcg_temp_new_i64();
1904     rn1_64 = tcg_temp_new_i64();
1905     rm_64 = tcg_temp_new_i64();
1906
1907     if (src1_mop >= 0) {
1908         read_neon_element64(rn0_64, a->vn, 0, src1_mop);
1909     } else {
1910         TCGv_i32 tmp = tcg_temp_new_i32();
1911         read_neon_element32(tmp, a->vn, 0, MO_32);
1912         widenfn(rn0_64, tmp);
1913         tcg_temp_free_i32(tmp);
1914     }
1915     if (src2_mop >= 0) {
1916         read_neon_element64(rm_64, a->vm, 0, src2_mop);
1917     } else {
1918         TCGv_i32 tmp = tcg_temp_new_i32();
1919         read_neon_element32(tmp, a->vm, 0, MO_32);
1920         widenfn(rm_64, tmp);
1921         tcg_temp_free_i32(tmp);
1922     }
1923
1924     opfn(rn0_64, rn0_64, rm_64);
1925
1926     /*
1927      * Load second pass inputs before storing the first pass result, to
1928      * avoid incorrect results if a narrow input overlaps with the result.
1929      */
1930     if (src1_mop >= 0) {
1931         read_neon_element64(rn1_64, a->vn, 1, src1_mop);
1932     } else {
1933         TCGv_i32 tmp = tcg_temp_new_i32();
1934         read_neon_element32(tmp, a->vn, 1, MO_32);
1935         widenfn(rn1_64, tmp);
1936         tcg_temp_free_i32(tmp);
1937     }
1938     if (src2_mop >= 0) {
1939         read_neon_element64(rm_64, a->vm, 1, src2_mop);
1940     } else {
1941         TCGv_i32 tmp = tcg_temp_new_i32();
1942         read_neon_element32(tmp, a->vm, 1, MO_32);
1943         widenfn(rm_64, tmp);
1944         tcg_temp_free_i32(tmp);
1945     }
1946
1947     write_neon_element64(rn0_64, a->vd, 0, MO_64);
1948
1949     opfn(rn1_64, rn1_64, rm_64);
1950     write_neon_element64(rn1_64, a->vd, 1, MO_64);
1951
1952     tcg_temp_free_i64(rn0_64);
1953     tcg_temp_free_i64(rn1_64);
1954     tcg_temp_free_i64(rm_64);
1955
1956     return true;
1957 }
1958
1959 #define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN)                        \
1960     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1961     {                                                                   \
1962         static NeonGenWidenFn * const widenfn[] = {                     \
1963             gen_helper_neon_widen_##S##8,                               \
1964             gen_helper_neon_widen_##S##16,                              \
1965             NULL, NULL,                                                 \
1966         };                                                              \
1967         static NeonGenTwo64OpFn * const addfn[] = {                     \
1968             gen_helper_neon_##OP##l_u16,                                \
1969             gen_helper_neon_##OP##l_u32,                                \
1970             tcg_gen_##OP##_i64,                                         \
1971             NULL,                                                       \
1972         };                                                              \
1973         int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1;          \
1974         return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size],   \
1975                               SRC1WIDE ? MO_Q : narrow_mop,             \
1976                               narrow_mop);                              \
1977     }
1978
1979 DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN)
1980 DO_PREWIDEN(VADDL_U, u, add, false, 0)
1981 DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN)
1982 DO_PREWIDEN(VSUBL_U, u, sub, false, 0)
1983 DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN)
1984 DO_PREWIDEN(VADDW_U, u, add, true, 0)
1985 DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN)
1986 DO_PREWIDEN(VSUBW_U, u, sub, true, 0)
1987
1988 static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
1989                          NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
1990 {
1991     /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
1992     TCGv_i64 rn_64, rm_64;
1993     TCGv_i32 rd0, rd1;
1994
1995     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1996         return false;
1997     }
1998
1999     /* UNDEF accesses to D16-D31 if they don't exist. */
2000     if (!dc_isar_feature(aa32_simd_r32, s) &&
2001         ((a->vd | a->vn | a->vm) & 0x10)) {
2002         return false;
2003     }
2004
2005     if (!opfn || !narrowfn) {
2006         /* size == 3 case, which is an entirely different insn group */
2007         return false;
2008     }
2009
2010     if ((a->vn | a->vm) & 1) {
2011         return false;
2012     }
2013
2014     if (!vfp_access_check(s)) {
2015         return true;
2016     }
2017
2018     rn_64 = tcg_temp_new_i64();
2019     rm_64 = tcg_temp_new_i64();
2020     rd0 = tcg_temp_new_i32();
2021     rd1 = tcg_temp_new_i32();
2022
2023     read_neon_element64(rn_64, a->vn, 0, MO_64);
2024     read_neon_element64(rm_64, a->vm, 0, MO_64);
2025
2026     opfn(rn_64, rn_64, rm_64);
2027
2028     narrowfn(rd0, rn_64);
2029
2030     read_neon_element64(rn_64, a->vn, 1, MO_64);
2031     read_neon_element64(rm_64, a->vm, 1, MO_64);
2032
2033     opfn(rn_64, rn_64, rm_64);
2034
2035     narrowfn(rd1, rn_64);
2036
2037     write_neon_element32(rd0, a->vd, 0, MO_32);
2038     write_neon_element32(rd1, a->vd, 1, MO_32);
2039
2040     tcg_temp_free_i32(rd0);
2041     tcg_temp_free_i32(rd1);
2042     tcg_temp_free_i64(rn_64);
2043     tcg_temp_free_i64(rm_64);
2044
2045     return true;
2046 }
2047
2048 #define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
2049     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2050     {                                                                   \
2051         static NeonGenTwo64OpFn * const addfn[] = {                     \
2052             gen_helper_neon_##OP##l_u16,                                \
2053             gen_helper_neon_##OP##l_u32,                                \
2054             tcg_gen_##OP##_i64,                                         \
2055             NULL,                                                       \
2056         };                                                              \
2057         static NeonGenNarrowFn * const narrowfn[] = {                   \
2058             gen_helper_neon_##NARROWTYPE##_high_u8,                     \
2059             gen_helper_neon_##NARROWTYPE##_high_u16,                    \
2060             EXTOP,                                                      \
2061             NULL,                                                       \
2062         };                                                              \
2063         return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]);   \
2064     }
2065
2066 static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
2067 {
2068     tcg_gen_addi_i64(rn, rn, 1u << 31);
2069     tcg_gen_extrh_i64_i32(rd, rn);
2070 }
2071
2072 DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
2073 DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
2074 DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
2075 DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
2076
2077 static bool do_long_3d(DisasContext *s, arg_3diff *a,
2078                        NeonGenTwoOpWidenFn *opfn,
2079                        NeonGenTwo64OpFn *accfn)
2080 {
2081     /*
2082      * 3-regs different lengths, long operations.
2083      * These perform an operation on two inputs that returns a double-width
2084      * result, and then possibly perform an accumulation operation of
2085      * that result into the double-width destination.
2086      */
2087     TCGv_i64 rd0, rd1, tmp;
2088     TCGv_i32 rn, rm;
2089
2090     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2091         return false;
2092     }
2093
2094     /* UNDEF accesses to D16-D31 if they don't exist. */
2095     if (!dc_isar_feature(aa32_simd_r32, s) &&
2096         ((a->vd | a->vn | a->vm) & 0x10)) {
2097         return false;
2098     }
2099
2100     if (!opfn) {
2101         /* size == 3 case, which is an entirely different insn group */
2102         return false;
2103     }
2104
2105     if (a->vd & 1) {
2106         return false;
2107     }
2108
2109     if (!vfp_access_check(s)) {
2110         return true;
2111     }
2112
2113     rd0 = tcg_temp_new_i64();
2114     rd1 = tcg_temp_new_i64();
2115
2116     rn = tcg_temp_new_i32();
2117     rm = tcg_temp_new_i32();
2118     read_neon_element32(rn, a->vn, 0, MO_32);
2119     read_neon_element32(rm, a->vm, 0, MO_32);
2120     opfn(rd0, rn, rm);
2121
2122     read_neon_element32(rn, a->vn, 1, MO_32);
2123     read_neon_element32(rm, a->vm, 1, MO_32);
2124     opfn(rd1, rn, rm);
2125     tcg_temp_free_i32(rn);
2126     tcg_temp_free_i32(rm);
2127
2128     /* Don't store results until after all loads: they might overlap */
2129     if (accfn) {
2130         tmp = tcg_temp_new_i64();
2131         read_neon_element64(tmp, a->vd, 0, MO_64);
2132         accfn(rd0, tmp, rd0);
2133         read_neon_element64(tmp, a->vd, 1, MO_64);
2134         accfn(rd1, tmp, rd1);
2135         tcg_temp_free_i64(tmp);
2136     }
2137
2138     write_neon_element64(rd0, a->vd, 0, MO_64);
2139     write_neon_element64(rd1, a->vd, 1, MO_64);
2140     tcg_temp_free_i64(rd0);
2141     tcg_temp_free_i64(rd1);
2142
2143     return true;
2144 }
2145
2146 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2147 {
2148     static NeonGenTwoOpWidenFn * const opfn[] = {
2149         gen_helper_neon_abdl_s16,
2150         gen_helper_neon_abdl_s32,
2151         gen_helper_neon_abdl_s64,
2152         NULL,
2153     };
2154
2155     return do_long_3d(s, a, opfn[a->size], NULL);
2156 }
2157
2158 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2159 {
2160     static NeonGenTwoOpWidenFn * const opfn[] = {
2161         gen_helper_neon_abdl_u16,
2162         gen_helper_neon_abdl_u32,
2163         gen_helper_neon_abdl_u64,
2164         NULL,
2165     };
2166
2167     return do_long_3d(s, a, opfn[a->size], NULL);
2168 }
2169
2170 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2171 {
2172     static NeonGenTwoOpWidenFn * const opfn[] = {
2173         gen_helper_neon_abdl_s16,
2174         gen_helper_neon_abdl_s32,
2175         gen_helper_neon_abdl_s64,
2176         NULL,
2177     };
2178     static NeonGenTwo64OpFn * const addfn[] = {
2179         gen_helper_neon_addl_u16,
2180         gen_helper_neon_addl_u32,
2181         tcg_gen_add_i64,
2182         NULL,
2183     };
2184
2185     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2186 }
2187
2188 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2189 {
2190     static NeonGenTwoOpWidenFn * const opfn[] = {
2191         gen_helper_neon_abdl_u16,
2192         gen_helper_neon_abdl_u32,
2193         gen_helper_neon_abdl_u64,
2194         NULL,
2195     };
2196     static NeonGenTwo64OpFn * const addfn[] = {
2197         gen_helper_neon_addl_u16,
2198         gen_helper_neon_addl_u32,
2199         tcg_gen_add_i64,
2200         NULL,
2201     };
2202
2203     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2204 }
2205
2206 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2207 {
2208     TCGv_i32 lo = tcg_temp_new_i32();
2209     TCGv_i32 hi = tcg_temp_new_i32();
2210
2211     tcg_gen_muls2_i32(lo, hi, rn, rm);
2212     tcg_gen_concat_i32_i64(rd, lo, hi);
2213
2214     tcg_temp_free_i32(lo);
2215     tcg_temp_free_i32(hi);
2216 }
2217
2218 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2219 {
2220     TCGv_i32 lo = tcg_temp_new_i32();
2221     TCGv_i32 hi = tcg_temp_new_i32();
2222
2223     tcg_gen_mulu2_i32(lo, hi, rn, rm);
2224     tcg_gen_concat_i32_i64(rd, lo, hi);
2225
2226     tcg_temp_free_i32(lo);
2227     tcg_temp_free_i32(hi);
2228 }
2229
2230 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2231 {
2232     static NeonGenTwoOpWidenFn * const opfn[] = {
2233         gen_helper_neon_mull_s8,
2234         gen_helper_neon_mull_s16,
2235         gen_mull_s32,
2236         NULL,
2237     };
2238
2239     return do_long_3d(s, a, opfn[a->size], NULL);
2240 }
2241
2242 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2243 {
2244     static NeonGenTwoOpWidenFn * const opfn[] = {
2245         gen_helper_neon_mull_u8,
2246         gen_helper_neon_mull_u16,
2247         gen_mull_u32,
2248         NULL,
2249     };
2250
2251     return do_long_3d(s, a, opfn[a->size], NULL);
2252 }
2253
2254 #define DO_VMLAL(INSN,MULL,ACC)                                         \
2255     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2256     {                                                                   \
2257         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2258             gen_helper_neon_##MULL##8,                                  \
2259             gen_helper_neon_##MULL##16,                                 \
2260             gen_##MULL##32,                                             \
2261             NULL,                                                       \
2262         };                                                              \
2263         static NeonGenTwo64OpFn * const accfn[] = {                     \
2264             gen_helper_neon_##ACC##l_u16,                               \
2265             gen_helper_neon_##ACC##l_u32,                               \
2266             tcg_gen_##ACC##_i64,                                        \
2267             NULL,                                                       \
2268         };                                                              \
2269         return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
2270     }
2271
2272 DO_VMLAL(VMLAL_S,mull_s,add)
2273 DO_VMLAL(VMLAL_U,mull_u,add)
2274 DO_VMLAL(VMLSL_S,mull_s,sub)
2275 DO_VMLAL(VMLSL_U,mull_u,sub)
2276
2277 static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2278 {
2279     gen_helper_neon_mull_s16(rd, rn, rm);
2280     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
2281 }
2282
2283 static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2284 {
2285     gen_mull_s32(rd, rn, rm);
2286     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
2287 }
2288
2289 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2290 {
2291     static NeonGenTwoOpWidenFn * const opfn[] = {
2292         NULL,
2293         gen_VQDMULL_16,
2294         gen_VQDMULL_32,
2295         NULL,
2296     };
2297
2298     return do_long_3d(s, a, opfn[a->size], NULL);
2299 }
2300
2301 static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2302 {
2303     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2304 }
2305
2306 static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2307 {
2308     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2309 }
2310
2311 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2312 {
2313     static NeonGenTwoOpWidenFn * const opfn[] = {
2314         NULL,
2315         gen_VQDMULL_16,
2316         gen_VQDMULL_32,
2317         NULL,
2318     };
2319     static NeonGenTwo64OpFn * const accfn[] = {
2320         NULL,
2321         gen_VQDMLAL_acc_16,
2322         gen_VQDMLAL_acc_32,
2323         NULL,
2324     };
2325
2326     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2327 }
2328
2329 static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2330 {
2331     gen_helper_neon_negl_u32(rm, rm);
2332     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2333 }
2334
2335 static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2336 {
2337     tcg_gen_neg_i64(rm, rm);
2338     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2339 }
2340
2341 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2342 {
2343     static NeonGenTwoOpWidenFn * const opfn[] = {
2344         NULL,
2345         gen_VQDMULL_16,
2346         gen_VQDMULL_32,
2347         NULL,
2348     };
2349     static NeonGenTwo64OpFn * const accfn[] = {
2350         NULL,
2351         gen_VQDMLSL_acc_16,
2352         gen_VQDMLSL_acc_32,
2353         NULL,
2354     };
2355
2356     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2357 }
2358
2359 static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2360 {
2361     gen_helper_gvec_3 *fn_gvec;
2362
2363     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2364         return false;
2365     }
2366
2367     /* UNDEF accesses to D16-D31 if they don't exist. */
2368     if (!dc_isar_feature(aa32_simd_r32, s) &&
2369         ((a->vd | a->vn | a->vm) & 0x10)) {
2370         return false;
2371     }
2372
2373     if (a->vd & 1) {
2374         return false;
2375     }
2376
2377     switch (a->size) {
2378     case 0:
2379         fn_gvec = gen_helper_neon_pmull_h;
2380         break;
2381     case 2:
2382         if (!dc_isar_feature(aa32_pmull, s)) {
2383             return false;
2384         }
2385         fn_gvec = gen_helper_gvec_pmull_q;
2386         break;
2387     default:
2388         return false;
2389     }
2390
2391     if (!vfp_access_check(s)) {
2392         return true;
2393     }
2394
2395     tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd),
2396                        neon_full_reg_offset(a->vn),
2397                        neon_full_reg_offset(a->vm),
2398                        16, 16, 0, fn_gvec);
2399     return true;
2400 }
2401
2402 static void gen_neon_dup_low16(TCGv_i32 var)
2403 {
2404     TCGv_i32 tmp = tcg_temp_new_i32();
2405     tcg_gen_ext16u_i32(var, var);
2406     tcg_gen_shli_i32(tmp, var, 16);
2407     tcg_gen_or_i32(var, var, tmp);
2408     tcg_temp_free_i32(tmp);
2409 }
2410
2411 static void gen_neon_dup_high16(TCGv_i32 var)
2412 {
2413     TCGv_i32 tmp = tcg_temp_new_i32();
2414     tcg_gen_andi_i32(var, var, 0xffff0000);
2415     tcg_gen_shri_i32(tmp, var, 16);
2416     tcg_gen_or_i32(var, var, tmp);
2417     tcg_temp_free_i32(tmp);
2418 }
2419
2420 static inline TCGv_i32 neon_get_scalar(int size, int reg)
2421 {
2422     TCGv_i32 tmp = tcg_temp_new_i32();
2423     if (size == MO_16) {
2424         read_neon_element32(tmp, reg & 7, reg >> 4, MO_32);
2425         if (reg & 8) {
2426             gen_neon_dup_high16(tmp);
2427         } else {
2428             gen_neon_dup_low16(tmp);
2429         }
2430     } else {
2431         read_neon_element32(tmp, reg & 15, reg >> 4, MO_32);
2432     }
2433     return tmp;
2434 }
2435
2436 static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2437                        NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2438 {
2439     /*
2440      * Two registers and a scalar: perform an operation between
2441      * the input elements and the scalar, and then possibly
2442      * perform an accumulation operation of that result into the
2443      * destination.
2444      */
2445     TCGv_i32 scalar, tmp;
2446     int pass;
2447
2448     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2449         return false;
2450     }
2451
2452     /* UNDEF accesses to D16-D31 if they don't exist. */
2453     if (!dc_isar_feature(aa32_simd_r32, s) &&
2454         ((a->vd | a->vn | a->vm) & 0x10)) {
2455         return false;
2456     }
2457
2458     if (!opfn) {
2459         /* Bad size (including size == 3, which is a different insn group) */
2460         return false;
2461     }
2462
2463     if (a->q && ((a->vd | a->vn) & 1)) {
2464         return false;
2465     }
2466
2467     if (!vfp_access_check(s)) {
2468         return true;
2469     }
2470
2471     scalar = neon_get_scalar(a->size, a->vm);
2472     tmp = tcg_temp_new_i32();
2473
2474     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2475         read_neon_element32(tmp, a->vn, pass, MO_32);
2476         opfn(tmp, tmp, scalar);
2477         if (accfn) {
2478             TCGv_i32 rd = tcg_temp_new_i32();
2479             read_neon_element32(rd, a->vd, pass, MO_32);
2480             accfn(tmp, rd, tmp);
2481             tcg_temp_free_i32(rd);
2482         }
2483         write_neon_element32(tmp, a->vd, pass, MO_32);
2484     }
2485     tcg_temp_free_i32(tmp);
2486     tcg_temp_free_i32(scalar);
2487     return true;
2488 }
2489
2490 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2491 {
2492     static NeonGenTwoOpFn * const opfn[] = {
2493         NULL,
2494         gen_helper_neon_mul_u16,
2495         tcg_gen_mul_i32,
2496         NULL,
2497     };
2498
2499     return do_2scalar(s, a, opfn[a->size], NULL);
2500 }
2501
2502 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2503 {
2504     static NeonGenTwoOpFn * const opfn[] = {
2505         NULL,
2506         gen_helper_neon_mul_u16,
2507         tcg_gen_mul_i32,
2508         NULL,
2509     };
2510     static NeonGenTwoOpFn * const accfn[] = {
2511         NULL,
2512         gen_helper_neon_add_u16,
2513         tcg_gen_add_i32,
2514         NULL,
2515     };
2516
2517     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2518 }
2519
2520 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2521 {
2522     static NeonGenTwoOpFn * const opfn[] = {
2523         NULL,
2524         gen_helper_neon_mul_u16,
2525         tcg_gen_mul_i32,
2526         NULL,
2527     };
2528     static NeonGenTwoOpFn * const accfn[] = {
2529         NULL,
2530         gen_helper_neon_sub_u16,
2531         tcg_gen_sub_i32,
2532         NULL,
2533     };
2534
2535     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2536 }
2537
2538 static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
2539                               gen_helper_gvec_3_ptr *fn)
2540 {
2541     /* Two registers and a scalar, using gvec */
2542     int vec_size = a->q ? 16 : 8;
2543     int rd_ofs = neon_full_reg_offset(a->vd);
2544     int rn_ofs = neon_full_reg_offset(a->vn);
2545     int rm_ofs;
2546     int idx;
2547     TCGv_ptr fpstatus;
2548
2549     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2550         return false;
2551     }
2552
2553     /* UNDEF accesses to D16-D31 if they don't exist. */
2554     if (!dc_isar_feature(aa32_simd_r32, s) &&
2555         ((a->vd | a->vn | a->vm) & 0x10)) {
2556         return false;
2557     }
2558
2559     if (!fn) {
2560         /* Bad size (including size == 3, which is a different insn group) */
2561         return false;
2562     }
2563
2564     if (a->q && ((a->vd | a->vn) & 1)) {
2565         return false;
2566     }
2567
2568     if (!vfp_access_check(s)) {
2569         return true;
2570     }
2571
2572     /* a->vm is M:Vm, which encodes both register and index */
2573     idx = extract32(a->vm, a->size + 2, 2);
2574     a->vm = extract32(a->vm, 0, a->size + 2);
2575     rm_ofs = neon_full_reg_offset(a->vm);
2576
2577     fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
2578     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
2579                        vec_size, vec_size, idx, fn);
2580     tcg_temp_free_ptr(fpstatus);
2581     return true;
2582 }
2583
2584 #define DO_VMUL_F_2sc(NAME, FUNC)                                       \
2585     static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a)   \
2586     {                                                                   \
2587         static gen_helper_gvec_3_ptr * const opfn[] = {                 \
2588             NULL,                                                       \
2589             gen_helper_##FUNC##_h,                                      \
2590             gen_helper_##FUNC##_s,                                      \
2591             NULL,                                                       \
2592         };                                                              \
2593         if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
2594             return false;                                               \
2595         }                                                               \
2596         return do_2scalar_fp_vec(s, a, opfn[a->size]);                  \
2597     }
2598
2599 DO_VMUL_F_2sc(VMUL, gvec_fmul_idx)
2600 DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx)
2601 DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx)
2602
2603 WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
2604 WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
2605 WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
2606 WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2607
2608 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2609 {
2610     static NeonGenTwoOpFn * const opfn[] = {
2611         NULL,
2612         gen_VQDMULH_16,
2613         gen_VQDMULH_32,
2614         NULL,
2615     };
2616
2617     return do_2scalar(s, a, opfn[a->size], NULL);
2618 }
2619
2620 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2621 {
2622     static NeonGenTwoOpFn * const opfn[] = {
2623         NULL,
2624         gen_VQRDMULH_16,
2625         gen_VQRDMULH_32,
2626         NULL,
2627     };
2628
2629     return do_2scalar(s, a, opfn[a->size], NULL);
2630 }
2631
2632 static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2633                             NeonGenThreeOpEnvFn *opfn)
2634 {
2635     /*
2636      * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2637      * performs a kind of fused op-then-accumulate using a helper
2638      * function that takes all of rd, rn and the scalar at once.
2639      */
2640     TCGv_i32 scalar, rn, rd;
2641     int pass;
2642
2643     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2644         return false;
2645     }
2646
2647     if (!dc_isar_feature(aa32_rdm, s)) {
2648         return false;
2649     }
2650
2651     /* UNDEF accesses to D16-D31 if they don't exist. */
2652     if (!dc_isar_feature(aa32_simd_r32, s) &&
2653         ((a->vd | a->vn | a->vm) & 0x10)) {
2654         return false;
2655     }
2656
2657     if (!opfn) {
2658         /* Bad size (including size == 3, which is a different insn group) */
2659         return false;
2660     }
2661
2662     if (a->q && ((a->vd | a->vn) & 1)) {
2663         return false;
2664     }
2665
2666     if (!vfp_access_check(s)) {
2667         return true;
2668     }
2669
2670     scalar = neon_get_scalar(a->size, a->vm);
2671     rn = tcg_temp_new_i32();
2672     rd = tcg_temp_new_i32();
2673
2674     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2675         read_neon_element32(rn, a->vn, pass, MO_32);
2676         read_neon_element32(rd, a->vd, pass, MO_32);
2677         opfn(rd, cpu_env, rn, scalar, rd);
2678         write_neon_element32(rd, a->vd, pass, MO_32);
2679     }
2680     tcg_temp_free_i32(rn);
2681     tcg_temp_free_i32(rd);
2682     tcg_temp_free_i32(scalar);
2683
2684     return true;
2685 }
2686
2687 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2688 {
2689     static NeonGenThreeOpEnvFn *opfn[] = {
2690         NULL,
2691         gen_helper_neon_qrdmlah_s16,
2692         gen_helper_neon_qrdmlah_s32,
2693         NULL,
2694     };
2695     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2696 }
2697
2698 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2699 {
2700     static NeonGenThreeOpEnvFn *opfn[] = {
2701         NULL,
2702         gen_helper_neon_qrdmlsh_s16,
2703         gen_helper_neon_qrdmlsh_s32,
2704         NULL,
2705     };
2706     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2707 }
2708
2709 static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2710                             NeonGenTwoOpWidenFn *opfn,
2711                             NeonGenTwo64OpFn *accfn)
2712 {
2713     /*
2714      * Two registers and a scalar, long operations: perform an
2715      * operation on the input elements and the scalar which produces
2716      * a double-width result, and then possibly perform an accumulation
2717      * operation of that result into the destination.
2718      */
2719     TCGv_i32 scalar, rn;
2720     TCGv_i64 rn0_64, rn1_64;
2721
2722     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2723         return false;
2724     }
2725
2726     /* UNDEF accesses to D16-D31 if they don't exist. */
2727     if (!dc_isar_feature(aa32_simd_r32, s) &&
2728         ((a->vd | a->vn | a->vm) & 0x10)) {
2729         return false;
2730     }
2731
2732     if (!opfn) {
2733         /* Bad size (including size == 3, which is a different insn group) */
2734         return false;
2735     }
2736
2737     if (a->vd & 1) {
2738         return false;
2739     }
2740
2741     if (!vfp_access_check(s)) {
2742         return true;
2743     }
2744
2745     scalar = neon_get_scalar(a->size, a->vm);
2746
2747     /* Load all inputs before writing any outputs, in case of overlap */
2748     rn = tcg_temp_new_i32();
2749     read_neon_element32(rn, a->vn, 0, MO_32);
2750     rn0_64 = tcg_temp_new_i64();
2751     opfn(rn0_64, rn, scalar);
2752
2753     read_neon_element32(rn, a->vn, 1, MO_32);
2754     rn1_64 = tcg_temp_new_i64();
2755     opfn(rn1_64, rn, scalar);
2756     tcg_temp_free_i32(rn);
2757     tcg_temp_free_i32(scalar);
2758
2759     if (accfn) {
2760         TCGv_i64 t64 = tcg_temp_new_i64();
2761         read_neon_element64(t64, a->vd, 0, MO_64);
2762         accfn(rn0_64, t64, rn0_64);
2763         read_neon_element64(t64, a->vd, 1, MO_64);
2764         accfn(rn1_64, t64, rn1_64);
2765         tcg_temp_free_i64(t64);
2766     }
2767
2768     write_neon_element64(rn0_64, a->vd, 0, MO_64);
2769     write_neon_element64(rn1_64, a->vd, 1, MO_64);
2770     tcg_temp_free_i64(rn0_64);
2771     tcg_temp_free_i64(rn1_64);
2772     return true;
2773 }
2774
2775 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2776 {
2777     static NeonGenTwoOpWidenFn * const opfn[] = {
2778         NULL,
2779         gen_helper_neon_mull_s16,
2780         gen_mull_s32,
2781         NULL,
2782     };
2783
2784     return do_2scalar_long(s, a, opfn[a->size], NULL);
2785 }
2786
2787 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2788 {
2789     static NeonGenTwoOpWidenFn * const opfn[] = {
2790         NULL,
2791         gen_helper_neon_mull_u16,
2792         gen_mull_u32,
2793         NULL,
2794     };
2795
2796     return do_2scalar_long(s, a, opfn[a->size], NULL);
2797 }
2798
2799 #define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
2800     static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
2801     {                                                                   \
2802         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2803             NULL,                                                       \
2804             gen_helper_neon_##MULL##16,                                 \
2805             gen_##MULL##32,                                             \
2806             NULL,                                                       \
2807         };                                                              \
2808         static NeonGenTwo64OpFn * const accfn[] = {                     \
2809             NULL,                                                       \
2810             gen_helper_neon_##ACC##l_u32,                               \
2811             tcg_gen_##ACC##_i64,                                        \
2812             NULL,                                                       \
2813         };                                                              \
2814         return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
2815     }
2816
2817 DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2818 DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2819 DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2820 DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2821
2822 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2823 {
2824     static NeonGenTwoOpWidenFn * const opfn[] = {
2825         NULL,
2826         gen_VQDMULL_16,
2827         gen_VQDMULL_32,
2828         NULL,
2829     };
2830
2831     return do_2scalar_long(s, a, opfn[a->size], NULL);
2832 }
2833
2834 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2835 {
2836     static NeonGenTwoOpWidenFn * const opfn[] = {
2837         NULL,
2838         gen_VQDMULL_16,
2839         gen_VQDMULL_32,
2840         NULL,
2841     };
2842     static NeonGenTwo64OpFn * const accfn[] = {
2843         NULL,
2844         gen_VQDMLAL_acc_16,
2845         gen_VQDMLAL_acc_32,
2846         NULL,
2847     };
2848
2849     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2850 }
2851
2852 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2853 {
2854     static NeonGenTwoOpWidenFn * const opfn[] = {
2855         NULL,
2856         gen_VQDMULL_16,
2857         gen_VQDMULL_32,
2858         NULL,
2859     };
2860     static NeonGenTwo64OpFn * const accfn[] = {
2861         NULL,
2862         gen_VQDMLSL_acc_16,
2863         gen_VQDMLSL_acc_32,
2864         NULL,
2865     };
2866
2867     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2868 }
2869
2870 static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2871 {
2872     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2873         return false;
2874     }
2875
2876     /* UNDEF accesses to D16-D31 if they don't exist. */
2877     if (!dc_isar_feature(aa32_simd_r32, s) &&
2878         ((a->vd | a->vn | a->vm) & 0x10)) {
2879         return false;
2880     }
2881
2882     if ((a->vn | a->vm | a->vd) & a->q) {
2883         return false;
2884     }
2885
2886     if (a->imm > 7 && !a->q) {
2887         return false;
2888     }
2889
2890     if (!vfp_access_check(s)) {
2891         return true;
2892     }
2893
2894     if (!a->q) {
2895         /* Extract 64 bits from <Vm:Vn> */
2896         TCGv_i64 left, right, dest;
2897
2898         left = tcg_temp_new_i64();
2899         right = tcg_temp_new_i64();
2900         dest = tcg_temp_new_i64();
2901
2902         read_neon_element64(right, a->vn, 0, MO_64);
2903         read_neon_element64(left, a->vm, 0, MO_64);
2904         tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2905         write_neon_element64(dest, a->vd, 0, MO_64);
2906
2907         tcg_temp_free_i64(left);
2908         tcg_temp_free_i64(right);
2909         tcg_temp_free_i64(dest);
2910     } else {
2911         /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2912         TCGv_i64 left, middle, right, destleft, destright;
2913
2914         left = tcg_temp_new_i64();
2915         middle = tcg_temp_new_i64();
2916         right = tcg_temp_new_i64();
2917         destleft = tcg_temp_new_i64();
2918         destright = tcg_temp_new_i64();
2919
2920         if (a->imm < 8) {
2921             read_neon_element64(right, a->vn, 0, MO_64);
2922             read_neon_element64(middle, a->vn, 1, MO_64);
2923             tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2924             read_neon_element64(left, a->vm, 0, MO_64);
2925             tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2926         } else {
2927             read_neon_element64(right, a->vn, 1, MO_64);
2928             read_neon_element64(middle, a->vm, 0, MO_64);
2929             tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2930             read_neon_element64(left, a->vm, 1, MO_64);
2931             tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2932         }
2933
2934         write_neon_element64(destright, a->vd, 0, MO_64);
2935         write_neon_element64(destleft, a->vd, 1, MO_64);
2936
2937         tcg_temp_free_i64(destright);
2938         tcg_temp_free_i64(destleft);
2939         tcg_temp_free_i64(right);
2940         tcg_temp_free_i64(middle);
2941         tcg_temp_free_i64(left);
2942     }
2943     return true;
2944 }
2945
2946 static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2947 {
2948     TCGv_i64 val, def;
2949     TCGv_i32 desc;
2950
2951     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2952         return false;
2953     }
2954
2955     /* UNDEF accesses to D16-D31 if they don't exist. */
2956     if (!dc_isar_feature(aa32_simd_r32, s) &&
2957         ((a->vd | a->vn | a->vm) & 0x10)) {
2958         return false;
2959     }
2960
2961     if ((a->vn + a->len + 1) > 32) {
2962         /*
2963          * This is UNPREDICTABLE; we choose to UNDEF to avoid the
2964          * helper function running off the end of the register file.
2965          */
2966         return false;
2967     }
2968
2969     if (!vfp_access_check(s)) {
2970         return true;
2971     }
2972
2973     desc = tcg_const_i32((a->vn << 2) | a->len);
2974     def = tcg_temp_new_i64();
2975     if (a->op) {
2976         read_neon_element64(def, a->vd, 0, MO_64);
2977     } else {
2978         tcg_gen_movi_i64(def, 0);
2979     }
2980     val = tcg_temp_new_i64();
2981     read_neon_element64(val, a->vm, 0, MO_64);
2982
2983     gen_helper_neon_tbl(val, cpu_env, desc, val, def);
2984     write_neon_element64(val, a->vd, 0, MO_64);
2985
2986     tcg_temp_free_i64(def);
2987     tcg_temp_free_i64(val);
2988     tcg_temp_free_i32(desc);
2989     return true;
2990 }
2991
2992 static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
2993 {
2994     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2995         return false;
2996     }
2997
2998     /* UNDEF accesses to D16-D31 if they don't exist. */
2999     if (!dc_isar_feature(aa32_simd_r32, s) &&
3000         ((a->vd | a->vm) & 0x10)) {
3001         return false;
3002     }
3003
3004     if (a->vd & a->q) {
3005         return false;
3006     }
3007
3008     if (!vfp_access_check(s)) {
3009         return true;
3010     }
3011
3012     tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd),
3013                          neon_element_offset(a->vm, a->index, a->size),
3014                          a->q ? 16 : 8, a->q ? 16 : 8);
3015     return true;
3016 }
3017
3018 static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
3019 {
3020     int pass, half;
3021     TCGv_i32 tmp[2];
3022
3023     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3024         return false;
3025     }
3026
3027     /* UNDEF accesses to D16-D31 if they don't exist. */
3028     if (!dc_isar_feature(aa32_simd_r32, s) &&
3029         ((a->vd | a->vm) & 0x10)) {
3030         return false;
3031     }
3032
3033     if ((a->vd | a->vm) & a->q) {
3034         return false;
3035     }
3036
3037     if (a->size == 3) {
3038         return false;
3039     }
3040
3041     if (!vfp_access_check(s)) {
3042         return true;
3043     }
3044
3045     tmp[0] = tcg_temp_new_i32();
3046     tmp[1] = tcg_temp_new_i32();
3047
3048     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3049         for (half = 0; half < 2; half++) {
3050             read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32);
3051             switch (a->size) {
3052             case 0:
3053                 tcg_gen_bswap32_i32(tmp[half], tmp[half]);
3054                 break;
3055             case 1:
3056                 gen_swap_half(tmp[half], tmp[half]);
3057                 break;
3058             case 2:
3059                 break;
3060             default:
3061                 g_assert_not_reached();
3062             }
3063         }
3064         write_neon_element32(tmp[1], a->vd, pass * 2, MO_32);
3065         write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32);
3066     }
3067
3068     tcg_temp_free_i32(tmp[0]);
3069     tcg_temp_free_i32(tmp[1]);
3070     return true;
3071 }
3072
3073 static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
3074                               NeonGenWidenFn *widenfn,
3075                               NeonGenTwo64OpFn *opfn,
3076                               NeonGenTwo64OpFn *accfn)
3077 {
3078     /*
3079      * Pairwise long operations: widen both halves of the pair,
3080      * combine the pairs with the opfn, and then possibly accumulate
3081      * into the destination with the accfn.
3082      */
3083     int pass;
3084
3085     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3086         return false;
3087     }
3088
3089     /* UNDEF accesses to D16-D31 if they don't exist. */
3090     if (!dc_isar_feature(aa32_simd_r32, s) &&
3091         ((a->vd | a->vm) & 0x10)) {
3092         return false;
3093     }
3094
3095     if ((a->vd | a->vm) & a->q) {
3096         return false;
3097     }
3098
3099     if (!widenfn) {
3100         return false;
3101     }
3102
3103     if (!vfp_access_check(s)) {
3104         return true;
3105     }
3106
3107     for (pass = 0; pass < a->q + 1; pass++) {
3108         TCGv_i32 tmp;
3109         TCGv_i64 rm0_64, rm1_64, rd_64;
3110
3111         rm0_64 = tcg_temp_new_i64();
3112         rm1_64 = tcg_temp_new_i64();
3113         rd_64 = tcg_temp_new_i64();
3114
3115         tmp = tcg_temp_new_i32();
3116         read_neon_element32(tmp, a->vm, pass * 2, MO_32);
3117         widenfn(rm0_64, tmp);
3118         read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32);
3119         widenfn(rm1_64, tmp);
3120         tcg_temp_free_i32(tmp);
3121
3122         opfn(rd_64, rm0_64, rm1_64);
3123         tcg_temp_free_i64(rm0_64);
3124         tcg_temp_free_i64(rm1_64);
3125
3126         if (accfn) {
3127             TCGv_i64 tmp64 = tcg_temp_new_i64();
3128             read_neon_element64(tmp64, a->vd, pass, MO_64);
3129             accfn(rd_64, tmp64, rd_64);
3130             tcg_temp_free_i64(tmp64);
3131         }
3132         write_neon_element64(rd_64, a->vd, pass, MO_64);
3133         tcg_temp_free_i64(rd_64);
3134     }
3135     return true;
3136 }
3137
3138 static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
3139 {
3140     static NeonGenWidenFn * const widenfn[] = {
3141         gen_helper_neon_widen_s8,
3142         gen_helper_neon_widen_s16,
3143         tcg_gen_ext_i32_i64,
3144         NULL,
3145     };
3146     static NeonGenTwo64OpFn * const opfn[] = {
3147         gen_helper_neon_paddl_u16,
3148         gen_helper_neon_paddl_u32,
3149         tcg_gen_add_i64,
3150         NULL,
3151     };
3152
3153     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3154 }
3155
3156 static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
3157 {
3158     static NeonGenWidenFn * const widenfn[] = {
3159         gen_helper_neon_widen_u8,
3160         gen_helper_neon_widen_u16,
3161         tcg_gen_extu_i32_i64,
3162         NULL,
3163     };
3164     static NeonGenTwo64OpFn * const opfn[] = {
3165         gen_helper_neon_paddl_u16,
3166         gen_helper_neon_paddl_u32,
3167         tcg_gen_add_i64,
3168         NULL,
3169     };
3170
3171     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3172 }
3173
3174 static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
3175 {
3176     static NeonGenWidenFn * const widenfn[] = {
3177         gen_helper_neon_widen_s8,
3178         gen_helper_neon_widen_s16,
3179         tcg_gen_ext_i32_i64,
3180         NULL,
3181     };
3182     static NeonGenTwo64OpFn * const opfn[] = {
3183         gen_helper_neon_paddl_u16,
3184         gen_helper_neon_paddl_u32,
3185         tcg_gen_add_i64,
3186         NULL,
3187     };
3188     static NeonGenTwo64OpFn * const accfn[] = {
3189         gen_helper_neon_addl_u16,
3190         gen_helper_neon_addl_u32,
3191         tcg_gen_add_i64,
3192         NULL,
3193     };
3194
3195     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3196                              accfn[a->size]);
3197 }
3198
3199 static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
3200 {
3201     static NeonGenWidenFn * const widenfn[] = {
3202         gen_helper_neon_widen_u8,
3203         gen_helper_neon_widen_u16,
3204         tcg_gen_extu_i32_i64,
3205         NULL,
3206     };
3207     static NeonGenTwo64OpFn * const opfn[] = {
3208         gen_helper_neon_paddl_u16,
3209         gen_helper_neon_paddl_u32,
3210         tcg_gen_add_i64,
3211         NULL,
3212     };
3213     static NeonGenTwo64OpFn * const accfn[] = {
3214         gen_helper_neon_addl_u16,
3215         gen_helper_neon_addl_u32,
3216         tcg_gen_add_i64,
3217         NULL,
3218     };
3219
3220     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3221                              accfn[a->size]);
3222 }
3223
3224 typedef void ZipFn(TCGv_ptr, TCGv_ptr);
3225
3226 static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
3227                        ZipFn *fn)
3228 {
3229     TCGv_ptr pd, pm;
3230
3231     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3232         return false;
3233     }
3234
3235     /* UNDEF accesses to D16-D31 if they don't exist. */
3236     if (!dc_isar_feature(aa32_simd_r32, s) &&
3237         ((a->vd | a->vm) & 0x10)) {
3238         return false;
3239     }
3240
3241     if ((a->vd | a->vm) & a->q) {
3242         return false;
3243     }
3244
3245     if (!fn) {
3246         /* Bad size or size/q combination */
3247         return false;
3248     }
3249
3250     if (!vfp_access_check(s)) {
3251         return true;
3252     }
3253
3254     pd = vfp_reg_ptr(true, a->vd);
3255     pm = vfp_reg_ptr(true, a->vm);
3256     fn(pd, pm);
3257     tcg_temp_free_ptr(pd);
3258     tcg_temp_free_ptr(pm);
3259     return true;
3260 }
3261
3262 static bool trans_VUZP(DisasContext *s, arg_2misc *a)
3263 {
3264     static ZipFn * const fn[2][4] = {
3265         {
3266             gen_helper_neon_unzip8,
3267             gen_helper_neon_unzip16,
3268             NULL,
3269             NULL,
3270         }, {
3271             gen_helper_neon_qunzip8,
3272             gen_helper_neon_qunzip16,
3273             gen_helper_neon_qunzip32,
3274             NULL,
3275         }
3276     };
3277     return do_zip_uzp(s, a, fn[a->q][a->size]);
3278 }
3279
3280 static bool trans_VZIP(DisasContext *s, arg_2misc *a)
3281 {
3282     static ZipFn * const fn[2][4] = {
3283         {
3284             gen_helper_neon_zip8,
3285             gen_helper_neon_zip16,
3286             NULL,
3287             NULL,
3288         }, {
3289             gen_helper_neon_qzip8,
3290             gen_helper_neon_qzip16,
3291             gen_helper_neon_qzip32,
3292             NULL,
3293         }
3294     };
3295     return do_zip_uzp(s, a, fn[a->q][a->size]);
3296 }
3297
3298 static bool do_vmovn(DisasContext *s, arg_2misc *a,
3299                      NeonGenNarrowEnvFn *narrowfn)
3300 {
3301     TCGv_i64 rm;
3302     TCGv_i32 rd0, rd1;
3303
3304     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3305         return false;
3306     }
3307
3308     /* UNDEF accesses to D16-D31 if they don't exist. */
3309     if (!dc_isar_feature(aa32_simd_r32, s) &&
3310         ((a->vd | a->vm) & 0x10)) {
3311         return false;
3312     }
3313
3314     if (a->vm & 1) {
3315         return false;
3316     }
3317
3318     if (!narrowfn) {
3319         return false;
3320     }
3321
3322     if (!vfp_access_check(s)) {
3323         return true;
3324     }
3325
3326     rm = tcg_temp_new_i64();
3327     rd0 = tcg_temp_new_i32();
3328     rd1 = tcg_temp_new_i32();
3329
3330     read_neon_element64(rm, a->vm, 0, MO_64);
3331     narrowfn(rd0, cpu_env, rm);
3332     read_neon_element64(rm, a->vm, 1, MO_64);
3333     narrowfn(rd1, cpu_env, rm);
3334     write_neon_element32(rd0, a->vd, 0, MO_32);
3335     write_neon_element32(rd1, a->vd, 1, MO_32);
3336     tcg_temp_free_i32(rd0);
3337     tcg_temp_free_i32(rd1);
3338     tcg_temp_free_i64(rm);
3339     return true;
3340 }
3341
3342 #define DO_VMOVN(INSN, FUNC)                                    \
3343     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3344     {                                                           \
3345         static NeonGenNarrowEnvFn * const narrowfn[] = {        \
3346             FUNC##8,                                            \
3347             FUNC##16,                                           \
3348             FUNC##32,                                           \
3349             NULL,                                               \
3350         };                                                      \
3351         return do_vmovn(s, a, narrowfn[a->size]);               \
3352     }
3353
3354 DO_VMOVN(VMOVN, gen_neon_narrow_u)
3355 DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
3356 DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
3357 DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)
3358
3359 static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
3360 {
3361     TCGv_i32 rm0, rm1;
3362     TCGv_i64 rd;
3363     static NeonGenWidenFn * const widenfns[] = {
3364         gen_helper_neon_widen_u8,
3365         gen_helper_neon_widen_u16,
3366         tcg_gen_extu_i32_i64,
3367         NULL,
3368     };
3369     NeonGenWidenFn *widenfn = widenfns[a->size];
3370
3371     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3372         return false;
3373     }
3374
3375     /* UNDEF accesses to D16-D31 if they don't exist. */
3376     if (!dc_isar_feature(aa32_simd_r32, s) &&
3377         ((a->vd | a->vm) & 0x10)) {
3378         return false;
3379     }
3380
3381     if (a->vd & 1) {
3382         return false;
3383     }
3384
3385     if (!widenfn) {
3386         return false;
3387     }
3388
3389     if (!vfp_access_check(s)) {
3390         return true;
3391     }
3392
3393     rd = tcg_temp_new_i64();
3394     rm0 = tcg_temp_new_i32();
3395     rm1 = tcg_temp_new_i32();
3396
3397     read_neon_element32(rm0, a->vm, 0, MO_32);
3398     read_neon_element32(rm1, a->vm, 1, MO_32);
3399
3400     widenfn(rd, rm0);
3401     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3402     write_neon_element64(rd, a->vd, 0, MO_64);
3403     widenfn(rd, rm1);
3404     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3405     write_neon_element64(rd, a->vd, 1, MO_64);
3406
3407     tcg_temp_free_i64(rd);
3408     tcg_temp_free_i32(rm0);
3409     tcg_temp_free_i32(rm1);
3410     return true;
3411 }
3412
3413 static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
3414 {
3415     TCGv_ptr fpst;
3416     TCGv_i32 ahp, tmp, tmp2, tmp3;
3417
3418     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3419         !dc_isar_feature(aa32_fp16_spconv, s)) {
3420         return false;
3421     }
3422
3423     /* UNDEF accesses to D16-D31 if they don't exist. */
3424     if (!dc_isar_feature(aa32_simd_r32, s) &&
3425         ((a->vd | a->vm) & 0x10)) {
3426         return false;
3427     }
3428
3429     if ((a->vm & 1) || (a->size != 1)) {
3430         return false;
3431     }
3432
3433     if (!vfp_access_check(s)) {
3434         return true;
3435     }
3436
3437     fpst = fpstatus_ptr(FPST_STD);
3438     ahp = get_ahp_flag();
3439     tmp = tcg_temp_new_i32();
3440     read_neon_element32(tmp, a->vm, 0, MO_32);
3441     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3442     tmp2 = tcg_temp_new_i32();
3443     read_neon_element32(tmp2, a->vm, 1, MO_32);
3444     gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
3445     tcg_gen_shli_i32(tmp2, tmp2, 16);
3446     tcg_gen_or_i32(tmp2, tmp2, tmp);
3447     read_neon_element32(tmp, a->vm, 2, MO_32);
3448     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3449     tmp3 = tcg_temp_new_i32();
3450     read_neon_element32(tmp3, a->vm, 3, MO_32);
3451     write_neon_element32(tmp2, a->vd, 0, MO_32);
3452     tcg_temp_free_i32(tmp2);
3453     gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
3454     tcg_gen_shli_i32(tmp3, tmp3, 16);
3455     tcg_gen_or_i32(tmp3, tmp3, tmp);
3456     write_neon_element32(tmp3, a->vd, 1, MO_32);
3457     tcg_temp_free_i32(tmp3);
3458     tcg_temp_free_i32(tmp);
3459     tcg_temp_free_i32(ahp);
3460     tcg_temp_free_ptr(fpst);
3461
3462     return true;
3463 }
3464
3465 static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
3466 {
3467     TCGv_ptr fpst;
3468     TCGv_i32 ahp, tmp, tmp2, tmp3;
3469
3470     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3471         !dc_isar_feature(aa32_fp16_spconv, s)) {
3472         return false;
3473     }
3474
3475     /* UNDEF accesses to D16-D31 if they don't exist. */
3476     if (!dc_isar_feature(aa32_simd_r32, s) &&
3477         ((a->vd | a->vm) & 0x10)) {
3478         return false;
3479     }
3480
3481     if ((a->vd & 1) || (a->size != 1)) {
3482         return false;
3483     }
3484
3485     if (!vfp_access_check(s)) {
3486         return true;
3487     }
3488
3489     fpst = fpstatus_ptr(FPST_STD);
3490     ahp = get_ahp_flag();
3491     tmp3 = tcg_temp_new_i32();
3492     tmp2 = tcg_temp_new_i32();
3493     tmp = tcg_temp_new_i32();
3494     read_neon_element32(tmp, a->vm, 0, MO_32);
3495     read_neon_element32(tmp2, a->vm, 1, MO_32);
3496     tcg_gen_ext16u_i32(tmp3, tmp);
3497     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3498     write_neon_element32(tmp3, a->vd, 0, MO_32);
3499     tcg_gen_shri_i32(tmp, tmp, 16);
3500     gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
3501     write_neon_element32(tmp, a->vd, 1, MO_32);
3502     tcg_temp_free_i32(tmp);
3503     tcg_gen_ext16u_i32(tmp3, tmp2);
3504     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3505     write_neon_element32(tmp3, a->vd, 2, MO_32);
3506     tcg_temp_free_i32(tmp3);
3507     tcg_gen_shri_i32(tmp2, tmp2, 16);
3508     gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
3509     write_neon_element32(tmp2, a->vd, 3, MO_32);
3510     tcg_temp_free_i32(tmp2);
3511     tcg_temp_free_i32(ahp);
3512     tcg_temp_free_ptr(fpst);
3513
3514     return true;
3515 }
3516
3517 static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
3518 {
3519     int vec_size = a->q ? 16 : 8;
3520     int rd_ofs = neon_full_reg_offset(a->vd);
3521     int rm_ofs = neon_full_reg_offset(a->vm);
3522
3523     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3524         return false;
3525     }
3526
3527     /* UNDEF accesses to D16-D31 if they don't exist. */
3528     if (!dc_isar_feature(aa32_simd_r32, s) &&
3529         ((a->vd | a->vm) & 0x10)) {
3530         return false;
3531     }
3532
3533     if (a->size == 3) {
3534         return false;
3535     }
3536
3537     if ((a->vd | a->vm) & a->q) {
3538         return false;
3539     }
3540
3541     if (!vfp_access_check(s)) {
3542         return true;
3543     }
3544
3545     fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);
3546
3547     return true;
3548 }
3549
3550 #define DO_2MISC_VEC(INSN, FN)                                  \
3551     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3552     {                                                           \
3553         return do_2misc_vec(s, a, FN);                          \
3554     }
3555
3556 DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
3557 DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
3558 DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
3559 DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
3560 DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
3561 DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
3562 DO_2MISC_VEC(VCLT0, gen_gvec_clt0)
3563
3564 static bool trans_VMVN(DisasContext *s, arg_2misc *a)
3565 {
3566     if (a->size != 0) {
3567         return false;
3568     }
3569     return do_2misc_vec(s, a, tcg_gen_gvec_not);
3570 }
3571
3572 #define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3573     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3574                          uint32_t rm_ofs, uint32_t oprsz,               \
3575                          uint32_t maxsz)                                \
3576     {                                                                   \
3577         tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz,        \
3578                            DATA, FUNC);                                 \
3579     }
3580
3581 #define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3582     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3583                          uint32_t rm_ofs, uint32_t oprsz,               \
3584                          uint32_t maxsz)                                \
3585     {                                                                   \
3586         tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC);   \
3587     }
3588
3589 WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
3590 WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1)
3591 WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
3592 WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1)
3593 WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
3594 WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
3595 WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)
3596
3597 #define DO_2M_CRYPTO(INSN, FEATURE, SIZE)                       \
3598     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3599     {                                                           \
3600         if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) {  \
3601             return false;                                       \
3602         }                                                       \
3603         return do_2misc_vec(s, a, gen_##INSN);                  \
3604     }
3605
3606 DO_2M_CRYPTO(AESE, aa32_aes, 0)
3607 DO_2M_CRYPTO(AESD, aa32_aes, 0)
3608 DO_2M_CRYPTO(AESMC, aa32_aes, 0)
3609 DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
3610 DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
3611 DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
3612 DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)
3613
3614 static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
3615 {
3616     TCGv_i32 tmp;
3617     int pass;
3618
3619     /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
3620     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3621         return false;
3622     }
3623
3624     /* UNDEF accesses to D16-D31 if they don't exist. */
3625     if (!dc_isar_feature(aa32_simd_r32, s) &&
3626         ((a->vd | a->vm) & 0x10)) {
3627         return false;
3628     }
3629
3630     if (!fn) {
3631         return false;
3632     }
3633
3634     if ((a->vd | a->vm) & a->q) {
3635         return false;
3636     }
3637
3638     if (!vfp_access_check(s)) {
3639         return true;
3640     }
3641
3642     tmp = tcg_temp_new_i32();
3643     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3644         read_neon_element32(tmp, a->vm, pass, MO_32);
3645         fn(tmp, tmp);
3646         write_neon_element32(tmp, a->vd, pass, MO_32);
3647     }
3648     tcg_temp_free_i32(tmp);
3649
3650     return true;
3651 }
3652
3653 static bool trans_VREV32(DisasContext *s, arg_2misc *a)
3654 {
3655     static NeonGenOneOpFn * const fn[] = {
3656         tcg_gen_bswap32_i32,
3657         gen_swap_half,
3658         NULL,
3659         NULL,
3660     };
3661     return do_2misc(s, a, fn[a->size]);
3662 }
3663
3664 static bool trans_VREV16(DisasContext *s, arg_2misc *a)
3665 {
3666     if (a->size != 0) {
3667         return false;
3668     }
3669     return do_2misc(s, a, gen_rev16);
3670 }
3671
3672 static bool trans_VCLS(DisasContext *s, arg_2misc *a)
3673 {
3674     static NeonGenOneOpFn * const fn[] = {
3675         gen_helper_neon_cls_s8,
3676         gen_helper_neon_cls_s16,
3677         gen_helper_neon_cls_s32,
3678         NULL,
3679     };
3680     return do_2misc(s, a, fn[a->size]);
3681 }
3682
3683 static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm)
3684 {
3685     tcg_gen_clzi_i32(rd, rm, 32);
3686 }
3687
3688 static bool trans_VCLZ(DisasContext *s, arg_2misc *a)
3689 {
3690     static NeonGenOneOpFn * const fn[] = {
3691         gen_helper_neon_clz_u8,
3692         gen_helper_neon_clz_u16,
3693         do_VCLZ_32,
3694         NULL,
3695     };
3696     return do_2misc(s, a, fn[a->size]);
3697 }
3698
3699 static bool trans_VCNT(DisasContext *s, arg_2misc *a)
3700 {
3701     if (a->size != 0) {
3702         return false;
3703     }
3704     return do_2misc(s, a, gen_helper_neon_cnt_u8);
3705 }
3706
3707 static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3708                        uint32_t oprsz, uint32_t maxsz)
3709 {
3710     tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs,
3711                       vece == MO_16 ? 0x7fff : 0x7fffffff,
3712                       oprsz, maxsz);
3713 }
3714
3715 static bool trans_VABS_F(DisasContext *s, arg_2misc *a)
3716 {
3717     if (a->size == MO_16) {
3718         if (!dc_isar_feature(aa32_fp16_arith, s)) {
3719             return false;
3720         }
3721     } else if (a->size != MO_32) {
3722         return false;
3723     }
3724     return do_2misc_vec(s, a, gen_VABS_F);
3725 }
3726
3727 static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3728                        uint32_t oprsz, uint32_t maxsz)
3729 {
3730     tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs,
3731                       vece == MO_16 ? 0x8000 : 0x80000000,
3732                       oprsz, maxsz);
3733 }
3734
3735 static bool trans_VNEG_F(DisasContext *s, arg_2misc *a)
3736 {
3737     if (a->size == MO_16) {
3738         if (!dc_isar_feature(aa32_fp16_arith, s)) {
3739             return false;
3740         }
3741     } else if (a->size != MO_32) {
3742         return false;
3743     }
3744     return do_2misc_vec(s, a, gen_VNEG_F);
3745 }
3746
3747 static bool trans_VRECPE(DisasContext *s, arg_2misc *a)
3748 {
3749     if (a->size != 2) {
3750         return false;
3751     }
3752     return do_2misc(s, a, gen_helper_recpe_u32);
3753 }
3754
3755 static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a)
3756 {
3757     if (a->size != 2) {
3758         return false;
3759     }
3760     return do_2misc(s, a, gen_helper_rsqrte_u32);
3761 }
3762
3763 #define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \
3764     static void WRAPNAME(TCGv_i32 d, TCGv_i32 m)        \
3765     {                                                   \
3766         FUNC(d, cpu_env, m);                            \
3767     }
3768
3769 WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8)
3770 WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16)
3771 WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32)
3772 WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8)
3773 WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16)
3774 WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32)
3775
3776 static bool trans_VQABS(DisasContext *s, arg_2misc *a)
3777 {
3778     static NeonGenOneOpFn * const fn[] = {
3779         gen_VQABS_s8,
3780         gen_VQABS_s16,
3781         gen_VQABS_s32,
3782         NULL,
3783     };
3784     return do_2misc(s, a, fn[a->size]);
3785 }
3786
3787 static bool trans_VQNEG(DisasContext *s, arg_2misc *a)
3788 {
3789     static NeonGenOneOpFn * const fn[] = {
3790         gen_VQNEG_s8,
3791         gen_VQNEG_s16,
3792         gen_VQNEG_s32,
3793         NULL,
3794     };
3795     return do_2misc(s, a, fn[a->size]);
3796 }
3797
3798 #define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC)                             \
3799     static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
3800                            uint32_t rm_ofs,                             \
3801                            uint32_t oprsz, uint32_t maxsz)              \
3802     {                                                                   \
3803         static gen_helper_gvec_2_ptr * const fns[4] = {                 \
3804             NULL, HFUNC, SFUNC, NULL,                                   \
3805         };                                                              \
3806         TCGv_ptr fpst;                                                  \
3807         fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD);   \
3808         tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0,       \
3809                            fns[vece]);                                  \
3810         tcg_temp_free_ptr(fpst);                                        \
3811     }                                                                   \
3812     static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
3813     {                                                                   \
3814         if (a->size == MO_16) {                                         \
3815             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
3816                 return false;                                           \
3817             }                                                           \
3818         } else if (a->size != MO_32) {                                  \
3819             return false;                                               \
3820         }                                                               \
3821         return do_2misc_vec(s, a, gen_##INSN);                          \
3822     }
3823
3824 DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s)
3825 DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s)
3826 DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s)
3827 DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s)
3828 DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s)
3829 DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s)
3830 DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s)
3831 DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos)
3832 DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos)
3833 DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs)
3834 DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs)
3835
3836 DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s)
3837
3838 static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
3839 {
3840     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
3841         return false;
3842     }
3843     return trans_VRINTX_impl(s, a);
3844 }
3845
3846 #define DO_VEC_RMODE(INSN, RMODE, OP)                                   \
3847     static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
3848                            uint32_t rm_ofs,                             \
3849                            uint32_t oprsz, uint32_t maxsz)              \
3850     {                                                                   \
3851         static gen_helper_gvec_2_ptr * const fns[4] = {                 \
3852             NULL,                                                       \
3853             gen_helper_gvec_##OP##h,                                    \
3854             gen_helper_gvec_##OP##s,                                    \
3855             NULL,                                                       \
3856         };                                                              \
3857         TCGv_ptr fpst;                                                  \
3858         fpst = fpstatus_ptr(vece == 1 ? FPST_STD_F16 : FPST_STD);       \
3859         tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz,          \
3860                            arm_rmode_to_sf(RMODE), fns[vece]);          \
3861         tcg_temp_free_ptr(fpst);                                        \
3862     }                                                                   \
3863     static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
3864     {                                                                   \
3865         if (!arm_dc_feature(s, ARM_FEATURE_V8)) {                       \
3866             return false;                                               \
3867         }                                                               \
3868         if (a->size == MO_16) {                                         \
3869             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
3870                 return false;                                           \
3871             }                                                           \
3872         } else if (a->size != MO_32) {                                  \
3873             return false;                                               \
3874         }                                                               \
3875         return do_2misc_vec(s, a, gen_##INSN);                          \
3876     }
3877
3878 DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u)
3879 DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s)
3880 DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u)
3881 DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s)
3882 DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u)
3883 DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s)
3884 DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u)
3885 DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s)
3886
3887 DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_)
3888 DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_)
3889 DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_)
3890 DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_)
3891 DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_)
3892
3893 static bool trans_VSWP(DisasContext *s, arg_2misc *a)
3894 {
3895     TCGv_i64 rm, rd;
3896     int pass;
3897
3898     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3899         return false;
3900     }
3901
3902     /* UNDEF accesses to D16-D31 if they don't exist. */
3903     if (!dc_isar_feature(aa32_simd_r32, s) &&
3904         ((a->vd | a->vm) & 0x10)) {
3905         return false;
3906     }
3907
3908     if (a->size != 0) {
3909         return false;
3910     }
3911
3912     if ((a->vd | a->vm) & a->q) {
3913         return false;
3914     }
3915
3916     if (!vfp_access_check(s)) {
3917         return true;
3918     }
3919
3920     rm = tcg_temp_new_i64();
3921     rd = tcg_temp_new_i64();
3922     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3923         read_neon_element64(rm, a->vm, pass, MO_64);
3924         read_neon_element64(rd, a->vd, pass, MO_64);
3925         write_neon_element64(rm, a->vd, pass, MO_64);
3926         write_neon_element64(rd, a->vm, pass, MO_64);
3927     }
3928     tcg_temp_free_i64(rm);
3929     tcg_temp_free_i64(rd);
3930
3931     return true;
3932 }
3933 static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1)
3934 {
3935     TCGv_i32 rd, tmp;
3936
3937     rd = tcg_temp_new_i32();
3938     tmp = tcg_temp_new_i32();
3939
3940     tcg_gen_shli_i32(rd, t0, 8);
3941     tcg_gen_andi_i32(rd, rd, 0xff00ff00);
3942     tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
3943     tcg_gen_or_i32(rd, rd, tmp);
3944
3945     tcg_gen_shri_i32(t1, t1, 8);
3946     tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
3947     tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
3948     tcg_gen_or_i32(t1, t1, tmp);
3949     tcg_gen_mov_i32(t0, rd);
3950
3951     tcg_temp_free_i32(tmp);
3952     tcg_temp_free_i32(rd);
3953 }
3954
3955 static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1)
3956 {
3957     TCGv_i32 rd, tmp;
3958
3959     rd = tcg_temp_new_i32();
3960     tmp = tcg_temp_new_i32();
3961
3962     tcg_gen_shli_i32(rd, t0, 16);
3963     tcg_gen_andi_i32(tmp, t1, 0xffff);
3964     tcg_gen_or_i32(rd, rd, tmp);
3965     tcg_gen_shri_i32(t1, t1, 16);
3966     tcg_gen_andi_i32(tmp, t0, 0xffff0000);
3967     tcg_gen_or_i32(t1, t1, tmp);
3968     tcg_gen_mov_i32(t0, rd);
3969
3970     tcg_temp_free_i32(tmp);
3971     tcg_temp_free_i32(rd);
3972 }
3973
3974 static bool trans_VTRN(DisasContext *s, arg_2misc *a)
3975 {
3976     TCGv_i32 tmp, tmp2;
3977     int pass;
3978
3979     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3980         return false;
3981     }
3982
3983     /* UNDEF accesses to D16-D31 if they don't exist. */
3984     if (!dc_isar_feature(aa32_simd_r32, s) &&
3985         ((a->vd | a->vm) & 0x10)) {
3986         return false;
3987     }
3988
3989     if ((a->vd | a->vm) & a->q) {
3990         return false;
3991     }
3992
3993     if (a->size == 3) {
3994         return false;
3995     }
3996
3997     if (!vfp_access_check(s)) {
3998         return true;
3999     }
4000
4001     tmp = tcg_temp_new_i32();
4002     tmp2 = tcg_temp_new_i32();
4003     if (a->size == MO_32) {
4004         for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) {
4005             read_neon_element32(tmp, a->vm, pass, MO_32);
4006             read_neon_element32(tmp2, a->vd, pass + 1, MO_32);
4007             write_neon_element32(tmp2, a->vm, pass, MO_32);
4008             write_neon_element32(tmp, a->vd, pass + 1, MO_32);
4009         }
4010     } else {
4011         for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
4012             read_neon_element32(tmp, a->vm, pass, MO_32);
4013             read_neon_element32(tmp2, a->vd, pass, MO_32);
4014             if (a->size == MO_8) {
4015                 gen_neon_trn_u8(tmp, tmp2);
4016             } else {
4017                 gen_neon_trn_u16(tmp, tmp2);
4018             }
4019             write_neon_element32(tmp2, a->vm, pass, MO_32);
4020             write_neon_element32(tmp, a->vd, pass, MO_32);
4021         }
4022     }
4023     tcg_temp_free_i32(tmp);
4024     tcg_temp_free_i32(tmp2);
4025     return true;
4026 }