target/arm/translate-neon.c

   1 /*
   2  *  ARM translation: AArch32 Neon instructions
   3  *
   4  *  Copyright (c) 2003 Fabrice Bellard
   5  *  Copyright (c) 2005-2007 CodeSourcery
   6  *  Copyright (c) 2007 OpenedHand, Ltd.
   7  *  Copyright (c) 2020 Linaro, Ltd.
   8  *
   9  * This library is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * This library is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  21  */
  22
  23 #include "qemu/osdep.h"
  24 #include "tcg/tcg-op.h"
  25 #include "tcg/tcg-op-gvec.h"
  26 #include "exec/exec-all.h"
  27 #include "exec/gen-icount.h"
  28 #include "translate.h"
  29 #include "translate-a32.h"
  30
  31 static inline int plus1(DisasContext *s, int x)
  32 {
  33     return x + 1;
  34 }
  35
  36 static inline int rsub_64(DisasContext *s, int x)
  37 {
  38     return 64 - x;
  39 }
  40
  41 static inline int rsub_32(DisasContext *s, int x)
  42 {
  43     return 32 - x;
  44 }
  45 static inline int rsub_16(DisasContext *s, int x)
  46 {
  47     return 16 - x;
  48 }
  49 static inline int rsub_8(DisasContext *s, int x)
  50 {
  51     return 8 - x;
  52 }
  53
  54 static inline int neon_3same_fp_size(DisasContext *s, int x)
  55 {
  56     /* Convert 0==fp32, 1==fp16 into a MO_* value */
  57     return MO_32 - x;
  58 }
  59
  60 /* Include the generated Neon decoder */
  61 #include "decode-neon-dp.c.inc"
  62 #include "decode-neon-ls.c.inc"
  63 #include "decode-neon-shared.c.inc"
  64
  65 static TCGv_ptr vfp_reg_ptr(bool dp, int reg)
  66 {
  67     TCGv_ptr ret = tcg_temp_new_ptr();
  68     tcg_gen_addi_ptr(ret, cpu_env, vfp_reg_offset(dp, reg));
  69     return ret;
  70 }
  71
  72 static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
  73 {
  74     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
  75
  76     switch (mop) {
  77     case MO_UB:
  78         tcg_gen_ld8u_i32(var, cpu_env, offset);
  79         break;
  80     case MO_UW:
  81         tcg_gen_ld16u_i32(var, cpu_env, offset);
  82         break;
  83     case MO_UL:
  84         tcg_gen_ld_i32(var, cpu_env, offset);
  85         break;
  86     default:
  87         g_assert_not_reached();
  88     }
  89 }
  90
  91 static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
  92 {
  93     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
  94
  95     switch (mop) {
  96     case MO_UB:
  97         tcg_gen_ld8u_i64(var, cpu_env, offset);
  98         break;
  99     case MO_UW:
 100         tcg_gen_ld16u_i64(var, cpu_env, offset);
 101         break;
 102     case MO_UL:
 103         tcg_gen_ld32u_i64(var, cpu_env, offset);
 104         break;
 105     case MO_Q:
 106         tcg_gen_ld_i64(var, cpu_env, offset);
 107         break;
 108     default:
 109         g_assert_not_reached();
 110     }
 111 }
 112
 113 static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
 114 {
 115     long offset = neon_element_offset(reg, ele, size);
 116
 117     switch (size) {
 118     case MO_8:
 119         tcg_gen_st8_i32(var, cpu_env, offset);
 120         break;
 121     case MO_16:
 122         tcg_gen_st16_i32(var, cpu_env, offset);
 123         break;
 124     case MO_32:
 125         tcg_gen_st_i32(var, cpu_env, offset);
 126         break;
 127     default:
 128         g_assert_not_reached();
 129     }
 130 }
 131
 132 static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
 133 {
 134     long offset = neon_element_offset(reg, ele, size);
 135
 136     switch (size) {
 137     case MO_8:
 138         tcg_gen_st8_i64(var, cpu_env, offset);
 139         break;
 140     case MO_16:
 141         tcg_gen_st16_i64(var, cpu_env, offset);
 142         break;
 143     case MO_32:
 144         tcg_gen_st32_i64(var, cpu_env, offset);
 145         break;
 146     case MO_64:
 147         tcg_gen_st_i64(var, cpu_env, offset);
 148         break;
 149     default:
 150         g_assert_not_reached();
 151     }
 152 }
 153
 154 static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
 155 {
 156     int opr_sz;
 157     TCGv_ptr fpst;
 158     gen_helper_gvec_3_ptr *fn_gvec_ptr;
 159
 160     if (!dc_isar_feature(aa32_vcma, s)
 161         || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
 162         return false;
 163     }
 164
 165     /* UNDEF accesses to D16-D31 if they don't exist. */
 166     if (!dc_isar_feature(aa32_simd_r32, s) &&
 167         ((a->vd | a->vn | a->vm) & 0x10)) {
 168         return false;
 169     }
 170
 171     if ((a->vn | a->vm | a->vd) & a->q) {
 172         return false;
 173     }
 174
 175     if (!vfp_access_check(s)) {
 176         return true;
 177     }
 178
 179     opr_sz = (1 + a->q) * 8;
 180     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
 181     fn_gvec_ptr = (a->size == MO_16) ?
 182         gen_helper_gvec_fcmlah : gen_helper_gvec_fcmlas;
 183     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 184                        vfp_reg_offset(1, a->vn),
 185                        vfp_reg_offset(1, a->vm),
 186                        fpst, opr_sz, opr_sz, a->rot,
 187                        fn_gvec_ptr);
 188     tcg_temp_free_ptr(fpst);
 189     return true;
 190 }
 191
 192 static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
 193 {
 194     int opr_sz;
 195     TCGv_ptr fpst;
 196     gen_helper_gvec_3_ptr *fn_gvec_ptr;
 197
 198     if (!dc_isar_feature(aa32_vcma, s)
 199         || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
 200         return false;
 201     }
 202
 203     /* UNDEF accesses to D16-D31 if they don't exist. */
 204     if (!dc_isar_feature(aa32_simd_r32, s) &&
 205         ((a->vd | a->vn | a->vm) & 0x10)) {
 206         return false;
 207     }
 208
 209     if ((a->vn | a->vm | a->vd) & a->q) {
 210         return false;
 211     }
 212
 213     if (!vfp_access_check(s)) {
 214         return true;
 215     }
 216
 217     opr_sz = (1 + a->q) * 8;
 218     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
 219     fn_gvec_ptr = (a->size == MO_16) ?
 220         gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds;
 221     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 222                        vfp_reg_offset(1, a->vn),
 223                        vfp_reg_offset(1, a->vm),
 224                        fpst, opr_sz, opr_sz, a->rot,
 225                        fn_gvec_ptr);
 226     tcg_temp_free_ptr(fpst);
 227     return true;
 228 }
 229
 230 static bool trans_VDOT(DisasContext *s, arg_VDOT *a)
 231 {
 232     int opr_sz;
 233     gen_helper_gvec_3 *fn_gvec;
 234
 235     if (!dc_isar_feature(aa32_dp, s)) {
 236         return false;
 237     }
 238
 239     /* UNDEF accesses to D16-D31 if they don't exist. */
 240     if (!dc_isar_feature(aa32_simd_r32, s) &&
 241         ((a->vd | a->vn | a->vm) & 0x10)) {
 242         return false;
 243     }
 244
 245     if ((a->vn | a->vm | a->vd) & a->q) {
 246         return false;
 247     }
 248
 249     if (!vfp_access_check(s)) {
 250         return true;
 251     }
 252
 253     opr_sz = (1 + a->q) * 8;
 254     fn_gvec = a->u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b;
 255     tcg_gen_gvec_3_ool(vfp_reg_offset(1, a->vd),
 256                        vfp_reg_offset(1, a->vn),
 257                        vfp_reg_offset(1, a->vm),
 258                        opr_sz, opr_sz, 0, fn_gvec);
 259     return true;
 260 }
 261
 262 static bool trans_VFML(DisasContext *s, arg_VFML *a)
 263 {
 264     int opr_sz;
 265
 266     if (!dc_isar_feature(aa32_fhm, s)) {
 267         return false;
 268     }
 269
 270     /* UNDEF accesses to D16-D31 if they don't exist. */
 271     if (!dc_isar_feature(aa32_simd_r32, s) &&
 272         (a->vd & 0x10)) {
 273         return false;
 274     }
 275
 276     if (a->vd & a->q) {
 277         return false;
 278     }
 279
 280     if (!vfp_access_check(s)) {
 281         return true;
 282     }
 283
 284     opr_sz = (1 + a->q) * 8;
 285     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 286                        vfp_reg_offset(a->q, a->vn),
 287                        vfp_reg_offset(a->q, a->vm),
 288                        cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
 289                        gen_helper_gvec_fmlal_a32);
 290     return true;
 291 }
 292
 293 static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
 294 {
 295     gen_helper_gvec_3_ptr *fn_gvec_ptr;
 296     int opr_sz;
 297     TCGv_ptr fpst;
 298
 299     if (!dc_isar_feature(aa32_vcma, s)) {
 300         return false;
 301     }
 302     if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) {
 303         return false;
 304     }
 305
 306     /* UNDEF accesses to D16-D31 if they don't exist. */
 307     if (!dc_isar_feature(aa32_simd_r32, s) &&
 308         ((a->vd | a->vn | a->vm) & 0x10)) {
 309         return false;
 310     }
 311
 312     if ((a->vd | a->vn) & a->q) {
 313         return false;
 314     }
 315
 316     if (!vfp_access_check(s)) {
 317         return true;
 318     }
 319
 320     fn_gvec_ptr = (a->size == MO_16) ?
 321         gen_helper_gvec_fcmlah_idx : gen_helper_gvec_fcmlas_idx;
 322     opr_sz = (1 + a->q) * 8;
 323     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
 324     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 325                        vfp_reg_offset(1, a->vn),
 326                        vfp_reg_offset(1, a->vm),
 327                        fpst, opr_sz, opr_sz,
 328                        (a->index << 2) | a->rot, fn_gvec_ptr);
 329     tcg_temp_free_ptr(fpst);
 330     return true;
 331 }
 332
 333 static bool trans_VDOT_scalar(DisasContext *s, arg_VDOT_scalar *a)
 334 {
 335     gen_helper_gvec_3 *fn_gvec;
 336     int opr_sz;
 337     TCGv_ptr fpst;
 338
 339     if (!dc_isar_feature(aa32_dp, s)) {
 340         return false;
 341     }
 342
 343     /* UNDEF accesses to D16-D31 if they don't exist. */
 344     if (!dc_isar_feature(aa32_simd_r32, s) &&
 345         ((a->vd | a->vn) & 0x10)) {
 346         return false;
 347     }
 348
 349     if ((a->vd | a->vn) & a->q) {
 350         return false;
 351     }
 352
 353     if (!vfp_access_check(s)) {
 354         return true;
 355     }
 356
 357     fn_gvec = a->u ? gen_helper_gvec_udot_idx_b : gen_helper_gvec_sdot_idx_b;
 358     opr_sz = (1 + a->q) * 8;
 359     fpst = fpstatus_ptr(FPST_STD);
 360     tcg_gen_gvec_3_ool(vfp_reg_offset(1, a->vd),
 361                        vfp_reg_offset(1, a->vn),
 362                        vfp_reg_offset(1, a->rm),
 363                        opr_sz, opr_sz, a->index, fn_gvec);
 364     tcg_temp_free_ptr(fpst);
 365     return true;
 366 }
 367
 368 static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
 369 {
 370     int opr_sz;
 371
 372     if (!dc_isar_feature(aa32_fhm, s)) {
 373         return false;
 374     }
 375
 376     /* UNDEF accesses to D16-D31 if they don't exist. */
 377     if (!dc_isar_feature(aa32_simd_r32, s) &&
 378         ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
 379         return false;
 380     }
 381
 382     if (a->vd & a->q) {
 383         return false;
 384     }
 385
 386     if (!vfp_access_check(s)) {
 387         return true;
 388     }
 389
 390     opr_sz = (1 + a->q) * 8;
 391     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 392                        vfp_reg_offset(a->q, a->vn),
 393                        vfp_reg_offset(a->q, a->rm),
 394                        cpu_env, opr_sz, opr_sz,
 395                        (a->index << 2) | a->s, /* is_2 == 0 */
 396                        gen_helper_gvec_fmlal_idx_a32);
 397     return true;
 398 }
 399
 400 static struct {
 401     int nregs;
 402     int interleave;
 403     int spacing;
 404 } const neon_ls_element_type[11] = {
 405     {1, 4, 1},
 406     {1, 4, 2},
 407     {4, 1, 1},
 408     {2, 2, 2},
 409     {1, 3, 1},
 410     {1, 3, 2},
 411     {3, 1, 1},
 412     {1, 1, 1},
 413     {1, 2, 1},
 414     {1, 2, 2},
 415     {2, 1, 1}
 416 };
 417
 418 static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
 419                                       int stride)
 420 {
 421     if (rm != 15) {
 422         TCGv_i32 base;
 423
 424         base = load_reg(s, rn);
 425         if (rm == 13) {
 426             tcg_gen_addi_i32(base, base, stride);
 427         } else {
 428             TCGv_i32 index;
 429             index = load_reg(s, rm);
 430             tcg_gen_add_i32(base, base, index);
 431             tcg_temp_free_i32(index);
 432         }
 433         store_reg(s, rn, base);
 434     }
 435 }
 436
 437 static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
 438 {
 439     /* Neon load/store multiple structures */
 440     int nregs, interleave, spacing, reg, n;
 441     MemOp mop, align, endian;
 442     int mmu_idx = get_mem_index(s);
 443     int size = a->size;
 444     TCGv_i64 tmp64;
 445     TCGv_i32 addr, tmp;
 446
 447     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 448         return false;
 449     }
 450
 451     /* UNDEF accesses to D16-D31 if they don't exist */
 452     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 453         return false;
 454     }
 455     if (a->itype > 10) {
 456         return false;
 457     }
 458     /* Catch UNDEF cases for bad values of align field */
 459     switch (a->itype & 0xc) {
 460     case 4:
 461         if (a->align >= 2) {
 462             return false;
 463         }
 464         break;
 465     case 8:
 466         if (a->align == 3) {
 467             return false;
 468         }
 469         break;
 470     default:
 471         break;
 472     }
 473     nregs = neon_ls_element_type[a->itype].nregs;
 474     interleave = neon_ls_element_type[a->itype].interleave;
 475     spacing = neon_ls_element_type[a->itype].spacing;
 476     if (size == 3 && (interleave | spacing) != 1) {
 477         return false;
 478     }
 479
 480     if (!vfp_access_check(s)) {
 481         return true;
 482     }
 483
 484     /* For our purposes, bytes are always little-endian.  */
 485     endian = s->be_data;
 486     if (size == 0) {
 487         endian = MO_LE;
 488     }
 489
 490     /* Enforce alignment requested by the instruction */
 491     if (a->align) {
 492         align = pow2_align(a->align + 2); /* 4 ** a->align */
 493     } else {
 494         align = s->align_mem ? MO_ALIGN : 0;
 495     }
 496
 497     /*
 498      * Consecutive little-endian elements from a single register
 499      * can be promoted to a larger little-endian operation.
 500      */
 501     if (interleave == 1 && endian == MO_LE) {
 502         /* Retain any natural alignment. */
 503         if (align == MO_ALIGN) {
 504             align = pow2_align(size);
 505         }
 506         size = 3;
 507     }
 508
 509     tmp64 = tcg_temp_new_i64();
 510     addr = tcg_temp_new_i32();
 511     tmp = tcg_const_i32(1 << size);
 512     load_reg_var(s, addr, a->rn);
 513
 514     mop = endian | size | align;
 515     for (reg = 0; reg < nregs; reg++) {
 516         for (n = 0; n < 8 >> size; n++) {
 517             int xs;
 518             for (xs = 0; xs < interleave; xs++) {
 519                 int tt = a->vd + reg + spacing * xs;
 520
 521                 if (a->l) {
 522                     gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop);
 523                     neon_store_element64(tt, n, size, tmp64);
 524                 } else {
 525                     neon_load_element64(tmp64, tt, n, size);
 526                     gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop);
 527                 }
 528                 tcg_gen_add_i32(addr, addr, tmp);
 529
 530                 /* Subsequent memory operations inherit alignment */
 531                 mop &= ~MO_AMASK;
 532             }
 533         }
 534     }
 535     tcg_temp_free_i32(addr);
 536     tcg_temp_free_i32(tmp);
 537     tcg_temp_free_i64(tmp64);
 538
 539     gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
 540     return true;
 541 }
 542
 543 static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
 544 {
 545     /* Neon load single structure to all lanes */
 546     int reg, stride, vec_size;
 547     int vd = a->vd;
 548     int size = a->size;
 549     int nregs = a->n + 1;
 550     TCGv_i32 addr, tmp;
 551     MemOp mop, align;
 552
 553     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 554         return false;
 555     }
 556
 557     /* UNDEF accesses to D16-D31 if they don't exist */
 558     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 559         return false;
 560     }
 561
 562     align = 0;
 563     if (size == 3) {
 564         if (nregs != 4 || a->a == 0) {
 565             return false;
 566         }
 567         /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
 568         size = MO_32;
 569         align = MO_ALIGN_16;
 570     } else if (a->a) {
 571         switch (nregs) {
 572         case 1:
 573             if (size == 0) {
 574                 return false;
 575             }
 576             align = MO_ALIGN;
 577             break;
 578         case 2:
 579             align = pow2_align(size + 1);
 580             break;
 581         case 3:
 582             return false;
 583         case 4:
 584             align = pow2_align(size + 2);
 585             break;
 586         default:
 587             g_assert_not_reached();
 588         }
 589     }
 590
 591     if (!vfp_access_check(s)) {
 592         return true;
 593     }
 594
 595     /*
 596      * VLD1 to all lanes: T bit indicates how many Dregs to write.
 597      * VLD2/3/4 to all lanes: T bit indicates register stride.
 598      */
 599     stride = a->t ? 2 : 1;
 600     vec_size = nregs == 1 ? stride * 8 : 8;
 601     mop = size | align;
 602     tmp = tcg_temp_new_i32();
 603     addr = tcg_temp_new_i32();
 604     load_reg_var(s, addr, a->rn);
 605     for (reg = 0; reg < nregs; reg++) {
 606         gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop);
 607         if ((vd & 1) && vec_size == 16) {
 608             /*
 609              * We cannot write 16 bytes at once because the
 610              * destination is unaligned.
 611              */
 612             tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
 613                                  8, 8, tmp);
 614             tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1),
 615                              neon_full_reg_offset(vd), 8, 8);
 616         } else {
 617             tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
 618                                  vec_size, vec_size, tmp);
 619         }
 620         tcg_gen_addi_i32(addr, addr, 1 << size);
 621         vd += stride;
 622
 623         /* Subsequent memory operations inherit alignment */
 624         mop &= ~MO_AMASK;
 625     }
 626     tcg_temp_free_i32(tmp);
 627     tcg_temp_free_i32(addr);
 628
 629     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);
 630
 631     return true;
 632 }
 633
 634 static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
 635 {
 636     /* Neon load/store single structure to one lane */
 637     int reg;
 638     int nregs = a->n + 1;
 639     int vd = a->vd;
 640     TCGv_i32 addr, tmp;
 641     MemOp mop;
 642
 643     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 644         return false;
 645     }
 646
 647     /* UNDEF accesses to D16-D31 if they don't exist */
 648     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 649         return false;
 650     }
 651
 652     /* Catch the UNDEF cases. This is unavoidably a bit messy. */
 653     switch (nregs) {
 654     case 1:
 655         if (((a->align & (1 << a->size)) != 0) ||
 656             (a->size == 2 && (a->align == 1 || a->align == 2))) {
 657             return false;
 658         }
 659         break;
 660     case 3:
 661         if ((a->align & 1) != 0) {
 662             return false;
 663         }
 664         /* fall through */
 665     case 2:
 666         if (a->size == 2 && (a->align & 2) != 0) {
 667             return false;
 668         }
 669         break;
 670     case 4:
 671         if (a->size == 2 && a->align == 3) {
 672             return false;
 673         }
 674         break;
 675     default:
 676         abort();
 677     }
 678     if ((vd + a->stride * (nregs - 1)) > 31) {
 679         /*
 680          * Attempts to write off the end of the register file are
 681          * UNPREDICTABLE; we choose to UNDEF because otherwise we would
 682          * access off the end of the array that holds the register data.
 683          */
 684         return false;
 685     }
 686
 687     if (!vfp_access_check(s)) {
 688         return true;
 689     }
 690
 691     /* Pick up SCTLR settings */
 692     mop = finalize_memop(s, a->size);
 693
 694     if (a->align) {
 695         MemOp align_op;
 696
 697         switch (nregs) {
 698         case 1:
 699             /* For VLD1, use natural alignment. */
 700             align_op = MO_ALIGN;
 701             break;
 702         case 2:
 703             /* For VLD2, use double alignment. */
 704             align_op = pow2_align(a->size + 1);
 705             break;
 706         case 4:
 707             if (a->size == MO_32) {
 708                 /*
 709                  * For VLD4.32, align = 1 is double alignment, align = 2 is
 710                  * quad alignment; align = 3 is rejected above.
 711                  */
 712                 align_op = pow2_align(a->size + a->align);
 713             } else {
 714                 /* For VLD4.8 and VLD.16, we want quad alignment. */
 715                 align_op = pow2_align(a->size + 2);
 716             }
 717             break;
 718         default:
 719             /* For VLD3, the alignment field is zero and rejected above. */
 720             g_assert_not_reached();
 721         }
 722
 723         mop = (mop & ~MO_AMASK) | align_op;
 724     }
 725
 726     tmp = tcg_temp_new_i32();
 727     addr = tcg_temp_new_i32();
 728     load_reg_var(s, addr, a->rn);
 729
 730     for (reg = 0; reg < nregs; reg++) {
 731         if (a->l) {
 732             gen_aa32_ld_internal_i32(s, tmp, addr, get_mem_index(s), mop);
 733             neon_store_element(vd, a->reg_idx, a->size, tmp);
 734         } else { /* Store */
 735             neon_load_element(tmp, vd, a->reg_idx, a->size);
 736             gen_aa32_st_internal_i32(s, tmp, addr, get_mem_index(s), mop);
 737         }
 738         vd += a->stride;
 739         tcg_gen_addi_i32(addr, addr, 1 << a->size);
 740
 741         /* Subsequent memory operations inherit alignment */
 742         mop &= ~MO_AMASK;
 743     }
 744     tcg_temp_free_i32(addr);
 745     tcg_temp_free_i32(tmp);
 746
 747     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);
 748
 749     return true;
 750 }
 751
 752 static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
 753 {
 754     int vec_size = a->q ? 16 : 8;
 755     int rd_ofs = neon_full_reg_offset(a->vd);
 756     int rn_ofs = neon_full_reg_offset(a->vn);
 757     int rm_ofs = neon_full_reg_offset(a->vm);
 758
 759     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 760         return false;
 761     }
 762
 763     /* UNDEF accesses to D16-D31 if they don't exist. */
 764     if (!dc_isar_feature(aa32_simd_r32, s) &&
 765         ((a->vd | a->vn | a->vm) & 0x10)) {
 766         return false;
 767     }
 768
 769     if ((a->vn | a->vm | a->vd) & a->q) {
 770         return false;
 771     }
 772
 773     if (!vfp_access_check(s)) {
 774         return true;
 775     }
 776
 777     fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
 778     return true;
 779 }
 780
 781 #define DO_3SAME(INSN, FUNC)                                            \
 782     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 783     {                                                                   \
 784         return do_3same(s, a, FUNC);                                    \
 785     }
 786
 787 DO_3SAME(VADD, tcg_gen_gvec_add)
 788 DO_3SAME(VSUB, tcg_gen_gvec_sub)
 789 DO_3SAME(VAND, tcg_gen_gvec_and)
 790 DO_3SAME(VBIC, tcg_gen_gvec_andc)
 791 DO_3SAME(VORR, tcg_gen_gvec_or)
 792 DO_3SAME(VORN, tcg_gen_gvec_orc)
 793 DO_3SAME(VEOR, tcg_gen_gvec_xor)
 794 DO_3SAME(VSHL_S, gen_gvec_sshl)
 795 DO_3SAME(VSHL_U, gen_gvec_ushl)
 796 DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
 797 DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
 798 DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
 799 DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)
 800
 801 /* These insns are all gvec_bitsel but with the inputs in various orders. */
 802 #define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
 803     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 804                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 805                                 uint32_t oprsz, uint32_t maxsz)         \
 806     {                                                                   \
 807         tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
 808     }                                                                   \
 809     DO_3SAME(INSN, gen_##INSN##_3s)
 810
 811 DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
 812 DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
 813 DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
 814
 815 #define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
 816     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 817     {                                                                   \
 818         if (a->size == 3) {                                             \
 819             return false;                                               \
 820         }                                                               \
 821         return do_3same(s, a, FUNC);                                    \
 822     }
 823
 824 DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
 825 DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
 826 DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
 827 DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
 828 DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
 829 DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
 830 DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
 831 DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
 832 DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
 833 DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
 834 DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
 835 DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)
 836
 837 #define DO_3SAME_CMP(INSN, COND)                                        \
 838     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 839                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 840                                 uint32_t oprsz, uint32_t maxsz)         \
 841     {                                                                   \
 842         tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
 843     }                                                                   \
 844     DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)
 845
 846 DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
 847 DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
 848 DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
 849 DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
 850 DO_3SAME_CMP(VCEQ, TCG_COND_EQ)
 851
 852 #define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
 853     static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
 854                          uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
 855     {                                                                      \
 856         tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
 857     }
 858
 859 WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)
 860
 861 static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
 862 {
 863     if (a->size != 0) {
 864         return false;
 865     }
 866     return do_3same(s, a, gen_VMUL_p_3s);
 867 }
 868
 869 #define DO_VQRDMLAH(INSN, FUNC)                                         \
 870     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 871     {                                                                   \
 872         if (!dc_isar_feature(aa32_rdm, s)) {                            \
 873             return false;                                               \
 874         }                                                               \
 875         if (a->size != 1 && a->size != 2) {                             \
 876             return false;                                               \
 877         }                                                               \
 878         return do_3same(s, a, FUNC);                                    \
 879     }
 880
 881 DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
 882 DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)
 883
 884 #define DO_SHA1(NAME, FUNC)                                             \
 885     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
 886     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
 887     {                                                                   \
 888         if (!dc_isar_feature(aa32_sha1, s)) {                           \
 889             return false;                                               \
 890         }                                                               \
 891         return do_3same(s, a, gen_##NAME##_3s);                         \
 892     }
 893
 894 DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
 895 DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
 896 DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
 897 DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)
 898
 899 #define DO_SHA2(NAME, FUNC)                                             \
 900     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
 901     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
 902     {                                                                   \
 903         if (!dc_isar_feature(aa32_sha2, s)) {                           \
 904             return false;                                               \
 905         }                                                               \
 906         return do_3same(s, a, gen_##NAME##_3s);                         \
 907     }
 908
 909 DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
 910 DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
 911 DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)
 912
 913 #define DO_3SAME_64(INSN, FUNC)                                         \
 914     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 915                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 916                                 uint32_t oprsz, uint32_t maxsz)         \
 917     {                                                                   \
 918         static const GVecGen3 op = { .fni8 = FUNC };                    \
 919         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op);      \
 920     }                                                                   \
 921     DO_3SAME(INSN, gen_##INSN##_3s)
 922
 923 #define DO_3SAME_64_ENV(INSN, FUNC)                                     \
 924     static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)    \
 925     {                                                                   \
 926         FUNC(d, cpu_env, n, m);                                         \
 927     }                                                                   \
 928     DO_3SAME_64(INSN, gen_##INSN##_elt)
 929
/*
 * 64-bit element forms.  The VRSHL helpers are used directly as the
 * per-element function; the VQSHL/VQRSHL helpers go through
 * DO_3SAME_64_ENV, which wraps them so that cpu_env is passed.
 */
DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)
 936
 937 #define DO_3SAME_32(INSN, FUNC)                                         \
 938     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 939                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 940                                 uint32_t oprsz, uint32_t maxsz)         \
 941     {                                                                   \
 942         static const GVecGen3 ops[4] = {                                \
 943             { .fni4 = gen_helper_neon_##FUNC##8 },                      \
 944             { .fni4 = gen_helper_neon_##FUNC##16 },                     \
 945             { .fni4 = gen_helper_neon_##FUNC##32 },                     \
 946             { 0 },                                                      \
 947         };                                                              \
 948         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
 949     }                                                                   \
 950     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 951     {                                                                   \
 952         if (a->size > 2) {                                              \
 953             return false;                                               \
 954         }                                                               \
 955         return do_3same(s, a, gen_##INSN##_3s);                         \
 956     }
 957
 958 /*
 959  * Some helper functions need to be passed the cpu_env. In order
 960  * to use those with the gvec APIs like tcg_gen_gvec_3() we need
 961  * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
 962  * and which call a NeonGenTwoOpEnvFn().
 963  */
 964 #define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
 965     static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)            \
 966     {                                                                   \
 967         FUNC(d, cpu_env, n, m);                                         \
 968     }
 969
 970 #define DO_3SAME_32_ENV(INSN, FUNC)                                     \
 971     WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8);        \
 972     WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16);      \
 973     WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32);      \
 974     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 975                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 976                                 uint32_t oprsz, uint32_t maxsz)         \
 977     {                                                                   \
 978         static const GVecGen3 ops[4] = {                                \
 979             { .fni4 = gen_##INSN##_tramp8 },                            \
 980             { .fni4 = gen_##INSN##_tramp16 },                           \
 981             { .fni4 = gen_##INSN##_tramp32 },                           \
 982             { 0 },                                                      \
 983         };                                                              \
 984         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
 985     }                                                                   \
 986     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 987     {                                                                   \
 988         if (a->size > 2) {                                              \
 989             return false;                                               \
 990         }                                                               \
 991         return do_3same(s, a, gen_##INSN##_3s);                         \
 992     }
 993
/*
 * Insns whose 8/16/32-bit helpers take only the two operands.  The
 * second argument is the helper name stem; the element width suffix
 * is appended by DO_3SAME_32.
 */
DO_3SAME_32(VHADD_S, hadd_s)
DO_3SAME_32(VHADD_U, hadd_u)
DO_3SAME_32(VHSUB_S, hsub_s)
DO_3SAME_32(VHSUB_U, hsub_u)
DO_3SAME_32(VRHADD_S, rhadd_s)
DO_3SAME_32(VRHADD_U, rhadd_u)
DO_3SAME_32(VRSHL_S, rshl_s)
DO_3SAME_32(VRSHL_U, rshl_u)

/* Insns whose helpers additionally take cpu_env. */
DO_3SAME_32_ENV(VQSHL_S, qshl_s)
DO_3SAME_32_ENV(VQSHL_U, qshl_u)
DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)
1007
1008 static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
1009 {
1010     /* Operations handled pairwise 32 bits at a time */
1011     TCGv_i32 tmp, tmp2, tmp3;
1012
1013     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1014         return false;
1015     }
1016
1017     /* UNDEF accesses to D16-D31 if they don't exist. */
1018     if (!dc_isar_feature(aa32_simd_r32, s) &&
1019         ((a->vd | a->vn | a->vm) & 0x10)) {
1020         return false;
1021     }
1022
1023     if (a->size == 3) {
1024         return false;
1025     }
1026
1027     if (!vfp_access_check(s)) {
1028         return true;
1029     }
1030
1031     assert(a->q == 0); /* enforced by decode patterns */
1032
1033     /*
1034      * Note that we have to be careful not to clobber the source operands
1035      * in the "vm == vd" case by storing the result of the first pass too
1036      * early. Since Q is 0 there are always just two passes, so instead
1037      * of a complicated loop over each pass we just unroll.
1038      */
1039     tmp = tcg_temp_new_i32();
1040     tmp2 = tcg_temp_new_i32();
1041     tmp3 = tcg_temp_new_i32();
1042
1043     read_neon_element32(tmp, a->vn, 0, MO_32);
1044     read_neon_element32(tmp2, a->vn, 1, MO_32);
1045     fn(tmp, tmp, tmp2);
1046
1047     read_neon_element32(tmp3, a->vm, 0, MO_32);
1048     read_neon_element32(tmp2, a->vm, 1, MO_32);
1049     fn(tmp3, tmp3, tmp2);
1050
1051     write_neon_element32(tmp, a->vd, 0, MO_32);
1052     write_neon_element32(tmp3, a->vd, 1, MO_32);
1053
1054     tcg_temp_free_i32(tmp);
1055     tcg_temp_free_i32(tmp2);
1056     tcg_temp_free_i32(tmp3);
1057     return true;
1058 }
1059
/*
 * Define trans_<INSN>_3s for a pairwise integer insn.  The helper is
 * chosen from gen_helper_neon_<func>{8,16,32} by a->size and passed
 * to do_3same_pair(); size > 2 has no table entry and returns false.
 */
#define DO_3SAME_PAIR(INSN, func)                                       \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        static NeonGenTwoOpFn * const pair_ops[] = {                    \
            gen_helper_neon_##func##8,                                  \
            gen_helper_neon_##func##16,                                 \
            gen_helper_neon_##func##32,                                 \
        };                                                              \
        return a->size <= 2 &&                                          \
               do_3same_pair(s, a, pair_ops[a->size]);                  \
    }
1073
/*
 * 32-bit pairwise ops end up the same as the elementwise versions:
 * do_3same_pair() already hands the helper the two 32-bit words of a
 * register as its two operands.  These aliases let DO_3SAME_PAIR's
 * gen_helper_neon_<func>32 table entry resolve to the plain TCG op.
 */
#define gen_helper_neon_pmax_s32  tcg_gen_smax_i32
#define gen_helper_neon_pmax_u32  tcg_gen_umax_i32
#define gen_helper_neon_pmin_s32  tcg_gen_smin_i32
#define gen_helper_neon_pmin_u32  tcg_gen_umin_i32
#define gen_helper_neon_padd_u32  tcg_gen_add_i32

DO_3SAME_PAIR(VPMAX_S, pmax_s)
DO_3SAME_PAIR(VPMIN_S, pmin_s)
DO_3SAME_PAIR(VPMAX_U, pmax_u)
DO_3SAME_PAIR(VPMIN_U, pmin_u)
DO_3SAME_PAIR(VPADD, padd_u)
1086
1087 #define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
1088     WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16);    \
1089     WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32);    \
1090     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
1091                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
1092                                 uint32_t oprsz, uint32_t maxsz)         \
1093     {                                                                   \
1094         static const GVecGen3 ops[2] = {                                \
1095             { .fni4 = gen_##INSN##_tramp16 },                           \
1096             { .fni4 = gen_##INSN##_tramp32 },                           \
1097         };                                                              \
1098         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
1099     }                                                                   \
1100     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
1101     {                                                                   \
1102         if (a->size != 1 && a->size != 2) {                             \
1103             return false;                                               \
1104         }                                                               \
1105         return do_3same(s, a, gen_##INSN##_3s);                         \
1106     }
1107
1108 DO_3SAME_VQDMULH(VQDMULH, qdmulh)
1109 DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)
1110
/*
 * Define WRAPNAME, a gvec expander with the same parameter list as
 * the gen_<INSN>_3s functions, for an out-of-line FP helper FUNC.
 * The expander obtains the float-status pointer for flavour FPST,
 * passes it to tcg_gen_gvec_3_ptr() and frees it afterwards.  vece
 * is not used.
 */
#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC)                              \
    static void WRAPNAME(unsigned vece,                                 \
                         uint32_t rd_ofs, uint32_t rn_ofs,              \
                         uint32_t rm_ofs,                               \
                         uint32_t oprsz, uint32_t maxsz)                \
    {                                                                   \
        TCGv_ptr fp_status = fpstatus_ptr(FPST);                        \
                                                                        \
        tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fp_status,           \
                           oprsz, maxsz, 0, FUNC);                      \
        tcg_temp_free_ptr(fp_status);                                   \
    }
1121
1122 #define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC)                                 \
1123     WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC)                 \
1124     WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC)             \
1125     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
1126     {                                                                   \
1127         if (a->size == MO_16) {                                         \
1128             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
1129                 return false;                                           \
1130             }                                                           \
1131             return do_3same(s, a, gen_##INSN##_fp16_3s);                \
1132         }                                                               \
1133         return do_3same(s, a, gen_##INSN##_fp32_3s);                    \
1134     }
1135
1136
1137 DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
1138 DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
1139 DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
1140 DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
1141 DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
1142 DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
1143 DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
1144 DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
1145 DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
1146 DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
1147 DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
1148 DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
1149 DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
1150 DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
1151 DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
1152 DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
1153 DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)
1154
/*
 * VMAXNM/VMINNM expanders.  These are created with WRAP_FP_GVEC
 * directly, not via DO_3S_FP_GVEC, because their trans functions
 * below are written by hand to add an ARM_FEATURE_V8 check.
 */
WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)
1159
1160 static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
1161 {
1162     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1163         return false;
1164     }
1165
1166     if (a->size == MO_16) {
1167         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1168             return false;
1169         }
1170         return do_3same(s, a, gen_VMAXNM_fp16_3s);
1171     }
1172     return do_3same(s, a, gen_VMAXNM_fp32_3s);
1173 }
1174
1175 static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
1176 {
1177     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1178         return false;
1179     }
1180
1181     if (a->size == MO_16) {
1182         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1183             return false;
1184         }
1185         return do_3same(s, a, gen_VMINNM_fp16_3s);
1186     }
1187     return do_3same(s, a, gen_VMINNM_fp32_3s);
1188 }
1189
1190 static bool do_3same_fp_pair(DisasContext *s, arg_3same *a,
1191                              gen_helper_gvec_3_ptr *fn)
1192 {
1193     /* FP pairwise operations */
1194     TCGv_ptr fpstatus;
1195
1196     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1197         return false;
1198     }
1199
1200     /* UNDEF accesses to D16-D31 if they don't exist. */
1201     if (!dc_isar_feature(aa32_simd_r32, s) &&
1202         ((a->vd | a->vn | a->vm) & 0x10)) {
1203         return false;
1204     }
1205
1206     if (!vfp_access_check(s)) {
1207         return true;
1208     }
1209
1210     assert(a->q == 0); /* enforced by decode patterns */
1211
1212
1213     fpstatus = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1214     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
1215                        vfp_reg_offset(1, a->vn),
1216                        vfp_reg_offset(1, a->vm),
1217                        fpstatus, 8, 8, 0, fn);
1218     tcg_temp_free_ptr(fpstatus);
1219
1220     return true;
1221 }
1222
1223 /*
1224  * For all the functions using this macro, size == 1 means fp16,
1225  * which is an architecture extension we don't implement yet.
1226  */
1227 #define DO_3S_FP_PAIR(INSN,FUNC)                                    \
1228     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
1229     {                                                               \
1230         if (a->size == MO_16) {                                     \
1231             if (!dc_isar_feature(aa32_fp16_arith, s)) {             \
1232                 return false;                                       \
1233             }                                                       \
1234             return do_3same_fp_pair(s, a, FUNC##h);                 \
1235         }                                                           \
1236         return do_3same_fp_pair(s, a, FUNC##s);                     \
1237     }
1238
1239 DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd)
1240 DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax)
1241 DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin)
1242
1243 static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
1244 {
1245     /* Handle a 2-reg-shift insn which can be vectorized. */
1246     int vec_size = a->q ? 16 : 8;
1247     int rd_ofs = neon_full_reg_offset(a->vd);
1248     int rm_ofs = neon_full_reg_offset(a->vm);
1249
1250     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1251         return false;
1252     }
1253
1254     /* UNDEF accesses to D16-D31 if they don't exist. */
1255     if (!dc_isar_feature(aa32_simd_r32, s) &&
1256         ((a->vd | a->vm) & 0x10)) {
1257         return false;
1258     }
1259
1260     if ((a->vm | a->vd) & a->q) {
1261         return false;
1262     }
1263
1264     if (!vfp_access_check(s)) {
1265         return true;
1266     }
1267
1268     fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
1269     return true;
1270 }
1271
1272 #define DO_2SH(INSN, FUNC)                                              \
1273     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1274     {                                                                   \
1275         return do_vector_2sh(s, a, FUNC);                               \
1276     }                                                                   \
1277
1278 DO_2SH(VSHL, tcg_gen_gvec_shli)
1279 DO_2SH(VSLI, gen_gvec_sli)
1280 DO_2SH(VSRI, gen_gvec_sri)
1281 DO_2SH(VSRA_S, gen_gvec_ssra)
1282 DO_2SH(VSRA_U, gen_gvec_usra)
1283 DO_2SH(VRSHR_S, gen_gvec_srshr)
1284 DO_2SH(VRSHR_U, gen_gvec_urshr)
1285 DO_2SH(VRSRA_S, gen_gvec_srsra)
1286 DO_2SH(VRSRA_U, gen_gvec_ursra)
1287
1288 static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
1289 {
1290     /* Signed shift out of range results in all-sign-bits */
1291     a->shift = MIN(a->shift, (8 << a->size) - 1);
1292     return do_vector_2sh(s, a, tcg_gen_gvec_sari);
1293 }
1294
1295 static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
1296                             int64_t shift, uint32_t oprsz, uint32_t maxsz)
1297 {
1298     tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
1299 }
1300
1301 static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
1302 {
1303     /* Shift out of range is architecturally valid and results in zero. */
1304     if (a->shift >= (8 << a->size)) {
1305         return do_vector_2sh(s, a, gen_zero_rd_2sh);
1306     } else {
1307         return do_vector_2sh(s, a, tcg_gen_gvec_shri);
1308     }
1309 }
1310
1311 static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
1312                              NeonGenTwo64OpEnvFn *fn)
1313 {
1314     /*
1315      * 2-reg-and-shift operations, size == 3 case, where the
1316      * function needs to be passed cpu_env.
1317      */
1318     TCGv_i64 constimm;
1319     int pass;
1320
1321     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1322         return false;
1323     }
1324
1325     /* UNDEF accesses to D16-D31 if they don't exist. */
1326     if (!dc_isar_feature(aa32_simd_r32, s) &&
1327         ((a->vd | a->vm) & 0x10)) {
1328         return false;
1329     }
1330
1331     if ((a->vm | a->vd) & a->q) {
1332         return false;
1333     }
1334
1335     if (!vfp_access_check(s)) {
1336         return true;
1337     }
1338
1339     /*
1340      * To avoid excessive duplication of ops we implement shift
1341      * by immediate using the variable shift operations.
1342      */
1343     constimm = tcg_const_i64(dup_const(a->size, a->shift));
1344
1345     for (pass = 0; pass < a->q + 1; pass++) {
1346         TCGv_i64 tmp = tcg_temp_new_i64();
1347
1348         read_neon_element64(tmp, a->vm, pass, MO_64);
1349         fn(tmp, cpu_env, tmp, constimm);
1350         write_neon_element64(tmp, a->vd, pass, MO_64);
1351         tcg_temp_free_i64(tmp);
1352     }
1353     tcg_temp_free_i64(constimm);
1354     return true;
1355 }
1356
1357 static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
1358                              NeonGenTwoOpEnvFn *fn)
1359 {
1360     /*
1361      * 2-reg-and-shift operations, size < 3 case, where the
1362      * helper needs to be passed cpu_env.
1363      */
1364     TCGv_i32 constimm, tmp;
1365     int pass;
1366
1367     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1368         return false;
1369     }
1370
1371     /* UNDEF accesses to D16-D31 if they don't exist. */
1372     if (!dc_isar_feature(aa32_simd_r32, s) &&
1373         ((a->vd | a->vm) & 0x10)) {
1374         return false;
1375     }
1376
1377     if ((a->vm | a->vd) & a->q) {
1378         return false;
1379     }
1380
1381     if (!vfp_access_check(s)) {
1382         return true;
1383     }
1384
1385     /*
1386      * To avoid excessive duplication of ops we implement shift
1387      * by immediate using the variable shift operations.
1388      */
1389     constimm = tcg_const_i32(dup_const(a->size, a->shift));
1390     tmp = tcg_temp_new_i32();
1391
1392     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1393         read_neon_element32(tmp, a->vm, pass, MO_32);
1394         fn(tmp, cpu_env, tmp, constimm);
1395         write_neon_element32(tmp, a->vd, pass, MO_32);
1396     }
1397     tcg_temp_free_i32(tmp);
1398     tcg_temp_free_i32(constimm);
1399     return true;
1400 }
1401
1402 #define DO_2SHIFT_ENV(INSN, FUNC)                                       \
1403     static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
1404     {                                                                   \
1405         return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
1406     }                                                                   \
1407     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1408     {                                                                   \
1409         static NeonGenTwoOpEnvFn * const fns[] = {                      \
1410             gen_helper_neon_##FUNC##8,                                  \
1411             gen_helper_neon_##FUNC##16,                                 \
1412             gen_helper_neon_##FUNC##32,                                 \
1413         };                                                              \
1414         assert(a->size < ARRAY_SIZE(fns));                              \
1415         return do_2shift_env_32(s, a, fns[a->size]);                    \
1416     }
1417
1418 DO_2SHIFT_ENV(VQSHLU, qshlu_s)
1419 DO_2SHIFT_ENV(VQSHL_U, qshl_u)
1420 DO_2SHIFT_ENV(VQSHL_S, qshl_s)
1421
1422 static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
1423                                 NeonGenTwo64OpFn *shiftfn,
1424                                 NeonGenNarrowEnvFn *narrowfn)
1425 {
1426     /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
1427     TCGv_i64 constimm, rm1, rm2;
1428     TCGv_i32 rd;
1429
1430     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1431         return false;
1432     }
1433
1434     /* UNDEF accesses to D16-D31 if they don't exist. */
1435     if (!dc_isar_feature(aa32_simd_r32, s) &&
1436         ((a->vd | a->vm) & 0x10)) {
1437         return false;
1438     }
1439
1440     if (a->vm & 1) {
1441         return false;
1442     }
1443
1444     if (!vfp_access_check(s)) {
1445         return true;
1446     }
1447
1448     /*
1449      * This is always a right shift, and the shiftfn is always a
1450      * left-shift helper, which thus needs the negated shift count.
1451      */
1452     constimm = tcg_const_i64(-a->shift);
1453     rm1 = tcg_temp_new_i64();
1454     rm2 = tcg_temp_new_i64();
1455     rd = tcg_temp_new_i32();
1456
1457     /* Load both inputs first to avoid potential overwrite if rm == rd */
1458     read_neon_element64(rm1, a->vm, 0, MO_64);
1459     read_neon_element64(rm2, a->vm, 1, MO_64);
1460
1461     shiftfn(rm1, rm1, constimm);
1462     narrowfn(rd, cpu_env, rm1);
1463     write_neon_element32(rd, a->vd, 0, MO_32);
1464
1465     shiftfn(rm2, rm2, constimm);
1466     narrowfn(rd, cpu_env, rm2);
1467     write_neon_element32(rd, a->vd, 1, MO_32);
1468
1469     tcg_temp_free_i32(rd);
1470     tcg_temp_free_i64(rm1);
1471     tcg_temp_free_i64(rm2);
1472     tcg_temp_free_i64(constimm);
1473
1474     return true;
1475 }
1476
1477 static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
1478                                 NeonGenTwoOpFn *shiftfn,
1479                                 NeonGenNarrowEnvFn *narrowfn)
1480 {
1481     /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
1482     TCGv_i32 constimm, rm1, rm2, rm3, rm4;
1483     TCGv_i64 rtmp;
1484     uint32_t imm;
1485
1486     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1487         return false;
1488     }
1489
1490     /* UNDEF accesses to D16-D31 if they don't exist. */
1491     if (!dc_isar_feature(aa32_simd_r32, s) &&
1492         ((a->vd | a->vm) & 0x10)) {
1493         return false;
1494     }
1495
1496     if (a->vm & 1) {
1497         return false;
1498     }
1499
1500     if (!vfp_access_check(s)) {
1501         return true;
1502     }
1503
1504     /*
1505      * This is always a right shift, and the shiftfn is always a
1506      * left-shift helper, which thus needs the negated shift count
1507      * duplicated into each lane of the immediate value.
1508      */
1509     if (a->size == 1) {
1510         imm = (uint16_t)(-a->shift);
1511         imm |= imm << 16;
1512     } else {
1513         /* size == 2 */
1514         imm = -a->shift;
1515     }
1516     constimm = tcg_const_i32(imm);
1517
1518     /* Load all inputs first to avoid potential overwrite */
1519     rm1 = tcg_temp_new_i32();
1520     rm2 = tcg_temp_new_i32();
1521     rm3 = tcg_temp_new_i32();
1522     rm4 = tcg_temp_new_i32();
1523     read_neon_element32(rm1, a->vm, 0, MO_32);
1524     read_neon_element32(rm2, a->vm, 1, MO_32);
1525     read_neon_element32(rm3, a->vm, 2, MO_32);
1526     read_neon_element32(rm4, a->vm, 3, MO_32);
1527     rtmp = tcg_temp_new_i64();
1528
1529     shiftfn(rm1, rm1, constimm);
1530     shiftfn(rm2, rm2, constimm);
1531
1532     tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
1533     tcg_temp_free_i32(rm2);
1534
1535     narrowfn(rm1, cpu_env, rtmp);
1536     write_neon_element32(rm1, a->vd, 0, MO_32);
1537     tcg_temp_free_i32(rm1);
1538
1539     shiftfn(rm3, rm3, constimm);
1540     shiftfn(rm4, rm4, constimm);
1541     tcg_temp_free_i32(constimm);
1542
1543     tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
1544     tcg_temp_free_i32(rm4);
1545
1546     narrowfn(rm3, cpu_env, rtmp);
1547     tcg_temp_free_i64(rtmp);
1548     write_neon_element32(rm3, a->vd, 1, MO_32);
1549     tcg_temp_free_i32(rm3);
1550     return true;
1551 }
1552
/*
 * Define trans_<INSN>_2sh for a narrowing shift.  DO_2SN_64 forwards
 * to do_2shift_narrow_64() and DO_2SN_32 to do_2shift_narrow_32();
 * FUNC is passed as the shift function and NARROWFUNC as the
 * narrowing function.
 */
#define DO_2SN_64(INSN, FUNC, NARROWFUNC)                               \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC);             \
    }
#define DO_2SN_32(INSN, FUNC, NARROWFUNC)                               \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);             \
    }
1563
/*
 * Narrowing function in the (dest, env, src) form used by the
 * narrowing-shift code: keeps the low 32 bits of src.  env is unused.
 */
static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    tcg_gen_extrl_i64_i32(dest, src);
}
1568
/*
 * Adapter giving gen_helper_neon_narrow_u16 the (dest, env, src)
 * form used by the narrowing-shift code.  env is unused.
 */
static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    gen_helper_neon_narrow_u16(dest, src);
}
1573
/*
 * Adapter giving gen_helper_neon_narrow_u8 the (dest, env, src)
 * form used by the narrowing-shift code.  env is unused.
 */
static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    gen_helper_neon_narrow_u8(dest, src);
}
1578
/*
 * Narrowing right shifts.  Each line defines one trans_<INSN>_2sh:
 * the second argument is the shift function and the third the
 * narrowing function.  The _64 insns use do_2shift_narrow_64(), the
 * _32 and _16 insns use do_2shift_narrow_32().
 */
DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)

DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)

DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)

DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)

DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)

DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)

DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)

DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1609
1610 static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
1611                          NeonGenWidenFn *widenfn, bool u)
1612 {
1613     TCGv_i64 tmp;
1614     TCGv_i32 rm0, rm1;
1615     uint64_t widen_mask = 0;
1616
1617     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1618         return false;
1619     }
1620
1621     /* UNDEF accesses to D16-D31 if they don't exist. */
1622     if (!dc_isar_feature(aa32_simd_r32, s) &&
1623         ((a->vd | a->vm) & 0x10)) {
1624         return false;
1625     }
1626
1627     if (a->vd & 1) {
1628         return false;
1629     }
1630
1631     if (!vfp_access_check(s)) {
1632         return true;
1633     }
1634
1635     /*
1636      * This is a widen-and-shift operation. The shift is always less
1637      * than the width of the source type, so after widening the input
1638      * vector we can simply shift the whole 64-bit widened register,
1639      * and then clear the potential overflow bits resulting from left
1640      * bits of the narrow input appearing as right bits of the left
1641      * neighbour narrow input. Calculate a mask of bits to clear.
1642      */
1643     if ((a->shift != 0) && (a->size < 2 || u)) {
1644         int esize = 8 << a->size;
1645         widen_mask = MAKE_64BIT_MASK(0, esize);
1646         widen_mask >>= esize - a->shift;
1647         widen_mask = dup_const(a->size + 1, widen_mask);
1648     }
1649
1650     rm0 = tcg_temp_new_i32();
1651     rm1 = tcg_temp_new_i32();
1652     read_neon_element32(rm0, a->vm, 0, MO_32);
1653     read_neon_element32(rm1, a->vm, 1, MO_32);
1654     tmp = tcg_temp_new_i64();
1655
1656     widenfn(tmp, rm0);
1657     tcg_temp_free_i32(rm0);
1658     if (a->shift != 0) {
1659         tcg_gen_shli_i64(tmp, tmp, a->shift);
1660         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1661     }
1662     write_neon_element64(tmp, a->vd, 0, MO_64);
1663
1664     widenfn(tmp, rm1);
1665     tcg_temp_free_i32(rm1);
1666     if (a->shift != 0) {
1667         tcg_gen_shli_i64(tmp, tmp, a->shift);
1668         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1669     }
1670     write_neon_element64(tmp, a->vd, 1, MO_64);
1671     tcg_temp_free_i64(tmp);
1672     return true;
1673 }
1674
1675 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1676 {
1677     static NeonGenWidenFn * const widenfn[] = {
1678         gen_helper_neon_widen_s8,
1679         gen_helper_neon_widen_s16,
1680         tcg_gen_ext_i32_i64,
1681     };
1682     return do_vshll_2sh(s, a, widenfn[a->size], false);
1683 }
1684
1685 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1686 {
1687     static NeonGenWidenFn * const widenfn[] = {
1688         gen_helper_neon_widen_u8,
1689         gen_helper_neon_widen_u16,
1690         tcg_gen_extu_i32_i64,
1691     };
1692     return do_vshll_2sh(s, a, widenfn[a->size], true);
1693 }
1694
1695 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1696                       gen_helper_gvec_2_ptr *fn)
1697 {
1698     /* FP operations in 2-reg-and-shift group */
1699     int vec_size = a->q ? 16 : 8;
1700     int rd_ofs = neon_full_reg_offset(a->vd);
1701     int rm_ofs = neon_full_reg_offset(a->vm);
1702     TCGv_ptr fpst;
1703
1704     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1705         return false;
1706     }
1707
1708     if (a->size == MO_16) {
1709         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1710             return false;
1711         }
1712     }
1713
1714     /* UNDEF accesses to D16-D31 if they don't exist. */
1715     if (!dc_isar_feature(aa32_simd_r32, s) &&
1716         ((a->vd | a->vm) & 0x10)) {
1717         return false;
1718     }
1719
1720     if ((a->vm | a->vd) & a->q) {
1721         return false;
1722     }
1723
1724     if (!vfp_access_check(s)) {
1725         return true;
1726     }
1727
1728     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1729     tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
1730     tcg_temp_free_ptr(fpst);
1731     return true;
1732 }
1733
1734 #define DO_FP_2SH(INSN, FUNC)                                           \
1735     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1736     {                                                                   \
1737         return do_fp_2sh(s, a, FUNC);                                   \
1738     }
1739
1740 DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
1741 DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
1742 DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
1743 DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)
1744
1745 DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
1746 DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
1747 DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
1748 DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
1749
1750 static uint64_t asimd_imm_const(uint32_t imm, int cmode, int op)
1751 {
1752     /*
1753      * Expand the encoded constant.
1754      * Note that cmode = 2,3,4,5,6,7,10,11,12,13 imm=0 is UNPREDICTABLE.
1755      * We choose to not special-case this and will behave as if a
1756      * valid constant encoding of 0 had been given.
1757      * cmode = 15 op = 1 must UNDEF; we assume decode has handled that.
1758      */
1759     switch (cmode) {
1760     case 0: case 1:
1761         /* no-op */
1762         break;
1763     case 2: case 3:
1764         imm <<= 8;
1765         break;
1766     case 4: case 5:
1767         imm <<= 16;
1768         break;
1769     case 6: case 7:
1770         imm <<= 24;
1771         break;
1772     case 8: case 9:
1773         imm |= imm << 16;
1774         break;
1775     case 10: case 11:
1776         imm = (imm << 8) | (imm << 24);
1777         break;
1778     case 12:
1779         imm = (imm << 8) | 0xff;
1780         break;
1781     case 13:
1782         imm = (imm << 16) | 0xffff;
1783         break;
1784     case 14:
1785         if (op) {
1786             /*
1787              * This is the only case where the top and bottom 32 bits
1788              * of the encoded constant differ.
1789              */
1790             uint64_t imm64 = 0;
1791             int n;
1792
1793             for (n = 0; n < 8; n++) {
1794                 if (imm & (1 << n)) {
1795                     imm64 |= (0xffULL << (n * 8));
1796                 }
1797             }
1798             return imm64;
1799         }
1800         imm |= (imm << 8) | (imm << 16) | (imm << 24);
1801         break;
1802     case 15:
1803         imm = ((imm & 0x80) << 24) | ((imm & 0x3f) << 19)
1804             | ((imm & 0x40) ? (0x1f << 25) : (1 << 30));
1805         break;
1806     }
1807     if (op) {
1808         imm = ~imm;
1809     }
1810     return dup_const(MO_32, imm);
1811 }
1812
1813 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1814                         GVecGen2iFn *fn)
1815 {
1816     uint64_t imm;
1817     int reg_ofs, vec_size;
1818
1819     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1820         return false;
1821     }
1822
1823     /* UNDEF accesses to D16-D31 if they don't exist. */
1824     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1825         return false;
1826     }
1827
1828     if (a->vd & a->q) {
1829         return false;
1830     }
1831
1832     if (!vfp_access_check(s)) {
1833         return true;
1834     }
1835
1836     reg_ofs = neon_full_reg_offset(a->vd);
1837     vec_size = a->q ? 16 : 8;
1838     imm = asimd_imm_const(a->imm, a->cmode, a->op);
1839
1840     fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1841     return true;
1842 }
1843
1844 static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
1845                         int64_t c, uint32_t oprsz, uint32_t maxsz)
1846 {
1847     tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
1848 }
1849
1850 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1851 {
1852     /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1853     GVecGen2iFn *fn;
1854
1855     if ((a->cmode & 1) && a->cmode < 12) {
1856         /* for op=1, the imm will be inverted, so BIC becomes AND. */
1857         fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1858     } else {
1859         /* There is one unallocated cmode/op combination in this space */
1860         if (a->cmode == 15 && a->op == 1) {
1861             return false;
1862         }
1863         fn = gen_VMOV_1r;
1864     }
1865     return do_1reg_imm(s, a, fn);
1866 }
1867
1868 static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1869                            NeonGenWidenFn *widenfn,
1870                            NeonGenTwo64OpFn *opfn,
1871                            int src1_mop, int src2_mop)
1872 {
1873     /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VAADW/VSUBW) */
1874     TCGv_i64 rn0_64, rn1_64, rm_64;
1875
1876     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1877         return false;
1878     }
1879
1880     /* UNDEF accesses to D16-D31 if they don't exist. */
1881     if (!dc_isar_feature(aa32_simd_r32, s) &&
1882         ((a->vd | a->vn | a->vm) & 0x10)) {
1883         return false;
1884     }
1885
1886     if (!opfn) {
1887         /* size == 3 case, which is an entirely different insn group */
1888         return false;
1889     }
1890
1891     if ((a->vd & 1) || (src1_mop == MO_Q && (a->vn & 1))) {
1892         return false;
1893     }
1894
1895     if (!vfp_access_check(s)) {
1896         return true;
1897     }
1898
1899     rn0_64 = tcg_temp_new_i64();
1900     rn1_64 = tcg_temp_new_i64();
1901     rm_64 = tcg_temp_new_i64();
1902
1903     if (src1_mop >= 0) {
1904         read_neon_element64(rn0_64, a->vn, 0, src1_mop);
1905     } else {
1906         TCGv_i32 tmp = tcg_temp_new_i32();
1907         read_neon_element32(tmp, a->vn, 0, MO_32);
1908         widenfn(rn0_64, tmp);
1909         tcg_temp_free_i32(tmp);
1910     }
1911     if (src2_mop >= 0) {
1912         read_neon_element64(rm_64, a->vm, 0, src2_mop);
1913     } else {
1914         TCGv_i32 tmp = tcg_temp_new_i32();
1915         read_neon_element32(tmp, a->vm, 0, MO_32);
1916         widenfn(rm_64, tmp);
1917         tcg_temp_free_i32(tmp);
1918     }
1919
1920     opfn(rn0_64, rn0_64, rm_64);
1921
1922     /*
1923      * Load second pass inputs before storing the first pass result, to
1924      * avoid incorrect results if a narrow input overlaps with the result.
1925      */
1926     if (src1_mop >= 0) {
1927         read_neon_element64(rn1_64, a->vn, 1, src1_mop);
1928     } else {
1929         TCGv_i32 tmp = tcg_temp_new_i32();
1930         read_neon_element32(tmp, a->vn, 1, MO_32);
1931         widenfn(rn1_64, tmp);
1932         tcg_temp_free_i32(tmp);
1933     }
1934     if (src2_mop >= 0) {
1935         read_neon_element64(rm_64, a->vm, 1, src2_mop);
1936     } else {
1937         TCGv_i32 tmp = tcg_temp_new_i32();
1938         read_neon_element32(tmp, a->vm, 1, MO_32);
1939         widenfn(rm_64, tmp);
1940         tcg_temp_free_i32(tmp);
1941     }
1942
1943     write_neon_element64(rn0_64, a->vd, 0, MO_64);
1944
1945     opfn(rn1_64, rn1_64, rm_64);
1946     write_neon_element64(rn1_64, a->vd, 1, MO_64);
1947
1948     tcg_temp_free_i64(rn0_64);
1949     tcg_temp_free_i64(rn1_64);
1950     tcg_temp_free_i64(rm_64);
1951
1952     return true;
1953 }
1954
1955 #define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN)                        \
1956     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1957     {                                                                   \
1958         static NeonGenWidenFn * const widenfn[] = {                     \
1959             gen_helper_neon_widen_##S##8,                               \
1960             gen_helper_neon_widen_##S##16,                              \
1961             NULL, NULL,                                                 \
1962         };                                                              \
1963         static NeonGenTwo64OpFn * const addfn[] = {                     \
1964             gen_helper_neon_##OP##l_u16,                                \
1965             gen_helper_neon_##OP##l_u32,                                \
1966             tcg_gen_##OP##_i64,                                         \
1967             NULL,                                                       \
1968         };                                                              \
1969         int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1;          \
1970         return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size],   \
1971                               SRC1WIDE ? MO_Q : narrow_mop,             \
1972                               narrow_mop);                              \
1973     }
1974
1975 DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN)
1976 DO_PREWIDEN(VADDL_U, u, add, false, 0)
1977 DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN)
1978 DO_PREWIDEN(VSUBL_U, u, sub, false, 0)
1979 DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN)
1980 DO_PREWIDEN(VADDW_U, u, add, true, 0)
1981 DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN)
1982 DO_PREWIDEN(VSUBW_U, u, sub, true, 0)
1983
1984 static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
1985                          NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
1986 {
1987     /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
1988     TCGv_i64 rn_64, rm_64;
1989     TCGv_i32 rd0, rd1;
1990
1991     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1992         return false;
1993     }
1994
1995     /* UNDEF accesses to D16-D31 if they don't exist. */
1996     if (!dc_isar_feature(aa32_simd_r32, s) &&
1997         ((a->vd | a->vn | a->vm) & 0x10)) {
1998         return false;
1999     }
2000
2001     if (!opfn || !narrowfn) {
2002         /* size == 3 case, which is an entirely different insn group */
2003         return false;
2004     }
2005
2006     if ((a->vn | a->vm) & 1) {
2007         return false;
2008     }
2009
2010     if (!vfp_access_check(s)) {
2011         return true;
2012     }
2013
2014     rn_64 = tcg_temp_new_i64();
2015     rm_64 = tcg_temp_new_i64();
2016     rd0 = tcg_temp_new_i32();
2017     rd1 = tcg_temp_new_i32();
2018
2019     read_neon_element64(rn_64, a->vn, 0, MO_64);
2020     read_neon_element64(rm_64, a->vm, 0, MO_64);
2021
2022     opfn(rn_64, rn_64, rm_64);
2023
2024     narrowfn(rd0, rn_64);
2025
2026     read_neon_element64(rn_64, a->vn, 1, MO_64);
2027     read_neon_element64(rm_64, a->vm, 1, MO_64);
2028
2029     opfn(rn_64, rn_64, rm_64);
2030
2031     narrowfn(rd1, rn_64);
2032
2033     write_neon_element32(rd0, a->vd, 0, MO_32);
2034     write_neon_element32(rd1, a->vd, 1, MO_32);
2035
2036     tcg_temp_free_i32(rd0);
2037     tcg_temp_free_i32(rd1);
2038     tcg_temp_free_i64(rn_64);
2039     tcg_temp_free_i64(rm_64);
2040
2041     return true;
2042 }
2043
/*
 * Define trans_<INSN>_3d() on top of do_narrow_3d().
 *   OP         - 'add' or 'sub', selects the 64-bit operation
 *   NARROWTYPE - prefix of the narrowing helpers for 8/16-bit results
 *   EXTOP      - narrowing function used for 32-bit results
 * Both tables are indexed by a->size; the NULL entries make
 * do_narrow_3d() reject the insn.
 */
#define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
    static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
    {                                                                   \
        static NeonGenTwo64OpFn * const arith_tbl[] = {                 \
            [0] = gen_helper_neon_##OP##l_u16,                          \
            [1] = gen_helper_neon_##OP##l_u32,                          \
            [2] = tcg_gen_##OP##_i64,                                   \
            [3] = NULL,                                                 \
        };                                                              \
        static NeonGenNarrowFn * const narrow_tbl[] = {                 \
            [0] = gen_helper_neon_##NARROWTYPE##_high_u8,               \
            [1] = gen_helper_neon_##NARROWTYPE##_high_u16,              \
            [2] = EXTOP,                                                \
            [3] = NULL,                                                 \
        };                                                              \
        return do_narrow_3d(s, a,                                       \
                            arith_tbl[a->size], narrow_tbl[a->size]);   \
    }
2061
2062 static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
2063 {
2064     tcg_gen_addi_i64(rn, rn, 1u << 31);
2065     tcg_gen_extrh_i64_i32(rd, rn);
2066 }
2067
2068 DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
2069 DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
2070 DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
2071 DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
2072
2073 static bool do_long_3d(DisasContext *s, arg_3diff *a,
2074                        NeonGenTwoOpWidenFn *opfn,
2075                        NeonGenTwo64OpFn *accfn)
2076 {
2077     /*
2078      * 3-regs different lengths, long operations.
2079      * These perform an operation on two inputs that returns a double-width
2080      * result, and then possibly perform an accumulation operation of
2081      * that result into the double-width destination.
2082      */
2083     TCGv_i64 rd0, rd1, tmp;
2084     TCGv_i32 rn, rm;
2085
2086     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2087         return false;
2088     }
2089
2090     /* UNDEF accesses to D16-D31 if they don't exist. */
2091     if (!dc_isar_feature(aa32_simd_r32, s) &&
2092         ((a->vd | a->vn | a->vm) & 0x10)) {
2093         return false;
2094     }
2095
2096     if (!opfn) {
2097         /* size == 3 case, which is an entirely different insn group */
2098         return false;
2099     }
2100
2101     if (a->vd & 1) {
2102         return false;
2103     }
2104
2105     if (!vfp_access_check(s)) {
2106         return true;
2107     }
2108
2109     rd0 = tcg_temp_new_i64();
2110     rd1 = tcg_temp_new_i64();
2111
2112     rn = tcg_temp_new_i32();
2113     rm = tcg_temp_new_i32();
2114     read_neon_element32(rn, a->vn, 0, MO_32);
2115     read_neon_element32(rm, a->vm, 0, MO_32);
2116     opfn(rd0, rn, rm);
2117
2118     read_neon_element32(rn, a->vn, 1, MO_32);
2119     read_neon_element32(rm, a->vm, 1, MO_32);
2120     opfn(rd1, rn, rm);
2121     tcg_temp_free_i32(rn);
2122     tcg_temp_free_i32(rm);
2123
2124     /* Don't store results until after all loads: they might overlap */
2125     if (accfn) {
2126         tmp = tcg_temp_new_i64();
2127         read_neon_element64(tmp, a->vd, 0, MO_64);
2128         accfn(rd0, tmp, rd0);
2129         read_neon_element64(tmp, a->vd, 1, MO_64);
2130         accfn(rd1, tmp, rd1);
2131         tcg_temp_free_i64(tmp);
2132     }
2133
2134     write_neon_element64(rd0, a->vd, 0, MO_64);
2135     write_neon_element64(rd1, a->vd, 1, MO_64);
2136     tcg_temp_free_i64(rd0);
2137     tcg_temp_free_i64(rd1);
2138
2139     return true;
2140 }
2141
2142 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2143 {
2144     static NeonGenTwoOpWidenFn * const opfn[] = {
2145         gen_helper_neon_abdl_s16,
2146         gen_helper_neon_abdl_s32,
2147         gen_helper_neon_abdl_s64,
2148         NULL,
2149     };
2150
2151     return do_long_3d(s, a, opfn[a->size], NULL);
2152 }
2153
2154 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2155 {
2156     static NeonGenTwoOpWidenFn * const opfn[] = {
2157         gen_helper_neon_abdl_u16,
2158         gen_helper_neon_abdl_u32,
2159         gen_helper_neon_abdl_u64,
2160         NULL,
2161     };
2162
2163     return do_long_3d(s, a, opfn[a->size], NULL);
2164 }
2165
2166 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2167 {
2168     static NeonGenTwoOpWidenFn * const opfn[] = {
2169         gen_helper_neon_abdl_s16,
2170         gen_helper_neon_abdl_s32,
2171         gen_helper_neon_abdl_s64,
2172         NULL,
2173     };
2174     static NeonGenTwo64OpFn * const addfn[] = {
2175         gen_helper_neon_addl_u16,
2176         gen_helper_neon_addl_u32,
2177         tcg_gen_add_i64,
2178         NULL,
2179     };
2180
2181     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2182 }
2183
2184 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2185 {
2186     static NeonGenTwoOpWidenFn * const opfn[] = {
2187         gen_helper_neon_abdl_u16,
2188         gen_helper_neon_abdl_u32,
2189         gen_helper_neon_abdl_u64,
2190         NULL,
2191     };
2192     static NeonGenTwo64OpFn * const addfn[] = {
2193         gen_helper_neon_addl_u16,
2194         gen_helper_neon_addl_u32,
2195         tcg_gen_add_i64,
2196         NULL,
2197     };
2198
2199     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2200 }
2201
2202 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2203 {
2204     TCGv_i32 lo = tcg_temp_new_i32();
2205     TCGv_i32 hi = tcg_temp_new_i32();
2206
2207     tcg_gen_muls2_i32(lo, hi, rn, rm);
2208     tcg_gen_concat_i32_i64(rd, lo, hi);
2209
2210     tcg_temp_free_i32(lo);
2211     tcg_temp_free_i32(hi);
2212 }
2213
2214 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2215 {
2216     TCGv_i32 lo = tcg_temp_new_i32();
2217     TCGv_i32 hi = tcg_temp_new_i32();
2218
2219     tcg_gen_mulu2_i32(lo, hi, rn, rm);
2220     tcg_gen_concat_i32_i64(rd, lo, hi);
2221
2222     tcg_temp_free_i32(lo);
2223     tcg_temp_free_i32(hi);
2224 }
2225
2226 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2227 {
2228     static NeonGenTwoOpWidenFn * const opfn[] = {
2229         gen_helper_neon_mull_s8,
2230         gen_helper_neon_mull_s16,
2231         gen_mull_s32,
2232         NULL,
2233     };
2234
2235     return do_long_3d(s, a, opfn[a->size], NULL);
2236 }
2237
2238 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2239 {
2240     static NeonGenTwoOpWidenFn * const opfn[] = {
2241         gen_helper_neon_mull_u8,
2242         gen_helper_neon_mull_u16,
2243         gen_mull_u32,
2244         NULL,
2245     };
2246
2247     return do_long_3d(s, a, opfn[a->size], NULL);
2248 }
2249
2250 #define DO_VMLAL(INSN,MULL,ACC)                                         \
2251     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2252     {                                                                   \
2253         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2254             gen_helper_neon_##MULL##8,                                  \
2255             gen_helper_neon_##MULL##16,                                 \
2256             gen_##MULL##32,                                             \
2257             NULL,                                                       \
2258         };                                                              \
2259         static NeonGenTwo64OpFn * const accfn[] = {                     \
2260             gen_helper_neon_##ACC##l_u16,                               \
2261             gen_helper_neon_##ACC##l_u32,                               \
2262             tcg_gen_##ACC##_i64,                                        \
2263             NULL,                                                       \
2264         };                                                              \
2265         return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
2266     }
2267
2268 DO_VMLAL(VMLAL_S,mull_s,add)
2269 DO_VMLAL(VMLAL_U,mull_u,add)
2270 DO_VMLAL(VMLSL_S,mull_s,sub)
2271 DO_VMLAL(VMLSL_U,mull_u,sub)
2272
2273 static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2274 {
2275     gen_helper_neon_mull_s16(rd, rn, rm);
2276     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
2277 }
2278
2279 static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2280 {
2281     gen_mull_s32(rd, rn, rm);
2282     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
2283 }
2284
2285 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2286 {
2287     static NeonGenTwoOpWidenFn * const opfn[] = {
2288         NULL,
2289         gen_VQDMULL_16,
2290         gen_VQDMULL_32,
2291         NULL,
2292     };
2293
2294     return do_long_3d(s, a, opfn[a->size], NULL);
2295 }
2296
2297 static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2298 {
2299     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2300 }
2301
2302 static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2303 {
2304     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2305 }
2306
2307 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2308 {
2309     static NeonGenTwoOpWidenFn * const opfn[] = {
2310         NULL,
2311         gen_VQDMULL_16,
2312         gen_VQDMULL_32,
2313         NULL,
2314     };
2315     static NeonGenTwo64OpFn * const accfn[] = {
2316         NULL,
2317         gen_VQDMLAL_acc_16,
2318         gen_VQDMLAL_acc_32,
2319         NULL,
2320     };
2321
2322     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2323 }
2324
2325 static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2326 {
2327     gen_helper_neon_negl_u32(rm, rm);
2328     gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2329 }
2330
2331 static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2332 {
2333     tcg_gen_neg_i64(rm, rm);
2334     gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2335 }
2336
2337 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2338 {
2339     static NeonGenTwoOpWidenFn * const opfn[] = {
2340         NULL,
2341         gen_VQDMULL_16,
2342         gen_VQDMULL_32,
2343         NULL,
2344     };
2345     static NeonGenTwo64OpFn * const accfn[] = {
2346         NULL,
2347         gen_VQDMLSL_acc_16,
2348         gen_VQDMLSL_acc_32,
2349         NULL,
2350     };
2351
2352     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2353 }
2354
2355 static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2356 {
2357     gen_helper_gvec_3 *fn_gvec;
2358
2359     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2360         return false;
2361     }
2362
2363     /* UNDEF accesses to D16-D31 if they don't exist. */
2364     if (!dc_isar_feature(aa32_simd_r32, s) &&
2365         ((a->vd | a->vn | a->vm) & 0x10)) {
2366         return false;
2367     }
2368
2369     if (a->vd & 1) {
2370         return false;
2371     }
2372
2373     switch (a->size) {
2374     case 0:
2375         fn_gvec = gen_helper_neon_pmull_h;
2376         break;
2377     case 2:
2378         if (!dc_isar_feature(aa32_pmull, s)) {
2379             return false;
2380         }
2381         fn_gvec = gen_helper_gvec_pmull_q;
2382         break;
2383     default:
2384         return false;
2385     }
2386
2387     if (!vfp_access_check(s)) {
2388         return true;
2389     }
2390
2391     tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd),
2392                        neon_full_reg_offset(a->vn),
2393                        neon_full_reg_offset(a->vm),
2394                        16, 16, 0, fn_gvec);
2395     return true;
2396 }
2397
2398 static void gen_neon_dup_low16(TCGv_i32 var)
2399 {
2400     TCGv_i32 tmp = tcg_temp_new_i32();
2401     tcg_gen_ext16u_i32(var, var);
2402     tcg_gen_shli_i32(tmp, var, 16);
2403     tcg_gen_or_i32(var, var, tmp);
2404     tcg_temp_free_i32(tmp);
2405 }
2406
2407 static void gen_neon_dup_high16(TCGv_i32 var)
2408 {
2409     TCGv_i32 tmp = tcg_temp_new_i32();
2410     tcg_gen_andi_i32(var, var, 0xffff0000);
2411     tcg_gen_shri_i32(tmp, var, 16);
2412     tcg_gen_or_i32(var, var, tmp);
2413     tcg_temp_free_i32(tmp);
2414 }
2415
2416 static inline TCGv_i32 neon_get_scalar(int size, int reg)
2417 {
2418     TCGv_i32 tmp = tcg_temp_new_i32();
2419     if (size == MO_16) {
2420         read_neon_element32(tmp, reg & 7, reg >> 4, MO_32);
2421         if (reg & 8) {
2422             gen_neon_dup_high16(tmp);
2423         } else {
2424             gen_neon_dup_low16(tmp);
2425         }
2426     } else {
2427         read_neon_element32(tmp, reg & 15, reg >> 4, MO_32);
2428     }
2429     return tmp;
2430 }
2431
2432 static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2433                        NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2434 {
2435     /*
2436      * Two registers and a scalar: perform an operation between
2437      * the input elements and the scalar, and then possibly
2438      * perform an accumulation operation of that result into the
2439      * destination.
2440      */
2441     TCGv_i32 scalar, tmp;
2442     int pass;
2443
2444     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2445         return false;
2446     }
2447
2448     /* UNDEF accesses to D16-D31 if they don't exist. */
2449     if (!dc_isar_feature(aa32_simd_r32, s) &&
2450         ((a->vd | a->vn | a->vm) & 0x10)) {
2451         return false;
2452     }
2453
2454     if (!opfn) {
2455         /* Bad size (including size == 3, which is a different insn group) */
2456         return false;
2457     }
2458
2459     if (a->q && ((a->vd | a->vn) & 1)) {
2460         return false;
2461     }
2462
2463     if (!vfp_access_check(s)) {
2464         return true;
2465     }
2466
2467     scalar = neon_get_scalar(a->size, a->vm);
2468     tmp = tcg_temp_new_i32();
2469
2470     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2471         read_neon_element32(tmp, a->vn, pass, MO_32);
2472         opfn(tmp, tmp, scalar);
2473         if (accfn) {
2474             TCGv_i32 rd = tcg_temp_new_i32();
2475             read_neon_element32(rd, a->vd, pass, MO_32);
2476             accfn(tmp, rd, tmp);
2477             tcg_temp_free_i32(rd);
2478         }
2479         write_neon_element32(tmp, a->vd, pass, MO_32);
2480     }
2481     tcg_temp_free_i32(tmp);
2482     tcg_temp_free_i32(scalar);
2483     return true;
2484 }
2485
2486 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2487 {
2488     static NeonGenTwoOpFn * const opfn[] = {
2489         NULL,
2490         gen_helper_neon_mul_u16,
2491         tcg_gen_mul_i32,
2492         NULL,
2493     };
2494
2495     return do_2scalar(s, a, opfn[a->size], NULL);
2496 }
2497
2498 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2499 {
2500     static NeonGenTwoOpFn * const opfn[] = {
2501         NULL,
2502         gen_helper_neon_mul_u16,
2503         tcg_gen_mul_i32,
2504         NULL,
2505     };
2506     static NeonGenTwoOpFn * const accfn[] = {
2507         NULL,
2508         gen_helper_neon_add_u16,
2509         tcg_gen_add_i32,
2510         NULL,
2511     };
2512
2513     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2514 }
2515
2516 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2517 {
2518     static NeonGenTwoOpFn * const opfn[] = {
2519         NULL,
2520         gen_helper_neon_mul_u16,
2521         tcg_gen_mul_i32,
2522         NULL,
2523     };
2524     static NeonGenTwoOpFn * const accfn[] = {
2525         NULL,
2526         gen_helper_neon_sub_u16,
2527         tcg_gen_sub_i32,
2528         NULL,
2529     };
2530
2531     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2532 }
2533
2534 static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
2535                               gen_helper_gvec_3_ptr *fn)
2536 {
2537     /* Two registers and a scalar, using gvec */
2538     int vec_size = a->q ? 16 : 8;
2539     int rd_ofs = neon_full_reg_offset(a->vd);
2540     int rn_ofs = neon_full_reg_offset(a->vn);
2541     int rm_ofs;
2542     int idx;
2543     TCGv_ptr fpstatus;
2544
2545     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2546         return false;
2547     }
2548
2549     /* UNDEF accesses to D16-D31 if they don't exist. */
2550     if (!dc_isar_feature(aa32_simd_r32, s) &&
2551         ((a->vd | a->vn | a->vm) & 0x10)) {
2552         return false;
2553     }
2554
2555     if (!fn) {
2556         /* Bad size (including size == 3, which is a different insn group) */
2557         return false;
2558     }
2559
2560     if (a->q && ((a->vd | a->vn) & 1)) {
2561         return false;
2562     }
2563
2564     if (!vfp_access_check(s)) {
2565         return true;
2566     }
2567
2568     /* a->vm is M:Vm, which encodes both register and index */
2569     idx = extract32(a->vm, a->size + 2, 2);
2570     a->vm = extract32(a->vm, 0, a->size + 2);
2571     rm_ofs = neon_full_reg_offset(a->vm);
2572
2573     fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
2574     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
2575                        vec_size, vec_size, idx, fn);
2576     tcg_temp_free_ptr(fpstatus);
2577     return true;
2578 }
2579
2580 #define DO_VMUL_F_2sc(NAME, FUNC)                                       \
2581     static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a)   \
2582     {                                                                   \
2583         static gen_helper_gvec_3_ptr * const opfn[] = {                 \
2584             NULL,                                                       \
2585             gen_helper_##FUNC##_h,                                      \
2586             gen_helper_##FUNC##_s,                                      \
2587             NULL,                                                       \
2588         };                                                              \
2589         if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
2590             return false;                                               \
2591         }                                                               \
2592         return do_2scalar_fp_vec(s, a, opfn[a->size]);                  \
2593     }
2594
2595 DO_VMUL_F_2sc(VMUL, gvec_fmul_idx)
2596 DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx)
2597 DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx)
2598
2599 WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
2600 WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
2601 WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
2602 WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2603
2604 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2605 {
2606     static NeonGenTwoOpFn * const opfn[] = {
2607         NULL,
2608         gen_VQDMULH_16,
2609         gen_VQDMULH_32,
2610         NULL,
2611     };
2612
2613     return do_2scalar(s, a, opfn[a->size], NULL);
2614 }
2615
2616 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2617 {
2618     static NeonGenTwoOpFn * const opfn[] = {
2619         NULL,
2620         gen_VQRDMULH_16,
2621         gen_VQRDMULH_32,
2622         NULL,
2623     };
2624
2625     return do_2scalar(s, a, opfn[a->size], NULL);
2626 }
2627
2628 static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2629                             NeonGenThreeOpEnvFn *opfn)
2630 {
2631     /*
2632      * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2633      * performs a kind of fused op-then-accumulate using a helper
2634      * function that takes all of rd, rn and the scalar at once.
2635      */
2636     TCGv_i32 scalar, rn, rd;
2637     int pass;
2638
2639     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2640         return false;
2641     }
2642
2643     if (!dc_isar_feature(aa32_rdm, s)) {
2644         return false;
2645     }
2646
2647     /* UNDEF accesses to D16-D31 if they don't exist. */
2648     if (!dc_isar_feature(aa32_simd_r32, s) &&
2649         ((a->vd | a->vn | a->vm) & 0x10)) {
2650         return false;
2651     }
2652
2653     if (!opfn) {
2654         /* Bad size (including size == 3, which is a different insn group) */
2655         return false;
2656     }
2657
2658     if (a->q && ((a->vd | a->vn) & 1)) {
2659         return false;
2660     }
2661
2662     if (!vfp_access_check(s)) {
2663         return true;
2664     }
2665
2666     scalar = neon_get_scalar(a->size, a->vm);
2667     rn = tcg_temp_new_i32();
2668     rd = tcg_temp_new_i32();
2669
2670     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2671         read_neon_element32(rn, a->vn, pass, MO_32);
2672         read_neon_element32(rd, a->vd, pass, MO_32);
2673         opfn(rd, cpu_env, rn, scalar, rd);
2674         write_neon_element32(rd, a->vd, pass, MO_32);
2675     }
2676     tcg_temp_free_i32(rn);
2677     tcg_temp_free_i32(rd);
2678     tcg_temp_free_i32(scalar);
2679
2680     return true;
2681 }
2682
2683 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2684 {
2685     static NeonGenThreeOpEnvFn *opfn[] = {
2686         NULL,
2687         gen_helper_neon_qrdmlah_s16,
2688         gen_helper_neon_qrdmlah_s32,
2689         NULL,
2690     };
2691     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2692 }
2693
2694 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2695 {
2696     static NeonGenThreeOpEnvFn *opfn[] = {
2697         NULL,
2698         gen_helper_neon_qrdmlsh_s16,
2699         gen_helper_neon_qrdmlsh_s32,
2700         NULL,
2701     };
2702     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2703 }
2704
2705 static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2706                             NeonGenTwoOpWidenFn *opfn,
2707                             NeonGenTwo64OpFn *accfn)
2708 {
2709     /*
2710      * Two registers and a scalar, long operations: perform an
2711      * operation on the input elements and the scalar which produces
2712      * a double-width result, and then possibly perform an accumulation
2713      * operation of that result into the destination.
2714      */
2715     TCGv_i32 scalar, rn;
2716     TCGv_i64 rn0_64, rn1_64;
2717
2718     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2719         return false;
2720     }
2721
2722     /* UNDEF accesses to D16-D31 if they don't exist. */
2723     if (!dc_isar_feature(aa32_simd_r32, s) &&
2724         ((a->vd | a->vn | a->vm) & 0x10)) {
2725         return false;
2726     }
2727
2728     if (!opfn) {
2729         /* Bad size (including size == 3, which is a different insn group) */
2730         return false;
2731     }
2732
2733     if (a->vd & 1) {
2734         return false;
2735     }
2736
2737     if (!vfp_access_check(s)) {
2738         return true;
2739     }
2740
2741     scalar = neon_get_scalar(a->size, a->vm);
2742
2743     /* Load all inputs before writing any outputs, in case of overlap */
2744     rn = tcg_temp_new_i32();
2745     read_neon_element32(rn, a->vn, 0, MO_32);
2746     rn0_64 = tcg_temp_new_i64();
2747     opfn(rn0_64, rn, scalar);
2748
2749     read_neon_element32(rn, a->vn, 1, MO_32);
2750     rn1_64 = tcg_temp_new_i64();
2751     opfn(rn1_64, rn, scalar);
2752     tcg_temp_free_i32(rn);
2753     tcg_temp_free_i32(scalar);
2754
2755     if (accfn) {
2756         TCGv_i64 t64 = tcg_temp_new_i64();
2757         read_neon_element64(t64, a->vd, 0, MO_64);
2758         accfn(rn0_64, t64, rn0_64);
2759         read_neon_element64(t64, a->vd, 1, MO_64);
2760         accfn(rn1_64, t64, rn1_64);
2761         tcg_temp_free_i64(t64);
2762     }
2763
2764     write_neon_element64(rn0_64, a->vd, 0, MO_64);
2765     write_neon_element64(rn1_64, a->vd, 1, MO_64);
2766     tcg_temp_free_i64(rn0_64);
2767     tcg_temp_free_i64(rn1_64);
2768     return true;
2769 }
2770
2771 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2772 {
2773     static NeonGenTwoOpWidenFn * const opfn[] = {
2774         NULL,
2775         gen_helper_neon_mull_s16,
2776         gen_mull_s32,
2777         NULL,
2778     };
2779
2780     return do_2scalar_long(s, a, opfn[a->size], NULL);
2781 }
2782
2783 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2784 {
2785     static NeonGenTwoOpWidenFn * const opfn[] = {
2786         NULL,
2787         gen_helper_neon_mull_u16,
2788         gen_mull_u32,
2789         NULL,
2790     };
2791
2792     return do_2scalar_long(s, a, opfn[a->size], NULL);
2793 }
2794
2795 #define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
2796     static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
2797     {                                                                   \
2798         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2799             NULL,                                                       \
2800             gen_helper_neon_##MULL##16,                                 \
2801             gen_##MULL##32,                                             \
2802             NULL,                                                       \
2803         };                                                              \
2804         static NeonGenTwo64OpFn * const accfn[] = {                     \
2805             NULL,                                                       \
2806             gen_helper_neon_##ACC##l_u32,                               \
2807             tcg_gen_##ACC##_i64,                                        \
2808             NULL,                                                       \
2809         };                                                              \
2810         return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
2811     }
2812
2813 DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2814 DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2815 DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2816 DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2817
2818 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2819 {
2820     static NeonGenTwoOpWidenFn * const opfn[] = {
2821         NULL,
2822         gen_VQDMULL_16,
2823         gen_VQDMULL_32,
2824         NULL,
2825     };
2826
2827     return do_2scalar_long(s, a, opfn[a->size], NULL);
2828 }
2829
2830 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2831 {
2832     static NeonGenTwoOpWidenFn * const opfn[] = {
2833         NULL,
2834         gen_VQDMULL_16,
2835         gen_VQDMULL_32,
2836         NULL,
2837     };
2838     static NeonGenTwo64OpFn * const accfn[] = {
2839         NULL,
2840         gen_VQDMLAL_acc_16,
2841         gen_VQDMLAL_acc_32,
2842         NULL,
2843     };
2844
2845     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2846 }
2847
2848 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2849 {
2850     static NeonGenTwoOpWidenFn * const opfn[] = {
2851         NULL,
2852         gen_VQDMULL_16,
2853         gen_VQDMULL_32,
2854         NULL,
2855     };
2856     static NeonGenTwo64OpFn * const accfn[] = {
2857         NULL,
2858         gen_VQDMLSL_acc_16,
2859         gen_VQDMLSL_acc_32,
2860         NULL,
2861     };
2862
2863     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2864 }
2865
2866 static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2867 {
2868     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2869         return false;
2870     }
2871
2872     /* UNDEF accesses to D16-D31 if they don't exist. */
2873     if (!dc_isar_feature(aa32_simd_r32, s) &&
2874         ((a->vd | a->vn | a->vm) & 0x10)) {
2875         return false;
2876     }
2877
2878     if ((a->vn | a->vm | a->vd) & a->q) {
2879         return false;
2880     }
2881
2882     if (a->imm > 7 && !a->q) {
2883         return false;
2884     }
2885
2886     if (!vfp_access_check(s)) {
2887         return true;
2888     }
2889
2890     if (!a->q) {
2891         /* Extract 64 bits from <Vm:Vn> */
2892         TCGv_i64 left, right, dest;
2893
2894         left = tcg_temp_new_i64();
2895         right = tcg_temp_new_i64();
2896         dest = tcg_temp_new_i64();
2897
2898         read_neon_element64(right, a->vn, 0, MO_64);
2899         read_neon_element64(left, a->vm, 0, MO_64);
2900         tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2901         write_neon_element64(dest, a->vd, 0, MO_64);
2902
2903         tcg_temp_free_i64(left);
2904         tcg_temp_free_i64(right);
2905         tcg_temp_free_i64(dest);
2906     } else {
2907         /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2908         TCGv_i64 left, middle, right, destleft, destright;
2909
2910         left = tcg_temp_new_i64();
2911         middle = tcg_temp_new_i64();
2912         right = tcg_temp_new_i64();
2913         destleft = tcg_temp_new_i64();
2914         destright = tcg_temp_new_i64();
2915
2916         if (a->imm < 8) {
2917             read_neon_element64(right, a->vn, 0, MO_64);
2918             read_neon_element64(middle, a->vn, 1, MO_64);
2919             tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2920             read_neon_element64(left, a->vm, 0, MO_64);
2921             tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2922         } else {
2923             read_neon_element64(right, a->vn, 1, MO_64);
2924             read_neon_element64(middle, a->vm, 0, MO_64);
2925             tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2926             read_neon_element64(left, a->vm, 1, MO_64);
2927             tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2928         }
2929
2930         write_neon_element64(destright, a->vd, 0, MO_64);
2931         write_neon_element64(destleft, a->vd, 1, MO_64);
2932
2933         tcg_temp_free_i64(destright);
2934         tcg_temp_free_i64(destleft);
2935         tcg_temp_free_i64(right);
2936         tcg_temp_free_i64(middle);
2937         tcg_temp_free_i64(left);
2938     }
2939     return true;
2940 }
2941
2942 static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2943 {
2944     TCGv_i64 val, def;
2945     TCGv_i32 desc;
2946
2947     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2948         return false;
2949     }
2950
2951     /* UNDEF accesses to D16-D31 if they don't exist. */
2952     if (!dc_isar_feature(aa32_simd_r32, s) &&
2953         ((a->vd | a->vn | a->vm) & 0x10)) {
2954         return false;
2955     }
2956
2957     if ((a->vn + a->len + 1) > 32) {
2958         /*
2959          * This is UNPREDICTABLE; we choose to UNDEF to avoid the
2960          * helper function running off the end of the register file.
2961          */
2962         return false;
2963     }
2964
2965     if (!vfp_access_check(s)) {
2966         return true;
2967     }
2968
2969     desc = tcg_const_i32((a->vn << 2) | a->len);
2970     def = tcg_temp_new_i64();
2971     if (a->op) {
2972         read_neon_element64(def, a->vd, 0, MO_64);
2973     } else {
2974         tcg_gen_movi_i64(def, 0);
2975     }
2976     val = tcg_temp_new_i64();
2977     read_neon_element64(val, a->vm, 0, MO_64);
2978
2979     gen_helper_neon_tbl(val, cpu_env, desc, val, def);
2980     write_neon_element64(val, a->vd, 0, MO_64);
2981
2982     tcg_temp_free_i64(def);
2983     tcg_temp_free_i64(val);
2984     tcg_temp_free_i32(desc);
2985     return true;
2986 }
2987
2988 static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
2989 {
2990     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2991         return false;
2992     }
2993
2994     /* UNDEF accesses to D16-D31 if they don't exist. */
2995     if (!dc_isar_feature(aa32_simd_r32, s) &&
2996         ((a->vd | a->vm) & 0x10)) {
2997         return false;
2998     }
2999
3000     if (a->vd & a->q) {
3001         return false;
3002     }
3003
3004     if (!vfp_access_check(s)) {
3005         return true;
3006     }
3007
3008     tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd),
3009                          neon_element_offset(a->vm, a->index, a->size),
3010                          a->q ? 16 : 8, a->q ? 16 : 8);
3011     return true;
3012 }
3013
3014 static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
3015 {
3016     int pass, half;
3017     TCGv_i32 tmp[2];
3018
3019     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3020         return false;
3021     }
3022
3023     /* UNDEF accesses to D16-D31 if they don't exist. */
3024     if (!dc_isar_feature(aa32_simd_r32, s) &&
3025         ((a->vd | a->vm) & 0x10)) {
3026         return false;
3027     }
3028
3029     if ((a->vd | a->vm) & a->q) {
3030         return false;
3031     }
3032
3033     if (a->size == 3) {
3034         return false;
3035     }
3036
3037     if (!vfp_access_check(s)) {
3038         return true;
3039     }
3040
3041     tmp[0] = tcg_temp_new_i32();
3042     tmp[1] = tcg_temp_new_i32();
3043
3044     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3045         for (half = 0; half < 2; half++) {
3046             read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32);
3047             switch (a->size) {
3048             case 0:
3049                 tcg_gen_bswap32_i32(tmp[half], tmp[half]);
3050                 break;
3051             case 1:
3052                 gen_swap_half(tmp[half], tmp[half]);
3053                 break;
3054             case 2:
3055                 break;
3056             default:
3057                 g_assert_not_reached();
3058             }
3059         }
3060         write_neon_element32(tmp[1], a->vd, pass * 2, MO_32);
3061         write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32);
3062     }
3063
3064     tcg_temp_free_i32(tmp[0]);
3065     tcg_temp_free_i32(tmp[1]);
3066     return true;
3067 }
3068
3069 static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
3070                               NeonGenWidenFn *widenfn,
3071                               NeonGenTwo64OpFn *opfn,
3072                               NeonGenTwo64OpFn *accfn)
3073 {
3074     /*
3075      * Pairwise long operations: widen both halves of the pair,
3076      * combine the pairs with the opfn, and then possibly accumulate
3077      * into the destination with the accfn.
3078      */
3079     int pass;
3080
3081     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3082         return false;
3083     }
3084
3085     /* UNDEF accesses to D16-D31 if they don't exist. */
3086     if (!dc_isar_feature(aa32_simd_r32, s) &&
3087         ((a->vd | a->vm) & 0x10)) {
3088         return false;
3089     }
3090
3091     if ((a->vd | a->vm) & a->q) {
3092         return false;
3093     }
3094
3095     if (!widenfn) {
3096         return false;
3097     }
3098
3099     if (!vfp_access_check(s)) {
3100         return true;
3101     }
3102
3103     for (pass = 0; pass < a->q + 1; pass++) {
3104         TCGv_i32 tmp;
3105         TCGv_i64 rm0_64, rm1_64, rd_64;
3106
3107         rm0_64 = tcg_temp_new_i64();
3108         rm1_64 = tcg_temp_new_i64();
3109         rd_64 = tcg_temp_new_i64();
3110
3111         tmp = tcg_temp_new_i32();
3112         read_neon_element32(tmp, a->vm, pass * 2, MO_32);
3113         widenfn(rm0_64, tmp);
3114         read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32);
3115         widenfn(rm1_64, tmp);
3116         tcg_temp_free_i32(tmp);
3117
3118         opfn(rd_64, rm0_64, rm1_64);
3119         tcg_temp_free_i64(rm0_64);
3120         tcg_temp_free_i64(rm1_64);
3121
3122         if (accfn) {
3123             TCGv_i64 tmp64 = tcg_temp_new_i64();
3124             read_neon_element64(tmp64, a->vd, pass, MO_64);
3125             accfn(rd_64, tmp64, rd_64);
3126             tcg_temp_free_i64(tmp64);
3127         }
3128         write_neon_element64(rd_64, a->vd, pass, MO_64);
3129         tcg_temp_free_i64(rd_64);
3130     }
3131     return true;
3132 }
3133
3134 static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
3135 {
3136     static NeonGenWidenFn * const widenfn[] = {
3137         gen_helper_neon_widen_s8,
3138         gen_helper_neon_widen_s16,
3139         tcg_gen_ext_i32_i64,
3140         NULL,
3141     };
3142     static NeonGenTwo64OpFn * const opfn[] = {
3143         gen_helper_neon_paddl_u16,
3144         gen_helper_neon_paddl_u32,
3145         tcg_gen_add_i64,
3146         NULL,
3147     };
3148
3149     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3150 }
3151
3152 static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
3153 {
3154     static NeonGenWidenFn * const widenfn[] = {
3155         gen_helper_neon_widen_u8,
3156         gen_helper_neon_widen_u16,
3157         tcg_gen_extu_i32_i64,
3158         NULL,
3159     };
3160     static NeonGenTwo64OpFn * const opfn[] = {
3161         gen_helper_neon_paddl_u16,
3162         gen_helper_neon_paddl_u32,
3163         tcg_gen_add_i64,
3164         NULL,
3165     };
3166
3167     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3168 }
3169
3170 static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
3171 {
3172     static NeonGenWidenFn * const widenfn[] = {
3173         gen_helper_neon_widen_s8,
3174         gen_helper_neon_widen_s16,
3175         tcg_gen_ext_i32_i64,
3176         NULL,
3177     };
3178     static NeonGenTwo64OpFn * const opfn[] = {
3179         gen_helper_neon_paddl_u16,
3180         gen_helper_neon_paddl_u32,
3181         tcg_gen_add_i64,
3182         NULL,
3183     };
3184     static NeonGenTwo64OpFn * const accfn[] = {
3185         gen_helper_neon_addl_u16,
3186         gen_helper_neon_addl_u32,
3187         tcg_gen_add_i64,
3188         NULL,
3189     };
3190
3191     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3192                              accfn[a->size]);
3193 }
3194
3195 static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
3196 {
3197     static NeonGenWidenFn * const widenfn[] = {
3198         gen_helper_neon_widen_u8,
3199         gen_helper_neon_widen_u16,
3200         tcg_gen_extu_i32_i64,
3201         NULL,
3202     };
3203     static NeonGenTwo64OpFn * const opfn[] = {
3204         gen_helper_neon_paddl_u16,
3205         gen_helper_neon_paddl_u32,
3206         tcg_gen_add_i64,
3207         NULL,
3208     };
3209     static NeonGenTwo64OpFn * const accfn[] = {
3210         gen_helper_neon_addl_u16,
3211         gen_helper_neon_addl_u32,
3212         tcg_gen_add_i64,
3213         NULL,
3214     };
3215
3216     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3217                              accfn[a->size]);
3218 }
3219
3220 typedef void ZipFn(TCGv_ptr, TCGv_ptr);
3221
3222 static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
3223                        ZipFn *fn)
3224 {
3225     TCGv_ptr pd, pm;
3226
3227     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3228         return false;
3229     }
3230
3231     /* UNDEF accesses to D16-D31 if they don't exist. */
3232     if (!dc_isar_feature(aa32_simd_r32, s) &&
3233         ((a->vd | a->vm) & 0x10)) {
3234         return false;
3235     }
3236
3237     if ((a->vd | a->vm) & a->q) {
3238         return false;
3239     }
3240
3241     if (!fn) {
3242         /* Bad size or size/q combination */
3243         return false;
3244     }
3245
3246     if (!vfp_access_check(s)) {
3247         return true;
3248     }
3249
3250     pd = vfp_reg_ptr(true, a->vd);
3251     pm = vfp_reg_ptr(true, a->vm);
3252     fn(pd, pm);
3253     tcg_temp_free_ptr(pd);
3254     tcg_temp_free_ptr(pm);
3255     return true;
3256 }
3257
3258 static bool trans_VUZP(DisasContext *s, arg_2misc *a)
3259 {
3260     static ZipFn * const fn[2][4] = {
3261         {
3262             gen_helper_neon_unzip8,
3263             gen_helper_neon_unzip16,
3264             NULL,
3265             NULL,
3266         }, {
3267             gen_helper_neon_qunzip8,
3268             gen_helper_neon_qunzip16,
3269             gen_helper_neon_qunzip32,
3270             NULL,
3271         }
3272     };
3273     return do_zip_uzp(s, a, fn[a->q][a->size]);
3274 }
3275
3276 static bool trans_VZIP(DisasContext *s, arg_2misc *a)
3277 {
3278     static ZipFn * const fn[2][4] = {
3279         {
3280             gen_helper_neon_zip8,
3281             gen_helper_neon_zip16,
3282             NULL,
3283             NULL,
3284         }, {
3285             gen_helper_neon_qzip8,
3286             gen_helper_neon_qzip16,
3287             gen_helper_neon_qzip32,
3288             NULL,
3289         }
3290     };
3291     return do_zip_uzp(s, a, fn[a->q][a->size]);
3292 }
3293
3294 static bool do_vmovn(DisasContext *s, arg_2misc *a,
3295                      NeonGenNarrowEnvFn *narrowfn)
3296 {
3297     TCGv_i64 rm;
3298     TCGv_i32 rd0, rd1;
3299
3300     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3301         return false;
3302     }
3303
3304     /* UNDEF accesses to D16-D31 if they don't exist. */
3305     if (!dc_isar_feature(aa32_simd_r32, s) &&
3306         ((a->vd | a->vm) & 0x10)) {
3307         return false;
3308     }
3309
3310     if (a->vm & 1) {
3311         return false;
3312     }
3313
3314     if (!narrowfn) {
3315         return false;
3316     }
3317
3318     if (!vfp_access_check(s)) {
3319         return true;
3320     }
3321
3322     rm = tcg_temp_new_i64();
3323     rd0 = tcg_temp_new_i32();
3324     rd1 = tcg_temp_new_i32();
3325
3326     read_neon_element64(rm, a->vm, 0, MO_64);
3327     narrowfn(rd0, cpu_env, rm);
3328     read_neon_element64(rm, a->vm, 1, MO_64);
3329     narrowfn(rd1, cpu_env, rm);
3330     write_neon_element32(rd0, a->vd, 0, MO_32);
3331     write_neon_element32(rd1, a->vd, 1, MO_32);
3332     tcg_temp_free_i32(rd0);
3333     tcg_temp_free_i32(rd1);
3334     tcg_temp_free_i64(rm);
3335     return true;
3336 }
3337
3338 #define DO_VMOVN(INSN, FUNC)                                    \
3339     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3340     {                                                           \
3341         static NeonGenNarrowEnvFn * const narrowfn[] = {        \
3342             FUNC##8,                                            \
3343             FUNC##16,                                           \
3344             FUNC##32,                                           \
3345             NULL,                                               \
3346         };                                                      \
3347         return do_vmovn(s, a, narrowfn[a->size]);               \
3348     }
3349
3350 DO_VMOVN(VMOVN, gen_neon_narrow_u)
3351 DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
3352 DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
3353 DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)
3354
3355 static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
3356 {
3357     TCGv_i32 rm0, rm1;
3358     TCGv_i64 rd;
3359     static NeonGenWidenFn * const widenfns[] = {
3360         gen_helper_neon_widen_u8,
3361         gen_helper_neon_widen_u16,
3362         tcg_gen_extu_i32_i64,
3363         NULL,
3364     };
3365     NeonGenWidenFn *widenfn = widenfns[a->size];
3366
3367     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3368         return false;
3369     }
3370
3371     /* UNDEF accesses to D16-D31 if they don't exist. */
3372     if (!dc_isar_feature(aa32_simd_r32, s) &&
3373         ((a->vd | a->vm) & 0x10)) {
3374         return false;
3375     }
3376
3377     if (a->vd & 1) {
3378         return false;
3379     }
3380
3381     if (!widenfn) {
3382         return false;
3383     }
3384
3385     if (!vfp_access_check(s)) {
3386         return true;
3387     }
3388
3389     rd = tcg_temp_new_i64();
3390     rm0 = tcg_temp_new_i32();
3391     rm1 = tcg_temp_new_i32();
3392
3393     read_neon_element32(rm0, a->vm, 0, MO_32);
3394     read_neon_element32(rm1, a->vm, 1, MO_32);
3395
3396     widenfn(rd, rm0);
3397     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3398     write_neon_element64(rd, a->vd, 0, MO_64);
3399     widenfn(rd, rm1);
3400     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3401     write_neon_element64(rd, a->vd, 1, MO_64);
3402
3403     tcg_temp_free_i64(rd);
3404     tcg_temp_free_i32(rm0);
3405     tcg_temp_free_i32(rm1);
3406     return true;
3407 }
3408
3409 static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
3410 {
3411     TCGv_ptr fpst;
3412     TCGv_i32 ahp, tmp, tmp2, tmp3;
3413
3414     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3415         !dc_isar_feature(aa32_fp16_spconv, s)) {
3416         return false;
3417     }
3418
3419     /* UNDEF accesses to D16-D31 if they don't exist. */
3420     if (!dc_isar_feature(aa32_simd_r32, s) &&
3421         ((a->vd | a->vm) & 0x10)) {
3422         return false;
3423     }
3424
3425     if ((a->vm & 1) || (a->size != 1)) {
3426         return false;
3427     }
3428
3429     if (!vfp_access_check(s)) {
3430         return true;
3431     }
3432
3433     fpst = fpstatus_ptr(FPST_STD);
3434     ahp = get_ahp_flag();
3435     tmp = tcg_temp_new_i32();
3436     read_neon_element32(tmp, a->vm, 0, MO_32);
3437     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3438     tmp2 = tcg_temp_new_i32();
3439     read_neon_element32(tmp2, a->vm, 1, MO_32);
3440     gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
3441     tcg_gen_shli_i32(tmp2, tmp2, 16);
3442     tcg_gen_or_i32(tmp2, tmp2, tmp);
3443     read_neon_element32(tmp, a->vm, 2, MO_32);
3444     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3445     tmp3 = tcg_temp_new_i32();
3446     read_neon_element32(tmp3, a->vm, 3, MO_32);
3447     write_neon_element32(tmp2, a->vd, 0, MO_32);
3448     tcg_temp_free_i32(tmp2);
3449     gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
3450     tcg_gen_shli_i32(tmp3, tmp3, 16);
3451     tcg_gen_or_i32(tmp3, tmp3, tmp);
3452     write_neon_element32(tmp3, a->vd, 1, MO_32);
3453     tcg_temp_free_i32(tmp3);
3454     tcg_temp_free_i32(tmp);
3455     tcg_temp_free_i32(ahp);
3456     tcg_temp_free_ptr(fpst);
3457
3458     return true;
3459 }
3460
3461 static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
3462 {
3463     TCGv_ptr fpst;
3464     TCGv_i32 ahp, tmp, tmp2, tmp3;
3465
3466     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3467         !dc_isar_feature(aa32_fp16_spconv, s)) {
3468         return false;
3469     }
3470
3471     /* UNDEF accesses to D16-D31 if they don't exist. */
3472     if (!dc_isar_feature(aa32_simd_r32, s) &&
3473         ((a->vd | a->vm) & 0x10)) {
3474         return false;
3475     }
3476
3477     if ((a->vd & 1) || (a->size != 1)) {
3478         return false;
3479     }
3480
3481     if (!vfp_access_check(s)) {
3482         return true;
3483     }
3484
3485     fpst = fpstatus_ptr(FPST_STD);
3486     ahp = get_ahp_flag();
3487     tmp3 = tcg_temp_new_i32();
3488     tmp2 = tcg_temp_new_i32();
3489     tmp = tcg_temp_new_i32();
3490     read_neon_element32(tmp, a->vm, 0, MO_32);
3491     read_neon_element32(tmp2, a->vm, 1, MO_32);
3492     tcg_gen_ext16u_i32(tmp3, tmp);
3493     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3494     write_neon_element32(tmp3, a->vd, 0, MO_32);
3495     tcg_gen_shri_i32(tmp, tmp, 16);
3496     gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
3497     write_neon_element32(tmp, a->vd, 1, MO_32);
3498     tcg_temp_free_i32(tmp);
3499     tcg_gen_ext16u_i32(tmp3, tmp2);
3500     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3501     write_neon_element32(tmp3, a->vd, 2, MO_32);
3502     tcg_temp_free_i32(tmp3);
3503     tcg_gen_shri_i32(tmp2, tmp2, 16);
3504     gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
3505     write_neon_element32(tmp2, a->vd, 3, MO_32);
3506     tcg_temp_free_i32(tmp2);
3507     tcg_temp_free_i32(ahp);
3508     tcg_temp_free_ptr(fpst);
3509
3510     return true;
3511 }
3512
3513 static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
3514 {
3515     int vec_size = a->q ? 16 : 8;
3516     int rd_ofs = neon_full_reg_offset(a->vd);
3517     int rm_ofs = neon_full_reg_offset(a->vm);
3518
3519     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3520         return false;
3521     }
3522
3523     /* UNDEF accesses to D16-D31 if they don't exist. */
3524     if (!dc_isar_feature(aa32_simd_r32, s) &&
3525         ((a->vd | a->vm) & 0x10)) {
3526         return false;
3527     }
3528
3529     if (a->size == 3) {
3530         return false;
3531     }
3532
3533     if ((a->vd | a->vm) & a->q) {
3534         return false;
3535     }
3536
3537     if (!vfp_access_check(s)) {
3538         return true;
3539     }
3540
3541     fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);
3542
3543     return true;
3544 }
3545
3546 #define DO_2MISC_VEC(INSN, FN)                                  \
3547     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3548     {                                                           \
3549         return do_2misc_vec(s, a, FN);                          \
3550     }
3551
3552 DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
3553 DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
3554 DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
3555 DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
3556 DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
3557 DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
3558 DO_2MISC_VEC(VCLT0, gen_gvec_clt0)
3559
3560 static bool trans_VMVN(DisasContext *s, arg_2misc *a)
3561 {
3562     if (a->size != 0) {
3563         return false;
3564     }
3565     return do_2misc_vec(s, a, tcg_gen_gvec_not);
3566 }
3567
3568 #define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3569     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3570                          uint32_t rm_ofs, uint32_t oprsz,               \
3571                          uint32_t maxsz)                                \
3572     {                                                                   \
3573         tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz,        \
3574                            DATA, FUNC);                                 \
3575     }
3576
3577 #define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3578     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3579                          uint32_t rm_ofs, uint32_t oprsz,               \
3580                          uint32_t maxsz)                                \
3581     {                                                                   \
3582         tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC);   \
3583     }
3584
3585 WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
3586 WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1)
3587 WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
3588 WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1)
3589 WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
3590 WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
3591 WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)
3592
/*
 * Define trans_<INSN> for a crypto 2-reg-misc insn.  The generated function
 * returns false unless ISA feature FEATURE is present and a->size equals
 * SIZE; otherwise it defers to do_2misc_vec() with gen_<INSN> as expander.
 */
#define DO_2M_CRYPTO(INSN, FEATURE, SIZE)                       \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        if (!dc_isar_feature(FEATURE, s)) {                     \
            return false;                                       \
        }                                                       \
        if (a->size != SIZE) {                                  \
            return false;                                       \
        }                                                       \
        return do_2misc_vec(s, a, gen_##INSN);                  \
    }
3601
/*
 * Instantiate trans_<INSN> for each crypto insn.  Arguments are the insn
 * name, the ISA feature that must be present, and the only a->size value
 * that is accepted (0 for the AES insns, 2 for the SHA insns).
 */
DO_2M_CRYPTO(AESE, aa32_aes, 0)
DO_2M_CRYPTO(AESD, aa32_aes, 0)
DO_2M_CRYPTO(AESMC, aa32_aes, 0)
DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)
3609
3610 static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
3611 {
3612     TCGv_i32 tmp;
3613     int pass;
3614
3615     /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
3616     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3617         return false;
3618     }
3619
3620     /* UNDEF accesses to D16-D31 if they don't exist. */
3621     if (!dc_isar_feature(aa32_simd_r32, s) &&
3622         ((a->vd | a->vm) & 0x10)) {
3623         return false;
3624     }
3625
3626     if (!fn) {
3627         return false;
3628     }
3629
3630     if ((a->vd | a->vm) & a->q) {
3631         return false;
3632     }
3633
3634     if (!vfp_access_check(s)) {
3635         return true;
3636     }
3637
3638     tmp = tcg_temp_new_i32();
3639     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3640         read_neon_element32(tmp, a->vm, pass, MO_32);
3641         fn(tmp, tmp);
3642         write_neon_element32(tmp, a->vd, pass, MO_32);
3643     }
3644     tcg_temp_free_i32(tmp);
3645
3646     return true;
3647 }
3648
3649 static bool trans_VREV32(DisasContext *s, arg_2misc *a)
3650 {
3651     static NeonGenOneOpFn * const fn[] = {
3652         tcg_gen_bswap32_i32,
3653         gen_swap_half,
3654         NULL,
3655         NULL,
3656     };
3657     return do_2misc(s, a, fn[a->size]);
3658 }
3659
3660 static bool trans_VREV16(DisasContext *s, arg_2misc *a)
3661 {
3662     if (a->size != 0) {
3663         return false;
3664     }
3665     return do_2misc(s, a, gen_rev16);
3666 }
3667
3668 static bool trans_VCLS(DisasContext *s, arg_2misc *a)
3669 {
3670     static NeonGenOneOpFn * const fn[] = {
3671         gen_helper_neon_cls_s8,
3672         gen_helper_neon_cls_s16,
3673         gen_helper_neon_cls_s32,
3674         NULL,
3675     };
3676     return do_2misc(s, a, fn[a->size]);
3677 }
3678
3679 static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm)
3680 {
3681     tcg_gen_clzi_i32(rd, rm, 32);
3682 }
3683
3684 static bool trans_VCLZ(DisasContext *s, arg_2misc *a)
3685 {
3686     static NeonGenOneOpFn * const fn[] = {
3687         gen_helper_neon_clz_u8,
3688         gen_helper_neon_clz_u16,
3689         do_VCLZ_32,
3690         NULL,
3691     };
3692     return do_2misc(s, a, fn[a->size]);
3693 }
3694
3695 static bool trans_VCNT(DisasContext *s, arg_2misc *a)
3696 {
3697     if (a->size != 0) {
3698         return false;
3699     }
3700     return do_2misc(s, a, gen_helper_neon_cnt_u8);
3701 }
3702
3703 static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3704                        uint32_t oprsz, uint32_t maxsz)
3705 {
3706     tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs,
3707                       vece == MO_16 ? 0x7fff : 0x7fffffff,
3708                       oprsz, maxsz);
3709 }
3710
3711 static bool trans_VABS_F(DisasContext *s, arg_2misc *a)
3712 {
3713     if (a->size == MO_16) {
3714         if (!dc_isar_feature(aa32_fp16_arith, s)) {
3715             return false;
3716         }
3717     } else if (a->size != MO_32) {
3718         return false;
3719     }
3720     return do_2misc_vec(s, a, gen_VABS_F);
3721 }
3722
3723 static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3724                        uint32_t oprsz, uint32_t maxsz)
3725 {
3726     tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs,
3727                       vece == MO_16 ? 0x8000 : 0x80000000,
3728                       oprsz, maxsz);
3729 }
3730
3731 static bool trans_VNEG_F(DisasContext *s, arg_2misc *a)
3732 {
3733     if (a->size == MO_16) {
3734         if (!dc_isar_feature(aa32_fp16_arith, s)) {
3735             return false;
3736         }
3737     } else if (a->size != MO_32) {
3738         return false;
3739     }
3740     return do_2misc_vec(s, a, gen_VNEG_F);
3741 }
3742
3743 static bool trans_VRECPE(DisasContext *s, arg_2misc *a)
3744 {
3745     if (a->size != 2) {
3746         return false;
3747     }
3748     return do_2misc(s, a, gen_helper_recpe_u32);
3749 }
3750
3751 static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a)
3752 {
3753     if (a->size != 2) {
3754         return false;
3755     }
3756     return do_2misc(s, a, gen_helper_rsqrte_u32);
3757 }
3758
/*
 * Define WRAPNAME, a two-argument (dest, source) function that calls FUNC
 * with cpu_env inserted between the two, so a helper needing the env
 * pointer can be used where a NeonGenOneOpFn is expected.
 */
#define WRAP_1OP_ENV_FN(WRAPNAME, FUNC)                 \
    static void WRAPNAME(TCGv_i32 dst, TCGv_i32 src)    \
    {                                                   \
        FUNC(dst, cpu_env, src);                        \
    }
3764
/*
 * Env-passing wrappers for the qabs and qneg helpers, one per element size
 * (s8, s16, s32); these are the entries of the trans_VQABS and trans_VQNEG
 * dispatch tables.
 */
WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8)
WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16)
WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32)
WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8)
WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16)
WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32)
3771
3772 static bool trans_VQABS(DisasContext *s, arg_2misc *a)
3773 {
3774     static NeonGenOneOpFn * const fn[] = {
3775         gen_VQABS_s8,
3776         gen_VQABS_s16,
3777         gen_VQABS_s32,
3778         NULL,
3779     };
3780     return do_2misc(s, a, fn[a->size]);
3781 }
3782
3783 static bool trans_VQNEG(DisasContext *s, arg_2misc *a)
3784 {
3785     static NeonGenOneOpFn * const fn[] = {
3786         gen_VQNEG_s8,
3787         gen_VQNEG_s16,
3788         gen_VQNEG_s32,
3789         NULL,
3790     };
3791     return do_2misc(s, a, fn[a->size]);
3792 }
3793
/*
 * Define the gen_<INSN>/trans_<INSN> pair for a floating-point 2-reg-misc
 * insn implemented by gvec helpers that take an fp status pointer.
 *
 * gen_<INSN> indexes its helper table by vece (slot 1 is HFUNC, slot 2 is
 * SFUNC) and uses the FPST_STD_F16 status for MO_16, FPST_STD otherwise.
 * trans_<INSN> accepts MO_16 only with the aa32_fp16_arith feature, MO_32
 * always, and rejects every other size before calling do_2misc_vec().
 */
#define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC)                             \
    static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
                           uint32_t rm_ofs,                             \
                           uint32_t oprsz, uint32_t maxsz)              \
    {                                                                   \
        static gen_helper_gvec_2_ptr * const fns[4] = {                 \
            NULL, HFUNC, SFUNC, NULL,                                   \
        };                                                              \
        TCGv_ptr fpst =                                                 \
            fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD);      \
        tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0,       \
                           fns[vece]);                                  \
        tcg_temp_free_ptr(fpst);                                        \
    }                                                                   \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
    {                                                                   \
        if (a->size != MO_16 && a->size != MO_32) {                     \
            return false;                                               \
        }                                                               \
        if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
            return false;                                               \
        }                                                               \
        return do_2misc_vec(s, a, gen_##INSN);                          \
    }
3819
/*
 * Instantiate gen_<INSN>/trans_<INSN> for the fp 2-reg-misc insns.
 * Arguments are the insn name, the helper used for MO_16 and the helper
 * used for MO_32.
 */
DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s)
DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s)
DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s)
DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s)
DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s)
DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s)
DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s)
DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos)
DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos)
DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs)
DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs)

/*
 * VRINTX is generated under the name trans_VRINTX_impl so that the real
 * trans_VRINTX can add its own feature check before delegating to it.
 */
DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s)
3833
3834 static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
3835 {
3836     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
3837         return false;
3838     }
3839     return trans_VRINTX_impl(s, a);
3840 }
3841
/*
 * Define the gen_<INSN>/trans_<INSN> pair for a floating-point 2-reg-misc
 * insn that takes an explicit rounding mode.
 *
 * gen_<INSN> picks gen_helper_gvec_<OP>h or gen_helper_gvec_<OP>s by vece
 * (table slots 1 and 2), uses the FPST_STD_F16 status for MO_16 and
 * FPST_STD otherwise, and passes arm_rmode_to_sf(RMODE) as the helper's
 * data argument.  trans_<INSN> requires ARM_FEATURE_V8, accepts MO_16 only
 * with the aa32_fp16_arith feature, MO_32 always, and rejects every other
 * size before calling do_2misc_vec().
 */
#define DO_VEC_RMODE(INSN, RMODE, OP)                                   \
    static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
                           uint32_t rm_ofs,                             \
                           uint32_t oprsz, uint32_t maxsz)              \
    {                                                                   \
        static gen_helper_gvec_2_ptr * const fns[4] = {                 \
            NULL,                                                       \
            gen_helper_gvec_##OP##h,                                    \
            gen_helper_gvec_##OP##s,                                    \
            NULL,                                                       \
        };                                                              \
        TCGv_ptr fpst =                                                 \
            fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD);      \
        tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz,          \
                           arm_rmode_to_sf(RMODE), fns[vece]);          \
        tcg_temp_free_ptr(fpst);                                        \
    }                                                                   \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
    {                                                                   \
        if (!arm_dc_feature(s, ARM_FEATURE_V8)) {                       \
            return false;                                               \
        }                                                               \
        if (a->size != MO_16 && a->size != MO_32) {                     \
            return false;                                               \
        }                                                               \
        if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
            return false;                                               \
        }                                                               \
        return do_2misc_vec(s, a, gen_##INSN);                          \
    }
3873
/*
 * Float-to-integer conversions with an explicit rounding mode.  The
 * trailing U/S in the insn name matches the vcvt_rm_u / vcvt_rm_s helper
 * family that is selected.
 */
DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u)
DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s)
DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u)
DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s)
DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u)
DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s)
DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u)
DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s)

/* Round-to-integral with an explicit rounding mode (vrint_rm_ helpers). */
DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_)
DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_)
DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_)
DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_)
DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_)
3888
3889 static bool trans_VSWP(DisasContext *s, arg_2misc *a)
3890 {
3891     TCGv_i64 rm, rd;
3892     int pass;
3893
3894     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3895         return false;
3896     }
3897
3898     /* UNDEF accesses to D16-D31 if they don't exist. */
3899     if (!dc_isar_feature(aa32_simd_r32, s) &&
3900         ((a->vd | a->vm) & 0x10)) {
3901         return false;
3902     }
3903
3904     if (a->size != 0) {
3905         return false;
3906     }
3907
3908     if ((a->vd | a->vm) & a->q) {
3909         return false;
3910     }
3911
3912     if (!vfp_access_check(s)) {
3913         return true;
3914     }
3915
3916     rm = tcg_temp_new_i64();
3917     rd = tcg_temp_new_i64();
3918     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3919         read_neon_element64(rm, a->vm, pass, MO_64);
3920         read_neon_element64(rd, a->vd, pass, MO_64);
3921         write_neon_element64(rm, a->vd, pass, MO_64);
3922         write_neon_element64(rd, a->vm, pass, MO_64);
3923     }
3924     tcg_temp_free_i64(rm);
3925     tcg_temp_free_i64(rd);
3926
3927     return true;
3928 }
3929 static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1)
3930 {
3931     TCGv_i32 rd, tmp;
3932
3933     rd = tcg_temp_new_i32();
3934     tmp = tcg_temp_new_i32();
3935
3936     tcg_gen_shli_i32(rd, t0, 8);
3937     tcg_gen_andi_i32(rd, rd, 0xff00ff00);
3938     tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
3939     tcg_gen_or_i32(rd, rd, tmp);
3940
3941     tcg_gen_shri_i32(t1, t1, 8);
3942     tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
3943     tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
3944     tcg_gen_or_i32(t1, t1, tmp);
3945     tcg_gen_mov_i32(t0, rd);
3946
3947     tcg_temp_free_i32(tmp);
3948     tcg_temp_free_i32(rd);
3949 }
3950
3951 static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1)
3952 {
3953     TCGv_i32 rd, tmp;
3954
3955     rd = tcg_temp_new_i32();
3956     tmp = tcg_temp_new_i32();
3957
3958     tcg_gen_shli_i32(rd, t0, 16);
3959     tcg_gen_andi_i32(tmp, t1, 0xffff);
3960     tcg_gen_or_i32(rd, rd, tmp);
3961     tcg_gen_shri_i32(t1, t1, 16);
3962     tcg_gen_andi_i32(tmp, t0, 0xffff0000);
3963     tcg_gen_or_i32(t1, t1, tmp);
3964     tcg_gen_mov_i32(t0, rd);
3965
3966     tcg_temp_free_i32(tmp);
3967     tcg_temp_free_i32(rd);
3968 }
3969
3970 static bool trans_VTRN(DisasContext *s, arg_2misc *a)
3971 {
3972     TCGv_i32 tmp, tmp2;
3973     int pass;
3974
3975     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3976         return false;
3977     }
3978
3979     /* UNDEF accesses to D16-D31 if they don't exist. */
3980     if (!dc_isar_feature(aa32_simd_r32, s) &&
3981         ((a->vd | a->vm) & 0x10)) {
3982         return false;
3983     }
3984
3985     if ((a->vd | a->vm) & a->q) {
3986         return false;
3987     }
3988
3989     if (a->size == 3) {
3990         return false;
3991     }
3992
3993     if (!vfp_access_check(s)) {
3994         return true;
3995     }
3996
3997     tmp = tcg_temp_new_i32();
3998     tmp2 = tcg_temp_new_i32();
3999     if (a->size == MO_32) {
4000         for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) {
4001             read_neon_element32(tmp, a->vm, pass, MO_32);
4002             read_neon_element32(tmp2, a->vd, pass + 1, MO_32);
4003             write_neon_element32(tmp2, a->vm, pass, MO_32);
4004             write_neon_element32(tmp, a->vd, pass + 1, MO_32);
4005         }
4006     } else {
4007         for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
4008             read_neon_element32(tmp, a->vm, pass, MO_32);
4009             read_neon_element32(tmp2, a->vd, pass, MO_32);
4010             if (a->size == MO_8) {
4011                 gen_neon_trn_u8(tmp, tmp2);
4012             } else {
4013                 gen_neon_trn_u16(tmp, tmp2);
4014             }
4015             write_neon_element32(tmp2, a->vm, pass, MO_32);
4016             write_neon_element32(tmp, a->vd, pass, MO_32);
4017         }
4018     }
4019     tcg_temp_free_i32(tmp);
4020     tcg_temp_free_i32(tmp2);
4021     return true;
4022 }