target/arm/translate-neon.c

   1 /*
   2  *  ARM translation: AArch32 Neon instructions
   3  *
   4  *  Copyright (c) 2003 Fabrice Bellard
   5  *  Copyright (c) 2005-2007 CodeSourcery
   6  *  Copyright (c) 2007 OpenedHand, Ltd.
   7  *  Copyright (c) 2020 Linaro, Ltd.
   8  *
   9  * This library is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * This library is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  21  */
  22
  23 #include "qemu/osdep.h"
  24 #include "tcg/tcg-op.h"
  25 #include "tcg/tcg-op-gvec.h"
  26 #include "exec/exec-all.h"
  27 #include "exec/gen-icount.h"
  28 #include "translate.h"
  29 #include "translate-a32.h"
  30
  31 static inline int neon_3same_fp_size(DisasContext *s, int x)
  32 {
  33     /* Convert 0==fp32, 1==fp16 into a MO_* value */
  34     return MO_32 - x;
  35 }
  36
  37 /* Include the generated Neon decoder */
  38 #include "decode-neon-dp.c.inc"
  39 #include "decode-neon-ls.c.inc"
  40 #include "decode-neon-shared.c.inc"
  41
  42 static TCGv_ptr vfp_reg_ptr(bool dp, int reg)
  43 {
  44     TCGv_ptr ret = tcg_temp_new_ptr();
  45     tcg_gen_addi_ptr(ret, cpu_env, vfp_reg_offset(dp, reg));
  46     return ret;
  47 }
  48
  49 static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
  50 {
  51     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
  52
  53     switch (mop) {
  54     case MO_UB:
  55         tcg_gen_ld8u_i32(var, cpu_env, offset);
  56         break;
  57     case MO_UW:
  58         tcg_gen_ld16u_i32(var, cpu_env, offset);
  59         break;
  60     case MO_UL:
  61         tcg_gen_ld_i32(var, cpu_env, offset);
  62         break;
  63     default:
  64         g_assert_not_reached();
  65     }
  66 }
  67
  68 static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
  69 {
  70     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
  71
  72     switch (mop) {
  73     case MO_UB:
  74         tcg_gen_ld8u_i64(var, cpu_env, offset);
  75         break;
  76     case MO_UW:
  77         tcg_gen_ld16u_i64(var, cpu_env, offset);
  78         break;
  79     case MO_UL:
  80         tcg_gen_ld32u_i64(var, cpu_env, offset);
  81         break;
  82     case MO_Q:
  83         tcg_gen_ld_i64(var, cpu_env, offset);
  84         break;
  85     default:
  86         g_assert_not_reached();
  87     }
  88 }
  89
  90 static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
  91 {
  92     long offset = neon_element_offset(reg, ele, size);
  93
  94     switch (size) {
  95     case MO_8:
  96         tcg_gen_st8_i32(var, cpu_env, offset);
  97         break;
  98     case MO_16:
  99         tcg_gen_st16_i32(var, cpu_env, offset);
 100         break;
 101     case MO_32:
 102         tcg_gen_st_i32(var, cpu_env, offset);
 103         break;
 104     default:
 105         g_assert_not_reached();
 106     }
 107 }
 108
 109 static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
 110 {
 111     long offset = neon_element_offset(reg, ele, size);
 112
 113     switch (size) {
 114     case MO_8:
 115         tcg_gen_st8_i64(var, cpu_env, offset);
 116         break;
 117     case MO_16:
 118         tcg_gen_st16_i64(var, cpu_env, offset);
 119         break;
 120     case MO_32:
 121         tcg_gen_st32_i64(var, cpu_env, offset);
 122         break;
 123     case MO_64:
 124         tcg_gen_st_i64(var, cpu_env, offset);
 125         break;
 126     default:
 127         g_assert_not_reached();
 128     }
 129 }
 130
 131 static bool do_neon_ddda(DisasContext *s, int q, int vd, int vn, int vm,
 132                          int data, gen_helper_gvec_4 *fn_gvec)
 133 {
 134     /* UNDEF accesses to D16-D31 if they don't exist. */
 135     if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
 136         return false;
 137     }
 138
 139     /*
 140      * UNDEF accesses to odd registers for each bit of Q.
 141      * Q will be 0b111 for all Q-reg instructions, otherwise
 142      * when we have mixed Q- and D-reg inputs.
 143      */
 144     if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
 145         return false;
 146     }
 147
 148     if (!vfp_access_check(s)) {
 149         return true;
 150     }
 151
 152     int opr_sz = q ? 16 : 8;
 153     tcg_gen_gvec_4_ool(vfp_reg_offset(1, vd),
 154                        vfp_reg_offset(1, vn),
 155                        vfp_reg_offset(1, vm),
 156                        vfp_reg_offset(1, vd),
 157                        opr_sz, opr_sz, data, fn_gvec);
 158     return true;
 159 }
 160
 161 static bool do_neon_ddda_fpst(DisasContext *s, int q, int vd, int vn, int vm,
 162                               int data, ARMFPStatusFlavour fp_flavour,
 163                               gen_helper_gvec_4_ptr *fn_gvec_ptr)
 164 {
 165     /* UNDEF accesses to D16-D31 if they don't exist. */
 166     if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
 167         return false;
 168     }
 169
 170     /*
 171      * UNDEF accesses to odd registers for each bit of Q.
 172      * Q will be 0b111 for all Q-reg instructions, otherwise
 173      * when we have mixed Q- and D-reg inputs.
 174      */
 175     if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
 176         return false;
 177     }
 178
 179     if (!vfp_access_check(s)) {
 180         return true;
 181     }
 182
 183     int opr_sz = q ? 16 : 8;
 184     TCGv_ptr fpst = fpstatus_ptr(fp_flavour);
 185
 186     tcg_gen_gvec_4_ptr(vfp_reg_offset(1, vd),
 187                        vfp_reg_offset(1, vn),
 188                        vfp_reg_offset(1, vm),
 189                        vfp_reg_offset(1, vd),
 190                        fpst, opr_sz, opr_sz, data, fn_gvec_ptr);
 191     tcg_temp_free_ptr(fpst);
 192     return true;
 193 }
 194
 195 static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
 196 {
 197     if (!dc_isar_feature(aa32_vcma, s)) {
 198         return false;
 199     }
 200     if (a->size == MO_16) {
 201         if (!dc_isar_feature(aa32_fp16_arith, s)) {
 202             return false;
 203         }
 204         return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
 205                                  FPST_STD_F16, gen_helper_gvec_fcmlah);
 206     }
 207     return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
 208                              FPST_STD, gen_helper_gvec_fcmlas);
 209 }
 210
 211 static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
 212 {
 213     int opr_sz;
 214     TCGv_ptr fpst;
 215     gen_helper_gvec_3_ptr *fn_gvec_ptr;
 216
 217     if (!dc_isar_feature(aa32_vcma, s)
 218         || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
 219         return false;
 220     }
 221
 222     /* UNDEF accesses to D16-D31 if they don't exist. */
 223     if (!dc_isar_feature(aa32_simd_r32, s) &&
 224         ((a->vd | a->vn | a->vm) & 0x10)) {
 225         return false;
 226     }
 227
 228     if ((a->vn | a->vm | a->vd) & a->q) {
 229         return false;
 230     }
 231
 232     if (!vfp_access_check(s)) {
 233         return true;
 234     }
 235
 236     opr_sz = (1 + a->q) * 8;
 237     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
 238     fn_gvec_ptr = (a->size == MO_16) ?
 239         gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds;
 240     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 241                        vfp_reg_offset(1, a->vn),
 242                        vfp_reg_offset(1, a->vm),
 243                        fpst, opr_sz, opr_sz, a->rot,
 244                        fn_gvec_ptr);
 245     tcg_temp_free_ptr(fpst);
 246     return true;
 247 }
 248
 249 static bool trans_VSDOT(DisasContext *s, arg_VSDOT *a)
 250 {
 251     if (!dc_isar_feature(aa32_dp, s)) {
 252         return false;
 253     }
 254     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 255                         gen_helper_gvec_sdot_b);
 256 }
 257
 258 static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a)
 259 {
 260     if (!dc_isar_feature(aa32_dp, s)) {
 261         return false;
 262     }
 263     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 264                         gen_helper_gvec_udot_b);
 265 }
 266
 267 static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a)
 268 {
 269     if (!dc_isar_feature(aa32_i8mm, s)) {
 270         return false;
 271     }
 272     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 273                         gen_helper_gvec_usdot_b);
 274 }
 275
 276 static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a)
 277 {
 278     if (!dc_isar_feature(aa32_bf16, s)) {
 279         return false;
 280     }
 281     return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 282                         gen_helper_gvec_bfdot);
 283 }
 284
 285 static bool trans_VFML(DisasContext *s, arg_VFML *a)
 286 {
 287     int opr_sz;
 288
 289     if (!dc_isar_feature(aa32_fhm, s)) {
 290         return false;
 291     }
 292
 293     /* UNDEF accesses to D16-D31 if they don't exist. */
 294     if (!dc_isar_feature(aa32_simd_r32, s) &&
 295         (a->vd & 0x10)) {
 296         return false;
 297     }
 298
 299     if (a->vd & a->q) {
 300         return false;
 301     }
 302
 303     if (!vfp_access_check(s)) {
 304         return true;
 305     }
 306
 307     opr_sz = (1 + a->q) * 8;
 308     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 309                        vfp_reg_offset(a->q, a->vn),
 310                        vfp_reg_offset(a->q, a->vm),
 311                        cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
 312                        gen_helper_gvec_fmlal_a32);
 313     return true;
 314 }
 315
 316 static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
 317 {
 318     int data = (a->index << 2) | a->rot;
 319
 320     if (!dc_isar_feature(aa32_vcma, s)) {
 321         return false;
 322     }
 323     if (a->size == MO_16) {
 324         if (!dc_isar_feature(aa32_fp16_arith, s)) {
 325             return false;
 326         }
 327         return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
 328                                  FPST_STD_F16, gen_helper_gvec_fcmlah_idx);
 329     }
 330     return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
 331                              FPST_STD, gen_helper_gvec_fcmlas_idx);
 332 }
 333
 334 static bool trans_VSDOT_scalar(DisasContext *s, arg_VSDOT_scalar *a)
 335 {
 336     if (!dc_isar_feature(aa32_dp, s)) {
 337         return false;
 338     }
 339     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 340                         gen_helper_gvec_sdot_idx_b);
 341 }
 342
 343 static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a)
 344 {
 345     if (!dc_isar_feature(aa32_dp, s)) {
 346         return false;
 347     }
 348     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 349                         gen_helper_gvec_udot_idx_b);
 350 }
 351
 352 static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a)
 353 {
 354     if (!dc_isar_feature(aa32_i8mm, s)) {
 355         return false;
 356     }
 357     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 358                         gen_helper_gvec_usdot_idx_b);
 359 }
 360
 361 static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a)
 362 {
 363     if (!dc_isar_feature(aa32_i8mm, s)) {
 364         return false;
 365     }
 366     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 367                         gen_helper_gvec_sudot_idx_b);
 368 }
 369
 370 static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a)
 371 {
 372     if (!dc_isar_feature(aa32_bf16, s)) {
 373         return false;
 374     }
 375     return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 376                         gen_helper_gvec_bfdot_idx);
 377 }
 378
 379 static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
 380 {
 381     int opr_sz;
 382
 383     if (!dc_isar_feature(aa32_fhm, s)) {
 384         return false;
 385     }
 386
 387     /* UNDEF accesses to D16-D31 if they don't exist. */
 388     if (!dc_isar_feature(aa32_simd_r32, s) &&
 389         ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
 390         return false;
 391     }
 392
 393     if (a->vd & a->q) {
 394         return false;
 395     }
 396
 397     if (!vfp_access_check(s)) {
 398         return true;
 399     }
 400
 401     opr_sz = (1 + a->q) * 8;
 402     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
 403                        vfp_reg_offset(a->q, a->vn),
 404                        vfp_reg_offset(a->q, a->rm),
 405                        cpu_env, opr_sz, opr_sz,
 406                        (a->index << 2) | a->s, /* is_2 == 0 */
 407                        gen_helper_gvec_fmlal_idx_a32);
 408     return true;
 409 }
 410
 411 static struct {
 412     int nregs;
 413     int interleave;
 414     int spacing;
 415 } const neon_ls_element_type[11] = {
 416     {1, 4, 1},
 417     {1, 4, 2},
 418     {4, 1, 1},
 419     {2, 2, 2},
 420     {1, 3, 1},
 421     {1, 3, 2},
 422     {3, 1, 1},
 423     {1, 1, 1},
 424     {1, 2, 1},
 425     {1, 2, 2},
 426     {2, 1, 1}
 427 };
 428
 429 static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
 430                                       int stride)
 431 {
 432     if (rm != 15) {
 433         TCGv_i32 base;
 434
 435         base = load_reg(s, rn);
 436         if (rm == 13) {
 437             tcg_gen_addi_i32(base, base, stride);
 438         } else {
 439             TCGv_i32 index;
 440             index = load_reg(s, rm);
 441             tcg_gen_add_i32(base, base, index);
 442             tcg_temp_free_i32(index);
 443         }
 444         store_reg(s, rn, base);
 445     }
 446 }
 447
 448 static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
 449 {
 450     /* Neon load/store multiple structures */
 451     int nregs, interleave, spacing, reg, n;
 452     MemOp mop, align, endian;
 453     int mmu_idx = get_mem_index(s);
 454     int size = a->size;
 455     TCGv_i64 tmp64;
 456     TCGv_i32 addr, tmp;
 457
 458     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 459         return false;
 460     }
 461
 462     /* UNDEF accesses to D16-D31 if they don't exist */
 463     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 464         return false;
 465     }
 466     if (a->itype > 10) {
 467         return false;
 468     }
 469     /* Catch UNDEF cases for bad values of align field */
 470     switch (a->itype & 0xc) {
 471     case 4:
 472         if (a->align >= 2) {
 473             return false;
 474         }
 475         break;
 476     case 8:
 477         if (a->align == 3) {
 478             return false;
 479         }
 480         break;
 481     default:
 482         break;
 483     }
 484     nregs = neon_ls_element_type[a->itype].nregs;
 485     interleave = neon_ls_element_type[a->itype].interleave;
 486     spacing = neon_ls_element_type[a->itype].spacing;
 487     if (size == 3 && (interleave | spacing) != 1) {
 488         return false;
 489     }
 490
 491     if (!vfp_access_check(s)) {
 492         return true;
 493     }
 494
 495     /* For our purposes, bytes are always little-endian.  */
 496     endian = s->be_data;
 497     if (size == 0) {
 498         endian = MO_LE;
 499     }
 500
 501     /* Enforce alignment requested by the instruction */
 502     if (a->align) {
 503         align = pow2_align(a->align + 2); /* 4 ** a->align */
 504     } else {
 505         align = s->align_mem ? MO_ALIGN : 0;
 506     }
 507
 508     /*
 509      * Consecutive little-endian elements from a single register
 510      * can be promoted to a larger little-endian operation.
 511      */
 512     if (interleave == 1 && endian == MO_LE) {
 513         /* Retain any natural alignment. */
 514         if (align == MO_ALIGN) {
 515             align = pow2_align(size);
 516         }
 517         size = 3;
 518     }
 519
 520     tmp64 = tcg_temp_new_i64();
 521     addr = tcg_temp_new_i32();
 522     tmp = tcg_const_i32(1 << size);
 523     load_reg_var(s, addr, a->rn);
 524
 525     mop = endian | size | align;
 526     for (reg = 0; reg < nregs; reg++) {
 527         for (n = 0; n < 8 >> size; n++) {
 528             int xs;
 529             for (xs = 0; xs < interleave; xs++) {
 530                 int tt = a->vd + reg + spacing * xs;
 531
 532                 if (a->l) {
 533                     gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop);
 534                     neon_store_element64(tt, n, size, tmp64);
 535                 } else {
 536                     neon_load_element64(tmp64, tt, n, size);
 537                     gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop);
 538                 }
 539                 tcg_gen_add_i32(addr, addr, tmp);
 540
 541                 /* Subsequent memory operations inherit alignment */
 542                 mop &= ~MO_AMASK;
 543             }
 544         }
 545     }
 546     tcg_temp_free_i32(addr);
 547     tcg_temp_free_i32(tmp);
 548     tcg_temp_free_i64(tmp64);
 549
 550     gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
 551     return true;
 552 }
 553
 554 static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
 555 {
 556     /* Neon load single structure to all lanes */
 557     int reg, stride, vec_size;
 558     int vd = a->vd;
 559     int size = a->size;
 560     int nregs = a->n + 1;
 561     TCGv_i32 addr, tmp;
 562     MemOp mop, align;
 563
 564     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 565         return false;
 566     }
 567
 568     /* UNDEF accesses to D16-D31 if they don't exist */
 569     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 570         return false;
 571     }
 572
 573     align = 0;
 574     if (size == 3) {
 575         if (nregs != 4 || a->a == 0) {
 576             return false;
 577         }
 578         /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
 579         size = MO_32;
 580         align = MO_ALIGN_16;
 581     } else if (a->a) {
 582         switch (nregs) {
 583         case 1:
 584             if (size == 0) {
 585                 return false;
 586             }
 587             align = MO_ALIGN;
 588             break;
 589         case 2:
 590             align = pow2_align(size + 1);
 591             break;
 592         case 3:
 593             return false;
 594         case 4:
 595             align = pow2_align(size + 2);
 596             break;
 597         default:
 598             g_assert_not_reached();
 599         }
 600     }
 601
 602     if (!vfp_access_check(s)) {
 603         return true;
 604     }
 605
 606     /*
 607      * VLD1 to all lanes: T bit indicates how many Dregs to write.
 608      * VLD2/3/4 to all lanes: T bit indicates register stride.
 609      */
 610     stride = a->t ? 2 : 1;
 611     vec_size = nregs == 1 ? stride * 8 : 8;
 612     mop = size | align;
 613     tmp = tcg_temp_new_i32();
 614     addr = tcg_temp_new_i32();
 615     load_reg_var(s, addr, a->rn);
 616     for (reg = 0; reg < nregs; reg++) {
 617         gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop);
 618         if ((vd & 1) && vec_size == 16) {
 619             /*
 620              * We cannot write 16 bytes at once because the
 621              * destination is unaligned.
 622              */
 623             tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
 624                                  8, 8, tmp);
 625             tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1),
 626                              neon_full_reg_offset(vd), 8, 8);
 627         } else {
 628             tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
 629                                  vec_size, vec_size, tmp);
 630         }
 631         tcg_gen_addi_i32(addr, addr, 1 << size);
 632         vd += stride;
 633
 634         /* Subsequent memory operations inherit alignment */
 635         mop &= ~MO_AMASK;
 636     }
 637     tcg_temp_free_i32(tmp);
 638     tcg_temp_free_i32(addr);
 639
 640     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);
 641
 642     return true;
 643 }
 644
 645 static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
 646 {
 647     /* Neon load/store single structure to one lane */
 648     int reg;
 649     int nregs = a->n + 1;
 650     int vd = a->vd;
 651     TCGv_i32 addr, tmp;
 652     MemOp mop;
 653
 654     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 655         return false;
 656     }
 657
 658     /* UNDEF accesses to D16-D31 if they don't exist */
 659     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
 660         return false;
 661     }
 662
 663     /* Catch the UNDEF cases. This is unavoidably a bit messy. */
 664     switch (nregs) {
 665     case 1:
 666         if (((a->align & (1 << a->size)) != 0) ||
 667             (a->size == 2 && (a->align == 1 || a->align == 2))) {
 668             return false;
 669         }
 670         break;
 671     case 3:
 672         if ((a->align & 1) != 0) {
 673             return false;
 674         }
 675         /* fall through */
 676     case 2:
 677         if (a->size == 2 && (a->align & 2) != 0) {
 678             return false;
 679         }
 680         break;
 681     case 4:
 682         if (a->size == 2 && a->align == 3) {
 683             return false;
 684         }
 685         break;
 686     default:
 687         abort();
 688     }
 689     if ((vd + a->stride * (nregs - 1)) > 31) {
 690         /*
 691          * Attempts to write off the end of the register file are
 692          * UNPREDICTABLE; we choose to UNDEF because otherwise we would
 693          * access off the end of the array that holds the register data.
 694          */
 695         return false;
 696     }
 697
 698     if (!vfp_access_check(s)) {
 699         return true;
 700     }
 701
 702     /* Pick up SCTLR settings */
 703     mop = finalize_memop(s, a->size);
 704
 705     if (a->align) {
 706         MemOp align_op;
 707
 708         switch (nregs) {
 709         case 1:
 710             /* For VLD1, use natural alignment. */
 711             align_op = MO_ALIGN;
 712             break;
 713         case 2:
 714             /* For VLD2, use double alignment. */
 715             align_op = pow2_align(a->size + 1);
 716             break;
 717         case 4:
 718             if (a->size == MO_32) {
 719                 /*
 720                  * For VLD4.32, align = 1 is double alignment, align = 2 is
 721                  * quad alignment; align = 3 is rejected above.
 722                  */
 723                 align_op = pow2_align(a->size + a->align);
 724             } else {
 725                 /* For VLD4.8 and VLD.16, we want quad alignment. */
 726                 align_op = pow2_align(a->size + 2);
 727             }
 728             break;
 729         default:
 730             /* For VLD3, the alignment field is zero and rejected above. */
 731             g_assert_not_reached();
 732         }
 733
 734         mop = (mop & ~MO_AMASK) | align_op;
 735     }
 736
 737     tmp = tcg_temp_new_i32();
 738     addr = tcg_temp_new_i32();
 739     load_reg_var(s, addr, a->rn);
 740
 741     for (reg = 0; reg < nregs; reg++) {
 742         if (a->l) {
 743             gen_aa32_ld_internal_i32(s, tmp, addr, get_mem_index(s), mop);
 744             neon_store_element(vd, a->reg_idx, a->size, tmp);
 745         } else { /* Store */
 746             neon_load_element(tmp, vd, a->reg_idx, a->size);
 747             gen_aa32_st_internal_i32(s, tmp, addr, get_mem_index(s), mop);
 748         }
 749         vd += a->stride;
 750         tcg_gen_addi_i32(addr, addr, 1 << a->size);
 751
 752         /* Subsequent memory operations inherit alignment */
 753         mop &= ~MO_AMASK;
 754     }
 755     tcg_temp_free_i32(addr);
 756     tcg_temp_free_i32(tmp);
 757
 758     gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);
 759
 760     return true;
 761 }
 762
 763 static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
 764 {
 765     int vec_size = a->q ? 16 : 8;
 766     int rd_ofs = neon_full_reg_offset(a->vd);
 767     int rn_ofs = neon_full_reg_offset(a->vn);
 768     int rm_ofs = neon_full_reg_offset(a->vm);
 769
 770     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 771         return false;
 772     }
 773
 774     /* UNDEF accesses to D16-D31 if they don't exist. */
 775     if (!dc_isar_feature(aa32_simd_r32, s) &&
 776         ((a->vd | a->vn | a->vm) & 0x10)) {
 777         return false;
 778     }
 779
 780     if ((a->vn | a->vm | a->vd) & a->q) {
 781         return false;
 782     }
 783
 784     if (!vfp_access_check(s)) {
 785         return true;
 786     }
 787
 788     fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
 789     return true;
 790 }
 791
 792 #define DO_3SAME(INSN, FUNC)                                            \
 793     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 794     {                                                                   \
 795         return do_3same(s, a, FUNC);                                    \
 796     }
 797
 798 DO_3SAME(VADD, tcg_gen_gvec_add)
 799 DO_3SAME(VSUB, tcg_gen_gvec_sub)
 800 DO_3SAME(VAND, tcg_gen_gvec_and)
 801 DO_3SAME(VBIC, tcg_gen_gvec_andc)
 802 DO_3SAME(VORR, tcg_gen_gvec_or)
 803 DO_3SAME(VORN, tcg_gen_gvec_orc)
 804 DO_3SAME(VEOR, tcg_gen_gvec_xor)
 805 DO_3SAME(VSHL_S, gen_gvec_sshl)
 806 DO_3SAME(VSHL_U, gen_gvec_ushl)
 807 DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
 808 DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
 809 DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
 810 DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)
 811
 812 /* These insns are all gvec_bitsel but with the inputs in various orders. */
 813 #define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
 814     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 815                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 816                                 uint32_t oprsz, uint32_t maxsz)         \
 817     {                                                                   \
 818         tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
 819     }                                                                   \
 820     DO_3SAME(INSN, gen_##INSN##_3s)
 821
 822 DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
 823 DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
 824 DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
 825
 826 #define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
 827     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 828     {                                                                   \
 829         if (a->size == 3) {                                             \
 830             return false;                                               \
 831         }                                                               \
 832         return do_3same(s, a, FUNC);                                    \
 833     }
 834
 835 DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
 836 DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
 837 DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
 838 DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
 839 DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
 840 DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
 841 DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
 842 DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
 843 DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
 844 DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
 845 DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
 846 DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)
 847
 848 #define DO_3SAME_CMP(INSN, COND)                                        \
 849     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 850                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 851                                 uint32_t oprsz, uint32_t maxsz)         \
 852     {                                                                   \
 853         tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
 854     }                                                                   \
 855     DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)
 856
 857 DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
 858 DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
 859 DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
 860 DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
 861 DO_3SAME_CMP(VCEQ, TCG_COND_EQ)
 862
 863 #define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
 864     static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
 865                          uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
 866     {                                                                      \
 867         tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
 868     }
 869
 870 WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)
 871
 872 static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
 873 {
 874     if (a->size != 0) {
 875         return false;
 876     }
 877     return do_3same(s, a, gen_VMUL_p_3s);
 878 }
 879
 880 #define DO_VQRDMLAH(INSN, FUNC)                                         \
 881     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 882     {                                                                   \
 883         if (!dc_isar_feature(aa32_rdm, s)) {                            \
 884             return false;                                               \
 885         }                                                               \
 886         if (a->size != 1 && a->size != 2) {                             \
 887             return false;                                               \
 888         }                                                               \
 889         return do_3same(s, a, FUNC);                                    \
 890     }
 891
 892 DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
 893 DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)
 894
 895 #define DO_SHA1(NAME, FUNC)                                             \
 896     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
 897     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
 898     {                                                                   \
 899         if (!dc_isar_feature(aa32_sha1, s)) {                           \
 900             return false;                                               \
 901         }                                                               \
 902         return do_3same(s, a, gen_##NAME##_3s);                         \
 903     }
 904
 905 DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
 906 DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
 907 DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
 908 DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)
 909
 910 #define DO_SHA2(NAME, FUNC)                                             \
 911     WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
 912     static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
 913     {                                                                   \
 914         if (!dc_isar_feature(aa32_sha2, s)) {                           \
 915             return false;                                               \
 916         }                                                               \
 917         return do_3same(s, a, gen_##NAME##_3s);                         \
 918     }
 919
 920 DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
 921 DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
 922 DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)
 923
 924 #define DO_3SAME_64(INSN, FUNC)                                         \
 925     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 926                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 927                                 uint32_t oprsz, uint32_t maxsz)         \
 928     {                                                                   \
 929         static const GVecGen3 op = { .fni8 = FUNC };                    \
 930         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op);      \
 931     }                                                                   \
 932     DO_3SAME(INSN, gen_##INSN##_3s)
 933
 934 #define DO_3SAME_64_ENV(INSN, FUNC)                                     \
 935     static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)    \
 936     {                                                                   \
 937         FUNC(d, cpu_env, n, m);                                         \
 938     }                                                                   \
 939     DO_3SAME_64(INSN, gen_##INSN##_elt)
 940
/*
 * 64-bit element instantiations. VRSHL_* hand their helper straight to
 * DO_3SAME_64(); VQSHL_* and VQRSHL_* go through DO_3SAME_64_ENV() so
 * that the helper is called with cpu_env.
 */
DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)
 947
 948 #define DO_3SAME_32(INSN, FUNC)                                         \
 949     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 950                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 951                                 uint32_t oprsz, uint32_t maxsz)         \
 952     {                                                                   \
 953         static const GVecGen3 ops[4] = {                                \
 954             { .fni4 = gen_helper_neon_##FUNC##8 },                      \
 955             { .fni4 = gen_helper_neon_##FUNC##16 },                     \
 956             { .fni4 = gen_helper_neon_##FUNC##32 },                     \
 957             { 0 },                                                      \
 958         };                                                              \
 959         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
 960     }                                                                   \
 961     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 962     {                                                                   \
 963         if (a->size > 2) {                                              \
 964             return false;                                               \
 965         }                                                               \
 966         return do_3same(s, a, gen_##INSN##_3s);                         \
 967     }
 968
 969 /*
 970  * Some helper functions need to be passed the cpu_env. In order
 971  * to use those with the gvec APIs like tcg_gen_gvec_3() we need
 972  * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
 973  * and which call a NeonGenTwoOpEnvFn().
 974  */
 975 #define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
 976     static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)            \
 977     {                                                                   \
 978         FUNC(d, cpu_env, n, m);                                         \
 979     }
 980
 981 #define DO_3SAME_32_ENV(INSN, FUNC)                                     \
 982     WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8);        \
 983     WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16);      \
 984     WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32);      \
 985     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
 986                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
 987                                 uint32_t oprsz, uint32_t maxsz)         \
 988     {                                                                   \
 989         static const GVecGen3 ops[4] = {                                \
 990             { .fni4 = gen_##INSN##_tramp8 },                            \
 991             { .fni4 = gen_##INSN##_tramp16 },                           \
 992             { .fni4 = gen_##INSN##_tramp32 },                           \
 993             { 0 },                                                      \
 994         };                                                              \
 995         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
 996     }                                                                   \
 997     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
 998     {                                                                   \
 999         if (a->size > 2) {                                              \
1000             return false;                                               \
1001         }                                                               \
1002         return do_3same(s, a, gen_##INSN##_3s);                         \
1003     }
1004
/*
 * 8/16/32-bit element instantiations. The second argument is the
 * helper name stem to which the macros append 8, 16 or 32.
 */
DO_3SAME_32(VHADD_S, hadd_s)
DO_3SAME_32(VHADD_U, hadd_u)
DO_3SAME_32(VHSUB_S, hsub_s)
DO_3SAME_32(VHSUB_U, hsub_u)
DO_3SAME_32(VRHADD_S, rhadd_s)
DO_3SAME_32(VRHADD_U, rhadd_u)
DO_3SAME_32(VRSHL_S, rshl_s)
DO_3SAME_32(VRSHL_U, rshl_u)

/* The helpers for these are called with cpu_env, via trampolines. */
DO_3SAME_32_ENV(VQSHL_S, qshl_s)
DO_3SAME_32_ENV(VQSHL_U, qshl_u)
DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)
1018
1019 static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
1020 {
1021     /* Operations handled pairwise 32 bits at a time */
1022     TCGv_i32 tmp, tmp2, tmp3;
1023
1024     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1025         return false;
1026     }
1027
1028     /* UNDEF accesses to D16-D31 if they don't exist. */
1029     if (!dc_isar_feature(aa32_simd_r32, s) &&
1030         ((a->vd | a->vn | a->vm) & 0x10)) {
1031         return false;
1032     }
1033
1034     if (a->size == 3) {
1035         return false;
1036     }
1037
1038     if (!vfp_access_check(s)) {
1039         return true;
1040     }
1041
1042     assert(a->q == 0); /* enforced by decode patterns */
1043
1044     /*
1045      * Note that we have to be careful not to clobber the source operands
1046      * in the "vm == vd" case by storing the result of the first pass too
1047      * early. Since Q is 0 there are always just two passes, so instead
1048      * of a complicated loop over each pass we just unroll.
1049      */
1050     tmp = tcg_temp_new_i32();
1051     tmp2 = tcg_temp_new_i32();
1052     tmp3 = tcg_temp_new_i32();
1053
1054     read_neon_element32(tmp, a->vn, 0, MO_32);
1055     read_neon_element32(tmp2, a->vn, 1, MO_32);
1056     fn(tmp, tmp, tmp2);
1057
1058     read_neon_element32(tmp3, a->vm, 0, MO_32);
1059     read_neon_element32(tmp2, a->vm, 1, MO_32);
1060     fn(tmp3, tmp3, tmp2);
1061
1062     write_neon_element32(tmp, a->vd, 0, MO_32);
1063     write_neon_element32(tmp3, a->vd, 1, MO_32);
1064
1065     tcg_temp_free_i32(tmp);
1066     tcg_temp_free_i32(tmp2);
1067     tcg_temp_free_i32(tmp3);
1068     return true;
1069 }
1070
1071 #define DO_3SAME_PAIR(INSN, func)                                       \
1072     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
1073     {                                                                   \
1074         static NeonGenTwoOpFn * const fns[] = {                         \
1075             gen_helper_neon_##func##8,                                  \
1076             gen_helper_neon_##func##16,                                 \
1077             gen_helper_neon_##func##32,                                 \
1078         };                                                              \
1079         if (a->size > 2) {                                              \
1080             return false;                                               \
1081         }                                                               \
1082         return do_3same_pair(s, a, fns[a->size]);                       \
1083     }
1084
1085 /* 32-bit pairwise ops end up the same as the elementwise versions.  */
1086 #define gen_helper_neon_pmax_s32  tcg_gen_smax_i32
1087 #define gen_helper_neon_pmax_u32  tcg_gen_umax_i32
1088 #define gen_helper_neon_pmin_s32  tcg_gen_smin_i32
1089 #define gen_helper_neon_pmin_u32  tcg_gen_umin_i32
1090 #define gen_helper_neon_padd_u32  tcg_gen_add_i32
1091
1092 DO_3SAME_PAIR(VPMAX_S, pmax_s)
1093 DO_3SAME_PAIR(VPMIN_S, pmin_s)
1094 DO_3SAME_PAIR(VPMAX_U, pmax_u)
1095 DO_3SAME_PAIR(VPMIN_U, pmin_u)
1096 DO_3SAME_PAIR(VPADD, padd_u)
1097
1098 #define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
1099     WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16);    \
1100     WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32);    \
1101     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
1102                                 uint32_t rn_ofs, uint32_t rm_ofs,       \
1103                                 uint32_t oprsz, uint32_t maxsz)         \
1104     {                                                                   \
1105         static const GVecGen3 ops[2] = {                                \
1106             { .fni4 = gen_##INSN##_tramp16 },                           \
1107             { .fni4 = gen_##INSN##_tramp32 },                           \
1108         };                                                              \
1109         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
1110     }                                                                   \
1111     static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
1112     {                                                                   \
1113         if (a->size != 1 && a->size != 2) {                             \
1114             return false;                                               \
1115         }                                                               \
1116         return do_3same(s, a, gen_##INSN##_3s);                         \
1117     }
1118
1119 DO_3SAME_VQDMULH(VQDMULH, qdmulh)
1120 DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)
1121
/*
 * Define WRAPNAME as a function which expands a three-operand FP
 * vector operation with tcg_gen_gvec_3_ptr(), passing the out-of-line
 * helper FUNC a pointer to the float status selected by FPST. The
 * vece argument of the generated function is not used.
 */
#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC)                              \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rn_ofs, uint32_t rm_ofs,              \
                         uint32_t oprsz, uint32_t maxsz)                \
    {                                                                   \
        TCGv_ptr fp_status = fpstatus_ptr(FPST);                        \
                                                                        \
        tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fp_status,           \
                           oprsz, maxsz, 0, FUNC);                      \
        tcg_temp_free_ptr(fp_status);                                   \
    }
1132
1133 #define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC)                                 \
1134     WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC)                 \
1135     WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC)             \
1136     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
1137     {                                                                   \
1138         if (a->size == MO_16) {                                         \
1139             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
1140                 return false;                                           \
1141             }                                                           \
1142             return do_3same(s, a, gen_##INSN##_fp16_3s);                \
1143         }                                                               \
1144         return do_3same(s, a, gen_##INSN##_fp32_3s);                    \
1145     }
1146
1147
1148 DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
1149 DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
1150 DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
1151 DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
1152 DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
1153 DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
1154 DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
1155 DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
1156 DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
1157 DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
1158 DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
1159 DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
1160 DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
1161 DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
1162 DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
1163 DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
1164 DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)
1165
/*
 * gvec expanders for VMAXNM and VMINNM, in fp32 and fp16 flavours.
 * They are instantiated directly with WRAP_FP_GVEC() because the
 * trans functions for these insns are written out by hand (they
 * carry an ARM_FEATURE_V8 check).
 */
WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)
1170
1171 static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
1172 {
1173     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1174         return false;
1175     }
1176
1177     if (a->size == MO_16) {
1178         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1179             return false;
1180         }
1181         return do_3same(s, a, gen_VMAXNM_fp16_3s);
1182     }
1183     return do_3same(s, a, gen_VMAXNM_fp32_3s);
1184 }
1185
1186 static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
1187 {
1188     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1189         return false;
1190     }
1191
1192     if (a->size == MO_16) {
1193         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1194             return false;
1195         }
1196         return do_3same(s, a, gen_VMINNM_fp16_3s);
1197     }
1198     return do_3same(s, a, gen_VMINNM_fp32_3s);
1199 }
1200
1201 static bool do_3same_fp_pair(DisasContext *s, arg_3same *a,
1202                              gen_helper_gvec_3_ptr *fn)
1203 {
1204     /* FP pairwise operations */
1205     TCGv_ptr fpstatus;
1206
1207     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1208         return false;
1209     }
1210
1211     /* UNDEF accesses to D16-D31 if they don't exist. */
1212     if (!dc_isar_feature(aa32_simd_r32, s) &&
1213         ((a->vd | a->vn | a->vm) & 0x10)) {
1214         return false;
1215     }
1216
1217     if (!vfp_access_check(s)) {
1218         return true;
1219     }
1220
1221     assert(a->q == 0); /* enforced by decode patterns */
1222
1223
1224     fpstatus = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1225     tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
1226                        vfp_reg_offset(1, a->vn),
1227                        vfp_reg_offset(1, a->vm),
1228                        fpstatus, 8, 8, 0, fn);
1229     tcg_temp_free_ptr(fpstatus);
1230
1231     return true;
1232 }
1233
1234 /*
1235  * For all the functions using this macro, size == 1 means fp16,
1236  * which is an architecture extension we don't implement yet.
1237  */
1238 #define DO_3S_FP_PAIR(INSN,FUNC)                                    \
1239     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
1240     {                                                               \
1241         if (a->size == MO_16) {                                     \
1242             if (!dc_isar_feature(aa32_fp16_arith, s)) {             \
1243                 return false;                                       \
1244             }                                                       \
1245             return do_3same_fp_pair(s, a, FUNC##h);                 \
1246         }                                                           \
1247         return do_3same_fp_pair(s, a, FUNC##s);                     \
1248     }
1249
1250 DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd)
1251 DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax)
1252 DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin)
1253
1254 static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
1255 {
1256     /* Handle a 2-reg-shift insn which can be vectorized. */
1257     int vec_size = a->q ? 16 : 8;
1258     int rd_ofs = neon_full_reg_offset(a->vd);
1259     int rm_ofs = neon_full_reg_offset(a->vm);
1260
1261     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1262         return false;
1263     }
1264
1265     /* UNDEF accesses to D16-D31 if they don't exist. */
1266     if (!dc_isar_feature(aa32_simd_r32, s) &&
1267         ((a->vd | a->vm) & 0x10)) {
1268         return false;
1269     }
1270
1271     if ((a->vm | a->vd) & a->q) {
1272         return false;
1273     }
1274
1275     if (!vfp_access_check(s)) {
1276         return true;
1277     }
1278
1279     fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
1280     return true;
1281 }
1282
1283 #define DO_2SH(INSN, FUNC)                                              \
1284     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1285     {                                                                   \
1286         return do_vector_2sh(s, a, FUNC);                               \
1287     }                                                                   \
1288
1289 DO_2SH(VSHL, tcg_gen_gvec_shli)
1290 DO_2SH(VSLI, gen_gvec_sli)
1291 DO_2SH(VSRI, gen_gvec_sri)
1292 DO_2SH(VSRA_S, gen_gvec_ssra)
1293 DO_2SH(VSRA_U, gen_gvec_usra)
1294 DO_2SH(VRSHR_S, gen_gvec_srshr)
1295 DO_2SH(VRSHR_U, gen_gvec_urshr)
1296 DO_2SH(VRSRA_S, gen_gvec_srsra)
1297 DO_2SH(VRSRA_U, gen_gvec_ursra)
1298
1299 static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
1300 {
1301     /* Signed shift out of range results in all-sign-bits */
1302     a->shift = MIN(a->shift, (8 << a->size) - 1);
1303     return do_vector_2sh(s, a, tcg_gen_gvec_sari);
1304 }
1305
/*
 * Expander that ignores its source operand and shift count and just
 * fills the destination at @rd_ofs with the immediate 0, using
 * tcg_gen_gvec_dup_imm().
 */
static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                            int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
}
1311
1312 static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
1313 {
1314     /* Shift out of range is architecturally valid and results in zero. */
1315     if (a->shift >= (8 << a->size)) {
1316         return do_vector_2sh(s, a, gen_zero_rd_2sh);
1317     } else {
1318         return do_vector_2sh(s, a, tcg_gen_gvec_shri);
1319     }
1320 }
1321
1322 static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
1323                              NeonGenTwo64OpEnvFn *fn)
1324 {
1325     /*
1326      * 2-reg-and-shift operations, size == 3 case, where the
1327      * function needs to be passed cpu_env.
1328      */
1329     TCGv_i64 constimm;
1330     int pass;
1331
1332     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1333         return false;
1334     }
1335
1336     /* UNDEF accesses to D16-D31 if they don't exist. */
1337     if (!dc_isar_feature(aa32_simd_r32, s) &&
1338         ((a->vd | a->vm) & 0x10)) {
1339         return false;
1340     }
1341
1342     if ((a->vm | a->vd) & a->q) {
1343         return false;
1344     }
1345
1346     if (!vfp_access_check(s)) {
1347         return true;
1348     }
1349
1350     /*
1351      * To avoid excessive duplication of ops we implement shift
1352      * by immediate using the variable shift operations.
1353      */
1354     constimm = tcg_const_i64(dup_const(a->size, a->shift));
1355
1356     for (pass = 0; pass < a->q + 1; pass++) {
1357         TCGv_i64 tmp = tcg_temp_new_i64();
1358
1359         read_neon_element64(tmp, a->vm, pass, MO_64);
1360         fn(tmp, cpu_env, tmp, constimm);
1361         write_neon_element64(tmp, a->vd, pass, MO_64);
1362         tcg_temp_free_i64(tmp);
1363     }
1364     tcg_temp_free_i64(constimm);
1365     return true;
1366 }
1367
1368 static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
1369                              NeonGenTwoOpEnvFn *fn)
1370 {
1371     /*
1372      * 2-reg-and-shift operations, size < 3 case, where the
1373      * helper needs to be passed cpu_env.
1374      */
1375     TCGv_i32 constimm, tmp;
1376     int pass;
1377
1378     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1379         return false;
1380     }
1381
1382     /* UNDEF accesses to D16-D31 if they don't exist. */
1383     if (!dc_isar_feature(aa32_simd_r32, s) &&
1384         ((a->vd | a->vm) & 0x10)) {
1385         return false;
1386     }
1387
1388     if ((a->vm | a->vd) & a->q) {
1389         return false;
1390     }
1391
1392     if (!vfp_access_check(s)) {
1393         return true;
1394     }
1395
1396     /*
1397      * To avoid excessive duplication of ops we implement shift
1398      * by immediate using the variable shift operations.
1399      */
1400     constimm = tcg_const_i32(dup_const(a->size, a->shift));
1401     tmp = tcg_temp_new_i32();
1402
1403     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1404         read_neon_element32(tmp, a->vm, pass, MO_32);
1405         fn(tmp, cpu_env, tmp, constimm);
1406         write_neon_element32(tmp, a->vd, pass, MO_32);
1407     }
1408     tcg_temp_free_i32(tmp);
1409     tcg_temp_free_i32(constimm);
1410     return true;
1411 }
1412
1413 #define DO_2SHIFT_ENV(INSN, FUNC)                                       \
1414     static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
1415     {                                                                   \
1416         return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
1417     }                                                                   \
1418     static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1419     {                                                                   \
1420         static NeonGenTwoOpEnvFn * const fns[] = {                      \
1421             gen_helper_neon_##FUNC##8,                                  \
1422             gen_helper_neon_##FUNC##16,                                 \
1423             gen_helper_neon_##FUNC##32,                                 \
1424         };                                                              \
1425         assert(a->size < ARRAY_SIZE(fns));                              \
1426         return do_2shift_env_32(s, a, fns[a->size]);                    \
1427     }
1428
1429 DO_2SHIFT_ENV(VQSHLU, qshlu_s)
1430 DO_2SHIFT_ENV(VQSHL_U, qshl_u)
1431 DO_2SHIFT_ENV(VQSHL_S, qshl_s)
1432
1433 static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
1434                                 NeonGenTwo64OpFn *shiftfn,
1435                                 NeonGenNarrowEnvFn *narrowfn)
1436 {
1437     /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
1438     TCGv_i64 constimm, rm1, rm2;
1439     TCGv_i32 rd;
1440
1441     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1442         return false;
1443     }
1444
1445     /* UNDEF accesses to D16-D31 if they don't exist. */
1446     if (!dc_isar_feature(aa32_simd_r32, s) &&
1447         ((a->vd | a->vm) & 0x10)) {
1448         return false;
1449     }
1450
1451     if (a->vm & 1) {
1452         return false;
1453     }
1454
1455     if (!vfp_access_check(s)) {
1456         return true;
1457     }
1458
1459     /*
1460      * This is always a right shift, and the shiftfn is always a
1461      * left-shift helper, which thus needs the negated shift count.
1462      */
1463     constimm = tcg_const_i64(-a->shift);
1464     rm1 = tcg_temp_new_i64();
1465     rm2 = tcg_temp_new_i64();
1466     rd = tcg_temp_new_i32();
1467
1468     /* Load both inputs first to avoid potential overwrite if rm == rd */
1469     read_neon_element64(rm1, a->vm, 0, MO_64);
1470     read_neon_element64(rm2, a->vm, 1, MO_64);
1471
1472     shiftfn(rm1, rm1, constimm);
1473     narrowfn(rd, cpu_env, rm1);
1474     write_neon_element32(rd, a->vd, 0, MO_32);
1475
1476     shiftfn(rm2, rm2, constimm);
1477     narrowfn(rd, cpu_env, rm2);
1478     write_neon_element32(rd, a->vd, 1, MO_32);
1479
1480     tcg_temp_free_i32(rd);
1481     tcg_temp_free_i64(rm1);
1482     tcg_temp_free_i64(rm2);
1483     tcg_temp_free_i64(constimm);
1484
1485     return true;
1486 }
1487
1488 static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
1489                                 NeonGenTwoOpFn *shiftfn,
1490                                 NeonGenNarrowEnvFn *narrowfn)
1491 {
1492     /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
1493     TCGv_i32 constimm, rm1, rm2, rm3, rm4;
1494     TCGv_i64 rtmp;
1495     uint32_t imm;
1496
1497     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1498         return false;
1499     }
1500
1501     /* UNDEF accesses to D16-D31 if they don't exist. */
1502     if (!dc_isar_feature(aa32_simd_r32, s) &&
1503         ((a->vd | a->vm) & 0x10)) {
1504         return false;
1505     }
1506
1507     if (a->vm & 1) {
1508         return false;
1509     }
1510
1511     if (!vfp_access_check(s)) {
1512         return true;
1513     }
1514
1515     /*
1516      * This is always a right shift, and the shiftfn is always a
1517      * left-shift helper, which thus needs the negated shift count
1518      * duplicated into each lane of the immediate value.
1519      */
1520     if (a->size == 1) {
1521         imm = (uint16_t)(-a->shift);
1522         imm |= imm << 16;
1523     } else {
1524         /* size == 2 */
1525         imm = -a->shift;
1526     }
1527     constimm = tcg_const_i32(imm);
1528
1529     /* Load all inputs first to avoid potential overwrite */
1530     rm1 = tcg_temp_new_i32();
1531     rm2 = tcg_temp_new_i32();
1532     rm3 = tcg_temp_new_i32();
1533     rm4 = tcg_temp_new_i32();
1534     read_neon_element32(rm1, a->vm, 0, MO_32);
1535     read_neon_element32(rm2, a->vm, 1, MO_32);
1536     read_neon_element32(rm3, a->vm, 2, MO_32);
1537     read_neon_element32(rm4, a->vm, 3, MO_32);
1538     rtmp = tcg_temp_new_i64();
1539
1540     shiftfn(rm1, rm1, constimm);
1541     shiftfn(rm2, rm2, constimm);
1542
1543     tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
1544     tcg_temp_free_i32(rm2);
1545
1546     narrowfn(rm1, cpu_env, rtmp);
1547     write_neon_element32(rm1, a->vd, 0, MO_32);
1548     tcg_temp_free_i32(rm1);
1549
1550     shiftfn(rm3, rm3, constimm);
1551     shiftfn(rm4, rm4, constimm);
1552     tcg_temp_free_i32(constimm);
1553
1554     tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
1555     tcg_temp_free_i32(rm4);
1556
1557     narrowfn(rm3, cpu_env, rtmp);
1558     tcg_temp_free_i64(rtmp);
1559     write_neon_element32(rm3, a->vd, 1, MO_32);
1560     tcg_temp_free_i32(rm3);
1561     return true;
1562 }
1563
/*
 * Generate trans_<INSN>_2sh() for a narrowing shift with 64-bit source
 * elements: forwards to do_2shift_narrow_64() with FUNC as the shift
 * function and NARROWFUNC as the narrowing function.
 */
#define DO_2SN_64(INSN, FUNC, NARROWFUNC)                               \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC);             \
    }
/*
 * Generate trans_<INSN>_2sh() for a narrowing shift handled 32 bits at
 * a time: forwards to do_2shift_narrow_32() with FUNC as the shift
 * function and NARROWFUNC as the narrowing function.
 */
#define DO_2SN_32(INSN, FUNC, NARROWFUNC)                               \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);             \
    }
1574
/*
 * Narrow @src to 32 bits with tcg_gen_extrl_i64_i32(). @env is not
 * used; the parameter exists so that this function has the same
 * signature as the other narrowing functions passed to DO_2SN_64().
 */
static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    tcg_gen_extrl_i64_i32(dest, src);
}
1579
/*
 * Wrapper around gen_helper_neon_narrow_u16() which accepts and
 * ignores an @env argument, so that it has the same signature as the
 * other narrowing functions passed to DO_2SN_32().
 */
static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    gen_helper_neon_narrow_u16(dest, src);
}
1584
/*
 * Wrapper around gen_helper_neon_narrow_u8() which accepts and
 * ignores an @env argument, so that it has the same signature as the
 * other narrowing functions passed to DO_2SN_32().
 */
static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    gen_helper_neon_narrow_u8(dest, src);
}
1589
/*
 * Narrowing-shift translators. Each line pairs a left-shift function
 * (which do_2shift_narrow_64/32 call with the negated shift count)
 * with the function used to narrow the shifted value.
 */
DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)

DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)

DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)

DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)

DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)

DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)

DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1620
1621 static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
1622                          NeonGenWidenFn *widenfn, bool u)
1623 {
1624     TCGv_i64 tmp;
1625     TCGv_i32 rm0, rm1;
1626     uint64_t widen_mask = 0;
1627
1628     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1629         return false;
1630     }
1631
1632     /* UNDEF accesses to D16-D31 if they don't exist. */
1633     if (!dc_isar_feature(aa32_simd_r32, s) &&
1634         ((a->vd | a->vm) & 0x10)) {
1635         return false;
1636     }
1637
1638     if (a->vd & 1) {
1639         return false;
1640     }
1641
1642     if (!vfp_access_check(s)) {
1643         return true;
1644     }
1645
1646     /*
1647      * This is a widen-and-shift operation. The shift is always less
1648      * than the width of the source type, so after widening the input
1649      * vector we can simply shift the whole 64-bit widened register,
1650      * and then clear the potential overflow bits resulting from left
1651      * bits of the narrow input appearing as right bits of the left
1652      * neighbour narrow input. Calculate a mask of bits to clear.
1653      */
1654     if ((a->shift != 0) && (a->size < 2 || u)) {
1655         int esize = 8 << a->size;
1656         widen_mask = MAKE_64BIT_MASK(0, esize);
1657         widen_mask >>= esize - a->shift;
1658         widen_mask = dup_const(a->size + 1, widen_mask);
1659     }
1660
1661     rm0 = tcg_temp_new_i32();
1662     rm1 = tcg_temp_new_i32();
1663     read_neon_element32(rm0, a->vm, 0, MO_32);
1664     read_neon_element32(rm1, a->vm, 1, MO_32);
1665     tmp = tcg_temp_new_i64();
1666
1667     widenfn(tmp, rm0);
1668     tcg_temp_free_i32(rm0);
1669     if (a->shift != 0) {
1670         tcg_gen_shli_i64(tmp, tmp, a->shift);
1671         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1672     }
1673     write_neon_element64(tmp, a->vd, 0, MO_64);
1674
1675     widenfn(tmp, rm1);
1676     tcg_temp_free_i32(rm1);
1677     if (a->shift != 0) {
1678         tcg_gen_shli_i64(tmp, tmp, a->shift);
1679         tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1680     }
1681     write_neon_element64(tmp, a->vd, 1, MO_64);
1682     tcg_temp_free_i64(tmp);
1683     return true;
1684 }
1685
1686 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1687 {
1688     static NeonGenWidenFn * const widenfn[] = {
1689         gen_helper_neon_widen_s8,
1690         gen_helper_neon_widen_s16,
1691         tcg_gen_ext_i32_i64,
1692     };
1693     return do_vshll_2sh(s, a, widenfn[a->size], false);
1694 }
1695
1696 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1697 {
1698     static NeonGenWidenFn * const widenfn[] = {
1699         gen_helper_neon_widen_u8,
1700         gen_helper_neon_widen_u16,
1701         tcg_gen_extu_i32_i64,
1702     };
1703     return do_vshll_2sh(s, a, widenfn[a->size], true);
1704 }
1705
1706 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1707                       gen_helper_gvec_2_ptr *fn)
1708 {
1709     /* FP operations in 2-reg-and-shift group */
1710     int vec_size = a->q ? 16 : 8;
1711     int rd_ofs = neon_full_reg_offset(a->vd);
1712     int rm_ofs = neon_full_reg_offset(a->vm);
1713     TCGv_ptr fpst;
1714
1715     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1716         return false;
1717     }
1718
1719     if (a->size == MO_16) {
1720         if (!dc_isar_feature(aa32_fp16_arith, s)) {
1721             return false;
1722         }
1723     }
1724
1725     /* UNDEF accesses to D16-D31 if they don't exist. */
1726     if (!dc_isar_feature(aa32_simd_r32, s) &&
1727         ((a->vd | a->vm) & 0x10)) {
1728         return false;
1729     }
1730
1731     if ((a->vm | a->vd) & a->q) {
1732         return false;
1733     }
1734
1735     if (!vfp_access_check(s)) {
1736         return true;
1737     }
1738
1739     fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1740     tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
1741     tcg_temp_free_ptr(fpst);
1742     return true;
1743 }
1744
/*
 * Define trans_<INSN>_2sh() as a thin wrapper that passes the gvec
 * helper FUNC to do_fp_2sh().
 */
#define DO_FP_2SH(INSN, FUNC)                                           \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_fp_2sh(s, a, FUNC);                                   \
    }

/* VCVT variants using the vcvt_sf/uf/fs/fu helpers */
DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)

/* VCVT variants using the vcvt_sh/uh/hs/hu helpers */
DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
1760
1761 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1762                         GVecGen2iFn *fn)
1763 {
1764     uint64_t imm;
1765     int reg_ofs, vec_size;
1766
1767     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1768         return false;
1769     }
1770
1771     /* UNDEF accesses to D16-D31 if they don't exist. */
1772     if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1773         return false;
1774     }
1775
1776     if (a->vd & a->q) {
1777         return false;
1778     }
1779
1780     if (!vfp_access_check(s)) {
1781         return true;
1782     }
1783
1784     reg_ofs = neon_full_reg_offset(a->vd);
1785     vec_size = a->q ? 16 : 8;
1786     imm = asimd_imm_const(a->imm, a->cmode, a->op);
1787
1788     fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1789     return true;
1790 }
1791
/*
 * Fill the vector at @dofs with the 64-bit constant @c, using
 * tcg_gen_gvec_dup_imm() with a fixed MO_64 element size.
 * @vece and @aofs are ignored.
 */
static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
                        int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
}
1797
1798 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1799 {
1800     /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1801     GVecGen2iFn *fn;
1802
1803     if ((a->cmode & 1) && a->cmode < 12) {
1804         /* for op=1, the imm will be inverted, so BIC becomes AND. */
1805         fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1806     } else {
1807         /* There is one unallocated cmode/op combination in this space */
1808         if (a->cmode == 15 && a->op == 1) {
1809             return false;
1810         }
1811         fn = gen_VMOV_1r;
1812     }
1813     return do_1reg_imm(s, a, fn);
1814 }
1815
1816 static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1817                            NeonGenWidenFn *widenfn,
1818                            NeonGenTwo64OpFn *opfn,
1819                            int src1_mop, int src2_mop)
1820 {
1821     /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VAADW/VSUBW) */
1822     TCGv_i64 rn0_64, rn1_64, rm_64;
1823
1824     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1825         return false;
1826     }
1827
1828     /* UNDEF accesses to D16-D31 if they don't exist. */
1829     if (!dc_isar_feature(aa32_simd_r32, s) &&
1830         ((a->vd | a->vn | a->vm) & 0x10)) {
1831         return false;
1832     }
1833
1834     if (!opfn) {
1835         /* size == 3 case, which is an entirely different insn group */
1836         return false;
1837     }
1838
1839     if ((a->vd & 1) || (src1_mop == MO_Q && (a->vn & 1))) {
1840         return false;
1841     }
1842
1843     if (!vfp_access_check(s)) {
1844         return true;
1845     }
1846
1847     rn0_64 = tcg_temp_new_i64();
1848     rn1_64 = tcg_temp_new_i64();
1849     rm_64 = tcg_temp_new_i64();
1850
1851     if (src1_mop >= 0) {
1852         read_neon_element64(rn0_64, a->vn, 0, src1_mop);
1853     } else {
1854         TCGv_i32 tmp = tcg_temp_new_i32();
1855         read_neon_element32(tmp, a->vn, 0, MO_32);
1856         widenfn(rn0_64, tmp);
1857         tcg_temp_free_i32(tmp);
1858     }
1859     if (src2_mop >= 0) {
1860         read_neon_element64(rm_64, a->vm, 0, src2_mop);
1861     } else {
1862         TCGv_i32 tmp = tcg_temp_new_i32();
1863         read_neon_element32(tmp, a->vm, 0, MO_32);
1864         widenfn(rm_64, tmp);
1865         tcg_temp_free_i32(tmp);
1866     }
1867
1868     opfn(rn0_64, rn0_64, rm_64);
1869
1870     /*
1871      * Load second pass inputs before storing the first pass result, to
1872      * avoid incorrect results if a narrow input overlaps with the result.
1873      */
1874     if (src1_mop >= 0) {
1875         read_neon_element64(rn1_64, a->vn, 1, src1_mop);
1876     } else {
1877         TCGv_i32 tmp = tcg_temp_new_i32();
1878         read_neon_element32(tmp, a->vn, 1, MO_32);
1879         widenfn(rn1_64, tmp);
1880         tcg_temp_free_i32(tmp);
1881     }
1882     if (src2_mop >= 0) {
1883         read_neon_element64(rm_64, a->vm, 1, src2_mop);
1884     } else {
1885         TCGv_i32 tmp = tcg_temp_new_i32();
1886         read_neon_element32(tmp, a->vm, 1, MO_32);
1887         widenfn(rm_64, tmp);
1888         tcg_temp_free_i32(tmp);
1889     }
1890
1891     write_neon_element64(rn0_64, a->vd, 0, MO_64);
1892
1893     opfn(rn1_64, rn1_64, rm_64);
1894     write_neon_element64(rn1_64, a->vd, 1, MO_64);
1895
1896     tcg_temp_free_i64(rn0_64);
1897     tcg_temp_free_i64(rn1_64);
1898     tcg_temp_free_i64(rm_64);
1899
1900     return true;
1901 }
1902
1903 #define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN)                        \
1904     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1905     {                                                                   \
1906         static NeonGenWidenFn * const widenfn[] = {                     \
1907             gen_helper_neon_widen_##S##8,                               \
1908             gen_helper_neon_widen_##S##16,                              \
1909             NULL, NULL,                                                 \
1910         };                                                              \
1911         static NeonGenTwo64OpFn * const addfn[] = {                     \
1912             gen_helper_neon_##OP##l_u16,                                \
1913             gen_helper_neon_##OP##l_u32,                                \
1914             tcg_gen_##OP##_i64,                                         \
1915             NULL,                                                       \
1916         };                                                              \
1917         int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1;          \
1918         return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size],   \
1919                               SRC1WIDE ? MO_Q : narrow_mop,             \
1920                               narrow_mop);                              \
1921     }
1922
1923 DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN)
1924 DO_PREWIDEN(VADDL_U, u, add, false, 0)
1925 DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN)
1926 DO_PREWIDEN(VSUBL_U, u, sub, false, 0)
1927 DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN)
1928 DO_PREWIDEN(VADDW_U, u, add, true, 0)
1929 DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN)
1930 DO_PREWIDEN(VSUBW_U, u, sub, true, 0)
1931
1932 static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
1933                          NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
1934 {
1935     /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
1936     TCGv_i64 rn_64, rm_64;
1937     TCGv_i32 rd0, rd1;
1938
1939     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1940         return false;
1941     }
1942
1943     /* UNDEF accesses to D16-D31 if they don't exist. */
1944     if (!dc_isar_feature(aa32_simd_r32, s) &&
1945         ((a->vd | a->vn | a->vm) & 0x10)) {
1946         return false;
1947     }
1948
1949     if (!opfn || !narrowfn) {
1950         /* size == 3 case, which is an entirely different insn group */
1951         return false;
1952     }
1953
1954     if ((a->vn | a->vm) & 1) {
1955         return false;
1956     }
1957
1958     if (!vfp_access_check(s)) {
1959         return true;
1960     }
1961
1962     rn_64 = tcg_temp_new_i64();
1963     rm_64 = tcg_temp_new_i64();
1964     rd0 = tcg_temp_new_i32();
1965     rd1 = tcg_temp_new_i32();
1966
1967     read_neon_element64(rn_64, a->vn, 0, MO_64);
1968     read_neon_element64(rm_64, a->vm, 0, MO_64);
1969
1970     opfn(rn_64, rn_64, rm_64);
1971
1972     narrowfn(rd0, rn_64);
1973
1974     read_neon_element64(rn_64, a->vn, 1, MO_64);
1975     read_neon_element64(rm_64, a->vm, 1, MO_64);
1976
1977     opfn(rn_64, rn_64, rm_64);
1978
1979     narrowfn(rd1, rn_64);
1980
1981     write_neon_element32(rd0, a->vd, 0, MO_32);
1982     write_neon_element32(rd1, a->vd, 1, MO_32);
1983
1984     tcg_temp_free_i32(rd0);
1985     tcg_temp_free_i32(rd1);
1986     tcg_temp_free_i64(rn_64);
1987     tcg_temp_free_i64(rm_64);
1988
1989     return true;
1990 }
1991
/*
 * Define trans_<INSN>_3d() on top of do_narrow_3d().
 *
 * OP names the add/sub operation, NARROWTYPE selects the family of
 * *_high_u8/u16 narrowing helpers, and EXTOP is the narrowing function
 * used for a->size == 2. Size 3 has NULL entries in both tables.
 */
#define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
    static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
    {                                                                   \
        /* 64-bit wide operation, indexed by a->size */                 \
        static NeonGenTwo64OpFn * const op_tbl[] = {                    \
            gen_helper_neon_##OP##l_u16,                                \
            gen_helper_neon_##OP##l_u32,                                \
            tcg_gen_##OP##_i64,                                         \
            NULL,                                                       \
        };                                                              \
        /* 64-bit to 32-bit narrowing step, indexed by a->size */       \
        static NeonGenNarrowFn * const narrow_tbl[] = {                 \
            gen_helper_neon_##NARROWTYPE##_high_u8,                     \
            gen_helper_neon_##NARROWTYPE##_high_u16,                    \
            EXTOP,                                                      \
            NULL,                                                       \
        };                                                              \
        NeonGenTwo64OpFn *wide_op = op_tbl[a->size];                    \
        NeonGenNarrowFn *narrow = narrow_tbl[a->size];                  \
                                                                        \
        return do_narrow_3d(s, a, wide_op, narrow);                     \
    }
2009
2010 static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
2011 {
2012     tcg_gen_addi_i64(rn, rn, 1u << 31);
2013     tcg_gen_extrh_i64_i32(rd, rn);
2014 }
2015
/*
 * The plain forms take the 64->32 bit result with tcg_gen_extrh_i64_i32;
 * the "R" forms use gen_narrow_round_high_u32 instead.
 */
DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
2020
2021 static bool do_long_3d(DisasContext *s, arg_3diff *a,
2022                        NeonGenTwoOpWidenFn *opfn,
2023                        NeonGenTwo64OpFn *accfn)
2024 {
2025     /*
2026      * 3-regs different lengths, long operations.
2027      * These perform an operation on two inputs that returns a double-width
2028      * result, and then possibly perform an accumulation operation of
2029      * that result into the double-width destination.
2030      */
2031     TCGv_i64 rd0, rd1, tmp;
2032     TCGv_i32 rn, rm;
2033
2034     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2035         return false;
2036     }
2037
2038     /* UNDEF accesses to D16-D31 if they don't exist. */
2039     if (!dc_isar_feature(aa32_simd_r32, s) &&
2040         ((a->vd | a->vn | a->vm) & 0x10)) {
2041         return false;
2042     }
2043
2044     if (!opfn) {
2045         /* size == 3 case, which is an entirely different insn group */
2046         return false;
2047     }
2048
2049     if (a->vd & 1) {
2050         return false;
2051     }
2052
2053     if (!vfp_access_check(s)) {
2054         return true;
2055     }
2056
2057     rd0 = tcg_temp_new_i64();
2058     rd1 = tcg_temp_new_i64();
2059
2060     rn = tcg_temp_new_i32();
2061     rm = tcg_temp_new_i32();
2062     read_neon_element32(rn, a->vn, 0, MO_32);
2063     read_neon_element32(rm, a->vm, 0, MO_32);
2064     opfn(rd0, rn, rm);
2065
2066     read_neon_element32(rn, a->vn, 1, MO_32);
2067     read_neon_element32(rm, a->vm, 1, MO_32);
2068     opfn(rd1, rn, rm);
2069     tcg_temp_free_i32(rn);
2070     tcg_temp_free_i32(rm);
2071
2072     /* Don't store results until after all loads: they might overlap */
2073     if (accfn) {
2074         tmp = tcg_temp_new_i64();
2075         read_neon_element64(tmp, a->vd, 0, MO_64);
2076         accfn(rd0, tmp, rd0);
2077         read_neon_element64(tmp, a->vd, 1, MO_64);
2078         accfn(rd1, tmp, rd1);
2079         tcg_temp_free_i64(tmp);
2080     }
2081
2082     write_neon_element64(rd0, a->vd, 0, MO_64);
2083     write_neon_element64(rd1, a->vd, 1, MO_64);
2084     tcg_temp_free_i64(rd0);
2085     tcg_temp_free_i64(rd1);
2086
2087     return true;
2088 }
2089
2090 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2091 {
2092     static NeonGenTwoOpWidenFn * const opfn[] = {
2093         gen_helper_neon_abdl_s16,
2094         gen_helper_neon_abdl_s32,
2095         gen_helper_neon_abdl_s64,
2096         NULL,
2097     };
2098
2099     return do_long_3d(s, a, opfn[a->size], NULL);
2100 }
2101
2102 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2103 {
2104     static NeonGenTwoOpWidenFn * const opfn[] = {
2105         gen_helper_neon_abdl_u16,
2106         gen_helper_neon_abdl_u32,
2107         gen_helper_neon_abdl_u64,
2108         NULL,
2109     };
2110
2111     return do_long_3d(s, a, opfn[a->size], NULL);
2112 }
2113
2114 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2115 {
2116     static NeonGenTwoOpWidenFn * const opfn[] = {
2117         gen_helper_neon_abdl_s16,
2118         gen_helper_neon_abdl_s32,
2119         gen_helper_neon_abdl_s64,
2120         NULL,
2121     };
2122     static NeonGenTwo64OpFn * const addfn[] = {
2123         gen_helper_neon_addl_u16,
2124         gen_helper_neon_addl_u32,
2125         tcg_gen_add_i64,
2126         NULL,
2127     };
2128
2129     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2130 }
2131
2132 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2133 {
2134     static NeonGenTwoOpWidenFn * const opfn[] = {
2135         gen_helper_neon_abdl_u16,
2136         gen_helper_neon_abdl_u32,
2137         gen_helper_neon_abdl_u64,
2138         NULL,
2139     };
2140     static NeonGenTwo64OpFn * const addfn[] = {
2141         gen_helper_neon_addl_u16,
2142         gen_helper_neon_addl_u32,
2143         tcg_gen_add_i64,
2144         NULL,
2145     };
2146
2147     return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2148 }
2149
2150 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2151 {
2152     TCGv_i32 lo = tcg_temp_new_i32();
2153     TCGv_i32 hi = tcg_temp_new_i32();
2154
2155     tcg_gen_muls2_i32(lo, hi, rn, rm);
2156     tcg_gen_concat_i32_i64(rd, lo, hi);
2157
2158     tcg_temp_free_i32(lo);
2159     tcg_temp_free_i32(hi);
2160 }
2161
2162 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2163 {
2164     TCGv_i32 lo = tcg_temp_new_i32();
2165     TCGv_i32 hi = tcg_temp_new_i32();
2166
2167     tcg_gen_mulu2_i32(lo, hi, rn, rm);
2168     tcg_gen_concat_i32_i64(rd, lo, hi);
2169
2170     tcg_temp_free_i32(lo);
2171     tcg_temp_free_i32(hi);
2172 }
2173
2174 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2175 {
2176     static NeonGenTwoOpWidenFn * const opfn[] = {
2177         gen_helper_neon_mull_s8,
2178         gen_helper_neon_mull_s16,
2179         gen_mull_s32,
2180         NULL,
2181     };
2182
2183     return do_long_3d(s, a, opfn[a->size], NULL);
2184 }
2185
2186 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2187 {
2188     static NeonGenTwoOpWidenFn * const opfn[] = {
2189         gen_helper_neon_mull_u8,
2190         gen_helper_neon_mull_u16,
2191         gen_mull_u32,
2192         NULL,
2193     };
2194
2195     return do_long_3d(s, a, opfn[a->size], NULL);
2196 }
2197
2198 #define DO_VMLAL(INSN,MULL,ACC)                                         \
2199     static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2200     {                                                                   \
2201         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2202             gen_helper_neon_##MULL##8,                                  \
2203             gen_helper_neon_##MULL##16,                                 \
2204             gen_##MULL##32,                                             \
2205             NULL,                                                       \
2206         };                                                              \
2207         static NeonGenTwo64OpFn * const accfn[] = {                     \
2208             gen_helper_neon_##ACC##l_u16,                               \
2209             gen_helper_neon_##ACC##l_u32,                               \
2210             tcg_gen_##ACC##_i64,                                        \
2211             NULL,                                                       \
2212         };                                                              \
2213         return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
2214     }
2215
2216 DO_VMLAL(VMLAL_S,mull_s,add)
2217 DO_VMLAL(VMLAL_U,mull_u,add)
2218 DO_VMLAL(VMLSL_S,mull_s,sub)
2219 DO_VMLAL(VMLSL_U,mull_u,sub)
2220
/*
 * VQDMULL for 16-bit inputs: form the long product with the mull_s16
 * helper, then add that product to itself using the addl_saturate_s32
 * helper (which takes cpu_env).
 */
static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
{
    gen_helper_neon_mull_s16(rd, rn, rm);
    gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
}
2226
/*
 * VQDMULL for 32-bit inputs: form the 64-bit product with gen_mull_s32(),
 * then add that product to itself using the addl_saturate_s64 helper
 * (which takes cpu_env).
 */
static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
{
    gen_mull_s32(rd, rn, rm);
    gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
}
2232
2233 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2234 {
2235     static NeonGenTwoOpWidenFn * const opfn[] = {
2236         NULL,
2237         gen_VQDMULL_16,
2238         gen_VQDMULL_32,
2239         NULL,
2240     };
2241
2242     return do_long_3d(s, a, opfn[a->size], NULL);
2243 }
2244
/*
 * Accumulate step for VQDMLAL with 16-bit inputs: rd = rn + rm via the
 * addl_saturate_s32 helper, which additionally needs cpu_env.
 */
static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
{
    gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
}
2249
/*
 * Accumulate step for VQDMLAL with 32-bit inputs: rd = rn + rm via the
 * addl_saturate_s64 helper, which additionally needs cpu_env.
 */
static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
{
    gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
}
2254
2255 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2256 {
2257     static NeonGenTwoOpWidenFn * const opfn[] = {
2258         NULL,
2259         gen_VQDMULL_16,
2260         gen_VQDMULL_32,
2261         NULL,
2262     };
2263     static NeonGenTwo64OpFn * const accfn[] = {
2264         NULL,
2265         gen_VQDMLAL_acc_16,
2266         gen_VQDMLAL_acc_32,
2267         NULL,
2268     };
2269
2270     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2271 }
2272
/*
 * Accumulate step for VQDMLSL with 16-bit inputs: negate @rm with the
 * negl_u32 helper, then rd = rn + rm via the addl_saturate_s32 helper.
 * Note that @rm is overwritten with its negation.
 */
static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
{
    gen_helper_neon_negl_u32(rm, rm);
    gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
}
2278
/*
 * Accumulate step for VQDMLSL with 32-bit inputs: negate the 64-bit @rm,
 * then rd = rn + rm via the addl_saturate_s64 helper.
 * Note that @rm is overwritten with its negation.
 */
static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
{
    tcg_gen_neg_i64(rm, rm);
    gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
}
2284
2285 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2286 {
2287     static NeonGenTwoOpWidenFn * const opfn[] = {
2288         NULL,
2289         gen_VQDMULL_16,
2290         gen_VQDMULL_32,
2291         NULL,
2292     };
2293     static NeonGenTwo64OpFn * const accfn[] = {
2294         NULL,
2295         gen_VQDMLSL_acc_16,
2296         gen_VQDMLSL_acc_32,
2297         NULL,
2298     };
2299
2300     return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2301 }
2302
2303 static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2304 {
2305     gen_helper_gvec_3 *fn_gvec;
2306
2307     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2308         return false;
2309     }
2310
2311     /* UNDEF accesses to D16-D31 if they don't exist. */
2312     if (!dc_isar_feature(aa32_simd_r32, s) &&
2313         ((a->vd | a->vn | a->vm) & 0x10)) {
2314         return false;
2315     }
2316
2317     if (a->vd & 1) {
2318         return false;
2319     }
2320
2321     switch (a->size) {
2322     case 0:
2323         fn_gvec = gen_helper_neon_pmull_h;
2324         break;
2325     case 2:
2326         if (!dc_isar_feature(aa32_pmull, s)) {
2327             return false;
2328         }
2329         fn_gvec = gen_helper_gvec_pmull_q;
2330         break;
2331     default:
2332         return false;
2333     }
2334
2335     if (!vfp_access_check(s)) {
2336         return true;
2337     }
2338
2339     tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd),
2340                        neon_full_reg_offset(a->vn),
2341                        neon_full_reg_offset(a->vm),
2342                        16, 16, 0, fn_gvec);
2343     return true;
2344 }
2345
2346 static void gen_neon_dup_low16(TCGv_i32 var)
2347 {
2348     TCGv_i32 tmp = tcg_temp_new_i32();
2349     tcg_gen_ext16u_i32(var, var);
2350     tcg_gen_shli_i32(tmp, var, 16);
2351     tcg_gen_or_i32(var, var, tmp);
2352     tcg_temp_free_i32(tmp);
2353 }
2354
2355 static void gen_neon_dup_high16(TCGv_i32 var)
2356 {
2357     TCGv_i32 tmp = tcg_temp_new_i32();
2358     tcg_gen_andi_i32(var, var, 0xffff0000);
2359     tcg_gen_shri_i32(tmp, var, 16);
2360     tcg_gen_or_i32(var, var, tmp);
2361     tcg_temp_free_i32(tmp);
2362 }
2363
2364 static inline TCGv_i32 neon_get_scalar(int size, int reg)
2365 {
2366     TCGv_i32 tmp = tcg_temp_new_i32();
2367     if (size == MO_16) {
2368         read_neon_element32(tmp, reg & 7, reg >> 4, MO_32);
2369         if (reg & 8) {
2370             gen_neon_dup_high16(tmp);
2371         } else {
2372             gen_neon_dup_low16(tmp);
2373         }
2374     } else {
2375         read_neon_element32(tmp, reg & 15, reg >> 4, MO_32);
2376     }
2377     return tmp;
2378 }
2379
2380 static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2381                        NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2382 {
2383     /*
2384      * Two registers and a scalar: perform an operation between
2385      * the input elements and the scalar, and then possibly
2386      * perform an accumulation operation of that result into the
2387      * destination.
2388      */
2389     TCGv_i32 scalar, tmp;
2390     int pass;
2391
2392     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2393         return false;
2394     }
2395
2396     /* UNDEF accesses to D16-D31 if they don't exist. */
2397     if (!dc_isar_feature(aa32_simd_r32, s) &&
2398         ((a->vd | a->vn | a->vm) & 0x10)) {
2399         return false;
2400     }
2401
2402     if (!opfn) {
2403         /* Bad size (including size == 3, which is a different insn group) */
2404         return false;
2405     }
2406
2407     if (a->q && ((a->vd | a->vn) & 1)) {
2408         return false;
2409     }
2410
2411     if (!vfp_access_check(s)) {
2412         return true;
2413     }
2414
2415     scalar = neon_get_scalar(a->size, a->vm);
2416     tmp = tcg_temp_new_i32();
2417
2418     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2419         read_neon_element32(tmp, a->vn, pass, MO_32);
2420         opfn(tmp, tmp, scalar);
2421         if (accfn) {
2422             TCGv_i32 rd = tcg_temp_new_i32();
2423             read_neon_element32(rd, a->vd, pass, MO_32);
2424             accfn(tmp, rd, tmp);
2425             tcg_temp_free_i32(rd);
2426         }
2427         write_neon_element32(tmp, a->vd, pass, MO_32);
2428     }
2429     tcg_temp_free_i32(tmp);
2430     tcg_temp_free_i32(scalar);
2431     return true;
2432 }
2433
2434 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2435 {
2436     static NeonGenTwoOpFn * const opfn[] = {
2437         NULL,
2438         gen_helper_neon_mul_u16,
2439         tcg_gen_mul_i32,
2440         NULL,
2441     };
2442
2443     return do_2scalar(s, a, opfn[a->size], NULL);
2444 }
2445
2446 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2447 {
2448     static NeonGenTwoOpFn * const opfn[] = {
2449         NULL,
2450         gen_helper_neon_mul_u16,
2451         tcg_gen_mul_i32,
2452         NULL,
2453     };
2454     static NeonGenTwoOpFn * const accfn[] = {
2455         NULL,
2456         gen_helper_neon_add_u16,
2457         tcg_gen_add_i32,
2458         NULL,
2459     };
2460
2461     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2462 }
2463
2464 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2465 {
2466     static NeonGenTwoOpFn * const opfn[] = {
2467         NULL,
2468         gen_helper_neon_mul_u16,
2469         tcg_gen_mul_i32,
2470         NULL,
2471     };
2472     static NeonGenTwoOpFn * const accfn[] = {
2473         NULL,
2474         gen_helper_neon_sub_u16,
2475         tcg_gen_sub_i32,
2476         NULL,
2477     };
2478
2479     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2480 }
2481
2482 static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
2483                               gen_helper_gvec_3_ptr *fn)
2484 {
2485     /* Two registers and a scalar, using gvec */
2486     int vec_size = a->q ? 16 : 8;
2487     int rd_ofs = neon_full_reg_offset(a->vd);
2488     int rn_ofs = neon_full_reg_offset(a->vn);
2489     int rm_ofs;
2490     int idx;
2491     TCGv_ptr fpstatus;
2492
2493     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2494         return false;
2495     }
2496
2497     /* UNDEF accesses to D16-D31 if they don't exist. */
2498     if (!dc_isar_feature(aa32_simd_r32, s) &&
2499         ((a->vd | a->vn | a->vm) & 0x10)) {
2500         return false;
2501     }
2502
2503     if (!fn) {
2504         /* Bad size (including size == 3, which is a different insn group) */
2505         return false;
2506     }
2507
2508     if (a->q && ((a->vd | a->vn) & 1)) {
2509         return false;
2510     }
2511
2512     if (!vfp_access_check(s)) {
2513         return true;
2514     }
2515
2516     /* a->vm is M:Vm, which encodes both register and index */
2517     idx = extract32(a->vm, a->size + 2, 2);
2518     a->vm = extract32(a->vm, 0, a->size + 2);
2519     rm_ofs = neon_full_reg_offset(a->vm);
2520
2521     fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
2522     tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
2523                        vec_size, vec_size, idx, fn);
2524     tcg_temp_free_ptr(fpstatus);
2525     return true;
2526 }
2527
2528 #define DO_VMUL_F_2sc(NAME, FUNC)                                       \
2529     static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a)   \
2530     {                                                                   \
2531         static gen_helper_gvec_3_ptr * const opfn[] = {                 \
2532             NULL,                                                       \
2533             gen_helper_##FUNC##_h,                                      \
2534             gen_helper_##FUNC##_s,                                      \
2535             NULL,                                                       \
2536         };                                                              \
2537         if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
2538             return false;                                               \
2539         }                                                               \
2540         return do_2scalar_fp_vec(s, a, opfn[a->size]);                  \
2541     }
2542
2543 DO_VMUL_F_2sc(VMUL, gvec_fmul_idx)
2544 DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx)
2545 DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx)
2546
/*
 * The gen_VQ*MULH_* names defined here are used as NeonGenTwoOpFn
 * entries by the by-scalar translators below.
 * NOTE(review): WRAP_ENV_FN is defined outside this part of the file;
 * presumably it wraps a helper that takes cpu_env so that it fits that
 * signature -- confirm against the macro definition.
 */
WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2551
2552 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2553 {
2554     static NeonGenTwoOpFn * const opfn[] = {
2555         NULL,
2556         gen_VQDMULH_16,
2557         gen_VQDMULH_32,
2558         NULL,
2559     };
2560
2561     return do_2scalar(s, a, opfn[a->size], NULL);
2562 }
2563
2564 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2565 {
2566     static NeonGenTwoOpFn * const opfn[] = {
2567         NULL,
2568         gen_VQRDMULH_16,
2569         gen_VQRDMULH_32,
2570         NULL,
2571     };
2572
2573     return do_2scalar(s, a, opfn[a->size], NULL);
2574 }
2575
2576 static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2577                             NeonGenThreeOpEnvFn *opfn)
2578 {
2579     /*
2580      * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2581      * performs a kind of fused op-then-accumulate using a helper
2582      * function that takes all of rd, rn and the scalar at once.
2583      */
2584     TCGv_i32 scalar, rn, rd;
2585     int pass;
2586
2587     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2588         return false;
2589     }
2590
2591     if (!dc_isar_feature(aa32_rdm, s)) {
2592         return false;
2593     }
2594
2595     /* UNDEF accesses to D16-D31 if they don't exist. */
2596     if (!dc_isar_feature(aa32_simd_r32, s) &&
2597         ((a->vd | a->vn | a->vm) & 0x10)) {
2598         return false;
2599     }
2600
2601     if (!opfn) {
2602         /* Bad size (including size == 3, which is a different insn group) */
2603         return false;
2604     }
2605
2606     if (a->q && ((a->vd | a->vn) & 1)) {
2607         return false;
2608     }
2609
2610     if (!vfp_access_check(s)) {
2611         return true;
2612     }
2613
2614     scalar = neon_get_scalar(a->size, a->vm);
2615     rn = tcg_temp_new_i32();
2616     rd = tcg_temp_new_i32();
2617
2618     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2619         read_neon_element32(rn, a->vn, pass, MO_32);
2620         read_neon_element32(rd, a->vd, pass, MO_32);
2621         opfn(rd, cpu_env, rn, scalar, rd);
2622         write_neon_element32(rd, a->vd, pass, MO_32);
2623     }
2624     tcg_temp_free_i32(rn);
2625     tcg_temp_free_i32(rd);
2626     tcg_temp_free_i32(scalar);
2627
2628     return true;
2629 }
2630
2631 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2632 {
2633     static NeonGenThreeOpEnvFn *opfn[] = {
2634         NULL,
2635         gen_helper_neon_qrdmlah_s16,
2636         gen_helper_neon_qrdmlah_s32,
2637         NULL,
2638     };
2639     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2640 }
2641
2642 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2643 {
2644     static NeonGenThreeOpEnvFn *opfn[] = {
2645         NULL,
2646         gen_helper_neon_qrdmlsh_s16,
2647         gen_helper_neon_qrdmlsh_s32,
2648         NULL,
2649     };
2650     return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2651 }
2652
2653 static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2654                             NeonGenTwoOpWidenFn *opfn,
2655                             NeonGenTwo64OpFn *accfn)
2656 {
2657     /*
2658      * Two registers and a scalar, long operations: perform an
2659      * operation on the input elements and the scalar which produces
2660      * a double-width result, and then possibly perform an accumulation
2661      * operation of that result into the destination.
2662      */
2663     TCGv_i32 scalar, rn;
2664     TCGv_i64 rn0_64, rn1_64;
2665
2666     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2667         return false;
2668     }
2669
2670     /* UNDEF accesses to D16-D31 if they don't exist. */
2671     if (!dc_isar_feature(aa32_simd_r32, s) &&
2672         ((a->vd | a->vn | a->vm) & 0x10)) {
2673         return false;
2674     }
2675
2676     if (!opfn) {
2677         /* Bad size (including size == 3, which is a different insn group) */
2678         return false;
2679     }
2680
2681     if (a->vd & 1) {
2682         return false;
2683     }
2684
2685     if (!vfp_access_check(s)) {
2686         return true;
2687     }
2688
2689     scalar = neon_get_scalar(a->size, a->vm);
2690
2691     /* Load all inputs before writing any outputs, in case of overlap */
2692     rn = tcg_temp_new_i32();
2693     read_neon_element32(rn, a->vn, 0, MO_32);
2694     rn0_64 = tcg_temp_new_i64();
2695     opfn(rn0_64, rn, scalar);
2696
2697     read_neon_element32(rn, a->vn, 1, MO_32);
2698     rn1_64 = tcg_temp_new_i64();
2699     opfn(rn1_64, rn, scalar);
2700     tcg_temp_free_i32(rn);
2701     tcg_temp_free_i32(scalar);
2702
2703     if (accfn) {
2704         TCGv_i64 t64 = tcg_temp_new_i64();
2705         read_neon_element64(t64, a->vd, 0, MO_64);
2706         accfn(rn0_64, t64, rn0_64);
2707         read_neon_element64(t64, a->vd, 1, MO_64);
2708         accfn(rn1_64, t64, rn1_64);
2709         tcg_temp_free_i64(t64);
2710     }
2711
2712     write_neon_element64(rn0_64, a->vd, 0, MO_64);
2713     write_neon_element64(rn1_64, a->vd, 1, MO_64);
2714     tcg_temp_free_i64(rn0_64);
2715     tcg_temp_free_i64(rn1_64);
2716     return true;
2717 }
2718
2719 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2720 {
2721     static NeonGenTwoOpWidenFn * const opfn[] = {
2722         NULL,
2723         gen_helper_neon_mull_s16,
2724         gen_mull_s32,
2725         NULL,
2726     };
2727
2728     return do_2scalar_long(s, a, opfn[a->size], NULL);
2729 }
2730
2731 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2732 {
2733     static NeonGenTwoOpWidenFn * const opfn[] = {
2734         NULL,
2735         gen_helper_neon_mull_u16,
2736         gen_mull_u32,
2737         NULL,
2738     };
2739
2740     return do_2scalar_long(s, a, opfn[a->size], NULL);
2741 }
2742
2743 #define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
2744     static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
2745     {                                                                   \
2746         static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2747             NULL,                                                       \
2748             gen_helper_neon_##MULL##16,                                 \
2749             gen_##MULL##32,                                             \
2750             NULL,                                                       \
2751         };                                                              \
2752         static NeonGenTwo64OpFn * const accfn[] = {                     \
2753             NULL,                                                       \
2754             gen_helper_neon_##ACC##l_u32,                               \
2755             tcg_gen_##ACC##_i64,                                        \
2756             NULL,                                                       \
2757         };                                                              \
2758         return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
2759     }
2760
2761 DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2762 DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2763 DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2764 DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2765
2766 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2767 {
2768     static NeonGenTwoOpWidenFn * const opfn[] = {
2769         NULL,
2770         gen_VQDMULL_16,
2771         gen_VQDMULL_32,
2772         NULL,
2773     };
2774
2775     return do_2scalar_long(s, a, opfn[a->size], NULL);
2776 }
2777
2778 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2779 {
2780     static NeonGenTwoOpWidenFn * const opfn[] = {
2781         NULL,
2782         gen_VQDMULL_16,
2783         gen_VQDMULL_32,
2784         NULL,
2785     };
2786     static NeonGenTwo64OpFn * const accfn[] = {
2787         NULL,
2788         gen_VQDMLAL_acc_16,
2789         gen_VQDMLAL_acc_32,
2790         NULL,
2791     };
2792
2793     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2794 }
2795
2796 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2797 {
2798     static NeonGenTwoOpWidenFn * const opfn[] = {
2799         NULL,
2800         gen_VQDMULL_16,
2801         gen_VQDMULL_32,
2802         NULL,
2803     };
2804     static NeonGenTwo64OpFn * const accfn[] = {
2805         NULL,
2806         gen_VQDMLSL_acc_16,
2807         gen_VQDMLSL_acc_32,
2808         NULL,
2809     };
2810
2811     return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2812 }
2813
2814 static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2815 {
2816     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2817         return false;
2818     }
2819
2820     /* UNDEF accesses to D16-D31 if they don't exist. */
2821     if (!dc_isar_feature(aa32_simd_r32, s) &&
2822         ((a->vd | a->vn | a->vm) & 0x10)) {
2823         return false;
2824     }
2825
2826     if ((a->vn | a->vm | a->vd) & a->q) {
2827         return false;
2828     }
2829
2830     if (a->imm > 7 && !a->q) {
2831         return false;
2832     }
2833
2834     if (!vfp_access_check(s)) {
2835         return true;
2836     }
2837
2838     if (!a->q) {
2839         /* Extract 64 bits from <Vm:Vn> */
2840         TCGv_i64 left, right, dest;
2841
2842         left = tcg_temp_new_i64();
2843         right = tcg_temp_new_i64();
2844         dest = tcg_temp_new_i64();
2845
2846         read_neon_element64(right, a->vn, 0, MO_64);
2847         read_neon_element64(left, a->vm, 0, MO_64);
2848         tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2849         write_neon_element64(dest, a->vd, 0, MO_64);
2850
2851         tcg_temp_free_i64(left);
2852         tcg_temp_free_i64(right);
2853         tcg_temp_free_i64(dest);
2854     } else {
2855         /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2856         TCGv_i64 left, middle, right, destleft, destright;
2857
2858         left = tcg_temp_new_i64();
2859         middle = tcg_temp_new_i64();
2860         right = tcg_temp_new_i64();
2861         destleft = tcg_temp_new_i64();
2862         destright = tcg_temp_new_i64();
2863
2864         if (a->imm < 8) {
2865             read_neon_element64(right, a->vn, 0, MO_64);
2866             read_neon_element64(middle, a->vn, 1, MO_64);
2867             tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2868             read_neon_element64(left, a->vm, 0, MO_64);
2869             tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2870         } else {
2871             read_neon_element64(right, a->vn, 1, MO_64);
2872             read_neon_element64(middle, a->vm, 0, MO_64);
2873             tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2874             read_neon_element64(left, a->vm, 1, MO_64);
2875             tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2876         }
2877
2878         write_neon_element64(destright, a->vd, 0, MO_64);
2879         write_neon_element64(destleft, a->vd, 1, MO_64);
2880
2881         tcg_temp_free_i64(destright);
2882         tcg_temp_free_i64(destleft);
2883         tcg_temp_free_i64(right);
2884         tcg_temp_free_i64(middle);
2885         tcg_temp_free_i64(left);
2886     }
2887     return true;
2888 }
2889
2890 static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2891 {
2892     TCGv_i64 val, def;
2893     TCGv_i32 desc;
2894
2895     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2896         return false;
2897     }
2898
2899     /* UNDEF accesses to D16-D31 if they don't exist. */
2900     if (!dc_isar_feature(aa32_simd_r32, s) &&
2901         ((a->vd | a->vn | a->vm) & 0x10)) {
2902         return false;
2903     }
2904
2905     if ((a->vn + a->len + 1) > 32) {
2906         /*
2907          * This is UNPREDICTABLE; we choose to UNDEF to avoid the
2908          * helper function running off the end of the register file.
2909          */
2910         return false;
2911     }
2912
2913     if (!vfp_access_check(s)) {
2914         return true;
2915     }
2916
2917     desc = tcg_const_i32((a->vn << 2) | a->len);
2918     def = tcg_temp_new_i64();
2919     if (a->op) {
2920         read_neon_element64(def, a->vd, 0, MO_64);
2921     } else {
2922         tcg_gen_movi_i64(def, 0);
2923     }
2924     val = tcg_temp_new_i64();
2925     read_neon_element64(val, a->vm, 0, MO_64);
2926
2927     gen_helper_neon_tbl(val, cpu_env, desc, val, def);
2928     write_neon_element64(val, a->vd, 0, MO_64);
2929
2930     tcg_temp_free_i64(def);
2931     tcg_temp_free_i64(val);
2932     tcg_temp_free_i32(desc);
2933     return true;
2934 }
2935
2936 static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
2937 {
2938     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2939         return false;
2940     }
2941
2942     /* UNDEF accesses to D16-D31 if they don't exist. */
2943     if (!dc_isar_feature(aa32_simd_r32, s) &&
2944         ((a->vd | a->vm) & 0x10)) {
2945         return false;
2946     }
2947
2948     if (a->vd & a->q) {
2949         return false;
2950     }
2951
2952     if (!vfp_access_check(s)) {
2953         return true;
2954     }
2955
2956     tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd),
2957                          neon_element_offset(a->vm, a->index, a->size),
2958                          a->q ? 16 : 8, a->q ? 16 : 8);
2959     return true;
2960 }
2961
2962 static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
2963 {
2964     int pass, half;
2965     TCGv_i32 tmp[2];
2966
2967     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2968         return false;
2969     }
2970
2971     /* UNDEF accesses to D16-D31 if they don't exist. */
2972     if (!dc_isar_feature(aa32_simd_r32, s) &&
2973         ((a->vd | a->vm) & 0x10)) {
2974         return false;
2975     }
2976
2977     if ((a->vd | a->vm) & a->q) {
2978         return false;
2979     }
2980
2981     if (a->size == 3) {
2982         return false;
2983     }
2984
2985     if (!vfp_access_check(s)) {
2986         return true;
2987     }
2988
2989     tmp[0] = tcg_temp_new_i32();
2990     tmp[1] = tcg_temp_new_i32();
2991
2992     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
2993         for (half = 0; half < 2; half++) {
2994             read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32);
2995             switch (a->size) {
2996             case 0:
2997                 tcg_gen_bswap32_i32(tmp[half], tmp[half]);
2998                 break;
2999             case 1:
3000                 gen_swap_half(tmp[half], tmp[half]);
3001                 break;
3002             case 2:
3003                 break;
3004             default:
3005                 g_assert_not_reached();
3006             }
3007         }
3008         write_neon_element32(tmp[1], a->vd, pass * 2, MO_32);
3009         write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32);
3010     }
3011
3012     tcg_temp_free_i32(tmp[0]);
3013     tcg_temp_free_i32(tmp[1]);
3014     return true;
3015 }
3016
3017 static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
3018                               NeonGenWidenFn *widenfn,
3019                               NeonGenTwo64OpFn *opfn,
3020                               NeonGenTwo64OpFn *accfn)
3021 {
3022     /*
3023      * Pairwise long operations: widen both halves of the pair,
3024      * combine the pairs with the opfn, and then possibly accumulate
3025      * into the destination with the accfn.
3026      */
3027     int pass;
3028
3029     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3030         return false;
3031     }
3032
3033     /* UNDEF accesses to D16-D31 if they don't exist. */
3034     if (!dc_isar_feature(aa32_simd_r32, s) &&
3035         ((a->vd | a->vm) & 0x10)) {
3036         return false;
3037     }
3038
3039     if ((a->vd | a->vm) & a->q) {
3040         return false;
3041     }
3042
3043     if (!widenfn) {
3044         return false;
3045     }
3046
3047     if (!vfp_access_check(s)) {
3048         return true;
3049     }
3050
3051     for (pass = 0; pass < a->q + 1; pass++) {
3052         TCGv_i32 tmp;
3053         TCGv_i64 rm0_64, rm1_64, rd_64;
3054
3055         rm0_64 = tcg_temp_new_i64();
3056         rm1_64 = tcg_temp_new_i64();
3057         rd_64 = tcg_temp_new_i64();
3058
3059         tmp = tcg_temp_new_i32();
3060         read_neon_element32(tmp, a->vm, pass * 2, MO_32);
3061         widenfn(rm0_64, tmp);
3062         read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32);
3063         widenfn(rm1_64, tmp);
3064         tcg_temp_free_i32(tmp);
3065
3066         opfn(rd_64, rm0_64, rm1_64);
3067         tcg_temp_free_i64(rm0_64);
3068         tcg_temp_free_i64(rm1_64);
3069
3070         if (accfn) {
3071             TCGv_i64 tmp64 = tcg_temp_new_i64();
3072             read_neon_element64(tmp64, a->vd, pass, MO_64);
3073             accfn(rd_64, tmp64, rd_64);
3074             tcg_temp_free_i64(tmp64);
3075         }
3076         write_neon_element64(rd_64, a->vd, pass, MO_64);
3077         tcg_temp_free_i64(rd_64);
3078     }
3079     return true;
3080 }
3081
3082 static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
3083 {
3084     static NeonGenWidenFn * const widenfn[] = {
3085         gen_helper_neon_widen_s8,
3086         gen_helper_neon_widen_s16,
3087         tcg_gen_ext_i32_i64,
3088         NULL,
3089     };
3090     static NeonGenTwo64OpFn * const opfn[] = {
3091         gen_helper_neon_paddl_u16,
3092         gen_helper_neon_paddl_u32,
3093         tcg_gen_add_i64,
3094         NULL,
3095     };
3096
3097     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3098 }
3099
3100 static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
3101 {
3102     static NeonGenWidenFn * const widenfn[] = {
3103         gen_helper_neon_widen_u8,
3104         gen_helper_neon_widen_u16,
3105         tcg_gen_extu_i32_i64,
3106         NULL,
3107     };
3108     static NeonGenTwo64OpFn * const opfn[] = {
3109         gen_helper_neon_paddl_u16,
3110         gen_helper_neon_paddl_u32,
3111         tcg_gen_add_i64,
3112         NULL,
3113     };
3114
3115     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3116 }
3117
3118 static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
3119 {
3120     static NeonGenWidenFn * const widenfn[] = {
3121         gen_helper_neon_widen_s8,
3122         gen_helper_neon_widen_s16,
3123         tcg_gen_ext_i32_i64,
3124         NULL,
3125     };
3126     static NeonGenTwo64OpFn * const opfn[] = {
3127         gen_helper_neon_paddl_u16,
3128         gen_helper_neon_paddl_u32,
3129         tcg_gen_add_i64,
3130         NULL,
3131     };
3132     static NeonGenTwo64OpFn * const accfn[] = {
3133         gen_helper_neon_addl_u16,
3134         gen_helper_neon_addl_u32,
3135         tcg_gen_add_i64,
3136         NULL,
3137     };
3138
3139     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3140                              accfn[a->size]);
3141 }
3142
3143 static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
3144 {
3145     static NeonGenWidenFn * const widenfn[] = {
3146         gen_helper_neon_widen_u8,
3147         gen_helper_neon_widen_u16,
3148         tcg_gen_extu_i32_i64,
3149         NULL,
3150     };
3151     static NeonGenTwo64OpFn * const opfn[] = {
3152         gen_helper_neon_paddl_u16,
3153         gen_helper_neon_paddl_u32,
3154         tcg_gen_add_i64,
3155         NULL,
3156     };
3157     static NeonGenTwo64OpFn * const accfn[] = {
3158         gen_helper_neon_addl_u16,
3159         gen_helper_neon_addl_u32,
3160         tcg_gen_add_i64,
3161         NULL,
3162     };
3163
3164     return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3165                              accfn[a->size]);
3166 }
3167
3168 typedef void ZipFn(TCGv_ptr, TCGv_ptr);
3169
3170 static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
3171                        ZipFn *fn)
3172 {
3173     TCGv_ptr pd, pm;
3174
3175     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3176         return false;
3177     }
3178
3179     /* UNDEF accesses to D16-D31 if they don't exist. */
3180     if (!dc_isar_feature(aa32_simd_r32, s) &&
3181         ((a->vd | a->vm) & 0x10)) {
3182         return false;
3183     }
3184
3185     if ((a->vd | a->vm) & a->q) {
3186         return false;
3187     }
3188
3189     if (!fn) {
3190         /* Bad size or size/q combination */
3191         return false;
3192     }
3193
3194     if (!vfp_access_check(s)) {
3195         return true;
3196     }
3197
3198     pd = vfp_reg_ptr(true, a->vd);
3199     pm = vfp_reg_ptr(true, a->vm);
3200     fn(pd, pm);
3201     tcg_temp_free_ptr(pd);
3202     tcg_temp_free_ptr(pm);
3203     return true;
3204 }
3205
3206 static bool trans_VUZP(DisasContext *s, arg_2misc *a)
3207 {
3208     static ZipFn * const fn[2][4] = {
3209         {
3210             gen_helper_neon_unzip8,
3211             gen_helper_neon_unzip16,
3212             NULL,
3213             NULL,
3214         }, {
3215             gen_helper_neon_qunzip8,
3216             gen_helper_neon_qunzip16,
3217             gen_helper_neon_qunzip32,
3218             NULL,
3219         }
3220     };
3221     return do_zip_uzp(s, a, fn[a->q][a->size]);
3222 }
3223
3224 static bool trans_VZIP(DisasContext *s, arg_2misc *a)
3225 {
3226     static ZipFn * const fn[2][4] = {
3227         {
3228             gen_helper_neon_zip8,
3229             gen_helper_neon_zip16,
3230             NULL,
3231             NULL,
3232         }, {
3233             gen_helper_neon_qzip8,
3234             gen_helper_neon_qzip16,
3235             gen_helper_neon_qzip32,
3236             NULL,
3237         }
3238     };
3239     return do_zip_uzp(s, a, fn[a->q][a->size]);
3240 }
3241
3242 static bool do_vmovn(DisasContext *s, arg_2misc *a,
3243                      NeonGenNarrowEnvFn *narrowfn)
3244 {
3245     TCGv_i64 rm;
3246     TCGv_i32 rd0, rd1;
3247
3248     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3249         return false;
3250     }
3251
3252     /* UNDEF accesses to D16-D31 if they don't exist. */
3253     if (!dc_isar_feature(aa32_simd_r32, s) &&
3254         ((a->vd | a->vm) & 0x10)) {
3255         return false;
3256     }
3257
3258     if (a->vm & 1) {
3259         return false;
3260     }
3261
3262     if (!narrowfn) {
3263         return false;
3264     }
3265
3266     if (!vfp_access_check(s)) {
3267         return true;
3268     }
3269
3270     rm = tcg_temp_new_i64();
3271     rd0 = tcg_temp_new_i32();
3272     rd1 = tcg_temp_new_i32();
3273
3274     read_neon_element64(rm, a->vm, 0, MO_64);
3275     narrowfn(rd0, cpu_env, rm);
3276     read_neon_element64(rm, a->vm, 1, MO_64);
3277     narrowfn(rd1, cpu_env, rm);
3278     write_neon_element32(rd0, a->vd, 0, MO_32);
3279     write_neon_element32(rd1, a->vd, 1, MO_32);
3280     tcg_temp_free_i32(rd0);
3281     tcg_temp_free_i32(rd1);
3282     tcg_temp_free_i64(rm);
3283     return true;
3284 }
3285
3286 #define DO_VMOVN(INSN, FUNC)                                    \
3287     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3288     {                                                           \
3289         static NeonGenNarrowEnvFn * const narrowfn[] = {        \
3290             FUNC##8,                                            \
3291             FUNC##16,                                           \
3292             FUNC##32,                                           \
3293             NULL,                                               \
3294         };                                                      \
3295         return do_vmovn(s, a, narrowfn[a->size]);               \
3296     }
3297
3298 DO_VMOVN(VMOVN, gen_neon_narrow_u)
3299 DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
3300 DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
3301 DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)
3302
3303 static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
3304 {
3305     TCGv_i32 rm0, rm1;
3306     TCGv_i64 rd;
3307     static NeonGenWidenFn * const widenfns[] = {
3308         gen_helper_neon_widen_u8,
3309         gen_helper_neon_widen_u16,
3310         tcg_gen_extu_i32_i64,
3311         NULL,
3312     };
3313     NeonGenWidenFn *widenfn = widenfns[a->size];
3314
3315     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3316         return false;
3317     }
3318
3319     /* UNDEF accesses to D16-D31 if they don't exist. */
3320     if (!dc_isar_feature(aa32_simd_r32, s) &&
3321         ((a->vd | a->vm) & 0x10)) {
3322         return false;
3323     }
3324
3325     if (a->vd & 1) {
3326         return false;
3327     }
3328
3329     if (!widenfn) {
3330         return false;
3331     }
3332
3333     if (!vfp_access_check(s)) {
3334         return true;
3335     }
3336
3337     rd = tcg_temp_new_i64();
3338     rm0 = tcg_temp_new_i32();
3339     rm1 = tcg_temp_new_i32();
3340
3341     read_neon_element32(rm0, a->vm, 0, MO_32);
3342     read_neon_element32(rm1, a->vm, 1, MO_32);
3343
3344     widenfn(rd, rm0);
3345     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3346     write_neon_element64(rd, a->vd, 0, MO_64);
3347     widenfn(rd, rm1);
3348     tcg_gen_shli_i64(rd, rd, 8 << a->size);
3349     write_neon_element64(rd, a->vd, 1, MO_64);
3350
3351     tcg_temp_free_i64(rd);
3352     tcg_temp_free_i32(rm0);
3353     tcg_temp_free_i32(rm1);
3354     return true;
3355 }
3356
3357 static bool trans_VCVT_B16_F32(DisasContext *s, arg_2misc *a)
3358 {
3359     TCGv_ptr fpst;
3360     TCGv_i64 tmp;
3361     TCGv_i32 dst0, dst1;
3362
3363     if (!dc_isar_feature(aa32_bf16, s)) {
3364         return false;
3365     }
3366
3367     /* UNDEF accesses to D16-D31 if they don't exist. */
3368     if (!dc_isar_feature(aa32_simd_r32, s) &&
3369         ((a->vd | a->vm) & 0x10)) {
3370         return false;
3371     }
3372
3373     if ((a->vm & 1) || (a->size != 1)) {
3374         return false;
3375     }
3376
3377     if (!vfp_access_check(s)) {
3378         return true;
3379     }
3380
3381     fpst = fpstatus_ptr(FPST_STD);
3382     tmp = tcg_temp_new_i64();
3383     dst0 = tcg_temp_new_i32();
3384     dst1 = tcg_temp_new_i32();
3385
3386     read_neon_element64(tmp, a->vm, 0, MO_64);
3387     gen_helper_bfcvt_pair(dst0, tmp, fpst);
3388
3389     read_neon_element64(tmp, a->vm, 1, MO_64);
3390     gen_helper_bfcvt_pair(dst1, tmp, fpst);
3391
3392     write_neon_element32(dst0, a->vd, 0, MO_32);
3393     write_neon_element32(dst1, a->vd, 1, MO_32);
3394
3395     tcg_temp_free_i64(tmp);
3396     tcg_temp_free_i32(dst0);
3397     tcg_temp_free_i32(dst1);
3398     tcg_temp_free_ptr(fpst);
3399     return true;
3400 }
3401
3402 static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
3403 {
3404     TCGv_ptr fpst;
3405     TCGv_i32 ahp, tmp, tmp2, tmp3;
3406
3407     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3408         !dc_isar_feature(aa32_fp16_spconv, s)) {
3409         return false;
3410     }
3411
3412     /* UNDEF accesses to D16-D31 if they don't exist. */
3413     if (!dc_isar_feature(aa32_simd_r32, s) &&
3414         ((a->vd | a->vm) & 0x10)) {
3415         return false;
3416     }
3417
3418     if ((a->vm & 1) || (a->size != 1)) {
3419         return false;
3420     }
3421
3422     if (!vfp_access_check(s)) {
3423         return true;
3424     }
3425
3426     fpst = fpstatus_ptr(FPST_STD);
3427     ahp = get_ahp_flag();
3428     tmp = tcg_temp_new_i32();
3429     read_neon_element32(tmp, a->vm, 0, MO_32);
3430     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3431     tmp2 = tcg_temp_new_i32();
3432     read_neon_element32(tmp2, a->vm, 1, MO_32);
3433     gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
3434     tcg_gen_shli_i32(tmp2, tmp2, 16);
3435     tcg_gen_or_i32(tmp2, tmp2, tmp);
3436     read_neon_element32(tmp, a->vm, 2, MO_32);
3437     gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3438     tmp3 = tcg_temp_new_i32();
3439     read_neon_element32(tmp3, a->vm, 3, MO_32);
3440     write_neon_element32(tmp2, a->vd, 0, MO_32);
3441     tcg_temp_free_i32(tmp2);
3442     gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
3443     tcg_gen_shli_i32(tmp3, tmp3, 16);
3444     tcg_gen_or_i32(tmp3, tmp3, tmp);
3445     write_neon_element32(tmp3, a->vd, 1, MO_32);
3446     tcg_temp_free_i32(tmp3);
3447     tcg_temp_free_i32(tmp);
3448     tcg_temp_free_i32(ahp);
3449     tcg_temp_free_ptr(fpst);
3450
3451     return true;
3452 }
3453
3454 static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
3455 {
3456     TCGv_ptr fpst;
3457     TCGv_i32 ahp, tmp, tmp2, tmp3;
3458
3459     if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3460         !dc_isar_feature(aa32_fp16_spconv, s)) {
3461         return false;
3462     }
3463
3464     /* UNDEF accesses to D16-D31 if they don't exist. */
3465     if (!dc_isar_feature(aa32_simd_r32, s) &&
3466         ((a->vd | a->vm) & 0x10)) {
3467         return false;
3468     }
3469
3470     if ((a->vd & 1) || (a->size != 1)) {
3471         return false;
3472     }
3473
3474     if (!vfp_access_check(s)) {
3475         return true;
3476     }
3477
3478     fpst = fpstatus_ptr(FPST_STD);
3479     ahp = get_ahp_flag();
3480     tmp3 = tcg_temp_new_i32();
3481     tmp2 = tcg_temp_new_i32();
3482     tmp = tcg_temp_new_i32();
3483     read_neon_element32(tmp, a->vm, 0, MO_32);
3484     read_neon_element32(tmp2, a->vm, 1, MO_32);
3485     tcg_gen_ext16u_i32(tmp3, tmp);
3486     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3487     write_neon_element32(tmp3, a->vd, 0, MO_32);
3488     tcg_gen_shri_i32(tmp, tmp, 16);
3489     gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
3490     write_neon_element32(tmp, a->vd, 1, MO_32);
3491     tcg_temp_free_i32(tmp);
3492     tcg_gen_ext16u_i32(tmp3, tmp2);
3493     gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3494     write_neon_element32(tmp3, a->vd, 2, MO_32);
3495     tcg_temp_free_i32(tmp3);
3496     tcg_gen_shri_i32(tmp2, tmp2, 16);
3497     gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
3498     write_neon_element32(tmp2, a->vd, 3, MO_32);
3499     tcg_temp_free_i32(tmp2);
3500     tcg_temp_free_i32(ahp);
3501     tcg_temp_free_ptr(fpst);
3502
3503     return true;
3504 }
3505
3506 static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
3507 {
3508     int vec_size = a->q ? 16 : 8;
3509     int rd_ofs = neon_full_reg_offset(a->vd);
3510     int rm_ofs = neon_full_reg_offset(a->vm);
3511
3512     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3513         return false;
3514     }
3515
3516     /* UNDEF accesses to D16-D31 if they don't exist. */
3517     if (!dc_isar_feature(aa32_simd_r32, s) &&
3518         ((a->vd | a->vm) & 0x10)) {
3519         return false;
3520     }
3521
3522     if (a->size == 3) {
3523         return false;
3524     }
3525
3526     if ((a->vd | a->vm) & a->q) {
3527         return false;
3528     }
3529
3530     if (!vfp_access_check(s)) {
3531         return true;
3532     }
3533
3534     fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);
3535
3536     return true;
3537 }
3538
3539 #define DO_2MISC_VEC(INSN, FN)                                  \
3540     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3541     {                                                           \
3542         return do_2misc_vec(s, a, FN);                          \
3543     }
3544
3545 DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
3546 DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
3547 DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
3548 DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
3549 DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
3550 DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
3551 DO_2MISC_VEC(VCLT0, gen_gvec_clt0)
3552
3553 static bool trans_VMVN(DisasContext *s, arg_2misc *a)
3554 {
3555     if (a->size != 0) {
3556         return false;
3557     }
3558     return do_2misc_vec(s, a, tcg_gen_gvec_not);
3559 }
3560
3561 #define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3562     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3563                          uint32_t rm_ofs, uint32_t oprsz,               \
3564                          uint32_t maxsz)                                \
3565     {                                                                   \
3566         tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz,        \
3567                            DATA, FUNC);                                 \
3568     }
3569
3570 #define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3571     static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3572                          uint32_t rm_ofs, uint32_t oprsz,               \
3573                          uint32_t maxsz)                                \
3574     {                                                                   \
3575         tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC);   \
3576     }
3577
3578 WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
3579 WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1)
3580 WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
3581 WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1)
3582 WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
3583 WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
3584 WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)
3585
3586 #define DO_2M_CRYPTO(INSN, FEATURE, SIZE)                       \
3587     static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3588     {                                                           \
3589         if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) {  \
3590             return false;                                       \
3591         }                                                       \
3592         return do_2misc_vec(s, a, gen_##INSN);                  \
3593     }
3594
3595 DO_2M_CRYPTO(AESE, aa32_aes, 0)
3596 DO_2M_CRYPTO(AESD, aa32_aes, 0)
3597 DO_2M_CRYPTO(AESMC, aa32_aes, 0)
3598 DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
3599 DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
3600 DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
3601 DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)
3602
3603 static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
3604 {
3605     TCGv_i32 tmp;
3606     int pass;
3607
3608     /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
3609     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3610         return false;
3611     }
3612
3613     /* UNDEF accesses to D16-D31 if they don't exist. */
3614     if (!dc_isar_feature(aa32_simd_r32, s) &&
3615         ((a->vd | a->vm) & 0x10)) {
3616         return false;
3617     }
3618
3619     if (!fn) {
3620         return false;
3621     }
3622
3623     if ((a->vd | a->vm) & a->q) {
3624         return false;
3625     }
3626
3627     if (!vfp_access_check(s)) {
3628         return true;
3629     }
3630
3631     tmp = tcg_temp_new_i32();
3632     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3633         read_neon_element32(tmp, a->vm, pass, MO_32);
3634         fn(tmp, tmp);
3635         write_neon_element32(tmp, a->vd, pass, MO_32);
3636     }
3637     tcg_temp_free_i32(tmp);
3638
3639     return true;
3640 }
3641
3642 static bool trans_VREV32(DisasContext *s, arg_2misc *a)
3643 {
3644     static NeonGenOneOpFn * const fn[] = {
3645         tcg_gen_bswap32_i32,
3646         gen_swap_half,
3647         NULL,
3648         NULL,
3649     };
3650     return do_2misc(s, a, fn[a->size]);
3651 }
3652
3653 static bool trans_VREV16(DisasContext *s, arg_2misc *a)
3654 {
3655     if (a->size != 0) {
3656         return false;
3657     }
3658     return do_2misc(s, a, gen_rev16);
3659 }
3660
3661 static bool trans_VCLS(DisasContext *s, arg_2misc *a)
3662 {
3663     static NeonGenOneOpFn * const fn[] = {
3664         gen_helper_neon_cls_s8,
3665         gen_helper_neon_cls_s16,
3666         gen_helper_neon_cls_s32,
3667         NULL,
3668     };
3669     return do_2misc(s, a, fn[a->size]);
3670 }
3671
3672 static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm)
3673 {
3674     tcg_gen_clzi_i32(rd, rm, 32);
3675 }
3676
3677 static bool trans_VCLZ(DisasContext *s, arg_2misc *a)
3678 {
3679     static NeonGenOneOpFn * const fn[] = {
3680         gen_helper_neon_clz_u8,
3681         gen_helper_neon_clz_u16,
3682         do_VCLZ_32,
3683         NULL,
3684     };
3685     return do_2misc(s, a, fn[a->size]);
3686 }
3687
3688 static bool trans_VCNT(DisasContext *s, arg_2misc *a)
3689 {
3690     if (a->size != 0) {
3691         return false;
3692     }
3693     return do_2misc(s, a, gen_helper_neon_cnt_u8);
3694 }
3695
3696 static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3697                        uint32_t oprsz, uint32_t maxsz)
3698 {
3699     tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs,
3700                       vece == MO_16 ? 0x7fff : 0x7fffffff,
3701                       oprsz, maxsz);
3702 }
3703
3704 static bool trans_VABS_F(DisasContext *s, arg_2misc *a)
3705 {
3706     if (a->size == MO_16) {
3707         if (!dc_isar_feature(aa32_fp16_arith, s)) {
3708             return false;
3709         }
3710     } else if (a->size != MO_32) {
3711         return false;
3712     }
3713     return do_2misc_vec(s, a, gen_VABS_F);
3714 }
3715
3716 static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3717                        uint32_t oprsz, uint32_t maxsz)
3718 {
3719     tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs,
3720                       vece == MO_16 ? 0x8000 : 0x80000000,
3721                       oprsz, maxsz);
3722 }
3723
3724 static bool trans_VNEG_F(DisasContext *s, arg_2misc *a)
3725 {
3726     if (a->size == MO_16) {
3727         if (!dc_isar_feature(aa32_fp16_arith, s)) {
3728             return false;
3729         }
3730     } else if (a->size != MO_32) {
3731         return false;
3732     }
3733     return do_2misc_vec(s, a, gen_VNEG_F);
3734 }
3735
3736 static bool trans_VRECPE(DisasContext *s, arg_2misc *a)
3737 {
3738     if (a->size != 2) {
3739         return false;
3740     }
3741     return do_2misc(s, a, gen_helper_recpe_u32);
3742 }
3743
3744 static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a)
3745 {
3746     if (a->size != 2) {
3747         return false;
3748     }
3749     return do_2misc(s, a, gen_helper_rsqrte_u32);
3750 }
3751
3752 #define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \
3753     static void WRAPNAME(TCGv_i32 d, TCGv_i32 m)        \
3754     {                                                   \
3755         FUNC(d, cpu_env, m);                            \
3756     }
3757
3758 WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8)
3759 WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16)
3760 WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32)
3761 WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8)
3762 WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16)
3763 WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32)
3764
3765 static bool trans_VQABS(DisasContext *s, arg_2misc *a)
3766 {
3767     static NeonGenOneOpFn * const fn[] = {
3768         gen_VQABS_s8,
3769         gen_VQABS_s16,
3770         gen_VQABS_s32,
3771         NULL,
3772     };
3773     return do_2misc(s, a, fn[a->size]);
3774 }
3775
3776 static bool trans_VQNEG(DisasContext *s, arg_2misc *a)
3777 {
3778     static NeonGenOneOpFn * const fn[] = {
3779         gen_VQNEG_s8,
3780         gen_VQNEG_s16,
3781         gen_VQNEG_s32,
3782         NULL,
3783     };
3784     return do_2misc(s, a, fn[a->size]);
3785 }
3786
3787 #define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC)                             \
3788     static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
3789                            uint32_t rm_ofs,                             \
3790                            uint32_t oprsz, uint32_t maxsz)              \
3791     {                                                                   \
3792         static gen_helper_gvec_2_ptr * const fns[4] = {                 \
3793             NULL, HFUNC, SFUNC, NULL,                                   \
3794         };                                                              \
3795         TCGv_ptr fpst;                                                  \
3796         fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD);   \
3797         tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0,       \
3798                            fns[vece]);                                  \
3799         tcg_temp_free_ptr(fpst);                                        \
3800     }                                                                   \
3801     static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
3802     {                                                                   \
3803         if (a->size == MO_16) {                                         \
3804             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
3805                 return false;                                           \
3806             }                                                           \
3807         } else if (a->size != MO_32) {                                  \
3808             return false;                                               \
3809         }                                                               \
3810         return do_2misc_vec(s, a, gen_##INSN);                          \
3811     }
3812
3813 DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s)
3814 DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s)
3815 DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s)
3816 DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s)
3817 DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s)
3818 DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s)
3819 DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s)
3820 DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos)
3821 DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos)
3822 DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs)
3823 DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs)
3824
3825 DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s)
3826
3827 static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
3828 {
3829     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
3830         return false;
3831     }
3832     return trans_VRINTX_impl(s, a);
3833 }
3834
3835 #define DO_VEC_RMODE(INSN, RMODE, OP)                                   \
3836     static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
3837                            uint32_t rm_ofs,                             \
3838                            uint32_t oprsz, uint32_t maxsz)              \
3839     {                                                                   \
3840         static gen_helper_gvec_2_ptr * const fns[4] = {                 \
3841             NULL,                                                       \
3842             gen_helper_gvec_##OP##h,                                    \
3843             gen_helper_gvec_##OP##s,                                    \
3844             NULL,                                                       \
3845         };                                                              \
3846         TCGv_ptr fpst;                                                  \
3847         fpst = fpstatus_ptr(vece == 1 ? FPST_STD_F16 : FPST_STD);       \
3848         tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz,          \
3849                            arm_rmode_to_sf(RMODE), fns[vece]);          \
3850         tcg_temp_free_ptr(fpst);                                        \
3851     }                                                                   \
3852     static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
3853     {                                                                   \
3854         if (!arm_dc_feature(s, ARM_FEATURE_V8)) {                       \
3855             return false;                                               \
3856         }                                                               \
3857         if (a->size == MO_16) {                                         \
3858             if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
3859                 return false;                                           \
3860             }                                                           \
3861         } else if (a->size != MO_32) {                                  \
3862             return false;                                               \
3863         }                                                               \
3864         return do_2misc_vec(s, a, gen_##INSN);                          \
3865     }
3866
3867 DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u)
3868 DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s)
3869 DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u)
3870 DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s)
3871 DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u)
3872 DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s)
3873 DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u)
3874 DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s)
3875
3876 DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_)
3877 DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_)
3878 DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_)
3879 DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_)
3880 DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_)
3881
3882 static bool trans_VSWP(DisasContext *s, arg_2misc *a)
3883 {
3884     TCGv_i64 rm, rd;
3885     int pass;
3886
3887     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3888         return false;
3889     }
3890
3891     /* UNDEF accesses to D16-D31 if they don't exist. */
3892     if (!dc_isar_feature(aa32_simd_r32, s) &&
3893         ((a->vd | a->vm) & 0x10)) {
3894         return false;
3895     }
3896
3897     if (a->size != 0) {
3898         return false;
3899     }
3900
3901     if ((a->vd | a->vm) & a->q) {
3902         return false;
3903     }
3904
3905     if (!vfp_access_check(s)) {
3906         return true;
3907     }
3908
3909     rm = tcg_temp_new_i64();
3910     rd = tcg_temp_new_i64();
3911     for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3912         read_neon_element64(rm, a->vm, pass, MO_64);
3913         read_neon_element64(rd, a->vd, pass, MO_64);
3914         write_neon_element64(rm, a->vd, pass, MO_64);
3915         write_neon_element64(rd, a->vm, pass, MO_64);
3916     }
3917     tcg_temp_free_i64(rm);
3918     tcg_temp_free_i64(rd);
3919
3920     return true;
3921 }
3922 static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1)
3923 {
3924     TCGv_i32 rd, tmp;
3925
3926     rd = tcg_temp_new_i32();
3927     tmp = tcg_temp_new_i32();
3928
3929     tcg_gen_shli_i32(rd, t0, 8);
3930     tcg_gen_andi_i32(rd, rd, 0xff00ff00);
3931     tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
3932     tcg_gen_or_i32(rd, rd, tmp);
3933
3934     tcg_gen_shri_i32(t1, t1, 8);
3935     tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
3936     tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
3937     tcg_gen_or_i32(t1, t1, tmp);
3938     tcg_gen_mov_i32(t0, rd);
3939
3940     tcg_temp_free_i32(tmp);
3941     tcg_temp_free_i32(rd);
3942 }
3943
3944 static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1)
3945 {
3946     TCGv_i32 rd, tmp;
3947
3948     rd = tcg_temp_new_i32();
3949     tmp = tcg_temp_new_i32();
3950
3951     tcg_gen_shli_i32(rd, t0, 16);
3952     tcg_gen_andi_i32(tmp, t1, 0xffff);
3953     tcg_gen_or_i32(rd, rd, tmp);
3954     tcg_gen_shri_i32(t1, t1, 16);
3955     tcg_gen_andi_i32(tmp, t0, 0xffff0000);
3956     tcg_gen_or_i32(t1, t1, tmp);
3957     tcg_gen_mov_i32(t0, rd);
3958
3959     tcg_temp_free_i32(tmp);
3960     tcg_temp_free_i32(rd);
3961 }
3962
3963 static bool trans_VTRN(DisasContext *s, arg_2misc *a)
3964 {
3965     TCGv_i32 tmp, tmp2;
3966     int pass;
3967
3968     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3969         return false;
3970     }
3971
3972     /* UNDEF accesses to D16-D31 if they don't exist. */
3973     if (!dc_isar_feature(aa32_simd_r32, s) &&
3974         ((a->vd | a->vm) & 0x10)) {
3975         return false;
3976     }
3977
3978     if ((a->vd | a->vm) & a->q) {
3979         return false;
3980     }
3981
3982     if (a->size == 3) {
3983         return false;
3984     }
3985
3986     if (!vfp_access_check(s)) {
3987         return true;
3988     }
3989
3990     tmp = tcg_temp_new_i32();
3991     tmp2 = tcg_temp_new_i32();
3992     if (a->size == MO_32) {
3993         for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) {
3994             read_neon_element32(tmp, a->vm, pass, MO_32);
3995             read_neon_element32(tmp2, a->vd, pass + 1, MO_32);
3996             write_neon_element32(tmp2, a->vm, pass, MO_32);
3997             write_neon_element32(tmp, a->vd, pass + 1, MO_32);
3998         }
3999     } else {
4000         for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
4001             read_neon_element32(tmp, a->vm, pass, MO_32);
4002             read_neon_element32(tmp2, a->vd, pass, MO_32);
4003             if (a->size == MO_8) {
4004                 gen_neon_trn_u8(tmp, tmp2);
4005             } else {
4006                 gen_neon_trn_u16(tmp, tmp2);
4007             }
4008             write_neon_element32(tmp2, a->vm, pass, MO_32);
4009             write_neon_element32(tmp, a->vd, pass, MO_32);
4010         }
4011     }
4012     tcg_temp_free_i32(tmp);
4013     tcg_temp_free_i32(tmp2);
4014     return true;
4015 }
4016
4017 static bool trans_VSMMLA(DisasContext *s, arg_VSMMLA *a)
4018 {
4019     if (!dc_isar_feature(aa32_i8mm, s)) {
4020         return false;
4021     }
4022     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4023                         gen_helper_gvec_smmla_b);
4024 }
4025
4026 static bool trans_VUMMLA(DisasContext *s, arg_VUMMLA *a)
4027 {
4028     if (!dc_isar_feature(aa32_i8mm, s)) {
4029         return false;
4030     }
4031     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4032                         gen_helper_gvec_ummla_b);
4033 }
4034
4035 static bool trans_VUSMMLA(DisasContext *s, arg_VUSMMLA *a)
4036 {
4037     if (!dc_isar_feature(aa32_i8mm, s)) {
4038         return false;
4039     }
4040     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4041                         gen_helper_gvec_usmmla_b);
4042 }
4043
4044 static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a)
4045 {
4046     if (!dc_isar_feature(aa32_bf16, s)) {
4047         return false;
4048     }
4049     return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4050                         gen_helper_gvec_bfmmla);
4051 }
4052
4053 static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a)
4054 {
4055     if (!dc_isar_feature(aa32_bf16, s)) {
4056         return false;
4057     }
4058     return do_neon_ddda_fpst(s, 7, a->vd, a->vn, a->vm, a->q, FPST_STD,
4059                              gen_helper_gvec_bfmlal);
4060 }
4061
4062 static bool trans_VFMA_b16_scal(DisasContext *s, arg_VFMA_b16_scal *a)
4063 {
4064     if (!dc_isar_feature(aa32_bf16, s)) {
4065         return false;
4066     }
4067     return do_neon_ddda_fpst(s, 6, a->vd, a->vn, a->vm,
4068                              (a->index << 1) | a->q, FPST_STD,
4069                              gen_helper_gvec_bfmlal_idx);
4070 }