2 * Initial TCG Implementation for aarch64
4 * Copyright (c) 2013 Huawei Technologies Duesseldorf GmbH
5 * Written by Claudio Fontana
7 * This work is licensed under the terms of the GNU GPL, version 2 or
8 * (at your option) any later version.
10 * See the COPYING file in the top-level directory for details.
13 #include "../tcg-ldst.c.inc"
14 #include "../tcg-pool.c.inc"
15 #include "qemu/bitops.h"
17 /* We're going to re-use TCGType in setting of the SF bit, which controls
18 the size of the operation performed. If we know the values match, it
19 makes things much cleaner. */
20 QEMU_BUILD_BUG_ON(TCG_TYPE_I32 != 0 || TCG_TYPE_I64 != 1);
22 #ifdef CONFIG_DEBUG_TCG
23 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
24 "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
25 "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
26 "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
27 "x24", "x25", "x26", "x27", "x28", "fp", "x30", "sp",
29 "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
30 "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
31 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
32 "v24", "v25", "v26", "v27", "v28", "fp", "v30", "v31",
34 #endif /* CONFIG_DEBUG_TCG */
36 static const int tcg_target_reg_alloc_order[] = {
37 TCG_REG_X20, TCG_REG_X21, TCG_REG_X22, TCG_REG_X23,
38 TCG_REG_X24, TCG_REG_X25, TCG_REG_X26, TCG_REG_X27,
39 TCG_REG_X28, /* we will reserve this for guest_base if configured */
41 TCG_REG_X8, TCG_REG_X9, TCG_REG_X10, TCG_REG_X11,
42 TCG_REG_X12, TCG_REG_X13, TCG_REG_X14, TCG_REG_X15,
44 TCG_REG_X0, TCG_REG_X1, TCG_REG_X2, TCG_REG_X3,
45 TCG_REG_X4, TCG_REG_X5, TCG_REG_X6, TCG_REG_X7,
47 /* X16 reserved as temporary */
48 /* X17 reserved as temporary */
49 /* X18 reserved by system */
50 /* X19 reserved for AREG0 */
51 /* X29 reserved as fp */
52 /* X30 reserved as temporary */
54 TCG_REG_V0, TCG_REG_V1, TCG_REG_V2, TCG_REG_V3,
55 TCG_REG_V4, TCG_REG_V5, TCG_REG_V6, TCG_REG_V7,
56 /* V8 - V15 are call-saved, and skipped. */
57 TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19,
58 TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23,
59 TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27,
60 TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31,
63 static const int tcg_target_call_iarg_regs[8] = {
64 TCG_REG_X0, TCG_REG_X1, TCG_REG_X2, TCG_REG_X3,
65 TCG_REG_X4, TCG_REG_X5, TCG_REG_X6, TCG_REG_X7
68 static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
70 tcg_debug_assert(kind == TCG_CALL_RET_NORMAL);
71 tcg_debug_assert(slot >= 0 && slot <= 1);
72 return TCG_REG_X0 + slot;
75 #define TCG_REG_TMP0 TCG_REG_X16
76 #define TCG_REG_TMP1 TCG_REG_X17
77 #define TCG_REG_TMP2 TCG_REG_X30
78 #define TCG_VEC_TMP0 TCG_REG_V31
80 #define TCG_REG_GUEST_BASE TCG_REG_X28
82 static bool reloc_pc26(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
84 const tcg_insn_unit *src_rx = tcg_splitwx_to_rx(src_rw);
85 ptrdiff_t offset = target - src_rx;
87 if (offset == sextract64(offset, 0, 26)) {
88 /* read instruction, mask away previous PC_REL26 parameter contents,
89 set the proper offset, then write back the instruction. */
90 *src_rw = deposit32(*src_rw, 0, 26, offset);
96 static bool reloc_pc19(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
98 const tcg_insn_unit *src_rx = tcg_splitwx_to_rx(src_rw);
99 ptrdiff_t offset = target - src_rx;
101 if (offset == sextract64(offset, 0, 19)) {
102 *src_rw = deposit32(*src_rw, 5, 19, offset);
108 static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
109 intptr_t value, intptr_t addend)
111 tcg_debug_assert(addend == 0);
113 case R_AARCH64_JUMP26:
114 case R_AARCH64_CALL26:
115 return reloc_pc26(code_ptr, (const tcg_insn_unit *)value);
116 case R_AARCH64_CONDBR19:
117 return reloc_pc19(code_ptr, (const tcg_insn_unit *)value);
119 g_assert_not_reached();
123 #define TCG_CT_CONST_AIMM 0x100
124 #define TCG_CT_CONST_LIMM 0x200
125 #define TCG_CT_CONST_ZERO 0x400
126 #define TCG_CT_CONST_MONE 0x800
127 #define TCG_CT_CONST_ORRI 0x1000
128 #define TCG_CT_CONST_ANDI 0x2000
130 #define ALL_GENERAL_REGS 0xffffffffu
131 #define ALL_VECTOR_REGS 0xffffffff00000000ull
/* Match a constant valid for addition (12-bit, optionally shifted). */
static inline bool is_aimm(uint64_t val)
{
    if ((val & ~0xfffull) == 0) {
        return true;                    /* plain 12-bit immediate */
    }
    return (val & ~0xfff000ull) == 0;   /* 12-bit immediate, LSL #12 */
}
/* Match a constant valid for logical operations.
   Simplified view of the AArch64 logical immediates: ignore the
   replication across sub-fields and accept only a single contiguous
   run of ones (possibly rotated), or the inverse thereof. */
static inline bool is_limm(uint64_t val)
{
    /* Normalize so the most significant bit is clear. */
    if ((int64_t)val < 0) {
        val = ~val;
    }
    if (val == 0) {
        return false;
    }
    /* Adding the lowest set bit collapses a contiguous run of ones;
       what remains must be zero or a single bit. */
    val += val & -val;
    return (val & (val - 1)) == 0;
}
/* Return true if v16 is a valid 16-bit shifted immediate. */
static bool is_shimm16(uint16_t v16, int *cmode, int *imm8)
{
    if ((v16 & 0xff00) == 0) {
        /* All significant bits in the low byte: MOVI 16-bit, LSL #0. */
        *cmode = 0x8;
        *imm8 = v16 & 0xff;
        return true;
    }
    if ((v16 & 0x00ff) == 0) {
        /* All significant bits in the high byte: MOVI 16-bit, LSL #8. */
        *cmode = 0xa;
        *imm8 = v16 >> 8;
        return true;
    }
    return false;
}
/* Return true if v32 is a valid 32-bit shifted immediate. */
static bool is_shimm32(uint32_t v32, int *cmode, int *imm8)
{
    int sh;

    /* Try each byte position in turn; cmode is 0x0/0x2/0x4/0x6
       for shifts of 0/8/16/24 respectively. */
    for (sh = 0; sh < 32; sh += 8) {
        if ((v32 & ~(0xffu << sh)) == 0) {
            *cmode = sh >> 2;
            *imm8 = (v32 >> sh) & 0xff;
            return true;
        }
    }
    return false;
}
/* Return true if v32 is a valid 32-bit shifting ones immediate. */
static bool is_soimm32(uint32_t v32, int *cmode, int *imm8)
{
    /* imm8 in byte 1, low byte all-ones: MOVI (shifting ones), MSL #8. */
    if ((v32 & ~0xff00u) == 0xff) {
        *cmode = 0xc;
        *imm8 = (v32 >> 8) & 0xff;
        return true;
    }
    /* imm8 in byte 2, low two bytes all-ones: MSL #16. */
    if ((v32 & ~0xff0000u) == 0xffff) {
        *cmode = 0xd;
        *imm8 = (v32 >> 16) & 0xff;
        return true;
    }
    return false;
}
/* Return true if v32 is a valid float32 immediate (FMOV imm8 form). */
static bool is_fimm32(uint32_t v32, int *cmode, int *imm8)
{
    uint32_t frac = v32 & 0x7ffff;      /* low 19 fraction bits */
    uint32_t exp6 = (v32 >> 25) & 0x3f; /* bits [30:25] of the exponent */

    /* Fraction must be zero and the exponent must be of the form
       100000 or 011111 to be expressible as abcdefgh. */
    if (frac == 0 && (exp6 == 0x20 || exp6 == 0x1f)) {
        *cmode = 0xf;
        *imm8 = ((v32 >> 31) & 1) << 7      /* a: sign */
              | ((v32 >> 25) & 1) << 6      /* b: exponent msb (inverted) */
              | ((v32 >> 19) & 0x3f);       /* cdefgh */
        return true;
    }
    return false;
}
/* Return true if v64 is a valid float64 immediate (FMOV imm8 form). */
static bool is_fimm64(uint64_t v64, int *cmode, int *imm8)
{
    uint64_t frac = v64 & 0xffffffffffffull;  /* low 48 fraction bits */
    uint32_t exp9 = (v64 >> 54) & 0x1ff;      /* bits [62:54] of the exponent */

    if (frac == 0 && (exp9 == 0x100 || exp9 == 0x0ff)) {
        *cmode = 0xf;
        *imm8 = ((v64 >> 63) & 1) << 7        /* a: sign */
              | ((v64 >> 54) & 1) << 6        /* b: exponent msb (inverted) */
              | ((v64 >> 48) & 0x3f);         /* cdefgh */
        return true;
    }
    return false;
}
/*
 * Return non-zero if v32 can be formed by MOVI+ORR.
 * Place the parameters for MOVI in (cmode, imm8).
 * Return the cmode for ORR; the imm8 can be had via extraction from v32.
 */
static int is_shimm32_pair(uint32_t v32, int *cmode, int *imm8)
{
    int i;

    /* Knock out one byte at a time (positions 3..1); the remainder
       must then be loadable with a single MOVI-class insn. */
    for (i = 6; i > 0; i -= 2) {
        uint32_t rest = v32 & ~(0xffu << (i * 4));
        if (is_shimm32(rest, cmode, imm8) ||
            is_soimm32(rest, cmode, imm8)) {
            break;
        }
    }
    return i;
}
/* Return true if V is a valid 16-bit or 32-bit shifted immediate. */
static bool is_shimm1632(uint32_t v32, int *cmode, int *imm8)
{
    /* When both 16-bit halves match, treat it as a 16-bit pattern. */
    if ((v32 >> 16) == (v32 & 0xffff)) {
        return is_shimm16(v32, cmode, imm8);
    }
    return is_shimm32(v32, cmode, imm8);
}
273 static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
275 if (ct & TCG_CT_CONST) {
278 if (type == TCG_TYPE_I32) {
281 if ((ct & TCG_CT_CONST_AIMM) && (is_aimm(val) || is_aimm(-val))) {
284 if ((ct & TCG_CT_CONST_LIMM) && is_limm(val)) {
287 if ((ct & TCG_CT_CONST_ZERO) && val == 0) {
290 if ((ct & TCG_CT_CONST_MONE) && val == -1) {
294 switch (ct & (TCG_CT_CONST_ORRI | TCG_CT_CONST_ANDI)) {
297 case TCG_CT_CONST_ANDI:
300 case TCG_CT_CONST_ORRI:
301 if (val == deposit64(val, 32, 32, val)) {
303 return is_shimm1632(val, &cmode, &imm8);
307 /* Both bits should not be set for the same insn. */
308 g_assert_not_reached();
314 enum aarch64_cond_code {
317 COND_CS = 0x2, /* Unsigned greater or equal */
318 COND_HS = COND_CS, /* ALIAS greater or equal */
319 COND_CC = 0x3, /* Unsigned less than */
320 COND_LO = COND_CC, /* ALIAS Lower */
321 COND_MI = 0x4, /* Negative */
322 COND_PL = 0x5, /* Zero or greater */
323 COND_VS = 0x6, /* Overflow */
324 COND_VC = 0x7, /* No overflow */
325 COND_HI = 0x8, /* Unsigned greater than */
326 COND_LS = 0x9, /* Unsigned less or equal */
332 COND_NV = 0xf, /* behaves like COND_AL here */
335 static const enum aarch64_cond_code tcg_cond_to_aarch64[] = {
336 [TCG_COND_EQ] = COND_EQ,
337 [TCG_COND_NE] = COND_NE,
338 [TCG_COND_LT] = COND_LT,
339 [TCG_COND_GE] = COND_GE,
340 [TCG_COND_LE] = COND_LE,
341 [TCG_COND_GT] = COND_GT,
343 [TCG_COND_LTU] = COND_LO,
344 [TCG_COND_GTU] = COND_HI,
345 [TCG_COND_GEU] = COND_HS,
346 [TCG_COND_LEU] = COND_LS,
350 LDST_ST = 0, /* store */
351 LDST_LD = 1, /* load */
352 LDST_LD_S_X = 2, /* load and sign-extend into Xt */
353 LDST_LD_S_W = 3, /* load and sign-extend into Wt */
356 /* We encode the format of the insn into the beginning of the name, so that
357 we can have the preprocessor help "typecheck" the insn vs the output
358 function. Arm didn't provide us with nice names for the formats, so we
359 use the section number of the architecture reference manual in which the
360 instruction group is described. */
362 /* Compare and branch (immediate). */
363 I3201_CBZ = 0x34000000,
364 I3201_CBNZ = 0x35000000,
366 /* Conditional branch (immediate). */
367 I3202_B_C = 0x54000000,
369 /* Unconditional branch (immediate). */
370 I3206_B = 0x14000000,
371 I3206_BL = 0x94000000,
373 /* Unconditional branch (register). */
374 I3207_BR = 0xd61f0000,
375 I3207_BLR = 0xd63f0000,
376 I3207_RET = 0xd65f0000,
378 /* AdvSIMD load/store single structure. */
379 I3303_LD1R = 0x0d40c000,
381 /* Load literal for loading the address at pc-relative offset */
382 I3305_LDR = 0x58000000,
383 I3305_LDR_v64 = 0x5c000000,
384 I3305_LDR_v128 = 0x9c000000,
386 /* Load/store exclusive. */
387 I3306_LDXP = 0xc8600000,
388 I3306_STXP = 0xc8200000,
390 /* Load/store register. Described here as 3.3.12, but the helper
391 that emits them can transform to 3.3.10 or 3.3.13. */
392 I3312_STRB = 0x38000000 | LDST_ST << 22 | MO_8 << 30,
393 I3312_STRH = 0x38000000 | LDST_ST << 22 | MO_16 << 30,
394 I3312_STRW = 0x38000000 | LDST_ST << 22 | MO_32 << 30,
395 I3312_STRX = 0x38000000 | LDST_ST << 22 | MO_64 << 30,
397 I3312_LDRB = 0x38000000 | LDST_LD << 22 | MO_8 << 30,
398 I3312_LDRH = 0x38000000 | LDST_LD << 22 | MO_16 << 30,
399 I3312_LDRW = 0x38000000 | LDST_LD << 22 | MO_32 << 30,
400 I3312_LDRX = 0x38000000 | LDST_LD << 22 | MO_64 << 30,
402 I3312_LDRSBW = 0x38000000 | LDST_LD_S_W << 22 | MO_8 << 30,
403 I3312_LDRSHW = 0x38000000 | LDST_LD_S_W << 22 | MO_16 << 30,
405 I3312_LDRSBX = 0x38000000 | LDST_LD_S_X << 22 | MO_8 << 30,
406 I3312_LDRSHX = 0x38000000 | LDST_LD_S_X << 22 | MO_16 << 30,
407 I3312_LDRSWX = 0x38000000 | LDST_LD_S_X << 22 | MO_32 << 30,
409 I3312_LDRVS = 0x3c000000 | LDST_LD << 22 | MO_32 << 30,
410 I3312_STRVS = 0x3c000000 | LDST_ST << 22 | MO_32 << 30,
412 I3312_LDRVD = 0x3c000000 | LDST_LD << 22 | MO_64 << 30,
413 I3312_STRVD = 0x3c000000 | LDST_ST << 22 | MO_64 << 30,
415 I3312_LDRVQ = 0x3c000000 | 3 << 22 | 0 << 30,
416 I3312_STRVQ = 0x3c000000 | 2 << 22 | 0 << 30,
418 I3312_TO_I3310 = 0x00200800,
419 I3312_TO_I3313 = 0x01000000,
421 /* Load/store register pair instructions. */
422 I3314_LDP = 0x28400000,
423 I3314_STP = 0x28000000,
425 /* Add/subtract immediate instructions. */
426 I3401_ADDI = 0x11000000,
427 I3401_ADDSI = 0x31000000,
428 I3401_SUBI = 0x51000000,
429 I3401_SUBSI = 0x71000000,
431 /* Bitfield instructions. */
432 I3402_BFM = 0x33000000,
433 I3402_SBFM = 0x13000000,
434 I3402_UBFM = 0x53000000,
436 /* Extract instruction. */
437 I3403_EXTR = 0x13800000,
439 /* Logical immediate instructions. */
440 I3404_ANDI = 0x12000000,
441 I3404_ORRI = 0x32000000,
442 I3404_EORI = 0x52000000,
443 I3404_ANDSI = 0x72000000,
445 /* Move wide immediate instructions. */
446 I3405_MOVN = 0x12800000,
447 I3405_MOVZ = 0x52800000,
448 I3405_MOVK = 0x72800000,
450 /* PC relative addressing instructions. */
451 I3406_ADR = 0x10000000,
452 I3406_ADRP = 0x90000000,
454 /* Add/subtract extended register instructions. */
455 I3501_ADD = 0x0b200000,
457 /* Add/subtract shifted register instructions (without a shift). */
458 I3502_ADD = 0x0b000000,
459 I3502_ADDS = 0x2b000000,
460 I3502_SUB = 0x4b000000,
461 I3502_SUBS = 0x6b000000,
463 /* Add/subtract shifted register instructions (with a shift). */
464 I3502S_ADD_LSL = I3502_ADD,
466 /* Add/subtract with carry instructions. */
467 I3503_ADC = 0x1a000000,
468 I3503_SBC = 0x5a000000,
470 /* Conditional select instructions. */
471 I3506_CSEL = 0x1a800000,
472 I3506_CSINC = 0x1a800400,
473 I3506_CSINV = 0x5a800000,
474 I3506_CSNEG = 0x5a800400,
476 /* Data-processing (1 source) instructions. */
477 I3507_CLZ = 0x5ac01000,
478 I3507_RBIT = 0x5ac00000,
479 I3507_REV = 0x5ac00000, /* + size << 10 */
481 /* Data-processing (2 source) instructions. */
482 I3508_LSLV = 0x1ac02000,
483 I3508_LSRV = 0x1ac02400,
484 I3508_ASRV = 0x1ac02800,
485 I3508_RORV = 0x1ac02c00,
486 I3508_SMULH = 0x9b407c00,
487 I3508_UMULH = 0x9bc07c00,
488 I3508_UDIV = 0x1ac00800,
489 I3508_SDIV = 0x1ac00c00,
491 /* Data-processing (3 source) instructions. */
492 I3509_MADD = 0x1b000000,
493 I3509_MSUB = 0x1b008000,
495 /* Logical shifted register instructions (without a shift). */
496 I3510_AND = 0x0a000000,
497 I3510_BIC = 0x0a200000,
498 I3510_ORR = 0x2a000000,
499 I3510_ORN = 0x2a200000,
500 I3510_EOR = 0x4a000000,
501 I3510_EON = 0x4a200000,
502 I3510_ANDS = 0x6a000000,
504 /* Logical shifted register instructions (with a shift). */
505 I3502S_AND_LSR = I3510_AND | (1 << 22),
508 I3605_DUP = 0x0e000400,
509 I3605_INS = 0x4e001c00,
510 I3605_UMOV = 0x0e003c00,
512 /* AdvSIMD modified immediate */
513 I3606_MOVI = 0x0f000400,
514 I3606_MVNI = 0x2f000400,
515 I3606_BIC = 0x2f001400,
516 I3606_ORR = 0x0f001400,
518 /* AdvSIMD scalar shift by immediate */
519 I3609_SSHR = 0x5f000400,
520 I3609_SSRA = 0x5f001400,
521 I3609_SHL = 0x5f005400,
522 I3609_USHR = 0x7f000400,
523 I3609_USRA = 0x7f001400,
524 I3609_SLI = 0x7f005400,
526 /* AdvSIMD scalar three same */
527 I3611_SQADD = 0x5e200c00,
528 I3611_SQSUB = 0x5e202c00,
529 I3611_CMGT = 0x5e203400,
530 I3611_CMGE = 0x5e203c00,
531 I3611_SSHL = 0x5e204400,
532 I3611_ADD = 0x5e208400,
533 I3611_CMTST = 0x5e208c00,
534 I3611_UQADD = 0x7e200c00,
535 I3611_UQSUB = 0x7e202c00,
536 I3611_CMHI = 0x7e203400,
537 I3611_CMHS = 0x7e203c00,
538 I3611_USHL = 0x7e204400,
539 I3611_SUB = 0x7e208400,
540 I3611_CMEQ = 0x7e208c00,
542 /* AdvSIMD scalar two-reg misc */
543 I3612_CMGT0 = 0x5e208800,
544 I3612_CMEQ0 = 0x5e209800,
545 I3612_CMLT0 = 0x5e20a800,
546 I3612_ABS = 0x5e20b800,
547 I3612_CMGE0 = 0x7e208800,
548 I3612_CMLE0 = 0x7e209800,
549 I3612_NEG = 0x7e20b800,
551 /* AdvSIMD shift by immediate */
552 I3614_SSHR = 0x0f000400,
553 I3614_SSRA = 0x0f001400,
554 I3614_SHL = 0x0f005400,
555 I3614_SLI = 0x2f005400,
556 I3614_USHR = 0x2f000400,
557 I3614_USRA = 0x2f001400,
559 /* AdvSIMD three same. */
560 I3616_ADD = 0x0e208400,
561 I3616_AND = 0x0e201c00,
562 I3616_BIC = 0x0e601c00,
563 I3616_BIF = 0x2ee01c00,
564 I3616_BIT = 0x2ea01c00,
565 I3616_BSL = 0x2e601c00,
566 I3616_EOR = 0x2e201c00,
567 I3616_MUL = 0x0e209c00,
568 I3616_ORR = 0x0ea01c00,
569 I3616_ORN = 0x0ee01c00,
570 I3616_SUB = 0x2e208400,
571 I3616_CMGT = 0x0e203400,
572 I3616_CMGE = 0x0e203c00,
573 I3616_CMTST = 0x0e208c00,
574 I3616_CMHI = 0x2e203400,
575 I3616_CMHS = 0x2e203c00,
576 I3616_CMEQ = 0x2e208c00,
577 I3616_SMAX = 0x0e206400,
578 I3616_SMIN = 0x0e206c00,
579 I3616_SSHL = 0x0e204400,
580 I3616_SQADD = 0x0e200c00,
581 I3616_SQSUB = 0x0e202c00,
582 I3616_UMAX = 0x2e206400,
583 I3616_UMIN = 0x2e206c00,
584 I3616_UQADD = 0x2e200c00,
585 I3616_UQSUB = 0x2e202c00,
586 I3616_USHL = 0x2e204400,
588 /* AdvSIMD two-reg misc. */
589 I3617_CMGT0 = 0x0e208800,
590 I3617_CMEQ0 = 0x0e209800,
591 I3617_CMLT0 = 0x0e20a800,
592 I3617_CMGE0 = 0x2e208800,
593 I3617_CMLE0 = 0x2e209800,
594 I3617_NOT = 0x2e205800,
595 I3617_ABS = 0x0e20b800,
596 I3617_NEG = 0x2e20b800,
598 /* System instructions. */
600 DMB_ISH = 0xd50338bf,
609 static inline uint32_t tcg_in32(TCGContext *s)
611 uint32_t v = *(uint32_t *)s->code_ptr;
615 /* Emit an opcode with "type-checking" of the format. */
616 #define tcg_out_insn(S, FMT, OP, ...) \
617 glue(tcg_out_insn_,FMT)(S, glue(glue(glue(I,FMT),_),OP), ## __VA_ARGS__)
619 static void tcg_out_insn_3303(TCGContext *s, AArch64Insn insn, bool q,
620 TCGReg rt, TCGReg rn, unsigned size)
622 tcg_out32(s, insn | (rt & 0x1f) | (rn << 5) | (size << 10) | (q << 30));
625 static void tcg_out_insn_3305(TCGContext *s, AArch64Insn insn,
626 int imm19, TCGReg rt)
628 tcg_out32(s, insn | (imm19 & 0x7ffff) << 5 | rt);
631 static void tcg_out_insn_3306(TCGContext *s, AArch64Insn insn, TCGReg rs,
632 TCGReg rt, TCGReg rt2, TCGReg rn)
634 tcg_out32(s, insn | rs << 16 | rt2 << 10 | rn << 5 | rt);
637 static void tcg_out_insn_3201(TCGContext *s, AArch64Insn insn, TCGType ext,
638 TCGReg rt, int imm19)
640 tcg_out32(s, insn | ext << 31 | (imm19 & 0x7ffff) << 5 | rt);
643 static void tcg_out_insn_3202(TCGContext *s, AArch64Insn insn,
644 TCGCond c, int imm19)
646 tcg_out32(s, insn | tcg_cond_to_aarch64[c] | (imm19 & 0x7ffff) << 5);
649 static void tcg_out_insn_3206(TCGContext *s, AArch64Insn insn, int imm26)
651 tcg_out32(s, insn | (imm26 & 0x03ffffff));
654 static void tcg_out_insn_3207(TCGContext *s, AArch64Insn insn, TCGReg rn)
656 tcg_out32(s, insn | rn << 5);
659 static void tcg_out_insn_3314(TCGContext *s, AArch64Insn insn,
660 TCGReg r1, TCGReg r2, TCGReg rn,
661 tcg_target_long ofs, bool pre, bool w)
663 insn |= 1u << 31; /* ext */
667 tcg_debug_assert(ofs >= -0x200 && ofs < 0x200 && (ofs & 7) == 0);
668 insn |= (ofs & (0x7f << 3)) << (15 - 3);
670 tcg_out32(s, insn | r2 << 10 | rn << 5 | r1);
673 static void tcg_out_insn_3401(TCGContext *s, AArch64Insn insn, TCGType ext,
674 TCGReg rd, TCGReg rn, uint64_t aimm)
677 tcg_debug_assert((aimm & 0xfff) == 0);
679 tcg_debug_assert(aimm <= 0xfff);
680 aimm |= 1 << 12; /* apply LSL 12 */
682 tcg_out32(s, insn | ext << 31 | aimm << 10 | rn << 5 | rd);
685 /* This function can be used for both 3.4.2 (Bitfield) and 3.4.4
686 (Logical immediate). Both insn groups have N, IMMR and IMMS fields
687 that feed the DecodeBitMasks pseudo function. */
688 static void tcg_out_insn_3402(TCGContext *s, AArch64Insn insn, TCGType ext,
689 TCGReg rd, TCGReg rn, int n, int immr, int imms)
691 tcg_out32(s, insn | ext << 31 | n << 22 | immr << 16 | imms << 10
695 #define tcg_out_insn_3404 tcg_out_insn_3402
697 static void tcg_out_insn_3403(TCGContext *s, AArch64Insn insn, TCGType ext,
698 TCGReg rd, TCGReg rn, TCGReg rm, int imms)
700 tcg_out32(s, insn | ext << 31 | ext << 22 | rm << 16 | imms << 10
704 /* This function is used for the Move (wide immediate) instruction group.
705 Note that SHIFT is a full shift count, not the 2 bit HW field. */
706 static void tcg_out_insn_3405(TCGContext *s, AArch64Insn insn, TCGType ext,
707 TCGReg rd, uint16_t half, unsigned shift)
709 tcg_debug_assert((shift & ~0x30) == 0);
710 tcg_out32(s, insn | ext << 31 | shift << (21 - 4) | half << 5 | rd);
713 static void tcg_out_insn_3406(TCGContext *s, AArch64Insn insn,
714 TCGReg rd, int64_t disp)
716 tcg_out32(s, insn | (disp & 3) << 29 | (disp & 0x1ffffc) << (5 - 2) | rd);
719 static inline void tcg_out_insn_3501(TCGContext *s, AArch64Insn insn,
720 TCGType sf, TCGReg rd, TCGReg rn,
721 TCGReg rm, int opt, int imm3)
723 tcg_out32(s, insn | sf << 31 | rm << 16 | opt << 13 |
724 imm3 << 10 | rn << 5 | rd);
727 /* This function is for both 3.5.2 (Add/Subtract shifted register), for
728 the rare occasion when we actually want to supply a shift amount. */
729 static inline void tcg_out_insn_3502S(TCGContext *s, AArch64Insn insn,
730 TCGType ext, TCGReg rd, TCGReg rn,
733 tcg_out32(s, insn | ext << 31 | rm << 16 | imm6 << 10 | rn << 5 | rd);
736 /* This function is for 3.5.2 (Add/subtract shifted register),
737 and 3.5.10 (Logical shifted register), for the vast majorty of cases
738 when we don't want to apply a shift. Thus it can also be used for
739 3.5.3 (Add/subtract with carry) and 3.5.8 (Data processing 2 source). */
740 static void tcg_out_insn_3502(TCGContext *s, AArch64Insn insn, TCGType ext,
741 TCGReg rd, TCGReg rn, TCGReg rm)
743 tcg_out32(s, insn | ext << 31 | rm << 16 | rn << 5 | rd);
746 #define tcg_out_insn_3503 tcg_out_insn_3502
747 #define tcg_out_insn_3508 tcg_out_insn_3502
748 #define tcg_out_insn_3510 tcg_out_insn_3502
750 static void tcg_out_insn_3506(TCGContext *s, AArch64Insn insn, TCGType ext,
751 TCGReg rd, TCGReg rn, TCGReg rm, TCGCond c)
753 tcg_out32(s, insn | ext << 31 | rm << 16 | rn << 5 | rd
754 | tcg_cond_to_aarch64[c] << 12);
757 static void tcg_out_insn_3507(TCGContext *s, AArch64Insn insn, TCGType ext,
758 TCGReg rd, TCGReg rn)
760 tcg_out32(s, insn | ext << 31 | rn << 5 | rd);
763 static void tcg_out_insn_3509(TCGContext *s, AArch64Insn insn, TCGType ext,
764 TCGReg rd, TCGReg rn, TCGReg rm, TCGReg ra)
766 tcg_out32(s, insn | ext << 31 | rm << 16 | ra << 10 | rn << 5 | rd);
769 static void tcg_out_insn_3605(TCGContext *s, AArch64Insn insn, bool q,
770 TCGReg rd, TCGReg rn, int dst_idx, int src_idx)
772 /* Note that bit 11 set means general register input. Therefore
773 we can handle both register sets with one function. */
774 tcg_out32(s, insn | q << 30 | (dst_idx << 16) | (src_idx << 11)
775 | (rd & 0x1f) | (~rn & 0x20) << 6 | (rn & 0x1f) << 5);
778 static void tcg_out_insn_3606(TCGContext *s, AArch64Insn insn, bool q,
779 TCGReg rd, bool op, int cmode, uint8_t imm8)
781 tcg_out32(s, insn | q << 30 | op << 29 | cmode << 12 | (rd & 0x1f)
782 | (imm8 & 0xe0) << (16 - 5) | (imm8 & 0x1f) << 5);
785 static void tcg_out_insn_3609(TCGContext *s, AArch64Insn insn,
786 TCGReg rd, TCGReg rn, unsigned immhb)
788 tcg_out32(s, insn | immhb << 16 | (rn & 0x1f) << 5 | (rd & 0x1f));
791 static void tcg_out_insn_3611(TCGContext *s, AArch64Insn insn,
792 unsigned size, TCGReg rd, TCGReg rn, TCGReg rm)
794 tcg_out32(s, insn | (size << 22) | (rm & 0x1f) << 16
795 | (rn & 0x1f) << 5 | (rd & 0x1f));
798 static void tcg_out_insn_3612(TCGContext *s, AArch64Insn insn,
799 unsigned size, TCGReg rd, TCGReg rn)
801 tcg_out32(s, insn | (size << 22) | (rn & 0x1f) << 5 | (rd & 0x1f));
804 static void tcg_out_insn_3614(TCGContext *s, AArch64Insn insn, bool q,
805 TCGReg rd, TCGReg rn, unsigned immhb)
807 tcg_out32(s, insn | q << 30 | immhb << 16
808 | (rn & 0x1f) << 5 | (rd & 0x1f));
811 static void tcg_out_insn_3616(TCGContext *s, AArch64Insn insn, bool q,
812 unsigned size, TCGReg rd, TCGReg rn, TCGReg rm)
814 tcg_out32(s, insn | q << 30 | (size << 22) | (rm & 0x1f) << 16
815 | (rn & 0x1f) << 5 | (rd & 0x1f));
818 static void tcg_out_insn_3617(TCGContext *s, AArch64Insn insn, bool q,
819 unsigned size, TCGReg rd, TCGReg rn)
821 tcg_out32(s, insn | q << 30 | (size << 22)
822 | (rn & 0x1f) << 5 | (rd & 0x1f));
825 static void tcg_out_insn_3310(TCGContext *s, AArch64Insn insn,
826 TCGReg rd, TCGReg base, TCGType ext,
829 /* Note the AArch64Insn constants above are for C3.3.12. Adjust. */
830 tcg_out32(s, insn | I3312_TO_I3310 | regoff << 16 |
831 0x4000 | ext << 13 | base << 5 | (rd & 0x1f));
834 static void tcg_out_insn_3312(TCGContext *s, AArch64Insn insn,
835 TCGReg rd, TCGReg rn, intptr_t offset)
837 tcg_out32(s, insn | (offset & 0x1ff) << 12 | rn << 5 | (rd & 0x1f));
840 static void tcg_out_insn_3313(TCGContext *s, AArch64Insn insn,
841 TCGReg rd, TCGReg rn, uintptr_t scaled_uimm)
843 /* Note the AArch64Insn constants above are for C3.3.12. Adjust. */
844 tcg_out32(s, insn | I3312_TO_I3313 | scaled_uimm << 10
845 | rn << 5 | (rd & 0x1f));
848 static void tcg_out_bti(TCGContext *s, AArch64Insn insn)
851 * While BTI insns are nops on hosts without FEAT_BTI,
852 * there is no point in emitting them in that case either.
854 if (cpuinfo & CPUINFO_BTI) {
859 /* Register to register move using ORR (shifted register with no shift). */
860 static void tcg_out_movr(TCGContext *s, TCGType ext, TCGReg rd, TCGReg rm)
862 tcg_out_insn(s, 3510, ORR, ext, rd, TCG_REG_XZR, rm);
865 /* Register to register move using ADDI (move to/from SP). */
866 static void tcg_out_movr_sp(TCGContext *s, TCGType ext, TCGReg rd, TCGReg rn)
868 tcg_out_insn(s, 3401, ADDI, ext, rd, rn, 0);
871 /* This function is used for the Logical (immediate) instruction group.
872 The value of LIMM must satisfy IS_LIMM. See the comment above about
873 only supporting simplified logical immediates. */
874 static void tcg_out_logicali(TCGContext *s, AArch64Insn insn, TCGType ext,
875 TCGReg rd, TCGReg rn, uint64_t limm)
879 tcg_debug_assert(is_limm(limm));
884 r = 0; /* form 0....01....1 */
885 c = ctz64(~limm) - 1;
887 r = clz64(~limm); /* form 1..10..01..1 */
891 r = 64 - l; /* form 1....10....0 or 0..01..10..0 */
894 if (ext == TCG_TYPE_I32) {
899 tcg_out_insn_3404(s, insn, ext, rd, rn, ext, r, c);
902 static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
903 TCGReg rd, int64_t v64)
905 bool q = type == TCG_TYPE_V128;
908 /* Test all bytes equal first. */
911 tcg_out_insn(s, 3606, MOVI, q, rd, 0, 0xe, imm8);
916 * Test all bytes 0x00 or 0xff second. This can match cases that
917 * might otherwise take 2 or 3 insns for MO_16 or MO_32 below.
919 for (i = imm8 = 0; i < 8; i++) {
920 uint8_t byte = v64 >> (i * 8);
923 } else if (byte != 0) {
927 tcg_out_insn(s, 3606, MOVI, q, rd, 1, 0xe, imm8);
932 * Tests for various replications. For each element width, if we
933 * cannot find an expansion there's no point checking a larger
934 * width because we already know by replication it cannot match.
939 if (is_shimm16(v16, &cmode, &imm8)) {
940 tcg_out_insn(s, 3606, MOVI, q, rd, 0, cmode, imm8);
943 if (is_shimm16(~v16, &cmode, &imm8)) {
944 tcg_out_insn(s, 3606, MVNI, q, rd, 0, cmode, imm8);
949 * Otherwise, all remaining constants can be loaded in two insns:
950 * rd = v16 & 0xff, rd |= v16 & 0xff00.
952 tcg_out_insn(s, 3606, MOVI, q, rd, 0, 0x8, v16 & 0xff);
953 tcg_out_insn(s, 3606, ORR, q, rd, 0, 0xa, v16 >> 8);
955 } else if (vece == MO_32) {
959 if (is_shimm32(v32, &cmode, &imm8) ||
960 is_soimm32(v32, &cmode, &imm8) ||
961 is_fimm32(v32, &cmode, &imm8)) {
962 tcg_out_insn(s, 3606, MOVI, q, rd, 0, cmode, imm8);
965 if (is_shimm32(n32, &cmode, &imm8) ||
966 is_soimm32(n32, &cmode, &imm8)) {
967 tcg_out_insn(s, 3606, MVNI, q, rd, 0, cmode, imm8);
972 * Restrict the set of constants to those we can load with
973 * two instructions. Others we load from the pool.
975 i = is_shimm32_pair(v32, &cmode, &imm8);
977 tcg_out_insn(s, 3606, MOVI, q, rd, 0, cmode, imm8);
978 tcg_out_insn(s, 3606, ORR, q, rd, 0, i, extract32(v32, i * 4, 8));
981 i = is_shimm32_pair(n32, &cmode, &imm8);
983 tcg_out_insn(s, 3606, MVNI, q, rd, 0, cmode, imm8);
984 tcg_out_insn(s, 3606, BIC, q, rd, 0, i, extract32(n32, i * 4, 8));
987 } else if (is_fimm64(v64, &cmode, &imm8)) {
988 tcg_out_insn(s, 3606, MOVI, q, rd, 1, cmode, imm8);
993 * As a last resort, load from the constant pool. Sadly there
994 * is no LD1R (literal), so store the full 16-byte vector.
996 if (type == TCG_TYPE_V128) {
997 new_pool_l2(s, R_AARCH64_CONDBR19, s->code_ptr, 0, v64, v64);
998 tcg_out_insn(s, 3305, LDR_v128, 0, rd);
1000 new_pool_label(s, v64, R_AARCH64_CONDBR19, s->code_ptr, 0);
1001 tcg_out_insn(s, 3305, LDR_v64, 0, rd);
1005 static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
1006 TCGReg rd, TCGReg rs)
1008 int is_q = type - TCG_TYPE_V64;
1009 tcg_out_insn(s, 3605, DUP, is_q, rd, rs, 1 << vece, 0);
1013 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
1014 TCGReg r, TCGReg base, intptr_t offset)
1016 TCGReg temp = TCG_REG_TMP0;
1018 if (offset < -0xffffff || offset > 0xffffff) {
1019 tcg_out_movi(s, TCG_TYPE_PTR, temp, offset);
1020 tcg_out_insn(s, 3502, ADD, 1, temp, temp, base);
1023 AArch64Insn add_insn = I3401_ADDI;
1026 add_insn = I3401_SUBI;
1029 if (offset & 0xfff000) {
1030 tcg_out_insn_3401(s, add_insn, 1, temp, base, offset & 0xfff000);
1033 if (offset & 0xfff) {
1034 tcg_out_insn_3401(s, add_insn, 1, temp, base, offset & 0xfff);
1038 tcg_out_insn(s, 3303, LD1R, type == TCG_TYPE_V128, r, base, vece);
1042 static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
1043 tcg_target_long value)
1045 tcg_target_long svalue = value;
1046 tcg_target_long ivalue = ~value;
1047 tcg_target_long t0, t1, t2;
1054 tcg_debug_assert(rd < 32);
1057 g_assert_not_reached();
1060 /* For 32-bit values, discard potential garbage in value. For 64-bit
1061 values within [2**31, 2**32-1], we can create smaller sequences by
1062 interpreting this as a negative 32-bit number, while ensuring that
1063 the high 32 bits are cleared by setting SF=0. */
1064 if (type == TCG_TYPE_I32 || (value & ~0xffffffffull) == 0) {
1065 svalue = (int32_t)value;
1066 value = (uint32_t)value;
1067 ivalue = (uint32_t)ivalue;
1068 type = TCG_TYPE_I32;
1071 /* Speed things up by handling the common case of small positive
1072 and negative values specially. */
1073 if ((value & ~0xffffull) == 0) {
1074 tcg_out_insn(s, 3405, MOVZ, type, rd, value, 0);
1076 } else if ((ivalue & ~0xffffull) == 0) {
1077 tcg_out_insn(s, 3405, MOVN, type, rd, ivalue, 0);
1081 /* Check for bitfield immediates. For the benefit of 32-bit quantities,
1082 use the sign-extended value. That lets us match rotated values such
1083 as 0xff0000ff with the same 64-bit logic matching 0xffffffffff0000ff. */
1084 if (is_limm(svalue)) {
1085 tcg_out_logicali(s, I3404_ORRI, type, rd, TCG_REG_XZR, svalue);
1089 /* Look for host pointer values within 4G of the PC. This happens
1090 often when loading pointers to QEMU's own data structures. */
1091 if (type == TCG_TYPE_I64) {
1092 intptr_t src_rx = (intptr_t)tcg_splitwx_to_rx(s->code_ptr);
1093 tcg_target_long disp = value - src_rx;
1094 if (disp == sextract64(disp, 0, 21)) {
1095 tcg_out_insn(s, 3406, ADR, rd, disp);
1098 disp = (value >> 12) - (src_rx >> 12);
1099 if (disp == sextract64(disp, 0, 21)) {
1100 tcg_out_insn(s, 3406, ADRP, rd, disp);
1101 if (value & 0xfff) {
1102 tcg_out_insn(s, 3401, ADDI, type, rd, rd, value & 0xfff);
1108 /* Would it take fewer insns to begin with MOVN? */
1109 if (ctpop64(value) >= 32) {
1116 s0 = ctz64(t0) & (63 & -16);
1117 t1 = t0 & ~(0xffffull << s0);
1118 s1 = ctz64(t1) & (63 & -16);
1119 t2 = t1 & ~(0xffffull << s1);
1121 tcg_out_insn_3405(s, opc, type, rd, t0 >> s0, s0);
1123 tcg_out_insn(s, 3405, MOVK, type, rd, value >> s1, s1);
1128 /* For more than 2 insns, dump it into the constant pool. */
1129 new_pool_label(s, value, R_AARCH64_CONDBR19, s->code_ptr, 0);
1130 tcg_out_insn(s, 3305, LDR, 0, rd);
1133 static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1138 static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1139 tcg_target_long imm)
1141 /* This function is only used for passing structs by reference. */
1142 g_assert_not_reached();
1145 /* Define something more legible for general use. */
1146 #define tcg_out_ldst_r tcg_out_insn_3310
1148 static void tcg_out_ldst(TCGContext *s, AArch64Insn insn, TCGReg rd,
1149 TCGReg rn, intptr_t offset, int lgsize)
1151 /* If the offset is naturally aligned and in range, then we can
1152 use the scaled uimm12 encoding */
1153 if (offset >= 0 && !(offset & ((1 << lgsize) - 1))) {
1154 uintptr_t scaled_uimm = offset >> lgsize;
1155 if (scaled_uimm <= 0xfff) {
1156 tcg_out_insn_3313(s, insn, rd, rn, scaled_uimm);
1161 /* Small signed offsets can use the unscaled encoding. */
1162 if (offset >= -256 && offset < 256) {
1163 tcg_out_insn_3312(s, insn, rd, rn, offset);
1167 /* Worst-case scenario, move offset to temp register, use reg offset. */
1168 tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, offset);
1169 tcg_out_ldst_r(s, insn, rd, rn, TCG_TYPE_I64, TCG_REG_TMP0);
/*
 * Register-to-register move.  Register numbers < 32 are general regs,
 * >= 32 are vector regs; the four combinations select MOV, UMOV (vec
 * to core), INS (core to vec), or vector ORR Vd, Vn, Vn (vec to vec).
 * NOTE(review): the surrounding switch on 'type' is elided here.
 */
static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
        if (ret < 32 && arg < 32) {
            tcg_out_movr(s, type, ret, arg);
        } else if (ret < 32) {
            tcg_out_insn(s, 3605, UMOV, type, ret, arg, 0, 0);
        } else if (arg < 32) {
            /* INS element 0; 4 << type gives the element-size imm5 bit. */
            tcg_out_insn(s, 3605, INS, 0, ret, arg, 4 << type, 0);
        /* 64-bit vector: ORR Vd.8b, Vn.8b, Vn.8b acts as a move. */
        tcg_debug_assert(ret >= 32 && arg >= 32);
        tcg_out_insn(s, 3616, ORR, 0, 0, ret, arg, arg);
        /* 128-bit vector: ORR Vd.16b, Vn.16b, Vn.16b. */
        tcg_debug_assert(ret >= 32 && arg >= 32);
        tcg_out_insn(s, 3616, ORR, 1, 0, ret, arg, arg);
        g_assert_not_reached();
/*
 * Load a value of the given TCG type from [base + ofs], selecting a
 * core-register or vector-register load by the register number.
 * NOTE(review): the switch on 'type' (and lgsz assignment) is elided.
 */
static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg base, intptr_t ofs)
        insn = (ret < 32 ? I3312_LDRW : I3312_LDRVS);
        insn = (ret < 32 ? I3312_LDRX : I3312_LDRVD);
        g_assert_not_reached();
    tcg_out_ldst(s, insn, ret, base, ofs, lgsz);
/*
 * Store a value of the given TCG type to [base + ofs]; mirror image
 * of tcg_out_ld above.
 */
static void tcg_out_st(TCGContext *s, TCGType type, TCGReg src,
                       TCGReg base, intptr_t ofs)
        insn = (src < 32 ? I3312_STRW : I3312_STRVS);
        insn = (src < 32 ? I3312_STRX : I3312_STRVD);
        g_assert_not_reached();
    tcg_out_ldst(s, insn, src, base, ofs, lgsz);
/*
 * Try to store constant 'val' directly.  Only zero can be stored
 * without a materializing move, via the XZR/WZR register.
 */
static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                               TCGReg base, intptr_t ofs)
    if (type <= TCG_TYPE_I64 && val == 0) {
        tcg_out_st(s, type, TCG_REG_XZR, base, ofs);
/* Bitfield move (BFM): insert bits of rn into rd, immr=a, imms=b. */
static inline void tcg_out_bfm(TCGContext *s, TCGType ext, TCGReg rd,
                               TCGReg rn, unsigned int a, unsigned int b)
    tcg_out_insn(s, 3402, BFM, ext, rd, rn, ext, a, b);
/* Unsigned bitfield move (UBFM), immr=a, imms=b; basis for LSL/LSR/UXT*. */
static inline void tcg_out_ubfm(TCGContext *s, TCGType ext, TCGReg rd,
                                TCGReg rn, unsigned int a, unsigned int b)
    tcg_out_insn(s, 3402, UBFM, ext, rd, rn, ext, a, b);
/* Signed bitfield move (SBFM), immr=a, imms=b; basis for ASR/SXT*. */
static inline void tcg_out_sbfm(TCGContext *s, TCGType ext, TCGReg rd,
                                TCGReg rn, unsigned int a, unsigned int b)
    tcg_out_insn(s, 3402, SBFM, ext, rd, rn, ext, a, b);
/* Extract (EXTR): rd = low bits of rn:rm shifted right by a. */
static inline void tcg_out_extr(TCGContext *s, TCGType ext, TCGReg rd,
                                TCGReg rn, TCGReg rm, unsigned int a)
    tcg_out_insn(s, 3403, EXTR, ext, rd, rn, rm, a);
/*
 * Logical shift left by immediate m, via the LSL alias of UBFM.
 * NOTE(review): 'max' (= bits - 1) is declared on an elided line.
 */
static inline void tcg_out_shl(TCGContext *s, TCGType ext,
                               TCGReg rd, TCGReg rn, unsigned int m)
    int bits = ext ? 64 : 32;
    tcg_out_ubfm(s, ext, rd, rn, (bits - m) & max, (max - m) & max);
/* Logical shift right by immediate m, via the LSR alias of UBFM. */
static inline void tcg_out_shr(TCGContext *s, TCGType ext,
                               TCGReg rd, TCGReg rn, unsigned int m)
    int max = ext ? 63 : 31;
    tcg_out_ubfm(s, ext, rd, rn, m & max, max);
/* Arithmetic shift right by immediate m, via the ASR alias of SBFM. */
static inline void tcg_out_sar(TCGContext *s, TCGType ext,
                               TCGReg rd, TCGReg rn, unsigned int m)
    int max = ext ? 63 : 31;
    tcg_out_sbfm(s, ext, rd, rn, m & max, max);
/* Rotate right by immediate m, via the ROR alias of EXTR rn:rn. */
static inline void tcg_out_rotr(TCGContext *s, TCGType ext,
                                TCGReg rd, TCGReg rn, unsigned int m)
    int max = ext ? 63 : 31;
    tcg_out_extr(s, ext, rd, rn, rn, m & max);
/* Rotate left by immediate m == rotate right by (width - m). */
static inline void tcg_out_rotl(TCGContext *s, TCGType ext,
                                TCGReg rd, TCGReg rn, unsigned int m)
    int max = ext ? 63 : 31;
    tcg_out_extr(s, ext, rd, rn, rn, -m & max);
/*
 * Deposit: insert 'width' low bits of rn into rd at bit 'lsb',
 * using the BFI alias of BFM (immr = -lsb mod size, imms = width-1).
 */
static inline void tcg_out_dep(TCGContext *s, TCGType ext, TCGReg rd,
                               TCGReg rn, unsigned lsb, unsigned width)
    unsigned size = ext ? 64 : 32;
    unsigned a = (size - lsb) & (size - 1);
    unsigned b = width - 1;
    tcg_out_bfm(s, ext, rd, rn, a, b);
/*
 * Compare register a against b (immediate when const_b, else register),
 * setting the flags.  A negative immediate is handled as CMN with -b.
 * NOTE(review): the const_b / sign tests themselves are on elided lines.
 */
static void tcg_out_cmp(TCGContext *s, TCGType ext, TCGReg a,
                        tcg_target_long b, bool const_b)
        /* Using CMP or CMN aliases. */
            tcg_out_insn(s, 3401, SUBSI, ext, TCG_REG_XZR, a, b);
            tcg_out_insn(s, 3401, ADDSI, ext, TCG_REG_XZR, a, -b);
        /* Using CMP alias SUBS wzr, Wn, Wm */
        tcg_out_insn(s, 3502, SUBS, ext, TCG_REG_XZR, a, b);
/* Unconditional direct branch; target must be within +/-128MB (26-bit). */
static void tcg_out_goto(TCGContext *s, const tcg_insn_unit *target)
    ptrdiff_t offset = tcg_pcrel_diff(s, target) >> 2;
    tcg_debug_assert(offset == sextract64(offset, 0, 26));
    tcg_out_insn(s, 3206, B, offset);
/*
 * Call a target: BL when the 26-bit pc-relative range suffices,
 * otherwise materialize the address in TMP0 and use BLR.
 */
static void tcg_out_call_int(TCGContext *s, const tcg_insn_unit *target)
    ptrdiff_t offset = tcg_pcrel_diff(s, target) >> 2;
    if (offset == sextract64(offset, 0, 26)) {
        tcg_out_insn(s, 3206, BL, offset);
        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, (intptr_t)target);
        tcg_out_insn(s, 3207, BLR, TCG_REG_TMP0);
/* Standard call hook; the helper ABI info is not needed on aarch64. */
static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target,
                         const TCGHelperInfo *info)
    tcg_out_call_int(s, target);
/*
 * Branch to a TCG label: emit B with a relocation if the label is not
 * yet resolved, else a direct branch to its known address.
 */
static inline void tcg_out_goto_label(TCGContext *s, TCGLabel *l)
    if (!l->has_value) {
        tcg_out_reloc(s, s->code_ptr, R_AARCH64_JUMP26, l, 0);
        tcg_out_insn(s, 3206, B, 0);
        tcg_out_goto(s, l->u.value_ptr);
/*
 * Conditional branch to label l on (a <cond> b).  Comparisons against
 * constant zero for EQ/NE use CBZ/CBNZ and skip the flag-setting CMP;
 * otherwise CMP then B.cond with a 19-bit pc-relative offset.
 */
static void tcg_out_brcond(TCGContext *s, TCGType ext, TCGCond c, TCGArg a,
                           TCGArg b, bool b_const, TCGLabel *l)
    if (b_const && b == 0 && (c == TCG_COND_EQ || c == TCG_COND_NE)) {
        tcg_out_cmp(s, ext, a, b, b_const);

    if (!l->has_value) {
        tcg_out_reloc(s, s->code_ptr, R_AARCH64_CONDBR19, l, 0);
        /* Preserve any prior imm19 bits already in the insn word. */
        offset = tcg_in32(s) >> 5;
        offset = tcg_pcrel_diff(s, l->u.value_ptr) >> 2;
        tcg_debug_assert(offset == sextract64(offset, 0, 19));

        tcg_out_insn(s, 3202, B_C, c, offset);
    } else if (c == TCG_COND_EQ) {
        tcg_out_insn(s, 3201, CBZ, ext, a, offset);
        tcg_out_insn(s, 3201, CBNZ, ext, a, offset);
/* Byte-reverse within elements of size 1 << s_bits (REV16/REV32/REV). */
static inline void tcg_out_rev(TCGContext *s, int ext, MemOp s_bits,
                               TCGReg rd, TCGReg rn)
    /* REV, REV16, REV32 */
    tcg_out_insn_3507(s, I3507_REV | (s_bits << 10), ext, rd, rn);
/* Sign-extend the low 8 << s_bits bits of rn into rd. */
static inline void tcg_out_sxt(TCGContext *s, TCGType ext, MemOp s_bits,
                               TCGReg rd, TCGReg rn)
    /* Using ALIASes SXTB, SXTH, SXTW, of SBFM Xd, Xn, #0, #7|15|31 */
    int bits = (8 << s_bits) - 1;
    tcg_out_sbfm(s, ext, rd, rn, 0, bits);
/* Sign-extend byte. */
static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg rd, TCGReg rn)
    tcg_out_sxt(s, type, MO_8, rd, rn);
/* Sign-extend halfword. */
static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg rd, TCGReg rn)
    tcg_out_sxt(s, type, MO_16, rd, rn);
/* Sign-extend word to 64 bits (SXTW). */
static void tcg_out_ext32s(TCGContext *s, TCGReg rd, TCGReg rn)
    tcg_out_sxt(s, TCG_TYPE_I64, MO_32, rd, rn);
/* i32 -> i64 sign extension hook. */
static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg rd, TCGReg rn)
    tcg_out_ext32s(s, rd, rn);
/* Zero-extend the low 8 << s_bits bits of rn into rd (32-bit form). */
static inline void tcg_out_uxt(TCGContext *s, MemOp s_bits,
                               TCGReg rd, TCGReg rn)
    /* Using ALIASes UXTB, UXTH of UBFM Wd, Wn, #0, #7|15 */
    int bits = (8 << s_bits) - 1;
    tcg_out_ubfm(s, 0, rd, rn, 0, bits);
/* Zero-extend byte. */
static void tcg_out_ext8u(TCGContext *s, TCGReg rd, TCGReg rn)
    tcg_out_uxt(s, MO_8, rd, rn);
/* Zero-extend halfword. */
static void tcg_out_ext16u(TCGContext *s, TCGReg rd, TCGReg rn)
    tcg_out_uxt(s, MO_16, rd, rn);
/* Zero-extend word: a 32-bit MOV clears the high 32 bits. */
static void tcg_out_ext32u(TCGContext *s, TCGReg rd, TCGReg rn)
    tcg_out_movr(s, TCG_TYPE_I32, rd, rn);
/* i32 -> i64 zero extension hook. */
static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg rd, TCGReg rn)
    tcg_out_ext32u(s, rd, rn);
/* Truncate i64 to i32: a 32-bit register move suffices. */
static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg rd, TCGReg rn)
    tcg_out_mov(s, TCG_TYPE_I32, rd, rn);
/*
 * Add a signed immediate: ADDI for non-negative values, SUBI of the
 * negation otherwise (the uimm12 field cannot encode negatives).
 */
static void tcg_out_addsubi(TCGContext *s, int ext, TCGReg rd,
                            TCGReg rn, int64_t aimm)
        tcg_out_insn(s, 3401, ADDI, ext, rd, rn, aimm);
        tcg_out_insn(s, 3401, SUBI, ext, rd, rn, -aimm);
/*
 * Double-word add/sub with carry: {rh,rl} = {ah,al} +/- {bh,bl}.
 * Low half uses ADDS/SUBS (flag-setting), high half ADC/SBC.  If rl
 * would clobber an input of the high half, compute into a temp and
 * move to the real destination at the end.
 */
static void tcg_out_addsub2(TCGContext *s, TCGType ext, TCGReg rl,
                            TCGReg rh, TCGReg al, TCGReg ah,
                            tcg_target_long bl, tcg_target_long bh,
                            bool const_bl, bool const_bh, bool sub)
    TCGReg orig_rl = rl;

    if (rl == ah || (!const_bh && rl == bh)) {

        /* Negative immediate: flip the operation and negate bl. */
            insn = sub ? I3401_ADDSI : I3401_SUBSI;
            insn = sub ? I3401_SUBSI : I3401_ADDSI;

        if (unlikely(al == TCG_REG_XZR)) {
            /* ??? We want to allow al to be zero for the benefit of
               negation via subtraction.  However, that leaves open the
               possibility of adding 0+const in the low part, and the
               immediate add instructions encode XSP not XZR.  Don't try
               anything more elaborate here than loading another zero. */
            tcg_out_movi(s, ext, al, 0);
        tcg_out_insn_3401(s, insn, ext, rl, al, bl);
        tcg_out_insn_3502(s, sub ? I3502_SUBS : I3502_ADDS, ext, rl, al, bl);

    /* Note that the only two constants we support are 0 and -1, and
       that SBC = rn + ~rm + c, so adc -1 is sbc 0, and vice-versa. */
    if ((bh != 0) ^ sub) {
    tcg_out_insn_3503(s, insn, ext, rh, ah, bh);

    tcg_out_mov(s, ext, orig_rl, rl);
/*
 * Emit a memory barrier.  Map the TCG_MO_* ordering bits onto the
 * weakest sufficient DMB: store-store -> DMB ISHST, any load ordering
 * -> DMB ISHLD, everything else -> full DMB ISH.
 */
static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
    static const uint32_t sync[] = {
        [0 ... TCG_MO_ALL] = DMB_ISH | DMB_LD | DMB_ST,
        [TCG_MO_ST_ST] = DMB_ISH | DMB_ST,
        [TCG_MO_LD_LD] = DMB_ISH | DMB_LD,
        [TCG_MO_LD_ST] = DMB_ISH | DMB_LD,
        [TCG_MO_LD_ST | TCG_MO_LD_LD] = DMB_ISH | DMB_LD,
    tcg_out32(s, sync[a0 & TCG_MO_ALL]);
/*
 * Count leading/trailing zeros of a0 into d, with fallback value b
 * (register or constant) when a0 == 0.  CTZ is CLZ of the bit-reversed
 * input.  When b equals the operand width, bare CLZ already gives the
 * right answer; otherwise compare-with-zero and CSEL in the fallback.
 * NOTE(review): several branches of the const_b handling are elided.
 */
static void tcg_out_cltz(TCGContext *s, TCGType ext, TCGReg d,
                         TCGReg a0, TCGArg b, bool const_b, bool is_ctz)
        tcg_out_insn(s, 3507, RBIT, ext, a1, a0);

    if (const_b && b == (ext ? 64 : 32)) {
        tcg_out_insn(s, 3507, CLZ, ext, d, a1);
        AArch64Insn sel = I3506_CSEL;

        tcg_out_cmp(s, ext, a0, 0, 1);
        tcg_out_insn(s, 3507, CLZ, ext, TCG_REG_TMP0, a1);

            } else if (b == 0) {
                tcg_out_movi(s, ext, d, b);

        tcg_out_insn_3506(s, sel, ext, d, TCG_REG_TMP0, b, TCG_COND_NE);
/*
 * Whether guest memory ops may carry a bswap in the MemOp.
 * NOTE(review): body elided — presumably returns false (aarch64 emits
 * explicit REV after the access); confirm against the full source.
 */
bool tcg_target_has_memory_bswap(MemOp memop)
/* Scratch-register description shared by the ld/st slow-path helpers. */
static const TCGLdstHelperParam ldst_helper_param = {
    .ntmp = 1, .tmp = { TCG_REG_TMP0 }
/*
 * Emit the out-of-line slow path for a guest load: patch the forward
 * conditional branch to land here, marshal arguments, call the sized
 * load helper, move the result back, and jump to the fast-path resume.
 */
static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
    MemOp opc = get_memop(lb->oi);

    if (!reloc_pc19(lb->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {

    tcg_out_ld_helper_args(s, lb, &ldst_helper_param);
    tcg_out_call_int(s, qemu_ld_helpers[opc & MO_SIZE]);
    tcg_out_ld_helper_ret(s, lb, false, &ldst_helper_param);
    tcg_out_goto(s, lb->raddr);
/* Out-of-line slow path for a guest store; mirrors the load version. */
static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
    MemOp opc = get_memop(lb->oi);

    if (!reloc_pc19(lb->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {

    tcg_out_st_helper_args(s, lb, &ldst_helper_param);
    tcg_out_call_int(s, qemu_st_helpers[opc & MO_SIZE]);
    tcg_out_goto(s, lb->raddr);
/* We expect to use a 7-bit scaled negative offset from ENV
   (LDP imm7 is scaled by 8: 7-bit signed field -> -512 minimum). */
#define MIN_TLB_MASK_TABLE_OFS  -512
/*
 * For system-mode, perform the TLB load and compare.
 * For user-mode, perform any required alignment tests.
 * In both cases, return a TCGLabelQemuLdst structure if the slow path
 * is required and fill in @h with the host address for the fast path.
 */
static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
                                           TCGReg addr_reg, MemOpIdx oi,
    TCGType addr_type = s->addr_type;
    TCGLabelQemuLdst *ldst = NULL;
    MemOp opc = get_memop(oi);
    MemOp s_bits = opc & MO_SIZE;

    /* LSE2 gives single-copy atomicity for accesses within 16 bytes. */
    h->aa = atom_and_align_for_opc(s, opc,
                                   have_lse2 ? MO_ATOM_WITHIN16
    a_mask = (1 << h->aa.align) - 1;

    if (tcg_use_softmmu) {
        unsigned s_mask = (1u << s_bits) - 1;
        unsigned mem_index = get_mmuidx(oi);
        uint64_t compare_mask;

        ldst = new_ldst_label(s);
        ldst->is_ld = is_ld;
        ldst->addrlo_reg = addr_reg;

        /* Mask arithmetic may exceed 32 bits with large dynamic TLBs. */
        mask_type = (s->page_bits + s->tlb_dyn_max_bits > 32
                     ? TCG_TYPE_I64 : TCG_TYPE_I32);

        /* Load cpu->neg.tlb.f[mmu_idx].{mask,table} into {tmp0,tmp1}. */
        QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0);
        QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 8);
        tcg_out_insn(s, 3314, LDP, TCG_REG_TMP0, TCG_REG_TMP1, TCG_AREG0,
                     tlb_mask_table_ofs(s, mem_index), 1, 0);

        /* Extract the TLB index from the address into X0. */
        tcg_out_insn(s, 3502S, AND_LSR, mask_type == TCG_TYPE_I64,
                     TCG_REG_TMP0, TCG_REG_TMP0, addr_reg,
                     s->page_bits - CPU_TLB_ENTRY_BITS);

        /* Add the tlb_table pointer, forming the CPUTLBEntry address. */
        tcg_out_insn(s, 3502, ADD, 1, TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_TMP0);

        /* Load the tlb comparator into TMP0, and the fast path addend. */
        QEMU_BUILD_BUG_ON(HOST_BIG_ENDIAN);
        tcg_out_ld(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP1,
                   is_ld ? offsetof(CPUTLBEntry, addr_read)
                   : offsetof(CPUTLBEntry, addr_write));
        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1,
                   offsetof(CPUTLBEntry, addend));

        /*
         * For aligned accesses, we check the first byte and include
         * the alignment bits within the address.  For unaligned access,
         * we check that we don't cross pages using the address of the
         * last byte of the access.
         */
        if (a_mask >= s_mask) {
            addr_adj = addr_reg;
            addr_adj = TCG_REG_TMP2;
            tcg_out_insn(s, 3401, ADDI, addr_type,
                         addr_adj, addr_reg, s_mask - a_mask);
        compare_mask = (uint64_t)s->page_mask | a_mask;

        /* Store the page mask part of the address into TMP2. */
        tcg_out_logicali(s, I3404_ANDI, addr_type, TCG_REG_TMP2,
                         addr_adj, compare_mask);

        /* Perform the address comparison. */
        tcg_out_cmp(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP2, 0);

        /* If not equal, we jump to the slow path. */
        ldst->label_ptr[0] = s->code_ptr;
        tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0);

        /* Fast path: host address = TLB addend + guest address. */
        h->base = TCG_REG_TMP1;
        h->index = addr_reg;
        h->index_ext = addr_type;

        /* User-mode: only an alignment check may force the slow path. */
        ldst = new_ldst_label(s);
        ldst->is_ld = is_ld;
        ldst->addrlo_reg = addr_reg;

        /* tst addr, #mask */
        tcg_out_logicali(s, I3404_ANDSI, 0, TCG_REG_XZR, addr_reg, a_mask);

        /* b.ne slow_path */
        ldst->label_ptr[0] = s->code_ptr;
        tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0);

        if (guest_base || addr_type == TCG_TYPE_I32) {
            h->base = TCG_REG_GUEST_BASE;
            h->index = addr_reg;
            h->index_ext = addr_type;
            /* No guest_base: address the guest memory directly. */
            h->index = TCG_REG_XZR;
            h->index_ext = TCG_TYPE_I64;
/*
 * Fast-path guest load: pick the register-offset load matching the
 * access size and signedness; 'ext' selects 32- vs 64-bit destination
 * for the sign-extending forms.
 */
static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp memop, TCGType ext,
                                   TCGReg data_r, HostAddress h)
    switch (memop & MO_SSIZE) {
        tcg_out_ldst_r(s, I3312_LDRB, data_r, h.base, h.index_ext, h.index);
        tcg_out_ldst_r(s, ext ? I3312_LDRSBX : I3312_LDRSBW,
                       data_r, h.base, h.index_ext, h.index);
        tcg_out_ldst_r(s, I3312_LDRH, data_r, h.base, h.index_ext, h.index);
        tcg_out_ldst_r(s, (ext ? I3312_LDRSHX : I3312_LDRSHW),
                       data_r, h.base, h.index_ext, h.index);
        tcg_out_ldst_r(s, I3312_LDRW, data_r, h.base, h.index_ext, h.index);
        tcg_out_ldst_r(s, I3312_LDRSWX, data_r, h.base, h.index_ext, h.index);
        tcg_out_ldst_r(s, I3312_LDRX, data_r, h.base, h.index_ext, h.index);
        g_assert_not_reached();
/* Fast-path guest store: size-selected register-offset store. */
static void tcg_out_qemu_st_direct(TCGContext *s, MemOp memop,
                                   TCGReg data_r, HostAddress h)
    switch (memop & MO_SIZE) {
        tcg_out_ldst_r(s, I3312_STRB, data_r, h.base, h.index_ext, h.index);
        tcg_out_ldst_r(s, I3312_STRH, data_r, h.base, h.index_ext, h.index);
        tcg_out_ldst_r(s, I3312_STRW, data_r, h.base, h.index_ext, h.index);
        tcg_out_ldst_r(s, I3312_STRX, data_r, h.base, h.index_ext, h.index);
        g_assert_not_reached();
/*
 * Guest load entry point: build host address (TLB/alignment checks),
 * emit the fast-path load, and record slow-path fixup data if needed.
 */
static void tcg_out_qemu_ld(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
                            MemOpIdx oi, TCGType data_type)
    TCGLabelQemuLdst *ldst;

    ldst = prepare_host_addr(s, &h, addr_reg, oi, true);
    tcg_out_qemu_ld_direct(s, get_memop(oi), data_type, data_reg, h);

        ldst->type = data_type;
        ldst->datalo_reg = data_reg;
        /* Resume point after the slow-path helper returns. */
        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
/* Guest store entry point; mirrors tcg_out_qemu_ld. */
static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
                            MemOpIdx oi, TCGType data_type)
    TCGLabelQemuLdst *ldst;

    ldst = prepare_host_addr(s, &h, addr_reg, oi, false);
    tcg_out_qemu_st_direct(s, get_memop(oi), data_reg, h);

        ldst->type = data_type;
        ldst->datalo_reg = data_reg;
        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
/*
 * 128-bit guest load/store.  Uses LDP/STP when a pair is sufficiently
 * atomic (alignment proven, or LSE2), otherwise an LDXP/STXP
 * exclusive-pair loop; when alignment is unknown both sequences are
 * emitted with a runtime alignment test selecting between them.
 */
static void tcg_out_qemu_ldst_i128(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   TCGReg addr_reg, MemOpIdx oi, bool is_ld)
    TCGLabelQemuLdst *ldst;

    ldst = prepare_host_addr(s, &h, addr_reg, oi, is_ld);

    /* Compose the final address, as LDP/STP have no indexing. */
    if (h.index == TCG_REG_XZR) {
        base = TCG_REG_TMP2;
        if (h.index_ext == TCG_TYPE_I32) {
            /* add base, base, index, uxtw */
            tcg_out_insn(s, 3501, ADD, TCG_TYPE_I64, base,
                         h.base, h.index, MO_32, 0);
            /* add base, base, index */
            tcg_out_insn(s, 3502, ADD, 1, base, h.base, h.index);

    use_pair = h.aa.atom < MO_128 || have_lse2;

        tcg_insn_unit *branch = NULL;
        TCGReg ll, lh, sl, sh;

        /*
         * If we have already checked for 16-byte alignment, that's all
         * we need.  Otherwise we have determined that misaligned atomicity
         * may be handled with two 8-byte loads.
         */
        if (h.aa.align < MO_128) {
            /*
             * TODO: align should be MO_64, so we only need test bit 3,
             * which means we could use TBNZ instead of ANDS+B_C.
             */
            tcg_out_logicali(s, I3404_ANDSI, 0, TCG_REG_XZR, addr_reg, 15);
            branch = s->code_ptr;
            tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0);

        /*
         * 16-byte atomicity without LSE2 requires LDXP+STXP loop:
         *    ldxp lo, hi, [base]
         *    stxp t0, lo, hi, [base]
         * Require no overlap between data{lo,hi} and base.
         */
        if (datalo == base || datahi == base) {
            tcg_out_mov(s, TCG_TYPE_REG, TCG_REG_TMP2, base);
            base = TCG_REG_TMP2;

        /*
         * 16-byte atomicity without LSE2 requires LDXP+STXP loop:
         * 1: ldxp t0, t1, [base]
         *    stxp t0, lo, hi, [base]
         */
        tcg_debug_assert(base != TCG_REG_TMP0 && base != TCG_REG_TMP1);
        /* Retry (CBNZ back 2 insns) until the exclusive store succeeds. */
        tcg_out_insn(s, 3306, LDXP, TCG_REG_XZR, ll, lh, base);
        tcg_out_insn(s, 3306, STXP, TCG_REG_TMP0, sl, sh, base);
        tcg_out_insn(s, 3201, CBNZ, 0, TCG_REG_TMP0, -2);

            /* "b .+8", branching across the one insn of use_pair. */
            tcg_out_insn(s, 3206, B, 2);
            reloc_pc19(branch, tcg_splitwx_to_rx(s->code_ptr));

            tcg_out_insn(s, 3314, LDP, datalo, datahi, base, 0, 1, 0);
            tcg_out_insn(s, 3314, STP, datalo, datahi, base, 0, 1, 0);

        ldst->type = TCG_TYPE_I128;
        ldst->datalo_reg = datalo;
        ldst->datahi_reg = datahi;
        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
/* Address of the epilogue's return sequence, set by the prologue. */
static const tcg_insn_unit *tb_ret_addr;
/*
 * Exit the translation block, returning a0 in X0.  A return value of
 * zero reuses the goto_ptr epilogue (which already zeroes X0); the
 * branch is direct if in 26-bit range, else via BR through TMP0/X16,
 * which keeps the landing pad a BTI-J target.
 */
static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
    const tcg_insn_unit *target;

    /* Reuse the zeroing that exists for goto_ptr. */
        target = tcg_code_gen_epilogue;
        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X0, a0);
        target = tb_ret_addr;

    offset = tcg_pcrel_diff(s, target) >> 2;
    if (offset == sextract64(offset, 0, 26)) {
        tcg_out_insn(s, 3206, B, offset);
        /*
         * Only x16/x17 generate BTI type Jump (2),
         * other registers generate BTI type Jump|Call (3).
         */
        QEMU_BUILD_BUG_ON(TCG_REG_TMP0 != TCG_REG_X16);
        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, (intptr_t)target);
        tcg_out_insn(s, 3207, BR, TCG_REG_TMP0);
/*
 * Emit the chained-TB jump slot: a single insn that tb_target_set_jmp_target
 * later patches to either a direct B or a literal LDR into TMP0, followed
 * by BR TMP0.  The fallthrough (reset) point needs a BTI landing pad.
 */
static void tcg_out_goto_tb(TCGContext *s, int which)
    /*
     * Direct branch, or indirect address load, will be patched
     * by tb_target_set_jmp_target.  Assert indirect load offset
     * in range early, regardless of direct branch distance.
     */
    intptr_t i_off = tcg_pcrel_diff(s, (void *)get_jmp_target_addr(s, which));
    tcg_debug_assert(i_off == sextract64(i_off, 0, 21));

    set_jmp_insn_offset(s, which);
    tcg_out32(s, I3206_B);
    tcg_out_insn(s, 3207, BR, TCG_REG_TMP0);
    set_jmp_reset_offset(s, which);
    tcg_out_bti(s, BTI_J);
/*
 * Patch the goto_tb jump slot of TB n to branch to its new target:
 * a direct B when within +/-128MB, otherwise a pc-relative literal
 * LDR of the target address into TMP0 (the following BR TMP0 was
 * emitted statically).  The store is atomic and caches are flushed.
 */
void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
                              uintptr_t jmp_rx, uintptr_t jmp_rw)
    uintptr_t d_addr = tb->jmp_target_addr[n];
    ptrdiff_t d_offset = d_addr - jmp_rx;

    /* Either directly branch, or indirect branch load. */
    if (d_offset == sextract64(d_offset, 0, 28)) {
        insn = deposit32(I3206_B, 0, 26, d_offset >> 2);
        uintptr_t i_addr = (uintptr_t)&tb->jmp_target_addr[n];
        ptrdiff_t i_offset = i_addr - jmp_rx;

        /* Note that we asserted this in range in tcg_out_goto_tb. */
        insn = deposit32(I3305_LDR | TCG_REG_TMP0, 5, 19, i_offset >> 2);
    qatomic_set((uint32_t *)jmp_rw, insn);
    flush_idcache_range(jmp_rx, jmp_rw, 4);
/*
 * Main opcode dispatcher: translate one TCG op into host instructions.
 * 'ext' selects the 32- vs 64-bit form of most instructions and is
 * derived from whether the opcode is a 64-bit op.  Constant operands
 * (const_args[i] set) choose immediate-form encodings where possible.
 * NOTE(review): the switch(opc) header and per-case breaks are on
 * elided lines in this view.
 */
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
                       const TCGArg args[TCG_MAX_OP_ARGS],
                       const int const_args[TCG_MAX_OP_ARGS])
    /* 99% of the time, we can signal the use of extension registers
       by looking to see if the opcode handles 64-bit data. */
    TCGType ext = (tcg_op_defs[opc].flags & TCG_OPF_64BIT) != 0;

    /* Hoist the loads of the most common arguments. */
    TCGArg a0 = args[0];
    TCGArg a1 = args[1];
    TCGArg a2 = args[2];
    int c2 = const_args[2];

    /* Some operands are defined with "rZ" constraint, a register or
       the zero register.  These need not actually test args[I] == 0. */
#define REG0(I)  (const_args[I] ? TCG_REG_XZR : (TCGReg)args[I])

    case INDEX_op_goto_ptr:
        tcg_out_insn(s, 3207, BR, a0);

        tcg_out_goto_label(s, arg_label(a0));

    /* Host loads: select width/signedness-matched encoding. */
    case INDEX_op_ld8u_i32:
    case INDEX_op_ld8u_i64:
        tcg_out_ldst(s, I3312_LDRB, a0, a1, a2, 0);
    case INDEX_op_ld8s_i32:
        tcg_out_ldst(s, I3312_LDRSBW, a0, a1, a2, 0);
    case INDEX_op_ld8s_i64:
        tcg_out_ldst(s, I3312_LDRSBX, a0, a1, a2, 0);
    case INDEX_op_ld16u_i32:
    case INDEX_op_ld16u_i64:
        tcg_out_ldst(s, I3312_LDRH, a0, a1, a2, 1);
    case INDEX_op_ld16s_i32:
        tcg_out_ldst(s, I3312_LDRSHW, a0, a1, a2, 1);
    case INDEX_op_ld16s_i64:
        tcg_out_ldst(s, I3312_LDRSHX, a0, a1, a2, 1);
    case INDEX_op_ld_i32:
    case INDEX_op_ld32u_i64:
        tcg_out_ldst(s, I3312_LDRW, a0, a1, a2, 2);
    case INDEX_op_ld32s_i64:
        tcg_out_ldst(s, I3312_LDRSWX, a0, a1, a2, 2);
    case INDEX_op_ld_i64:
        tcg_out_ldst(s, I3312_LDRX, a0, a1, a2, 3);

    /* Host stores: REG0 allows storing constant zero via XZR. */
    case INDEX_op_st8_i32:
    case INDEX_op_st8_i64:
        tcg_out_ldst(s, I3312_STRB, REG0(0), a1, a2, 0);
    case INDEX_op_st16_i32:
    case INDEX_op_st16_i64:
        tcg_out_ldst(s, I3312_STRH, REG0(0), a1, a2, 1);
    case INDEX_op_st_i32:
    case INDEX_op_st32_i64:
        tcg_out_ldst(s, I3312_STRW, REG0(0), a1, a2, 2);
    case INDEX_op_st_i64:
        tcg_out_ldst(s, I3312_STRX, REG0(0), a1, a2, 3);

    /* Arithmetic: immediate form when c2, register form otherwise. */
    case INDEX_op_add_i32:
    case INDEX_op_add_i64:
            tcg_out_addsubi(s, ext, a0, a1, a2);
            tcg_out_insn(s, 3502, ADD, ext, a0, a1, a2);

    case INDEX_op_sub_i32:
    case INDEX_op_sub_i64:
            tcg_out_addsubi(s, ext, a0, a1, -a2);
            tcg_out_insn(s, 3502, SUB, ext, a0, a1, a2);

    case INDEX_op_neg_i64:
    case INDEX_op_neg_i32:
        tcg_out_insn(s, 3502, SUB, ext, a0, TCG_REG_XZR, a1);

    /* Logical ops: bitmask-immediate form when c2. */
    case INDEX_op_and_i32:
    case INDEX_op_and_i64:
            tcg_out_logicali(s, I3404_ANDI, ext, a0, a1, a2);
            tcg_out_insn(s, 3510, AND, ext, a0, a1, a2);

    case INDEX_op_andc_i32:
    case INDEX_op_andc_i64:
            tcg_out_logicali(s, I3404_ANDI, ext, a0, a1, ~a2);
            tcg_out_insn(s, 3510, BIC, ext, a0, a1, a2);

    case INDEX_op_or_i32:
    case INDEX_op_or_i64:
            tcg_out_logicali(s, I3404_ORRI, ext, a0, a1, a2);
            tcg_out_insn(s, 3510, ORR, ext, a0, a1, a2);

    case INDEX_op_orc_i32:
    case INDEX_op_orc_i64:
            tcg_out_logicali(s, I3404_ORRI, ext, a0, a1, ~a2);
            tcg_out_insn(s, 3510, ORN, ext, a0, a1, a2);

    case INDEX_op_xor_i32:
    case INDEX_op_xor_i64:
            tcg_out_logicali(s, I3404_EORI, ext, a0, a1, a2);
            tcg_out_insn(s, 3510, EOR, ext, a0, a1, a2);

    case INDEX_op_eqv_i32:
    case INDEX_op_eqv_i64:
            tcg_out_logicali(s, I3404_EORI, ext, a0, a1, ~a2);
            tcg_out_insn(s, 3510, EON, ext, a0, a1, a2);

    case INDEX_op_not_i64:
    case INDEX_op_not_i32:
        tcg_out_insn(s, 3510, ORN, ext, a0, TCG_REG_XZR, a1);

    case INDEX_op_mul_i64:
    case INDEX_op_mul_i32:
        tcg_out_insn(s, 3509, MADD, ext, a0, a1, a2, TCG_REG_XZR);

    case INDEX_op_div_i64:
    case INDEX_op_div_i32:
        tcg_out_insn(s, 3508, SDIV, ext, a0, a1, a2);
    case INDEX_op_divu_i64:
    case INDEX_op_divu_i32:
        tcg_out_insn(s, 3508, UDIV, ext, a0, a1, a2);

    /* Remainder = a1 - (a1 / a2) * a2, via DIV then MSUB. */
    case INDEX_op_rem_i64:
    case INDEX_op_rem_i32:
        tcg_out_insn(s, 3508, SDIV, ext, TCG_REG_TMP0, a1, a2);
        tcg_out_insn(s, 3509, MSUB, ext, a0, TCG_REG_TMP0, a2, a1);
    case INDEX_op_remu_i64:
    case INDEX_op_remu_i32:
        tcg_out_insn(s, 3508, UDIV, ext, TCG_REG_TMP0, a1, a2);
        tcg_out_insn(s, 3509, MSUB, ext, a0, TCG_REG_TMP0, a2, a1);

    /* Shifts/rotates: immediate helper when c2, variable insn otherwise. */
    case INDEX_op_shl_i64:
    case INDEX_op_shl_i32:
            tcg_out_shl(s, ext, a0, a1, a2);
            tcg_out_insn(s, 3508, LSLV, ext, a0, a1, a2);

    case INDEX_op_shr_i64:
    case INDEX_op_shr_i32:
            tcg_out_shr(s, ext, a0, a1, a2);
            tcg_out_insn(s, 3508, LSRV, ext, a0, a1, a2);

    case INDEX_op_sar_i64:
    case INDEX_op_sar_i32:
            tcg_out_sar(s, ext, a0, a1, a2);
            tcg_out_insn(s, 3508, ASRV, ext, a0, a1, a2);

    case INDEX_op_rotr_i64:
    case INDEX_op_rotr_i32:
            tcg_out_rotr(s, ext, a0, a1, a2);
            tcg_out_insn(s, 3508, RORV, ext, a0, a1, a2);

    case INDEX_op_rotl_i64:
    case INDEX_op_rotl_i32:
            tcg_out_rotl(s, ext, a0, a1, a2);
            /* No ROL insn: rotate right by negated count. */
            tcg_out_insn(s, 3502, SUB, 0, TCG_REG_TMP0, TCG_REG_XZR, a2);
            tcg_out_insn(s, 3508, RORV, ext, a0, a1, TCG_REG_TMP0);

    case INDEX_op_clz_i64:
    case INDEX_op_clz_i32:
        tcg_out_cltz(s, ext, a0, a1, a2, c2, false);
    case INDEX_op_ctz_i64:
    case INDEX_op_ctz_i32:
        tcg_out_cltz(s, ext, a0, a1, a2, c2, true);

    case INDEX_op_brcond_i32:
    case INDEX_op_brcond_i64:
        tcg_out_brcond(s, ext, a2, a0, a1, const_args[1], arg_label(args[3]));

    case INDEX_op_setcond_i32:
    case INDEX_op_setcond_i64:
        tcg_out_cmp(s, ext, a1, a2, c2);
        /* Use CSET alias of CSINC Wd, WZR, WZR, invert(cond).  */
        tcg_out_insn(s, 3506, CSINC, TCG_TYPE_I32, a0, TCG_REG_XZR,
                     TCG_REG_XZR, tcg_invert_cond(args[3]));

    case INDEX_op_negsetcond_i32:
    case INDEX_op_negsetcond_i64:
        tcg_out_cmp(s, ext, a1, a2, c2);
        /* Use CSETM alias of CSINV Wd, WZR, WZR, invert(cond).  */
        tcg_out_insn(s, 3506, CSINV, ext, a0, TCG_REG_XZR,
                     TCG_REG_XZR, tcg_invert_cond(args[3]));

    case INDEX_op_movcond_i32:
    case INDEX_op_movcond_i64:
        tcg_out_cmp(s, ext, a1, a2, c2);
        tcg_out_insn(s, 3506, CSEL, ext, a0, REG0(3), REG0(4), args[5]);

    /* Guest memory ops. */
    case INDEX_op_qemu_ld_a32_i32:
    case INDEX_op_qemu_ld_a64_i32:
    case INDEX_op_qemu_ld_a32_i64:
    case INDEX_op_qemu_ld_a64_i64:
        tcg_out_qemu_ld(s, a0, a1, a2, ext);
    case INDEX_op_qemu_st_a32_i32:
    case INDEX_op_qemu_st_a64_i32:
    case INDEX_op_qemu_st_a32_i64:
    case INDEX_op_qemu_st_a64_i64:
        tcg_out_qemu_st(s, REG0(0), a1, a2, ext);
    case INDEX_op_qemu_ld_a32_i128:
    case INDEX_op_qemu_ld_a64_i128:
        tcg_out_qemu_ldst_i128(s, a0, a1, a2, args[3], true);
    case INDEX_op_qemu_st_a32_i128:
    case INDEX_op_qemu_st_a64_i128:
        tcg_out_qemu_ldst_i128(s, REG0(0), REG0(1), a2, args[3], false);

    /* Byte swaps; a2 flags request sign/zero extension of the result. */
    case INDEX_op_bswap64_i64:
        tcg_out_rev(s, TCG_TYPE_I64, MO_64, a0, a1);
    case INDEX_op_bswap32_i64:
        tcg_out_rev(s, TCG_TYPE_I32, MO_32, a0, a1);
        if (a2 & TCG_BSWAP_OS) {
            tcg_out_ext32s(s, a0, a0);
    case INDEX_op_bswap32_i32:
        tcg_out_rev(s, TCG_TYPE_I32, MO_32, a0, a1);
    case INDEX_op_bswap16_i64:
    case INDEX_op_bswap16_i32:
        tcg_out_rev(s, TCG_TYPE_I32, MO_16, a0, a1);
        if (a2 & TCG_BSWAP_OS) {
            /* Output must be sign-extended. */
            tcg_out_ext16s(s, ext, a0, a0);
        } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
            /* Output must be zero-extended, but input isn't. */
            tcg_out_ext16u(s, a0, a0);

    case INDEX_op_deposit_i64:
    case INDEX_op_deposit_i32:
        tcg_out_dep(s, ext, a0, REG0(2), args[3], args[4]);

    case INDEX_op_extract_i64:
    case INDEX_op_extract_i32:
        tcg_out_ubfm(s, ext, a0, a1, a2, a2 + args[3] - 1);

    case INDEX_op_sextract_i64:
    case INDEX_op_sextract_i32:
        tcg_out_sbfm(s, ext, a0, a1, a2, a2 + args[3] - 1);

    case INDEX_op_extract2_i64:
    case INDEX_op_extract2_i32:
        tcg_out_extr(s, ext, a0, REG0(2), REG0(1), args[3]);

    case INDEX_op_add2_i32:
        tcg_out_addsub2(s, TCG_TYPE_I32, a0, a1, REG0(2), REG0(3),
                        (int32_t)args[4], args[5], const_args[4],
                        const_args[5], false);
    case INDEX_op_add2_i64:
        tcg_out_addsub2(s, TCG_TYPE_I64, a0, a1, REG0(2), REG0(3), args[4],
                        args[5], const_args[4], const_args[5], false);
    case INDEX_op_sub2_i32:
        tcg_out_addsub2(s, TCG_TYPE_I32, a0, a1, REG0(2), REG0(3),
                        (int32_t)args[4], args[5], const_args[4],
                        const_args[5], true);
    case INDEX_op_sub2_i64:
        tcg_out_addsub2(s, TCG_TYPE_I64, a0, a1, REG0(2), REG0(3), args[4],
                        args[5], const_args[4], const_args[5], true);

    case INDEX_op_muluh_i64:
        tcg_out_insn(s, 3508, UMULH, TCG_TYPE_I64, a0, a1, a2);
    case INDEX_op_mulsh_i64:
        tcg_out_insn(s, 3508, SMULH, TCG_TYPE_I64, a0, a1, a2);

    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
    case INDEX_op_mov_i64:
    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
    case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
    case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
    case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
    case INDEX_op_ext8s_i64:
    case INDEX_op_ext8u_i32:
    case INDEX_op_ext8u_i64:
    case INDEX_op_ext16s_i64:
    case INDEX_op_ext16s_i32:
    case INDEX_op_ext16u_i64:
    case INDEX_op_ext16u_i32:
    case INDEX_op_ext32s_i64:
    case INDEX_op_ext32u_i64:
    case INDEX_op_ext_i32_i64:
    case INDEX_op_extu_i32_i64:
    case INDEX_op_extrl_i64_i32:
        g_assert_not_reached();
2422 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2423 unsigned vecl, unsigned vece,
2424 const TCGArg args[TCG_MAX_OP_ARGS],
2425 const int const_args[TCG_MAX_OP_ARGS])
2427 static const AArch64Insn cmp_vec_insn[16] = {
2428 [TCG_COND_EQ] = I3616_CMEQ,
2429 [TCG_COND_GT] = I3616_CMGT,
2430 [TCG_COND_GE] = I3616_CMGE,
2431 [TCG_COND_GTU] = I3616_CMHI,
2432 [TCG_COND_GEU] = I3616_CMHS,
2434 static const AArch64Insn cmp_scalar_insn[16] = {
2435 [TCG_COND_EQ] = I3611_CMEQ,
2436 [TCG_COND_GT] = I3611_CMGT,
2437 [TCG_COND_GE] = I3611_CMGE,
2438 [TCG_COND_GTU] = I3611_CMHI,
2439 [TCG_COND_GEU] = I3611_CMHS,
2441 static const AArch64Insn cmp0_vec_insn[16] = {
2442 [TCG_COND_EQ] = I3617_CMEQ0,
2443 [TCG_COND_GT] = I3617_CMGT0,
2444 [TCG_COND_GE] = I3617_CMGE0,
2445 [TCG_COND_LT] = I3617_CMLT0,
2446 [TCG_COND_LE] = I3617_CMLE0,
2448 static const AArch64Insn cmp0_scalar_insn[16] = {
2449 [TCG_COND_EQ] = I3612_CMEQ0,
2450 [TCG_COND_GT] = I3612_CMGT0,
2451 [TCG_COND_GE] = I3612_CMGE0,
2452 [TCG_COND_LT] = I3612_CMLT0,
2453 [TCG_COND_LE] = I3612_CMLE0,
2456 TCGType type = vecl + TCG_TYPE_V64;
2457 unsigned is_q = vecl;
2458 bool is_scalar = !is_q && vece == MO_64;
2459 TCGArg a0, a1, a2, a3;
2467 case INDEX_op_ld_vec:
2468 tcg_out_ld(s, type, a0, a1, a2);
2470 case INDEX_op_st_vec:
2471 tcg_out_st(s, type, a0, a1, a2);
2473 case INDEX_op_dupm_vec:
2474 tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
2476 case INDEX_op_add_vec:
2478 tcg_out_insn(s, 3611, ADD, vece, a0, a1, a2);
2480 tcg_out_insn(s, 3616, ADD, is_q, vece, a0, a1, a2);
2483 case INDEX_op_sub_vec:
2485 tcg_out_insn(s, 3611, SUB, vece, a0, a1, a2);
2487 tcg_out_insn(s, 3616, SUB, is_q, vece, a0, a1, a2);
2490 case INDEX_op_mul_vec:
2491 tcg_out_insn(s, 3616, MUL, is_q, vece, a0, a1, a2);
2493 case INDEX_op_neg_vec:
2495 tcg_out_insn(s, 3612, NEG, vece, a0, a1);
2497 tcg_out_insn(s, 3617, NEG, is_q, vece, a0, a1);
2500 case INDEX_op_abs_vec:
2502 tcg_out_insn(s, 3612, ABS, vece, a0, a1);
2504 tcg_out_insn(s, 3617, ABS, is_q, vece, a0, a1);
2507 case INDEX_op_and_vec:
2508 if (const_args[2]) {
2509 is_shimm1632(~a2, &cmode, &imm8);
2511 tcg_out_insn(s, 3606, BIC, is_q, a0, 0, cmode, imm8);
2514 tcg_out_insn(s, 3606, MVNI, is_q, a0, 0, cmode, imm8);
2517 tcg_out_insn(s, 3616, AND, is_q, 0, a0, a1, a2);
2519 case INDEX_op_or_vec:
2520 if (const_args[2]) {
2521 is_shimm1632(a2, &cmode, &imm8);
2523 tcg_out_insn(s, 3606, ORR, is_q, a0, 0, cmode, imm8);
2526 tcg_out_insn(s, 3606, MOVI, is_q, a0, 0, cmode, imm8);
2529 tcg_out_insn(s, 3616, ORR, is_q, 0, a0, a1, a2);
2531 case INDEX_op_andc_vec:
2532 if (const_args[2]) {
2533 is_shimm1632(a2, &cmode, &imm8);
2535 tcg_out_insn(s, 3606, BIC, is_q, a0, 0, cmode, imm8);
2538 tcg_out_insn(s, 3606, MOVI, is_q, a0, 0, cmode, imm8);
2541 tcg_out_insn(s, 3616, BIC, is_q, 0, a0, a1, a2);
2543 case INDEX_op_orc_vec:
2544 if (const_args[2]) {
2545 is_shimm1632(~a2, &cmode, &imm8);
2547 tcg_out_insn(s, 3606, ORR, is_q, a0, 0, cmode, imm8);
2550 tcg_out_insn(s, 3606, MVNI, is_q, a0, 0, cmode, imm8);
2553 tcg_out_insn(s, 3616, ORN, is_q, 0, a0, a1, a2);
2555 case INDEX_op_xor_vec:
2556 tcg_out_insn(s, 3616, EOR, is_q, 0, a0, a1, a2);
2558 case INDEX_op_ssadd_vec:
2560 tcg_out_insn(s, 3611, SQADD, vece, a0, a1, a2);
2562 tcg_out_insn(s, 3616, SQADD, is_q, vece, a0, a1, a2);
2565 case INDEX_op_sssub_vec:
2567 tcg_out_insn(s, 3611, SQSUB, vece, a0, a1, a2);
2569 tcg_out_insn(s, 3616, SQSUB, is_q, vece, a0, a1, a2);
2572 case INDEX_op_usadd_vec:
2574 tcg_out_insn(s, 3611, UQADD, vece, a0, a1, a2);
2576 tcg_out_insn(s, 3616, UQADD, is_q, vece, a0, a1, a2);
2579 case INDEX_op_ussub_vec:
2581 tcg_out_insn(s, 3611, UQSUB, vece, a0, a1, a2);
2583 tcg_out_insn(s, 3616, UQSUB, is_q, vece, a0, a1, a2);
2586 case INDEX_op_smax_vec:
2587 tcg_out_insn(s, 3616, SMAX, is_q, vece, a0, a1, a2);
2589 case INDEX_op_smin_vec:
2590 tcg_out_insn(s, 3616, SMIN, is_q, vece, a0, a1, a2);
2592 case INDEX_op_umax_vec:
2593 tcg_out_insn(s, 3616, UMAX, is_q, vece, a0, a1, a2);
2595 case INDEX_op_umin_vec:
2596 tcg_out_insn(s, 3616, UMIN, is_q, vece, a0, a1, a2);
2598 case INDEX_op_not_vec:
2599 tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a1);
2601 case INDEX_op_shli_vec:
2603 tcg_out_insn(s, 3609, SHL, a0, a1, a2 + (8 << vece));
2605 tcg_out_insn(s, 3614, SHL, is_q, a0, a1, a2 + (8 << vece));
2608 case INDEX_op_shri_vec:
2610 tcg_out_insn(s, 3609, USHR, a0, a1, (16 << vece) - a2);
2612 tcg_out_insn(s, 3614, USHR, is_q, a0, a1, (16 << vece) - a2);
2615 case INDEX_op_sari_vec:
2617 tcg_out_insn(s, 3609, SSHR, a0, a1, (16 << vece) - a2);
2619 tcg_out_insn(s, 3614, SSHR, is_q, a0, a1, (16 << vece) - a2);
2622 case INDEX_op_aa64_sli_vec:
2624 tcg_out_insn(s, 3609, SLI, a0, a2, args[3] + (8 << vece));
2626 tcg_out_insn(s, 3614, SLI, is_q, a0, a2, args[3] + (8 << vece));
2629 case INDEX_op_shlv_vec:
2631 tcg_out_insn(s, 3611, USHL, vece, a0, a1, a2);
2633 tcg_out_insn(s, 3616, USHL, is_q, vece, a0, a1, a2);
2636 case INDEX_op_aa64_sshl_vec:
2638 tcg_out_insn(s, 3611, SSHL, vece, a0, a1, a2);
2640 tcg_out_insn(s, 3616, SSHL, is_q, vece, a0, a1, a2);
2643 case INDEX_op_cmp_vec:
2645 TCGCond cond = args[3];
2648 if (cond == TCG_COND_NE) {
2649 if (const_args[2]) {
2651 tcg_out_insn(s, 3611, CMTST, vece, a0, a1, a1);
2653 tcg_out_insn(s, 3616, CMTST, is_q, vece, a0, a1, a1);
2657 tcg_out_insn(s, 3611, CMEQ, vece, a0, a1, a2);
2659 tcg_out_insn(s, 3616, CMEQ, is_q, vece, a0, a1, a2);
2661 tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a0);
2664 if (const_args[2]) {
2666 insn = cmp0_scalar_insn[cond];
2668 tcg_out_insn_3612(s, insn, vece, a0, a1);
2672 insn = cmp0_vec_insn[cond];
2674 tcg_out_insn_3617(s, insn, is_q, vece, a0, a1);
2678 tcg_out_dupi_vec(s, type, MO_8, TCG_VEC_TMP0, 0);
2682 insn = cmp_scalar_insn[cond];
2685 t = a1, a1 = a2, a2 = t;
2686 cond = tcg_swap_cond(cond);
2687 insn = cmp_scalar_insn[cond];
2688 tcg_debug_assert(insn != 0);
2690 tcg_out_insn_3611(s, insn, vece, a0, a1, a2);
2692 insn = cmp_vec_insn[cond];
2695 t = a1, a1 = a2, a2 = t;
2696 cond = tcg_swap_cond(cond);
2697 insn = cmp_vec_insn[cond];
2698 tcg_debug_assert(insn != 0);
2700 tcg_out_insn_3616(s, insn, is_q, vece, a0, a1, a2);
2706 case INDEX_op_bitsel_vec:
2709 tcg_out_insn(s, 3616, BIT, is_q, 0, a0, a2, a1);
2710 } else if (a0 == a2) {
2711 tcg_out_insn(s, 3616, BIF, is_q, 0, a0, a3, a1);
2714 tcg_out_mov(s, type, a0, a1);
2716 tcg_out_insn(s, 3616, BSL, is_q, 0, a0, a2, a3);
2720 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */
2721 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */
2723 g_assert_not_reached();
2727 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
2730 case INDEX_op_add_vec:
2731 case INDEX_op_sub_vec:
2732 case INDEX_op_and_vec:
2733 case INDEX_op_or_vec:
2734 case INDEX_op_xor_vec:
2735 case INDEX_op_andc_vec:
2736 case INDEX_op_orc_vec:
2737 case INDEX_op_neg_vec:
2738 case INDEX_op_abs_vec:
2739 case INDEX_op_not_vec:
2740 case INDEX_op_cmp_vec:
2741 case INDEX_op_shli_vec:
2742 case INDEX_op_shri_vec:
2743 case INDEX_op_sari_vec:
2744 case INDEX_op_ssadd_vec:
2745 case INDEX_op_sssub_vec:
2746 case INDEX_op_usadd_vec:
2747 case INDEX_op_ussub_vec:
2748 case INDEX_op_shlv_vec:
2749 case INDEX_op_bitsel_vec:
2751 case INDEX_op_rotli_vec:
2752 case INDEX_op_shrv_vec:
2753 case INDEX_op_sarv_vec:
2754 case INDEX_op_rotlv_vec:
2755 case INDEX_op_rotrv_vec:
2757 case INDEX_op_mul_vec:
2758 case INDEX_op_smax_vec:
2759 case INDEX_op_smin_vec:
2760 case INDEX_op_umax_vec:
2761 case INDEX_op_umin_vec:
2762 return vece < MO_64;
2769 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
2773 TCGv_vec v0, v1, v2, t1, t2, c1;
2777 v0 = temp_tcgv_vec(arg_temp(a0));
2778 v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
2779 a2 = va_arg(va, TCGArg);
2783 case INDEX_op_rotli_vec:
2784 t1 = tcg_temp_new_vec(type);
2785 tcg_gen_shri_vec(vece, t1, v1, -a2 & ((8 << vece) - 1));
2786 vec_gen_4(INDEX_op_aa64_sli_vec, type, vece,
2787 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(v1), a2);
2788 tcg_temp_free_vec(t1);
2791 case INDEX_op_shrv_vec:
2792 case INDEX_op_sarv_vec:
2793 /* Right shifts are negative left shifts for AArch64. */
2794 v2 = temp_tcgv_vec(arg_temp(a2));
2795 t1 = tcg_temp_new_vec(type);
2796 tcg_gen_neg_vec(vece, t1, v2);
2797 opc = (opc == INDEX_op_shrv_vec
2798 ? INDEX_op_shlv_vec : INDEX_op_aa64_sshl_vec);
2799 vec_gen_3(opc, type, vece, tcgv_vec_arg(v0),
2800 tcgv_vec_arg(v1), tcgv_vec_arg(t1));
2801 tcg_temp_free_vec(t1);
2804 case INDEX_op_rotlv_vec:
2805 v2 = temp_tcgv_vec(arg_temp(a2));
2806 t1 = tcg_temp_new_vec(type);
2807 c1 = tcg_constant_vec(type, vece, 8 << vece);
2808 tcg_gen_sub_vec(vece, t1, v2, c1);
2809 /* Right shifts are negative left shifts for AArch64. */
2810 vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t1),
2811 tcgv_vec_arg(v1), tcgv_vec_arg(t1));
2812 vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(v0),
2813 tcgv_vec_arg(v1), tcgv_vec_arg(v2));
2814 tcg_gen_or_vec(vece, v0, v0, t1);
2815 tcg_temp_free_vec(t1);
2818 case INDEX_op_rotrv_vec:
2819 v2 = temp_tcgv_vec(arg_temp(a2));
2820 t1 = tcg_temp_new_vec(type);
2821 t2 = tcg_temp_new_vec(type);
2822 c1 = tcg_constant_vec(type, vece, 8 << vece);
2823 tcg_gen_neg_vec(vece, t1, v2);
2824 tcg_gen_sub_vec(vece, t2, c1, v2);
2825 /* Right shifts are negative left shifts for AArch64. */
2826 vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t1),
2827 tcgv_vec_arg(v1), tcgv_vec_arg(t1));
2828 vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t2),
2829 tcgv_vec_arg(v1), tcgv_vec_arg(t2));
2830 tcg_gen_or_vec(vece, v0, t1, t2);
2831 tcg_temp_free_vec(t1);
2832 tcg_temp_free_vec(t2);
2836 g_assert_not_reached();
2840 static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
2843 case INDEX_op_goto_ptr:
2846 case INDEX_op_ld8u_i32:
2847 case INDEX_op_ld8s_i32:
2848 case INDEX_op_ld16u_i32:
2849 case INDEX_op_ld16s_i32:
2850 case INDEX_op_ld_i32:
2851 case INDEX_op_ld8u_i64:
2852 case INDEX_op_ld8s_i64:
2853 case INDEX_op_ld16u_i64:
2854 case INDEX_op_ld16s_i64:
2855 case INDEX_op_ld32u_i64:
2856 case INDEX_op_ld32s_i64:
2857 case INDEX_op_ld_i64:
2858 case INDEX_op_neg_i32:
2859 case INDEX_op_neg_i64:
2860 case INDEX_op_not_i32:
2861 case INDEX_op_not_i64:
2862 case INDEX_op_bswap16_i32:
2863 case INDEX_op_bswap32_i32:
2864 case INDEX_op_bswap16_i64:
2865 case INDEX_op_bswap32_i64:
2866 case INDEX_op_bswap64_i64:
2867 case INDEX_op_ext8s_i32:
2868 case INDEX_op_ext16s_i32:
2869 case INDEX_op_ext8u_i32:
2870 case INDEX_op_ext16u_i32:
2871 case INDEX_op_ext8s_i64:
2872 case INDEX_op_ext16s_i64:
2873 case INDEX_op_ext32s_i64:
2874 case INDEX_op_ext8u_i64:
2875 case INDEX_op_ext16u_i64:
2876 case INDEX_op_ext32u_i64:
2877 case INDEX_op_ext_i32_i64:
2878 case INDEX_op_extu_i32_i64:
2879 case INDEX_op_extract_i32:
2880 case INDEX_op_extract_i64:
2881 case INDEX_op_sextract_i32:
2882 case INDEX_op_sextract_i64:
2883 return C_O1_I1(r, r);
2885 case INDEX_op_st8_i32:
2886 case INDEX_op_st16_i32:
2887 case INDEX_op_st_i32:
2888 case INDEX_op_st8_i64:
2889 case INDEX_op_st16_i64:
2890 case INDEX_op_st32_i64:
2891 case INDEX_op_st_i64:
2892 return C_O0_I2(rZ, r);
2894 case INDEX_op_add_i32:
2895 case INDEX_op_add_i64:
2896 case INDEX_op_sub_i32:
2897 case INDEX_op_sub_i64:
2898 case INDEX_op_setcond_i32:
2899 case INDEX_op_setcond_i64:
2900 case INDEX_op_negsetcond_i32:
2901 case INDEX_op_negsetcond_i64:
2902 return C_O1_I2(r, r, rA);
2904 case INDEX_op_mul_i32:
2905 case INDEX_op_mul_i64:
2906 case INDEX_op_div_i32:
2907 case INDEX_op_div_i64:
2908 case INDEX_op_divu_i32:
2909 case INDEX_op_divu_i64:
2910 case INDEX_op_rem_i32:
2911 case INDEX_op_rem_i64:
2912 case INDEX_op_remu_i32:
2913 case INDEX_op_remu_i64:
2914 case INDEX_op_muluh_i64:
2915 case INDEX_op_mulsh_i64:
2916 return C_O1_I2(r, r, r);
2918 case INDEX_op_and_i32:
2919 case INDEX_op_and_i64:
2920 case INDEX_op_or_i32:
2921 case INDEX_op_or_i64:
2922 case INDEX_op_xor_i32:
2923 case INDEX_op_xor_i64:
2924 case INDEX_op_andc_i32:
2925 case INDEX_op_andc_i64:
2926 case INDEX_op_orc_i32:
2927 case INDEX_op_orc_i64:
2928 case INDEX_op_eqv_i32:
2929 case INDEX_op_eqv_i64:
2930 return C_O1_I2(r, r, rL);
2932 case INDEX_op_shl_i32:
2933 case INDEX_op_shr_i32:
2934 case INDEX_op_sar_i32:
2935 case INDEX_op_rotl_i32:
2936 case INDEX_op_rotr_i32:
2937 case INDEX_op_shl_i64:
2938 case INDEX_op_shr_i64:
2939 case INDEX_op_sar_i64:
2940 case INDEX_op_rotl_i64:
2941 case INDEX_op_rotr_i64:
2942 return C_O1_I2(r, r, ri);
2944 case INDEX_op_clz_i32:
2945 case INDEX_op_ctz_i32:
2946 case INDEX_op_clz_i64:
2947 case INDEX_op_ctz_i64:
2948 return C_O1_I2(r, r, rAL);
2950 case INDEX_op_brcond_i32:
2951 case INDEX_op_brcond_i64:
2952 return C_O0_I2(r, rA);
2954 case INDEX_op_movcond_i32:
2955 case INDEX_op_movcond_i64:
2956 return C_O1_I4(r, r, rA, rZ, rZ);
2958 case INDEX_op_qemu_ld_a32_i32:
2959 case INDEX_op_qemu_ld_a64_i32:
2960 case INDEX_op_qemu_ld_a32_i64:
2961 case INDEX_op_qemu_ld_a64_i64:
2962 return C_O1_I1(r, r);
2963 case INDEX_op_qemu_ld_a32_i128:
2964 case INDEX_op_qemu_ld_a64_i128:
2965 return C_O2_I1(r, r, r);
2966 case INDEX_op_qemu_st_a32_i32:
2967 case INDEX_op_qemu_st_a64_i32:
2968 case INDEX_op_qemu_st_a32_i64:
2969 case INDEX_op_qemu_st_a64_i64:
2970 return C_O0_I2(rZ, r);
2971 case INDEX_op_qemu_st_a32_i128:
2972 case INDEX_op_qemu_st_a64_i128:
2973 return C_O0_I3(rZ, rZ, r);
2975 case INDEX_op_deposit_i32:
2976 case INDEX_op_deposit_i64:
2977 return C_O1_I2(r, 0, rZ);
2979 case INDEX_op_extract2_i32:
2980 case INDEX_op_extract2_i64:
2981 return C_O1_I2(r, rZ, rZ);
2983 case INDEX_op_add2_i32:
2984 case INDEX_op_add2_i64:
2985 case INDEX_op_sub2_i32:
2986 case INDEX_op_sub2_i64:
2987 return C_O2_I4(r, r, rZ, rZ, rA, rMZ);
2989 case INDEX_op_add_vec:
2990 case INDEX_op_sub_vec:
2991 case INDEX_op_mul_vec:
2992 case INDEX_op_xor_vec:
2993 case INDEX_op_ssadd_vec:
2994 case INDEX_op_sssub_vec:
2995 case INDEX_op_usadd_vec:
2996 case INDEX_op_ussub_vec:
2997 case INDEX_op_smax_vec:
2998 case INDEX_op_smin_vec:
2999 case INDEX_op_umax_vec:
3000 case INDEX_op_umin_vec:
3001 case INDEX_op_shlv_vec:
3002 case INDEX_op_shrv_vec:
3003 case INDEX_op_sarv_vec:
3004 case INDEX_op_aa64_sshl_vec:
3005 return C_O1_I2(w, w, w);
3006 case INDEX_op_not_vec:
3007 case INDEX_op_neg_vec:
3008 case INDEX_op_abs_vec:
3009 case INDEX_op_shli_vec:
3010 case INDEX_op_shri_vec:
3011 case INDEX_op_sari_vec:
3012 return C_O1_I1(w, w);
3013 case INDEX_op_ld_vec:
3014 case INDEX_op_dupm_vec:
3015 return C_O1_I1(w, r);
3016 case INDEX_op_st_vec:
3017 return C_O0_I2(w, r);
3018 case INDEX_op_dup_vec:
3019 return C_O1_I1(w, wr);
3020 case INDEX_op_or_vec:
3021 case INDEX_op_andc_vec:
3022 return C_O1_I2(w, w, wO);
3023 case INDEX_op_and_vec:
3024 case INDEX_op_orc_vec:
3025 return C_O1_I2(w, w, wN);
3026 case INDEX_op_cmp_vec:
3027 return C_O1_I2(w, w, wZ);
3028 case INDEX_op_bitsel_vec:
3029 return C_O1_I3(w, w, w, w);
3030 case INDEX_op_aa64_sli_vec:
3031 return C_O1_I2(w, 0, w);
3034 g_assert_not_reached();
3038 static void tcg_target_init(TCGContext *s)
3040 tcg_target_available_regs[TCG_TYPE_I32] = 0xffffffffu;
3041 tcg_target_available_regs[TCG_TYPE_I64] = 0xffffffffu;
3042 tcg_target_available_regs[TCG_TYPE_V64] = 0xffffffff00000000ull;
3043 tcg_target_available_regs[TCG_TYPE_V128] = 0xffffffff00000000ull;
3045 tcg_target_call_clobber_regs = -1ull;
3046 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X19);
3047 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X20);
3048 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X21);
3049 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X22);
3050 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X23);
3051 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X24);
3052 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X25);
3053 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X26);
3054 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X27);
3055 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X28);
3056 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X29);
3057 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V8);
3058 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V9);
3059 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V10);
3060 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V11);
3061 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V12);
3062 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V13);
3063 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V14);
3064 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V15);
3066 s->reserved_regs = 0;
3067 tcg_regset_set_reg(s->reserved_regs, TCG_REG_SP);
3068 tcg_regset_set_reg(s->reserved_regs, TCG_REG_FP);
3069 tcg_regset_set_reg(s->reserved_regs, TCG_REG_X18); /* platform register */
3070 tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP0);
3071 tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP1);
3072 tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP2);
3073 tcg_regset_set_reg(s->reserved_regs, TCG_VEC_TMP0);
3076 /* Saving pairs: (X19, X20) .. (X27, X28), (X29(fp), X30(lr)). */
3077 #define PUSH_SIZE ((30 - 19 + 1) * 8)
3079 #define FRAME_SIZE \
3081 + TCG_STATIC_CALL_ARGS_SIZE \
3082 + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3083 + TCG_TARGET_STACK_ALIGN - 1) \
3084 & ~(TCG_TARGET_STACK_ALIGN - 1))
3086 /* We're expecting a 2 byte uleb128 encoded value. */
3087 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
3089 /* We're expecting to use a single ADDI insn. */
3090 QEMU_BUILD_BUG_ON(FRAME_SIZE - PUSH_SIZE > 0xfff);
3092 static void tcg_target_qemu_prologue(TCGContext *s)
3096 tcg_out_bti(s, BTI_C);
3098 /* Push (FP, LR) and allocate space for all saved registers. */
3099 tcg_out_insn(s, 3314, STP, TCG_REG_FP, TCG_REG_LR,
3100 TCG_REG_SP, -PUSH_SIZE, 1, 1);
3102 /* Set up frame pointer for canonical unwinding. */
3103 tcg_out_movr_sp(s, TCG_TYPE_I64, TCG_REG_FP, TCG_REG_SP);
3105 /* Store callee-preserved regs x19..x28. */
3106 for (r = TCG_REG_X19; r <= TCG_REG_X27; r += 2) {
3107 int ofs = (r - TCG_REG_X19 + 2) * 8;
3108 tcg_out_insn(s, 3314, STP, r, r + 1, TCG_REG_SP, ofs, 1, 0);
3111 /* Make stack space for TCG locals. */
3112 tcg_out_insn(s, 3401, SUBI, TCG_TYPE_I64, TCG_REG_SP, TCG_REG_SP,
3113 FRAME_SIZE - PUSH_SIZE);
3115 /* Inform TCG about how to find TCG locals with register, offset, size. */
3116 tcg_set_frame(s, TCG_REG_SP, TCG_STATIC_CALL_ARGS_SIZE,
3117 CPU_TEMP_BUF_NLONGS * sizeof(long));
3119 if (!tcg_use_softmmu) {
3121 * Note that XZR cannot be encoded in the address base register slot,
3122 * as that actually encodes SP. Depending on the guest, we may need
3123 * to zero-extend the guest address via the address index register slot,
3124 * therefore we need to load even a zero guest base into a register.
3126 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_GUEST_BASE, guest_base);
3127 tcg_regset_set_reg(s->reserved_regs, TCG_REG_GUEST_BASE);
3130 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3131 tcg_out_insn(s, 3207, BR, tcg_target_call_iarg_regs[1]);
3134 * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3135 * and fall through to the rest of the epilogue.
3137 tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
3138 tcg_out_bti(s, BTI_J);
3139 tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_X0, 0);
3142 tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
3143 tcg_out_bti(s, BTI_J);
3145 /* Remove TCG locals stack space. */
3146 tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_SP, TCG_REG_SP,
3147 FRAME_SIZE - PUSH_SIZE);
3149 /* Restore registers x19..x28. */
3150 for (r = TCG_REG_X19; r <= TCG_REG_X27; r += 2) {
3151 int ofs = (r - TCG_REG_X19 + 2) * 8;
3152 tcg_out_insn(s, 3314, LDP, r, r + 1, TCG_REG_SP, ofs, 1, 0);
3155 /* Pop (FP, LR), restore SP to previous frame. */
3156 tcg_out_insn(s, 3314, LDP, TCG_REG_FP, TCG_REG_LR,
3157 TCG_REG_SP, PUSH_SIZE, 0, 1);
3158 tcg_out_insn(s, 3207, RET, TCG_REG_LR);
3161 static void tcg_out_tb_start(TCGContext *s)
3163 tcg_out_bti(s, BTI_J);
3166 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3169 for (i = 0; i < count; ++i) {
3176 uint8_t fde_def_cfa[4];
3177 uint8_t fde_reg_ofs[24];
3180 #define ELF_HOST_MACHINE EM_AARCH64
3182 static const DebugFrame debug_frame = {
3183 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3186 .h.cie.code_align = 1,
3187 .h.cie.data_align = 0x78, /* sleb128 -8 */
3188 .h.cie.return_column = TCG_REG_LR,
3190 /* Total FDE size does not include the "len" member. */
3191 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3194 12, TCG_REG_SP, /* DW_CFA_def_cfa sp, ... */
3195 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
3199 0x80 + 28, 1, /* DW_CFA_offset, x28, -8 */
3200 0x80 + 27, 2, /* DW_CFA_offset, x27, -16 */
3201 0x80 + 26, 3, /* DW_CFA_offset, x26, -24 */
3202 0x80 + 25, 4, /* DW_CFA_offset, x25, -32 */
3203 0x80 + 24, 5, /* DW_CFA_offset, x24, -40 */
3204 0x80 + 23, 6, /* DW_CFA_offset, x23, -48 */
3205 0x80 + 22, 7, /* DW_CFA_offset, x22, -56 */
3206 0x80 + 21, 8, /* DW_CFA_offset, x21, -64 */
3207 0x80 + 20, 9, /* DW_CFA_offset, x20, -72 */
3208 0x80 + 19, 10, /* DW_CFA_offset, x1p, -80 */
3209 0x80 + 30, 11, /* DW_CFA_offset, lr, -88 */
3210 0x80 + 29, 12, /* DW_CFA_offset, fp, -96 */
3214 void tcg_register_jit(const void *buf, size_t buf_size)
3216 tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));