tcg/i386/tcg-target.c.inc

   1 /*
   2  * Tiny Code Generator for QEMU
   3  *
   4  * Copyright (c) 2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "../tcg-ldst.c.inc"
  26 #include "../tcg-pool.c.inc"
  27
  28 #ifdef CONFIG_DEBUG_TCG
  29 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
  30 #if TCG_TARGET_REG_BITS == 64
  31     "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
  32 #else
  33     "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
  34 #endif
  35     "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
  36     "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
  37 #if TCG_TARGET_REG_BITS == 64
  38     "%xmm8", "%xmm9", "%xmm10", "%xmm11",
  39     "%xmm12", "%xmm13", "%xmm14", "%xmm15",
  40 #endif
  41 };
  42 #endif
  43
/*
 * Host registers listed in the order in which they are offered for
 * allocation.  General-purpose registers come first, followed by the
 * vector registers.  The stack pointer is not listed in either variant.
 */
static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated
       when building for Win64; every other host may use the rest.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};

/* NOTE(review): by its name a temporary vector register; note that it is
   also present in the allocation order above -- confirm how it is
   reserved at the use sites.  */
#define TCG_TMP_VEC  TCG_REG_XMM5
  95
  96 static const int tcg_target_call_iarg_regs[] = {
  97 #if TCG_TARGET_REG_BITS == 64
  98 #if defined(_WIN64)
  99     TCG_REG_RCX,
 100     TCG_REG_RDX,
 101 #else
 102     TCG_REG_RDI,
 103     TCG_REG_RSI,
 104     TCG_REG_RDX,
 105     TCG_REG_RCX,
 106 #endif
 107     TCG_REG_R8,
 108     TCG_REG_R9,
 109 #else
 110     /* 32 bit mode uses stack based calling convention (GCC default). */
 111 #endif
 112 };
 113
 114 static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
 115 {
 116     switch (kind) {
 117     case TCG_CALL_RET_NORMAL:
 118         tcg_debug_assert(slot >= 0 && slot <= 1);
 119         return slot ? TCG_REG_EDX : TCG_REG_EAX;
 120 #ifdef _WIN64
 121     case TCG_CALL_RET_BY_VEC:
 122         tcg_debug_assert(slot == 0);
 123         return TCG_REG_XMM0;
 124 #endif
 125     default:
 126         g_assert_not_reached();
 127     }
 128 }
 129
/* Constants we accept.  These are the constraint bits tested by
   tcg_target_const_match() for non-I32 operands:
     S32 - the value is unchanged by truncation to int32_t;
     U32 - the value is unchanged by truncation to uint32_t;
     I32 - the bitwise complement of the value fits in int32_t;
     WSZ - the value equals the operand width (32 or 64).
   For TCG_TYPE_I32 operands any of S32/U32/I32 accepts every value.  */
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800

/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   i386. */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

/* Register sets as bit masks, one bit per register number: the general
   registers occupy the low bits and the vector registers start at
   bit 16.  On 32-bit hosts ALL_BYTEL_REGS is limited to registers 0-3;
   on 64-bit hosts it is every general register.  */
#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS      0x0000ffffu
# define ALL_VECTOR_REGS       0xffff0000u
# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS      0x000000ffu
# define ALL_VECTOR_REGS       0x00ff0000u
# define ALL_BYTEL_REGS        0x0000000fu
#endif
/* Mask of the two L registers when tcg_use_softmmu is set, else 0.  */
#define SOFTMMU_RESERVE_REGS \
    (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0)

/* For 64-bit, we always know that CMOV is available.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov      true
#else
# define have_cmov      (cpuinfo & CPUINFO_CMOV)
#endif
/* Optional instruction-set extensions, read from the cpuinfo flags.  */
#define have_bmi2       (cpuinfo & CPUINFO_BMI2)
#define have_lzcnt      (cpuinfo & CPUINFO_LZCNT)

/* NOTE(review): not assigned or read in this part of the file; by its
   name, the code address used to return from a translation block --
   confirm where it is set.  */
static const tcg_insn_unit *tb_ret_addr;
 169
 170 static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
 171                         intptr_t value, intptr_t addend)
 172 {
 173     value += addend;
 174     switch(type) {
 175     case R_386_PC32:
 176         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 177         if (value != (int32_t)value) {
 178             return false;
 179         }
 180         /* FALLTHRU */
 181     case R_386_32:
 182         tcg_patch32(code_ptr, value);
 183         break;
 184     case R_386_PC8:
 185         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 186         if (value != (int8_t)value) {
 187             return false;
 188         }
 189         tcg_patch8(code_ptr, value);
 190         break;
 191     default:
 192         g_assert_not_reached();
 193     }
 194     return true;
 195 }
 196
 197 /* test if a constant matches the constraint */
 198 static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
 199 {
 200     if (ct & TCG_CT_CONST) {
 201         return 1;
 202     }
 203     if (type == TCG_TYPE_I32) {
 204         if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
 205             return 1;
 206         }
 207     } else {
 208         if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
 209             return 1;
 210         }
 211         if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
 212             return 1;
 213         }
 214         if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
 215             return 1;
 216         }
 217     }
 218     if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
 219         return 1;
 220     }
 221     return 0;
 222 }
 223
 224 # define LOWREGMASK(x)  ((x) & 7)
 225
 226 #define P_EXT           0x100           /* 0x0f opcode prefix */
 227 #define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
 228 #define P_DATA16        0x400           /* 0x66 opcode prefix */
 229 #define P_VEXW          0x1000          /* Set VEX.W = 1 */
 230 #if TCG_TARGET_REG_BITS == 64
 231 # define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
 232 # define P_REXB_R       0x2000          /* REG field as byte register */
 233 # define P_REXB_RM      0x4000          /* R/M field as byte register */
 234 # define P_GS           0x8000          /* gs segment override */
 235 #else
 236 # define P_REXW         0
 237 # define P_REXB_R       0
 238 # define P_REXB_RM      0
 239 # define P_GS           0
 240 #endif
 241 #define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
 242 #define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
 243 #define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
 244 #define P_VEXL          0x80000         /* Set VEX.L = 1 */
 245 #define P_EVEX          0x100000        /* Requires EVEX encoding */
 246
/*
 * Opcode values.  The low 8 bits are the opcode byte that is emitted
 * last by the tcg_out_*opc() helpers; the P_* flags ORed in above them
 * select the prefixes, escape bytes and VEX/EVEX fields emitted before it.
 * Names starting with V are defined here only in their VEX/EVEX form.
 */
#define OPC_ARITH_EbIb  (0x80)
#define OPC_ARITH_EvIz  (0x81)
#define OPC_ARITH_EvIb  (0x83)
#define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP       (0xc8 | P_EXT)
#define OPC_CALL_Jz     (0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32     (0x48)
#define OPC_IMUL_GvEv   (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb (0x6b)
#define OPC_IMUL_GvEvIz (0x69)
#define OPC_INC_r32     (0x40)
#define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
#define OPC_JCC_short   (0x70)          /* ... plus condition code */
#define OPC_JMP_long    (0xe9)
#define OPC_JMP_short   (0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
#define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
#define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz   (0xc7)
#define OPC_MOVB_Ib     (0xb0)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL      (0xbe | P_EXT)
#define OPC_MOVSWL      (0xbf | P_EXT)
#define OPC_MOVSLQ      (0x63 | P_REXW)
#define OPC_MOVZBL      (0xb6 | P_EXT)
#define OPC_MOVZWL      (0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PEXTRD      (0x16 | P_EXT3A | P_DATA16)
#define OPC_PINSRD      (0x22 | P_EXT3A | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32     (0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32    (0x50)
#define OPC_PUSH_Iv     (0x68)
#define OPC_PUSH_Ib     (0x6a)
#define OPC_RET         (0xc3)
#define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1     (0xd1)
#define OPC_SHIFT_Ib    (0xc1)
#define OPC_SHIFT_cl    (0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTL       (0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)
#define OPC_XCHG_EvGv   (0x87)

/* Group opcodes: the operation is selected by an extension code placed
   in the ModRM reg field (see the EXT3_* and EXT5_* values).  */
#define OPC_GRP3_Eb     (0xf6)
#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
 444
/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH: shifted left by 3
   they select the operation in OPC_ARITH_GvEv (see OPC_ADD_GvEv).  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
#define EXT3_TESTi 0
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev     0
#define EXT5_DEC_Ev     1
#define EXT5_CALLN_Ev   2
#define EXT5_JMPN_Ev    4

/* Condition codes to be added to OPC_JCC_{long,short}.  */
/* NOTE(review): JCC_JMP is not an encodable condition code; presumably a
   marker for an unconditional jump -- confirm at the use sites.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0     /* overflow */
#define JCC_JNO 0x1     /* no overflow */
#define JCC_JB  0x2     /* below (unsigned <) */
#define JCC_JAE 0x3     /* above or equal (unsigned >=) */
#define JCC_JE  0x4     /* equal */
#define JCC_JNE 0x5     /* not equal */
#define JCC_JBE 0x6     /* below or equal (unsigned <=) */
#define JCC_JA  0x7     /* above (unsigned >) */
#define JCC_JS  0x8     /* sign */
#define JCC_JNS 0x9     /* no sign */
#define JCC_JP  0xa     /* parity */
#define JCC_JNP 0xb     /* no parity */
#define JCC_JL  0xc     /* less (signed <) */
#define JCC_JGE 0xd     /* greater or equal (signed >=) */
#define JCC_JLE 0xe     /* less or equal (signed <=) */
#define JCC_JG  0xf     /* greater (signed >) */
 496
 497 static const uint8_t tcg_cond_to_jcc[] = {
 498     [TCG_COND_EQ] = JCC_JE,
 499     [TCG_COND_NE] = JCC_JNE,
 500     [TCG_COND_LT] = JCC_JL,
 501     [TCG_COND_GE] = JCC_JGE,
 502     [TCG_COND_LE] = JCC_JLE,
 503     [TCG_COND_GT] = JCC_JG,
 504     [TCG_COND_LTU] = JCC_JB,
 505     [TCG_COND_GEU] = JCC_JAE,
 506     [TCG_COND_LEU] = JCC_JBE,
 507     [TCG_COND_GTU] = JCC_JA,
 508 };
 509
 510 #if TCG_TARGET_REG_BITS == 64
 511 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 512 {
 513     int rex;
 514
 515     if (opc & P_GS) {
 516         tcg_out8(s, 0x65);
 517     }
 518     if (opc & P_DATA16) {
 519         /* We should never be asking for both 16 and 64-bit operation.  */
 520         tcg_debug_assert((opc & P_REXW) == 0);
 521         tcg_out8(s, 0x66);
 522     }
 523     if (opc & P_SIMDF3) {
 524         tcg_out8(s, 0xf3);
 525     } else if (opc & P_SIMDF2) {
 526         tcg_out8(s, 0xf2);
 527     }
 528
 529     rex = 0;
 530     rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
 531     rex |= (r & 8) >> 1;                /* REX.R */
 532     rex |= (x & 8) >> 2;                /* REX.X */
 533     rex |= (rm & 8) >> 3;               /* REX.B */
 534
 535     /* P_REXB_{R,RM} indicates that the given register is the low byte.
 536        For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
 537        as otherwise the encoding indicates %[abcd]h.  Note that the values
 538        that are ORed in merely indicate that the REX byte must be present;
 539        those bits get discarded in output.  */
 540     rex |= opc & (r >= 4 ? P_REXB_R : 0);
 541     rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
 542
 543     if (rex) {
 544         tcg_out8(s, (uint8_t)(rex | 0x40));
 545     }
 546
 547     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 548         tcg_out8(s, 0x0f);
 549         if (opc & P_EXT38) {
 550             tcg_out8(s, 0x38);
 551         } else if (opc & P_EXT3A) {
 552             tcg_out8(s, 0x3a);
 553         }
 554     }
 555
 556     tcg_out8(s, opc);
 557 }
 558 #else
 559 static void tcg_out_opc(TCGContext *s, int opc)
 560 {
 561     if (opc & P_DATA16) {
 562         tcg_out8(s, 0x66);
 563     }
 564     if (opc & P_SIMDF3) {
 565         tcg_out8(s, 0xf3);
 566     } else if (opc & P_SIMDF2) {
 567         tcg_out8(s, 0xf2);
 568     }
 569     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 570         tcg_out8(s, 0x0f);
 571         if (opc & P_EXT38) {
 572             tcg_out8(s, 0x38);
 573         } else if (opc & P_EXT3A) {
 574             tcg_out8(s, 0x3a);
 575         }
 576     }
 577     tcg_out8(s, opc);
 578 }
 579 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
 580    the 32-bit compilation paths.  This method works with all versions of gcc,
 581    whereas relying on optimization may not be able to exclude them.  */
 582 #define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
 583 #endif
 584
 585 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
 586 {
 587     tcg_out_opc(s, opc, r, rm, 0);
 588     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 589 }
 590
 591 static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
 592                             int rm, int index)
 593 {
 594     int tmp;
 595
 596     if (opc & P_GS) {
 597         tcg_out8(s, 0x65);
 598     }
 599     /* Use the two byte form if possible, which cannot encode
 600        VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
 601     if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
 602         && ((rm | index) & 8) == 0) {
 603         /* Two byte VEX prefix.  */
 604         tcg_out8(s, 0xc5);
 605
 606         tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
 607     } else {
 608         /* Three byte VEX prefix.  */
 609         tcg_out8(s, 0xc4);
 610
 611         /* VEX.m-mmmm */
 612         if (opc & P_EXT3A) {
 613             tmp = 3;
 614         } else if (opc & P_EXT38) {
 615             tmp = 2;
 616         } else if (opc & P_EXT) {
 617             tmp = 1;
 618         } else {
 619             g_assert_not_reached();
 620         }
 621         tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
 622         tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
 623         tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
 624         tcg_out8(s, tmp);
 625
 626         tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
 627     }
 628
 629     tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
 630     /* VEX.pp */
 631     if (opc & P_DATA16) {
 632         tmp |= 1;                          /* 0x66 */
 633     } else if (opc & P_SIMDF3) {
 634         tmp |= 2;                          /* 0xf3 */
 635     } else if (opc & P_SIMDF2) {
 636         tmp |= 3;                          /* 0xf2 */
 637     }
 638     tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
 639     tcg_out8(s, tmp);
 640     tcg_out8(s, opc);
 641 }
 642
 643 static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
 644                              int rm, int index)
 645 {
 646     /* The entire 4-byte evex prefix; with R' and V' set. */
 647     uint32_t p = 0x08041062;
 648     int mm, pp;
 649
 650     tcg_debug_assert(have_avx512vl);
 651
 652     /* EVEX.mm */
 653     if (opc & P_EXT3A) {
 654         mm = 3;
 655     } else if (opc & P_EXT38) {
 656         mm = 2;
 657     } else if (opc & P_EXT) {
 658         mm = 1;
 659     } else {
 660         g_assert_not_reached();
 661     }
 662
 663     /* EVEX.pp */
 664     if (opc & P_DATA16) {
 665         pp = 1;                          /* 0x66 */
 666     } else if (opc & P_SIMDF3) {
 667         pp = 2;                          /* 0xf3 */
 668     } else if (opc & P_SIMDF2) {
 669         pp = 3;                          /* 0xf2 */
 670     } else {
 671         pp = 0;
 672     }
 673
 674     p = deposit32(p, 8, 2, mm);
 675     p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
 676     p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
 677     p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
 678     p = deposit32(p, 16, 2, pp);
 679     p = deposit32(p, 19, 4, ~v);
 680     p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
 681     p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
 682
 683     tcg_out32(s, p);
 684     tcg_out8(s, opc);
 685 }
 686
 687 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
 688 {
 689     if (opc & P_EVEX) {
 690         tcg_out_evex_opc(s, opc, r, v, rm, 0);
 691     } else {
 692         tcg_out_vex_opc(s, opc, r, v, rm, 0);
 693     }
 694     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 695 }
 696
 697 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
 698    We handle either RM and INDEX missing with a negative value.  In 64-bit
 699    mode for absolute addresses, ~RM is the size of the immediate operand
 700    that will follow the instruction.  */
 701
 702 static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
 703                                int shift, intptr_t offset)
 704 {
 705     int mod, len;
 706
 707     if (index < 0 && rm < 0) {
 708         if (TCG_TARGET_REG_BITS == 64) {
 709             /* Try for a rip-relative addressing mode.  This has replaced
 710                the 32-bit-mode absolute addressing encoding.  */
 711             intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
 712             intptr_t disp = offset - pc;
 713             if (disp == (int32_t)disp) {
 714                 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
 715                 tcg_out32(s, disp);
 716                 return;
 717             }
 718
 719             /* Try for an absolute address encoding.  This requires the
 720                use of the MODRM+SIB encoding and is therefore larger than
 721                rip-relative addressing.  */
 722             if (offset == (int32_t)offset) {
 723                 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
 724                 tcg_out8(s, (4 << 3) | 5);
 725                 tcg_out32(s, offset);
 726                 return;
 727             }
 728
 729             /* ??? The memory isn't directly addressable.  */
 730             g_assert_not_reached();
 731         } else {
 732             /* Absolute address.  */
 733             tcg_out8(s, (r << 3) | 5);
 734             tcg_out32(s, offset);
 735             return;
 736         }
 737     }
 738
 739     /* Find the length of the immediate addend.  Note that the encoding
 740        that would be used for (%ebp) indicates absolute addressing.  */
 741     if (rm < 0) {
 742         mod = 0, len = 4, rm = 5;
 743     } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
 744         mod = 0, len = 0;
 745     } else if (offset == (int8_t)offset) {
 746         mod = 0x40, len = 1;
 747     } else {
 748         mod = 0x80, len = 4;
 749     }
 750
 751     /* Use a single byte MODRM format if possible.  Note that the encoding
 752        that would be used for %esp is the escape to the two byte form.  */
 753     if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
 754         /* Single byte MODRM format.  */
 755         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 756     } else {
 757         /* Two byte MODRM+SIB format.  */
 758
 759         /* Note that the encoding that would place %esp into the index
 760            field indicates no index register.  In 64-bit mode, the REX.X
 761            bit counts, so %r12 can be used as the index.  */
 762         if (index < 0) {
 763             index = 4;
 764         } else {
 765             tcg_debug_assert(index != TCG_REG_ESP);
 766         }
 767
 768         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
 769         tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
 770     }
 771
 772     if (len == 1) {
 773         tcg_out8(s, offset);
 774     } else if (len == 4) {
 775         tcg_out32(s, offset);
 776     }
 777 }
 778
 779 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
 780                                      int index, int shift, intptr_t offset)
 781 {
 782     tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 783     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 784 }
 785
 786 static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
 787                                          int rm, int index, int shift,
 788                                          intptr_t offset)
 789 {
 790     tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 791     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 792 }
 793
 794 /* A simplification of the above with no index or shift.  */
 795 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
 796                                         int rm, intptr_t offset)
 797 {
 798     tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
 799 }
 800
 801 static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
 802                                             int v, int rm, intptr_t offset)
 803 {
 804     tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
 805 }
 806
 807 /* Output an opcode with an expected reference to the constant pool.  */
 808 static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
 809 {
 810     tcg_out_opc(s, opc, r, 0, 0);
 811     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 812     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 813     tcg_out32(s, 0);
 814 }
 815
 816 /* Output an opcode with an expected reference to the constant pool.  */
 817 static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
 818 {
 819     tcg_out_vex_opc(s, opc, r, 0, 0, 0);
 820     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 821     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 822     tcg_out32(s, 0);
 823 }
 824
 825 /* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
 826 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
 827 {
 828     /* Propagate an opcode prefix, such as P_REXW.  */
 829     int ext = subop & ~0x7;
 830     subop &= 0x7;
 831
 832     tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
 833 }
 834
 835 static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 836 {
 837     int rexw = 0;
 838
 839     if (arg == ret) {
 840         return true;
 841     }
 842     switch (type) {
 843     case TCG_TYPE_I64:
 844         rexw = P_REXW;
 845         /* fallthru */
 846     case TCG_TYPE_I32:
 847         if (ret < 16) {
 848             if (arg < 16) {
 849                 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
 850             } else {
 851                 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
 852             }
 853         } else {
 854             if (arg < 16) {
 855                 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
 856             } else {
 857                 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 858             }
 859         }
 860         break;
 861
 862     case TCG_TYPE_V64:
 863         tcg_debug_assert(ret >= 16 && arg >= 16);
 864         tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 865         break;
 866     case TCG_TYPE_V128:
 867         tcg_debug_assert(ret >= 16 && arg >= 16);
 868         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
 869         break;
 870     case TCG_TYPE_V256:
 871         tcg_debug_assert(ret >= 16 && arg >= 16);
 872         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
 873         break;
 874
 875     default:
 876         g_assert_not_reached();
 877     }
 878     return true;
 879 }
 880
 881 static const int avx2_dup_insn[4] = {
 882     OPC_VPBROADCASTB, OPC_VPBROADCASTW,
 883     OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
 884 };
 885
 886 static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
 887                             TCGReg r, TCGReg a)
 888 {
 889     if (have_avx2) {
 890         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 891         tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
 892     } else {
 893         switch (vece) {
 894         case MO_8:
 895             /* ??? With zero in a register, use PSHUFB.  */
 896             tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
 897             a = r;
 898             /* FALLTHRU */
 899         case MO_16:
 900             tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
 901             a = r;
 902             /* FALLTHRU */
 903         case MO_32:
 904             tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
 905             /* imm8 operand: all output lanes selected from input lane 0.  */
 906             tcg_out8(s, 0);
 907             break;
 908         case MO_64:
 909             tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
 910             break;
 911         default:
 912             g_assert_not_reached();
 913         }
 914     }
 915     return true;
 916 }
 917
 918 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
 919                              TCGReg r, TCGReg base, intptr_t offset)
 920 {
 921     if (have_avx2) {
 922         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 923         tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
 924                                  r, 0, base, offset);
 925     } else {
 926         switch (vece) {
 927         case MO_64:
 928             tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
 929             break;
 930         case MO_32:
 931             tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
 932             break;
 933         case MO_16:
 934             tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
 935             tcg_out8(s, 0); /* imm8 */
 936             tcg_out_dup_vec(s, type, vece, r, r);
 937             break;
 938         case MO_8:
 939             tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
 940             tcg_out8(s, 0); /* imm8 */
 941             tcg_out_dup_vec(s, type, vece, r, r);
 942             break;
 943         default:
 944             g_assert_not_reached();
 945         }
 946     }
 947     return true;
 948 }
 949
 950 static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
 951                              TCGReg ret, int64_t arg)
 952 {
 953     int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 954
 955     if (arg == 0) {
 956         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 957         return;
 958     }
 959     if (arg == -1) {
 960         tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
 961         return;
 962     }
 963
 964     if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
 965         if (have_avx2) {
 966             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
 967         } else {
 968             tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
 969         }
 970         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 971     } else {
 972         if (type == TCG_TYPE_V64) {
 973             tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
 974         } else if (have_avx2) {
 975             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
 976         } else {
 977             tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
 978         }
 979         if (TCG_TARGET_REG_BITS == 64) {
 980             new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
 981         } else {
 982             new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
 983         }
 984     }
 985 }
 986
 987 static void tcg_out_movi_vec(TCGContext *s, TCGType type,
 988                              TCGReg ret, tcg_target_long arg)
 989 {
 990     if (arg == 0) {
 991         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 992         return;
 993     }
 994     if (arg == -1) {
 995         tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
 996         return;
 997     }
 998
 999     int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
1000     tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1001     if (TCG_TARGET_REG_BITS == 64) {
1002         new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1003     } else {
1004         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1005     }
1006 }
1007
1008 static void tcg_out_movi_int(TCGContext *s, TCGType type,
1009                              TCGReg ret, tcg_target_long arg)
1010 {
1011     tcg_target_long diff;
1012
1013     if (arg == 0) {
1014         tgen_arithr(s, ARITH_XOR, ret, ret);
1015         return;
1016     }
1017     if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1018         tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1019         tcg_out32(s, arg);
1020         return;
1021     }
1022     if (arg == (int32_t)arg) {
1023         tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1024         tcg_out32(s, arg);
1025         return;
1026     }
1027
1028     /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1029     diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1030     if (diff == (int32_t)diff) {
1031         tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1032         tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1033         tcg_out32(s, diff);
1034         return;
1035     }
1036
1037     tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1038     tcg_out64(s, arg);
1039 }
1040
1041 static void tcg_out_movi(TCGContext *s, TCGType type,
1042                          TCGReg ret, tcg_target_long arg)
1043 {
1044     switch (type) {
1045     case TCG_TYPE_I32:
1046 #if TCG_TARGET_REG_BITS == 64
1047     case TCG_TYPE_I64:
1048 #endif
1049         if (ret < 16) {
1050             tcg_out_movi_int(s, type, ret, arg);
1051         } else {
1052             tcg_out_movi_vec(s, type, ret, arg);
1053         }
1054         break;
1055     default:
1056         g_assert_not_reached();
1057     }
1058 }
1059
1060 static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1061 {
1062     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1063     tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
1064     return true;
1065 }
1066
1067 static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1068                              tcg_target_long imm)
1069 {
1070     /* This function is only used for passing structs by reference. */
1071     tcg_debug_assert(imm == (int32_t)imm);
1072     tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
1073 }
1074
1075 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1076 {
1077     if (val == (int8_t)val) {
1078         tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1079         tcg_out8(s, val);
1080     } else if (val == (int32_t)val) {
1081         tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1082         tcg_out32(s, val);
1083     } else {
1084         g_assert_not_reached();
1085     }
1086 }
1087
1088 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1089 {
1090     /* Given the strength of x86 memory ordering, we only need care for
1091        store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1092        faster than "mfence", so don't bother with the sse insn.  */
1093     if (a0 & TCG_MO_ST_LD) {
1094         tcg_out8(s, 0xf0);
1095         tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1096         tcg_out8(s, 0);
1097     }
1098 }
1099
1100 static inline void tcg_out_push(TCGContext *s, int reg)
1101 {
1102     tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1103 }
1104
1105 static inline void tcg_out_pop(TCGContext *s, int reg)
1106 {
1107     tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1108 }
1109
1110 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1111                        TCGReg arg1, intptr_t arg2)
1112 {
1113     switch (type) {
1114     case TCG_TYPE_I32:
1115         if (ret < 16) {
1116             tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1117         } else {
1118             tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1119         }
1120         break;
1121     case TCG_TYPE_I64:
1122         if (ret < 16) {
1123             tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1124             break;
1125         }
1126         /* FALLTHRU */
1127     case TCG_TYPE_V64:
1128         /* There is no instruction that can validate 8-byte alignment.  */
1129         tcg_debug_assert(ret >= 16);
1130         tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1131         break;
1132     case TCG_TYPE_V128:
1133         /*
1134          * The gvec infrastructure is asserts that v128 vector loads
1135          * and stores use a 16-byte aligned offset.  Validate that the
1136          * final pointer is aligned by using an insn that will SIGSEGV.
1137          */
1138         tcg_debug_assert(ret >= 16);
1139         tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1140         break;
1141     case TCG_TYPE_V256:
1142         /*
1143          * The gvec infrastructure only requires 16-byte alignment,
1144          * so here we must use an unaligned load.
1145          */
1146         tcg_debug_assert(ret >= 16);
1147         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1148                                  ret, 0, arg1, arg2);
1149         break;
1150     default:
1151         g_assert_not_reached();
1152     }
1153 }
1154
1155 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1156                        TCGReg arg1, intptr_t arg2)
1157 {
1158     switch (type) {
1159     case TCG_TYPE_I32:
1160         if (arg < 16) {
1161             tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1162         } else {
1163             tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1164         }
1165         break;
1166     case TCG_TYPE_I64:
1167         if (arg < 16) {
1168             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1169             break;
1170         }
1171         /* FALLTHRU */
1172     case TCG_TYPE_V64:
1173         /* There is no instruction that can validate 8-byte alignment.  */
1174         tcg_debug_assert(arg >= 16);
1175         tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1176         break;
1177     case TCG_TYPE_V128:
1178         /*
1179          * The gvec infrastructure is asserts that v128 vector loads
1180          * and stores use a 16-byte aligned offset.  Validate that the
1181          * final pointer is aligned by using an insn that will SIGSEGV.
1182          *
1183          * This specific instance is also used by TCG_CALL_RET_BY_VEC,
1184          * for _WIN64, which must have SSE2 but may not have AVX.
1185          */
1186         tcg_debug_assert(arg >= 16);
1187         if (have_avx1) {
1188             tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1189         } else {
1190             tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
1191         }
1192         break;
1193     case TCG_TYPE_V256:
1194         /*
1195          * The gvec infrastructure only requires 16-byte alignment,
1196          * so here we must use an unaligned store.
1197          */
1198         tcg_debug_assert(arg >= 16);
1199         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1200                                  arg, 0, arg1, arg2);
1201         break;
1202     default:
1203         g_assert_not_reached();
1204     }
1205 }
1206
1207 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1208                         TCGReg base, intptr_t ofs)
1209 {
1210     int rexw = 0;
1211     if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1212         if (val != (int32_t)val) {
1213             return false;
1214         }
1215         rexw = P_REXW;
1216     } else if (type != TCG_TYPE_I32) {
1217         return false;
1218     }
1219     tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1220     tcg_out32(s, val);
1221     return true;
1222 }
1223
1224 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1225 {
1226     /* Propagate an opcode prefix, such as P_DATA16.  */
1227     int ext = subopc & ~0x7;
1228     subopc &= 0x7;
1229
1230     if (count == 1) {
1231         tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1232     } else {
1233         tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1234         tcg_out8(s, count);
1235     }
1236 }
1237
1238 static inline void tcg_out_bswap32(TCGContext *s, int reg)
1239 {
1240     tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1241 }
1242
1243 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1244 {
1245     tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1246 }
1247
1248 static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
1249 {
1250     /* movzbl */
1251     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1252     tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1253 }
1254
1255 static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1256 {
1257     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1258     /* movsbl */
1259     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1260     tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1261 }
1262
1263 static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
1264 {
1265     /* movzwl */
1266     tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1267 }
1268
1269 static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1270 {
1271     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1272     /* movsw[lq] */
1273     tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1274 }
1275
1276 static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
1277 {
1278     /* 32-bit mov zero extends.  */
1279     tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1280 }
1281
1282 static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
1283 {
1284     tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
1285     tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1286 }
1287
/*
 * Emit the i32 -> i64 sign extension of SRC into DEST by forwarding to
 * tcg_out_ext32s, which asserts a 64-bit host and emits MOVSLQ.
 */
static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32s(s, dest, src);
}
1292
1293 static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1294 {
1295     if (dest != src) {
1296         tcg_out_ext32u(s, dest, src);
1297     }
1298 }
1299
/*
 * Extract the low 32 bits of the 64-bit value in SRC into DEST by
 * forwarding to tcg_out_ext32u, i.e. a 32-bit register move.
 */
static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32u(s, dest, src);
}
1304
1305 static inline void tcg_out_bswap64(TCGContext *s, int reg)
1306 {
1307     tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1308 }
1309
/*
 * Emit "r0 op= val" for an immediate operand.
 *
 * c:   ARITH_* code in the low three bits.  On 64-bit hosts any higher
 *      bits (e.g. P_REXW) are split off into rexw and re-applied to the
 *      opcode; on 32-bit hosts c is used as given.
 * r0:  register that is both source and destination.
 * val: the immediate.  It must fit in a signed 32-bit field whenever the
 *      operation ends up being emitted with rexw set.
 * cf:  when nonzero, the ADD/SUB rewrites below (INC/DEC for +/-1 and the
 *      128 -> -128 swap) are skipped.
 *
 * Several shorter special-case encodings are tried first; otherwise the
 * 8-bit or 32-bit immediate form of the arithmetic opcode is used.
 */
static void tgen_arithi(TCGContext *s, int c, int r0,
                        tcg_target_long val, int cf)
{
    int rexw = 0;

    /* Separate the prefix flags from the ARITH_* code.  */
    if (TCG_TARGET_REG_BITS == 64) {
        rexw = c & -8;
        c &= 7;
    }

    switch (c) {
    case ARITH_ADD:
    case ARITH_SUB:
        if (!cf) {
            /*
             * ??? While INC is 2 bytes shorter than ADDL $1, they also induce
             * partial flags update stalls on Pentium4 and are not recommended
             * by current Intel optimization manuals.
             */
            if (val == 1 || val == -1) {
                /* ADD +1 and SUB -1 increment; ADD -1 and SUB +1 decrement. */
                int is_inc = (c == ARITH_ADD) ^ (val < 0);
                if (TCG_TARGET_REG_BITS == 64) {
                    /*
                     * The single-byte increment encodings are re-tasked
                     * as the REX prefixes.  Use the MODRM encoding.
                     */
                    tcg_out_modrm(s, OPC_GRP5 + rexw,
                                  (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
                } else {
                    tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
                }
                return;
            }
            if (val == 128) {
                /*
                 * Facilitate using an 8-bit immediate.  Carry is inverted
                 * by this transformation, so do it only if cf == 0.
                 */
                c ^= ARITH_ADD ^ ARITH_SUB;
                val = -128;
            }
        }
        break;

    case ARITH_AND:
        if (TCG_TARGET_REG_BITS == 64) {
            /* Masking to the low 32 bits is just a 32-bit self move.  */
            if (val == 0xffffffffu) {
                tcg_out_ext32u(s, r0, r0);
                return;
            }
            if (val == (uint32_t)val) {
                /* AND with no high bits set can use a 32-bit operation.  */
                rexw = 0;
            }
        }
        /* Byte and word masks become zero-extensions of r0 onto itself.  */
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_ext8u(s, r0, r0);
            return;
        }
        if (val == 0xffffu) {
            tcg_out_ext16u(s, r0, r0);
            return;
        }
        break;

    case ARITH_OR:
    case ARITH_XOR:
        /*
         * Values 0x80..0xff do not fit a sign-extended 8-bit immediate;
         * use the byte-register form of the operation instead, subject to
         * the same register restriction tcg_out_ext8u applies on 32-bit
         * hosts.
         */
        if (val >= 0x80 && val <= 0xff
            && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0);
            tcg_out8(s, val);
            return;
        }
        break;
    }

    /* Generic forms: 8-bit immediate if it fits, else 32-bit.  */
    if (val == (int8_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
        tcg_out8(s, val);
        return;
    }
    if (rexw == 0 || val == (int32_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
        tcg_out32(s, val);
        return;
    }

    /* A REX.W operation whose immediate does not fit in 32 bits.  */
    g_assert_not_reached();
}
1399
1400 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1401 {
1402     if (val != 0) {
1403         tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1404     }
1405 }
1406
1407 /* Set SMALL to force a short forward branch.  */
1408 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
1409 {
1410     int32_t val, val1;
1411
1412     if (l->has_value) {
1413         val = tcg_pcrel_diff(s, l->u.value_ptr);
1414         val1 = val - 2;
1415         if ((int8_t)val1 == val1) {
1416             if (opc == -1) {
1417                 tcg_out8(s, OPC_JMP_short);
1418             } else {
1419                 tcg_out8(s, OPC_JCC_short + opc);
1420             }
1421             tcg_out8(s, val1);
1422         } else {
1423             tcg_debug_assert(!small);
1424             if (opc == -1) {
1425                 tcg_out8(s, OPC_JMP_long);
1426                 tcg_out32(s, val - 5);
1427             } else {
1428                 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1429                 tcg_out32(s, val - 6);
1430             }
1431         }
1432     } else if (small) {
1433         if (opc == -1) {
1434             tcg_out8(s, OPC_JMP_short);
1435         } else {
1436             tcg_out8(s, OPC_JCC_short + opc);
1437         }
1438         tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1439         s->code_ptr += 1;
1440     } else {
1441         if (opc == -1) {
1442             tcg_out8(s, OPC_JMP_long);
1443         } else {
1444             tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1445         }
1446         tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1447         s->code_ptr += 4;
1448     }
1449 }
1450
1451 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1452                         int const_arg2, int rexw)
1453 {
1454     if (const_arg2) {
1455         if (arg2 == 0) {
1456             /* test r, r */
1457             tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1458         } else {
1459             tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1460         }
1461     } else {
1462         tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1463     }
1464 }
1465
1466 static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond,
1467                            TCGArg arg1, TCGArg arg2, int const_arg2,
1468                            TCGLabel *label, bool small)
1469 {
1470     tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1471     tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1472 }
1473
#if TCG_TARGET_REG_BITS == 32
/*
 * Branch on a comparison of two double-word values (32-bit hosts only).
 *
 * args[0]/args[1] and args[2]/args[3] are the two operand pairs, args[4]
 * is the condition and args[5] the target label; const_args[2] and
 * const_args[3] say whether the second operand's words are constants.
 *
 * For the ordered conditions the args[1]/args[3] pair is compared first
 * with the strict form of the condition.  If that pair differs without
 * satisfying it, the branch is known not to be taken; only when the pair
 * is equal does the unsigned comparison of args[0]/args[2] decide.
 */
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, bool small)
{
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);
    int cond_first, cond_second;

    switch (args[4]) {
    case TCG_COND_EQ:
        /* Any mismatch in the first pair skips the branch entirely.  */
        tcg_out_brcond(s, 0, TCG_COND_NE, args[0], args[2], const_args[2],
                       label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_EQ, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_label(s, label_next);
        return;

    case TCG_COND_NE:
        /* A mismatch in either pair takes the branch.  */
        tcg_out_brcond(s, 0, TCG_COND_NE, args[0], args[2], const_args[2],
                       label_this, small);
        tcg_out_brcond(s, 0, TCG_COND_NE, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_label(s, label_next);
        return;

    case TCG_COND_LT:
        cond_first = TCG_COND_LT;
        cond_second = TCG_COND_LTU;
        break;
    case TCG_COND_LE:
        cond_first = TCG_COND_LT;
        cond_second = TCG_COND_LEU;
        break;
    case TCG_COND_GT:
        cond_first = TCG_COND_GT;
        cond_second = TCG_COND_GTU;
        break;
    case TCG_COND_GE:
        cond_first = TCG_COND_GT;
        cond_second = TCG_COND_GEU;
        break;
    case TCG_COND_LTU:
        cond_first = TCG_COND_LTU;
        cond_second = TCG_COND_LTU;
        break;
    case TCG_COND_LEU:
        cond_first = TCG_COND_LTU;
        cond_second = TCG_COND_LEU;
        break;
    case TCG_COND_GTU:
        cond_first = TCG_COND_GTU;
        cond_second = TCG_COND_GTU;
        break;
    case TCG_COND_GEU:
        cond_first = TCG_COND_GTU;
        cond_second = TCG_COND_GEU;
        break;
    default:
        g_assert_not_reached();
    }

    tcg_out_brcond(s, 0, cond_first, args[1], args[3], const_args[3],
                   label_this, small);
    /* Flags still reflect the comparison above.  */
    tcg_out_jxx(s, JCC_JNE, label_next, 1);
    tcg_out_brcond(s, 0, cond_second, args[0], args[2], const_args[2],
                   label_this, small);
    tcg_out_label(s, label_next);
}
#endif
1556
1557 static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
1558                             TCGArg dest, TCGArg arg1, TCGArg arg2,
1559                             int const_arg2, bool neg)
1560 {
1561     bool inv = false;
1562     bool cleared;
1563
1564     switch (cond) {
1565     case TCG_COND_NE:
1566         inv = true;
1567         /* fall through */
1568     case TCG_COND_EQ:
1569         /* If arg2 is 0, convert to LTU/GEU vs 1. */
1570         if (const_arg2 && arg2 == 0) {
1571             arg2 = 1;
1572             goto do_ltu;
1573         }
1574         break;
1575
1576     case TCG_COND_LEU:
1577         inv = true;
1578         /* fall through */
1579     case TCG_COND_GTU:
1580         /* If arg2 is a register, swap for LTU/GEU. */
1581         if (!const_arg2) {
1582             TCGReg t = arg1;
1583             arg1 = arg2;
1584             arg2 = t;
1585             goto do_ltu;
1586         }
1587         break;
1588
1589     case TCG_COND_GEU:
1590         inv = true;
1591         /* fall through */
1592     case TCG_COND_LTU:
1593     do_ltu:
1594         /*
1595          * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
1596          * We can then use NEG or INC to produce the desired result.
1597          * This is always smaller than the SETCC expansion.
1598          */
1599         tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1600
1601         /* X - X - C = -C = (C ? -1 : 0) */
1602         tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest);
1603         if (inv && neg) {
1604             /* ~(C ? -1 : 0) = (C ? 0 : -1) */
1605             tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1606         } else if (inv) {
1607             /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
1608             tgen_arithi(s, ARITH_ADD, dest, 1, 0);
1609         } else if (!neg) {
1610             /* -(C ? -1 : 0) = (C ? 1 : 0) */
1611             tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest);
1612         }
1613         return;
1614
1615     case TCG_COND_GE:
1616         inv = true;
1617         /* fall through */
1618     case TCG_COND_LT:
1619         /* If arg2 is 0, extract the sign bit. */
1620         if (const_arg2 && arg2 == 0) {
1621             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1);
1622             if (inv) {
1623                 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1624             }
1625             tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw,
1626                            dest, rexw ? 63 : 31);
1627             return;
1628         }
1629         break;
1630
1631     default:
1632         break;
1633     }
1634
1635     /*
1636      * If dest does not overlap the inputs, clearing it first is preferred.
1637      * The XOR breaks any false dependency for the low-byte write to dest,
1638      * and is also one byte smaller than MOVZBL.
1639      */
1640     cleared = false;
1641     if (dest != arg1 && (const_arg2 || dest != arg2)) {
1642         tgen_arithr(s, ARITH_XOR, dest, dest);
1643         cleared = true;
1644     }
1645
1646     tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1647     tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1648
1649     if (!cleared) {
1650         tcg_out_ext8u(s, dest, dest);
1651     }
1652     if (neg) {
1653         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest);
1654     }
1655 }
1656
#if TCG_TARGET_REG_BITS == 32
/*
 * Set args[0] to 1 or 0 according to a double-word comparison (32-bit
 * hosts only).  args[1..5] are forwarded to tcg_out_brcond2 with the label
 * slot filled in here; const_args is shifted by one to match.
 */
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg new_args[6];
    TCGLabel *label_true, *label_over;
    int overlap;

    memcpy(new_args, &args[1], 5 * sizeof(new_args[0]));

    overlap = args[0] == args[1] || args[0] == args[2]
              || (!const_args[3] && args[0] == args[3])
              || (!const_args[4] && args[0] == args[4]);

    if (!overlap) {
        /* When the destination does not overlap one of the arguments,
           clear the destination first, jump if cond false, and emit an
           increment in the true case.  This results in smaller code.  */
        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);

        label_over = gen_new_label();
        new_args[4] = tcg_invert_cond(new_args[4]);
        new_args[5] = label_arg(label_over);
        tcg_out_brcond2(s, new_args, const_args + 1, 1);

        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
        tcg_out_label(s, label_over);
        return;
    }

    /* When the destination overlaps with one of the argument
       registers, don't do anything tricky.  */
    label_true = gen_new_label();
    label_over = gen_new_label();

    new_args[5] = label_arg(label_true);
    tcg_out_brcond2(s, new_args, const_args + 1, 1);

    /* Condition false: result 0, then skip the true arm.  */
    tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
    tcg_out_jxx(s, JCC_JMP, label_over, 1);
    tcg_out_label(s, label_true);

    /* Condition true: result 1.  */
    tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
    tcg_out_label(s, label_over);
}
#endif
1700
1701 static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1702                          TCGReg dest, TCGReg v1)
1703 {
1704     if (have_cmov) {
1705         tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1706     } else {
1707         TCGLabel *over = gen_new_label();
1708         tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1709         tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1710         tcg_out_label(s, over);
1711     }
1712 }
1713
1714 static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond,
1715                             TCGReg dest, TCGReg c1, TCGArg c2, int const_c2,
1716                             TCGReg v1)
1717 {
1718     tcg_out_cmp(s, c1, c2, const_c2, rexw);
1719     tcg_out_cmov(s, cond, rexw, dest, v1);
1720 }
1721
1722 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1723                         TCGArg arg2, bool const_a2)
1724 {
1725     if (have_bmi1) {
1726         tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1727         if (const_a2) {
1728             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1729         } else {
1730             tcg_debug_assert(dest != arg2);
1731             tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1732         }
1733     } else {
1734         tcg_debug_assert(dest != arg2);
1735         tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1736         tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1737     }
1738 }
1739
1740 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1741                         TCGArg arg2, bool const_a2)
1742 {
1743     if (have_lzcnt) {
1744         tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1745         if (const_a2) {
1746             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1747         } else {
1748             tcg_debug_assert(dest != arg2);
1749             tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1750         }
1751     } else {
1752         tcg_debug_assert(!const_a2);
1753         tcg_debug_assert(dest != arg1);
1754         tcg_debug_assert(dest != arg2);
1755
1756         /* Recall that the output of BSR is the index not the count.  */
1757         tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1758         tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1759
1760         /* Since we have destroyed the flags from BSR, we have to re-test.  */
1761         tcg_out_cmp(s, arg1, 0, 1, rexw);
1762         tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1763     }
1764 }
1765
1766 static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1767 {
1768     intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1769
1770     if (disp == (int32_t)disp) {
1771         tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1772         tcg_out32(s, disp);
1773     } else {
1774         /* rip-relative addressing into the constant pool.
1775            This is 6 + 8 = 14 bytes, as compared to using an
1776            immediate load 10 + 6 = 16 bytes, plus we may
1777            be able to re-use the pool constant for more calls.  */
1778         tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1779         tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1780         new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1781         tcg_out32(s, 0);
1782     }
1783 }
1784
1785 static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1786                          const TCGHelperInfo *info)
1787 {
1788     tcg_out_branch(s, 1, dest);
1789
1790 #ifndef _WIN32
1791     if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1792         /*
1793          * The sysv i386 abi for struct return places a reference as the
1794          * first argument of the stack, and pops that argument with the
1795          * return statement.  Since we want to retain the aligned stack
1796          * pointer for the callee, we do not want to actually push that
1797          * argument before the call but rely on the normal store to the
1798          * stack slot.  But we do need to compensate for the pop in order
1799          * to reset our correct stack pointer value.
1800          * Pushing a garbage value back onto the stack is quickest.
1801          */
1802         tcg_out_push(s, TCG_REG_EAX);
1803     }
1804 #endif
1805 }
1806
1807 static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1808 {
1809     tcg_out_branch(s, 0, dest);
1810 }
1811
1812 static void tcg_out_nopn(TCGContext *s, int n)
1813 {
1814     int i;
1815     /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1816      * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1817      * duplicate prefix, and all of the interesting recent cores can
1818      * decode and discard the duplicates in a single cycle.
1819      */
1820     tcg_debug_assert(n >= 1);
1821     for (i = 1; i < n; ++i) {
1822         tcg_out8(s, 0x66);
1823     }
1824     tcg_out8(s, 0x90);
1825 }
1826
1827 /* Test register R vs immediate bits I, setting Z flag for EQ/NE. */
1828 static void __attribute__((unused))
1829 tcg_out_testi(TCGContext *s, TCGReg r, uint32_t i)
1830 {
1831     /*
1832      * This is used for testing alignment, so we can usually use testb.
1833      * For i686, we have to use testl for %esi/%edi.
1834      */
1835     if (i <= 0xff && (TCG_TARGET_REG_BITS == 64 || r < 4)) {
1836         tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, r);
1837         tcg_out8(s, i);
1838     } else {
1839         tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, r);
1840         tcg_out32(s, i);
1841     }
1842 }
1843
/*
 * Operands of a host memory reference used by the qemu_ld/st fast path.
 * The fields are passed to the modrm/sib emitters as the base, index
 * and displacement operands, with @seg added into the opcode value.
 */
typedef struct {
    TCGReg base;        /* base register */
    int index;          /* index register; x86_guest_base uses -1,
                           presumably meaning "no index" -- TODO confirm */
    int ofs;            /* constant displacement */
    int seg;            /* added to the opcode; 0, or presumably a segment
                           prefix flag such as P_GS -- TODO confirm */
    TCGAtomAlign aa;    /* result of atom_and_align_for_opc() */
} HostAddress;
1851
1852 bool tcg_target_has_memory_bswap(MemOp memop)
1853 {
1854     TCGAtomAlign aa;
1855
1856     if (!have_movbe) {
1857         return false;
1858     }
1859     if ((memop & MO_SIZE) < MO_128) {
1860         return true;
1861     }
1862
1863     /*
1864      * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
1865      * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
1866      */
1867     aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
1868     return aa.atom < MO_128;
1869 }
1870
1871 /*
1872  * Because i686 has no register parameters and because x86_64 has xchg
1873  * to handle addr/data register overlap, we have placed all input arguments
 * before we might need a scratch reg.
1875  *
1876  * Even then, a scratch is only needed for l->raddr.  Rather than expose
1877  * a general-purpose scratch when we don't actually know it's available,
1878  * use the ra_gen hook to load into RAX if needed.
1879  */
1880 #if TCG_TARGET_REG_BITS == 64
1881 static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
1882 {
1883     if (arg < 0) {
1884         arg = TCG_REG_RAX;
1885     }
1886     tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
1887     return arg;
1888 }
1889 static const TCGLdstHelperParam ldst_helper_param = {
1890     .ra_gen = ldst_ra_gen
1891 };
1892 #else
1893 static const TCGLdstHelperParam ldst_helper_param = { };
1894 #endif
1895
1896 static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
1897                                 TCGReg l, TCGReg h, TCGReg v)
1898 {
1899     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1900
1901     /* vpmov{d,q} %v, %l */
1902     tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
1903     /* vpextr{d,q} $1, %v, %h */
1904     tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
1905     tcg_out8(s, 1);
1906 }
1907
1908 static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
1909                                 TCGReg v, TCGReg l, TCGReg h)
1910 {
1911     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1912
1913     /* vmov{d,q} %l, %v */
1914     tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
1915     /* vpinsr{d,q} $1, %h, %v, %v */
1916     tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
1917     tcg_out8(s, 1);
1918 }
1919
1920 /*
1921  * Generate code for the slow path for a load at the end of block
1922  */
1923 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1924 {
1925     MemOp opc = get_memop(l->oi);
1926     tcg_insn_unit **label_ptr = &l->label_ptr[0];
1927
1928     /* resolve label address */
1929     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1930     if (label_ptr[1]) {
1931         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1932     }
1933
1934     tcg_out_ld_helper_args(s, l, &ldst_helper_param);
1935     tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
1936     tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
1937
1938     tcg_out_jmp(s, l->raddr);
1939     return true;
1940 }
1941
1942 /*
1943  * Generate code for the slow path for a store at the end of block
1944  */
1945 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1946 {
1947     MemOp opc = get_memop(l->oi);
1948     tcg_insn_unit **label_ptr = &l->label_ptr[0];
1949
1950     /* resolve label address */
1951     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1952     if (label_ptr[1]) {
1953         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1954     }
1955
1956     tcg_out_st_helper_args(s, l, &ldst_helper_param);
1957     tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);
1958
1959     tcg_out_jmp(s, l->raddr);
1960     return true;
1961 }
1962
#ifdef CONFIG_USER_ONLY
/*
 * Template host address for user-only guest accesses; copied into the
 * HostAddress by prepare_host_addr() before the base is filled in.
 */
static HostAddress x86_guest_base = {
    .index = -1
};

#if defined(__x86_64__) && defined(__linux__)
# include <asm/prctl.h>
# include <sys/prctl.h>
int arch_prctl(int code, unsigned long addr);

/*
 * Try to install guest_base as the %gs segment base.
 * Returns P_GS on success and 0 on failure.
 */
static inline int setup_guest_base_seg(void)
{
    return arch_prctl(ARCH_SET_GS, guest_base) == 0 ? P_GS : 0;
}
#define setup_guest_base_seg  setup_guest_base_seg
#elif defined(__x86_64__) && \
      (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
# include <machine/sysarch.h>

/*
 * Try to install guest_base as the %gs segment base.
 * Returns P_GS on success and 0 on failure.
 */
static inline int setup_guest_base_seg(void)
{
    return sysarch(AMD64_SET_GSBASE, &guest_base) == 0 ? P_GS : 0;
}
#define setup_guest_base_seg  setup_guest_base_seg
#endif
#else
/* Not user-only: any surviving reference to x86_guest_base is a build error. */
# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; }))
#endif /* CONFIG_USER_ONLY */

/* Without a host-specific implementation, report that no segment is used. */
#ifndef setup_guest_base_seg
# define setup_guest_base_seg()  0
#endif
1998
/*
 * NOTE(review): presumably the smallest offset that tlb_mask_table_ofs()
 * is allowed to return for this backend, with INT_MIN meaning that no
 * lower bound is imposed -- TODO confirm against the shared TCG code.
 */
#define MIN_TLB_MASK_TABLE_OFS  INT_MIN
2000
/*
 * For softmmu, perform the TLB load and compare.
 * For useronly, perform any required alignment tests.
 * In both cases, return a TCGLabelQemuLdst structure if the slow path
 * is required and fill in @h with the host address for the fast path.
 *
 * @s:       code generation context
 * @h:       output; host address operands for the fast-path access
 * @addrlo:  register holding the (low part of the) guest address
 * @addrhi:  register holding the high part of the guest address; only
 *           compared on a 32-bit host with a 64-bit guest address type
 * @oi:      combined memory operation and mmu index
 * @is_ld:   true for a load, false for a store
 *
 * Returns NULL when no slow path was set up, which happens only for a
 * non-softmmu access that has no alignment requirement.
 */
static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
                                           TCGReg addrlo, TCGReg addrhi,
                                           MemOpIdx oi, bool is_ld)
{
    TCGLabelQemuLdst *ldst = NULL;
    MemOp opc = get_memop(oi);
    MemOp s_bits = opc & MO_SIZE;
    unsigned a_mask;

    if (tcg_use_softmmu) {
        /* TCG_REG_L0 receives the TLB entry's addend at the end below. */
        h->index = TCG_REG_L0;
        h->ofs = 0;
        h->seg = 0;
    } else {
        *h = x86_guest_base;
    }
    h->base = addrlo;
    h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
    /* Low address bits that must be clear for the required alignment. */
    a_mask = (1 << h->aa.align) - 1;

    if (tcg_use_softmmu) {
        int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
                            : offsetof(CPUTLBEntry, addr_write);
        /*
         * Three operation widths are tracked separately:
         *   ttype/trexw     - the guest address type (s->addr_type)
         *   hrexw           - the host pointer width
         *   tlbtype/tlbrexw - the width used to form the TLB index,
         *                     64-bit only if page_bits + tlb_dyn_max_bits
         *                     exceeds 32
         * All stay 32-bit on a 32-bit host.
         */
        TCGType ttype = TCG_TYPE_I32;
        TCGType tlbtype = TCG_TYPE_I32;
        int trexw = 0, hrexw = 0, tlbrexw = 0;
        unsigned mem_index = get_mmuidx(oi);
        unsigned s_mask = (1 << s_bits) - 1;
        int fast_ofs = tlb_mask_table_ofs(s, mem_index);
        int tlb_mask;

        ldst = new_ldst_label(s);
        ldst->is_ld = is_ld;
        ldst->oi = oi;
        ldst->addrlo_reg = addrlo;
        ldst->addrhi_reg = addrhi;

        if (TCG_TARGET_REG_BITS == 64) {
            ttype = s->addr_type;
            trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
            if (TCG_TYPE_PTR == TCG_TYPE_I64) {
                hrexw = P_REXW;
                if (s->page_bits + s->tlb_dyn_max_bits > 32) {
                    tlbtype = TCG_TYPE_I64;
                    tlbrexw = P_REXW;
                }
            }
        }

        /*
         * L0 = address of the TLB entry:
         *   ((addr >> (page_bits - CPU_TLB_ENTRY_BITS)) & mask) + table
         * with mask and table read from the CPUTLBDescFast at
         * fast_ofs(TCG_AREG0).
         */
        tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
        tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
                       s->page_bits - CPU_TLB_ENTRY_BITS);

        tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
                             fast_ofs + offsetof(CPUTLBDescFast, mask));

        tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
                             fast_ofs + offsetof(CPUTLBDescFast, table));

        /*
         * If the required alignment is at least as large as the access,
         * simply copy the address and mask.  For lesser alignments,
         * check that we don't cross pages for the complete access.
         */
        if (a_mask >= s_mask) {
            tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
        } else {
            tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
                                 addrlo, s_mask - a_mask);
        }
        /* Keep the page bits plus the alignment bits in the comparator. */
        tlb_mask = s->page_mask | a_mask;
        tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);

        /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
        tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
                             TCG_REG_L1, TCG_REG_L0, cmp_ofs);

        /*
         * jne slow_path; the 4-byte displacement is left unwritten here
         * and its location recorded so the slow path can patch it.
         */
        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
        ldst->label_ptr[0] = s->code_ptr;
        s->code_ptr += 4;

        if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) {
            /* cmp 4(TCG_REG_L0), addrhi */
            tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi,
                                 TCG_REG_L0, cmp_ofs + 4);

            /* jne slow_path; second displacement to be patched */
            tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
            ldst->label_ptr[1] = s->code_ptr;
            s->code_ptr += 4;
        }

        /* TLB Hit: replace the entry pointer in L0 with the entry's addend. */
        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
                   offsetof(CPUTLBEntry, addend));
    } else if (a_mask) {
        /* No TLB: the slow path is needed only for a misaligned address. */
        ldst = new_ldst_label(s);

        ldst->is_ld = is_ld;
        ldst->oi = oi;
        ldst->addrlo_reg = addrlo;
        ldst->addrhi_reg = addrhi;

        tcg_out_testi(s, addrlo, a_mask);
        /* jne slow_path; displacement to be patched */
        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
        ldst->label_ptr[0] = s->code_ptr;
        s->code_ptr += 4;
    }

    return ldst;
}
2120
/*
 * Emit the fast-path load for a guest memory access.
 *
 * @s:       code generation context
 * @datalo:  destination register (low half for a two-register result)
 * @datahi:  destination register for the high half; used for 64-bit
 *           data on a 32-bit host and for 128-bit data
 * @h:       host address operands
 * @type:    type of the destination; selects REX.W for the sign
 *           extending forms
 * @memop:   size, signedness and byte-swap of the access
 */
static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   HostAddress h, TCGType type, MemOp memop)
{
    bool use_movbe = false;
    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    int movop = OPC_MOVL_GvEv;

    /* Do big-endian loads with movbe.  */
    if (memop & MO_BSWAP) {
        tcg_debug_assert(have_movbe);
        use_movbe = true;
        movop = OPC_MOVBE_GyMy;
    }

    switch (memop & MO_SSIZE) {
    case MO_UB:
        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_SB:
        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_UW:
        if (use_movbe) {
            /* There is no extending movbe; only low 16-bits are modified.  */
            if (datalo != h.base && datalo != h.index) {
                /*
                 * datalo is not part of the address, so it can be
                 * zeroed before the load.  XOR breaks dependency chains.
                 */
                tgen_arithr(s, ARITH_XOR, datalo, datalo);
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                         datalo, h.base, h.index, 0, h.ofs);
            } else {
                /* datalo is an address register: load, then zero-extend. */
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                         datalo, h.base, h.index, 0, h.ofs);
                tcg_out_ext16u(s, datalo, datalo);
            }
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        }
        break;
    case MO_SW:
        if (use_movbe) {
            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                     datalo, h.base, h.index, 0, h.ofs);
            tcg_out_ext16s(s, type, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
                                     datalo, h.base, h.index, 0, h.ofs);
        }
        break;
    case MO_UL:
        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
#if TCG_TARGET_REG_BITS == 64
    case MO_SL:
        if (use_movbe) {
            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_ext32s(s, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        }
        break;
#endif
    case MO_UQ:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            break;
        }
        /*
         * 32-bit host: two 32-bit loads.  When byte-swapping, exchange
         * the registers so that the word at the lower address lands in
         * the caller's datahi.
         */
        if (use_movbe) {
            TCGReg t = datalo;
            datalo = datahi;
            datahi = t;
        }
        if (h.base == datalo || h.index == datalo) {
            /*
             * The first load would overwrite an address register.
             * Form the address in datahi first, which is written last.
             */
            tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
            tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
        } else {
            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 4);
        }
        break;

    case MO_128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);

        /*
         * Without 16-byte atomicity, use integer regs.
         * That is where we want the data, and it allows bswaps.
         */
        if (h.aa.atom < MO_128) {
            /* Same register exchange as for MO_UQ above. */
            if (use_movbe) {
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            if (h.base == datalo || h.index == datalo) {
                /* Same address-overlap handling as for MO_UQ above. */
                tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
                                         h.base, h.index, 0, h.ofs);
                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
                                     datalo, datahi, 0);
                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
                                     datahi, datahi, 8);
            } else {
                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                         h.base, h.index, 0, h.ofs);
                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
                                         h.base, h.index, 0, h.ofs + 8);
            }
            break;
        }

        /*
         * With 16-byte atomicity, a vector load is required.
         * If we already have 16-byte alignment, then VMOVDQA always works.
         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
         * Else we require a runtime test for alignment before using
         * VMOVDQA; use VMOVDQU on the unaligned nonatomic path for
         * simplicity.
         */
        if (h.aa.align >= MO_128) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else {
            TCGLabel *l1 = gen_new_label();
            TCGLabel *l2 = gen_new_label();

            /* Branch to l1 if any of the low four bits of h.base is set. */
            tcg_out_testi(s, h.base, 15);
            tcg_out_jxx(s, JCC_JNE, l1, true);

            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_jxx(s, JCC_JMP, l2, true);

            tcg_out_label(s, l1);
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_label(s, l2);
        }
        /* Move the two 64-bit halves from the vector temp to datalo/datahi. */
        tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
        break;

    default:
        g_assert_not_reached();
    }
}
2281
2282 static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2283                             TCGReg addrlo, TCGReg addrhi,
2284                             MemOpIdx oi, TCGType data_type)
2285 {
2286     TCGLabelQemuLdst *ldst;
2287     HostAddress h;
2288
2289     ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
2290     tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));
2291
2292     if (ldst) {
2293         ldst->type = data_type;
2294         ldst->datalo_reg = datalo;
2295         ldst->datahi_reg = datahi;
2296         ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2297     }
2298 }
2299
/*
 * Emit the fast-path store for a guest memory access.
 *
 * @s:       code generation context
 * @datalo:  source register (low half for a two-register value)
 * @datahi:  source register for the high half; used for 64-bit data
 *           on a 32-bit host and for 128-bit data
 * @h:       host address operands
 * @memop:   size and byte-swap of the access
 */
static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   HostAddress h, MemOp memop)
{
    bool use_movbe = false;
    int movop = OPC_MOVL_EvGv;

    /*
     * Do big-endian stores with movbe or system-mode.
     * User-only without movbe will have its swapping done generically.
     */
    if (memop & MO_BSWAP) {
        tcg_debug_assert(have_movbe);
        use_movbe = true;
        movop = OPC_MOVBE_MyGy;
    }

    switch (memop & MO_SIZE) {
    case MO_8:
        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
                                 datalo, h.base, h.index, 0, h.ofs);
        break;
    case MO_16:
        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_32:
        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_64:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        } else {
            /*
             * 32-bit host: two 32-bit stores.  When byte-swapping,
             * exchange the registers so that the caller's datahi is
             * written to the lower address.
             */
            if (use_movbe) {
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 4);
        }
        break;

    case MO_128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);

        /*
         * Without 16-byte atomicity, use integer regs.
         * That is where we have the data, and it allows bswaps.
         */
        if (h.aa.atom < MO_128) {
            /* Same register exchange as for MO_64 above. */
            if (use_movbe) {
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 8);
            break;
        }

        /*
         * With 16-byte atomicity, a vector store is required.
         * If we already have 16-byte alignment, then VMOVDQA always works.
         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
         * Else we require a runtime test for alignment before using
         * VMOVDQA; use VMOVDQU on the unaligned nonatomic path for
         * simplicity.
         */
        /* Gather the two 64-bit halves into the vector temp first. */
        tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
        if (h.aa.align >= MO_128) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else {
            TCGLabel *l1 = gen_new_label();
            TCGLabel *l2 = gen_new_label();

            /* Branch to l1 if any of the low four bits of h.base is set. */
            tcg_out_testi(s, h.base, 15);
            tcg_out_jxx(s, JCC_JNE, l1, true);

            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_jxx(s, JCC_JMP, l2, true);

            tcg_out_label(s, l1);
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_label(s, l2);
        }
        break;

    default:
        g_assert_not_reached();
    }
}
2408
2409 static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2410                             TCGReg addrlo, TCGReg addrhi,
2411                             MemOpIdx oi, TCGType data_type)
2412 {
2413     TCGLabelQemuLdst *ldst;
2414     HostAddress h;
2415
2416     ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
2417     tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));
2418
2419     if (ldst) {
2420         ldst->type = data_type;
2421         ldst->datalo_reg = datalo;
2422         ldst->datahi_reg = datahi;
2423         ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2424     }
2425 }
2426
2427 static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2428 {
2429     /* Reuse the zeroing that exists for goto_ptr.  */
2430     if (a0 == 0) {
2431         tcg_out_jmp(s, tcg_code_gen_epilogue);
2432     } else {
2433         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2434         tcg_out_jmp(s, tb_ret_addr);
2435     }
2436 }
2437
2438 static void tcg_out_goto_tb(TCGContext *s, int which)
2439 {
2440     /*
2441      * Jump displacement must be aligned for atomic patching;
2442      * see if we need to add extra nops before jump
2443      */
2444     int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2445     if (gap != 1) {
2446         tcg_out_nopn(s, gap - 1);
2447     }
2448     tcg_out8(s, OPC_JMP_long); /* jmp im */
2449     set_jmp_insn_offset(s, which);
2450     tcg_out32(s, 0);
2451     set_jmp_reset_offset(s, which);
2452 }
2453
2454 void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2455                               uintptr_t jmp_rx, uintptr_t jmp_rw)
2456 {
2457     /* patch the branch destination */
2458     uintptr_t addr = tb->jmp_target_addr[n];
2459     qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2460     /* no need to flush icache explicitly */
2461 }
2462
/*
 * Emit host code for one integer TCG opcode.
 *
 * @s:          code generation context, handed to every emitter helper.
 * @opc:        opcode to emit; selects the switch case below.
 * @args:       operand array for @opc.  The meaning of each slot depends on
 *              the opcode; slots 0..2 are preloaded into a0/a1/a2.
 * @const_args: parallel to @args.  Where the code below tests an entry, a
 *              non-zero value makes it emit that operand as an immediate
 *              instead of passing it on as a register operand.
 *
 * The opcodes listed just above the default label are not expected here
 * (their comments name the routine that emits them); they and any unknown
 * opcode end in g_assert_not_reached().
 */
2463 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2464                               const TCGArg args[TCG_MAX_OP_ARGS],
2465                               const int const_args[TCG_MAX_OP_ARGS])
2466 {
2467     TCGArg a0, a1, a2;
2468     int c, const_a2, vexop, rexw = 0;
2469
         /*
          * OP_32_64(x) expands to the case label(s) for opcode x.  On 64-bit
          * hosts the _i64 label sets rexw = P_REXW and falls through into the
          * _i32 label, so one body serves both widths by adding rexw to the
          * opcode.  On 32-bit hosts only the _i32 label exists and rexw
          * keeps its initial value of 0.
          */
2470 #if TCG_TARGET_REG_BITS == 64
2471 # define OP_32_64(x) \
2472         case glue(glue(INDEX_op_, x), _i64): \
2473             rexw = P_REXW; /* FALLTHRU */    \
2474         case glue(glue(INDEX_op_, x), _i32)
2475 #else
2476 # define OP_32_64(x) \
2477         case glue(glue(INDEX_op_, x), _i32)
2478 #endif
2479
2480     /* Hoist the loads of the most common arguments.  */
2481     a0 = args[0];
2482     a1 = args[1];
2483     a2 = args[2];
2484     const_a2 = const_args[2];
2485
2486     switch (opc) {
2487     case INDEX_op_goto_ptr:
2488         /* jmp to the given host address (could be epilogue) */
2489         tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2490         break;
2491     case INDEX_op_br:
2492         tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2493         break;
2494     OP_32_64(ld8u):
2495         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2496         tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2497         break;
2498     OP_32_64(ld8s):
2499         tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2500         break;
2501     OP_32_64(ld16u):
2502         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2503         tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2504         break;
2505     OP_32_64(ld16s):
2506         tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2507         break;
         /* On 64-bit hosts ld32u_i64 shares the plain 32-bit load. */
2508 #if TCG_TARGET_REG_BITS == 64
2509     case INDEX_op_ld32u_i64:
2510 #endif
2511     case INDEX_op_ld_i32:
2512         tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2513         break;
2514
         /*
          * Stores.  When const_args[0] is set, the value a0 is written out
          * as an immediate of the store's width right after the
          * tcg_out_modrm_offset() output; otherwise a0 is passed on as the
          * register operand.
          */
2515     OP_32_64(st8):
2516         if (const_args[0]) {
2517             tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2518             tcg_out8(s, a0);
2519         } else {
2520             tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2521         }
2522         break;
2523     OP_32_64(st16):
2524         if (const_args[0]) {
2525             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2526             tcg_out16(s, a0);
2527         } else {
2528             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2529         }
2530         break;
2531 #if TCG_TARGET_REG_BITS == 64
2532     case INDEX_op_st32_i64:
2533 #endif
2534     case INDEX_op_st_i32:
2535         if (const_args[0]) {
2536             tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2537             tcg_out32(s, a0);
2538         } else {
2539             tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2540         }
2541         break;
2542
2543     OP_32_64(add):
2544         /* For 3-operand addition, use LEA.  */
2545         if (a0 != a1) {
2546             TCGArg c3 = 0;
2547             if (const_a2) {
                     /* Constant addend: pass it as c3 and replace a2 by -1. */
2548                 c3 = a2, a2 = -1;
2549             } else if (a0 == a2) {
2550                 /* Watch out for dest = src + dest, since we've removed
2551                    the matching constraint on the add.  */
2552                 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2553                 break;
2554             }
2555
2556             tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2557             break;
2558         }
2559         c = ARITH_ADD;
2560         goto gen_arith;
2561     OP_32_64(sub):
2562         c = ARITH_SUB;
2563         goto gen_arith;
2564     OP_32_64(and):
2565         c = ARITH_AND;
2566         goto gen_arith;
2567     OP_32_64(or):
2568         c = ARITH_OR;
2569         goto gen_arith;
2570     OP_32_64(xor):
2571         c = ARITH_XOR;
2572         goto gen_arith;
         /*
          * Shared tail for the arithmetic group c: only a0 and a2 are
          * encoded; a1 is not referenced here.
          */
2573     gen_arith:
2574         if (const_a2) {
2575             tgen_arithi(s, c + rexw, a0, a2, 0);
2576         } else {
2577             tgen_arithr(s, c + rexw, a0, a2);
2578         }
2579         break;
2580
2581     OP_32_64(andc):
             /*
              * Constant mask: copy a1 into a0, then AND a0 with the
              * complemented constant.  Register mask: emit OPC_ANDN through
              * the VEX encoder; note that a2 is passed before a1.
              */
2582         if (const_a2) {
2583             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2584             tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2585         } else {
2586             tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2587         }
2588         break;
2589
2590     OP_32_64(mul):
2591         if (const_a2) {
                 /*
                  * The constant is narrowed to int32_t.  If it survives a
                  * round trip through int8_t, the one-byte immediate form
                  * is used, otherwise the four-byte form.
                  */
2592             int32_t val;
2593             val = a2;
2594             if (val == (int8_t)val) {
2595                 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2596                 tcg_out8(s, val);
2597             } else {
2598                 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2599                 tcg_out32(s, val);
2600             }
2601         } else {
2602             tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2603         }
2604         break;
2605
         /*
          * Only args[4] is encoded for the divides; no other operand is
          * referenced here.
          */
2606     OP_32_64(div2):
2607         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2608         break;
2609     OP_32_64(divu2):
2610         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2611         break;
2612
2613     OP_32_64(shl):
2614         /* For small constant 3-operand shift, use LEA.  */
             /*
              * NOTE(review): (a2 - 1) < 3 accepts counts 1..3 provided TCGArg
              * is unsigned, so that a count of 0 wraps and is rejected --
              * confirm against the TCGArg typedef.
              */
2615         if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2616             if (a2 - 1 == 0) {
2617                 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2618                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2619             } else {
2620                 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2621                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2622             }
2623             break;
2624         }
2625         c = SHIFT_SHL;
2626         vexop = OPC_SHLX;
2627         goto gen_shift_maybe_vex;
2628     OP_32_64(shr):
2629         c = SHIFT_SHR;
2630         vexop = OPC_SHRX;
2631         goto gen_shift_maybe_vex;
2632     OP_32_64(sar):
2633         c = SHIFT_SAR;
2634         vexop = OPC_SARX;
2635         goto gen_shift_maybe_vex;
         /* Rotates set no vexop and jump straight to gen_shift. */
2636     OP_32_64(rotl):
2637         c = SHIFT_ROL;
2638         goto gen_shift;
2639     OP_32_64(rotr):
2640         c = SHIFT_ROR;
2641         goto gen_shift;
         /*
          * With have_bmi2 and a register count, emit the three-operand VEX
          * shift and stop.  With have_bmi2 and a constant count, first copy
          * a1 into a0, then continue into gen_shift.  Without have_bmi2,
          * continue into gen_shift directly.
          */
2642     gen_shift_maybe_vex:
2643         if (have_bmi2) {
2644             if (!const_a2) {
2645                 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2646                 break;
2647             }
2648             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2649         }
2650         /* FALLTHRU */
         /*
          * Two-operand shift/rotate of a0.  A constant count goes through
          * tcg_out_shifti(); otherwise the OPC_SHIFT_cl form is emitted and
          * a2 is not encoded at all.
          */
2651     gen_shift:
2652         if (const_a2) {
2653             tcg_out_shifti(s, c + rexw, a0, a2);
2654         } else {
2655             tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2656         }
2657         break;
2658
2659     OP_32_64(ctz):
2660         tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2661         break;
2662     OP_32_64(clz):
2663         tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2664         break;
2665     OP_32_64(ctpop):
2666         tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2667         break;
2668
2669     OP_32_64(brcond):
2670         tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1],
2671                        arg_label(args[3]), 0);
2672         break;
         /* setcond and negsetcond differ only in the final boolean. */
2673     OP_32_64(setcond):
2674         tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false);
2675         break;
2676     OP_32_64(negsetcond):
2677         tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true);
2678         break;
2679     OP_32_64(movcond):
2680         tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]);
2681         break;
2682
         /* For the byte swaps, a2 carries the TCG_BSWAP_* flags. */
2683     OP_32_64(bswap16):
2684         if (a2 & TCG_BSWAP_OS) {
2685             /* Output must be sign-extended. */
2686             if (rexw) {
2687                 tcg_out_bswap64(s, a0);
2688                 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2689             } else {
2690                 tcg_out_bswap32(s, a0);
2691                 tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2692             }
2693         } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2694             /* Output must be zero-extended, but input isn't. */
2695             tcg_out_bswap32(s, a0);
2696             tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2697         } else {
2698             tcg_out_rolw_8(s, a0);
2699         }
2700         break;
2701     OP_32_64(bswap32):
2702         tcg_out_bswap32(s, a0);
             /* Only the 64-bit opcode with TCG_BSWAP_OS needs the sign extension. */
2703         if (rexw && (a2 & TCG_BSWAP_OS)) {
2704             tcg_out_ext32s(s, a0, a0);
2705         }
2706         break;
2707
2708     OP_32_64(neg):
2709         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2710         break;
2711     OP_32_64(not):
2712         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2713         break;
2714
         /*
          * Guest loads.  The argument layout depends on the host register
          * width.  On 64-bit hosts every i32/i64 variant passes a0 and a1 as
          * registers and a2 as the trailing argument.  On 32-bit hosts a
          * 64-bit value and/or a 64-bit address takes two argument slots,
          * which moves the trailing argument to args[3] or args[4].
          * The -1 values fill the positions that have no register in the
          * given configuration (presumably the high-half parameters of
          * tcg_out_qemu_ld -- its definition is outside this view).
          */
2715     case INDEX_op_qemu_ld_a64_i32:
2716         if (TCG_TARGET_REG_BITS == 32) {
2717             tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2718             break;
2719         }
2720         /* fall through */
2721     case INDEX_op_qemu_ld_a32_i32:
2722         tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2723         break;
2724     case INDEX_op_qemu_ld_a32_i64:
2725         if (TCG_TARGET_REG_BITS == 64) {
2726             tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2727         } else {
2728             tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2729         }
2730         break;
2731     case INDEX_op_qemu_ld_a64_i64:
2732         if (TCG_TARGET_REG_BITS == 64) {
2733             tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2734         } else {
2735             tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2736         }
2737         break;
         /* 128-bit accesses are asserted to occur on 64-bit hosts only. */
2738     case INDEX_op_qemu_ld_a32_i128:
2739     case INDEX_op_qemu_ld_a64_i128:
2740         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2741         tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2742         break;
2743
         /*
          * Guest stores mirror the load layout above; the st8 opcodes share
          * the code of the corresponding st opcodes.
          */
2744     case INDEX_op_qemu_st_a64_i32:
2745     case INDEX_op_qemu_st8_a64_i32:
2746         if (TCG_TARGET_REG_BITS == 32) {
2747             tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2748             break;
2749         }
2750         /* fall through */
2751     case INDEX_op_qemu_st_a32_i32:
2752     case INDEX_op_qemu_st8_a32_i32:
2753         tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2754         break;
2755     case INDEX_op_qemu_st_a32_i64:
2756         if (TCG_TARGET_REG_BITS == 64) {
2757             tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2758         } else {
2759             tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2760         }
2761         break;
2762     case INDEX_op_qemu_st_a64_i64:
2763         if (TCG_TARGET_REG_BITS == 64) {
2764             tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2765         } else {
2766             tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2767         }
2768         break;
2769     case INDEX_op_qemu_st_a32_i128:
2770     case INDEX_op_qemu_st_a64_i128:
2771         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2772         tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2773         break;
2774
         /* Only args[3] is encoded for the widening multiplies. */
2775     OP_32_64(mulu2):
2776         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2777         break;
2778     OP_32_64(muls2):
2779         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2780         break;
         /*
          * Double-word add/sub as two instructions: args[4] is applied to a0
          * with ARITH_ADD/ARITH_SUB first, then args[5] is applied to a1 with
          * ARITH_ADC/ARITH_SBB, so the order of the two halves must be kept.
          * NOTE(review): the immediate forms pass 1 as the last tgen_arithi
          * argument where gen_arith passes 0 -- presumably it tells
          * tgen_arithi that the resulting flags are needed; confirm against
          * its definition.
          */
2781     OP_32_64(add2):
2782         if (const_args[4]) {
2783             tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2784         } else {
2785             tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2786         }
2787         if (const_args[5]) {
2788             tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2789         } else {
2790             tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2791         }
2792         break;
2793     OP_32_64(sub2):
2794         if (const_args[4]) {
2795             tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2796         } else {
2797             tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2798         }
2799         if (const_args[5]) {
2800             tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2801         } else {
2802             tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2803         }
2804         break;
2805
         /* Opcodes that exist for only one host register width. */
2806 #if TCG_TARGET_REG_BITS == 32
2807     case INDEX_op_brcond2_i32:
2808         tcg_out_brcond2(s, args, const_args, 0);
2809         break;
2810     case INDEX_op_setcond2_i32:
2811         tcg_out_setcond2(s, args, const_args);
2812         break;
2813 #else /* TCG_TARGET_REG_BITS == 64 */
2814     case INDEX_op_ld32s_i64:
2815         tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2816         break;
2817     case INDEX_op_ld_i64:
2818         tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2819         break;
2820     case INDEX_op_st_i64:
2821         if (const_args[0]) {
                 /* The constant is emitted as a 4-byte immediate (tcg_out32). */
2822             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2823             tcg_out32(s, a0);
2824         } else {
2825             tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2826         }
2827         break;
2828
2829     case INDEX_op_bswap64_i64:
2830         tcg_out_bswap64(s, a0);
2831         break;
2832     case INDEX_op_extrh_i64_i32:
2833         tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2834         break;
2835 #endif
2836
         /*
          * deposit: args[3] and args[4] select the field.  Only the
          * combinations tested below are handled; anything else asserts.
          */
2837     OP_32_64(deposit):
2838         if (args[3] == 0 && args[4] == 8) {
2839             /* load bits 0..7 */
2840             if (const_a2) {
2841                 tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0),
2842                             0, a0, 0);
2843                 tcg_out8(s, a2);
2844             } else {
2845                 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2846             }
2847         } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) {
2848             /* load bits 8..15 */
                 /* Taken on 32-bit hosts only; the destination is encoded as a0 + 4. */
2849             if (const_a2) {
2850                 tcg_out8(s, OPC_MOVB_Ib + a0 + 4);
2851                 tcg_out8(s, a2);
2852             } else {
2853                 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2854             }
2855         } else if (args[3] == 0 && args[4] == 16) {
2856             /* load bits 0..15 */
2857             if (const_a2) {
2858                 tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0),
2859                             0, a0, 0);
2860                 tcg_out16(s, a2);
2861             } else {
2862                 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2863             }
2864         } else {
2865             g_assert_not_reached();
2866         }
2867         break;
2868
2869     case INDEX_op_extract_i64:
2870         if (a2 + args[3] == 32) {
2871             /* This is a 32-bit zero-extending right shift.  */
2872             tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2873             tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2874             break;
2875         }
2876         /* FALLTHRU */
2877     case INDEX_op_extract_i32:
2878         /* On the off-chance that we can use the high-byte registers.
2879            Otherwise we emit the same ext16 + shift pattern that we
2880            would have gotten from the normal tcg-op.c expansion.  */
2881         tcg_debug_assert(a2 == 8 && args[3] == 8);
             /* The single-instruction form is used only when a1 < 4 and a0 < 8. */
2882         if (a1 < 4 && a0 < 8) {
2883             tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2884         } else {
2885             tcg_out_ext16u(s, a0, a1);
2886             tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2887         }
2888         break;
2889
2890     case INDEX_op_sextract_i32:
2891         /* We don't implement sextract_i64, as we cannot sign-extend to
2892            64-bits without using the REX prefix that explicitly excludes
2893            access to the high-byte registers.  */
2894         tcg_debug_assert(a2 == 8 && args[3] == 8);
2895         if (a1 < 4 && a0 < 8) {
2896             tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2897         } else {
2898             tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
2899             tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2900         }
2901         break;
2902
2903     OP_32_64(extract2):
2904         /* Note that SHRD outputs to the r/m operand.  */
2905         tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2906         tcg_out8(s, args[3]);
2907         break;
2908
2909     case INDEX_op_mb:
2910         tcg_out_mb(s, a0);
2911         break;
2912     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2913     case INDEX_op_mov_i64:
2914     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2915     case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
2916     case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
2917     case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
2918     case INDEX_op_ext8s_i64:
2919     case INDEX_op_ext8u_i32:
2920     case INDEX_op_ext8u_i64:
2921     case INDEX_op_ext16s_i32:
2922     case INDEX_op_ext16s_i64:
2923     case INDEX_op_ext16u_i32:
2924     case INDEX_op_ext16u_i64:
2925     case INDEX_op_ext32s_i64:
2926     case INDEX_op_ext32u_i64:
2927     case INDEX_op_ext_i32_i64:
2928     case INDEX_op_extu_i32_i64:
2929     case INDEX_op_extrl_i64_i32:
2930     default:
2931         g_assert_not_reached();
2932     }
2933
2934 #undef OP_32_64
2935 }
2936
/*
 * Emit host code for one vector TCG opcode.
 *
 * @s:          code generation context, handed to every emitter helper.
 * @opc:        vector opcode to emit; selects the switch case below.
 * @vecl:       vector length selector; the operand type used below is
 *              vecl + TCG_TYPE_V64.
 * @vece:       element size selector; indexes the four-entry opcode tables
 *              and is compared against the MO_* constants.
 * @args:       operand array for @opc; slots 0..2 are preloaded into
 *              a0/a1/a2 and further slots are read per opcode.
 * @const_args: not referenced by this function.
 *
 * Most cases only choose an instruction (and sometimes reorder operands)
 * and then jump to one of the shared tails gen_simd, gen_shift or
 * gen_simd_imm8, each of which adds P_VEXL when the type is TCG_TYPE_V256.
 * Opcodes listed above the default label, and unknown ones, end in
 * g_assert_not_reached().
 */
2937 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2938                            unsigned vecl, unsigned vece,
2939                            const TCGArg args[TCG_MAX_OP_ARGS],
2940                            const int const_args[TCG_MAX_OP_ARGS])
2941 {
         /*
          * Opcode tables indexed by vece.  OPC_UD2 fills the slots for which
          * a table has no instruction; gen_simd and gen_simd_imm8 assert
          * that such a slot was not selected.
          */
2942     static int const add_insn[4] = {
2943         OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2944     };
2945     static int const ssadd_insn[4] = {
2946         OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2947     };
2948     static int const usadd_insn[4] = {
2949         OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2950     };
2951     static int const sub_insn[4] = {
2952         OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2953     };
2954     static int const sssub_insn[4] = {
2955         OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2956     };
2957     static int const ussub_insn[4] = {
2958         OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2959     };
2960     static int const mul_insn[4] = {
2961         OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
2962     };
2963     static int const shift_imm_insn[4] = {
2964         OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2965     };
2966     static int const cmpeq_insn[4] = {
2967         OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2968     };
2969     static int const cmpgt_insn[4] = {
2970         OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2971     };
2972     static int const punpckl_insn[4] = {
2973         OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2974     };
2975     static int const punpckh_insn[4] = {
2976         OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2977     };
2978     static int const packss_insn[4] = {
2979         OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2980     };
2981     static int const packus_insn[4] = {
2982         OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2983     };
2984     static int const smin_insn[4] = {
2985         OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
2986     };
2987     static int const smax_insn[4] = {
2988         OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
2989     };
2990     static int const umin_insn[4] = {
2991         OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
2992     };
2993     static int const umax_insn[4] = {
2994         OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
2995     };
2996     static int const rotlv_insn[4] = {
2997         OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
2998     };
2999     static int const rotrv_insn[4] = {
3000         OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
3001     };
3002     static int const shlv_insn[4] = {
3003         OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
3004     };
3005     static int const shrv_insn[4] = {
3006         OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
3007     };
3008     static int const sarv_insn[4] = {
3009         OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
3010     };
3011     static int const shls_insn[4] = {
3012         OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
3013     };
3014     static int const shrs_insn[4] = {
3015         OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
3016     };
3017     static int const sars_insn[4] = {
3018         OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
3019     };
3020     static int const vpshldi_insn[4] = {
3021         OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
3022     };
3023     static int const vpshldv_insn[4] = {
3024         OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
3025     };
3026     static int const vpshrdv_insn[4] = {
3027         OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
3028     };
3029     static int const abs_insn[4] = {
3030         OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
3031     };
3032
3033     TCGType type = vecl + TCG_TYPE_V64;
         /*
          * insn is the opcode being assembled.  sub is reused for three
          * things: the comparison condition in cmp_vec, the value passed in
          * the first operand slot by gen_shift, and the trailing immediate
          * byte emitted by gen_simd_imm8.
          */
3034     int insn, sub;
3035     TCGArg a0, a1, a2, a3;
3036
3037     a0 = args[0];
3038     a1 = args[1];
3039     a2 = args[2];
3040
3041     switch (opc) {
3042     case INDEX_op_add_vec:
3043         insn = add_insn[vece];
3044         goto gen_simd;
3045     case INDEX_op_ssadd_vec:
3046         insn = ssadd_insn[vece];
3047         goto gen_simd;
3048     case INDEX_op_usadd_vec:
3049         insn = usadd_insn[vece];
3050         goto gen_simd;
3051     case INDEX_op_sub_vec:
3052         insn = sub_insn[vece];
3053         goto gen_simd;
3054     case INDEX_op_sssub_vec:
3055         insn = sssub_insn[vece];
3056         goto gen_simd;
3057     case INDEX_op_ussub_vec:
3058         insn = ussub_insn[vece];
3059         goto gen_simd;
3060     case INDEX_op_mul_vec:
3061         insn = mul_insn[vece];
3062         goto gen_simd;
         /* The bitwise operations use one opcode regardless of vece. */
3063     case INDEX_op_and_vec:
3064         insn = OPC_PAND;
3065         goto gen_simd;
3066     case INDEX_op_or_vec:
3067         insn = OPC_POR;
3068         goto gen_simd;
3069     case INDEX_op_xor_vec:
3070         insn = OPC_PXOR;
3071         goto gen_simd;
3072     case INDEX_op_smin_vec:
3073         insn = smin_insn[vece];
3074         goto gen_simd;
3075     case INDEX_op_umin_vec:
3076         insn = umin_insn[vece];
3077         goto gen_simd;
3078     case INDEX_op_smax_vec:
3079         insn = smax_insn[vece];
3080         goto gen_simd;
3081     case INDEX_op_umax_vec:
3082         insn = umax_insn[vece];
3083         goto gen_simd;
3084     case INDEX_op_shlv_vec:
3085         insn = shlv_insn[vece];
3086         goto gen_simd;
3087     case INDEX_op_shrv_vec:
3088         insn = shrv_insn[vece];
3089         goto gen_simd;
3090     case INDEX_op_sarv_vec:
3091         insn = sarv_insn[vece];
3092         goto gen_simd;
3093     case INDEX_op_rotlv_vec:
3094         insn = rotlv_insn[vece];
3095         goto gen_simd;
3096     case INDEX_op_rotrv_vec:
3097         insn = rotrv_insn[vece];
3098         goto gen_simd;
3099     case INDEX_op_shls_vec:
3100         insn = shls_insn[vece];
3101         goto gen_simd;
3102     case INDEX_op_shrs_vec:
3103         insn = shrs_insn[vece];
3104         goto gen_simd;
3105     case INDEX_op_sars_vec:
3106         insn = sars_insn[vece];
3107         goto gen_simd;
3108     case INDEX_op_x86_punpckl_vec:
3109         insn = punpckl_insn[vece];
3110         goto gen_simd;
3111     case INDEX_op_x86_punpckh_vec:
3112         insn = punpckh_insn[vece];
3113         goto gen_simd;
3114     case INDEX_op_x86_packss_vec:
3115         insn = packss_insn[vece];
3116         goto gen_simd;
3117     case INDEX_op_x86_packus_vec:
3118         insn = packus_insn[vece];
3119         goto gen_simd;
         /*
          * For the two variable double-shifts the original a1 is dropped:
          * the operands handed to gen_simd become (a0, args[2], args[3]).
          */
3120     case INDEX_op_x86_vpshldv_vec:
3121         insn = vpshldv_insn[vece];
3122         a1 = a2;
3123         a2 = args[3];
3124         goto gen_simd;
3125     case INDEX_op_x86_vpshrdv_vec:
3126         insn = vpshrdv_insn[vece];
3127         a1 = a2;
3128         a2 = args[3];
3129         goto gen_simd;
3130 #if TCG_TARGET_REG_BITS == 32
3131     case INDEX_op_dup2_vec:
3132         /* First merge the two 32-bit inputs to a single 64-bit element. */
3133         tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
3134         /* Then replicate the 64-bit elements across the rest of the vector. */
3135         if (type != TCG_TYPE_V64) {
3136             tcg_out_dup_vec(s, type, MO_64, a0, a0);
3137         }
3138         break;
3139 #endif
3140     case INDEX_op_abs_vec:
             /* Single source: it moves to the last operand slot, the middle slot is 0. */
3141         insn = abs_insn[vece];
3142         a2 = a1;
3143         a1 = 0;
3144         goto gen_simd;
         /* Shared tail: emit insn with operands (a0, a1, a2), no immediate. */
3145     gen_simd:
3146         tcg_debug_assert(insn != OPC_UD2);
3147         if (type == TCG_TYPE_V256) {
3148             insn |= P_VEXL;
3149         }
3150         tcg_out_vex_modrm(s, insn, a0, a1, a2);
3151         break;
3152
3153     case INDEX_op_cmp_vec:
             /* Only TCG_COND_EQ and TCG_COND_GT are handled; others assert. */
3154         sub = args[3];
3155         if (sub == TCG_COND_EQ) {
3156             insn = cmpeq_insn[vece];
3157         } else if (sub == TCG_COND_GT) {
3158             insn = cmpgt_insn[vece];
3159         } else {
3160             g_assert_not_reached();
3161         }
3162         goto gen_simd;
3163
3164     case INDEX_op_andc_vec:
             /* Emitted directly because a2 is passed before a1 here. */
3165         insn = OPC_PANDN;
3166         if (type == TCG_TYPE_V256) {
3167             insn |= P_VEXL;
3168         }
3169         tcg_out_vex_modrm(s, insn, a0, a2, a1);
3170         break;
3171
         /*
          * Shifts and rotates by an immediate.  Each case picks the opcode
          * and a value for sub, then jumps to gen_shift.
          */
3172     case INDEX_op_shli_vec:
3173         insn = shift_imm_insn[vece];
3174         sub = 6;
3175         goto gen_shift;
3176     case INDEX_op_shri_vec:
3177         insn = shift_imm_insn[vece];
3178         sub = 2;
3179         goto gen_shift;
3180     case INDEX_op_sari_vec:
             /* MO_64 bypasses the table: OPC_PSHIFTD_Ib with P_VEXW and P_EVEX. */
3181         if (vece == MO_64) {
3182             insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
3183         } else {
3184             insn = shift_imm_insn[vece];
3185         }
3186         sub = 4;
3187         goto gen_shift;
3188     case INDEX_op_rotli_vec:
3189         insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
3190         if (vece == MO_64) {
3191             insn |= P_VEXW;
3192         }
3193         sub = 1;
3194         goto gen_shift;
         /*
          * Shared tail for the immediate shifts: sub goes in the first
          * operand slot (where the other tails pass a0), followed by a0 and
          * a1, and a2 is emitted as the trailing immediate byte.
          * NOTE(review): sub is presumably the ModRM opcode extension of
          * the shift group -- confirm against tcg_out_vex_modrm.
          */
3195     gen_shift:
3196         tcg_debug_assert(vece != MO_8);
3197         if (type == TCG_TYPE_V256) {
3198             insn |= P_VEXL;
3199         }
3200         tcg_out_vex_modrm(s, insn, sub, a0, a1);
3201         tcg_out8(s, a2);
3202         break;
3203
3204     case INDEX_op_ld_vec:
3205         tcg_out_ld(s, type, a0, a1, a2);
3206         break;
3207     case INDEX_op_st_vec:
3208         tcg_out_st(s, type, a0, a1, a2);
3209         break;
3210     case INDEX_op_dupm_vec:
3211         tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3212         break;
3213
         /* Instructions taking an immediate byte from args[3]. */
3214     case INDEX_op_x86_shufps_vec:
3215         insn = OPC_SHUFPS;
3216         sub = args[3];
3217         goto gen_simd_imm8;
3218     case INDEX_op_x86_blend_vec:
             /* Only MO_16 and MO_32 are handled; other sizes assert. */
3219         if (vece == MO_16) {
3220             insn = OPC_PBLENDW;
3221         } else if (vece == MO_32) {
3222             insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3223         } else {
3224             g_assert_not_reached();
3225         }
3226         sub = args[3];
3227         goto gen_simd_imm8;
3228     case INDEX_op_x86_vperm2i128_vec:
3229         insn = OPC_VPERM2I128;
3230         sub = args[3];
3231         goto gen_simd_imm8;
3232     case INDEX_op_x86_vpshldi_vec:
3233         insn = vpshldi_insn[vece];
3234         sub = args[3];
3235         goto gen_simd_imm8;
3236
         /*
          * Logic operations built on OPC_VPTERNLOGQ; sub is the immediate
          * selecting the function, as named in the per-case comments.
          */
3237     case INDEX_op_not_vec:
3238         insn = OPC_VPTERNLOGQ;
3239         a2 = a1;
3240         sub = 0x33; /* !B */
3241         goto gen_simd_imm8;
3242     case INDEX_op_nor_vec:
3243         insn = OPC_VPTERNLOGQ;
3244         sub = 0x11; /* norCB */
3245         goto gen_simd_imm8;
3246     case INDEX_op_nand_vec:
3247         insn = OPC_VPTERNLOGQ;
3248         sub = 0x77; /* nandCB */
3249         goto gen_simd_imm8;
3250     case INDEX_op_eqv_vec:
3251         insn = OPC_VPTERNLOGQ;
3252         sub = 0x99; /* xnorCB */
3253         goto gen_simd_imm8;
3254     case INDEX_op_orc_vec:
3255         insn = OPC_VPTERNLOGQ;
3256         sub = 0xdd; /* orB!C */
3257         goto gen_simd_imm8;
3258
3259     case INDEX_op_bitsel_vec:
             /*
              * Three inputs (a1, a2, a3).  If a0 already equals a1 or a2,
              * the other two inputs become the explicit operands; otherwise
              * a3 is first copied into a0.  The immediate is chosen to
              * match the resulting operand arrangement.
              */
3260         insn = OPC_VPTERNLOGQ;
3261         a3 = args[3];
3262         if (a0 == a1) {
3263             a1 = a2;
3264             a2 = a3;
3265             sub = 0xca; /* A?B:C */
3266         } else if (a0 == a2) {
3267             a2 = a3;
3268             sub = 0xe2; /* B?A:C */
3269         } else {
3270             tcg_out_mov(s, type, a0, a3);
3271             sub = 0xb8; /* B?C:A */
3272         }
3273         goto gen_simd_imm8;
3274
         /* Shared tail: like gen_simd, then sub as a trailing immediate byte. */
3275     gen_simd_imm8:
3276         tcg_debug_assert(insn != OPC_UD2);
3277         if (type == TCG_TYPE_V256) {
3278             insn |= P_VEXL;
3279         }
3280         tcg_out_vex_modrm(s, insn, a0, a1, a2);
3281         tcg_out8(s, sub);
3282         break;
3283
3284     case INDEX_op_x86_vpblendvb_vec:
             /* args[3] is placed in the upper four bits of the trailing byte. */
3285         insn = OPC_VPBLENDVB;
3286         if (type == TCG_TYPE_V256) {
3287             insn |= P_VEXL;
3288         }
3289         tcg_out_vex_modrm(s, insn, a0, a1, a2);
3290         tcg_out8(s, args[3] << 4);
3291         break;
3292
3293     case INDEX_op_x86_psrldq_vec:
             /*
              * Same operand shape as gen_shift with 3 in the first slot and
              * a2 as the immediate byte; P_VEXL is not applied here.
              */
3294         tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3295         tcg_out8(s, a2);
3296         break;
3297
3298     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3299     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3300     default:
3301         g_assert_not_reached();
3302     }
3303 }
3304
3305 static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3306 {
3307     switch (op) {
3308     case INDEX_op_goto_ptr:
3309         return C_O0_I1(r);
3310
3311     case INDEX_op_ld8u_i32:
3312     case INDEX_op_ld8u_i64:
3313     case INDEX_op_ld8s_i32:
3314     case INDEX_op_ld8s_i64:
3315     case INDEX_op_ld16u_i32:
3316     case INDEX_op_ld16u_i64:
3317     case INDEX_op_ld16s_i32:
3318     case INDEX_op_ld16s_i64:
3319     case INDEX_op_ld_i32:
3320     case INDEX_op_ld32u_i64:
3321     case INDEX_op_ld32s_i64:
3322     case INDEX_op_ld_i64:
3323         return C_O1_I1(r, r);
3324
3325     case INDEX_op_st8_i32:
3326     case INDEX_op_st8_i64:
3327         return C_O0_I2(qi, r);
3328
3329     case INDEX_op_st16_i32:
3330     case INDEX_op_st16_i64:
3331     case INDEX_op_st_i32:
3332     case INDEX_op_st32_i64:
3333         return C_O0_I2(ri, r);
3334
3335     case INDEX_op_st_i64:
3336         return C_O0_I2(re, r);
3337
3338     case INDEX_op_add_i32:
3339     case INDEX_op_add_i64:
3340         return C_O1_I2(r, r, re);
3341
3342     case INDEX_op_sub_i32:
3343     case INDEX_op_sub_i64:
3344     case INDEX_op_mul_i32:
3345     case INDEX_op_mul_i64:
3346     case INDEX_op_or_i32:
3347     case INDEX_op_or_i64:
3348     case INDEX_op_xor_i32:
3349     case INDEX_op_xor_i64:
3350         return C_O1_I2(r, 0, re);
3351
3352     case INDEX_op_and_i32:
3353     case INDEX_op_and_i64:
3354         return C_O1_I2(r, 0, reZ);
3355
3356     case INDEX_op_andc_i32:
3357     case INDEX_op_andc_i64:
3358         return C_O1_I2(r, r, rI);
3359
3360     case INDEX_op_shl_i32:
3361     case INDEX_op_shl_i64:
3362     case INDEX_op_shr_i32:
3363     case INDEX_op_shr_i64:
3364     case INDEX_op_sar_i32:
3365     case INDEX_op_sar_i64:
3366         return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3367
3368     case INDEX_op_rotl_i32:
3369     case INDEX_op_rotl_i64:
3370     case INDEX_op_rotr_i32:
3371     case INDEX_op_rotr_i64:
3372         return C_O1_I2(r, 0, ci);
3373
3374     case INDEX_op_brcond_i32:
3375     case INDEX_op_brcond_i64:
3376         return C_O0_I2(r, re);
3377
3378     case INDEX_op_bswap16_i32:
3379     case INDEX_op_bswap16_i64:
3380     case INDEX_op_bswap32_i32:
3381     case INDEX_op_bswap32_i64:
3382     case INDEX_op_bswap64_i64:
3383     case INDEX_op_neg_i32:
3384     case INDEX_op_neg_i64:
3385     case INDEX_op_not_i32:
3386     case INDEX_op_not_i64:
3387     case INDEX_op_extrh_i64_i32:
3388         return C_O1_I1(r, 0);
3389
3390     case INDEX_op_ext8s_i32:
3391     case INDEX_op_ext8s_i64:
3392     case INDEX_op_ext8u_i32:
3393     case INDEX_op_ext8u_i64:
3394         return C_O1_I1(r, q);
3395
3396     case INDEX_op_ext16s_i32:
3397     case INDEX_op_ext16s_i64:
3398     case INDEX_op_ext16u_i32:
3399     case INDEX_op_ext16u_i64:
3400     case INDEX_op_ext32s_i64:
3401     case INDEX_op_ext32u_i64:
3402     case INDEX_op_ext_i32_i64:
3403     case INDEX_op_extu_i32_i64:
3404     case INDEX_op_extrl_i64_i32:
3405     case INDEX_op_extract_i32:
3406     case INDEX_op_extract_i64:
3407     case INDEX_op_sextract_i32:
3408     case INDEX_op_ctpop_i32:
3409     case INDEX_op_ctpop_i64:
3410         return C_O1_I1(r, r);
3411
3412     case INDEX_op_extract2_i32:
3413     case INDEX_op_extract2_i64:
3414         return C_O1_I2(r, 0, r);
3415
3416     case INDEX_op_deposit_i32:
3417     case INDEX_op_deposit_i64:
3418         return C_O1_I2(q, 0, qi);
3419
3420     case INDEX_op_setcond_i32:
3421     case INDEX_op_setcond_i64:
3422     case INDEX_op_negsetcond_i32:
3423     case INDEX_op_negsetcond_i64:
3424         return C_O1_I2(q, r, re);
3425
3426     case INDEX_op_movcond_i32:
3427     case INDEX_op_movcond_i64:
3428         return C_O1_I4(r, r, re, r, 0);
3429
3430     case INDEX_op_div2_i32:
3431     case INDEX_op_div2_i64:
3432     case INDEX_op_divu2_i32:
3433     case INDEX_op_divu2_i64:
3434         return C_O2_I3(a, d, 0, 1, r);
3435
3436     case INDEX_op_mulu2_i32:
3437     case INDEX_op_mulu2_i64:
3438     case INDEX_op_muls2_i32:
3439     case INDEX_op_muls2_i64:
3440         return C_O2_I2(a, d, a, r);
3441
3442     case INDEX_op_add2_i32:
3443     case INDEX_op_add2_i64:
3444     case INDEX_op_sub2_i32:
3445     case INDEX_op_sub2_i64:
3446         return C_N1_O1_I4(r, r, 0, 1, re, re);
3447
3448     case INDEX_op_ctz_i32:
3449     case INDEX_op_ctz_i64:
3450         return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3451
3452     case INDEX_op_clz_i32:
3453     case INDEX_op_clz_i64:
3454         return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3455
3456     case INDEX_op_qemu_ld_a32_i32:
3457         return C_O1_I1(r, L);
3458     case INDEX_op_qemu_ld_a64_i32:
3459         return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L);
3460
3461     case INDEX_op_qemu_st_a32_i32:
3462         return C_O0_I2(L, L);
3463     case INDEX_op_qemu_st_a64_i32:
3464         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3465     case INDEX_op_qemu_st8_a32_i32:
3466         return C_O0_I2(s, L);
3467     case INDEX_op_qemu_st8_a64_i32:
3468         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L);
3469
3470     case INDEX_op_qemu_ld_a32_i64:
3471         return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);
3472     case INDEX_op_qemu_ld_a64_i64:
3473         return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L);
3474
3475     case INDEX_op_qemu_st_a32_i64:
3476         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3477     case INDEX_op_qemu_st_a64_i64:
3478         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
3479
3480     case INDEX_op_qemu_ld_a32_i128:
3481     case INDEX_op_qemu_ld_a64_i128:
3482         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3483         return C_O2_I1(r, r, L);
3484     case INDEX_op_qemu_st_a32_i128:
3485     case INDEX_op_qemu_st_a64_i128:
3486         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3487         return C_O0_I3(L, L, L);
3488
3489     case INDEX_op_brcond2_i32:
3490         return C_O0_I4(r, r, ri, ri);
3491
3492     case INDEX_op_setcond2_i32:
3493         return C_O1_I4(r, r, r, ri, ri);
3494
3495     case INDEX_op_ld_vec:
3496     case INDEX_op_dupm_vec:
3497         return C_O1_I1(x, r);
3498
3499     case INDEX_op_st_vec:
3500         return C_O0_I2(x, r);
3501
3502     case INDEX_op_add_vec:
3503     case INDEX_op_sub_vec:
3504     case INDEX_op_mul_vec:
3505     case INDEX_op_and_vec:
3506     case INDEX_op_or_vec:
3507     case INDEX_op_xor_vec:
3508     case INDEX_op_andc_vec:
3509     case INDEX_op_orc_vec:
3510     case INDEX_op_nand_vec:
3511     case INDEX_op_nor_vec:
3512     case INDEX_op_eqv_vec:
3513     case INDEX_op_ssadd_vec:
3514     case INDEX_op_usadd_vec:
3515     case INDEX_op_sssub_vec:
3516     case INDEX_op_ussub_vec:
3517     case INDEX_op_smin_vec:
3518     case INDEX_op_umin_vec:
3519     case INDEX_op_smax_vec:
3520     case INDEX_op_umax_vec:
3521     case INDEX_op_shlv_vec:
3522     case INDEX_op_shrv_vec:
3523     case INDEX_op_sarv_vec:
3524     case INDEX_op_rotlv_vec:
3525     case INDEX_op_rotrv_vec:
3526     case INDEX_op_shls_vec:
3527     case INDEX_op_shrs_vec:
3528     case INDEX_op_sars_vec:
3529     case INDEX_op_cmp_vec:
3530     case INDEX_op_x86_shufps_vec:
3531     case INDEX_op_x86_blend_vec:
3532     case INDEX_op_x86_packss_vec:
3533     case INDEX_op_x86_packus_vec:
3534     case INDEX_op_x86_vperm2i128_vec:
3535     case INDEX_op_x86_punpckl_vec:
3536     case INDEX_op_x86_punpckh_vec:
3537     case INDEX_op_x86_vpshldi_vec:
3538 #if TCG_TARGET_REG_BITS == 32
3539     case INDEX_op_dup2_vec:
3540 #endif
3541         return C_O1_I2(x, x, x);
3542
3543     case INDEX_op_abs_vec:
3544     case INDEX_op_dup_vec:
3545     case INDEX_op_not_vec:
3546     case INDEX_op_shli_vec:
3547     case INDEX_op_shri_vec:
3548     case INDEX_op_sari_vec:
3549     case INDEX_op_rotli_vec:
3550     case INDEX_op_x86_psrldq_vec:
3551         return C_O1_I1(x, x);
3552
3553     case INDEX_op_x86_vpshldv_vec:
3554     case INDEX_op_x86_vpshrdv_vec:
3555         return C_O1_I3(x, 0, x, x);
3556
3557     case INDEX_op_bitsel_vec:
3558     case INDEX_op_x86_vpblendvb_vec:
3559         return C_O1_I3(x, x, x, x);
3560
3561     default:
3562         g_assert_not_reached();
3563     }
3564 }
3565
3566 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3567 {
3568     switch (opc) {
3569     case INDEX_op_add_vec:
3570     case INDEX_op_sub_vec:
3571     case INDEX_op_and_vec:
3572     case INDEX_op_or_vec:
3573     case INDEX_op_xor_vec:
3574     case INDEX_op_andc_vec:
3575     case INDEX_op_orc_vec:
3576     case INDEX_op_nand_vec:
3577     case INDEX_op_nor_vec:
3578     case INDEX_op_eqv_vec:
3579     case INDEX_op_not_vec:
3580     case INDEX_op_bitsel_vec:
3581         return 1;
3582     case INDEX_op_cmp_vec:
3583     case INDEX_op_cmpsel_vec:
3584         return -1;
3585
3586     case INDEX_op_rotli_vec:
3587         return have_avx512vl && vece >= MO_32 ? 1 : -1;
3588
3589     case INDEX_op_shli_vec:
3590     case INDEX_op_shri_vec:
3591         /* We must expand the operation for MO_8.  */
3592         return vece == MO_8 ? -1 : 1;
3593
3594     case INDEX_op_sari_vec:
3595         switch (vece) {
3596         case MO_8:
3597             return -1;
3598         case MO_16:
3599         case MO_32:
3600             return 1;
3601         case MO_64:
3602             if (have_avx512vl) {
3603                 return 1;
3604             }
3605             /*
3606              * We can emulate this for MO_64, but it does not pay off
3607              * unless we're producing at least 4 values.
3608              */
3609             return type >= TCG_TYPE_V256 ? -1 : 0;
3610         }
3611         return 0;
3612
3613     case INDEX_op_shls_vec:
3614     case INDEX_op_shrs_vec:
3615         return vece >= MO_16;
3616     case INDEX_op_sars_vec:
3617         switch (vece) {
3618         case MO_16:
3619         case MO_32:
3620             return 1;
3621         case MO_64:
3622             return have_avx512vl;
3623         }
3624         return 0;
3625     case INDEX_op_rotls_vec:
3626         return vece >= MO_16 ? -1 : 0;
3627
3628     case INDEX_op_shlv_vec:
3629     case INDEX_op_shrv_vec:
3630         switch (vece) {
3631         case MO_16:
3632             return have_avx512bw;
3633         case MO_32:
3634         case MO_64:
3635             return have_avx2;
3636         }
3637         return 0;
3638     case INDEX_op_sarv_vec:
3639         switch (vece) {
3640         case MO_16:
3641             return have_avx512bw;
3642         case MO_32:
3643             return have_avx2;
3644         case MO_64:
3645             return have_avx512vl;
3646         }
3647         return 0;
3648     case INDEX_op_rotlv_vec:
3649     case INDEX_op_rotrv_vec:
3650         switch (vece) {
3651         case MO_16:
3652             return have_avx512vbmi2 ? -1 : 0;
3653         case MO_32:
3654         case MO_64:
3655             return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3656         }
3657         return 0;
3658
3659     case INDEX_op_mul_vec:
3660         switch (vece) {
3661         case MO_8:
3662             return -1;
3663         case MO_64:
3664             return have_avx512dq;
3665         }
3666         return 1;
3667
3668     case INDEX_op_ssadd_vec:
3669     case INDEX_op_usadd_vec:
3670     case INDEX_op_sssub_vec:
3671     case INDEX_op_ussub_vec:
3672         return vece <= MO_16;
3673     case INDEX_op_smin_vec:
3674     case INDEX_op_smax_vec:
3675     case INDEX_op_umin_vec:
3676     case INDEX_op_umax_vec:
3677     case INDEX_op_abs_vec:
3678         return vece <= MO_32 || have_avx512vl;
3679
3680     default:
3681         return 0;
3682     }
3683 }
3684
3685 static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3686                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3687 {
3688     TCGv_vec t1, t2;
3689
3690     tcg_debug_assert(vece == MO_8);
3691
3692     t1 = tcg_temp_new_vec(type);
3693     t2 = tcg_temp_new_vec(type);
3694
3695     /*
3696      * Unpack to W, shift, and repack.  Tricky bits:
3697      * (1) Use punpck*bw x,x to produce DDCCBBAA,
3698      *     i.e. duplicate in other half of the 16-bit lane.
3699      * (2) For right-shift, add 8 so that the high half of the lane
3700      *     becomes zero.  For left-shift, and left-rotate, we must
3701      *     shift up and down again.
3702      * (3) Step 2 leaves high half zero such that PACKUSWB
3703      *     (pack with unsigned saturation) does not modify
3704      *     the quantity.
3705      */
3706     vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3707               tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3708     vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3709               tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3710
3711     if (opc != INDEX_op_rotli_vec) {
3712         imm += 8;
3713     }
3714     if (opc == INDEX_op_shri_vec) {
3715         tcg_gen_shri_vec(MO_16, t1, t1, imm);
3716         tcg_gen_shri_vec(MO_16, t2, t2, imm);
3717     } else {
3718         tcg_gen_shli_vec(MO_16, t1, t1, imm);
3719         tcg_gen_shli_vec(MO_16, t2, t2, imm);
3720         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3721         tcg_gen_shri_vec(MO_16, t2, t2, 8);
3722     }
3723
3724     vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3725               tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3726     tcg_temp_free_vec(t1);
3727     tcg_temp_free_vec(t2);
3728 }
3729
3730 static void expand_vec_sari(TCGType type, unsigned vece,
3731                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3732 {
3733     TCGv_vec t1, t2;
3734
3735     switch (vece) {
3736     case MO_8:
3737         /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3738         t1 = tcg_temp_new_vec(type);
3739         t2 = tcg_temp_new_vec(type);
3740         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3741                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3742         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3743                   tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3744         tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3745         tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3746         vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3747                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3748         tcg_temp_free_vec(t1);
3749         tcg_temp_free_vec(t2);
3750         break;
3751
3752     case MO_64:
3753         t1 = tcg_temp_new_vec(type);
3754         if (imm <= 32) {
3755             /*
3756              * We can emulate a small sign extend by performing an arithmetic
3757              * 32-bit shift and overwriting the high half of a 64-bit logical
3758              * shift.  Note that the ISA says shift of 32 is valid, but TCG
3759              * does not, so we have to bound the smaller shift -- we get the
3760              * same result in the high half either way.
3761              */
3762             tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3763             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3764             vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3765                       tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3766                       tcgv_vec_arg(t1), 0xaa);
3767         } else {
3768             /* Otherwise we will need to use a compare vs 0 to produce
3769              * the sign-extend, shift and merge.
3770              */
3771             tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
3772                             tcg_constant_vec(type, MO_64, 0), v1);
3773             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3774             tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3775             tcg_gen_or_vec(MO_64, v0, v0, t1);
3776         }
3777         tcg_temp_free_vec(t1);
3778         break;
3779
3780     default:
3781         g_assert_not_reached();
3782     }
3783 }
3784
3785 static void expand_vec_rotli(TCGType type, unsigned vece,
3786                              TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3787 {
3788     TCGv_vec t;
3789
3790     if (vece == MO_8) {
3791         expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3792         return;
3793     }
3794
3795     if (have_avx512vbmi2) {
3796         vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3797                   tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3798         return;
3799     }
3800
3801     t = tcg_temp_new_vec(type);
3802     tcg_gen_shli_vec(vece, t, v1, imm);
3803     tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3804     tcg_gen_or_vec(vece, v0, v0, t);
3805     tcg_temp_free_vec(t);
3806 }
3807
3808 static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3809                             TCGv_vec v1, TCGv_vec sh, bool right)
3810 {
3811     TCGv_vec t;
3812
3813     if (have_avx512vbmi2) {
3814         vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3815                   type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3816                   tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3817         return;
3818     }
3819
3820     t = tcg_temp_new_vec(type);
3821     tcg_gen_dupi_vec(vece, t, 8 << vece);
3822     tcg_gen_sub_vec(vece, t, t, sh);
3823     if (right) {
3824         tcg_gen_shlv_vec(vece, t, v1, t);
3825         tcg_gen_shrv_vec(vece, v0, v1, sh);
3826     } else {
3827         tcg_gen_shrv_vec(vece, t, v1, t);
3828         tcg_gen_shlv_vec(vece, v0, v1, sh);
3829     }
3830     tcg_gen_or_vec(vece, v0, v0, t);
3831     tcg_temp_free_vec(t);
3832 }
3833
3834 static void expand_vec_rotls(TCGType type, unsigned vece,
3835                              TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3836 {
3837     TCGv_vec t = tcg_temp_new_vec(type);
3838
3839     tcg_debug_assert(vece != MO_8);
3840
3841     if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3842         tcg_gen_dup_i32_vec(vece, t, lsh);
3843         if (vece >= MO_32) {
3844             tcg_gen_rotlv_vec(vece, v0, v1, t);
3845         } else {
3846             expand_vec_rotv(type, vece, v0, v1, t, false);
3847         }
3848     } else {
3849         TCGv_i32 rsh = tcg_temp_new_i32();
3850
3851         tcg_gen_neg_i32(rsh, lsh);
3852         tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3853         tcg_gen_shls_vec(vece, t, v1, lsh);
3854         tcg_gen_shrs_vec(vece, v0, v1, rsh);
3855         tcg_gen_or_vec(vece, v0, v0, t);
3856
3857         tcg_temp_free_i32(rsh);
3858     }
3859
3860     tcg_temp_free_vec(t);
3861 }
3862
3863 static void expand_vec_mul(TCGType type, unsigned vece,
3864                            TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3865 {
3866     TCGv_vec t1, t2, t3, t4, zero;
3867
3868     tcg_debug_assert(vece == MO_8);
3869
3870     /*
3871      * Unpack v1 bytes to words, 0 | x.
3872      * Unpack v2 bytes to words, y | 0.
3873      * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3874      * Shift logical right by 8 bits to clear the high 8 bytes before
3875      * using an unsigned saturated pack.
3876      *
3877      * The difference between the V64, V128 and V256 cases is merely how
3878      * we distribute the expansion between temporaries.
3879      */
3880     switch (type) {
3881     case TCG_TYPE_V64:
3882         t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3883         t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3884         zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3885         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3886                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3887         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3888                   tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3889         tcg_gen_mul_vec(MO_16, t1, t1, t2);
3890         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3891         vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3892                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3893         tcg_temp_free_vec(t1);
3894         tcg_temp_free_vec(t2);
3895         break;
3896
3897     case TCG_TYPE_V128:
3898     case TCG_TYPE_V256:
3899         t1 = tcg_temp_new_vec(type);
3900         t2 = tcg_temp_new_vec(type);
3901         t3 = tcg_temp_new_vec(type);
3902         t4 = tcg_temp_new_vec(type);
3903         zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3904         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3905                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3906         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3907                   tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3908         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3909                   tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3910         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3911                   tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3912         tcg_gen_mul_vec(MO_16, t1, t1, t2);
3913         tcg_gen_mul_vec(MO_16, t3, t3, t4);
3914         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3915         tcg_gen_shri_vec(MO_16, t3, t3, 8);
3916         vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3917                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3918         tcg_temp_free_vec(t1);
3919         tcg_temp_free_vec(t2);
3920         tcg_temp_free_vec(t3);
3921         tcg_temp_free_vec(t4);
3922         break;
3923
3924     default:
3925         g_assert_not_reached();
3926     }
3927 }
3928
3929 static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3930                                  TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3931 {
3932     enum {
3933         NEED_INV  = 1,
3934         NEED_SWAP = 2,
3935         NEED_BIAS = 4,
3936         NEED_UMIN = 8,
3937         NEED_UMAX = 16,
3938     };
3939     TCGv_vec t1, t2, t3;
3940     uint8_t fixup;
3941
3942     switch (cond) {
3943     case TCG_COND_EQ:
3944     case TCG_COND_GT:
3945         fixup = 0;
3946         break;
3947     case TCG_COND_NE:
3948     case TCG_COND_LE:
3949         fixup = NEED_INV;
3950         break;
3951     case TCG_COND_LT:
3952         fixup = NEED_SWAP;
3953         break;
3954     case TCG_COND_GE:
3955         fixup = NEED_SWAP | NEED_INV;
3956         break;
3957     case TCG_COND_LEU:
3958         if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3959             fixup = NEED_UMIN;
3960         } else {
3961             fixup = NEED_BIAS | NEED_INV;
3962         }
3963         break;
3964     case TCG_COND_GTU:
3965         if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3966             fixup = NEED_UMIN | NEED_INV;
3967         } else {
3968             fixup = NEED_BIAS;
3969         }
3970         break;
3971     case TCG_COND_GEU:
3972         if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3973             fixup = NEED_UMAX;
3974         } else {
3975             fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3976         }
3977         break;
3978     case TCG_COND_LTU:
3979         if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3980             fixup = NEED_UMAX | NEED_INV;
3981         } else {
3982             fixup = NEED_BIAS | NEED_SWAP;
3983         }
3984         break;
3985     default:
3986         g_assert_not_reached();
3987     }
3988
3989     if (fixup & NEED_INV) {
3990         cond = tcg_invert_cond(cond);
3991     }
3992     if (fixup & NEED_SWAP) {
3993         t1 = v1, v1 = v2, v2 = t1;
3994         cond = tcg_swap_cond(cond);
3995     }
3996
3997     t1 = t2 = NULL;
3998     if (fixup & (NEED_UMIN | NEED_UMAX)) {
3999         t1 = tcg_temp_new_vec(type);
4000         if (fixup & NEED_UMIN) {
4001             tcg_gen_umin_vec(vece, t1, v1, v2);
4002         } else {
4003             tcg_gen_umax_vec(vece, t1, v1, v2);
4004         }
4005         v2 = t1;
4006         cond = TCG_COND_EQ;
4007     } else if (fixup & NEED_BIAS) {
4008         t1 = tcg_temp_new_vec(type);
4009         t2 = tcg_temp_new_vec(type);
4010         t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
4011         tcg_gen_sub_vec(vece, t1, v1, t3);
4012         tcg_gen_sub_vec(vece, t2, v2, t3);
4013         v1 = t1;
4014         v2 = t2;
4015         cond = tcg_signed_cond(cond);
4016     }
4017
4018     tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
4019     /* Expand directly; do not recurse.  */
4020     vec_gen_4(INDEX_op_cmp_vec, type, vece,
4021               tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
4022
4023     if (t1) {
4024         tcg_temp_free_vec(t1);
4025         if (t2) {
4026             tcg_temp_free_vec(t2);
4027         }
4028     }
4029     return fixup & NEED_INV;
4030 }
4031
4032 static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
4033                            TCGv_vec v1, TCGv_vec v2, TCGCond cond)
4034 {
4035     if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
4036         tcg_gen_not_vec(vece, v0, v0);
4037     }
4038 }
4039
4040 static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
4041                               TCGv_vec c1, TCGv_vec c2,
4042                               TCGv_vec v3, TCGv_vec v4, TCGCond cond)
4043 {
4044     TCGv_vec t = tcg_temp_new_vec(type);
4045
4046     if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
4047         /* Invert the sense of the compare by swapping arguments.  */
4048         TCGv_vec x;
4049         x = v3, v3 = v4, v4 = x;
4050     }
4051     vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
4052               tcgv_vec_arg(v0), tcgv_vec_arg(v4),
4053               tcgv_vec_arg(v3), tcgv_vec_arg(t));
4054     tcg_temp_free_vec(t);
4055 }
4056
4057 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
4058                        TCGArg a0, ...)
4059 {
4060     va_list va;
4061     TCGArg a2;
4062     TCGv_vec v0, v1, v2, v3, v4;
4063
4064     va_start(va, a0);
4065     v0 = temp_tcgv_vec(arg_temp(a0));
4066     v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4067     a2 = va_arg(va, TCGArg);
4068
4069     switch (opc) {
4070     case INDEX_op_shli_vec:
4071     case INDEX_op_shri_vec:
4072         expand_vec_shi(type, vece, opc, v0, v1, a2);
4073         break;
4074
4075     case INDEX_op_sari_vec:
4076         expand_vec_sari(type, vece, v0, v1, a2);
4077         break;
4078
4079     case INDEX_op_rotli_vec:
4080         expand_vec_rotli(type, vece, v0, v1, a2);
4081         break;
4082
4083     case INDEX_op_rotls_vec:
4084         expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
4085         break;
4086
4087     case INDEX_op_rotlv_vec:
4088         v2 = temp_tcgv_vec(arg_temp(a2));
4089         expand_vec_rotv(type, vece, v0, v1, v2, false);
4090         break;
4091     case INDEX_op_rotrv_vec:
4092         v2 = temp_tcgv_vec(arg_temp(a2));
4093         expand_vec_rotv(type, vece, v0, v1, v2, true);
4094         break;
4095
4096     case INDEX_op_mul_vec:
4097         v2 = temp_tcgv_vec(arg_temp(a2));
4098         expand_vec_mul(type, vece, v0, v1, v2);
4099         break;
4100
4101     case INDEX_op_cmp_vec:
4102         v2 = temp_tcgv_vec(arg_temp(a2));
4103         expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
4104         break;
4105
4106     case INDEX_op_cmpsel_vec:
4107         v2 = temp_tcgv_vec(arg_temp(a2));
4108         v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4109         v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4110         expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
4111         break;
4112
4113     default:
4114         break;
4115     }
4116
4117     va_end(va);
4118 }
4119
4120 static const int tcg_target_callee_save_regs[] = {
4121 #if TCG_TARGET_REG_BITS == 64
4122     TCG_REG_RBP,
4123     TCG_REG_RBX,
4124 #if defined(_WIN64)
4125     TCG_REG_RDI,
4126     TCG_REG_RSI,
4127 #endif
4128     TCG_REG_R12,
4129     TCG_REG_R13,
4130     TCG_REG_R14, /* Currently used for the global env. */
4131     TCG_REG_R15,
4132 #else
4133     TCG_REG_EBP, /* Currently used for the global env. */
4134     TCG_REG_EBX,
4135     TCG_REG_ESI,
4136     TCG_REG_EDI,
4137 #endif
4138 };
4139
/*
 * The frame size is computed with macros so that the same value can be
 * used by both tcg_target_qemu_prologue and tcg_register_jit.
 */

/*
 * Bytes occupied by the pushed callee-saved registers plus one extra
 * register-sized slot.
 */
#define PUSH_SIZE \
    ((TCG_TARGET_REG_BITS / 8) \
     * (ARRAY_SIZE(tcg_target_callee_save_regs) + 1))

/*
 * PUSH_SIZE plus the static call-argument area and the TCG temp buffer,
 * rounded up to a multiple of TCG_TARGET_STACK_ALIGN.
 */
#define FRAME_SIZE \
    ((PUSH_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + (TCG_TARGET_STACK_ALIGN - 1)) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
4153
4154 /* Generate global QEMU prologue and epilogue code */
4155 static void tcg_target_qemu_prologue(TCGContext *s)
4156 {
4157     int i, stack_addend;
4158
4159     /* TB prologue */
4160
4161     /* Reserve some stack space, also for TCG temps.  */
4162     stack_addend = FRAME_SIZE - PUSH_SIZE;
4163     tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
4164                   CPU_TEMP_BUF_NLONGS * sizeof(long));
4165
4166     /* Save all callee saved registers.  */
4167     for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
4168         tcg_out_push(s, tcg_target_callee_save_regs[i]);
4169     }
4170
4171     if (!tcg_use_softmmu && guest_base) {
4172         int seg = setup_guest_base_seg();
4173         if (seg != 0) {
4174             x86_guest_base.seg = seg;
4175         } else if (guest_base == (int32_t)guest_base) {
4176             x86_guest_base.ofs = guest_base;
4177         } else {
4178             assert(TCG_TARGET_REG_BITS == 64);
4179             /* Choose R12 because, as a base, it requires a SIB byte. */
4180             x86_guest_base.index = TCG_REG_R12;
4181             tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
4182             tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
4183         }
4184     }
4185
4186     if (TCG_TARGET_REG_BITS == 32) {
4187         tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
4188                    (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
4189         tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4190         /* jmp *tb.  */
4191         tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
4192                              (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
4193                              + stack_addend);
4194     } else {
4195         tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
4196         tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4197         /* jmp *tb.  */
4198         tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
4199     }
4200
4201     /*
4202      * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
4203      * and fall through to the rest of the epilogue.
4204      */
4205     tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
4206     tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
4207
4208     /* TB epilogue */
4209     tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
4210
4211     tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
4212
4213     if (have_avx2) {
4214         tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
4215     }
4216     for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
4217         tcg_out_pop(s, tcg_target_callee_save_regs[i]);
4218     }
4219     tcg_out_opc(s, OPC_RET, 0, 0, 0);
4220 }
4221
4222 static void tcg_out_tb_start(TCGContext *s)
4223 {
4224     /* nothing to do */
4225 }
4226
4227 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
4228 {
4229     memset(p, 0x90, count);
4230 }
4231
4232 static void tcg_target_init(TCGContext *s)
4233 {
4234     tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4235     if (TCG_TARGET_REG_BITS == 64) {
4236         tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4237     }
4238     if (have_avx1) {
4239         tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4240         tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4241     }
4242     if (have_avx2) {
4243         tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4244     }
4245
4246     tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4247     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4248     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4249     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4250     if (TCG_TARGET_REG_BITS == 64) {
4251 #if !defined(_WIN64)
4252         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4253         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4254 #endif
4255         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4256         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4257         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4258         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4259     }
4260
4261     s->reserved_regs = 0;
4262     tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4263     tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
4264 #ifdef _WIN64
4265     /* These are call saved, and we don't save them, so don't use them. */
4266     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4267     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4268     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4269     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4270     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4271     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4272     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4273     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4274     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4275     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4276 #endif
4277 }
4278
4279 typedef struct {
4280     DebugFrameHeader h;
4281     uint8_t fde_def_cfa[4];
4282     uint8_t fde_reg_ofs[14];
4283 } DebugFrame;
4284
4285 /* We're expecting a 2 byte uleb128 encoded value.  */
4286 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
4287
4288 #if !defined(__ELF__)
4289     /* Host machine without ELF. */
4290 #elif TCG_TARGET_REG_BITS == 64
4291 #define ELF_HOST_MACHINE EM_X86_64
4292 static const DebugFrame debug_frame = {
4293     .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4294     .h.cie.id = -1,
4295     .h.cie.version = 1,
4296     .h.cie.code_align = 1,
4297     .h.cie.data_align = 0x78,             /* sleb128 -8 */
4298     .h.cie.return_column = 16,
4299
4300     /* Total FDE size does not include the "len" member.  */
4301     .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4302
4303     .fde_def_cfa = {
4304         12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
4305         (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4306         (FRAME_SIZE >> 7)
4307     },
4308     .fde_reg_ofs = {
4309         0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
4310         /* The following ordering must match tcg_target_callee_save_regs.  */
4311         0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
4312         0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
4313         0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
4314         0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
4315         0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
4316         0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
4317     }
4318 };
4319 #else
4320 #define ELF_HOST_MACHINE EM_386
4321 static const DebugFrame debug_frame = {
4322     .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4323     .h.cie.id = -1,
4324     .h.cie.version = 1,
4325     .h.cie.code_align = 1,
4326     .h.cie.data_align = 0x7c,             /* sleb128 -4 */
4327     .h.cie.return_column = 8,
4328
4329     /* Total FDE size does not include the "len" member.  */
4330     .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4331
4332     .fde_def_cfa = {
4333         12, 4,                          /* DW_CFA_def_cfa %esp, ... */
4334         (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4335         (FRAME_SIZE >> 7)
4336     },
4337     .fde_reg_ofs = {
4338         0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
4339         /* The following ordering must match tcg_target_callee_save_regs.  */
4340         0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
4341         0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
4342         0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
4343         0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
4344     }
4345 };
4346 #endif
4347
#ifdef ELF_HOST_MACHINE
/*
 * Register the code buffer @buf of @buf_size bytes together with this
 * host's unwind data by forwarding to tcg_register_jit_int().
 */
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif /* ELF_HOST_MACHINE */