tcg/i386/tcg-target.c.inc

   1 /*
   2  * Tiny Code Generator for QEMU
   3  *
   4  * Copyright (c) 2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "../tcg-ldst.c.inc"
  26 #include "../tcg-pool.c.inc"
  27
#ifdef CONFIG_DEBUG_TCG
/*
 * Printable names of the host registers; only built when CONFIG_DEBUG_TCG
 * is defined.  The table is sized TCG_TARGET_NB_REGS and lists, in order:
 * the eight legacy general registers (spelled %r.. or %e.. depending on
 * TCG_TARGET_REG_BITS), %r8-%r15 (listed unconditionally), then the xmm
 * registers (xmm8-xmm15 only for 64-bit).
 * NOTE(review): presumably indexed by TCGReg value, so the order here
 * must match the TCG_REG_* numbering -- confirm against tcg-target.h.
 */
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif
  43
/*
 * List of host registers, general registers first and vector registers
 * after them.  The stack pointer is not listed.
 * NOTE(review): presumably this is the preference order used by the
 * common register allocator (earlier entries tried first) -- confirm
 * against the allocator in tcg.c.
 */
static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saved (they must be preserved
       for our caller), and we do not save any of them.  Therefore only
       allow xmm0-xmm5 to be allocated on Win64.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};
  93
/* NOTE(review): presumably the vector register the backend keeps as a
   scratch temporary; its uses are outside this part of the file -- confirm. */
#define TCG_TMP_VEC  TCG_REG_XMM5
  95
/*
 * Integer argument registers for calls, in argument order.
 *   64-bit, _WIN64:  RCX, RDX, R8, R9
 *   64-bit, others:  RDI, RSI, RDX, RCX, R8, R9
 *   32-bit:          none (the table is empty)
 */
static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};
 113
 114 static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
 115 {
 116     switch (kind) {
 117     case TCG_CALL_RET_NORMAL:
 118         tcg_debug_assert(slot >= 0 && slot <= 1);
 119         return slot ? TCG_REG_EDX : TCG_REG_EAX;
 120 #ifdef _WIN64
 121     case TCG_CALL_RET_BY_VEC:
 122         tcg_debug_assert(slot == 0);
 123         return TCG_REG_XMM0;
 124 #endif
 125     default:
 126         g_assert_not_reached();
 127     }
 128 }
 129
/*
 * Constants we accept.
 * These constraint bits are tested by tcg_target_const_match():
 *   S32 - the value is unchanged by conversion to int32_t
 *   U32 - the value is unchanged by conversion to uint32_t
 *   I32 - the bitwise inverse of the value is unchanged by conversion
 *         to int32_t
 *   WSZ - the value equals the operation width (32 or 64)
 * For TCG_TYPE_I32 any constant satisfies S32, U32 and I32.
 */
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800
 135
/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   i386.  On x86_64 these expand to reads of tcg_target_call_iarg_regs[],
   so they are not integer constant expressions there. */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif
 146
/*
 * Register sets expressed as bitmasks.
 * NOTE(review): presumably one bit per TCGReg number -- confirm.
 *   ALL_GENERAL_REGS - low 16 bits on 64-bit hosts, low 8 bits on 32-bit.
 *   ALL_VECTOR_REGS  - bits 16 and up (16 bits on 64-bit, 8 on 32-bit).
 *   ALL_BYTEH_REGS   - bits 0-3 only; presumably the registers that have
 *                      a high-byte (%ah style) form.
 *   ALL_BYTEL_REGS   - every general register on 64-bit, but only
 *                      bits 0-3 on 32-bit.
 */
#define ALL_BYTEH_REGS         0x0000000fu
#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS      0x0000ffffu
# define ALL_VECTOR_REGS       0xffff0000u
# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS      0x000000ffu
# define ALL_VECTOR_REGS       0x00ff0000u
# define ALL_BYTEL_REGS        ALL_BYTEH_REGS
#endif
/* With CONFIG_SOFTMMU this is the mask of TCG_REG_L0 and TCG_REG_L1;
   otherwise it is empty. */
#ifdef CONFIG_SOFTMMU
# define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
#else
# define SOFTMMU_RESERVE_REGS  0
#endif
 162
/* For 64-bit, we always know that CMOV is available.  On 32-bit it is
   tested through the CPUINFO_CMOV bit of cpuinfo.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov      true
#else
# define have_cmov      (cpuinfo & CPUINFO_CMOV)
#endif
/* BMI2 and LZCNT are always tested through cpuinfo.  These two expand to
   the masked bit itself, not to a normalized 0/1 value.  */
#define have_bmi2       (cpuinfo & CPUINFO_BMI2)
#define have_lzcnt      (cpuinfo & CPUINFO_LZCNT)
 171
/* NOTE(review): not assigned or read in this part of the file; presumably
   the address in generated code that translation blocks return to -- confirm. */
static const tcg_insn_unit *tb_ret_addr;
 173
 174 static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
 175                         intptr_t value, intptr_t addend)
 176 {
 177     value += addend;
 178     switch(type) {
 179     case R_386_PC32:
 180         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 181         if (value != (int32_t)value) {
 182             return false;
 183         }
 184         /* FALLTHRU */
 185     case R_386_32:
 186         tcg_patch32(code_ptr, value);
 187         break;
 188     case R_386_PC8:
 189         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 190         if (value != (int8_t)value) {
 191             return false;
 192         }
 193         tcg_patch8(code_ptr, value);
 194         break;
 195     default:
 196         g_assert_not_reached();
 197     }
 198     return true;
 199 }
 200
 201 /* test if a constant matches the constraint */
 202 static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 203 {
 204     if (ct & TCG_CT_CONST) {
 205         return 1;
 206     }
 207     if (type == TCG_TYPE_I32) {
 208         if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
 209             return 1;
 210         }
 211     } else {
 212         if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
 213             return 1;
 214         }
 215         if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
 216             return 1;
 217         }
 218         if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
 219             return 1;
 220         }
 221     }
 222     if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
 223         return 1;
 224     }
 225     return 0;
 226 }
 227
/* Low three bits of a register number: the part that is placed in a
   ModRM or SIB register field. */
# define LOWREGMASK(x)  ((x) & 7)

/*
 * Flags ORed into an opcode value above its low byte.  The opcode
 * emitters (tcg_out_opc, tcg_out_vex_opc, tcg_out_evex_opc) translate
 * them into prefix or escape bytes, or into fields of the VEX/EVEX
 * prefix.  On 32-bit hosts the REX-related flags and P_GS are defined
 * as 0, so ORing them in has no effect.
 */
#define P_EXT           0x100           /* 0x0f opcode prefix */
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400           /* 0x66 opcode prefix */
#define P_VEXW          0x1000          /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
# define P_REXB_R       0x2000          /* REG field as byte register */
# define P_REXB_RM      0x4000          /* R/M field as byte register */
# define P_GS           0x8000          /* gs segment override */
#else
# define P_REXW         0
# define P_REXB_R       0
# define P_REXB_RM      0
# define P_GS           0
#endif
#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
#define P_VEXL          0x80000         /* Set VEX.L = 1 */
#define P_EVEX          0x100000        /* Requires EVEX encoding */
 250
/*
 * Opcode values.  The low byte is the final opcode byte written by the
 * emitters; the P_* flags ORed in select the prefix and escape bytes (or
 * the VEX/EVEX prefix fields) that precede it.  Entries carrying P_EVEX
 * are routed to the EVEX encoder by tcg_out_vex_modrm().
 */
#define OPC_ARITH_EvIz  (0x81)
#define OPC_ARITH_EvIb  (0x83)
#define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP       (0xc8 | P_EXT)
#define OPC_CALL_Jz     (0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32     (0x48)
#define OPC_IMUL_GvEv   (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb (0x6b)
#define OPC_IMUL_GvEvIz (0x69)
#define OPC_INC_r32     (0x40)
#define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
#define OPC_JCC_short   (0x70)          /* ... plus condition code */
#define OPC_JMP_long    (0xe9)
#define OPC_JMP_short   (0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
#define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
#define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz   (0xc7)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL      (0xbe | P_EXT)
#define OPC_MOVSWL      (0xbf | P_EXT)
#define OPC_MOVSLQ      (0x63 | P_REXW)
#define OPC_MOVZBL      (0xb6 | P_EXT)
#define OPC_MOVZWL      (0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PEXTRD      (0x16 | P_EXT3A | P_DATA16)
#define OPC_PINSRD      (0x22 | P_EXT3A | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32     (0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32    (0x50)
#define OPC_PUSH_Iv     (0x68)
#define OPC_PUSH_Ib     (0x6a)
#define OPC_RET         (0xc3)
#define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1     (0xd1)
#define OPC_SHIFT_Ib    (0xc1)
#define OPC_SHIFT_cl    (0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTL       (0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)
#define OPC_XCHG_EvGv   (0x87)

/* Group opcodes: the operation is selected by an EXT3_* / EXT5_* value
   (or, for group 14, a digit) placed in the ModRM reg field. */
#define OPC_GRP3_Eb     (0xf6)
#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
 446
/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH_GvEv, shifted left
   by 3 (see OPC_ADD_GvEv and tgen_arithr).  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with
   OPC_GRP3_Eb and OPC_GRP3_Ev.  */
#define EXT3_TESTi 0
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev     0
#define EXT5_DEC_Ev     1
#define EXT5_CALLN_Ev   2
#define EXT5_JMPN_Ev    4

/* Condition codes to be added to OPC_JCC_{long,short}.
   NOTE(review): JCC_JMP (-1) is not an encodable condition; presumably a
   marker for an unconditional jump handled by the caller -- confirm.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf
 498
/*
 * Map a TCG_COND_* comparison to the x86 condition code (JCC_*) that
 * tests it.  Signed comparisons use JL/JGE/JLE/JG; unsigned comparisons
 * use JB/JAE/JBE/JA.  Conditions without an initializer here read as 0,
 * which is the value of JCC_JO.
 */
static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};
 511
 512 #if TCG_TARGET_REG_BITS == 64
 513 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 514 {
 515     int rex;
 516
 517     if (opc & P_GS) {
 518         tcg_out8(s, 0x65);
 519     }
 520     if (opc & P_DATA16) {
 521         /* We should never be asking for both 16 and 64-bit operation.  */
 522         tcg_debug_assert((opc & P_REXW) == 0);
 523         tcg_out8(s, 0x66);
 524     }
 525     if (opc & P_SIMDF3) {
 526         tcg_out8(s, 0xf3);
 527     } else if (opc & P_SIMDF2) {
 528         tcg_out8(s, 0xf2);
 529     }
 530
 531     rex = 0;
 532     rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
 533     rex |= (r & 8) >> 1;                /* REX.R */
 534     rex |= (x & 8) >> 2;                /* REX.X */
 535     rex |= (rm & 8) >> 3;               /* REX.B */
 536
 537     /* P_REXB_{R,RM} indicates that the given register is the low byte.
 538        For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
 539        as otherwise the encoding indicates %[abcd]h.  Note that the values
 540        that are ORed in merely indicate that the REX byte must be present;
 541        those bits get discarded in output.  */
 542     rex |= opc & (r >= 4 ? P_REXB_R : 0);
 543     rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
 544
 545     if (rex) {
 546         tcg_out8(s, (uint8_t)(rex | 0x40));
 547     }
 548
 549     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 550         tcg_out8(s, 0x0f);
 551         if (opc & P_EXT38) {
 552             tcg_out8(s, 0x38);
 553         } else if (opc & P_EXT3A) {
 554             tcg_out8(s, 0x3a);
 555         }
 556     }
 557
 558     tcg_out8(s, opc);
 559 }
 560 #else
 561 static void tcg_out_opc(TCGContext *s, int opc)
 562 {
 563     if (opc & P_DATA16) {
 564         tcg_out8(s, 0x66);
 565     }
 566     if (opc & P_SIMDF3) {
 567         tcg_out8(s, 0xf3);
 568     } else if (opc & P_SIMDF2) {
 569         tcg_out8(s, 0xf2);
 570     }
 571     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 572         tcg_out8(s, 0x0f);
 573         if (opc & P_EXT38) {
 574             tcg_out8(s, 0x38);
 575         } else if (opc & P_EXT3A) {
 576             tcg_out8(s, 0x3a);
 577         }
 578     }
 579     tcg_out8(s, opc);
 580 }
 581 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
 582    the 32-bit compilation paths.  This method works with all versions of gcc,
 583    whereas relying on optimization may not be able to exclude them.  */
 584 #define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
 585 #endif
 586
 587 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
 588 {
 589     tcg_out_opc(s, opc, r, rm, 0);
 590     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 591 }
 592
 593 static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
 594                             int rm, int index)
 595 {
 596     int tmp;
 597
 598     /* Use the two byte form if possible, which cannot encode
 599        VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
 600     if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
 601         && ((rm | index) & 8) == 0) {
 602         /* Two byte VEX prefix.  */
 603         tcg_out8(s, 0xc5);
 604
 605         tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
 606     } else {
 607         /* Three byte VEX prefix.  */
 608         tcg_out8(s, 0xc4);
 609
 610         /* VEX.m-mmmm */
 611         if (opc & P_EXT3A) {
 612             tmp = 3;
 613         } else if (opc & P_EXT38) {
 614             tmp = 2;
 615         } else if (opc & P_EXT) {
 616             tmp = 1;
 617         } else {
 618             g_assert_not_reached();
 619         }
 620         tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
 621         tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
 622         tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
 623         tcg_out8(s, tmp);
 624
 625         tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
 626     }
 627
 628     tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
 629     /* VEX.pp */
 630     if (opc & P_DATA16) {
 631         tmp |= 1;                          /* 0x66 */
 632     } else if (opc & P_SIMDF3) {
 633         tmp |= 2;                          /* 0xf3 */
 634     } else if (opc & P_SIMDF2) {
 635         tmp |= 3;                          /* 0xf2 */
 636     }
 637     tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
 638     tcg_out8(s, tmp);
 639     tcg_out8(s, opc);
 640 }
 641
 642 static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
 643                              int rm, int index)
 644 {
 645     /* The entire 4-byte evex prefix; with R' and V' set. */
 646     uint32_t p = 0x08041062;
 647     int mm, pp;
 648
 649     tcg_debug_assert(have_avx512vl);
 650
 651     /* EVEX.mm */
 652     if (opc & P_EXT3A) {
 653         mm = 3;
 654     } else if (opc & P_EXT38) {
 655         mm = 2;
 656     } else if (opc & P_EXT) {
 657         mm = 1;
 658     } else {
 659         g_assert_not_reached();
 660     }
 661
 662     /* EVEX.pp */
 663     if (opc & P_DATA16) {
 664         pp = 1;                          /* 0x66 */
 665     } else if (opc & P_SIMDF3) {
 666         pp = 2;                          /* 0xf3 */
 667     } else if (opc & P_SIMDF2) {
 668         pp = 3;                          /* 0xf2 */
 669     } else {
 670         pp = 0;
 671     }
 672
 673     p = deposit32(p, 8, 2, mm);
 674     p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
 675     p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
 676     p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
 677     p = deposit32(p, 16, 2, pp);
 678     p = deposit32(p, 19, 4, ~v);
 679     p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
 680     p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
 681
 682     tcg_out32(s, p);
 683     tcg_out8(s, opc);
 684 }
 685
 686 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
 687 {
 688     if (opc & P_EVEX) {
 689         tcg_out_evex_opc(s, opc, r, v, rm, 0);
 690     } else {
 691         tcg_out_vex_opc(s, opc, r, v, rm, 0);
 692     }
 693     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 694 }
 695
 696 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
 697    We handle either RM and INDEX missing with a negative value.  In 64-bit
 698    mode for absolute addresses, ~RM is the size of the immediate operand
 699    that will follow the instruction.  */
 700
 701 static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
 702                                int shift, intptr_t offset)
 703 {
 704     int mod, len;
 705
 706     if (index < 0 && rm < 0) {
 707         if (TCG_TARGET_REG_BITS == 64) {
 708             /* Try for a rip-relative addressing mode.  This has replaced
 709                the 32-bit-mode absolute addressing encoding.  */
 710             intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
 711             intptr_t disp = offset - pc;
 712             if (disp == (int32_t)disp) {
 713                 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
 714                 tcg_out32(s, disp);
 715                 return;
 716             }
 717
 718             /* Try for an absolute address encoding.  This requires the
 719                use of the MODRM+SIB encoding and is therefore larger than
 720                rip-relative addressing.  */
 721             if (offset == (int32_t)offset) {
 722                 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
 723                 tcg_out8(s, (4 << 3) | 5);
 724                 tcg_out32(s, offset);
 725                 return;
 726             }
 727
 728             /* ??? The memory isn't directly addressable.  */
 729             g_assert_not_reached();
 730         } else {
 731             /* Absolute address.  */
 732             tcg_out8(s, (r << 3) | 5);
 733             tcg_out32(s, offset);
 734             return;
 735         }
 736     }
 737
 738     /* Find the length of the immediate addend.  Note that the encoding
 739        that would be used for (%ebp) indicates absolute addressing.  */
 740     if (rm < 0) {
 741         mod = 0, len = 4, rm = 5;
 742     } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
 743         mod = 0, len = 0;
 744     } else if (offset == (int8_t)offset) {
 745         mod = 0x40, len = 1;
 746     } else {
 747         mod = 0x80, len = 4;
 748     }
 749
 750     /* Use a single byte MODRM format if possible.  Note that the encoding
 751        that would be used for %esp is the escape to the two byte form.  */
 752     if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
 753         /* Single byte MODRM format.  */
 754         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 755     } else {
 756         /* Two byte MODRM+SIB format.  */
 757
 758         /* Note that the encoding that would place %esp into the index
 759            field indicates no index register.  In 64-bit mode, the REX.X
 760            bit counts, so %r12 can be used as the index.  */
 761         if (index < 0) {
 762             index = 4;
 763         } else {
 764             tcg_debug_assert(index != TCG_REG_ESP);
 765         }
 766
 767         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
 768         tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
 769     }
 770
 771     if (len == 1) {
 772         tcg_out8(s, offset);
 773     } else if (len == 4) {
 774         tcg_out32(s, offset);
 775     }
 776 }
 777
 778 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
 779                                      int index, int shift, intptr_t offset)
 780 {
 781     tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 782     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 783 }
 784
 785 static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
 786                                          int rm, int index, int shift,
 787                                          intptr_t offset)
 788 {
 789     tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 790     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 791 }
 792
 793 /* A simplification of the above with no index or shift.  */
 794 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
 795                                         int rm, intptr_t offset)
 796 {
 797     tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
 798 }
 799
 800 static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
 801                                             int v, int rm, intptr_t offset)
 802 {
 803     tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
 804 }
 805
 806 /* Output an opcode with an expected reference to the constant pool.  */
 807 static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
 808 {
 809     tcg_out_opc(s, opc, r, 0, 0);
 810     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 811     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 812     tcg_out32(s, 0);
 813 }
 814
 815 /* Output an opcode with an expected reference to the constant pool.  */
 816 static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
 817 {
 818     tcg_out_vex_opc(s, opc, r, 0, 0, 0);
 819     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 820     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 821     tcg_out32(s, 0);
 822 }
 823
 824 /* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
 825 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
 826 {
 827     /* Propagate an opcode prefix, such as P_REXW.  */
 828     int ext = subop & ~0x7;
 829     subop &= 0x7;
 830
 831     tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
 832 }
 833
 834 static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 835 {
 836     int rexw = 0;
 837
 838     if (arg == ret) {
 839         return true;
 840     }
 841     switch (type) {
 842     case TCG_TYPE_I64:
 843         rexw = P_REXW;
 844         /* fallthru */
 845     case TCG_TYPE_I32:
 846         if (ret < 16) {
 847             if (arg < 16) {
 848                 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
 849             } else {
 850                 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
 851             }
 852         } else {
 853             if (arg < 16) {
 854                 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
 855             } else {
 856                 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 857             }
 858         }
 859         break;
 860
 861     case TCG_TYPE_V64:
 862         tcg_debug_assert(ret >= 16 && arg >= 16);
 863         tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 864         break;
 865     case TCG_TYPE_V128:
 866         tcg_debug_assert(ret >= 16 && arg >= 16);
 867         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
 868         break;
 869     case TCG_TYPE_V256:
 870         tcg_debug_assert(ret >= 16 && arg >= 16);
 871         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
 872         break;
 873
 874     default:
 875         g_assert_not_reached();
 876     }
 877     return true;
 878 }
 879
/*
 * AVX2 broadcast opcodes, indexed by the element-size argument (vece) of
 * tcg_out_dup_vec() and tcg_out_dupm_vec(): byte, word, dword, qword.
 */
static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};
 884
 885 static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
 886                             TCGReg r, TCGReg a)
 887 {
 888     if (have_avx2) {
 889         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 890         tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
 891     } else {
 892         switch (vece) {
 893         case MO_8:
 894             /* ??? With zero in a register, use PSHUFB.  */
 895             tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
 896             a = r;
 897             /* FALLTHRU */
 898         case MO_16:
 899             tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
 900             a = r;
 901             /* FALLTHRU */
 902         case MO_32:
 903             tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
 904             /* imm8 operand: all output lanes selected from input lane 0.  */
 905             tcg_out8(s, 0);
 906             break;
 907         case MO_64:
 908             tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
 909             break;
 910         default:
 911             g_assert_not_reached();
 912         }
 913     }
 914     return true;
 915 }
 916
 917 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
 918                              TCGReg r, TCGReg base, intptr_t offset)
 919 {
 920     if (have_avx2) {
 921         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 922         tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
 923                                  r, 0, base, offset);
 924     } else {
 925         switch (vece) {
 926         case MO_64:
 927             tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
 928             break;
 929         case MO_32:
 930             tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
 931             break;
 932         case MO_16:
 933             tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
 934             tcg_out8(s, 0); /* imm8 */
 935             tcg_out_dup_vec(s, type, vece, r, r);
 936             break;
 937         case MO_8:
 938             tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
 939             tcg_out8(s, 0); /* imm8 */
 940             tcg_out_dup_vec(s, type, vece, r, r);
 941             break;
 942         default:
 943             g_assert_not_reached();
 944         }
 945     }
 946     return true;
 947 }
 948
 949 static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
 950                              TCGReg ret, int64_t arg)
 951 {
 952     int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 953
 954     if (arg == 0) {
 955         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 956         return;
 957     }
 958     if (arg == -1) {
 959         tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
 960         return;
 961     }
 962
 963     if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
 964         if (have_avx2) {
 965             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
 966         } else {
 967             tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
 968         }
 969         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 970     } else {
 971         if (type == TCG_TYPE_V64) {
 972             tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
 973         } else if (have_avx2) {
 974             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
 975         } else {
 976             tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
 977         }
 978         if (TCG_TARGET_REG_BITS == 64) {
 979             new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
 980         } else {
 981             new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
 982         }
 983     }
 984 }
 985
 986 static void tcg_out_movi_vec(TCGContext *s, TCGType type,
 987                              TCGReg ret, tcg_target_long arg)
 988 {
 989     if (arg == 0) {
 990         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 991         return;
 992     }
 993     if (arg == -1) {
 994         tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
 995         return;
 996     }
 997
 998     int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
 999     tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1000     if (TCG_TARGET_REG_BITS == 64) {
1001         new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1002     } else {
1003         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1004     }
1005 }
1006
1007 static void tcg_out_movi_int(TCGContext *s, TCGType type,
1008                              TCGReg ret, tcg_target_long arg)
1009 {
1010     tcg_target_long diff;
1011
1012     if (arg == 0) {
1013         tgen_arithr(s, ARITH_XOR, ret, ret);
1014         return;
1015     }
1016     if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1017         tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1018         tcg_out32(s, arg);
1019         return;
1020     }
1021     if (arg == (int32_t)arg) {
1022         tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1023         tcg_out32(s, arg);
1024         return;
1025     }
1026
1027     /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1028     diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1029     if (diff == (int32_t)diff) {
1030         tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1031         tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1032         tcg_out32(s, diff);
1033         return;
1034     }
1035
1036     tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1037     tcg_out64(s, arg);
1038 }
1039
1040 static void tcg_out_movi(TCGContext *s, TCGType type,
1041                          TCGReg ret, tcg_target_long arg)
1042 {
1043     switch (type) {
1044     case TCG_TYPE_I32:
1045 #if TCG_TARGET_REG_BITS == 64
1046     case TCG_TYPE_I64:
1047 #endif
1048         if (ret < 16) {
1049             tcg_out_movi_int(s, type, ret, arg);
1050         } else {
1051             tcg_out_movi_vec(s, type, ret, arg);
1052         }
1053         break;
1054     default:
1055         g_assert_not_reached();
1056     }
1057 }
1058
1059 static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1060 {
1061     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1062     tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
1063     return true;
1064 }
1065
1066 static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1067                              tcg_target_long imm)
1068 {
1069     /* This function is only used for passing structs by reference. */
1070     tcg_debug_assert(imm == (int32_t)imm);
1071     tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
1072 }
1073
1074 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1075 {
1076     if (val == (int8_t)val) {
1077         tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1078         tcg_out8(s, val);
1079     } else if (val == (int32_t)val) {
1080         tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1081         tcg_out32(s, val);
1082     } else {
1083         g_assert_not_reached();
1084     }
1085 }
1086
1087 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1088 {
1089     /* Given the strength of x86 memory ordering, we only need care for
1090        store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1091        faster than "mfence", so don't bother with the sse insn.  */
1092     if (a0 & TCG_MO_ST_LD) {
1093         tcg_out8(s, 0xf0);
1094         tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1095         tcg_out8(s, 0);
1096     }
1097 }
1098
1099 static inline void tcg_out_push(TCGContext *s, int reg)
1100 {
1101     tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1102 }
1103
1104 static inline void tcg_out_pop(TCGContext *s, int reg)
1105 {
1106     tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1107 }
1108
1109 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1110                        TCGReg arg1, intptr_t arg2)
1111 {
1112     switch (type) {
1113     case TCG_TYPE_I32:
1114         if (ret < 16) {
1115             tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1116         } else {
1117             tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1118         }
1119         break;
1120     case TCG_TYPE_I64:
1121         if (ret < 16) {
1122             tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1123             break;
1124         }
1125         /* FALLTHRU */
1126     case TCG_TYPE_V64:
1127         /* There is no instruction that can validate 8-byte alignment.  */
1128         tcg_debug_assert(ret >= 16);
1129         tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1130         break;
1131     case TCG_TYPE_V128:
1132         /*
1133          * The gvec infrastructure is asserts that v128 vector loads
1134          * and stores use a 16-byte aligned offset.  Validate that the
1135          * final pointer is aligned by using an insn that will SIGSEGV.
1136          */
1137         tcg_debug_assert(ret >= 16);
1138         tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1139         break;
1140     case TCG_TYPE_V256:
1141         /*
1142          * The gvec infrastructure only requires 16-byte alignment,
1143          * so here we must use an unaligned load.
1144          */
1145         tcg_debug_assert(ret >= 16);
1146         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1147                                  ret, 0, arg1, arg2);
1148         break;
1149     default:
1150         g_assert_not_reached();
1151     }
1152 }
1153
1154 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1155                        TCGReg arg1, intptr_t arg2)
1156 {
1157     switch (type) {
1158     case TCG_TYPE_I32:
1159         if (arg < 16) {
1160             tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1161         } else {
1162             tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1163         }
1164         break;
1165     case TCG_TYPE_I64:
1166         if (arg < 16) {
1167             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1168             break;
1169         }
1170         /* FALLTHRU */
1171     case TCG_TYPE_V64:
1172         /* There is no instruction that can validate 8-byte alignment.  */
1173         tcg_debug_assert(arg >= 16);
1174         tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1175         break;
1176     case TCG_TYPE_V128:
1177         /*
1178          * The gvec infrastructure is asserts that v128 vector loads
1179          * and stores use a 16-byte aligned offset.  Validate that the
1180          * final pointer is aligned by using an insn that will SIGSEGV.
1181          *
1182          * This specific instance is also used by TCG_CALL_RET_BY_VEC,
1183          * for _WIN64, which must have SSE2 but may not have AVX.
1184          */
1185         tcg_debug_assert(arg >= 16);
1186         if (have_avx1) {
1187             tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1188         } else {
1189             tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
1190         }
1191         break;
1192     case TCG_TYPE_V256:
1193         /*
1194          * The gvec infrastructure only requires 16-byte alignment,
1195          * so here we must use an unaligned store.
1196          */
1197         tcg_debug_assert(arg >= 16);
1198         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1199                                  arg, 0, arg1, arg2);
1200         break;
1201     default:
1202         g_assert_not_reached();
1203     }
1204 }
1205
1206 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1207                         TCGReg base, intptr_t ofs)
1208 {
1209     int rexw = 0;
1210     if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1211         if (val != (int32_t)val) {
1212             return false;
1213         }
1214         rexw = P_REXW;
1215     } else if (type != TCG_TYPE_I32) {
1216         return false;
1217     }
1218     tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1219     tcg_out32(s, val);
1220     return true;
1221 }
1222
1223 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1224 {
1225     /* Propagate an opcode prefix, such as P_DATA16.  */
1226     int ext = subopc & ~0x7;
1227     subopc &= 0x7;
1228
1229     if (count == 1) {
1230         tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1231     } else {
1232         tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1233         tcg_out8(s, count);
1234     }
1235 }
1236
1237 static inline void tcg_out_bswap32(TCGContext *s, int reg)
1238 {
1239     tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1240 }
1241
1242 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1243 {
1244     tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1245 }
1246
1247 static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
1248 {
1249     /* movzbl */
1250     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1251     tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1252 }
1253
1254 static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1255 {
1256     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1257     /* movsbl */
1258     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1259     tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1260 }
1261
1262 static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
1263 {
1264     /* movzwl */
1265     tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1266 }
1267
1268 static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1269 {
1270     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1271     /* movsw[lq] */
1272     tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1273 }
1274
1275 static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
1276 {
1277     /* 32-bit mov zero extends.  */
1278     tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1279 }
1280
1281 static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
1282 {
1283     tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
1284     tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1285 }
1286
1287 static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1288 {
1289     tcg_out_ext32s(s, dest, src);
1290 }
1291
1292 static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1293 {
1294     if (dest != src) {
1295         tcg_out_ext32u(s, dest, src);
1296     }
1297 }
1298
1299 static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
1300 {
1301     tcg_out_ext32u(s, dest, src);
1302 }
1303
1304 static inline void tcg_out_bswap64(TCGContext *s, int reg)
1305 {
1306     tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1307 }
1308
1309 static void tgen_arithi(TCGContext *s, int c, int r0,
1310                         tcg_target_long val, int cf)
1311 {
1312     int rexw = 0;
1313
1314     if (TCG_TARGET_REG_BITS == 64) {
1315         rexw = c & -8;
1316         c &= 7;
1317     }
1318
1319     /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1320        partial flags update stalls on Pentium4 and are not recommended
1321        by current Intel optimization manuals.  */
1322     if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1323         int is_inc = (c == ARITH_ADD) ^ (val < 0);
1324         if (TCG_TARGET_REG_BITS == 64) {
1325             /* The single-byte increment encodings are re-tasked as the
1326                REX prefixes.  Use the MODRM encoding.  */
1327             tcg_out_modrm(s, OPC_GRP5 + rexw,
1328                           (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1329         } else {
1330             tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1331         }
1332         return;
1333     }
1334
1335     if (c == ARITH_AND) {
1336         if (TCG_TARGET_REG_BITS == 64) {
1337             if (val == 0xffffffffu) {
1338                 tcg_out_ext32u(s, r0, r0);
1339                 return;
1340             }
1341             if (val == (uint32_t)val) {
1342                 /* AND with no high bits set can use a 32-bit operation.  */
1343                 rexw = 0;
1344             }
1345         }
1346         if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1347             tcg_out_ext8u(s, r0, r0);
1348             return;
1349         }
1350         if (val == 0xffffu) {
1351             tcg_out_ext16u(s, r0, r0);
1352             return;
1353         }
1354     }
1355
1356     if (val == (int8_t)val) {
1357         tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1358         tcg_out8(s, val);
1359         return;
1360     }
1361     if (rexw == 0 || val == (int32_t)val) {
1362         tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1363         tcg_out32(s, val);
1364         return;
1365     }
1366
1367     g_assert_not_reached();
1368 }
1369
1370 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1371 {
1372     if (val != 0) {
1373         tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1374     }
1375 }
1376
1377 /* Set SMALL to force a short forward branch.  */
1378 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
1379 {
1380     int32_t val, val1;
1381
1382     if (l->has_value) {
1383         val = tcg_pcrel_diff(s, l->u.value_ptr);
1384         val1 = val - 2;
1385         if ((int8_t)val1 == val1) {
1386             if (opc == -1) {
1387                 tcg_out8(s, OPC_JMP_short);
1388             } else {
1389                 tcg_out8(s, OPC_JCC_short + opc);
1390             }
1391             tcg_out8(s, val1);
1392         } else {
1393             tcg_debug_assert(!small);
1394             if (opc == -1) {
1395                 tcg_out8(s, OPC_JMP_long);
1396                 tcg_out32(s, val - 5);
1397             } else {
1398                 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1399                 tcg_out32(s, val - 6);
1400             }
1401         }
1402     } else if (small) {
1403         if (opc == -1) {
1404             tcg_out8(s, OPC_JMP_short);
1405         } else {
1406             tcg_out8(s, OPC_JCC_short + opc);
1407         }
1408         tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1409         s->code_ptr += 1;
1410     } else {
1411         if (opc == -1) {
1412             tcg_out8(s, OPC_JMP_long);
1413         } else {
1414             tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1415         }
1416         tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1417         s->code_ptr += 4;
1418     }
1419 }
1420
1421 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1422                         int const_arg2, int rexw)
1423 {
1424     if (const_arg2) {
1425         if (arg2 == 0) {
1426             /* test r, r */
1427             tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1428         } else {
1429             tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1430         }
1431     } else {
1432         tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1433     }
1434 }
1435
1436 static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1437                              TCGArg arg1, TCGArg arg2, int const_arg2,
1438                              TCGLabel *label, int small)
1439 {
1440     tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1441     tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1442 }
1443
1444 #if TCG_TARGET_REG_BITS == 64
1445 static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1446                              TCGArg arg1, TCGArg arg2, int const_arg2,
1447                              TCGLabel *label, int small)
1448 {
1449     tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1450     tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1451 }
1452 #else
/* XXX: we implement it at the target level to avoid having to
   handle cross basic blocks temporaries */
/*
 * Branch on a comparison of two register pairs, built from 32-bit
 * compare-and-branch steps.
 *
 * Layout of ARGS as used below: args[0]/args[1] form the first operand,
 * args[2]/args[3] the second, args[4] is the condition and args[5] the
 * target label.  CONST_ARGS[2] and CONST_ARGS[3] say whether args[2] and
 * args[3] are constants.  The args[1]/args[3] pair is compared with the
 * condition's own signedness and the args[0]/args[2] pair always unsigned,
 * so presumably [1]/[3] are the high halves and [0]/[2] the low halves of
 * 64-bit values -- confirm against the caller.
 *
 * A local label (label_next) is created for the fall-through point and is
 * always reached with a forced short jump.
 */
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, int small)
{
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);

    switch(args[4]) {
    case TCG_COND_EQ:
        /* Equal only if both pairs are equal: bail out on the first
           mismatch, branch on the second match.  */
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_next, 1);
        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_NE:
        /* Not equal if either pair differs.  */
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_this, small);
        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    /*
     * The ordered comparisons all share one shape:
     *   1. branch to the target if the [1]/[3] pair strictly satisfies
     *      the condition;
     *   2. otherwise, if that pair is not equal (flags still hold the
     *      result of step 1), the condition is false: skip to label_next;
     *   3. otherwise decide on the [0]/[2] pair with an unsigned compare.
     */
    case TCG_COND_LT:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LE:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GT:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GE:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LTU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LEU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GTU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GEU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    default:
        g_assert_not_reached();
    }
    /* Fall-through point for the "condition is false" short jumps.  */
    tcg_out_label(s, label_next);
}
1535 #endif
1536
1537 static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1538                               TCGArg arg1, TCGArg arg2, int const_arg2)
1539 {
1540     tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1541     tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1542     tcg_out_ext8u(s, dest, dest);
1543 }
1544
1545 #if TCG_TARGET_REG_BITS == 64
1546 static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1547                               TCGArg arg1, TCGArg arg2, int const_arg2)
1548 {
1549     tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1550     tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1551     tcg_out_ext8u(s, dest, dest);
1552 }
1553 #else
1554 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1555                              const int *const_args)
1556 {
1557     TCGArg new_args[6];
1558     TCGLabel *label_true, *label_over;
1559
1560     memcpy(new_args, args+1, 5*sizeof(TCGArg));
1561
1562     if (args[0] == args[1] || args[0] == args[2]
1563         || (!const_args[3] && args[0] == args[3])
1564         || (!const_args[4] && args[0] == args[4])) {
1565         /* When the destination overlaps with one of the argument
1566            registers, don't do anything tricky.  */
1567         label_true = gen_new_label();
1568         label_over = gen_new_label();
1569
1570         new_args[5] = label_arg(label_true);
1571         tcg_out_brcond2(s, new_args, const_args+1, 1);
1572
1573         tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1574         tcg_out_jxx(s, JCC_JMP, label_over, 1);
1575         tcg_out_label(s, label_true);
1576
1577         tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1578         tcg_out_label(s, label_over);
1579     } else {
1580         /* When the destination does not overlap one of the arguments,
1581            clear the destination first, jump if cond false, and emit an
1582            increment in the true case.  This results in smaller code.  */
1583
1584         tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1585
1586         label_over = gen_new_label();
1587         new_args[4] = tcg_invert_cond(new_args[4]);
1588         new_args[5] = label_arg(label_over);
1589         tcg_out_brcond2(s, new_args, const_args+1, 1);
1590
1591         tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1592         tcg_out_label(s, label_over);
1593     }
1594 }
1595 #endif
1596
1597 static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1598                          TCGReg dest, TCGReg v1)
1599 {
1600     if (have_cmov) {
1601         tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1602     } else {
1603         TCGLabel *over = gen_new_label();
1604         tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1605         tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1606         tcg_out_label(s, over);
1607     }
1608 }
1609
1610 static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1611                               TCGReg c1, TCGArg c2, int const_c2,
1612                               TCGReg v1)
1613 {
1614     tcg_out_cmp(s, c1, c2, const_c2, 0);
1615     tcg_out_cmov(s, cond, 0, dest, v1);
1616 }
1617
1618 #if TCG_TARGET_REG_BITS == 64
1619 static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1620                               TCGReg c1, TCGArg c2, int const_c2,
1621                               TCGReg v1)
1622 {
1623     tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1624     tcg_out_cmov(s, cond, P_REXW, dest, v1);
1625 }
1626 #endif
1627
1628 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1629                         TCGArg arg2, bool const_a2)
1630 {
1631     if (have_bmi1) {
1632         tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1633         if (const_a2) {
1634             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1635         } else {
1636             tcg_debug_assert(dest != arg2);
1637             tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1638         }
1639     } else {
1640         tcg_debug_assert(dest != arg2);
1641         tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1642         tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1643     }
1644 }
1645
1646 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1647                         TCGArg arg2, bool const_a2)
1648 {
1649     if (have_lzcnt) {
1650         tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1651         if (const_a2) {
1652             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1653         } else {
1654             tcg_debug_assert(dest != arg2);
1655             tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1656         }
1657     } else {
1658         tcg_debug_assert(!const_a2);
1659         tcg_debug_assert(dest != arg1);
1660         tcg_debug_assert(dest != arg2);
1661
1662         /* Recall that the output of BSR is the index not the count.  */
1663         tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1664         tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1665
1666         /* Since we have destroyed the flags from BSR, we have to re-test.  */
1667         tcg_out_cmp(s, arg1, 0, 1, rexw);
1668         tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1669     }
1670 }
1671
1672 static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1673 {
1674     intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1675
1676     if (disp == (int32_t)disp) {
1677         tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1678         tcg_out32(s, disp);
1679     } else {
1680         /* rip-relative addressing into the constant pool.
1681            This is 6 + 8 = 14 bytes, as compared to using an
1682            immediate load 10 + 6 = 16 bytes, plus we may
1683            be able to re-use the pool constant for more calls.  */
1684         tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1685         tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1686         new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1687         tcg_out32(s, 0);
1688     }
1689 }
1690
1691 static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1692                          const TCGHelperInfo *info)
1693 {
1694     tcg_out_branch(s, 1, dest);
1695
1696 #ifndef _WIN32
1697     if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1698         /*
1699          * The sysv i386 abi for struct return places a reference as the
1700          * first argument of the stack, and pops that argument with the
1701          * return statement.  Since we want to retain the aligned stack
1702          * pointer for the callee, we do not want to actually push that
1703          * argument before the call but rely on the normal store to the
1704          * stack slot.  But we do need to compensate for the pop in order
1705          * to reset our correct stack pointer value.
1706          * Pushing a garbage value back onto the stack is quickest.
1707          */
1708         tcg_out_push(s, TCG_REG_EAX);
1709     }
1710 #endif
1711 }
1712
1713 static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1714 {
1715     tcg_out_branch(s, 0, dest);
1716 }
1717
1718 static void tcg_out_nopn(TCGContext *s, int n)
1719 {
1720     int i;
1721     /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1722      * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1723      * duplicate prefix, and all of the interesting recent cores can
1724      * decode and discard the duplicates in a single cycle.
1725      */
1726     tcg_debug_assert(n >= 1);
1727     for (i = 1; i < n; ++i) {
1728         tcg_out8(s, 0x66);
1729     }
1730     tcg_out8(s, 0x90);
1731 }
1732
1733 /* Test register R vs immediate bits I, setting Z flag for EQ/NE. */
1734 static void __attribute__((unused))
1735 tcg_out_testi(TCGContext *s, TCGReg r, uint32_t i)
1736 {
1737     /*
1738      * This is used for testing alignment, so we can usually use testb.
1739      * For i686, we have to use testl for %esi/%edi.
1740      */
1741     if (i <= 0xff && (TCG_TARGET_REG_BITS == 64 || r < 4)) {
1742         tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, r);
1743         tcg_out8(s, i);
1744     } else {
1745         tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, r);
1746         tcg_out32(s, i);
1747     }
1748 }
1749
/*
 * Host addressing mode for one guest memory access.  Filled in by
 * prepare_host_addr() and consumed by the qemu_ld/st "direct" emitters.
 */
typedef struct {
    /* Base register; prepare_host_addr() sets this to the guest addrlo. */
    TCGReg base;
    /*
     * Index register: TCG_REG_L0 for softmmu; -1 in the user-only default
     * (presumably meaning "no index register" -- confirm in the encoder).
     */
    int index;
    /* Constant displacement added to base/index. */
    int ofs;
    /* Value added to each memory opcode: 0, or a segment prefix like P_GS. */
    int seg;
    /* Atomicity and alignment computed by atom_and_align_for_opc(). */
    TCGAtomAlign aa;
} HostAddress;
1757
1758 bool tcg_target_has_memory_bswap(MemOp memop)
1759 {
1760     TCGAtomAlign aa;
1761
1762     if (!have_movbe) {
1763         return false;
1764     }
1765     if ((memop & MO_SIZE) < MO_128) {
1766         return true;
1767     }
1768
1769     /*
1770      * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
1771      * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
1772      */
1773     aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
1774     return aa.atom < MO_128;
1775 }
1776
1777 /*
1778  * Because i686 has no register parameters and because x86_64 has xchg
1779  * to handle addr/data register overlap, we have placed all input arguments
 * before we might need a scratch reg.
1781  *
1782  * Even then, a scratch is only needed for l->raddr.  Rather than expose
1783  * a general-purpose scratch when we don't actually know it's available,
1784  * use the ra_gen hook to load into RAX if needed.
1785  */
1786 #if TCG_TARGET_REG_BITS == 64
1787 static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
1788 {
1789     if (arg < 0) {
1790         arg = TCG_REG_RAX;
1791     }
1792     tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
1793     return arg;
1794 }
1795 static const TCGLdstHelperParam ldst_helper_param = {
1796     .ra_gen = ldst_ra_gen
1797 };
1798 #else
1799 static const TCGLdstHelperParam ldst_helper_param = { };
1800 #endif
1801
1802 static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
1803                                 TCGReg l, TCGReg h, TCGReg v)
1804 {
1805     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1806
1807     /* vpmov{d,q} %v, %l */
1808     tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
1809     /* vpextr{d,q} $1, %v, %h */
1810     tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
1811     tcg_out8(s, 1);
1812 }
1813
1814 static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
1815                                 TCGReg v, TCGReg l, TCGReg h)
1816 {
1817     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1818
1819     /* vmov{d,q} %l, %v */
1820     tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
1821     /* vpinsr{d,q} $1, %h, %v, %v */
1822     tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
1823     tcg_out8(s, 1);
1824 }
1825
1826 /*
1827  * Generate code for the slow path for a load at the end of block
1828  */
1829 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1830 {
1831     MemOp opc = get_memop(l->oi);
1832     tcg_insn_unit **label_ptr = &l->label_ptr[0];
1833
1834     /* resolve label address */
1835     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1836     if (label_ptr[1]) {
1837         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1838     }
1839
1840     tcg_out_ld_helper_args(s, l, &ldst_helper_param);
1841     tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
1842     tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
1843
1844     tcg_out_jmp(s, l->raddr);
1845     return true;
1846 }
1847
1848 /*
1849  * Generate code for the slow path for a store at the end of block
1850  */
1851 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1852 {
1853     MemOp opc = get_memop(l->oi);
1854     tcg_insn_unit **label_ptr = &l->label_ptr[0];
1855
1856     /* resolve label address */
1857     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1858     if (label_ptr[1]) {
1859         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1860     }
1861
1862     tcg_out_st_helper_args(s, l, &ldst_helper_param);
1863     tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);
1864
1865     tcg_out_jmp(s, l->raddr);
1866     return true;
1867 }
1868
1869 #ifndef CONFIG_SOFTMMU
1870 static HostAddress x86_guest_base = {
1871     .index = -1
1872 };
1873
1874 #if defined(__x86_64__) && defined(__linux__)
1875 # include <asm/prctl.h>
1876 # include <sys/prctl.h>
1877 int arch_prctl(int code, unsigned long addr);
1878 static inline int setup_guest_base_seg(void)
1879 {
1880     if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1881         return P_GS;
1882     }
1883     return 0;
1884 }
1885 #elif defined(__x86_64__) && \
1886       (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
1887 # include <machine/sysarch.h>
1888 static inline int setup_guest_base_seg(void)
1889 {
1890     if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1891         return P_GS;
1892     }
1893     return 0;
1894 }
1895 #else
1896 static inline int setup_guest_base_seg(void)
1897 {
1898     return 0;
1899 }
1900 #endif /* setup_guest_base_seg */
1901 #endif /* !SOFTMMU */
1902
/*
 * NOTE(review): presumably the smallest TLB mask/table offset this backend
 * accepts, checked by common code behind tlb_mask_table_ofs(); INT_MIN
 * would then mean no lower bound is imposed -- confirm against that helper.
 */
#define MIN_TLB_MASK_TABLE_OFS  INT_MIN
1904
1905 /*
1906  * For softmmu, perform the TLB load and compare.
1907  * For useronly, perform any required alignment tests.
1908  * In both cases, return a TCGLabelQemuLdst structure if the slow path
1909  * is required and fill in @h with the host address for the fast path.
1910  */
1911 static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
1912                                            TCGReg addrlo, TCGReg addrhi,
1913                                            MemOpIdx oi, bool is_ld)
1914 {
1915     TCGLabelQemuLdst *ldst = NULL;
1916     MemOp opc = get_memop(oi);
1917     MemOp s_bits = opc & MO_SIZE;
1918     unsigned a_mask;
1919
1920 #ifdef CONFIG_SOFTMMU
1921     h->index = TCG_REG_L0;
1922     h->ofs = 0;
1923     h->seg = 0;
1924 #else
1925     *h = x86_guest_base;
1926 #endif
1927     h->base = addrlo;
1928     h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
1929     a_mask = (1 << h->aa.align) - 1;
1930
1931 #ifdef CONFIG_SOFTMMU
1932     int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
1933                         : offsetof(CPUTLBEntry, addr_write);
1934     TCGType ttype = TCG_TYPE_I32;
1935     TCGType tlbtype = TCG_TYPE_I32;
1936     int trexw = 0, hrexw = 0, tlbrexw = 0;
1937     unsigned mem_index = get_mmuidx(oi);
1938     unsigned s_mask = (1 << s_bits) - 1;
1939     int fast_ofs = tlb_mask_table_ofs(s, mem_index);
1940     int tlb_mask;
1941
1942     ldst = new_ldst_label(s);
1943     ldst->is_ld = is_ld;
1944     ldst->oi = oi;
1945     ldst->addrlo_reg = addrlo;
1946     ldst->addrhi_reg = addrhi;
1947
1948     if (TCG_TARGET_REG_BITS == 64) {
1949         ttype = s->addr_type;
1950         trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
1951         if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1952             hrexw = P_REXW;
1953             if (s->page_bits + s->tlb_dyn_max_bits > 32) {
1954                 tlbtype = TCG_TYPE_I64;
1955                 tlbrexw = P_REXW;
1956             }
1957         }
1958     }
1959
1960     tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
1961     tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
1962                    s->page_bits - CPU_TLB_ENTRY_BITS);
1963
1964     tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
1965                          fast_ofs + offsetof(CPUTLBDescFast, mask));
1966
1967     tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
1968                          fast_ofs + offsetof(CPUTLBDescFast, table));
1969
1970     /*
1971      * If the required alignment is at least as large as the access, simply
1972      * copy the address and mask.  For lesser alignments, check that we don't
1973      * cross pages for the complete access.
1974      */
1975     if (a_mask >= s_mask) {
1976         tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
1977     } else {
1978         tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
1979                              addrlo, s_mask - a_mask);
1980     }
1981     tlb_mask = s->page_mask | a_mask;
1982     tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);
1983
1984     /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
1985     tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
1986                          TCG_REG_L1, TCG_REG_L0, cmp_ofs);
1987
1988     /* jne slow_path */
1989     tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1990     ldst->label_ptr[0] = s->code_ptr;
1991     s->code_ptr += 4;
1992
1993     if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) {
1994         /* cmp 4(TCG_REG_L0), addrhi */
1995         tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, TCG_REG_L0, cmp_ofs + 4);
1996
1997         /* jne slow_path */
1998         tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1999         ldst->label_ptr[1] = s->code_ptr;
2000         s->code_ptr += 4;
2001     }
2002
2003     /* TLB Hit.  */
2004     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
2005                offsetof(CPUTLBEntry, addend));
2006 #else
2007     if (a_mask) {
2008         ldst = new_ldst_label(s);
2009
2010         ldst->is_ld = is_ld;
2011         ldst->oi = oi;
2012         ldst->addrlo_reg = addrlo;
2013         ldst->addrhi_reg = addrhi;
2014
2015         tcg_out_testi(s, addrlo, a_mask);
2016         /* jne slow_path */
2017         tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2018         ldst->label_ptr[0] = s->code_ptr;
2019         s->code_ptr += 4;
2020     }
2021 #endif
2022
2023     return ldst;
2024 }
2025
/*
 * Emit the fast-path guest load described by @memop from host address @h.
 *
 * @datalo: destination register (low half for two-register results).
 * @datahi: destination for the high half; used for 64-bit loads on 32-bit
 *          hosts and for 128-bit loads.
 * @type:   TCG type of the result; selects REX.W for sign-extending loads.
 *
 * Byte-swapped loads (MO_BSWAP) require MOVBE, which is asserted.
 */
static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   HostAddress h, TCGType type, MemOp memop)
{
    bool use_movbe = false;
    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    int movop = OPC_MOVL_GvEv;

    /* Do big-endian loads with movbe.  */
    if (memop & MO_BSWAP) {
        tcg_debug_assert(have_movbe);
        use_movbe = true;
        movop = OPC_MOVBE_GyMy;
    }

    switch (memop & MO_SSIZE) {
    case MO_UB:
        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_SB:
        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_UW:
        if (use_movbe) {
            /* There is no extending movbe; only low 16-bits are modified.  */
            if (datalo != h.base && datalo != h.index) {
                /* XOR breaks dependency chains.  */
                tgen_arithr(s, ARITH_XOR, datalo, datalo);
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                         datalo, h.base, h.index, 0, h.ofs);
            } else {
                /*
                 * datalo is part of the address, so it cannot be cleared
                 * first; zero-extend after the load instead.
                 */
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                         datalo, h.base, h.index, 0, h.ofs);
                tcg_out_ext16u(s, datalo, datalo);
            }
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        }
        break;
    case MO_SW:
        if (use_movbe) {
            /* Load the swapped 16 bits, then sign-extend separately. */
            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                     datalo, h.base, h.index, 0, h.ofs);
            tcg_out_ext16s(s, type, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
                                     datalo, h.base, h.index, 0, h.ofs);
        }
        break;
    case MO_UL:
        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
#if TCG_TARGET_REG_BITS == 64
    case MO_SL:
        if (use_movbe) {
            /* Load the swapped 32 bits, then sign-extend separately. */
            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_ext32s(s, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        }
        break;
#endif
    case MO_UQ:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            break;
        }
        /* 32-bit host: two 4-byte loads; halves trade places when swapped. */
        if (use_movbe) {
            TCGReg t = datalo;
            datalo = datahi;
            datahi = t;
        }
        if (h.base == datalo || h.index == datalo) {
            /*
             * The first load would overwrite an address register, so form
             * the full address in datahi and load both halves through it.
             */
            tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
            tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
        } else {
            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 4);
        }
        break;

    case MO_128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);

        /*
         * Without 16-byte atomicity, use integer regs.
         * That is where we want the data, and it allows bswaps.
         */
        if (h.aa.atom < MO_128) {
            if (use_movbe) {
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            if (h.base == datalo || h.index == datalo) {
                /* Same address-clobber avoidance as the MO_UQ case above. */
                tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
                                         h.base, h.index, 0, h.ofs);
                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
                                     datalo, datahi, 0);
                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
                                     datahi, datahi, 8);
            } else {
                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                         h.base, h.index, 0, h.ofs);
                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
                                         h.base, h.index, 0, h.ofs + 8);
            }
            break;
        }

        /*
         * With 16-byte atomicity, a vector load is required.
         * If we already have 16-byte alignment, then VMOVDQA always works.
         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
         * Else we require a runtime test for alignment for VMOVDQA;
         * use VMOVDQU on the unaligned nonatomic path for simplicity.
         */
        if (h.aa.align >= MO_128) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else {
            /* l1: unaligned path; l2: join point after either load. */
            TCGLabel *l1 = gen_new_label();
            TCGLabel *l2 = gen_new_label();

            tcg_out_testi(s, h.base, 15);
            tcg_out_jxx(s, JCC_JNE, l1, true);

            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_jxx(s, JCC_JMP, l2, true);

            tcg_out_label(s, l1);
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_label(s, l2);
        }
        /* Move the loaded vector into the integer register pair. */
        tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
        break;

    default:
        g_assert_not_reached();
    }
}
2186
2187 static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2188                             TCGReg addrlo, TCGReg addrhi,
2189                             MemOpIdx oi, TCGType data_type)
2190 {
2191     TCGLabelQemuLdst *ldst;
2192     HostAddress h;
2193
2194     ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
2195     tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));
2196
2197     if (ldst) {
2198         ldst->type = data_type;
2199         ldst->datalo_reg = datalo;
2200         ldst->datahi_reg = datahi;
2201         ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2202     }
2203 }
2204
/*
 * Emit the fast-path guest store described by @memop to host address @h.
 *
 * @datalo: source register (low half for two-register values).
 * @datahi: source for the high half; used for 64-bit stores on 32-bit
 *          hosts and for 128-bit stores.
 *
 * Byte-swapped stores (MO_BSWAP) require MOVBE, which is asserted.
 */
static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   HostAddress h, MemOp memop)
{
    bool use_movbe = false;
    int movop = OPC_MOVL_EvGv;

    /*
     * Do big-endian stores with movbe or softmmu.
     * User-only without movbe will have its swapping done generically.
     */
    if (memop & MO_BSWAP) {
        tcg_debug_assert(have_movbe);
        use_movbe = true;
        movop = OPC_MOVBE_MyGy;
    }

    switch (memop & MO_SIZE) {
    case MO_8:
        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
                                 datalo, h.base, h.index, 0, h.ofs);
        break;
    case MO_16:
        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_32:
        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_64:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        } else {
            /* 32-bit host: two 4-byte stores; halves trade places if swapped. */
            if (use_movbe) {
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 4);
        }
        break;

    case MO_128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);

        /*
         * Without 16-byte atomicity, use integer regs.
         * That is where we have the data, and it allows bswaps.
         */
        if (h.aa.atom < MO_128) {
            if (use_movbe) {
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 8);
            break;
        }

        /*
         * With 16-byte atomicity, a vector store is required.
         * If we already have 16-byte alignment, then VMOVDQA always works.
         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
         * Else we require a runtime test for alignment for VMOVDQA;
         * use VMOVDQU on the unaligned nonatomic path for simplicity.
         */
        /* Gather the integer register pair into the temporary vector. */
        tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
        if (h.aa.align >= MO_128) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else {
            /* l1: unaligned path; l2: join point after either store. */
            TCGLabel *l1 = gen_new_label();
            TCGLabel *l2 = gen_new_label();

            tcg_out_testi(s, h.base, 15);
            tcg_out_jxx(s, JCC_JNE, l1, true);

            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_jxx(s, JCC_JMP, l2, true);

            tcg_out_label(s, l1);
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_label(s, l2);
        }
        break;

    default:
        g_assert_not_reached();
    }
}
2313
2314 static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2315                             TCGReg addrlo, TCGReg addrhi,
2316                             MemOpIdx oi, TCGType data_type)
2317 {
2318     TCGLabelQemuLdst *ldst;
2319     HostAddress h;
2320
2321     ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
2322     tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));
2323
2324     if (ldst) {
2325         ldst->type = data_type;
2326         ldst->datalo_reg = datalo;
2327         ldst->datahi_reg = datahi;
2328         ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2329     }
2330 }
2331
2332 static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2333 {
2334     /* Reuse the zeroing that exists for goto_ptr.  */
2335     if (a0 == 0) {
2336         tcg_out_jmp(s, tcg_code_gen_epilogue);
2337     } else {
2338         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2339         tcg_out_jmp(s, tb_ret_addr);
2340     }
2341 }
2342
2343 static void tcg_out_goto_tb(TCGContext *s, int which)
2344 {
2345     /*
2346      * Jump displacement must be aligned for atomic patching;
2347      * see if we need to add extra nops before jump
2348      */
2349     int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2350     if (gap != 1) {
2351         tcg_out_nopn(s, gap - 1);
2352     }
2353     tcg_out8(s, OPC_JMP_long); /* jmp im */
2354     set_jmp_insn_offset(s, which);
2355     tcg_out32(s, 0);
2356     set_jmp_reset_offset(s, which);
2357 }
2358
2359 void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2360                               uintptr_t jmp_rx, uintptr_t jmp_rw)
2361 {
2362     /* patch the branch destination */
2363     uintptr_t addr = tb->jmp_target_addr[n];
2364     qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2365     /* no need to flush icache explicitly */
2366 }
2367
2368 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2369                               const TCGArg args[TCG_MAX_OP_ARGS],
2370                               const int const_args[TCG_MAX_OP_ARGS])
2371 {
2372     TCGArg a0, a1, a2;
2373     int c, const_a2, vexop, rexw = 0;
2374
2375 #if TCG_TARGET_REG_BITS == 64
2376 # define OP_32_64(x) \
2377         case glue(glue(INDEX_op_, x), _i64): \
2378             rexw = P_REXW; /* FALLTHRU */    \
2379         case glue(glue(INDEX_op_, x), _i32)
2380 #else
2381 # define OP_32_64(x) \
2382         case glue(glue(INDEX_op_, x), _i32)
2383 #endif
2384
2385     /* Hoist the loads of the most common arguments.  */
2386     a0 = args[0];
2387     a1 = args[1];
2388     a2 = args[2];
2389     const_a2 = const_args[2];
2390
2391     switch (opc) {
2392     case INDEX_op_goto_ptr:
2393         /* jmp to the given host address (could be epilogue) */
2394         tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2395         break;
2396     case INDEX_op_br:
2397         tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2398         break;
2399     OP_32_64(ld8u):
2400         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2401         tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2402         break;
2403     OP_32_64(ld8s):
2404         tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2405         break;
2406     OP_32_64(ld16u):
2407         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2408         tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2409         break;
2410     OP_32_64(ld16s):
2411         tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2412         break;
2413 #if TCG_TARGET_REG_BITS == 64
2414     case INDEX_op_ld32u_i64:
2415 #endif
2416     case INDEX_op_ld_i32:
2417         tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2418         break;
2419
2420     OP_32_64(st8):
2421         if (const_args[0]) {
2422             tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2423             tcg_out8(s, a0);
2424         } else {
2425             tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2426         }
2427         break;
2428     OP_32_64(st16):
2429         if (const_args[0]) {
2430             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2431             tcg_out16(s, a0);
2432         } else {
2433             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2434         }
2435         break;
2436 #if TCG_TARGET_REG_BITS == 64
2437     case INDEX_op_st32_i64:
2438 #endif
2439     case INDEX_op_st_i32:
2440         if (const_args[0]) {
2441             tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2442             tcg_out32(s, a0);
2443         } else {
2444             tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2445         }
2446         break;
2447
2448     OP_32_64(add):
2449         /* For 3-operand addition, use LEA.  */
2450         if (a0 != a1) {
2451             TCGArg c3 = 0;
2452             if (const_a2) {
2453                 c3 = a2, a2 = -1;
2454             } else if (a0 == a2) {
2455                 /* Watch out for dest = src + dest, since we've removed
2456                    the matching constraint on the add.  */
2457                 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2458                 break;
2459             }
2460
2461             tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2462             break;
2463         }
2464         c = ARITH_ADD;
2465         goto gen_arith;
2466     OP_32_64(sub):
2467         c = ARITH_SUB;
2468         goto gen_arith;
2469     OP_32_64(and):
2470         c = ARITH_AND;
2471         goto gen_arith;
2472     OP_32_64(or):
2473         c = ARITH_OR;
2474         goto gen_arith;
2475     OP_32_64(xor):
2476         c = ARITH_XOR;
2477         goto gen_arith;
2478     gen_arith:
2479         if (const_a2) {
2480             tgen_arithi(s, c + rexw, a0, a2, 0);
2481         } else {
2482             tgen_arithr(s, c + rexw, a0, a2);
2483         }
2484         break;
2485
2486     OP_32_64(andc):
2487         if (const_a2) {
2488             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2489             tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2490         } else {
2491             tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2492         }
2493         break;
2494
2495     OP_32_64(mul):
2496         if (const_a2) {
2497             int32_t val;
2498             val = a2;
2499             if (val == (int8_t)val) {
2500                 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2501                 tcg_out8(s, val);
2502             } else {
2503                 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2504                 tcg_out32(s, val);
2505             }
2506         } else {
2507             tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2508         }
2509         break;
2510
2511     OP_32_64(div2):
2512         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2513         break;
2514     OP_32_64(divu2):
2515         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2516         break;
2517
2518     OP_32_64(shl):
2519         /* For small constant 3-operand shift, use LEA.  */
2520         if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2521             if (a2 - 1 == 0) {
2522                 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2523                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2524             } else {
2525                 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2526                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2527             }
2528             break;
2529         }
2530         c = SHIFT_SHL;
2531         vexop = OPC_SHLX;
2532         goto gen_shift_maybe_vex;
2533     OP_32_64(shr):
2534         c = SHIFT_SHR;
2535         vexop = OPC_SHRX;
2536         goto gen_shift_maybe_vex;
2537     OP_32_64(sar):
2538         c = SHIFT_SAR;
2539         vexop = OPC_SARX;
2540         goto gen_shift_maybe_vex;
2541     OP_32_64(rotl):
2542         c = SHIFT_ROL;
2543         goto gen_shift;
2544     OP_32_64(rotr):
2545         c = SHIFT_ROR;
2546         goto gen_shift;
2547     gen_shift_maybe_vex:
2548         if (have_bmi2) {
2549             if (!const_a2) {
2550                 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2551                 break;
2552             }
2553             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2554         }
2555         /* FALLTHRU */
2556     gen_shift:
2557         if (const_a2) {
2558             tcg_out_shifti(s, c + rexw, a0, a2);
2559         } else {
2560             tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2561         }
2562         break;
2563
2564     OP_32_64(ctz):
2565         tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2566         break;
2567     OP_32_64(clz):
2568         tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2569         break;
2570     OP_32_64(ctpop):
2571         tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2572         break;
2573
2574     case INDEX_op_brcond_i32:
2575         tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2576         break;
2577     case INDEX_op_setcond_i32:
2578         tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2579         break;
2580     case INDEX_op_movcond_i32:
2581         tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2582         break;
2583
2584     OP_32_64(bswap16):
2585         if (a2 & TCG_BSWAP_OS) {
2586             /* Output must be sign-extended. */
2587             if (rexw) {
2588                 tcg_out_bswap64(s, a0);
2589                 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2590             } else {
2591                 tcg_out_bswap32(s, a0);
2592                 tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2593             }
2594         } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2595             /* Output must be zero-extended, but input isn't. */
2596             tcg_out_bswap32(s, a0);
2597             tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2598         } else {
2599             tcg_out_rolw_8(s, a0);
2600         }
2601         break;
2602     OP_32_64(bswap32):
2603         tcg_out_bswap32(s, a0);
2604         if (rexw && (a2 & TCG_BSWAP_OS)) {
2605             tcg_out_ext32s(s, a0, a0);
2606         }
2607         break;
2608
2609     OP_32_64(neg):
2610         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2611         break;
2612     OP_32_64(not):
2613         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2614         break;
2615
2616     case INDEX_op_qemu_ld_a64_i32:
2617         if (TCG_TARGET_REG_BITS == 32) {
2618             tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2619             break;
2620         }
2621         /* fall through */
2622     case INDEX_op_qemu_ld_a32_i32:
2623         tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2624         break;
2625     case INDEX_op_qemu_ld_a32_i64:
2626         if (TCG_TARGET_REG_BITS == 64) {
2627             tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2628         } else {
2629             tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2630         }
2631         break;
2632     case INDEX_op_qemu_ld_a64_i64:
2633         if (TCG_TARGET_REG_BITS == 64) {
2634             tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2635         } else {
2636             tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2637         }
2638         break;
2639     case INDEX_op_qemu_ld_a32_i128:
2640     case INDEX_op_qemu_ld_a64_i128:
2641         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2642         tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2643         break;
2644
2645     case INDEX_op_qemu_st_a64_i32:
2646     case INDEX_op_qemu_st8_a64_i32:
2647         if (TCG_TARGET_REG_BITS == 32) {
2648             tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2649             break;
2650         }
2651         /* fall through */
2652     case INDEX_op_qemu_st_a32_i32:
2653     case INDEX_op_qemu_st8_a32_i32:
2654         tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2655         break;
2656     case INDEX_op_qemu_st_a32_i64:
2657         if (TCG_TARGET_REG_BITS == 64) {
2658             tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2659         } else {
2660             tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2661         }
2662         break;
2663     case INDEX_op_qemu_st_a64_i64:
2664         if (TCG_TARGET_REG_BITS == 64) {
2665             tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2666         } else {
2667             tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2668         }
2669         break;
2670     case INDEX_op_qemu_st_a32_i128:
2671     case INDEX_op_qemu_st_a64_i128:
2672         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2673         tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2674         break;
2675
2676     OP_32_64(mulu2):
2677         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2678         break;
2679     OP_32_64(muls2):
2680         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2681         break;
2682     OP_32_64(add2):
2683         if (const_args[4]) {
2684             tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2685         } else {
2686             tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2687         }
2688         if (const_args[5]) {
2689             tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2690         } else {
2691             tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2692         }
2693         break;
2694     OP_32_64(sub2):
2695         if (const_args[4]) {
2696             tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2697         } else {
2698             tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2699         }
2700         if (const_args[5]) {
2701             tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2702         } else {
2703             tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2704         }
2705         break;
2706
2707 #if TCG_TARGET_REG_BITS == 32
2708     case INDEX_op_brcond2_i32:
2709         tcg_out_brcond2(s, args, const_args, 0);
2710         break;
2711     case INDEX_op_setcond2_i32:
2712         tcg_out_setcond2(s, args, const_args);
2713         break;
2714 #else /* TCG_TARGET_REG_BITS == 64 */
2715     case INDEX_op_ld32s_i64:
2716         tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2717         break;
2718     case INDEX_op_ld_i64:
2719         tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2720         break;
2721     case INDEX_op_st_i64:
2722         if (const_args[0]) {
2723             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2724             tcg_out32(s, a0);
2725         } else {
2726             tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2727         }
2728         break;
2729
2730     case INDEX_op_brcond_i64:
2731         tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2732         break;
2733     case INDEX_op_setcond_i64:
2734         tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2735         break;
2736     case INDEX_op_movcond_i64:
2737         tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2738         break;
2739
2740     case INDEX_op_bswap64_i64:
2741         tcg_out_bswap64(s, a0);
2742         break;
2743     case INDEX_op_extrh_i64_i32:
2744         tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2745         break;
2746 #endif
2747
2748     OP_32_64(deposit):
2749         if (args[3] == 0 && args[4] == 8) {
2750             /* load bits 0..7 */
2751             tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2752         } else if (args[3] == 8 && args[4] == 8) {
2753             /* load bits 8..15 */
2754             tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2755         } else if (args[3] == 0 && args[4] == 16) {
2756             /* load bits 0..15 */
2757             tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2758         } else {
2759             g_assert_not_reached();
2760         }
2761         break;
2762
2763     case INDEX_op_extract_i64:
2764         if (a2 + args[3] == 32) {
2765             /* This is a 32-bit zero-extending right shift.  */
2766             tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2767             tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2768             break;
2769         }
2770         /* FALLTHRU */
2771     case INDEX_op_extract_i32:
2772         /* On the off-chance that we can use the high-byte registers.
2773            Otherwise we emit the same ext16 + shift pattern that we
2774            would have gotten from the normal tcg-op.c expansion.  */
2775         tcg_debug_assert(a2 == 8 && args[3] == 8);
2776         if (a1 < 4 && a0 < 8) {
2777             tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2778         } else {
2779             tcg_out_ext16u(s, a0, a1);
2780             tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2781         }
2782         break;
2783
2784     case INDEX_op_sextract_i32:
2785         /* We don't implement sextract_i64, as we cannot sign-extend to
2786            64-bits without using the REX prefix that explicitly excludes
2787            access to the high-byte registers.  */
2788         tcg_debug_assert(a2 == 8 && args[3] == 8);
2789         if (a1 < 4 && a0 < 8) {
2790             tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2791         } else {
2792             tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
2793             tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2794         }
2795         break;
2796
2797     OP_32_64(extract2):
2798         /* Note that SHRD outputs to the r/m operand.  */
2799         tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2800         tcg_out8(s, args[3]);
2801         break;
2802
2803     case INDEX_op_mb:
2804         tcg_out_mb(s, a0);
2805         break;
2806     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2807     case INDEX_op_mov_i64:
2808     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2809     case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
2810     case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
2811     case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
2812     case INDEX_op_ext8s_i64:
2813     case INDEX_op_ext8u_i32:
2814     case INDEX_op_ext8u_i64:
2815     case INDEX_op_ext16s_i32:
2816     case INDEX_op_ext16s_i64:
2817     case INDEX_op_ext16u_i32:
2818     case INDEX_op_ext16u_i64:
2819     case INDEX_op_ext32s_i64:
2820     case INDEX_op_ext32u_i64:
2821     case INDEX_op_ext_i32_i64:
2822     case INDEX_op_extu_i32_i64:
2823     case INDEX_op_extrl_i64_i32:
2824     default:
2825         g_assert_not_reached();
2826     }
2827
2828 #undef OP_32_64
2829 }
2830
/*
 * Emit host code for a single vector TCG opcode.
 *
 * @s:          code generation context, handed to every tcg_out_* emitter.
 * @opc:        the vector opcode to emit; selects the switch case below.
 * @vecl:       vector length selector; the operand type is computed as
 *              vecl + TCG_TYPE_V64, and most paths OR P_VEXL into the
 *              instruction when that type is TCG_TYPE_V256.
 * @vece:       element size; used as the index into the 4-entry opcode
 *              tables and compared against MO_8/MO_16/MO_32/MO_64.
 * @args:       opcode operands.  args[0..2] are cached in a0..a2; some
 *              cases additionally read args[3].
 * @const_args: not referenced by this function.
 *
 * INDEX_op_mov_vec, INDEX_op_dup_vec and any opcode without a case end in
 * g_assert_not_reached().
 */
2831 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2832                            unsigned vecl, unsigned vece,
2833                            const TCGArg args[TCG_MAX_OP_ARGS],
2834                            const int const_args[TCG_MAX_OP_ARGS])
2835 {
         /*
          * Opcode tables, one entry per element size, indexed by vece.
          * OPC_UD2 is the placeholder for an element size that has no
          * instruction in the table; the gen_simd and gen_simd_imm8 tails
          * assert that the placeholder was not selected.
          * NOTE(review): index 0 presumably corresponds to MO_8 and index 3
          * to MO_64 -- confirm against the MO_* definitions.
          */
2836     static int const add_insn[4] = {
2837         OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2838     };
2839     static int const ssadd_insn[4] = {
2840         OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2841     };
2842     static int const usadd_insn[4] = {
2843         OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2844     };
2845     static int const sub_insn[4] = {
2846         OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2847     };
2848     static int const sssub_insn[4] = {
2849         OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2850     };
2851     static int const ussub_insn[4] = {
2852         OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2853     };
2854     static int const mul_insn[4] = {
2855         OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
2856     };
2857     static int const shift_imm_insn[4] = {
2858         OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2859     };
2860     static int const cmpeq_insn[4] = {
2861         OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2862     };
2863     static int const cmpgt_insn[4] = {
2864         OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2865     };
2866     static int const punpckl_insn[4] = {
2867         OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2868     };
2869     static int const punpckh_insn[4] = {
2870         OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2871     };
2872     static int const packss_insn[4] = {
2873         OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2874     };
2875     static int const packus_insn[4] = {
2876         OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2877     };
2878     static int const smin_insn[4] = {
2879         OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
2880     };
2881     static int const smax_insn[4] = {
2882         OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
2883     };
2884     static int const umin_insn[4] = {
2885         OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
2886     };
2887     static int const umax_insn[4] = {
2888         OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
2889     };
2890     static int const rotlv_insn[4] = {
2891         OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
2892     };
2893     static int const rotrv_insn[4] = {
2894         OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
2895     };
2896     static int const shlv_insn[4] = {
2897         OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
2898     };
2899     static int const shrv_insn[4] = {
2900         OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
2901     };
2902     static int const sarv_insn[4] = {
2903         OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
2904     };
2905     static int const shls_insn[4] = {
2906         OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2907     };
2908     static int const shrs_insn[4] = {
2909         OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2910     };
2911     static int const sars_insn[4] = {
2912         OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
2913     };
2914     static int const vpshldi_insn[4] = {
2915         OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
2916     };
2917     static int const vpshldv_insn[4] = {
2918         OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
2919     };
2920     static int const vpshrdv_insn[4] = {
2921         OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
2922     };
2923     static int const abs_insn[4] = {
2924         OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
2925     };
2926
         /* vecl is an offset from TCG_TYPE_V64. */
2927     TCGType type = vecl + TCG_TYPE_V64;
         /*
          * insn: opcode (plus P_* flag bits) handed to tcg_out_vex_modrm.
          * sub:  reused for three purposes -- the comparison condition in
          *       cmp_vec, the value passed in the first operand slot at
          *       gen_shift, and the trailing immediate byte at gen_simd_imm8.
          */
2928     int insn, sub;
2929     TCGArg a0, a1, a2, a3;
2930
2931     a0 = args[0];
2932     a1 = args[1];
2933     a2 = args[2];
2934
2935     switch (opc) {
         /*
          * Operations that only need to pick an opcode and then share the
          * gen_simd tail, which emits (insn, a0, a1, a2).
          */
2936     case INDEX_op_add_vec:
2937         insn = add_insn[vece];
2938         goto gen_simd;
2939     case INDEX_op_ssadd_vec:
2940         insn = ssadd_insn[vece];
2941         goto gen_simd;
2942     case INDEX_op_usadd_vec:
2943         insn = usadd_insn[vece];
2944         goto gen_simd;
2945     case INDEX_op_sub_vec:
2946         insn = sub_insn[vece];
2947         goto gen_simd;
2948     case INDEX_op_sssub_vec:
2949         insn = sssub_insn[vece];
2950         goto gen_simd;
2951     case INDEX_op_ussub_vec:
2952         insn = ussub_insn[vece];
2953         goto gen_simd;
2954     case INDEX_op_mul_vec:
2955         insn = mul_insn[vece];
2956         goto gen_simd;
2957     case INDEX_op_and_vec:
2958         insn = OPC_PAND;
2959         goto gen_simd;
2960     case INDEX_op_or_vec:
2961         insn = OPC_POR;
2962         goto gen_simd;
2963     case INDEX_op_xor_vec:
2964         insn = OPC_PXOR;
2965         goto gen_simd;
2966     case INDEX_op_smin_vec:
2967         insn = smin_insn[vece];
2968         goto gen_simd;
2969     case INDEX_op_umin_vec:
2970         insn = umin_insn[vece];
2971         goto gen_simd;
2972     case INDEX_op_smax_vec:
2973         insn = smax_insn[vece];
2974         goto gen_simd;
2975     case INDEX_op_umax_vec:
2976         insn = umax_insn[vece];
2977         goto gen_simd;
2978     case INDEX_op_shlv_vec:
2979         insn = shlv_insn[vece];
2980         goto gen_simd;
2981     case INDEX_op_shrv_vec:
2982         insn = shrv_insn[vece];
2983         goto gen_simd;
2984     case INDEX_op_sarv_vec:
2985         insn = sarv_insn[vece];
2986         goto gen_simd;
2987     case INDEX_op_rotlv_vec:
2988         insn = rotlv_insn[vece];
2989         goto gen_simd;
2990     case INDEX_op_rotrv_vec:
2991         insn = rotrv_insn[vece];
2992         goto gen_simd;
2993     case INDEX_op_shls_vec:
2994         insn = shls_insn[vece];
2995         goto gen_simd;
2996     case INDEX_op_shrs_vec:
2997         insn = shrs_insn[vece];
2998         goto gen_simd;
2999     case INDEX_op_sars_vec:
3000         insn = sars_insn[vece];
3001         goto gen_simd;
3002     case INDEX_op_x86_punpckl_vec:
3003         insn = punpckl_insn[vece];
3004         goto gen_simd;
3005     case INDEX_op_x86_punpckh_vec:
3006         insn = punpckh_insn[vece];
3007         goto gen_simd;
3008     case INDEX_op_x86_packss_vec:
3009         insn = packss_insn[vece];
3010         goto gen_simd;
3011     case INDEX_op_x86_packus_vec:
3012         insn = packus_insn[vece];
3013         goto gen_simd;
3014     case INDEX_op_x86_vpshldv_vec:
3015         insn = vpshldv_insn[vece];
             /*
              * Four-operand op: args[1] is dropped and args[2], args[3]
              * become the two sources passed to gen_simd.
              * NOTE(review): presumably safe because args[1] is tied to the
              * output by the "0" in this op's C_O1_I3(x, 0, x, x) constraint
              * -- confirm.
              */
3016         a1 = a2;
3017         a2 = args[3];
3018         goto gen_simd;
3019     case INDEX_op_x86_vpshrdv_vec:
3020         insn = vpshrdv_insn[vece];
             /* Same operand shuffle as x86_vpshldv_vec above. */
3021         a1 = a2;
3022         a2 = args[3];
3023         goto gen_simd;
3024 #if TCG_TARGET_REG_BITS == 32
3025     case INDEX_op_dup2_vec:
3026         /* First merge the two 32-bit inputs to a single 64-bit element. */
3027         tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
3028         /* Then replicate the 64-bit elements across the rest of the vector. */
3029         if (type != TCG_TYPE_V64) {
3030             tcg_out_dup_vec(s, type, MO_64, a0, a0);
3031         }
3032         break;
3033 #endif
3034     case INDEX_op_abs_vec:
3035         insn = abs_insn[vece];
             /*
              * Single-source op: the source moves to the last operand slot
              * and the middle slot is passed as 0.
              */
3036         a2 = a1;
3037         a1 = 0;
3038         goto gen_simd;
         /*
          * Common tail: reject table placeholders, select the 256-bit form
          * when needed, then emit with operands (a0, a1, a2).
          */
3039     gen_simd:
3040         tcg_debug_assert(insn != OPC_UD2);
3041         if (type == TCG_TYPE_V256) {
3042             insn |= P_VEXL;
3043         }
3044         tcg_out_vex_modrm(s, insn, a0, a1, a2);
3045         break;
3046
3047     case INDEX_op_cmp_vec:
             /*
              * args[3] is the comparison condition.  Only TCG_COND_EQ and
              * TCG_COND_GT have an opcode table; anything else asserts.
              * NOTE(review): presumably other conditions are rewritten in
              * terms of these before reaching this function -- confirm.
              */
3048         sub = args[3];
3049         if (sub == TCG_COND_EQ) {
3050             insn = cmpeq_insn[vece];
3051         } else if (sub == TCG_COND_GT) {
3052             insn = cmpgt_insn[vece];
3053         } else {
3054             g_assert_not_reached();
3055         }
3056         goto gen_simd;
3057
3058     case INDEX_op_andc_vec:
3059         insn = OPC_PANDN;
3060         if (type == TCG_TYPE_V256) {
3061             insn |= P_VEXL;
3062         }
             /*
              * Same emission as gen_simd, but with a1 and a2 swapped.
              * NOTE(review): presumably because PANDN complements its first
              * source rather than its second -- confirm against the ISA
              * reference.
              */
3063         tcg_out_vex_modrm(s, insn, a0, a2, a1);
3064         break;
3065
         /*
          * Shifts and rotate by immediate.  Each case selects the opcode and
          * a small constant in sub, then shares the gen_shift tail.
          */
3066     case INDEX_op_shli_vec:
3067         insn = shift_imm_insn[vece];
3068         sub = 6;
3069         goto gen_shift;
3070     case INDEX_op_shri_vec:
3071         insn = shift_imm_insn[vece];
3072         sub = 2;
3073         goto gen_shift;
3074     case INDEX_op_sari_vec:
             /*
              * The 64-bit element form does not come from shift_imm_insn;
              * it uses OPC_PSHIFTD_Ib with the P_VEXW and P_EVEX flags.
              */
3075         if (vece == MO_64) {
3076             insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
3077         } else {
3078             insn = shift_imm_insn[vece];
3079         }
3080         sub = 4;
3081         goto gen_shift;
3082     case INDEX_op_rotli_vec:
3083         insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
3084         if (vece == MO_64) {
3085             insn |= P_VEXW;
3086         }
3087         sub = 1;
3088         goto gen_shift;
         /*
          * Common tail for immediate shifts: sub is passed in the operand
          * slot where gen_simd passes the destination, followed by a0 and
          * a1; the shift count a2 is then emitted as one immediate byte.
          * NOTE(review): sub is presumably the ModRM opcode-extension field
          * -- confirm against tcg_out_vex_modrm.
          */
3089     gen_shift:
3090         tcg_debug_assert(vece != MO_8);
3091         if (type == TCG_TYPE_V256) {
3092             insn |= P_VEXL;
3093         }
3094         tcg_out_vex_modrm(s, insn, sub, a0, a1);
3095         tcg_out8(s, a2);
3096         break;
3097
         /* Memory operations are delegated to the generic helpers. */
3098     case INDEX_op_ld_vec:
3099         tcg_out_ld(s, type, a0, a1, a2);
3100         break;
3101     case INDEX_op_st_vec:
3102         tcg_out_st(s, type, a0, a1, a2);
3103         break;
3104     case INDEX_op_dupm_vec:
3105         tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3106         break;
3107
         /*
          * Three-operand instructions followed by an immediate byte; here
          * the immediate is taken from args[3].
          */
3108     case INDEX_op_x86_shufps_vec:
3109         insn = OPC_SHUFPS;
3110         sub = args[3];
3111         goto gen_simd_imm8;
3112     case INDEX_op_x86_blend_vec:
             /* Only MO_16 and MO_32 are handled; other sizes assert. */
3113         if (vece == MO_16) {
3114             insn = OPC_PBLENDW;
3115         } else if (vece == MO_32) {
3116             insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3117         } else {
3118             g_assert_not_reached();
3119         }
3120         sub = args[3];
3121         goto gen_simd_imm8;
3122     case INDEX_op_x86_vperm2i128_vec:
3123         insn = OPC_VPERM2I128;
3124         sub = args[3];
3125         goto gen_simd_imm8;
3126     case INDEX_op_x86_vpshldi_vec:
3127         insn = vpshldi_insn[vece];
3128         sub = args[3];
3129         goto gen_simd_imm8;
3130
         /*
          * Logical operations implemented with OPC_VPTERNLOGQ.  The
          * immediate in sub selects the function; the per-case comments
          * name it.
          */
3131     case INDEX_op_not_vec:
3132         insn = OPC_VPTERNLOGQ;
             /* Single-source op: both source slots name the same register. */
3133         a2 = a1;
3134         sub = 0x33; /* !B */
3135         goto gen_simd_imm8;
3136     case INDEX_op_nor_vec:
3137         insn = OPC_VPTERNLOGQ;
3138         sub = 0x11; /* norCB */
3139         goto gen_simd_imm8;
3140     case INDEX_op_nand_vec:
3141         insn = OPC_VPTERNLOGQ;
3142         sub = 0x77; /* nandCB */
3143         goto gen_simd_imm8;
3144     case INDEX_op_eqv_vec:
3145         insn = OPC_VPTERNLOGQ;
3146         sub = 0x99; /* xnorCB */
3147         goto gen_simd_imm8;
3148     case INDEX_op_orc_vec:
3149         insn = OPC_VPTERNLOGQ;
3150         sub = 0xdd; /* orB!C */
3151         goto gen_simd_imm8;
3152
3153     case INDEX_op_bitsel_vec:
3154         insn = OPC_VPTERNLOGQ;
3155         a3 = args[3];
             /*
              * Pick the immediate according to which input, if any, already
              * shares a register with the output a0.  When neither a1 nor
              * a2 does, a3 is first copied into a0.
              */
3156         if (a0 == a1) {
3157             a1 = a2;
3158             a2 = a3;
3159             sub = 0xca; /* A?B:C */
3160         } else if (a0 == a2) {
3161             a2 = a3;
3162             sub = 0xe2; /* B?A:C */
3163         } else {
3164             tcg_out_mov(s, type, a0, a3);
3165             sub = 0xb8; /* B?C:A */
3166         }
3167         goto gen_simd_imm8;
3168
         /*
          * Common tail: identical to gen_simd, plus sub emitted as a
          * trailing immediate byte.
          */
3169     gen_simd_imm8:
3170         tcg_debug_assert(insn != OPC_UD2);
3171         if (type == TCG_TYPE_V256) {
3172             insn |= P_VEXL;
3173         }
3174         tcg_out_vex_modrm(s, insn, a0, a1, a2);
3175         tcg_out8(s, sub);
3176         break;
3177
3178     case INDEX_op_x86_vpblendvb_vec:
3179         insn = OPC_VPBLENDVB;
3180         if (type == TCG_TYPE_V256) {
3181             insn |= P_VEXL;
3182         }
3183         tcg_out_vex_modrm(s, insn, a0, a1, a2);
             /* The fourth operand is emitted as args[3] << 4 in the trailing byte. */
3184         tcg_out8(s, args[3] << 4);
3185         break;
3186
3187     case INDEX_op_x86_psrldq_vec:
             /*
              * Emitted with the constant 3 in the first operand slot and a2
              * as the trailing immediate.  P_VEXL is not applied on this path.
              */
3188         tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3189         tcg_out8(s, a2);
3190         break;
3191
3192     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3193     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3194     default:
3195         g_assert_not_reached();
3196     }
3197 }
3198
/*
 * Return the operand constraint set to use for opcode @op.
 *
 * Every result is spelled with a C_*() macro whose argument count equals
 * the sum of the counts in its name (e.g. C_O1_I2 takes three arguments,
 * C_O2_I3 takes five).
 * NOTE(review): the macros and the meaning of the individual constraint
 * letters are defined outside this function -- confirm them there.
 *
 * Several answers depend on the host: on TCG_TARGET_REG_BITS and on the
 * have_bmi1 / have_bmi2 / have_lzcnt feature flags.
 *
 * An opcode without a case reaches g_assert_not_reached().
 */
3199 static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3200 {
3201     switch (op) {
3202     case INDEX_op_goto_ptr:
3203         return C_O0_I1(r);
3204
         /* Integer loads. */
3205     case INDEX_op_ld8u_i32:
3206     case INDEX_op_ld8u_i64:
3207     case INDEX_op_ld8s_i32:
3208     case INDEX_op_ld8s_i64:
3209     case INDEX_op_ld16u_i32:
3210     case INDEX_op_ld16u_i64:
3211     case INDEX_op_ld16s_i32:
3212     case INDEX_op_ld16s_i64:
3213     case INDEX_op_ld_i32:
3214     case INDEX_op_ld32u_i64:
3215     case INDEX_op_ld32s_i64:
3216     case INDEX_op_ld_i64:
3217         return C_O1_I1(r, r);
3218
         /*
          * Integer stores.  The stored value uses "qi" for byte stores,
          * "ri" for 16/32-bit stores and "re" for the 64-bit store.
          */
3219     case INDEX_op_st8_i32:
3220     case INDEX_op_st8_i64:
3221         return C_O0_I2(qi, r);
3222
3223     case INDEX_op_st16_i32:
3224     case INDEX_op_st16_i64:
3225     case INDEX_op_st_i32:
3226     case INDEX_op_st32_i64:
3227         return C_O0_I2(ri, r);
3228
3229     case INDEX_op_st_i64:
3230         return C_O0_I2(re, r);
3231
         /*
          * add has no "0" entry, unlike the other arithmetic ops below;
          * tcg_out_op emits LEA for it when the output differs from the
          * first input.
          */
3232     case INDEX_op_add_i32:
3233     case INDEX_op_add_i64:
3234         return C_O1_I2(r, r, re);
3235
         /*
          * NOTE(review): the "0" here presumably ties the first input to
          * output 0; this fits tcg_out_op's gen_arith, which emits using
          * only a0 and a2 -- confirm.
          */
3236     case INDEX_op_sub_i32:
3237     case INDEX_op_sub_i64:
3238     case INDEX_op_mul_i32:
3239     case INDEX_op_mul_i64:
3240     case INDEX_op_or_i32:
3241     case INDEX_op_or_i64:
3242     case INDEX_op_xor_i32:
3243     case INDEX_op_xor_i64:
3244         return C_O1_I2(r, 0, re);
3245
3246     case INDEX_op_and_i32:
3247     case INDEX_op_and_i64:
3248         return C_O1_I2(r, 0, reZ);
3249
3250     case INDEX_op_andc_i32:
3251     case INDEX_op_andc_i64:
3252         return C_O1_I2(r, r, rI);
3253
         /*
          * With BMI2 the shifts get the "r, r, ri" form; tcg_out_op then
          * uses the VEX-encoded shift for a non-constant count.  Without
          * it they fall back to the same constraint as the rotates.
          */
3254     case INDEX_op_shl_i32:
3255     case INDEX_op_shl_i64:
3256     case INDEX_op_shr_i32:
3257     case INDEX_op_shr_i64:
3258     case INDEX_op_sar_i32:
3259     case INDEX_op_sar_i64:
3260         return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3261
3262     case INDEX_op_rotl_i32:
3263     case INDEX_op_rotl_i64:
3264     case INDEX_op_rotr_i32:
3265     case INDEX_op_rotr_i64:
3266         return C_O1_I2(r, 0, ci);
3267
3268     case INDEX_op_brcond_i32:
3269     case INDEX_op_brcond_i64:
3270         return C_O0_I2(r, re);
3271
         /* Unary ops that tcg_out_op emits on a0 alone. */
3272     case INDEX_op_bswap16_i32:
3273     case INDEX_op_bswap16_i64:
3274     case INDEX_op_bswap32_i32:
3275     case INDEX_op_bswap32_i64:
3276     case INDEX_op_bswap64_i64:
3277     case INDEX_op_neg_i32:
3278     case INDEX_op_neg_i64:
3279     case INDEX_op_not_i32:
3280     case INDEX_op_not_i64:
3281     case INDEX_op_extrh_i64_i32:
3282         return C_O1_I1(r, 0);
3283
         /* 8-bit extensions use "q" for the input; wider ones use "r". */
3284     case INDEX_op_ext8s_i32:
3285     case INDEX_op_ext8s_i64:
3286     case INDEX_op_ext8u_i32:
3287     case INDEX_op_ext8u_i64:
3288         return C_O1_I1(r, q);
3289
3290     case INDEX_op_ext16s_i32:
3291     case INDEX_op_ext16s_i64:
3292     case INDEX_op_ext16u_i32:
3293     case INDEX_op_ext16u_i64:
3294     case INDEX_op_ext32s_i64:
3295     case INDEX_op_ext32u_i64:
3296     case INDEX_op_ext_i32_i64:
3297     case INDEX_op_extu_i32_i64:
3298     case INDEX_op_extrl_i64_i32:
3299     case INDEX_op_extract_i32:
3300     case INDEX_op_extract_i64:
3301     case INDEX_op_sextract_i32:
3302     case INDEX_op_ctpop_i32:
3303     case INDEX_op_ctpop_i64:
3304         return C_O1_I1(r, r);
3305
3306     case INDEX_op_extract2_i32:
3307     case INDEX_op_extract2_i64:
3308         return C_O1_I2(r, 0, r);
3309
3310     case INDEX_op_deposit_i32:
3311     case INDEX_op_deposit_i64:
3312         return C_O1_I2(Q, 0, Q);
3313
3314     case INDEX_op_setcond_i32:
3315     case INDEX_op_setcond_i64:
3316         return C_O1_I2(q, r, re);
3317
3318     case INDEX_op_movcond_i32:
3319     case INDEX_op_movcond_i64:
3320         return C_O1_I4(r, r, re, r, 0);
3321
         /*
          * Two-output division and multiplication.
          * NOTE(review): "a" and "d" presumably pin operands to the
          * accumulator and data registers -- confirm.
          */
3322     case INDEX_op_div2_i32:
3323     case INDEX_op_div2_i64:
3324     case INDEX_op_divu2_i32:
3325     case INDEX_op_divu2_i64:
3326         return C_O2_I3(a, d, 0, 1, r);
3327
3328     case INDEX_op_mulu2_i32:
3329     case INDEX_op_mulu2_i64:
3330     case INDEX_op_muls2_i32:
3331     case INDEX_op_muls2_i64:
3332         return C_O2_I2(a, d, a, r);
3333
3334     case INDEX_op_add2_i32:
3335     case INDEX_op_add2_i64:
3336     case INDEX_op_sub2_i32:
3337     case INDEX_op_sub2_i64:
3338         return C_N1_O1_I4(r, r, 0, 1, re, re);
3339
         /* The second input may use "rW" only when the host has BMI1 / LZCNT. */
3340     case INDEX_op_ctz_i32:
3341     case INDEX_op_ctz_i64:
3342         return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3343
3344     case INDEX_op_clz_i32:
3345     case INDEX_op_clz_i64:
3346         return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3347
         /*
          * Guest memory accesses.  On a 32-bit host, a 64-bit address or a
          * 64-bit data value adds one more operand to the constraint set.
          */
3348     case INDEX_op_qemu_ld_a32_i32:
3349         return C_O1_I1(r, L);
3350     case INDEX_op_qemu_ld_a64_i32:
3351         return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L);
3352
3353     case INDEX_op_qemu_st_a32_i32:
3354         return C_O0_I2(L, L);
3355     case INDEX_op_qemu_st_a64_i32:
3356         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
         /* Byte stores use "s" for the data operand instead of "L". */
3357     case INDEX_op_qemu_st8_a32_i32:
3358         return C_O0_I2(s, L);
3359     case INDEX_op_qemu_st8_a64_i32:
3360         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L);
3361
3362     case INDEX_op_qemu_ld_a32_i64:
3363         return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);
3364     case INDEX_op_qemu_ld_a64_i64:
3365         return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L);
3366
3367     case INDEX_op_qemu_st_a32_i64:
3368         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3369     case INDEX_op_qemu_st_a64_i64:
3370         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
3371
         /* 128-bit accesses are asserted to occur only on 64-bit hosts. */
3372     case INDEX_op_qemu_ld_a32_i128:
3373     case INDEX_op_qemu_ld_a64_i128:
3374         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3375         return C_O2_I1(r, r, L);
3376     case INDEX_op_qemu_st_a32_i128:
3377     case INDEX_op_qemu_st_a64_i128:
3378         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3379         return C_O0_I3(L, L, L);
3380
         /*
          * These two are listed unconditionally here, whereas tcg_out_op
          * handles them only under TCG_TARGET_REG_BITS == 32.
          */
3381     case INDEX_op_brcond2_i32:
3382         return C_O0_I4(r, r, ri, ri);
3383
3384     case INDEX_op_setcond2_i32:
3385         return C_O1_I4(r, r, r, ri, ri);
3386
         /* Vector memory operations: "x" for the vector, "r" for the base. */
3387     case INDEX_op_ld_vec:
3388     case INDEX_op_dupm_vec:
3389         return C_O1_I1(x, r);
3390
3391     case INDEX_op_st_vec:
3392         return C_O0_I2(x, r);
3393
         /* Vector operations with two vector inputs. */
3394     case INDEX_op_add_vec:
3395     case INDEX_op_sub_vec:
3396     case INDEX_op_mul_vec:
3397     case INDEX_op_and_vec:
3398     case INDEX_op_or_vec:
3399     case INDEX_op_xor_vec:
3400     case INDEX_op_andc_vec:
3401     case INDEX_op_orc_vec:
3402     case INDEX_op_nand_vec:
3403     case INDEX_op_nor_vec:
3404     case INDEX_op_eqv_vec:
3405     case INDEX_op_ssadd_vec:
3406     case INDEX_op_usadd_vec:
3407     case INDEX_op_sssub_vec:
3408     case INDEX_op_ussub_vec:
3409     case INDEX_op_smin_vec:
3410     case INDEX_op_umin_vec:
3411     case INDEX_op_smax_vec:
3412     case INDEX_op_umax_vec:
3413     case INDEX_op_shlv_vec:
3414     case INDEX_op_shrv_vec:
3415     case INDEX_op_sarv_vec:
3416     case INDEX_op_rotlv_vec:
3417     case INDEX_op_rotrv_vec:
3418     case INDEX_op_shls_vec:
3419     case INDEX_op_shrs_vec:
3420     case INDEX_op_sars_vec:
3421     case INDEX_op_cmp_vec:
3422     case INDEX_op_x86_shufps_vec:
3423     case INDEX_op_x86_blend_vec:
3424     case INDEX_op_x86_packss_vec:
3425     case INDEX_op_x86_packus_vec:
3426     case INDEX_op_x86_vperm2i128_vec:
3427     case INDEX_op_x86_punpckl_vec:
3428     case INDEX_op_x86_punpckh_vec:
3429     case INDEX_op_x86_vpshldi_vec:
3430 #if TCG_TARGET_REG_BITS == 32
3431     case INDEX_op_dup2_vec:
3432 #endif
3433         return C_O1_I2(x, x, x);
3434
         /* Vector operations with one vector input. */
3435     case INDEX_op_abs_vec:
3436     case INDEX_op_dup_vec:
3437     case INDEX_op_not_vec:
3438     case INDEX_op_shli_vec:
3439     case INDEX_op_shri_vec:
3440     case INDEX_op_sari_vec:
3441     case INDEX_op_rotli_vec:
3442     case INDEX_op_x86_psrldq_vec:
3443         return C_O1_I1(x, x);
3444
         /*
          * Three vector inputs.  The first input of the variable
          * double-shifts is "0"; tcg_out_vec_op never passes args[1] for
          * them.
          */
3445     case INDEX_op_x86_vpshldv_vec:
3446     case INDEX_op_x86_vpshrdv_vec:
3447         return C_O1_I3(x, 0, x, x);
3448
3449     case INDEX_op_bitsel_vec:
3450     case INDEX_op_x86_vpblendvb_vec:
3451         return C_O1_I3(x, x, x, x);
3452
3453     default:
3454         g_assert_not_reached();
3455     }
3456 }
3457
3458 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3459 {
3460     switch (opc) {
3461     case INDEX_op_add_vec:
3462     case INDEX_op_sub_vec:
3463     case INDEX_op_and_vec:
3464     case INDEX_op_or_vec:
3465     case INDEX_op_xor_vec:
3466     case INDEX_op_andc_vec:
3467     case INDEX_op_orc_vec:
3468     case INDEX_op_nand_vec:
3469     case INDEX_op_nor_vec:
3470     case INDEX_op_eqv_vec:
3471     case INDEX_op_not_vec:
3472     case INDEX_op_bitsel_vec:
3473         return 1;
3474     case INDEX_op_cmp_vec:
3475     case INDEX_op_cmpsel_vec:
3476         return -1;
3477
3478     case INDEX_op_rotli_vec:
3479         return have_avx512vl && vece >= MO_32 ? 1 : -1;
3480
3481     case INDEX_op_shli_vec:
3482     case INDEX_op_shri_vec:
3483         /* We must expand the operation for MO_8.  */
3484         return vece == MO_8 ? -1 : 1;
3485
3486     case INDEX_op_sari_vec:
3487         switch (vece) {
3488         case MO_8:
3489             return -1;
3490         case MO_16:
3491         case MO_32:
3492             return 1;
3493         case MO_64:
3494             if (have_avx512vl) {
3495                 return 1;
3496             }
3497             /*
3498              * We can emulate this for MO_64, but it does not pay off
3499              * unless we're producing at least 4 values.
3500              */
3501             return type >= TCG_TYPE_V256 ? -1 : 0;
3502         }
3503         return 0;
3504
3505     case INDEX_op_shls_vec:
3506     case INDEX_op_shrs_vec:
3507         return vece >= MO_16;
3508     case INDEX_op_sars_vec:
3509         switch (vece) {
3510         case MO_16:
3511         case MO_32:
3512             return 1;
3513         case MO_64:
3514             return have_avx512vl;
3515         }
3516         return 0;
3517     case INDEX_op_rotls_vec:
3518         return vece >= MO_16 ? -1 : 0;
3519
3520     case INDEX_op_shlv_vec:
3521     case INDEX_op_shrv_vec:
3522         switch (vece) {
3523         case MO_16:
3524             return have_avx512bw;
3525         case MO_32:
3526         case MO_64:
3527             return have_avx2;
3528         }
3529         return 0;
3530     case INDEX_op_sarv_vec:
3531         switch (vece) {
3532         case MO_16:
3533             return have_avx512bw;
3534         case MO_32:
3535             return have_avx2;
3536         case MO_64:
3537             return have_avx512vl;
3538         }
3539         return 0;
3540     case INDEX_op_rotlv_vec:
3541     case INDEX_op_rotrv_vec:
3542         switch (vece) {
3543         case MO_16:
3544             return have_avx512vbmi2 ? -1 : 0;
3545         case MO_32:
3546         case MO_64:
3547             return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3548         }
3549         return 0;
3550
3551     case INDEX_op_mul_vec:
3552         switch (vece) {
3553         case MO_8:
3554             return -1;
3555         case MO_64:
3556             return have_avx512dq;
3557         }
3558         return 1;
3559
3560     case INDEX_op_ssadd_vec:
3561     case INDEX_op_usadd_vec:
3562     case INDEX_op_sssub_vec:
3563     case INDEX_op_ussub_vec:
3564         return vece <= MO_16;
3565     case INDEX_op_smin_vec:
3566     case INDEX_op_smax_vec:
3567     case INDEX_op_umin_vec:
3568     case INDEX_op_umax_vec:
3569     case INDEX_op_abs_vec:
3570         return vece <= MO_32 || have_avx512vl;
3571
3572     default:
3573         return 0;
3574     }
3575 }
3576
3577 static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3578                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3579 {
3580     TCGv_vec t1, t2;
3581
3582     tcg_debug_assert(vece == MO_8);
3583
3584     t1 = tcg_temp_new_vec(type);
3585     t2 = tcg_temp_new_vec(type);
3586
3587     /*
3588      * Unpack to W, shift, and repack.  Tricky bits:
3589      * (1) Use punpck*bw x,x to produce DDCCBBAA,
3590      *     i.e. duplicate in other half of the 16-bit lane.
3591      * (2) For right-shift, add 8 so that the high half of the lane
3592      *     becomes zero.  For left-shift, and left-rotate, we must
3593      *     shift up and down again.
3594      * (3) Step 2 leaves high half zero such that PACKUSWB
3595      *     (pack with unsigned saturation) does not modify
3596      *     the quantity.
3597      */
3598     vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3599               tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3600     vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3601               tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3602
3603     if (opc != INDEX_op_rotli_vec) {
3604         imm += 8;
3605     }
3606     if (opc == INDEX_op_shri_vec) {
3607         tcg_gen_shri_vec(MO_16, t1, t1, imm);
3608         tcg_gen_shri_vec(MO_16, t2, t2, imm);
3609     } else {
3610         tcg_gen_shli_vec(MO_16, t1, t1, imm);
3611         tcg_gen_shli_vec(MO_16, t2, t2, imm);
3612         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3613         tcg_gen_shri_vec(MO_16, t2, t2, 8);
3614     }
3615
3616     vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3617               tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3618     tcg_temp_free_vec(t1);
3619     tcg_temp_free_vec(t2);
3620 }
3621
3622 static void expand_vec_sari(TCGType type, unsigned vece,
3623                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3624 {
3625     TCGv_vec t1, t2;
3626
3627     switch (vece) {
3628     case MO_8:
3629         /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3630         t1 = tcg_temp_new_vec(type);
3631         t2 = tcg_temp_new_vec(type);
3632         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3633                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3634         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3635                   tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3636         tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3637         tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3638         vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3639                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3640         tcg_temp_free_vec(t1);
3641         tcg_temp_free_vec(t2);
3642         break;
3643
3644     case MO_64:
3645         t1 = tcg_temp_new_vec(type);
3646         if (imm <= 32) {
3647             /*
3648              * We can emulate a small sign extend by performing an arithmetic
3649              * 32-bit shift and overwriting the high half of a 64-bit logical
3650              * shift.  Note that the ISA says shift of 32 is valid, but TCG
3651              * does not, so we have to bound the smaller shift -- we get the
3652              * same result in the high half either way.
3653              */
3654             tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3655             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3656             vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3657                       tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3658                       tcgv_vec_arg(t1), 0xaa);
3659         } else {
3660             /* Otherwise we will need to use a compare vs 0 to produce
3661              * the sign-extend, shift and merge.
3662              */
3663             tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
3664                             tcg_constant_vec(type, MO_64, 0), v1);
3665             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3666             tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3667             tcg_gen_or_vec(MO_64, v0, v0, t1);
3668         }
3669         tcg_temp_free_vec(t1);
3670         break;
3671
3672     default:
3673         g_assert_not_reached();
3674     }
3675 }
3676
3677 static void expand_vec_rotli(TCGType type, unsigned vece,
3678                              TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3679 {
3680     TCGv_vec t;
3681
3682     if (vece == MO_8) {
3683         expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3684         return;
3685     }
3686
3687     if (have_avx512vbmi2) {
3688         vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3689                   tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3690         return;
3691     }
3692
3693     t = tcg_temp_new_vec(type);
3694     tcg_gen_shli_vec(vece, t, v1, imm);
3695     tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3696     tcg_gen_or_vec(vece, v0, v0, t);
3697     tcg_temp_free_vec(t);
3698 }
3699
3700 static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3701                             TCGv_vec v1, TCGv_vec sh, bool right)
3702 {
3703     TCGv_vec t;
3704
3705     if (have_avx512vbmi2) {
3706         vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3707                   type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3708                   tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3709         return;
3710     }
3711
3712     t = tcg_temp_new_vec(type);
3713     tcg_gen_dupi_vec(vece, t, 8 << vece);
3714     tcg_gen_sub_vec(vece, t, t, sh);
3715     if (right) {
3716         tcg_gen_shlv_vec(vece, t, v1, t);
3717         tcg_gen_shrv_vec(vece, v0, v1, sh);
3718     } else {
3719         tcg_gen_shrv_vec(vece, t, v1, t);
3720         tcg_gen_shlv_vec(vece, v0, v1, sh);
3721     }
3722     tcg_gen_or_vec(vece, v0, v0, t);
3723     tcg_temp_free_vec(t);
3724 }
3725
3726 static void expand_vec_rotls(TCGType type, unsigned vece,
3727                              TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3728 {
3729     TCGv_vec t = tcg_temp_new_vec(type);
3730
3731     tcg_debug_assert(vece != MO_8);
3732
3733     if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3734         tcg_gen_dup_i32_vec(vece, t, lsh);
3735         if (vece >= MO_32) {
3736             tcg_gen_rotlv_vec(vece, v0, v1, t);
3737         } else {
3738             expand_vec_rotv(type, vece, v0, v1, t, false);
3739         }
3740     } else {
3741         TCGv_i32 rsh = tcg_temp_new_i32();
3742
3743         tcg_gen_neg_i32(rsh, lsh);
3744         tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3745         tcg_gen_shls_vec(vece, t, v1, lsh);
3746         tcg_gen_shrs_vec(vece, v0, v1, rsh);
3747         tcg_gen_or_vec(vece, v0, v0, t);
3748
3749         tcg_temp_free_i32(rsh);
3750     }
3751
3752     tcg_temp_free_vec(t);
3753 }
3754
3755 static void expand_vec_mul(TCGType type, unsigned vece,
3756                            TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3757 {
3758     TCGv_vec t1, t2, t3, t4, zero;
3759
3760     tcg_debug_assert(vece == MO_8);
3761
3762     /*
3763      * Unpack v1 bytes to words, 0 | x.
3764      * Unpack v2 bytes to words, y | 0.
3765      * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3766      * Shift logical right by 8 bits to clear the high 8 bytes before
3767      * using an unsigned saturated pack.
3768      *
3769      * The difference between the V64, V128 and V256 cases is merely how
3770      * we distribute the expansion between temporaries.
3771      */
3772     switch (type) {
3773     case TCG_TYPE_V64:
3774         t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3775         t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3776         zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3777         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3778                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3779         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3780                   tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3781         tcg_gen_mul_vec(MO_16, t1, t1, t2);
3782         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3783         vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3784                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3785         tcg_temp_free_vec(t1);
3786         tcg_temp_free_vec(t2);
3787         break;
3788
3789     case TCG_TYPE_V128:
3790     case TCG_TYPE_V256:
3791         t1 = tcg_temp_new_vec(type);
3792         t2 = tcg_temp_new_vec(type);
3793         t3 = tcg_temp_new_vec(type);
3794         t4 = tcg_temp_new_vec(type);
3795         zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3796         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3797                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3798         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3799                   tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3800         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3801                   tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3802         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3803                   tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3804         tcg_gen_mul_vec(MO_16, t1, t1, t2);
3805         tcg_gen_mul_vec(MO_16, t3, t3, t4);
3806         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3807         tcg_gen_shri_vec(MO_16, t3, t3, 8);
3808         vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3809                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3810         tcg_temp_free_vec(t1);
3811         tcg_temp_free_vec(t2);
3812         tcg_temp_free_vec(t3);
3813         tcg_temp_free_vec(t4);
3814         break;
3815
3816     default:
3817         g_assert_not_reached();
3818     }
3819 }
3820
3821 static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3822                                  TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3823 {
3824     enum {
3825         NEED_INV  = 1,
3826         NEED_SWAP = 2,
3827         NEED_BIAS = 4,
3828         NEED_UMIN = 8,
3829         NEED_UMAX = 16,
3830     };
3831     TCGv_vec t1, t2, t3;
3832     uint8_t fixup;
3833
3834     switch (cond) {
3835     case TCG_COND_EQ:
3836     case TCG_COND_GT:
3837         fixup = 0;
3838         break;
3839     case TCG_COND_NE:
3840     case TCG_COND_LE:
3841         fixup = NEED_INV;
3842         break;
3843     case TCG_COND_LT:
3844         fixup = NEED_SWAP;
3845         break;
3846     case TCG_COND_GE:
3847         fixup = NEED_SWAP | NEED_INV;
3848         break;
3849     case TCG_COND_LEU:
3850         if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3851             fixup = NEED_UMIN;
3852         } else {
3853             fixup = NEED_BIAS | NEED_INV;
3854         }
3855         break;
3856     case TCG_COND_GTU:
3857         if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3858             fixup = NEED_UMIN | NEED_INV;
3859         } else {
3860             fixup = NEED_BIAS;
3861         }
3862         break;
3863     case TCG_COND_GEU:
3864         if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3865             fixup = NEED_UMAX;
3866         } else {
3867             fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3868         }
3869         break;
3870     case TCG_COND_LTU:
3871         if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3872             fixup = NEED_UMAX | NEED_INV;
3873         } else {
3874             fixup = NEED_BIAS | NEED_SWAP;
3875         }
3876         break;
3877     default:
3878         g_assert_not_reached();
3879     }
3880
3881     if (fixup & NEED_INV) {
3882         cond = tcg_invert_cond(cond);
3883     }
3884     if (fixup & NEED_SWAP) {
3885         t1 = v1, v1 = v2, v2 = t1;
3886         cond = tcg_swap_cond(cond);
3887     }
3888
3889     t1 = t2 = NULL;
3890     if (fixup & (NEED_UMIN | NEED_UMAX)) {
3891         t1 = tcg_temp_new_vec(type);
3892         if (fixup & NEED_UMIN) {
3893             tcg_gen_umin_vec(vece, t1, v1, v2);
3894         } else {
3895             tcg_gen_umax_vec(vece, t1, v1, v2);
3896         }
3897         v2 = t1;
3898         cond = TCG_COND_EQ;
3899     } else if (fixup & NEED_BIAS) {
3900         t1 = tcg_temp_new_vec(type);
3901         t2 = tcg_temp_new_vec(type);
3902         t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
3903         tcg_gen_sub_vec(vece, t1, v1, t3);
3904         tcg_gen_sub_vec(vece, t2, v2, t3);
3905         v1 = t1;
3906         v2 = t2;
3907         cond = tcg_signed_cond(cond);
3908     }
3909
3910     tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3911     /* Expand directly; do not recurse.  */
3912     vec_gen_4(INDEX_op_cmp_vec, type, vece,
3913               tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3914
3915     if (t1) {
3916         tcg_temp_free_vec(t1);
3917         if (t2) {
3918             tcg_temp_free_vec(t2);
3919         }
3920     }
3921     return fixup & NEED_INV;
3922 }
3923
3924 static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3925                            TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3926 {
3927     if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3928         tcg_gen_not_vec(vece, v0, v0);
3929     }
3930 }
3931
3932 static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3933                               TCGv_vec c1, TCGv_vec c2,
3934                               TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3935 {
3936     TCGv_vec t = tcg_temp_new_vec(type);
3937
3938     if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3939         /* Invert the sense of the compare by swapping arguments.  */
3940         TCGv_vec x;
3941         x = v3, v3 = v4, v4 = x;
3942     }
3943     vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3944               tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3945               tcgv_vec_arg(v3), tcgv_vec_arg(t));
3946     tcg_temp_free_vec(t);
3947 }
3948
3949 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3950                        TCGArg a0, ...)
3951 {
3952     va_list va;
3953     TCGArg a2;
3954     TCGv_vec v0, v1, v2, v3, v4;
3955
3956     va_start(va, a0);
3957     v0 = temp_tcgv_vec(arg_temp(a0));
3958     v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3959     a2 = va_arg(va, TCGArg);
3960
3961     switch (opc) {
3962     case INDEX_op_shli_vec:
3963     case INDEX_op_shri_vec:
3964         expand_vec_shi(type, vece, opc, v0, v1, a2);
3965         break;
3966
3967     case INDEX_op_sari_vec:
3968         expand_vec_sari(type, vece, v0, v1, a2);
3969         break;
3970
3971     case INDEX_op_rotli_vec:
3972         expand_vec_rotli(type, vece, v0, v1, a2);
3973         break;
3974
3975     case INDEX_op_rotls_vec:
3976         expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3977         break;
3978
3979     case INDEX_op_rotlv_vec:
3980         v2 = temp_tcgv_vec(arg_temp(a2));
3981         expand_vec_rotv(type, vece, v0, v1, v2, false);
3982         break;
3983     case INDEX_op_rotrv_vec:
3984         v2 = temp_tcgv_vec(arg_temp(a2));
3985         expand_vec_rotv(type, vece, v0, v1, v2, true);
3986         break;
3987
3988     case INDEX_op_mul_vec:
3989         v2 = temp_tcgv_vec(arg_temp(a2));
3990         expand_vec_mul(type, vece, v0, v1, v2);
3991         break;
3992
3993     case INDEX_op_cmp_vec:
3994         v2 = temp_tcgv_vec(arg_temp(a2));
3995         expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3996         break;
3997
3998     case INDEX_op_cmpsel_vec:
3999         v2 = temp_tcgv_vec(arg_temp(a2));
4000         v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4001         v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4002         expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
4003         break;
4004
4005     default:
4006         break;
4007     }
4008
4009     va_end(va);
4010 }
4011
/*
 * Registers saved by the generated prologue and restored by the epilogue.
 *
 * tcg_target_qemu_prologue() pushes them in array order and pops them in
 * reverse, and PUSH_SIZE is derived from the array length.  The
 * fde_reg_ofs tables in debug_frame must list the registers in this same
 * order.
 *
 * RDI and RSI are saved only for Win64 builds; on other 64-bit builds
 * tcg_target_init() marks them as call-clobbered instead.
 */
static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
#if defined(_WIN64)
    TCG_REG_RDI,
    TCG_REG_RSI,
#endif
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14, /* Currently used for the global env. */
    TCG_REG_R15,
#else
    TCG_REG_EBP, /* Currently used for the global env. */
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
#endif
};
4031
/*
 * Compute frame size via macros, to share between tcg_target_qemu_prologue
 * and tcg_register_jit (through the debug_frame tables).
 */

/*
 * Bytes occupied by the pushed callee-saved registers plus one additional
 * register-sized slot (the debug_frame tables place the return address in
 * that first slot).
 */
#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
     * (TCG_TARGET_REG_BITS / 8))

/*
 * Total frame size: PUSH_SIZE plus the static call-argument area plus the
 * TCG temporary buffer, rounded up to a multiple of TCG_TARGET_STACK_ALIGN.
 */
#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
4045
4046 /* Generate global QEMU prologue and epilogue code */
4047 static void tcg_target_qemu_prologue(TCGContext *s)
4048 {
4049     int i, stack_addend;
4050
4051     /* TB prologue */
4052
4053     /* Reserve some stack space, also for TCG temps.  */
4054     stack_addend = FRAME_SIZE - PUSH_SIZE;
4055     tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
4056                   CPU_TEMP_BUF_NLONGS * sizeof(long));
4057
4058     /* Save all callee saved registers.  */
4059     for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
4060         tcg_out_push(s, tcg_target_callee_save_regs[i]);
4061     }
4062
4063 #if TCG_TARGET_REG_BITS == 32
4064     tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
4065                (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
4066     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4067     /* jmp *tb.  */
4068     tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
4069                          (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
4070                          + stack_addend);
4071 #else
4072 # if !defined(CONFIG_SOFTMMU)
4073     if (guest_base) {
4074         int seg = setup_guest_base_seg();
4075         if (seg != 0) {
4076             x86_guest_base.seg = seg;
4077         } else if (guest_base == (int32_t)guest_base) {
4078             x86_guest_base.ofs = guest_base;
4079         } else {
4080             /* Choose R12 because, as a base, it requires a SIB byte. */
4081             x86_guest_base.index = TCG_REG_R12;
4082             tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
4083             tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
4084         }
4085     }
4086 # endif
4087     tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
4088     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4089     /* jmp *tb.  */
4090     tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
4091 #endif
4092
4093     /*
4094      * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
4095      * and fall through to the rest of the epilogue.
4096      */
4097     tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
4098     tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
4099
4100     /* TB epilogue */
4101     tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
4102
4103     tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
4104
4105     if (have_avx2) {
4106         tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
4107     }
4108     for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
4109         tcg_out_pop(s, tcg_target_callee_save_regs[i]);
4110     }
4111     tcg_out_opc(s, OPC_RET, 0, 0, 0);
4112 }
4113
4114 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
4115 {
4116     memset(p, 0x90, count);
4117 }
4118
4119 static void tcg_target_init(TCGContext *s)
4120 {
4121     tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4122     if (TCG_TARGET_REG_BITS == 64) {
4123         tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4124     }
4125     if (have_avx1) {
4126         tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4127         tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4128     }
4129     if (have_avx2) {
4130         tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4131     }
4132
4133     tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4134     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4135     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4136     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4137     if (TCG_TARGET_REG_BITS == 64) {
4138 #if !defined(_WIN64)
4139         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4140         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4141 #endif
4142         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4143         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4144         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4145         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4146     }
4147
4148     s->reserved_regs = 0;
4149     tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4150     tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
4151 #ifdef _WIN64
4152     /* These are call saved, and we don't save them, so don't use them. */
4153     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4154     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4155     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4156     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4157     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4158     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4159     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4160     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4161     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4162     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4163 #endif
4164 }
4165
/*
 * Layout of the unwind information passed to tcg_register_jit_int():
 * the common CIE/FDE header followed by the backend's CFA instructions.
 */
typedef struct {
    DebugFrameHeader h;
    /* DW_CFA_def_cfa: opcode, register, and a 2-byte uleb128 offset. */
    uint8_t fde_def_cfa[4];
    /*
     * DW_CFA_offset byte pairs for the return address and each saved
     * register.  The 64-bit table fills all 14 bytes; the 32-bit table
     * initializes only the first 10.
     */
    uint8_t fde_reg_ofs[14];
} DebugFrame;

/*
 * We're expecting a 2 byte uleb128 encoded value, which holds at most
 * 14 bits, so FRAME_SIZE must stay below 1 << 14.
 */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
4174
/*
 * Static unwind description of the prologue frame, one variant per host
 * word size.  ELF_HOST_MACHINE is defined only when a table is provided,
 * which in turn enables tcg_register_jit() below.
 *
 * In fde_reg_ofs each pair is (0x80 | register number, offset), with the
 * offset counted in units of the CIE data alignment factor.
 */
#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif
4234
#if defined(ELF_HOST_MACHINE)
/*
 * Register the code buffer BUF of BUF_SIZE bytes together with the static
 * debug_frame unwind description.  Only built when a debug_frame table
 * exists for the host (ELF_HOST_MACHINE defined).
 */
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif
4240 #endif