tcg/i386/tcg-target.c.inc

   1 /*
   2  * Tiny Code Generator for QEMU
   3  *
   4  * Copyright (c) 2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "../tcg-ldst.c.inc"
  26 #include "../tcg-pool.c.inc"
  27
  28 #ifdef CONFIG_DEBUG_TCG
  29 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
  30 #if TCG_TARGET_REG_BITS == 64
  31     "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
  32 #else
  33     "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
  34 #endif
  35     "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
  36     "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
  37 #if TCG_TARGET_REG_BITS == 64
  38     "%xmm8", "%xmm9", "%xmm10", "%xmm11",
  39     "%xmm12", "%xmm13", "%xmm14", "%xmm15",
  40 #endif
  41 };
  42 #endif
  43
  44 static const int tcg_target_reg_alloc_order[] = {
  45 #if TCG_TARGET_REG_BITS == 64
  46     TCG_REG_RBP,
  47     TCG_REG_RBX,
  48     TCG_REG_R12,
  49     TCG_REG_R13,
  50     TCG_REG_R14,
  51     TCG_REG_R15,
  52     TCG_REG_R10,
  53     TCG_REG_R11,
  54     TCG_REG_R9,
  55     TCG_REG_R8,
  56     TCG_REG_RCX,
  57     TCG_REG_RDX,
  58     TCG_REG_RSI,
  59     TCG_REG_RDI,
  60     TCG_REG_RAX,
  61 #else
  62     TCG_REG_EBX,
  63     TCG_REG_ESI,
  64     TCG_REG_EDI,
  65     TCG_REG_EBP,
  66     TCG_REG_ECX,
  67     TCG_REG_EDX,
  68     TCG_REG_EAX,
  69 #endif
  70     TCG_REG_XMM0,
  71     TCG_REG_XMM1,
  72     TCG_REG_XMM2,
  73     TCG_REG_XMM3,
  74     TCG_REG_XMM4,
  75     TCG_REG_XMM5,
  76 #ifndef _WIN64
  77     /* The Win64 ABI has xmm6-xmm15 as caller-saves, and we do not save
  78        any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
  79     TCG_REG_XMM6,
  80     TCG_REG_XMM7,
  81 #if TCG_TARGET_REG_BITS == 64
  82     TCG_REG_XMM8,
  83     TCG_REG_XMM9,
  84     TCG_REG_XMM10,
  85     TCG_REG_XMM11,
  86     TCG_REG_XMM12,
  87     TCG_REG_XMM13,
  88     TCG_REG_XMM14,
  89     TCG_REG_XMM15,
  90 #endif
  91 #endif
  92 };
  93
  94 static const int tcg_target_call_iarg_regs[] = {
  95 #if TCG_TARGET_REG_BITS == 64
  96 #if defined(_WIN64)
  97     TCG_REG_RCX,
  98     TCG_REG_RDX,
  99 #else
 100     TCG_REG_RDI,
 101     TCG_REG_RSI,
 102     TCG_REG_RDX,
 103     TCG_REG_RCX,
 104 #endif
 105     TCG_REG_R8,
 106     TCG_REG_R9,
 107 #else
 108     /* 32 bit mode uses stack based calling convention (GCC default). */
 109 #endif
 110 };
 111
 112 static const int tcg_target_call_oarg_regs[] = {
 113     TCG_REG_EAX,
 114 #if TCG_TARGET_REG_BITS == 32
 115     TCG_REG_EDX
 116 #endif
 117 };
 118
 119 /* Constants we accept.  */
 120 #define TCG_CT_CONST_S32 0x100
 121 #define TCG_CT_CONST_U32 0x200
 122 #define TCG_CT_CONST_I32 0x400
 123 #define TCG_CT_CONST_WSZ 0x800
 124
 125 /* Registers used with L constraint, which are the first argument
 126    registers on x86_64, and two random call clobbered registers on
 127    i386. */
 128 #if TCG_TARGET_REG_BITS == 64
 129 # define TCG_REG_L0 tcg_target_call_iarg_regs[0]
 130 # define TCG_REG_L1 tcg_target_call_iarg_regs[1]
 131 #else
 132 # define TCG_REG_L0 TCG_REG_EAX
 133 # define TCG_REG_L1 TCG_REG_EDX
 134 #endif
 135
 136 #define ALL_BYTEH_REGS         0x0000000fu
 137 #if TCG_TARGET_REG_BITS == 64
 138 # define ALL_GENERAL_REGS      0x0000ffffu
 139 # define ALL_VECTOR_REGS       0xffff0000u
 140 # define ALL_BYTEL_REGS        ALL_GENERAL_REGS
 141 #else
 142 # define ALL_GENERAL_REGS      0x000000ffu
 143 # define ALL_VECTOR_REGS       0x00ff0000u
 144 # define ALL_BYTEL_REGS        ALL_BYTEH_REGS
 145 #endif
 146 #ifdef CONFIG_SOFTMMU
 147 # define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
 148 #else
 149 # define SOFTMMU_RESERVE_REGS  0
 150 #endif
 151
 152 /* The host compiler should supply <cpuid.h> to enable runtime features
 153    detection, as we're not going to go so far as our own inline assembly.
 154    If not available, default values will be assumed.  */
 155 #if defined(CONFIG_CPUID_H)
 156 #include "qemu/cpuid.h"
 157 #endif
 158
 159 /* For 64-bit, we always know that CMOV is available.  */
 160 #if TCG_TARGET_REG_BITS == 64
 161 # define have_cmov 1
 162 #elif defined(CONFIG_CPUID_H)
 163 static bool have_cmov;
 164 #else
 165 # define have_cmov 0
 166 #endif
 167
 168 /* We need these symbols in tcg-target.h, and we can't properly conditionalize
 169    it there.  Therefore we always define the variable.  */
 170 bool have_bmi1;
 171 bool have_popcnt;
 172 bool have_avx1;
 173 bool have_avx2;
 174 bool have_movbe;
 175
 176 #ifdef CONFIG_CPUID_H
 177 static bool have_bmi2;
 178 static bool have_lzcnt;
 179 #else
 180 # define have_bmi2 0
 181 # define have_lzcnt 0
 182 #endif
 183
 184 static const tcg_insn_unit *tb_ret_addr;
 185
 186 static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
 187                         intptr_t value, intptr_t addend)
 188 {
 189     value += addend;
 190     switch(type) {
 191     case R_386_PC32:
 192         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 193         if (value != (int32_t)value) {
 194             return false;
 195         }
 196         /* FALLTHRU */
 197     case R_386_32:
 198         tcg_patch32(code_ptr, value);
 199         break;
 200     case R_386_PC8:
 201         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 202         if (value != (int8_t)value) {
 203             return false;
 204         }
 205         tcg_patch8(code_ptr, value);
 206         break;
 207     default:
 208         tcg_abort();
 209     }
 210     return true;
 211 }
 212
 213 /* test if a constant matches the constraint */
 214 static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 215 {
 216     if (ct & TCG_CT_CONST) {
 217         return 1;
 218     }
 219     if (type == TCG_TYPE_I32) {
 220         if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
 221             return 1;
 222         }
 223     } else {
 224         if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
 225             return 1;
 226         }
 227         if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
 228             return 1;
 229         }
 230         if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
 231             return 1;
 232         }
 233     }
 234     if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
 235         return 1;
 236     }
 237     return 0;
 238 }
 239
 240 # define LOWREGMASK(x)  ((x) & 7)
 241
 242 #define P_EXT           0x100           /* 0x0f opcode prefix */
 243 #define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
 244 #define P_DATA16        0x400           /* 0x66 opcode prefix */
 245 #define P_VEXW          0x1000          /* Set VEX.W = 1 */
 246 #if TCG_TARGET_REG_BITS == 64
 247 # define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
 248 # define P_REXB_R       0x2000          /* REG field as byte register */
 249 # define P_REXB_RM      0x4000          /* R/M field as byte register */
 250 # define P_GS           0x8000          /* gs segment override */
 251 #else
 252 # define P_REXW         0
 253 # define P_REXB_R       0
 254 # define P_REXB_RM      0
 255 # define P_GS           0
 256 #endif
 257 #define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
 258 #define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
 259 #define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
 260 #define P_VEXL          0x80000         /* Set VEX.L = 1 */
 261
 262 #define OPC_ARITH_EvIz  (0x81)
 263 #define OPC_ARITH_EvIb  (0x83)
 264 #define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
 265 #define OPC_ANDN        (0xf2 | P_EXT38)
 266 #define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
 267 #define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
 268 #define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
 269 #define OPC_BSF         (0xbc | P_EXT)
 270 #define OPC_BSR         (0xbd | P_EXT)
 271 #define OPC_BSWAP       (0xc8 | P_EXT)
 272 #define OPC_CALL_Jz     (0xe8)
 273 #define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
 274 #define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
 275 #define OPC_DEC_r32     (0x48)
 276 #define OPC_IMUL_GvEv   (0xaf | P_EXT)
 277 #define OPC_IMUL_GvEvIb (0x6b)
 278 #define OPC_IMUL_GvEvIz (0x69)
 279 #define OPC_INC_r32     (0x40)
 280 #define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
 281 #define OPC_JCC_short   (0x70)          /* ... plus condition code */
 282 #define OPC_JMP_long    (0xe9)
 283 #define OPC_JMP_short   (0xeb)
 284 #define OPC_LEA         (0x8d)
 285 #define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
 286 #define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
 287 #define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
 288 #define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
 289 #define OPC_MOVB_EvIz   (0xc6)
 290 #define OPC_MOVL_EvIz   (0xc7)
 291 #define OPC_MOVL_Iv     (0xb8)
 292 #define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
 293 #define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
 294 #define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
 295 #define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
 296 #define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
 297 #define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
 298 #define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
 299 #define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
 300 #define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
 301 #define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
 302 #define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
 303 #define OPC_MOVSBL      (0xbe | P_EXT)
 304 #define OPC_MOVSWL      (0xbf | P_EXT)
 305 #define OPC_MOVSLQ      (0x63 | P_REXW)
 306 #define OPC_MOVZBL      (0xb6 | P_EXT)
 307 #define OPC_MOVZWL      (0xb7 | P_EXT)
 308 #define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
 309 #define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
 310 #define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
 311 #define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
 312 #define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
 313 #define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
 314 #define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
 315 #define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
 316 #define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
 317 #define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
 318 #define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
 319 #define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
 320 #define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
 321 #define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
 322 #define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
 323 #define OPC_PAND        (0xdb | P_EXT | P_DATA16)
 324 #define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
 325 #define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
 326 #define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
 327 #define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
 328 #define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
 329 #define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
 330 #define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
 331 #define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
 332 #define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
 333 #define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
 334 #define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
 335 #define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
 336 #define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
 337 #define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
 338 #define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
 339 #define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
 340 #define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
 341 #define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
 342 #define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
 343 #define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
 344 #define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
 345 #define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
 346 #define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
 347 #define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
 348 #define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
 349 #define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
 350 #define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
 351 #define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
 352 #define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
 353 #define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
 354 #define OPC_POR         (0xeb | P_EXT | P_DATA16)
 355 #define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
 356 #define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
 357 #define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
 358 #define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
 359 #define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
 360 #define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
 361 #define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
 362 #define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
 363 #define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
 364 #define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
 365 #define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
 366 #define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
 367 #define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
 368 #define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
 369 #define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
 370 #define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
 371 #define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
 372 #define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
 373 #define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
 374 #define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
 375 #define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
 376 #define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
 377 #define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
 378 #define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
 379 #define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
 380 #define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
 381 #define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
 382 #define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
 383 #define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
 384 #define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
 385 #define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
 386 #define OPC_PXOR        (0xef | P_EXT | P_DATA16)
 387 #define OPC_POP_r32     (0x58)
 388 #define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
 389 #define OPC_PUSH_r32    (0x50)
 390 #define OPC_PUSH_Iv     (0x68)
 391 #define OPC_PUSH_Ib     (0x6a)
 392 #define OPC_RET         (0xc3)
 393 #define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
 394 #define OPC_SHIFT_1     (0xd1)
 395 #define OPC_SHIFT_Ib    (0xc1)
 396 #define OPC_SHIFT_cl    (0xd3)
 397 #define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
 398 #define OPC_SHUFPS      (0xc6 | P_EXT)
 399 #define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
 400 #define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
 401 #define OPC_SHRD_Ib     (0xac | P_EXT)
 402 #define OPC_TESTL       (0x85)
 403 #define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
 404 #define OPC_UD2         (0x0b | P_EXT)
 405 #define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
 406 #define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
 407 #define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
 408 #define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
 409 #define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
 410 #define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
 411 #define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
 412 #define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
 413 #define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
 414 #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
 415 #define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
 416 #define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
 417 #define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
 418 #define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
 419 #define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
 420 #define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
 421 #define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
 422 #define OPC_VZEROUPPER  (0x77 | P_EXT)
 423 #define OPC_XCHG_ax_r32 (0x90)
 424
 425 #define OPC_GRP3_Eb     (0xf6)
 426 #define OPC_GRP3_Ev     (0xf7)
 427 #define OPC_GRP5        (0xff)
 428 #define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
 429
 430 /* Group 1 opcode extensions for 0x80-0x83.
 431    These are also used as modifiers for OPC_ARITH.  */
 432 #define ARITH_ADD 0
 433 #define ARITH_OR  1
 434 #define ARITH_ADC 2
 435 #define ARITH_SBB 3
 436 #define ARITH_AND 4
 437 #define ARITH_SUB 5
 438 #define ARITH_XOR 6
 439 #define ARITH_CMP 7
 440
 441 /* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
 442 #define SHIFT_ROL 0
 443 #define SHIFT_ROR 1
 444 #define SHIFT_SHL 4
 445 #define SHIFT_SHR 5
 446 #define SHIFT_SAR 7
 447
 448 /* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
 449 #define EXT3_TESTi 0
 450 #define EXT3_NOT   2
 451 #define EXT3_NEG   3
 452 #define EXT3_MUL   4
 453 #define EXT3_IMUL  5
 454 #define EXT3_DIV   6
 455 #define EXT3_IDIV  7
 456
 457 /* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
 458 #define EXT5_INC_Ev     0
 459 #define EXT5_DEC_Ev     1
 460 #define EXT5_CALLN_Ev   2
 461 #define EXT5_JMPN_Ev    4
 462
 463 /* Condition codes to be added to OPC_JCC_{long,short}.  */
 464 #define JCC_JMP (-1)
 465 #define JCC_JO  0x0
 466 #define JCC_JNO 0x1
 467 #define JCC_JB  0x2
 468 #define JCC_JAE 0x3
 469 #define JCC_JE  0x4
 470 #define JCC_JNE 0x5
 471 #define JCC_JBE 0x6
 472 #define JCC_JA  0x7
 473 #define JCC_JS  0x8
 474 #define JCC_JNS 0x9
 475 #define JCC_JP  0xa
 476 #define JCC_JNP 0xb
 477 #define JCC_JL  0xc
 478 #define JCC_JGE 0xd
 479 #define JCC_JLE 0xe
 480 #define JCC_JG  0xf
 481
 482 static const uint8_t tcg_cond_to_jcc[] = {
 483     [TCG_COND_EQ] = JCC_JE,
 484     [TCG_COND_NE] = JCC_JNE,
 485     [TCG_COND_LT] = JCC_JL,
 486     [TCG_COND_GE] = JCC_JGE,
 487     [TCG_COND_LE] = JCC_JLE,
 488     [TCG_COND_GT] = JCC_JG,
 489     [TCG_COND_LTU] = JCC_JB,
 490     [TCG_COND_GEU] = JCC_JAE,
 491     [TCG_COND_LEU] = JCC_JBE,
 492     [TCG_COND_GTU] = JCC_JA,
 493 };
 494
 495 #if TCG_TARGET_REG_BITS == 64
 496 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 497 {
 498     int rex;
 499
 500     if (opc & P_GS) {
 501         tcg_out8(s, 0x65);
 502     }
 503     if (opc & P_DATA16) {
 504         /* We should never be asking for both 16 and 64-bit operation.  */
 505         tcg_debug_assert((opc & P_REXW) == 0);
 506         tcg_out8(s, 0x66);
 507     }
 508     if (opc & P_SIMDF3) {
 509         tcg_out8(s, 0xf3);
 510     } else if (opc & P_SIMDF2) {
 511         tcg_out8(s, 0xf2);
 512     }
 513
 514     rex = 0;
 515     rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
 516     rex |= (r & 8) >> 1;                /* REX.R */
 517     rex |= (x & 8) >> 2;                /* REX.X */
 518     rex |= (rm & 8) >> 3;               /* REX.B */
 519
 520     /* P_REXB_{R,RM} indicates that the given register is the low byte.
 521        For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
 522        as otherwise the encoding indicates %[abcd]h.  Note that the values
 523        that are ORed in merely indicate that the REX byte must be present;
 524        those bits get discarded in output.  */
 525     rex |= opc & (r >= 4 ? P_REXB_R : 0);
 526     rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
 527
 528     if (rex) {
 529         tcg_out8(s, (uint8_t)(rex | 0x40));
 530     }
 531
 532     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 533         tcg_out8(s, 0x0f);
 534         if (opc & P_EXT38) {
 535             tcg_out8(s, 0x38);
 536         } else if (opc & P_EXT3A) {
 537             tcg_out8(s, 0x3a);
 538         }
 539     }
 540
 541     tcg_out8(s, opc);
 542 }
 543 #else
 544 static void tcg_out_opc(TCGContext *s, int opc)
 545 {
 546     if (opc & P_DATA16) {
 547         tcg_out8(s, 0x66);
 548     }
 549     if (opc & P_SIMDF3) {
 550         tcg_out8(s, 0xf3);
 551     } else if (opc & P_SIMDF2) {
 552         tcg_out8(s, 0xf2);
 553     }
 554     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 555         tcg_out8(s, 0x0f);
 556         if (opc & P_EXT38) {
 557             tcg_out8(s, 0x38);
 558         } else if (opc & P_EXT3A) {
 559             tcg_out8(s, 0x3a);
 560         }
 561     }
 562     tcg_out8(s, opc);
 563 }
 564 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
 565    the 32-bit compilation paths.  This method works with all versions of gcc,
 566    whereas relying on optimization may not be able to exclude them.  */
 567 #define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
 568 #endif
 569
 570 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
 571 {
 572     tcg_out_opc(s, opc, r, rm, 0);
 573     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 574 }
 575
 576 static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
 577                             int rm, int index)
 578 {
 579     int tmp;
 580
 581     /* Use the two byte form if possible, which cannot encode
 582        VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
 583     if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
 584         && ((rm | index) & 8) == 0) {
 585         /* Two byte VEX prefix.  */
 586         tcg_out8(s, 0xc5);
 587
 588         tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
 589     } else {
 590         /* Three byte VEX prefix.  */
 591         tcg_out8(s, 0xc4);
 592
 593         /* VEX.m-mmmm */
 594         if (opc & P_EXT3A) {
 595             tmp = 3;
 596         } else if (opc & P_EXT38) {
 597             tmp = 2;
 598         } else if (opc & P_EXT) {
 599             tmp = 1;
 600         } else {
 601             g_assert_not_reached();
 602         }
 603         tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
 604         tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
 605         tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
 606         tcg_out8(s, tmp);
 607
 608         tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
 609     }
 610
 611     tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
 612     /* VEX.pp */
 613     if (opc & P_DATA16) {
 614         tmp |= 1;                          /* 0x66 */
 615     } else if (opc & P_SIMDF3) {
 616         tmp |= 2;                          /* 0xf3 */
 617     } else if (opc & P_SIMDF2) {
 618         tmp |= 3;                          /* 0xf2 */
 619     }
 620     tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
 621     tcg_out8(s, tmp);
 622     tcg_out8(s, opc);
 623 }
 624
 625 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
 626 {
 627     tcg_out_vex_opc(s, opc, r, v, rm, 0);
 628     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 629 }
 630
 631 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
 632    We handle either RM and INDEX missing with a negative value.  In 64-bit
 633    mode for absolute addresses, ~RM is the size of the immediate operand
 634    that will follow the instruction.  */
 635
 636 static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
 637                                int shift, intptr_t offset)
 638 {
 639     int mod, len;
 640
 641     if (index < 0 && rm < 0) {
 642         if (TCG_TARGET_REG_BITS == 64) {
 643             /* Try for a rip-relative addressing mode.  This has replaced
 644                the 32-bit-mode absolute addressing encoding.  */
 645             intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
 646             intptr_t disp = offset - pc;
 647             if (disp == (int32_t)disp) {
 648                 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
 649                 tcg_out32(s, disp);
 650                 return;
 651             }
 652
 653             /* Try for an absolute address encoding.  This requires the
 654                use of the MODRM+SIB encoding and is therefore larger than
 655                rip-relative addressing.  */
 656             if (offset == (int32_t)offset) {
 657                 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
 658                 tcg_out8(s, (4 << 3) | 5);
 659                 tcg_out32(s, offset);
 660                 return;
 661             }
 662
 663             /* ??? The memory isn't directly addressable.  */
 664             g_assert_not_reached();
 665         } else {
 666             /* Absolute address.  */
 667             tcg_out8(s, (r << 3) | 5);
 668             tcg_out32(s, offset);
 669             return;
 670         }
 671     }
 672
 673     /* Find the length of the immediate addend.  Note that the encoding
 674        that would be used for (%ebp) indicates absolute addressing.  */
 675     if (rm < 0) {
 676         mod = 0, len = 4, rm = 5;
 677     } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
 678         mod = 0, len = 0;
 679     } else if (offset == (int8_t)offset) {
 680         mod = 0x40, len = 1;
 681     } else {
 682         mod = 0x80, len = 4;
 683     }
 684
 685     /* Use a single byte MODRM format if possible.  Note that the encoding
 686        that would be used for %esp is the escape to the two byte form.  */
 687     if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
 688         /* Single byte MODRM format.  */
 689         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 690     } else {
 691         /* Two byte MODRM+SIB format.  */
 692
 693         /* Note that the encoding that would place %esp into the index
 694            field indicates no index register.  In 64-bit mode, the REX.X
 695            bit counts, so %r12 can be used as the index.  */
 696         if (index < 0) {
 697             index = 4;
 698         } else {
 699             tcg_debug_assert(index != TCG_REG_ESP);
 700         }
 701
 702         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
 703         tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
 704     }
 705
 706     if (len == 1) {
 707         tcg_out8(s, offset);
 708     } else if (len == 4) {
 709         tcg_out32(s, offset);
 710     }
 711 }
 712
 713 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
 714                                      int index, int shift, intptr_t offset)
 715 {
 716     tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 717     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 718 }
 719
 720 static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
 721                                          int rm, int index, int shift,
 722                                          intptr_t offset)
 723 {
 724     tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 725     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 726 }
 727
 728 /* A simplification of the above with no index or shift.  */
 729 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
 730                                         int rm, intptr_t offset)
 731 {
 732     tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
 733 }
 734
 735 static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
 736                                             int v, int rm, intptr_t offset)
 737 {
 738     tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
 739 }
 740
 741 /* Output an opcode with an expected reference to the constant pool.  */
 742 static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
 743 {
 744     tcg_out_opc(s, opc, r, 0, 0);
 745     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 746     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 747     tcg_out32(s, 0);
 748 }
 749
 750 /* Output an opcode with an expected reference to the constant pool.  */
 751 static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
 752 {
 753     tcg_out_vex_opc(s, opc, r, 0, 0, 0);
 754     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 755     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 756     tcg_out32(s, 0);
 757 }
 758
 759 /* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
 760 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
 761 {
 762     /* Propagate an opcode prefix, such as P_REXW.  */
 763     int ext = subop & ~0x7;
 764     subop &= 0x7;
 765
 766     tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
 767 }
 768
 769 static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 770 {
 771     int rexw = 0;
 772
 773     if (arg == ret) {
 774         return true;
 775     }
 776     switch (type) {
 777     case TCG_TYPE_I64:
 778         rexw = P_REXW;
 779         /* fallthru */
 780     case TCG_TYPE_I32:
 781         if (ret < 16) {
 782             if (arg < 16) {
 783                 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
 784             } else {
 785                 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
 786             }
 787         } else {
 788             if (arg < 16) {
 789                 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
 790             } else {
 791                 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 792             }
 793         }
 794         break;
 795
 796     case TCG_TYPE_V64:
 797         tcg_debug_assert(ret >= 16 && arg >= 16);
 798         tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 799         break;
 800     case TCG_TYPE_V128:
 801         tcg_debug_assert(ret >= 16 && arg >= 16);
 802         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
 803         break;
 804     case TCG_TYPE_V256:
 805         tcg_debug_assert(ret >= 16 && arg >= 16);
 806         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
 807         break;
 808
 809     default:
 810         g_assert_not_reached();
 811     }
 812     return true;
 813 }
 814
 815 static const int avx2_dup_insn[4] = {
 816     OPC_VPBROADCASTB, OPC_VPBROADCASTW,
 817     OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
 818 };
 819
 820 static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
 821                             TCGReg r, TCGReg a)
 822 {
 823     if (have_avx2) {
 824         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 825         tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
 826     } else {
 827         switch (vece) {
 828         case MO_8:
 829             /* ??? With zero in a register, use PSHUFB.  */
 830             tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
 831             a = r;
 832             /* FALLTHRU */
 833         case MO_16:
 834             tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
 835             a = r;
 836             /* FALLTHRU */
 837         case MO_32:
 838             tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
 839             /* imm8 operand: all output lanes selected from input lane 0.  */
 840             tcg_out8(s, 0);
 841             break;
 842         case MO_64:
 843             tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
 844             break;
 845         default:
 846             g_assert_not_reached();
 847         }
 848     }
 849     return true;
 850 }
 851
 852 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
 853                              TCGReg r, TCGReg base, intptr_t offset)
 854 {
 855     if (have_avx2) {
 856         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 857         tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
 858                                  r, 0, base, offset);
 859     } else {
 860         switch (vece) {
 861         case MO_64:
 862             tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
 863             break;
 864         case MO_32:
 865             tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
 866             break;
 867         case MO_16:
 868             tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
 869             tcg_out8(s, 0); /* imm8 */
 870             tcg_out_dup_vec(s, type, vece, r, r);
 871             break;
 872         case MO_8:
 873             tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
 874             tcg_out8(s, 0); /* imm8 */
 875             tcg_out_dup_vec(s, type, vece, r, r);
 876             break;
 877         default:
 878             g_assert_not_reached();
 879         }
 880     }
 881     return true;
 882 }
 883
 884 static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
 885                              TCGReg ret, int64_t arg)
 886 {
 887     int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 888
 889     if (arg == 0) {
 890         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 891         return;
 892     }
 893     if (arg == -1) {
 894         tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
 895         return;
 896     }
 897
 898     if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
 899         if (have_avx2) {
 900             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
 901         } else {
 902             tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
 903         }
 904         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 905     } else {
 906         if (type == TCG_TYPE_V64) {
 907             tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
 908         } else if (have_avx2) {
 909             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
 910         } else {
 911             tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
 912         }
 913         if (TCG_TARGET_REG_BITS == 64) {
 914             new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
 915         } else {
 916             new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
 917         }
 918     }
 919 }
 920
 921 static void tcg_out_movi_vec(TCGContext *s, TCGType type,
 922                              TCGReg ret, tcg_target_long arg)
 923 {
 924     if (arg == 0) {
 925         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 926         return;
 927     }
 928     if (arg == -1) {
 929         tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
 930         return;
 931     }
 932
 933     int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
 934     tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
 935     if (TCG_TARGET_REG_BITS == 64) {
 936         new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
 937     } else {
 938         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 939     }
 940 }
 941
 942 static void tcg_out_movi_int(TCGContext *s, TCGType type,
 943                              TCGReg ret, tcg_target_long arg)
 944 {
 945     tcg_target_long diff;
 946
 947     if (arg == 0) {
 948         tgen_arithr(s, ARITH_XOR, ret, ret);
 949         return;
 950     }
 951     if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
 952         tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
 953         tcg_out32(s, arg);
 954         return;
 955     }
 956     if (arg == (int32_t)arg) {
 957         tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
 958         tcg_out32(s, arg);
 959         return;
 960     }
 961
 962     /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
 963     diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
 964     if (diff == (int32_t)diff) {
 965         tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
 966         tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
 967         tcg_out32(s, diff);
 968         return;
 969     }
 970
 971     tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
 972     tcg_out64(s, arg);
 973 }
 974
 975 static void tcg_out_movi(TCGContext *s, TCGType type,
 976                          TCGReg ret, tcg_target_long arg)
 977 {
 978     switch (type) {
 979     case TCG_TYPE_I32:
 980 #if TCG_TARGET_REG_BITS == 64
 981     case TCG_TYPE_I64:
 982 #endif
 983         if (ret < 16) {
 984             tcg_out_movi_int(s, type, ret, arg);
 985         } else {
 986             tcg_out_movi_vec(s, type, ret, arg);
 987         }
 988         break;
 989     default:
 990         g_assert_not_reached();
 991     }
 992 }
 993
 994 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
 995 {
 996     if (val == (int8_t)val) {
 997         tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
 998         tcg_out8(s, val);
 999     } else if (val == (int32_t)val) {
1000         tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1001         tcg_out32(s, val);
1002     } else {
1003         tcg_abort();
1004     }
1005 }
1006
1007 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1008 {
1009     /* Given the strength of x86 memory ordering, we only need care for
1010        store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1011        faster than "mfence", so don't bother with the sse insn.  */
1012     if (a0 & TCG_MO_ST_LD) {
1013         tcg_out8(s, 0xf0);
1014         tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1015         tcg_out8(s, 0);
1016     }
1017 }
1018
1019 static inline void tcg_out_push(TCGContext *s, int reg)
1020 {
1021     tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1022 }
1023
1024 static inline void tcg_out_pop(TCGContext *s, int reg)
1025 {
1026     tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1027 }
1028
1029 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1030                        TCGReg arg1, intptr_t arg2)
1031 {
1032     switch (type) {
1033     case TCG_TYPE_I32:
1034         if (ret < 16) {
1035             tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1036         } else {
1037             tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1038         }
1039         break;
1040     case TCG_TYPE_I64:
1041         if (ret < 16) {
1042             tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1043             break;
1044         }
1045         /* FALLTHRU */
1046     case TCG_TYPE_V64:
1047         /* There is no instruction that can validate 8-byte alignment.  */
1048         tcg_debug_assert(ret >= 16);
1049         tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1050         break;
1051     case TCG_TYPE_V128:
1052         /*
1053          * The gvec infrastructure is asserts that v128 vector loads
1054          * and stores use a 16-byte aligned offset.  Validate that the
1055          * final pointer is aligned by using an insn that will SIGSEGV.
1056          */
1057         tcg_debug_assert(ret >= 16);
1058         tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1059         break;
1060     case TCG_TYPE_V256:
1061         /*
1062          * The gvec infrastructure only requires 16-byte alignment,
1063          * so here we must use an unaligned load.
1064          */
1065         tcg_debug_assert(ret >= 16);
1066         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1067                                  ret, 0, arg1, arg2);
1068         break;
1069     default:
1070         g_assert_not_reached();
1071     }
1072 }
1073
1074 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1075                        TCGReg arg1, intptr_t arg2)
1076 {
1077     switch (type) {
1078     case TCG_TYPE_I32:
1079         if (arg < 16) {
1080             tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1081         } else {
1082             tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1083         }
1084         break;
1085     case TCG_TYPE_I64:
1086         if (arg < 16) {
1087             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1088             break;
1089         }
1090         /* FALLTHRU */
1091     case TCG_TYPE_V64:
1092         /* There is no instruction that can validate 8-byte alignment.  */
1093         tcg_debug_assert(arg >= 16);
1094         tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1095         break;
1096     case TCG_TYPE_V128:
1097         /*
1098          * The gvec infrastructure is asserts that v128 vector loads
1099          * and stores use a 16-byte aligned offset.  Validate that the
1100          * final pointer is aligned by using an insn that will SIGSEGV.
1101          */
1102         tcg_debug_assert(arg >= 16);
1103         tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1104         break;
1105     case TCG_TYPE_V256:
1106         /*
1107          * The gvec infrastructure only requires 16-byte alignment,
1108          * so here we must use an unaligned store.
1109          */
1110         tcg_debug_assert(arg >= 16);
1111         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1112                                  arg, 0, arg1, arg2);
1113         break;
1114     default:
1115         g_assert_not_reached();
1116     }
1117 }
1118
1119 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1120                         TCGReg base, intptr_t ofs)
1121 {
1122     int rexw = 0;
1123     if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1124         if (val != (int32_t)val) {
1125             return false;
1126         }
1127         rexw = P_REXW;
1128     } else if (type != TCG_TYPE_I32) {
1129         return false;
1130     }
1131     tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1132     tcg_out32(s, val);
1133     return true;
1134 }
1135
1136 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1137 {
1138     /* Propagate an opcode prefix, such as P_DATA16.  */
1139     int ext = subopc & ~0x7;
1140     subopc &= 0x7;
1141
1142     if (count == 1) {
1143         tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1144     } else {
1145         tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1146         tcg_out8(s, count);
1147     }
1148 }
1149
1150 static inline void tcg_out_bswap32(TCGContext *s, int reg)
1151 {
1152     tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1153 }
1154
1155 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1156 {
1157     tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1158 }
1159
1160 static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
1161 {
1162     /* movzbl */
1163     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1164     tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1165 }
1166
1167 static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
1168 {
1169     /* movsbl */
1170     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1171     tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1172 }
1173
1174 static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
1175 {
1176     /* movzwl */
1177     tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1178 }
1179
1180 static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
1181 {
1182     /* movsw[lq] */
1183     tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1184 }
1185
1186 static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
1187 {
1188     /* 32-bit mov zero extends.  */
1189     tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1190 }
1191
1192 static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
1193 {
1194     tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1195 }
1196
1197 static inline void tcg_out_bswap64(TCGContext *s, int reg)
1198 {
1199     tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1200 }
1201
1202 static void tgen_arithi(TCGContext *s, int c, int r0,
1203                         tcg_target_long val, int cf)
1204 {
1205     int rexw = 0;
1206
1207     if (TCG_TARGET_REG_BITS == 64) {
1208         rexw = c & -8;
1209         c &= 7;
1210     }
1211
1212     /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1213        partial flags update stalls on Pentium4 and are not recommended
1214        by current Intel optimization manuals.  */
1215     if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1216         int is_inc = (c == ARITH_ADD) ^ (val < 0);
1217         if (TCG_TARGET_REG_BITS == 64) {
1218             /* The single-byte increment encodings are re-tasked as the
1219                REX prefixes.  Use the MODRM encoding.  */
1220             tcg_out_modrm(s, OPC_GRP5 + rexw,
1221                           (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1222         } else {
1223             tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1224         }
1225         return;
1226     }
1227
1228     if (c == ARITH_AND) {
1229         if (TCG_TARGET_REG_BITS == 64) {
1230             if (val == 0xffffffffu) {
1231                 tcg_out_ext32u(s, r0, r0);
1232                 return;
1233             }
1234             if (val == (uint32_t)val) {
1235                 /* AND with no high bits set can use a 32-bit operation.  */
1236                 rexw = 0;
1237             }
1238         }
1239         if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1240             tcg_out_ext8u(s, r0, r0);
1241             return;
1242         }
1243         if (val == 0xffffu) {
1244             tcg_out_ext16u(s, r0, r0);
1245             return;
1246         }
1247     }
1248
1249     if (val == (int8_t)val) {
1250         tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1251         tcg_out8(s, val);
1252         return;
1253     }
1254     if (rexw == 0 || val == (int32_t)val) {
1255         tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1256         tcg_out32(s, val);
1257         return;
1258     }
1259
1260     tcg_abort();
1261 }
1262
1263 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1264 {
1265     if (val != 0) {
1266         tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1267     }
1268 }
1269
1270 /* Use SMALL != 0 to force a short forward branch.  */
1271 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
1272 {
1273     int32_t val, val1;
1274
1275     if (l->has_value) {
1276         val = tcg_pcrel_diff(s, l->u.value_ptr);
1277         val1 = val - 2;
1278         if ((int8_t)val1 == val1) {
1279             if (opc == -1) {
1280                 tcg_out8(s, OPC_JMP_short);
1281             } else {
1282                 tcg_out8(s, OPC_JCC_short + opc);
1283             }
1284             tcg_out8(s, val1);
1285         } else {
1286             if (small) {
1287                 tcg_abort();
1288             }
1289             if (opc == -1) {
1290                 tcg_out8(s, OPC_JMP_long);
1291                 tcg_out32(s, val - 5);
1292             } else {
1293                 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1294                 tcg_out32(s, val - 6);
1295             }
1296         }
1297     } else if (small) {
1298         if (opc == -1) {
1299             tcg_out8(s, OPC_JMP_short);
1300         } else {
1301             tcg_out8(s, OPC_JCC_short + opc);
1302         }
1303         tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1304         s->code_ptr += 1;
1305     } else {
1306         if (opc == -1) {
1307             tcg_out8(s, OPC_JMP_long);
1308         } else {
1309             tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1310         }
1311         tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1312         s->code_ptr += 4;
1313     }
1314 }
1315
1316 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1317                         int const_arg2, int rexw)
1318 {
1319     if (const_arg2) {
1320         if (arg2 == 0) {
1321             /* test r, r */
1322             tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1323         } else {
1324             tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1325         }
1326     } else {
1327         tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1328     }
1329 }
1330
1331 static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1332                              TCGArg arg1, TCGArg arg2, int const_arg2,
1333                              TCGLabel *label, int small)
1334 {
1335     tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1336     tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1337 }
1338
1339 #if TCG_TARGET_REG_BITS == 64
1340 static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1341                              TCGArg arg1, TCGArg arg2, int const_arg2,
1342                              TCGLabel *label, int small)
1343 {
1344     tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1345     tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1346 }
1347 #else
1348 /* XXX: we implement it at the target level to avoid having to
1349    handle cross basic blocks temporaries */
1350 static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1351                             const int *const_args, int small)
1352 {
1353     TCGLabel *label_next = gen_new_label();
1354     TCGLabel *label_this = arg_label(args[5]);
1355
1356     switch(args[4]) {
1357     case TCG_COND_EQ:
1358         tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1359                          label_next, 1);
1360         tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1361                          label_this, small);
1362         break;
1363     case TCG_COND_NE:
1364         tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1365                          label_this, small);
1366         tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1367                          label_this, small);
1368         break;
1369     case TCG_COND_LT:
1370         tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1371                          label_this, small);
1372         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1373         tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1374                          label_this, small);
1375         break;
1376     case TCG_COND_LE:
1377         tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1378                          label_this, small);
1379         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1380         tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1381                          label_this, small);
1382         break;
1383     case TCG_COND_GT:
1384         tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1385                          label_this, small);
1386         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1387         tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1388                          label_this, small);
1389         break;
1390     case TCG_COND_GE:
1391         tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1392                          label_this, small);
1393         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1394         tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1395                          label_this, small);
1396         break;
1397     case TCG_COND_LTU:
1398         tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1399                          label_this, small);
1400         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1401         tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1402                          label_this, small);
1403         break;
1404     case TCG_COND_LEU:
1405         tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1406                          label_this, small);
1407         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1408         tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1409                          label_this, small);
1410         break;
1411     case TCG_COND_GTU:
1412         tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1413                          label_this, small);
1414         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1415         tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1416                          label_this, small);
1417         break;
1418     case TCG_COND_GEU:
1419         tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1420                          label_this, small);
1421         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1422         tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1423                          label_this, small);
1424         break;
1425     default:
1426         tcg_abort();
1427     }
1428     tcg_out_label(s, label_next);
1429 }
1430 #endif
1431
1432 static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1433                               TCGArg arg1, TCGArg arg2, int const_arg2)
1434 {
1435     tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1436     tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1437     tcg_out_ext8u(s, dest, dest);
1438 }
1439
1440 #if TCG_TARGET_REG_BITS == 64
1441 static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1442                               TCGArg arg1, TCGArg arg2, int const_arg2)
1443 {
1444     tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1445     tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1446     tcg_out_ext8u(s, dest, dest);
1447 }
1448 #else
1449 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1450                              const int *const_args)
1451 {
1452     TCGArg new_args[6];
1453     TCGLabel *label_true, *label_over;
1454
1455     memcpy(new_args, args+1, 5*sizeof(TCGArg));
1456
1457     if (args[0] == args[1] || args[0] == args[2]
1458         || (!const_args[3] && args[0] == args[3])
1459         || (!const_args[4] && args[0] == args[4])) {
1460         /* When the destination overlaps with one of the argument
1461            registers, don't do anything tricky.  */
1462         label_true = gen_new_label();
1463         label_over = gen_new_label();
1464
1465         new_args[5] = label_arg(label_true);
1466         tcg_out_brcond2(s, new_args, const_args+1, 1);
1467
1468         tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1469         tcg_out_jxx(s, JCC_JMP, label_over, 1);
1470         tcg_out_label(s, label_true);
1471
1472         tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1473         tcg_out_label(s, label_over);
1474     } else {
1475         /* When the destination does not overlap one of the arguments,
1476            clear the destination first, jump if cond false, and emit an
1477            increment in the true case.  This results in smaller code.  */
1478
1479         tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1480
1481         label_over = gen_new_label();
1482         new_args[4] = tcg_invert_cond(new_args[4]);
1483         new_args[5] = label_arg(label_over);
1484         tcg_out_brcond2(s, new_args, const_args+1, 1);
1485
1486         tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1487         tcg_out_label(s, label_over);
1488     }
1489 }
1490 #endif
1491
1492 static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1493                          TCGReg dest, TCGReg v1)
1494 {
1495     if (have_cmov) {
1496         tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1497     } else {
1498         TCGLabel *over = gen_new_label();
1499         tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1500         tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1501         tcg_out_label(s, over);
1502     }
1503 }
1504
1505 static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1506                               TCGReg c1, TCGArg c2, int const_c2,
1507                               TCGReg v1)
1508 {
1509     tcg_out_cmp(s, c1, c2, const_c2, 0);
1510     tcg_out_cmov(s, cond, 0, dest, v1);
1511 }
1512
1513 #if TCG_TARGET_REG_BITS == 64
1514 static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1515                               TCGReg c1, TCGArg c2, int const_c2,
1516                               TCGReg v1)
1517 {
1518     tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1519     tcg_out_cmov(s, cond, P_REXW, dest, v1);
1520 }
1521 #endif
1522
1523 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1524                         TCGArg arg2, bool const_a2)
1525 {
1526     if (have_bmi1) {
1527         tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1528         if (const_a2) {
1529             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1530         } else {
1531             tcg_debug_assert(dest != arg2);
1532             tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1533         }
1534     } else {
1535         tcg_debug_assert(dest != arg2);
1536         tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1537         tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1538     }
1539 }
1540
1541 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1542                         TCGArg arg2, bool const_a2)
1543 {
1544     if (have_lzcnt) {
1545         tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1546         if (const_a2) {
1547             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1548         } else {
1549             tcg_debug_assert(dest != arg2);
1550             tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1551         }
1552     } else {
1553         tcg_debug_assert(!const_a2);
1554         tcg_debug_assert(dest != arg1);
1555         tcg_debug_assert(dest != arg2);
1556
1557         /* Recall that the output of BSR is the index not the count.  */
1558         tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1559         tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1560
1561         /* Since we have destroyed the flags from BSR, we have to re-test.  */
1562         tcg_out_cmp(s, arg1, 0, 1, rexw);
1563         tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1564     }
1565 }
1566
1567 static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1568 {
1569     intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1570
1571     if (disp == (int32_t)disp) {
1572         tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1573         tcg_out32(s, disp);
1574     } else {
1575         /* rip-relative addressing into the constant pool.
1576            This is 6 + 8 = 14 bytes, as compared to using an
1577            an immediate load 10 + 6 = 16 bytes, plus we may
1578            be able to re-use the pool constant for more calls.  */
1579         tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1580         tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1581         new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1582         tcg_out32(s, 0);
1583     }
1584 }
1585
1586 static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest)
1587 {
1588     tcg_out_branch(s, 1, dest);
1589 }
1590
1591 static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1592 {
1593     tcg_out_branch(s, 0, dest);
1594 }
1595
1596 static void tcg_out_nopn(TCGContext *s, int n)
1597 {
1598     int i;
1599     /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1600      * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1601      * duplicate prefix, and all of the interesting recent cores can
1602      * decode and discard the duplicates in a single cycle.
1603      */
1604     tcg_debug_assert(n >= 1);
1605     for (i = 1; i < n; ++i) {
1606         tcg_out8(s, 0x66);
1607     }
1608     tcg_out8(s, 0x90);
1609 }
1610
1611 #if defined(CONFIG_SOFTMMU)
1612 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1613  *                                     int mmu_idx, uintptr_t ra)
1614  */
1615 static void * const qemu_ld_helpers[(MO_SIZE | MO_BSWAP) + 1] = {
1616     [MO_UB]   = helper_ret_ldub_mmu,
1617     [MO_LEUW] = helper_le_lduw_mmu,
1618     [MO_LEUL] = helper_le_ldul_mmu,
1619     [MO_LEUQ] = helper_le_ldq_mmu,
1620     [MO_BEUW] = helper_be_lduw_mmu,
1621     [MO_BEUL] = helper_be_ldul_mmu,
1622     [MO_BEUQ] = helper_be_ldq_mmu,
1623 };
1624
1625 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1626  *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
1627  */
1628 static void * const qemu_st_helpers[(MO_SIZE | MO_BSWAP) + 1] = {
1629     [MO_UB]   = helper_ret_stb_mmu,
1630     [MO_LEUW] = helper_le_stw_mmu,
1631     [MO_LEUL] = helper_le_stl_mmu,
1632     [MO_LEUQ] = helper_le_stq_mmu,
1633     [MO_BEUW] = helper_be_stw_mmu,
1634     [MO_BEUL] = helper_be_stl_mmu,
1635     [MO_BEUQ] = helper_be_stq_mmu,
1636 };
1637
1638 /* Perform the TLB load and compare.
1639
1640    Inputs:
1641    ADDRLO and ADDRHI contain the low and high part of the address.
1642
1643    MEM_INDEX and S_BITS are the memory context and log2 size of the load.
1644
1645    WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1646    This should be offsetof addr_read or addr_write.
1647
1648    Outputs:
1649    LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1650    positions of the displacements of forward jumps to the TLB miss case.
1651
1652    Second argument register is loaded with the low part of the address.
1653    In the TLB hit case, it has been adjusted as indicated by the TLB
1654    and so is a host address.  In the TLB miss case, it continues to
1655    hold a guest address.
1656
1657    First argument register is clobbered.  */
1658
1659 static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1660                                     int mem_index, MemOp opc,
1661                                     tcg_insn_unit **label_ptr, int which)
1662 {
1663     const TCGReg r0 = TCG_REG_L0;
1664     const TCGReg r1 = TCG_REG_L1;
1665     TCGType ttype = TCG_TYPE_I32;
1666     TCGType tlbtype = TCG_TYPE_I32;
1667     int trexw = 0, hrexw = 0, tlbrexw = 0;
1668     unsigned a_bits = get_alignment_bits(opc);
1669     unsigned s_bits = opc & MO_SIZE;
1670     unsigned a_mask = (1 << a_bits) - 1;
1671     unsigned s_mask = (1 << s_bits) - 1;
1672     target_ulong tlb_mask;
1673
1674     if (TCG_TARGET_REG_BITS == 64) {
1675         if (TARGET_LONG_BITS == 64) {
1676             ttype = TCG_TYPE_I64;
1677             trexw = P_REXW;
1678         }
1679         if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1680             hrexw = P_REXW;
1681             if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
1682                 tlbtype = TCG_TYPE_I64;
1683                 tlbrexw = P_REXW;
1684             }
1685         }
1686     }
1687
1688     tcg_out_mov(s, tlbtype, r0, addrlo);
1689     tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1690                    TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1691
1692     tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
1693                          TLB_MASK_TABLE_OFS(mem_index) +
1694                          offsetof(CPUTLBDescFast, mask));
1695
1696     tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
1697                          TLB_MASK_TABLE_OFS(mem_index) +
1698                          offsetof(CPUTLBDescFast, table));
1699
1700     /* If the required alignment is at least as large as the access, simply
1701        copy the address and mask.  For lesser alignments, check that we don't
1702        cross pages for the complete access.  */
1703     if (a_bits >= s_bits) {
1704         tcg_out_mov(s, ttype, r1, addrlo);
1705     } else {
1706         tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1707     }
1708     tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1709     tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1710
1711     /* cmp 0(r0), r1 */
1712     tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);
1713
1714     /* Prepare for both the fast path add of the tlb addend, and the slow
1715        path function argument setup.  */
1716     tcg_out_mov(s, ttype, r1, addrlo);
1717
1718     /* jne slow_path */
1719     tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1720     label_ptr[0] = s->code_ptr;
1721     s->code_ptr += 4;
1722
1723     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1724         /* cmp 4(r0), addrhi */
1725         tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);
1726
1727         /* jne slow_path */
1728         tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1729         label_ptr[1] = s->code_ptr;
1730         s->code_ptr += 4;
1731     }
1732
1733     /* TLB Hit.  */
1734
1735     /* add addend(r0), r1 */
1736     tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1737                          offsetof(CPUTLBEntry, addend));
1738 }
1739
1740 /*
1741  * Record the context of a call to the out of line helper code for the slow path
1742  * for a load or store, so that we can later generate the correct helper code
1743  */
1744 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
1745                                 MemOpIdx oi,
1746                                 TCGReg datalo, TCGReg datahi,
1747                                 TCGReg addrlo, TCGReg addrhi,
1748                                 tcg_insn_unit *raddr,
1749                                 tcg_insn_unit **label_ptr)
1750 {
1751     TCGLabelQemuLdst *label = new_ldst_label(s);
1752
1753     label->is_ld = is_ld;
1754     label->oi = oi;
1755     label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
1756     label->datalo_reg = datalo;
1757     label->datahi_reg = datahi;
1758     label->addrlo_reg = addrlo;
1759     label->addrhi_reg = addrhi;
1760     label->raddr = tcg_splitwx_to_rx(raddr);
1761     label->label_ptr[0] = label_ptr[0];
1762     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1763         label->label_ptr[1] = label_ptr[1];
1764     }
1765 }
1766
1767 /*
1768  * Generate code for the slow path for a load at the end of block
1769  */
1770 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1771 {
1772     MemOpIdx oi = l->oi;
1773     MemOp opc = get_memop(oi);
1774     TCGReg data_reg;
1775     tcg_insn_unit **label_ptr = &l->label_ptr[0];
1776     int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
1777
1778     /* resolve label address */
1779     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1780     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1781         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1782     }
1783
1784     if (TCG_TARGET_REG_BITS == 32) {
1785         int ofs = 0;
1786
1787         tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1788         ofs += 4;
1789
1790         tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1791         ofs += 4;
1792
1793         if (TARGET_LONG_BITS == 64) {
1794             tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1795             ofs += 4;
1796         }
1797
1798         tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1799         ofs += 4;
1800
1801         tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1802     } else {
1803         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1804         /* The second argument is already loaded with addrlo.  */
1805         tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1806         tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1807                      (uintptr_t)l->raddr);
1808     }
1809
1810     tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1811
1812     data_reg = l->datalo_reg;
1813     switch (opc & MO_SSIZE) {
1814     case MO_SB:
1815         tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
1816         break;
1817     case MO_SW:
1818         tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
1819         break;
1820 #if TCG_TARGET_REG_BITS == 64
1821     case MO_SL:
1822         tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1823         break;
1824 #endif
1825     case MO_UB:
1826     case MO_UW:
1827         /* Note that the helpers have zero-extended to tcg_target_long.  */
1828     case MO_UL:
1829         tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1830         break;
1831     case MO_UQ:
1832         if (TCG_TARGET_REG_BITS == 64) {
1833             tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1834         } else if (data_reg == TCG_REG_EDX) {
1835             /* xchg %edx, %eax */
1836             tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1837             tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1838         } else {
1839             tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1840             tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1841         }
1842         break;
1843     default:
1844         tcg_abort();
1845     }
1846
1847     /* Jump to the code corresponding to next IR of qemu_st */
1848     tcg_out_jmp(s, l->raddr);
1849     return true;
1850 }
1851
1852 /*
1853  * Generate code for the slow path for a store at the end of block
1854  */
1855 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1856 {
1857     MemOpIdx oi = l->oi;
1858     MemOp opc = get_memop(oi);
1859     MemOp s_bits = opc & MO_SIZE;
1860     tcg_insn_unit **label_ptr = &l->label_ptr[0];
1861     TCGReg retaddr;
1862
1863     /* resolve label address */
1864     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1865     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1866         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1867     }
1868
1869     if (TCG_TARGET_REG_BITS == 32) {
1870         int ofs = 0;
1871
1872         tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1873         ofs += 4;
1874
1875         tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1876         ofs += 4;
1877
1878         if (TARGET_LONG_BITS == 64) {
1879             tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1880             ofs += 4;
1881         }
1882
1883         tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1884         ofs += 4;
1885
1886         if (s_bits == MO_64) {
1887             tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1888             ofs += 4;
1889         }
1890
1891         tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1892         ofs += 4;
1893
1894         retaddr = TCG_REG_EAX;
1895         tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1896         tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1897     } else {
1898         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1899         /* The second argument is already loaded with addrlo.  */
1900         tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1901                     tcg_target_call_iarg_regs[2], l->datalo_reg);
1902         tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1903
1904         if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1905             retaddr = tcg_target_call_iarg_regs[4];
1906             tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1907         } else {
1908             retaddr = TCG_REG_RAX;
1909             tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1910             tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1911                        TCG_TARGET_CALL_STACK_OFFSET);
1912         }
1913     }
1914
1915     /* "Tail call" to the helper, with the return address back inline.  */
1916     tcg_out_push(s, retaddr);
1917     tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1918     return true;
1919 }
1920 #else
1921
1922 static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addrlo,
1923                                    TCGReg addrhi, unsigned a_bits)
1924 {
1925     unsigned a_mask = (1 << a_bits) - 1;
1926     TCGLabelQemuLdst *label;
1927
1928     /*
1929      * We are expecting a_bits to max out at 7, so we can usually use testb.
1930      * For i686, we have to use testl for %esi/%edi.
1931      */
1932     if (a_mask <= 0xff && (TCG_TARGET_REG_BITS == 64 || addrlo < 4)) {
1933         tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, addrlo);
1934         tcg_out8(s, a_mask);
1935     } else {
1936         tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, addrlo);
1937         tcg_out32(s, a_mask);
1938     }
1939
1940     /* jne slow_path */
1941     tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1942
1943     label = new_ldst_label(s);
1944     label->is_ld = is_ld;
1945     label->addrlo_reg = addrlo;
1946     label->addrhi_reg = addrhi;
1947     label->raddr = tcg_splitwx_to_rx(s->code_ptr + 4);
1948     label->label_ptr[0] = s->code_ptr;
1949
1950     s->code_ptr += 4;
1951 }
1952
1953 static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
1954 {
1955     /* resolve label address */
1956     tcg_patch32(l->label_ptr[0], s->code_ptr - l->label_ptr[0] - 4);
1957
1958     if (TCG_TARGET_REG_BITS == 32) {
1959         int ofs = 0;
1960
1961         tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1962         ofs += 4;
1963
1964         tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1965         ofs += 4;
1966         if (TARGET_LONG_BITS == 64) {
1967             tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1968             ofs += 4;
1969         }
1970
1971         tcg_out_pushi(s, (uintptr_t)l->raddr);
1972     } else {
1973         tcg_out_mov(s, TCG_TYPE_TL, tcg_target_call_iarg_regs[1],
1974                     l->addrlo_reg);
1975         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1976
1977         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RAX, (uintptr_t)l->raddr);
1978         tcg_out_push(s, TCG_REG_RAX);
1979     }
1980
1981     /* "Tail call" to the helper, with the return address back inline. */
1982     tcg_out_jmp(s, (const void *)(l->is_ld ? helper_unaligned_ld
1983                                   : helper_unaligned_st));
1984     return true;
1985 }
1986
1987 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1988 {
1989     return tcg_out_fail_alignment(s, l);
1990 }
1991
1992 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1993 {
1994     return tcg_out_fail_alignment(s, l);
1995 }
1996
1997 #if TCG_TARGET_REG_BITS == 32
1998 # define x86_guest_base_seg     0
1999 # define x86_guest_base_index   -1
2000 # define x86_guest_base_offset  guest_base
2001 #else
2002 static int x86_guest_base_seg;
2003 static int x86_guest_base_index = -1;
2004 static int32_t x86_guest_base_offset;
2005 # if defined(__x86_64__) && defined(__linux__)
2006 #  include <asm/prctl.h>
2007 #  include <sys/prctl.h>
2008 int arch_prctl(int code, unsigned long addr);
2009 static inline int setup_guest_base_seg(void)
2010 {
2011     if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
2012         return P_GS;
2013     }
2014     return 0;
2015 }
2016 # elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
2017 #  include <machine/sysarch.h>
2018 static inline int setup_guest_base_seg(void)
2019 {
2020     if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
2021         return P_GS;
2022     }
2023     return 0;
2024 }
2025 # else
2026 static inline int setup_guest_base_seg(void)
2027 {
2028     return 0;
2029 }
2030 # endif
2031 #endif
2032 #endif /* SOFTMMU */
2033
2034 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2035                                    TCGReg base, int index, intptr_t ofs,
2036                                    int seg, bool is64, MemOp memop)
2037 {
2038     bool use_movbe = false;
2039     int rexw = is64 * P_REXW;
2040     int movop = OPC_MOVL_GvEv;
2041
2042     /* Do big-endian loads with movbe.  */
2043     if (memop & MO_BSWAP) {
2044         tcg_debug_assert(have_movbe);
2045         use_movbe = true;
2046         movop = OPC_MOVBE_GyMy;
2047     }
2048
2049     switch (memop & MO_SSIZE) {
2050     case MO_UB:
2051         tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
2052                                  base, index, 0, ofs);
2053         break;
2054     case MO_SB:
2055         tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
2056                                  base, index, 0, ofs);
2057         break;
2058     case MO_UW:
2059         if (use_movbe) {
2060             /* There is no extending movbe; only low 16-bits are modified.  */
2061             if (datalo != base && datalo != index) {
2062                 /* XOR breaks dependency chains.  */
2063                 tgen_arithr(s, ARITH_XOR, datalo, datalo);
2064                 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2065                                          datalo, base, index, 0, ofs);
2066             } else {
2067                 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2068                                          datalo, base, index, 0, ofs);
2069                 tcg_out_ext16u(s, datalo, datalo);
2070             }
2071         } else {
2072             tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
2073                                      base, index, 0, ofs);
2074         }
2075         break;
2076     case MO_SW:
2077         if (use_movbe) {
2078             tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2079                                      datalo, base, index, 0, ofs);
2080             tcg_out_ext16s(s, datalo, datalo, rexw);
2081         } else {
2082             tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
2083                                      datalo, base, index, 0, ofs);
2084         }
2085         break;
2086     case MO_UL:
2087         tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2088         break;
2089 #if TCG_TARGET_REG_BITS == 64
2090     case MO_SL:
2091         if (use_movbe) {
2092             tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + seg, datalo,
2093                                      base, index, 0, ofs);
2094             tcg_out_ext32s(s, datalo, datalo);
2095         } else {
2096             tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
2097                                      base, index, 0, ofs);
2098         }
2099         break;
2100 #endif
2101     case MO_UQ:
2102         if (TCG_TARGET_REG_BITS == 64) {
2103             tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2104                                      base, index, 0, ofs);
2105         } else {
2106             if (use_movbe) {
2107                 TCGReg t = datalo;
2108                 datalo = datahi;
2109                 datahi = t;
2110             }
2111             if (base != datalo) {
2112                 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2113                                          base, index, 0, ofs);
2114                 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2115                                          base, index, 0, ofs + 4);
2116             } else {
2117                 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2118                                          base, index, 0, ofs + 4);
2119                 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2120                                          base, index, 0, ofs);
2121             }
2122         }
2123         break;
2124     default:
2125         g_assert_not_reached();
2126     }
2127 }
2128
2129 /* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
2130    EAX. It will be useful once fixed registers globals are less
2131    common. */
2132 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
2133 {
2134     TCGReg datalo, datahi, addrlo;
2135     TCGReg addrhi __attribute__((unused));
2136     MemOpIdx oi;
2137     MemOp opc;
2138 #if defined(CONFIG_SOFTMMU)
2139     int mem_index;
2140     tcg_insn_unit *label_ptr[2];
2141 #else
2142     unsigned a_bits;
2143 #endif
2144
2145     datalo = *args++;
2146     datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2147     addrlo = *args++;
2148     addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2149     oi = *args++;
2150     opc = get_memop(oi);
2151
2152 #if defined(CONFIG_SOFTMMU)
2153     mem_index = get_mmuidx(oi);
2154
2155     tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2156                      label_ptr, offsetof(CPUTLBEntry, addr_read));
2157
2158     /* TLB Hit.  */
2159     tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
2160
2161     /* Record the current context of a load into ldst label */
2162     add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2163                         s->code_ptr, label_ptr);
2164 #else
2165     a_bits = get_alignment_bits(opc);
2166     if (a_bits) {
2167         tcg_out_test_alignment(s, true, addrlo, addrhi, a_bits);
2168     }
2169
2170     tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2171                            x86_guest_base_offset, x86_guest_base_seg,
2172                            is64, opc);
2173 #endif
2174 }
2175
2176 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2177                                    TCGReg base, int index, intptr_t ofs,
2178                                    int seg, MemOp memop)
2179 {
2180     bool use_movbe = false;
2181     int movop = OPC_MOVL_EvGv;
2182
2183     /*
2184      * Do big-endian stores with movbe or softmmu.
2185      * User-only without movbe will have its swapping done generically.
2186      */
2187     if (memop & MO_BSWAP) {
2188         tcg_debug_assert(have_movbe);
2189         use_movbe = true;
2190         movop = OPC_MOVBE_MyGy;
2191     }
2192
2193     switch (memop & MO_SIZE) {
2194     case MO_8:
2195         /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2196         tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2197         tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2198                                  datalo, base, index, 0, ofs);
2199         break;
2200     case MO_16:
2201         tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
2202                                  base, index, 0, ofs);
2203         break;
2204     case MO_32:
2205         tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2206         break;
2207     case MO_64:
2208         if (TCG_TARGET_REG_BITS == 64) {
2209             tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2210                                      base, index, 0, ofs);
2211         } else {
2212             if (use_movbe) {
2213                 TCGReg t = datalo;
2214                 datalo = datahi;
2215                 datahi = t;
2216             }
2217             tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2218                                      base, index, 0, ofs);
2219             tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2220                                      base, index, 0, ofs + 4);
2221         }
2222         break;
2223     default:
2224         g_assert_not_reached();
2225     }
2226 }
2227
2228 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2229 {
2230     TCGReg datalo, datahi, addrlo;
2231     TCGReg addrhi __attribute__((unused));
2232     MemOpIdx oi;
2233     MemOp opc;
2234 #if defined(CONFIG_SOFTMMU)
2235     int mem_index;
2236     tcg_insn_unit *label_ptr[2];
2237 #else
2238     unsigned a_bits;
2239 #endif
2240
2241     datalo = *args++;
2242     datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2243     addrlo = *args++;
2244     addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2245     oi = *args++;
2246     opc = get_memop(oi);
2247
2248 #if defined(CONFIG_SOFTMMU)
2249     mem_index = get_mmuidx(oi);
2250
2251     tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2252                      label_ptr, offsetof(CPUTLBEntry, addr_write));
2253
2254     /* TLB Hit.  */
2255     tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2256
2257     /* Record the current context of a store into ldst label */
2258     add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
2259                         s->code_ptr, label_ptr);
2260 #else
2261     a_bits = get_alignment_bits(opc);
2262     if (a_bits) {
2263         tcg_out_test_alignment(s, false, addrlo, addrhi, a_bits);
2264     }
2265
2266     tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2267                            x86_guest_base_offset, x86_guest_base_seg, opc);
2268 #endif
2269 }
2270
2271 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2272                               const TCGArg args[TCG_MAX_OP_ARGS],
2273                               const int const_args[TCG_MAX_OP_ARGS])
2274 {
2275     TCGArg a0, a1, a2;
2276     int c, const_a2, vexop, rexw = 0;
2277
2278 #if TCG_TARGET_REG_BITS == 64
2279 # define OP_32_64(x) \
2280         case glue(glue(INDEX_op_, x), _i64): \
2281             rexw = P_REXW; /* FALLTHRU */    \
2282         case glue(glue(INDEX_op_, x), _i32)
2283 #else
2284 # define OP_32_64(x) \
2285         case glue(glue(INDEX_op_, x), _i32)
2286 #endif
2287
2288     /* Hoist the loads of the most common arguments.  */
2289     a0 = args[0];
2290     a1 = args[1];
2291     a2 = args[2];
2292     const_a2 = const_args[2];
2293
2294     switch (opc) {
2295     case INDEX_op_exit_tb:
2296         /* Reuse the zeroing that exists for goto_ptr.  */
2297         if (a0 == 0) {
2298             tcg_out_jmp(s, tcg_code_gen_epilogue);
2299         } else {
2300             tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2301             tcg_out_jmp(s, tb_ret_addr);
2302         }
2303         break;
2304     case INDEX_op_goto_tb:
2305         if (s->tb_jmp_insn_offset) {
2306             /* direct jump method */
2307             int gap;
2308             /* jump displacement must be aligned for atomic patching;
2309              * see if we need to add extra nops before jump
2310              */
2311             gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2312             if (gap != 1) {
2313                 tcg_out_nopn(s, gap - 1);
2314             }
2315             tcg_out8(s, OPC_JMP_long); /* jmp im */
2316             s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2317             tcg_out32(s, 0);
2318         } else {
2319             /* indirect jump method */
2320             tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2321                                  (intptr_t)(s->tb_jmp_target_addr + a0));
2322         }
2323         set_jmp_reset_offset(s, a0);
2324         break;
2325     case INDEX_op_goto_ptr:
2326         /* jmp to the given host address (could be epilogue) */
2327         tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2328         break;
2329     case INDEX_op_br:
2330         tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2331         break;
2332     OP_32_64(ld8u):
2333         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2334         tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2335         break;
2336     OP_32_64(ld8s):
2337         tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2338         break;
2339     OP_32_64(ld16u):
2340         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2341         tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2342         break;
2343     OP_32_64(ld16s):
2344         tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2345         break;
2346 #if TCG_TARGET_REG_BITS == 64
2347     case INDEX_op_ld32u_i64:
2348 #endif
2349     case INDEX_op_ld_i32:
2350         tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2351         break;
2352
2353     OP_32_64(st8):
2354         if (const_args[0]) {
2355             tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2356             tcg_out8(s, a0);
2357         } else {
2358             tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2359         }
2360         break;
2361     OP_32_64(st16):
2362         if (const_args[0]) {
2363             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2364             tcg_out16(s, a0);
2365         } else {
2366             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2367         }
2368         break;
2369 #if TCG_TARGET_REG_BITS == 64
2370     case INDEX_op_st32_i64:
2371 #endif
2372     case INDEX_op_st_i32:
2373         if (const_args[0]) {
2374             tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2375             tcg_out32(s, a0);
2376         } else {
2377             tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2378         }
2379         break;
2380
2381     OP_32_64(add):
2382         /* For 3-operand addition, use LEA.  */
2383         if (a0 != a1) {
2384             TCGArg c3 = 0;
2385             if (const_a2) {
2386                 c3 = a2, a2 = -1;
2387             } else if (a0 == a2) {
2388                 /* Watch out for dest = src + dest, since we've removed
2389                    the matching constraint on the add.  */
2390                 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2391                 break;
2392             }
2393
2394             tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2395             break;
2396         }
2397         c = ARITH_ADD;
2398         goto gen_arith;
2399     OP_32_64(sub):
2400         c = ARITH_SUB;
2401         goto gen_arith;
2402     OP_32_64(and):
2403         c = ARITH_AND;
2404         goto gen_arith;
2405     OP_32_64(or):
2406         c = ARITH_OR;
2407         goto gen_arith;
2408     OP_32_64(xor):
2409         c = ARITH_XOR;
2410         goto gen_arith;
2411     gen_arith:
2412         if (const_a2) {
2413             tgen_arithi(s, c + rexw, a0, a2, 0);
2414         } else {
2415             tgen_arithr(s, c + rexw, a0, a2);
2416         }
2417         break;
2418
2419     OP_32_64(andc):
2420         if (const_a2) {
2421             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2422             tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2423         } else {
2424             tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2425         }
2426         break;
2427
2428     OP_32_64(mul):
2429         if (const_a2) {
2430             int32_t val;
2431             val = a2;
2432             if (val == (int8_t)val) {
2433                 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2434                 tcg_out8(s, val);
2435             } else {
2436                 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2437                 tcg_out32(s, val);
2438             }
2439         } else {
2440             tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2441         }
2442         break;
2443
2444     OP_32_64(div2):
2445         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2446         break;
2447     OP_32_64(divu2):
2448         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2449         break;
2450
2451     OP_32_64(shl):
2452         /* For small constant 3-operand shift, use LEA.  */
2453         if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2454             if (a2 - 1 == 0) {
2455                 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2456                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2457             } else {
2458                 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2459                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2460             }
2461             break;
2462         }
2463         c = SHIFT_SHL;
2464         vexop = OPC_SHLX;
2465         goto gen_shift_maybe_vex;
2466     OP_32_64(shr):
2467         c = SHIFT_SHR;
2468         vexop = OPC_SHRX;
2469         goto gen_shift_maybe_vex;
2470     OP_32_64(sar):
2471         c = SHIFT_SAR;
2472         vexop = OPC_SARX;
2473         goto gen_shift_maybe_vex;
2474     OP_32_64(rotl):
2475         c = SHIFT_ROL;
2476         goto gen_shift;
2477     OP_32_64(rotr):
2478         c = SHIFT_ROR;
2479         goto gen_shift;
2480     gen_shift_maybe_vex:
2481         if (have_bmi2) {
2482             if (!const_a2) {
2483                 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2484                 break;
2485             }
2486             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2487         }
2488         /* FALLTHRU */
2489     gen_shift:
2490         if (const_a2) {
2491             tcg_out_shifti(s, c + rexw, a0, a2);
2492         } else {
2493             tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2494         }
2495         break;
2496
2497     OP_32_64(ctz):
2498         tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2499         break;
2500     OP_32_64(clz):
2501         tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2502         break;
2503     OP_32_64(ctpop):
2504         tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2505         break;
2506
2507     case INDEX_op_brcond_i32:
2508         tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2509         break;
2510     case INDEX_op_setcond_i32:
2511         tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2512         break;
2513     case INDEX_op_movcond_i32:
2514         tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2515         break;
2516
2517     OP_32_64(bswap16):
2518         if (a2 & TCG_BSWAP_OS) {
2519             /* Output must be sign-extended. */
2520             if (rexw) {
2521                 tcg_out_bswap64(s, a0);
2522                 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2523             } else {
2524                 tcg_out_bswap32(s, a0);
2525                 tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2526             }
2527         } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2528             /* Output must be zero-extended, but input isn't. */
2529             tcg_out_bswap32(s, a0);
2530             tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2531         } else {
2532             tcg_out_rolw_8(s, a0);
2533         }
2534         break;
2535     OP_32_64(bswap32):
2536         tcg_out_bswap32(s, a0);
2537         if (rexw && (a2 & TCG_BSWAP_OS)) {
2538             tcg_out_ext32s(s, a0, a0);
2539         }
2540         break;
2541
2542     OP_32_64(neg):
2543         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2544         break;
2545     OP_32_64(not):
2546         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2547         break;
2548
2549     OP_32_64(ext8s):
2550         tcg_out_ext8s(s, a0, a1, rexw);
2551         break;
2552     OP_32_64(ext16s):
2553         tcg_out_ext16s(s, a0, a1, rexw);
2554         break;
2555     OP_32_64(ext8u):
2556         tcg_out_ext8u(s, a0, a1);
2557         break;
2558     OP_32_64(ext16u):
2559         tcg_out_ext16u(s, a0, a1);
2560         break;
2561
2562     case INDEX_op_qemu_ld_i32:
2563         tcg_out_qemu_ld(s, args, 0);
2564         break;
2565     case INDEX_op_qemu_ld_i64:
2566         tcg_out_qemu_ld(s, args, 1);
2567         break;
2568     case INDEX_op_qemu_st_i32:
2569     case INDEX_op_qemu_st8_i32:
2570         tcg_out_qemu_st(s, args, 0);
2571         break;
2572     case INDEX_op_qemu_st_i64:
2573         tcg_out_qemu_st(s, args, 1);
2574         break;
2575
2576     OP_32_64(mulu2):
2577         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2578         break;
2579     OP_32_64(muls2):
2580         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2581         break;
2582     OP_32_64(add2):
2583         if (const_args[4]) {
2584             tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2585         } else {
2586             tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2587         }
2588         if (const_args[5]) {
2589             tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2590         } else {
2591             tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2592         }
2593         break;
2594     OP_32_64(sub2):
2595         if (const_args[4]) {
2596             tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2597         } else {
2598             tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2599         }
2600         if (const_args[5]) {
2601             tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2602         } else {
2603             tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2604         }
2605         break;
2606
2607 #if TCG_TARGET_REG_BITS == 32
2608     case INDEX_op_brcond2_i32:
2609         tcg_out_brcond2(s, args, const_args, 0);
2610         break;
2611     case INDEX_op_setcond2_i32:
2612         tcg_out_setcond2(s, args, const_args);
2613         break;
2614 #else /* TCG_TARGET_REG_BITS == 64 */
2615     case INDEX_op_ld32s_i64:
2616         tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2617         break;
2618     case INDEX_op_ld_i64:
2619         tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2620         break;
2621     case INDEX_op_st_i64:
2622         if (const_args[0]) {
2623             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2624             tcg_out32(s, a0);
2625         } else {
2626             tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2627         }
2628         break;
2629
2630     case INDEX_op_brcond_i64:
2631         tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2632         break;
2633     case INDEX_op_setcond_i64:
2634         tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2635         break;
2636     case INDEX_op_movcond_i64:
2637         tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2638         break;
2639
2640     case INDEX_op_bswap64_i64:
2641         tcg_out_bswap64(s, a0);
2642         break;
2643     case INDEX_op_extu_i32_i64:
2644     case INDEX_op_ext32u_i64:
2645     case INDEX_op_extrl_i64_i32:
2646         tcg_out_ext32u(s, a0, a1);
2647         break;
2648     case INDEX_op_ext_i32_i64:
2649     case INDEX_op_ext32s_i64:
2650         tcg_out_ext32s(s, a0, a1);
2651         break;
2652     case INDEX_op_extrh_i64_i32:
2653         tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2654         break;
2655 #endif
2656
2657     OP_32_64(deposit):
2658         if (args[3] == 0 && args[4] == 8) {
2659             /* load bits 0..7 */
2660             tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2661         } else if (args[3] == 8 && args[4] == 8) {
2662             /* load bits 8..15 */
2663             tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2664         } else if (args[3] == 0 && args[4] == 16) {
2665             /* load bits 0..15 */
2666             tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2667         } else {
2668             tcg_abort();
2669         }
2670         break;
2671
2672     case INDEX_op_extract_i64:
2673         if (a2 + args[3] == 32) {
2674             /* This is a 32-bit zero-extending right shift.  */
2675             tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2676             tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2677             break;
2678         }
2679         /* FALLTHRU */
2680     case INDEX_op_extract_i32:
2681         /* On the off-chance that we can use the high-byte registers.
2682            Otherwise we emit the same ext16 + shift pattern that we
2683            would have gotten from the normal tcg-op.c expansion.  */
2684         tcg_debug_assert(a2 == 8 && args[3] == 8);
2685         if (a1 < 4 && a0 < 8) {
2686             tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2687         } else {
2688             tcg_out_ext16u(s, a0, a1);
2689             tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2690         }
2691         break;
2692
2693     case INDEX_op_sextract_i32:
2694         /* We don't implement sextract_i64, as we cannot sign-extend to
2695            64-bits without using the REX prefix that explicitly excludes
2696            access to the high-byte registers.  */
2697         tcg_debug_assert(a2 == 8 && args[3] == 8);
2698         if (a1 < 4 && a0 < 8) {
2699             tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2700         } else {
2701             tcg_out_ext16s(s, a0, a1, 0);
2702             tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2703         }
2704         break;
2705
2706     OP_32_64(extract2):
2707         /* Note that SHRD outputs to the r/m operand.  */
2708         tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2709         tcg_out8(s, args[3]);
2710         break;
2711
2712     case INDEX_op_mb:
2713         tcg_out_mb(s, a0);
2714         break;
2715     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2716     case INDEX_op_mov_i64:
2717     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2718     default:
2719         tcg_abort();
2720     }
2721
2722 #undef OP_32_64
2723 }
2724
2725 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2726                            unsigned vecl, unsigned vece,
2727                            const TCGArg args[TCG_MAX_OP_ARGS],
2728                            const int const_args[TCG_MAX_OP_ARGS])
2729 {
2730     static int const add_insn[4] = {
2731         OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2732     };
2733     static int const ssadd_insn[4] = {
2734         OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2735     };
2736     static int const usadd_insn[4] = {
2737         OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2738     };
2739     static int const sub_insn[4] = {
2740         OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2741     };
2742     static int const sssub_insn[4] = {
2743         OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2744     };
2745     static int const ussub_insn[4] = {
2746         OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2747     };
2748     static int const mul_insn[4] = {
2749         OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
2750     };
2751     static int const shift_imm_insn[4] = {
2752         OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2753     };
2754     static int const cmpeq_insn[4] = {
2755         OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2756     };
2757     static int const cmpgt_insn[4] = {
2758         OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2759     };
2760     static int const punpckl_insn[4] = {
2761         OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2762     };
2763     static int const punpckh_insn[4] = {
2764         OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2765     };
2766     static int const packss_insn[4] = {
2767         OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2768     };
2769     static int const packus_insn[4] = {
2770         OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2771     };
2772     static int const smin_insn[4] = {
2773         OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
2774     };
2775     static int const smax_insn[4] = {
2776         OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
2777     };
2778     static int const umin_insn[4] = {
2779         OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
2780     };
2781     static int const umax_insn[4] = {
2782         OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
2783     };
2784     static int const shlv_insn[4] = {
2785         /* TODO: AVX512 adds support for MO_16.  */
2786         OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
2787     };
2788     static int const shrv_insn[4] = {
2789         /* TODO: AVX512 adds support for MO_16.  */
2790         OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
2791     };
2792     static int const sarv_insn[4] = {
2793         /* TODO: AVX512 adds support for MO_16, MO_64.  */
2794         OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
2795     };
2796     static int const shls_insn[4] = {
2797         OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2798     };
2799     static int const shrs_insn[4] = {
2800         OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2801     };
2802     static int const sars_insn[4] = {
2803         OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2
2804     };
2805     static int const abs_insn[4] = {
2806         /* TODO: AVX512 adds support for MO_64.  */
2807         OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
2808     };
2809
2810     TCGType type = vecl + TCG_TYPE_V64;
2811     int insn, sub;
2812     TCGArg a0, a1, a2;
2813
2814     a0 = args[0];
2815     a1 = args[1];
2816     a2 = args[2];
2817
2818     switch (opc) {
2819     case INDEX_op_add_vec:
2820         insn = add_insn[vece];
2821         goto gen_simd;
2822     case INDEX_op_ssadd_vec:
2823         insn = ssadd_insn[vece];
2824         goto gen_simd;
2825     case INDEX_op_usadd_vec:
2826         insn = usadd_insn[vece];
2827         goto gen_simd;
2828     case INDEX_op_sub_vec:
2829         insn = sub_insn[vece];
2830         goto gen_simd;
2831     case INDEX_op_sssub_vec:
2832         insn = sssub_insn[vece];
2833         goto gen_simd;
2834     case INDEX_op_ussub_vec:
2835         insn = ussub_insn[vece];
2836         goto gen_simd;
2837     case INDEX_op_mul_vec:
2838         insn = mul_insn[vece];
2839         goto gen_simd;
2840     case INDEX_op_and_vec:
2841         insn = OPC_PAND;
2842         goto gen_simd;
2843     case INDEX_op_or_vec:
2844         insn = OPC_POR;
2845         goto gen_simd;
2846     case INDEX_op_xor_vec:
2847         insn = OPC_PXOR;
2848         goto gen_simd;
2849     case INDEX_op_smin_vec:
2850         insn = smin_insn[vece];
2851         goto gen_simd;
2852     case INDEX_op_umin_vec:
2853         insn = umin_insn[vece];
2854         goto gen_simd;
2855     case INDEX_op_smax_vec:
2856         insn = smax_insn[vece];
2857         goto gen_simd;
2858     case INDEX_op_umax_vec:
2859         insn = umax_insn[vece];
2860         goto gen_simd;
2861     case INDEX_op_shlv_vec:
2862         insn = shlv_insn[vece];
2863         goto gen_simd;
2864     case INDEX_op_shrv_vec:
2865         insn = shrv_insn[vece];
2866         goto gen_simd;
2867     case INDEX_op_sarv_vec:
2868         insn = sarv_insn[vece];
2869         goto gen_simd;
2870     case INDEX_op_shls_vec:
2871         insn = shls_insn[vece];
2872         goto gen_simd;
2873     case INDEX_op_shrs_vec:
2874         insn = shrs_insn[vece];
2875         goto gen_simd;
2876     case INDEX_op_sars_vec:
2877         insn = sars_insn[vece];
2878         goto gen_simd;
2879     case INDEX_op_x86_punpckl_vec:
2880         insn = punpckl_insn[vece];
2881         goto gen_simd;
2882     case INDEX_op_x86_punpckh_vec:
2883         insn = punpckh_insn[vece];
2884         goto gen_simd;
2885     case INDEX_op_x86_packss_vec:
2886         insn = packss_insn[vece];
2887         goto gen_simd;
2888     case INDEX_op_x86_packus_vec:
2889         insn = packus_insn[vece];
2890         goto gen_simd;
2891 #if TCG_TARGET_REG_BITS == 32
2892     case INDEX_op_dup2_vec:
2893         /* First merge the two 32-bit inputs to a single 64-bit element. */
2894         tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
2895         /* Then replicate the 64-bit elements across the rest of the vector. */
2896         if (type != TCG_TYPE_V64) {
2897             tcg_out_dup_vec(s, type, MO_64, a0, a0);
2898         }
2899         break;
2900 #endif
2901     case INDEX_op_abs_vec:
2902         insn = abs_insn[vece];
2903         a2 = a1;
2904         a1 = 0;
2905         goto gen_simd;
2906     gen_simd:
2907         tcg_debug_assert(insn != OPC_UD2);
2908         if (type == TCG_TYPE_V256) {
2909             insn |= P_VEXL;
2910         }
2911         tcg_out_vex_modrm(s, insn, a0, a1, a2);
2912         break;
2913
2914     case INDEX_op_cmp_vec:
2915         sub = args[3];
2916         if (sub == TCG_COND_EQ) {
2917             insn = cmpeq_insn[vece];
2918         } else if (sub == TCG_COND_GT) {
2919             insn = cmpgt_insn[vece];
2920         } else {
2921             g_assert_not_reached();
2922         }
2923         goto gen_simd;
2924
2925     case INDEX_op_andc_vec:
2926         insn = OPC_PANDN;
2927         if (type == TCG_TYPE_V256) {
2928             insn |= P_VEXL;
2929         }
2930         tcg_out_vex_modrm(s, insn, a0, a2, a1);
2931         break;
2932
2933     case INDEX_op_shli_vec:
2934         sub = 6;
2935         goto gen_shift;
2936     case INDEX_op_shri_vec:
2937         sub = 2;
2938         goto gen_shift;
2939     case INDEX_op_sari_vec:
2940         tcg_debug_assert(vece != MO_64);
2941         sub = 4;
2942     gen_shift:
2943         tcg_debug_assert(vece != MO_8);
2944         insn = shift_imm_insn[vece];
2945         if (type == TCG_TYPE_V256) {
2946             insn |= P_VEXL;
2947         }
2948         tcg_out_vex_modrm(s, insn, sub, a0, a1);
2949         tcg_out8(s, a2);
2950         break;
2951
2952     case INDEX_op_ld_vec:
2953         tcg_out_ld(s, type, a0, a1, a2);
2954         break;
2955     case INDEX_op_st_vec:
2956         tcg_out_st(s, type, a0, a1, a2);
2957         break;
2958     case INDEX_op_dupm_vec:
2959         tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
2960         break;
2961
2962     case INDEX_op_x86_shufps_vec:
2963         insn = OPC_SHUFPS;
2964         sub = args[3];
2965         goto gen_simd_imm8;
2966     case INDEX_op_x86_blend_vec:
2967         if (vece == MO_16) {
2968             insn = OPC_PBLENDW;
2969         } else if (vece == MO_32) {
2970             insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2971         } else {
2972             g_assert_not_reached();
2973         }
2974         sub = args[3];
2975         goto gen_simd_imm8;
2976     case INDEX_op_x86_vperm2i128_vec:
2977         insn = OPC_VPERM2I128;
2978         sub = args[3];
2979         goto gen_simd_imm8;
2980     gen_simd_imm8:
2981         if (type == TCG_TYPE_V256) {
2982             insn |= P_VEXL;
2983         }
2984         tcg_out_vex_modrm(s, insn, a0, a1, a2);
2985         tcg_out8(s, sub);
2986         break;
2987
2988     case INDEX_op_x86_vpblendvb_vec:
2989         insn = OPC_VPBLENDVB;
2990         if (type == TCG_TYPE_V256) {
2991             insn |= P_VEXL;
2992         }
2993         tcg_out_vex_modrm(s, insn, a0, a1, a2);
2994         tcg_out8(s, args[3] << 4);
2995         break;
2996
2997     case INDEX_op_x86_psrldq_vec:
2998         tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
2999         tcg_out8(s, a2);
3000         break;
3001
3002     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3003     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3004     default:
3005         g_assert_not_reached();
3006     }
3007 }
3008
3009 static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3010 {
3011     switch (op) {
3012     case INDEX_op_goto_ptr:
3013         return C_O0_I1(r);
3014
3015     case INDEX_op_ld8u_i32:
3016     case INDEX_op_ld8u_i64:
3017     case INDEX_op_ld8s_i32:
3018     case INDEX_op_ld8s_i64:
3019     case INDEX_op_ld16u_i32:
3020     case INDEX_op_ld16u_i64:
3021     case INDEX_op_ld16s_i32:
3022     case INDEX_op_ld16s_i64:
3023     case INDEX_op_ld_i32:
3024     case INDEX_op_ld32u_i64:
3025     case INDEX_op_ld32s_i64:
3026     case INDEX_op_ld_i64:
3027         return C_O1_I1(r, r);
3028
3029     case INDEX_op_st8_i32:
3030     case INDEX_op_st8_i64:
3031         return C_O0_I2(qi, r);
3032
3033     case INDEX_op_st16_i32:
3034     case INDEX_op_st16_i64:
3035     case INDEX_op_st_i32:
3036     case INDEX_op_st32_i64:
3037         return C_O0_I2(ri, r);
3038
3039     case INDEX_op_st_i64:
3040         return C_O0_I2(re, r);
3041
3042     case INDEX_op_add_i32:
3043     case INDEX_op_add_i64:
3044         return C_O1_I2(r, r, re);
3045
3046     case INDEX_op_sub_i32:
3047     case INDEX_op_sub_i64:
3048     case INDEX_op_mul_i32:
3049     case INDEX_op_mul_i64:
3050     case INDEX_op_or_i32:
3051     case INDEX_op_or_i64:
3052     case INDEX_op_xor_i32:
3053     case INDEX_op_xor_i64:
3054         return C_O1_I2(r, 0, re);
3055
3056     case INDEX_op_and_i32:
3057     case INDEX_op_and_i64:
3058         return C_O1_I2(r, 0, reZ);
3059
3060     case INDEX_op_andc_i32:
3061     case INDEX_op_andc_i64:
3062         return C_O1_I2(r, r, rI);
3063
3064     case INDEX_op_shl_i32:
3065     case INDEX_op_shl_i64:
3066     case INDEX_op_shr_i32:
3067     case INDEX_op_shr_i64:
3068     case INDEX_op_sar_i32:
3069     case INDEX_op_sar_i64:
3070         return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3071
3072     case INDEX_op_rotl_i32:
3073     case INDEX_op_rotl_i64:
3074     case INDEX_op_rotr_i32:
3075     case INDEX_op_rotr_i64:
3076         return C_O1_I2(r, 0, ci);
3077
3078     case INDEX_op_brcond_i32:
3079     case INDEX_op_brcond_i64:
3080         return C_O0_I2(r, re);
3081
3082     case INDEX_op_bswap16_i32:
3083     case INDEX_op_bswap16_i64:
3084     case INDEX_op_bswap32_i32:
3085     case INDEX_op_bswap32_i64:
3086     case INDEX_op_bswap64_i64:
3087     case INDEX_op_neg_i32:
3088     case INDEX_op_neg_i64:
3089     case INDEX_op_not_i32:
3090     case INDEX_op_not_i64:
3091     case INDEX_op_extrh_i64_i32:
3092         return C_O1_I1(r, 0);
3093
3094     case INDEX_op_ext8s_i32:
3095     case INDEX_op_ext8s_i64:
3096     case INDEX_op_ext8u_i32:
3097     case INDEX_op_ext8u_i64:
3098         return C_O1_I1(r, q);
3099
3100     case INDEX_op_ext16s_i32:
3101     case INDEX_op_ext16s_i64:
3102     case INDEX_op_ext16u_i32:
3103     case INDEX_op_ext16u_i64:
3104     case INDEX_op_ext32s_i64:
3105     case INDEX_op_ext32u_i64:
3106     case INDEX_op_ext_i32_i64:
3107     case INDEX_op_extu_i32_i64:
3108     case INDEX_op_extrl_i64_i32:
3109     case INDEX_op_extract_i32:
3110     case INDEX_op_extract_i64:
3111     case INDEX_op_sextract_i32:
3112     case INDEX_op_ctpop_i32:
3113     case INDEX_op_ctpop_i64:
3114         return C_O1_I1(r, r);
3115
3116     case INDEX_op_extract2_i32:
3117     case INDEX_op_extract2_i64:
3118         return C_O1_I2(r, 0, r);
3119
3120     case INDEX_op_deposit_i32:
3121     case INDEX_op_deposit_i64:
3122         return C_O1_I2(Q, 0, Q);
3123
3124     case INDEX_op_setcond_i32:
3125     case INDEX_op_setcond_i64:
3126         return C_O1_I2(q, r, re);
3127
3128     case INDEX_op_movcond_i32:
3129     case INDEX_op_movcond_i64:
3130         return C_O1_I4(r, r, re, r, 0);
3131
3132     case INDEX_op_div2_i32:
3133     case INDEX_op_div2_i64:
3134     case INDEX_op_divu2_i32:
3135     case INDEX_op_divu2_i64:
3136         return C_O2_I3(a, d, 0, 1, r);
3137
3138     case INDEX_op_mulu2_i32:
3139     case INDEX_op_mulu2_i64:
3140     case INDEX_op_muls2_i32:
3141     case INDEX_op_muls2_i64:
3142         return C_O2_I2(a, d, a, r);
3143
3144     case INDEX_op_add2_i32:
3145     case INDEX_op_add2_i64:
3146     case INDEX_op_sub2_i32:
3147     case INDEX_op_sub2_i64:
3148         return C_O2_I4(r, r, 0, 1, re, re);
3149
3150     case INDEX_op_ctz_i32:
3151     case INDEX_op_ctz_i64:
3152         return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3153
3154     case INDEX_op_clz_i32:
3155     case INDEX_op_clz_i64:
3156         return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3157
3158     case INDEX_op_qemu_ld_i32:
3159         return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3160                 ? C_O1_I1(r, L) : C_O1_I2(r, L, L));
3161
3162     case INDEX_op_qemu_st_i32:
3163         return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3164                 ? C_O0_I2(L, L) : C_O0_I3(L, L, L));
3165     case INDEX_op_qemu_st8_i32:
3166         return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3167                 ? C_O0_I2(s, L) : C_O0_I3(s, L, L));
3168
3169     case INDEX_op_qemu_ld_i64:
3170         return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
3171                 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L)
3172                 : C_O2_I2(r, r, L, L));
3173
3174     case INDEX_op_qemu_st_i64:
3175         return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L)
3176                 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(L, L, L)
3177                 : C_O0_I4(L, L, L, L));
3178
3179     case INDEX_op_brcond2_i32:
3180         return C_O0_I4(r, r, ri, ri);
3181
3182     case INDEX_op_setcond2_i32:
3183         return C_O1_I4(r, r, r, ri, ri);
3184
3185     case INDEX_op_ld_vec:
3186     case INDEX_op_dupm_vec:
3187         return C_O1_I1(x, r);
3188
3189     case INDEX_op_st_vec:
3190         return C_O0_I2(x, r);
3191
3192     case INDEX_op_add_vec:
3193     case INDEX_op_sub_vec:
3194     case INDEX_op_mul_vec:
3195     case INDEX_op_and_vec:
3196     case INDEX_op_or_vec:
3197     case INDEX_op_xor_vec:
3198     case INDEX_op_andc_vec:
3199     case INDEX_op_ssadd_vec:
3200     case INDEX_op_usadd_vec:
3201     case INDEX_op_sssub_vec:
3202     case INDEX_op_ussub_vec:
3203     case INDEX_op_smin_vec:
3204     case INDEX_op_umin_vec:
3205     case INDEX_op_smax_vec:
3206     case INDEX_op_umax_vec:
3207     case INDEX_op_shlv_vec:
3208     case INDEX_op_shrv_vec:
3209     case INDEX_op_sarv_vec:
3210     case INDEX_op_shls_vec:
3211     case INDEX_op_shrs_vec:
3212     case INDEX_op_sars_vec:
3213     case INDEX_op_rotls_vec:
3214     case INDEX_op_cmp_vec:
3215     case INDEX_op_x86_shufps_vec:
3216     case INDEX_op_x86_blend_vec:
3217     case INDEX_op_x86_packss_vec:
3218     case INDEX_op_x86_packus_vec:
3219     case INDEX_op_x86_vperm2i128_vec:
3220     case INDEX_op_x86_punpckl_vec:
3221     case INDEX_op_x86_punpckh_vec:
3222 #if TCG_TARGET_REG_BITS == 32
3223     case INDEX_op_dup2_vec:
3224 #endif
3225         return C_O1_I2(x, x, x);
3226
3227     case INDEX_op_abs_vec:
3228     case INDEX_op_dup_vec:
3229     case INDEX_op_shli_vec:
3230     case INDEX_op_shri_vec:
3231     case INDEX_op_sari_vec:
3232     case INDEX_op_x86_psrldq_vec:
3233         return C_O1_I1(x, x);
3234
3235     case INDEX_op_x86_vpblendvb_vec:
3236         return C_O1_I3(x, x, x, x);
3237
3238     default:
3239         g_assert_not_reached();
3240     }
3241 }
3242
3243 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3244 {
3245     switch (opc) {
3246     case INDEX_op_add_vec:
3247     case INDEX_op_sub_vec:
3248     case INDEX_op_and_vec:
3249     case INDEX_op_or_vec:
3250     case INDEX_op_xor_vec:
3251     case INDEX_op_andc_vec:
3252         return 1;
3253     case INDEX_op_rotli_vec:
3254     case INDEX_op_cmp_vec:
3255     case INDEX_op_cmpsel_vec:
3256         return -1;
3257
3258     case INDEX_op_shli_vec:
3259     case INDEX_op_shri_vec:
3260         /* We must expand the operation for MO_8.  */
3261         return vece == MO_8 ? -1 : 1;
3262
3263     case INDEX_op_sari_vec:
3264         /* We must expand the operation for MO_8.  */
3265         if (vece == MO_8) {
3266             return -1;
3267         }
3268         /* We can emulate this for MO_64, but it does not pay off
3269            unless we're producing at least 4 values.  */
3270         if (vece == MO_64) {
3271             return type >= TCG_TYPE_V256 ? -1 : 0;
3272         }
3273         return 1;
3274
3275     case INDEX_op_shls_vec:
3276     case INDEX_op_shrs_vec:
3277         return vece >= MO_16;
3278     case INDEX_op_sars_vec:
3279         return vece >= MO_16 && vece <= MO_32;
3280     case INDEX_op_rotls_vec:
3281         return vece >= MO_16 ? -1 : 0;
3282
3283     case INDEX_op_shlv_vec:
3284     case INDEX_op_shrv_vec:
3285         return have_avx2 && vece >= MO_32;
3286     case INDEX_op_sarv_vec:
3287         return have_avx2 && vece == MO_32;
3288     case INDEX_op_rotlv_vec:
3289     case INDEX_op_rotrv_vec:
3290         return have_avx2 && vece >= MO_32 ? -1 : 0;
3291
3292     case INDEX_op_mul_vec:
3293         if (vece == MO_8) {
3294             /* We can expand the operation for MO_8.  */
3295             return -1;
3296         }
3297         if (vece == MO_64) {
3298             return 0;
3299         }
3300         return 1;
3301
3302     case INDEX_op_ssadd_vec:
3303     case INDEX_op_usadd_vec:
3304     case INDEX_op_sssub_vec:
3305     case INDEX_op_ussub_vec:
3306         return vece <= MO_16;
3307     case INDEX_op_smin_vec:
3308     case INDEX_op_smax_vec:
3309     case INDEX_op_umin_vec:
3310     case INDEX_op_umax_vec:
3311     case INDEX_op_abs_vec:
3312         return vece <= MO_32;
3313
3314     default:
3315         return 0;
3316     }
3317 }
3318
3319 static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3320                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3321 {
3322     TCGv_vec t1, t2;
3323
3324     tcg_debug_assert(vece == MO_8);
3325
3326     t1 = tcg_temp_new_vec(type);
3327     t2 = tcg_temp_new_vec(type);
3328
3329     /*
3330      * Unpack to W, shift, and repack.  Tricky bits:
3331      * (1) Use punpck*bw x,x to produce DDCCBBAA,
3332      *     i.e. duplicate in other half of the 16-bit lane.
3333      * (2) For right-shift, add 8 so that the high half of the lane
3334      *     becomes zero.  For left-shift, and left-rotate, we must
3335      *     shift up and down again.
3336      * (3) Step 2 leaves high half zero such that PACKUSWB
3337      *     (pack with unsigned saturation) does not modify
3338      *     the quantity.
3339      */
3340     vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3341               tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3342     vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3343               tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3344
3345     if (opc != INDEX_op_rotli_vec) {
3346         imm += 8;
3347     }
3348     if (opc == INDEX_op_shri_vec) {
3349         tcg_gen_shri_vec(MO_16, t1, t1, imm);
3350         tcg_gen_shri_vec(MO_16, t2, t2, imm);
3351     } else {
3352         tcg_gen_shli_vec(MO_16, t1, t1, imm);
3353         tcg_gen_shli_vec(MO_16, t2, t2, imm);
3354         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3355         tcg_gen_shri_vec(MO_16, t2, t2, 8);
3356     }
3357
3358     vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3359               tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3360     tcg_temp_free_vec(t1);
3361     tcg_temp_free_vec(t2);
3362 }
3363
3364 static void expand_vec_sari(TCGType type, unsigned vece,
3365                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3366 {
3367     TCGv_vec t1, t2;
3368
3369     switch (vece) {
3370     case MO_8:
3371         /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3372         t1 = tcg_temp_new_vec(type);
3373         t2 = tcg_temp_new_vec(type);
3374         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3375                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3376         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3377                   tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3378         tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3379         tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3380         vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3381                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3382         tcg_temp_free_vec(t1);
3383         tcg_temp_free_vec(t2);
3384         break;
3385
3386     case MO_64:
3387         if (imm <= 32) {
3388             /*
3389              * We can emulate a small sign extend by performing an arithmetic
3390              * 32-bit shift and overwriting the high half of a 64-bit logical
3391              * shift.  Note that the ISA says shift of 32 is valid, but TCG
3392              * does not, so we have to bound the smaller shift -- we get the
3393              * same result in the high half either way.
3394              */
3395             t1 = tcg_temp_new_vec(type);
3396             tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3397             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3398             vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3399                       tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3400                       tcgv_vec_arg(t1), 0xaa);
3401             tcg_temp_free_vec(t1);
3402         } else {
3403             /* Otherwise we will need to use a compare vs 0 to produce
3404              * the sign-extend, shift and merge.
3405              */
3406             t1 = tcg_const_zeros_vec(type);
3407             tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
3408             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3409             tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3410             tcg_gen_or_vec(MO_64, v0, v0, t1);
3411             tcg_temp_free_vec(t1);
3412         }
3413         break;
3414
3415     default:
3416         g_assert_not_reached();
3417     }
3418 }
3419
3420 static void expand_vec_rotli(TCGType type, unsigned vece,
3421                              TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3422 {
3423     TCGv_vec t;
3424
3425     if (vece == MO_8) {
3426         expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3427         return;
3428     }
3429
3430     t = tcg_temp_new_vec(type);
3431     tcg_gen_shli_vec(vece, t, v1, imm);
3432     tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3433     tcg_gen_or_vec(vece, v0, v0, t);
3434     tcg_temp_free_vec(t);
3435 }
3436
3437 static void expand_vec_rotls(TCGType type, unsigned vece,
3438                              TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3439 {
3440     TCGv_i32 rsh;
3441     TCGv_vec t;
3442
3443     tcg_debug_assert(vece != MO_8);
3444
3445     t = tcg_temp_new_vec(type);
3446     rsh = tcg_temp_new_i32();
3447
3448     tcg_gen_neg_i32(rsh, lsh);
3449     tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3450     tcg_gen_shls_vec(vece, t, v1, lsh);
3451     tcg_gen_shrs_vec(vece, v0, v1, rsh);
3452     tcg_gen_or_vec(vece, v0, v0, t);
3453     tcg_temp_free_vec(t);
3454     tcg_temp_free_i32(rsh);
3455 }
3456
3457 static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3458                             TCGv_vec v1, TCGv_vec sh, bool right)
3459 {
3460     TCGv_vec t = tcg_temp_new_vec(type);
3461
3462     tcg_gen_dupi_vec(vece, t, 8 << vece);
3463     tcg_gen_sub_vec(vece, t, t, sh);
3464     if (right) {
3465         tcg_gen_shlv_vec(vece, t, v1, t);
3466         tcg_gen_shrv_vec(vece, v0, v1, sh);
3467     } else {
3468         tcg_gen_shrv_vec(vece, t, v1, t);
3469         tcg_gen_shlv_vec(vece, v0, v1, sh);
3470     }
3471     tcg_gen_or_vec(vece, v0, v0, t);
3472     tcg_temp_free_vec(t);
3473 }
3474
3475 static void expand_vec_mul(TCGType type, unsigned vece,
3476                            TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3477 {
3478     TCGv_vec t1, t2, t3, t4, zero;
3479
3480     tcg_debug_assert(vece == MO_8);
3481
3482     /*
3483      * Unpack v1 bytes to words, 0 | x.
3484      * Unpack v2 bytes to words, y | 0.
3485      * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3486      * Shift logical right by 8 bits to clear the high 8 bytes before
3487      * using an unsigned saturated pack.
3488      *
3489      * The difference between the V64, V128 and V256 cases is merely how
3490      * we distribute the expansion between temporaries.
3491      */
3492     switch (type) {
3493     case TCG_TYPE_V64:
3494         t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3495         t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3496         zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3497         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3498                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3499         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3500                   tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3501         tcg_gen_mul_vec(MO_16, t1, t1, t2);
3502         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3503         vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3504                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3505         tcg_temp_free_vec(t1);
3506         tcg_temp_free_vec(t2);
3507         break;
3508
3509     case TCG_TYPE_V128:
3510     case TCG_TYPE_V256:
3511         t1 = tcg_temp_new_vec(type);
3512         t2 = tcg_temp_new_vec(type);
3513         t3 = tcg_temp_new_vec(type);
3514         t4 = tcg_temp_new_vec(type);
3515         zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3516         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3517                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3518         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3519                   tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3520         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3521                   tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3522         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3523                   tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3524         tcg_gen_mul_vec(MO_16, t1, t1, t2);
3525         tcg_gen_mul_vec(MO_16, t3, t3, t4);
3526         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3527         tcg_gen_shri_vec(MO_16, t3, t3, 8);
3528         vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3529                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3530         tcg_temp_free_vec(t1);
3531         tcg_temp_free_vec(t2);
3532         tcg_temp_free_vec(t3);
3533         tcg_temp_free_vec(t4);
3534         break;
3535
3536     default:
3537         g_assert_not_reached();
3538     }
3539 }
3540
3541 static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3542                                  TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3543 {
3544     enum {
3545         NEED_INV  = 1,
3546         NEED_SWAP = 2,
3547         NEED_BIAS = 4,
3548         NEED_UMIN = 8,
3549         NEED_UMAX = 16,
3550     };
3551     TCGv_vec t1, t2, t3;
3552     uint8_t fixup;
3553
3554     switch (cond) {
3555     case TCG_COND_EQ:
3556     case TCG_COND_GT:
3557         fixup = 0;
3558         break;
3559     case TCG_COND_NE:
3560     case TCG_COND_LE:
3561         fixup = NEED_INV;
3562         break;
3563     case TCG_COND_LT:
3564         fixup = NEED_SWAP;
3565         break;
3566     case TCG_COND_GE:
3567         fixup = NEED_SWAP | NEED_INV;
3568         break;
3569     case TCG_COND_LEU:
3570         if (vece <= MO_32) {
3571             fixup = NEED_UMIN;
3572         } else {
3573             fixup = NEED_BIAS | NEED_INV;
3574         }
3575         break;
3576     case TCG_COND_GTU:
3577         if (vece <= MO_32) {
3578             fixup = NEED_UMIN | NEED_INV;
3579         } else {
3580             fixup = NEED_BIAS;
3581         }
3582         break;
3583     case TCG_COND_GEU:
3584         if (vece <= MO_32) {
3585             fixup = NEED_UMAX;
3586         } else {
3587             fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3588         }
3589         break;
3590     case TCG_COND_LTU:
3591         if (vece <= MO_32) {
3592             fixup = NEED_UMAX | NEED_INV;
3593         } else {
3594             fixup = NEED_BIAS | NEED_SWAP;
3595         }
3596         break;
3597     default:
3598         g_assert_not_reached();
3599     }
3600
3601     if (fixup & NEED_INV) {
3602         cond = tcg_invert_cond(cond);
3603     }
3604     if (fixup & NEED_SWAP) {
3605         t1 = v1, v1 = v2, v2 = t1;
3606         cond = tcg_swap_cond(cond);
3607     }
3608
3609     t1 = t2 = NULL;
3610     if (fixup & (NEED_UMIN | NEED_UMAX)) {
3611         t1 = tcg_temp_new_vec(type);
3612         if (fixup & NEED_UMIN) {
3613             tcg_gen_umin_vec(vece, t1, v1, v2);
3614         } else {
3615             tcg_gen_umax_vec(vece, t1, v1, v2);
3616         }
3617         v2 = t1;
3618         cond = TCG_COND_EQ;
3619     } else if (fixup & NEED_BIAS) {
3620         t1 = tcg_temp_new_vec(type);
3621         t2 = tcg_temp_new_vec(type);
3622         t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
3623         tcg_gen_sub_vec(vece, t1, v1, t3);
3624         tcg_gen_sub_vec(vece, t2, v2, t3);
3625         v1 = t1;
3626         v2 = t2;
3627         cond = tcg_signed_cond(cond);
3628     }
3629
3630     tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3631     /* Expand directly; do not recurse.  */
3632     vec_gen_4(INDEX_op_cmp_vec, type, vece,
3633               tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3634
3635     if (t1) {
3636         tcg_temp_free_vec(t1);
3637         if (t2) {
3638             tcg_temp_free_vec(t2);
3639         }
3640     }
3641     return fixup & NEED_INV;
3642 }
3643
3644 static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3645                            TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3646 {
3647     if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3648         tcg_gen_not_vec(vece, v0, v0);
3649     }
3650 }
3651
3652 static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3653                               TCGv_vec c1, TCGv_vec c2,
3654                               TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3655 {
3656     TCGv_vec t = tcg_temp_new_vec(type);
3657
3658     if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3659         /* Invert the sense of the compare by swapping arguments.  */
3660         TCGv_vec x;
3661         x = v3, v3 = v4, v4 = x;
3662     }
3663     vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3664               tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3665               tcgv_vec_arg(v3), tcgv_vec_arg(t));
3666     tcg_temp_free_vec(t);
3667 }
3668
3669 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3670                        TCGArg a0, ...)
3671 {
3672     va_list va;
3673     TCGArg a2;
3674     TCGv_vec v0, v1, v2, v3, v4;
3675
3676     va_start(va, a0);
3677     v0 = temp_tcgv_vec(arg_temp(a0));
3678     v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3679     a2 = va_arg(va, TCGArg);
3680
3681     switch (opc) {
3682     case INDEX_op_shli_vec:
3683     case INDEX_op_shri_vec:
3684         expand_vec_shi(type, vece, opc, v0, v1, a2);
3685         break;
3686
3687     case INDEX_op_sari_vec:
3688         expand_vec_sari(type, vece, v0, v1, a2);
3689         break;
3690
3691     case INDEX_op_rotli_vec:
3692         expand_vec_rotli(type, vece, v0, v1, a2);
3693         break;
3694
3695     case INDEX_op_rotls_vec:
3696         expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3697         break;
3698
3699     case INDEX_op_rotlv_vec:
3700         v2 = temp_tcgv_vec(arg_temp(a2));
3701         expand_vec_rotv(type, vece, v0, v1, v2, false);
3702         break;
3703     case INDEX_op_rotrv_vec:
3704         v2 = temp_tcgv_vec(arg_temp(a2));
3705         expand_vec_rotv(type, vece, v0, v1, v2, true);
3706         break;
3707
3708     case INDEX_op_mul_vec:
3709         v2 = temp_tcgv_vec(arg_temp(a2));
3710         expand_vec_mul(type, vece, v0, v1, v2);
3711         break;
3712
3713     case INDEX_op_cmp_vec:
3714         v2 = temp_tcgv_vec(arg_temp(a2));
3715         expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3716         break;
3717
3718     case INDEX_op_cmpsel_vec:
3719         v2 = temp_tcgv_vec(arg_temp(a2));
3720         v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3721         v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3722         expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3723         break;
3724
3725     default:
3726         break;
3727     }
3728
3729     va_end(va);
3730 }
3731
3732 static const int tcg_target_callee_save_regs[] = {
3733 #if TCG_TARGET_REG_BITS == 64
3734     TCG_REG_RBP,
3735     TCG_REG_RBX,
3736 #if defined(_WIN64)
3737     TCG_REG_RDI,
3738     TCG_REG_RSI,
3739 #endif
3740     TCG_REG_R12,
3741     TCG_REG_R13,
3742     TCG_REG_R14, /* Currently used for the global env. */
3743     TCG_REG_R15,
3744 #else
3745     TCG_REG_EBP, /* Currently used for the global env. */
3746     TCG_REG_EBX,
3747     TCG_REG_ESI,
3748     TCG_REG_EDI,
3749 #endif
3750 };
3751
3752 /* Compute frame size via macros, to share between tcg_target_qemu_prologue
3753    and tcg_register_jit.  */
3754
3755 #define PUSH_SIZE \
3756     ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
3757      * (TCG_TARGET_REG_BITS / 8))
3758
3759 #define FRAME_SIZE \
3760     ((PUSH_SIZE \
3761       + TCG_STATIC_CALL_ARGS_SIZE \
3762       + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3763       + TCG_TARGET_STACK_ALIGN - 1) \
3764      & ~(TCG_TARGET_STACK_ALIGN - 1))
3765
3766 /* Generate global QEMU prologue and epilogue code */
3767 static void tcg_target_qemu_prologue(TCGContext *s)
3768 {
3769     int i, stack_addend;
3770
3771     /* TB prologue */
3772
3773     /* Reserve some stack space, also for TCG temps.  */
3774     stack_addend = FRAME_SIZE - PUSH_SIZE;
3775     tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3776                   CPU_TEMP_BUF_NLONGS * sizeof(long));
3777
3778     /* Save all callee saved registers.  */
3779     for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3780         tcg_out_push(s, tcg_target_callee_save_regs[i]);
3781     }
3782
3783 #if TCG_TARGET_REG_BITS == 32
3784     tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3785                (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3786     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3787     /* jmp *tb.  */
3788     tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3789                          (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3790                          + stack_addend);
3791 #else
3792 # if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
3793     if (guest_base) {
3794         int seg = setup_guest_base_seg();
3795         if (seg != 0) {
3796             x86_guest_base_seg = seg;
3797         } else if (guest_base == (int32_t)guest_base) {
3798             x86_guest_base_offset = guest_base;
3799         } else {
3800             /* Choose R12 because, as a base, it requires a SIB byte. */
3801             x86_guest_base_index = TCG_REG_R12;
3802             tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
3803             tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
3804         }
3805     }
3806 # endif
3807     tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3808     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3809     /* jmp *tb.  */
3810     tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3811 #endif
3812
3813     /*
3814      * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3815      * and fall through to the rest of the epilogue.
3816      */
3817     tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
3818     tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3819
3820     /* TB epilogue */
3821     tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
3822
3823     tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3824
3825     if (have_avx2) {
3826         tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3827     }
3828     for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3829         tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3830     }
3831     tcg_out_opc(s, OPC_RET, 0, 0, 0);
3832 }
3833
3834 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3835 {
3836     memset(p, 0x90, count);
3837 }
3838
3839 static void tcg_target_init(TCGContext *s)
3840 {
3841 #ifdef CONFIG_CPUID_H
3842     unsigned a, b, c, d, b7 = 0;
3843     unsigned max = __get_cpuid_max(0, 0);
3844
3845     if (max >= 7) {
3846         /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
3847         __cpuid_count(7, 0, a, b7, c, d);
3848         have_bmi1 = (b7 & bit_BMI) != 0;
3849         have_bmi2 = (b7 & bit_BMI2) != 0;
3850     }
3851
3852     if (max >= 1) {
3853         __cpuid(1, a, b, c, d);
3854 #ifndef have_cmov
3855         /* For 32-bit, 99% certainty that we're running on hardware that
3856            supports cmov, but we still need to check.  In case cmov is not
3857            available, we'll use a small forward branch.  */
3858         have_cmov = (d & bit_CMOV) != 0;
3859 #endif
3860
3861         /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
3862            need to probe for it.  */
3863         have_movbe = (c & bit_MOVBE) != 0;
3864         have_popcnt = (c & bit_POPCNT) != 0;
3865
3866         /* There are a number of things we must check before we can be
3867            sure of not hitting invalid opcode.  */
3868         if (c & bit_OSXSAVE) {
3869             unsigned xcrl, xcrh;
3870             /* The xgetbv instruction is not available to older versions of
3871              * the assembler, so we encode the instruction manually.
3872              */
3873             asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
3874             if ((xcrl & 6) == 6) {
3875                 have_avx1 = (c & bit_AVX) != 0;
3876                 have_avx2 = (b7 & bit_AVX2) != 0;
3877             }
3878         }
3879     }
3880
3881     max = __get_cpuid_max(0x8000000, 0);
3882     if (max >= 1) {
3883         __cpuid(0x80000001, a, b, c, d);
3884         /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
3885         have_lzcnt = (c & bit_LZCNT) != 0;
3886     }
3887 #endif /* CONFIG_CPUID_H */
3888
3889     tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3890     if (TCG_TARGET_REG_BITS == 64) {
3891         tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3892     }
3893     if (have_avx1) {
3894         tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3895         tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3896     }
3897     if (have_avx2) {
3898         tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3899     }
3900
3901     tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
3902     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3903     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3904     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3905     if (TCG_TARGET_REG_BITS == 64) {
3906 #if !defined(_WIN64)
3907         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3908         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3909 #endif
3910         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3911         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3912         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3913         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3914     }
3915
3916     s->reserved_regs = 0;
3917     tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
3918 }
3919
3920 typedef struct {
3921     DebugFrameHeader h;
3922     uint8_t fde_def_cfa[4];
3923     uint8_t fde_reg_ofs[14];
3924 } DebugFrame;
3925
3926 /* We're expecting a 2 byte uleb128 encoded value.  */
3927 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
3928
3929 #if !defined(__ELF__)
3930     /* Host machine without ELF. */
3931 #elif TCG_TARGET_REG_BITS == 64
3932 #define ELF_HOST_MACHINE EM_X86_64
3933 static const DebugFrame debug_frame = {
3934     .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3935     .h.cie.id = -1,
3936     .h.cie.version = 1,
3937     .h.cie.code_align = 1,
3938     .h.cie.data_align = 0x78,             /* sleb128 -8 */
3939     .h.cie.return_column = 16,
3940
3941     /* Total FDE size does not include the "len" member.  */
3942     .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3943
3944     .fde_def_cfa = {
3945         12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
3946         (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3947         (FRAME_SIZE >> 7)
3948     },
3949     .fde_reg_ofs = {
3950         0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
3951         /* The following ordering must match tcg_target_callee_save_regs.  */
3952         0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
3953         0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
3954         0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
3955         0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
3956         0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
3957         0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
3958     }
3959 };
3960 #else
3961 #define ELF_HOST_MACHINE EM_386
3962 static const DebugFrame debug_frame = {
3963     .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3964     .h.cie.id = -1,
3965     .h.cie.version = 1,
3966     .h.cie.code_align = 1,
3967     .h.cie.data_align = 0x7c,             /* sleb128 -4 */
3968     .h.cie.return_column = 8,
3969
3970     /* Total FDE size does not include the "len" member.  */
3971     .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3972
3973     .fde_def_cfa = {
3974         12, 4,                          /* DW_CFA_def_cfa %esp, ... */
3975         (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3976         (FRAME_SIZE >> 7)
3977     },
3978     .fde_reg_ofs = {
3979         0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
3980         /* The following ordering must match tcg_target_callee_save_regs.  */
3981         0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
3982         0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
3983         0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
3984         0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
3985     }
3986 };
3987 #endif
3988
3989 #if defined(ELF_HOST_MACHINE)
3990 void tcg_register_jit(const void *buf, size_t buf_size)
3991 {
3992     tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
3993 }
3994 #endif