tcg/i386/tcg-target.c.inc

   1 /*
   2  * Tiny Code Generator for QEMU
   3  *
   4  * Copyright (c) 2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "../tcg-pool.c.inc"
  26
#ifdef CONFIG_DEBUG_TCG
/*
 * Printable names of the host registers; the array is sized by
 * TCG_TARGET_NB_REGS and is only built when CONFIG_DEBUG_TCG is defined.
 * The first eight general registers use the 64-bit or 32-bit spelling
 * depending on TCG_TARGET_REG_BITS; the %r8-%r15 names are listed
 * unconditionally so that the %xmm names always start at index 16.
 */
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    /* The upper eight vector registers are only named for 64-bit hosts.  */
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif
  42
/*
 * Host registers available for allocation: general registers first,
 * followed by the vector registers.
 * NOTE(review): presumably the register allocator tries the entries in
 * the order listed here -- confirm against the common TCG code.
 */
static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI treats xmm6-xmm15 as callee-saved, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated on
       Win64; every other host may also use the registers below.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};
  92
/*
 * Registers that carry integer call arguments, in argument order.
 * 64-bit Windows hosts list RCX, RDX, R8, R9; other 64-bit hosts list
 * RDI, RSI, RDX, RCX, R8, R9.  The table is empty for 32-bit hosts.
 */
static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};
 110
/*
 * Registers that carry call return values: EAX always, plus EDX on
 * 32-bit hosts (NOTE(review): presumably for the high half of a 64-bit
 * result -- confirm against the callers of this table).
 */
static const int tcg_target_call_oarg_regs[] = {
    TCG_REG_EAX,
#if TCG_TARGET_REG_BITS == 32
    TCG_REG_EDX
#endif
};
 117
/* Constants we accept.  These flag bits are tested against the
   constraint word in tcg_target_const_match().  */
#define TCG_CT_CONST_S32 0x100  /* value survives a round trip via int32_t */
#define TCG_CT_CONST_U32 0x200  /* value survives a round trip via uint32_t */
#define TCG_CT_CONST_I32 0x400  /* inverted value survives via int32_t */
#define TCG_CT_CONST_WSZ 0x800  /* value equals the operation width, 32/64 */

/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   i386. */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

/* Register-set bit masks; bit N stands for register number N (compare
   the 1 << reg construction of SOFTMMU_RESERVE_REGS below).  General
   registers occupy the low bits and vector registers start at bit 16.
   ALL_BYTEH_REGS covers only the first four registers.  */
#define ALL_BYTEH_REGS         0x0000000fu
#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS      0x0000ffffu
# define ALL_VECTOR_REGS       0xffff0000u
# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS      0x000000ffu
# define ALL_VECTOR_REGS       0x00ff0000u
# define ALL_BYTEL_REGS        ALL_BYTEH_REGS
#endif
/* Mask of TCG_REG_L0 and TCG_REG_L1 when CONFIG_SOFTMMU is defined,
   otherwise the empty set.  */
#ifdef CONFIG_SOFTMMU
# define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
#else
# define SOFTMMU_RESERVE_REGS  0
#endif
 150
 151 /* The host compiler should supply <cpuid.h> to enable runtime features
 152    detection, as we're not going to go so far as our own inline assembly.
 153    If not available, default values will be assumed.  */
 154 #if defined(CONFIG_CPUID_H)
 155 #include "qemu/cpuid.h"
 156 #endif
 157
/* For 64-bit, we always know that CMOV is available.  On 32-bit hosts
   have_cmov is a variable when <cpuid.h> support is configured, and the
   constant 0 otherwise.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov 1
#elif defined(CONFIG_CPUID_H)
static bool have_cmov;
#else
# define have_cmov 0
#endif

/* We need these symbols in tcg-target.h, and we can't properly conditionalize
   it there.  Therefore we always define the variable.
   NOTE(review): the code that assigns these flags is outside this view;
   presumably they are filled in by runtime CPUID detection.  */
bool have_bmi1;
bool have_popcnt;
bool have_avx1;
bool have_avx2;
bool have_movbe;

/* Flags private to this file: variables when <cpuid.h> support is
   configured, otherwise compile-time zeros.  */
#ifdef CONFIG_CPUID_H
static bool have_bmi2;
static bool have_lzcnt;
#else
# define have_bmi2 0
# define have_lzcnt 0
#endif

/* NOTE(review): not assigned anywhere in this view; by its name this is
   the address generated code jumps to when leaving a translation block
   -- confirm where it is set.  */
static const tcg_insn_unit *tb_ret_addr;
 184
 185 static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
 186                         intptr_t value, intptr_t addend)
 187 {
 188     value += addend;
 189     switch(type) {
 190     case R_386_PC32:
 191         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 192         if (value != (int32_t)value) {
 193             return false;
 194         }
 195         /* FALLTHRU */
 196     case R_386_32:
 197         tcg_patch32(code_ptr, value);
 198         break;
 199     case R_386_PC8:
 200         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 201         if (value != (int8_t)value) {
 202             return false;
 203         }
 204         tcg_patch8(code_ptr, value);
 205         break;
 206     default:
 207         tcg_abort();
 208     }
 209     return true;
 210 }
 211
 212 /* test if a constant matches the constraint */
 213 static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 214                                          const TCGArgConstraint *arg_ct)
 215 {
 216     int ct = arg_ct->ct;
 217     if (ct & TCG_CT_CONST) {
 218         return 1;
 219     }
 220     if (type == TCG_TYPE_I32) {
 221         if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
 222             return 1;
 223         }
 224     } else {
 225         if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
 226             return 1;
 227         }
 228         if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
 229             return 1;
 230         }
 231         if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
 232             return 1;
 233         }
 234     }
 235     if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
 236         return 1;
 237     }
 238     return 0;
 239 }
 240
/* Low three bits of a register number, i.e. the part that fits in a
   ModRM or SIB register field.  */
# define LOWREGMASK(x)  ((x) & 7)

/* Flag bits ORed into an opcode value above its low eight bits.  They
   tell tcg_out_opc() and tcg_out_vex_opc() which prefix and escape
   bytes to emit; only the low byte is emitted as the opcode itself.
   On 32-bit hosts the REX and gs flags are defined as 0.  */
#define P_EXT           0x100           /* 0x0f opcode prefix */
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400           /* 0x66 opcode prefix */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         0x1000          /* Set REX.W = 1 */
# define P_REXB_R       0x2000          /* REG field as byte register */
# define P_REXB_RM      0x4000          /* R/M field as byte register */
# define P_GS           0x8000          /* gs segment override */
#else
# define P_REXW         0
# define P_REXB_R       0
# define P_REXB_RM      0
# define P_GS           0
#endif
#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
#define P_VEXL          0x80000         /* Set VEX.L = 1 */
 260 #define P_VEXL          0x80000         /* Set VEX.L = 1 */
 261
/* Opcode definitions.  Each value is the final opcode byte in its low
   eight bits, ORed with the P_* flags that select the prefix and escape
   bytes emitted ahead of it.  */
#define OPC_ARITH_EvIz  (0x81)
#define OPC_ARITH_EvIb  (0x83)
#define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP       (0xc8 | P_EXT)
#define OPC_CALL_Jz     (0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32     (0x48)
#define OPC_IMUL_GvEv   (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb (0x6b)
#define OPC_IMUL_GvEvIz (0x69)
#define OPC_INC_r32     (0x40)
#define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
#define OPC_JCC_short   (0x70)          /* ... plus condition code */
#define OPC_JMP_long    (0xe9)
#define OPC_JMP_short   (0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
#define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
#define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz   (0xc7)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL      (0xbe | P_EXT)
#define OPC_MOVSWL      (0xbf | P_EXT)
#define OPC_MOVSLQ      (0x63 | P_REXW)
#define OPC_MOVZBL      (0xb6 | P_EXT)
#define OPC_MOVZWL      (0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32     (0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32    (0x50)
#define OPC_PUSH_Iv     (0x68)
#define OPC_PUSH_Ib     (0x6a)
#define OPC_RET         (0xc3)
#define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1     (0xd1)
#define OPC_SHIFT_Ib    (0xc1)
#define OPC_SHIFT_cl    (0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTL       (0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_REXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_REXW)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_REXW)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)

/* Group opcodes; the operation is selected by an extension value
   (see the EXT3_* and EXT5_* definitions).  */
#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
 428
/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH.  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with
   OPC_GRP3_Ev.  */
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev     0
#define EXT5_DEC_Ev     1
#define EXT5_CALLN_Ev   2
#define EXT5_JMPN_Ev    4

/* Condition codes to be added to OPC_JCC_{long,short}.
   JCC_JMP is negative and therefore not a real condition code;
   NOTE(review): presumably it marks an unconditional jump -- confirm
   at its users, which are outside this view.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf
 479
/*
 * Map a TCG comparison condition to the x86 condition code (JCC_*) that
 * tests it.  Signed comparisons use the L/GE/LE/G codes and unsigned
 * comparisons use the B/AE/BE/A codes.  Conditions not listed here are
 * zero-initialized, which coincides with JCC_JO.
 */
static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};
 492
 493 #if TCG_TARGET_REG_BITS == 64
 494 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 495 {
 496     int rex;
 497
 498     if (opc & P_GS) {
 499         tcg_out8(s, 0x65);
 500     }
 501     if (opc & P_DATA16) {
 502         /* We should never be asking for both 16 and 64-bit operation.  */
 503         tcg_debug_assert((opc & P_REXW) == 0);
 504         tcg_out8(s, 0x66);
 505     }
 506     if (opc & P_SIMDF3) {
 507         tcg_out8(s, 0xf3);
 508     } else if (opc & P_SIMDF2) {
 509         tcg_out8(s, 0xf2);
 510     }
 511
 512     rex = 0;
 513     rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
 514     rex |= (r & 8) >> 1;                /* REX.R */
 515     rex |= (x & 8) >> 2;                /* REX.X */
 516     rex |= (rm & 8) >> 3;               /* REX.B */
 517
 518     /* P_REXB_{R,RM} indicates that the given register is the low byte.
 519        For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
 520        as otherwise the encoding indicates %[abcd]h.  Note that the values
 521        that are ORed in merely indicate that the REX byte must be present;
 522        those bits get discarded in output.  */
 523     rex |= opc & (r >= 4 ? P_REXB_R : 0);
 524     rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
 525
 526     if (rex) {
 527         tcg_out8(s, (uint8_t)(rex | 0x40));
 528     }
 529
 530     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 531         tcg_out8(s, 0x0f);
 532         if (opc & P_EXT38) {
 533             tcg_out8(s, 0x38);
 534         } else if (opc & P_EXT3A) {
 535             tcg_out8(s, 0x3a);
 536         }
 537     }
 538
 539     tcg_out8(s, opc);
 540 }
 541 #else
 542 static void tcg_out_opc(TCGContext *s, int opc)
 543 {
 544     if (opc & P_DATA16) {
 545         tcg_out8(s, 0x66);
 546     }
 547     if (opc & P_SIMDF3) {
 548         tcg_out8(s, 0xf3);
 549     } else if (opc & P_SIMDF2) {
 550         tcg_out8(s, 0xf2);
 551     }
 552     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 553         tcg_out8(s, 0x0f);
 554         if (opc & P_EXT38) {
 555             tcg_out8(s, 0x38);
 556         } else if (opc & P_EXT3A) {
 557             tcg_out8(s, 0x3a);
 558         }
 559     }
 560     tcg_out8(s, opc);
 561 }
 562 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
 563    the 32-bit compilation paths.  This method works with all versions of gcc,
 564    whereas relying on optimization may not be able to exclude them.  */
 565 #define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
 566 #endif
 567
 568 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
 569 {
 570     tcg_out_opc(s, opc, r, rm, 0);
 571     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 572 }
 573
 574 static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
 575                             int rm, int index)
 576 {
 577     int tmp;
 578
 579     /* Use the two byte form if possible, which cannot encode
 580        VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
 581     if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
 582         && ((rm | index) & 8) == 0) {
 583         /* Two byte VEX prefix.  */
 584         tcg_out8(s, 0xc5);
 585
 586         tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
 587     } else {
 588         /* Three byte VEX prefix.  */
 589         tcg_out8(s, 0xc4);
 590
 591         /* VEX.m-mmmm */
 592         if (opc & P_EXT3A) {
 593             tmp = 3;
 594         } else if (opc & P_EXT38) {
 595             tmp = 2;
 596         } else if (opc & P_EXT) {
 597             tmp = 1;
 598         } else {
 599             g_assert_not_reached();
 600         }
 601         tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
 602         tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
 603         tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
 604         tcg_out8(s, tmp);
 605
 606         tmp = (opc & P_REXW ? 0x80 : 0);       /* VEX.W */
 607     }
 608
 609     tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
 610     /* VEX.pp */
 611     if (opc & P_DATA16) {
 612         tmp |= 1;                          /* 0x66 */
 613     } else if (opc & P_SIMDF3) {
 614         tmp |= 2;                          /* 0xf3 */
 615     } else if (opc & P_SIMDF2) {
 616         tmp |= 3;                          /* 0xf2 */
 617     }
 618     tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
 619     tcg_out8(s, tmp);
 620     tcg_out8(s, opc);
 621 }
 622
 623 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
 624 {
 625     tcg_out_vex_opc(s, opc, r, v, rm, 0);
 626     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 627 }
 628
 629 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
 630    We handle either RM and INDEX missing with a negative value.  In 64-bit
 631    mode for absolute addresses, ~RM is the size of the immediate operand
 632    that will follow the instruction.  */
 633
 634 static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
 635                                int shift, intptr_t offset)
 636 {
 637     int mod, len;
 638
 639     if (index < 0 && rm < 0) {
 640         if (TCG_TARGET_REG_BITS == 64) {
 641             /* Try for a rip-relative addressing mode.  This has replaced
 642                the 32-bit-mode absolute addressing encoding.  */
 643             intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
 644             intptr_t disp = offset - pc;
 645             if (disp == (int32_t)disp) {
 646                 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
 647                 tcg_out32(s, disp);
 648                 return;
 649             }
 650
 651             /* Try for an absolute address encoding.  This requires the
 652                use of the MODRM+SIB encoding and is therefore larger than
 653                rip-relative addressing.  */
 654             if (offset == (int32_t)offset) {
 655                 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
 656                 tcg_out8(s, (4 << 3) | 5);
 657                 tcg_out32(s, offset);
 658                 return;
 659             }
 660
 661             /* ??? The memory isn't directly addressable.  */
 662             g_assert_not_reached();
 663         } else {
 664             /* Absolute address.  */
 665             tcg_out8(s, (r << 3) | 5);
 666             tcg_out32(s, offset);
 667             return;
 668         }
 669     }
 670
 671     /* Find the length of the immediate addend.  Note that the encoding
 672        that would be used for (%ebp) indicates absolute addressing.  */
 673     if (rm < 0) {
 674         mod = 0, len = 4, rm = 5;
 675     } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
 676         mod = 0, len = 0;
 677     } else if (offset == (int8_t)offset) {
 678         mod = 0x40, len = 1;
 679     } else {
 680         mod = 0x80, len = 4;
 681     }
 682
 683     /* Use a single byte MODRM format if possible.  Note that the encoding
 684        that would be used for %esp is the escape to the two byte form.  */
 685     if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
 686         /* Single byte MODRM format.  */
 687         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 688     } else {
 689         /* Two byte MODRM+SIB format.  */
 690
 691         /* Note that the encoding that would place %esp into the index
 692            field indicates no index register.  In 64-bit mode, the REX.X
 693            bit counts, so %r12 can be used as the index.  */
 694         if (index < 0) {
 695             index = 4;
 696         } else {
 697             tcg_debug_assert(index != TCG_REG_ESP);
 698         }
 699
 700         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
 701         tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
 702     }
 703
 704     if (len == 1) {
 705         tcg_out8(s, offset);
 706     } else if (len == 4) {
 707         tcg_out32(s, offset);
 708     }
 709 }
 710
 711 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
 712                                      int index, int shift, intptr_t offset)
 713 {
 714     tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 715     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 716 }
 717
 718 static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
 719                                          int rm, int index, int shift,
 720                                          intptr_t offset)
 721 {
 722     tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 723     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 724 }
 725
 726 /* A simplification of the above with no index or shift.  */
 727 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
 728                                         int rm, intptr_t offset)
 729 {
 730     tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
 731 }
 732
 733 static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
 734                                             int v, int rm, intptr_t offset)
 735 {
 736     tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
 737 }
 738
 739 /* Output an opcode with an expected reference to the constant pool.  */
 740 static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
 741 {
 742     tcg_out_opc(s, opc, r, 0, 0);
 743     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 744     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 745     tcg_out32(s, 0);
 746 }
 747
 748 /* Output an opcode with an expected reference to the constant pool.  */
 749 static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
 750 {
 751     tcg_out_vex_opc(s, opc, r, 0, 0, 0);
 752     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 753     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 754     tcg_out32(s, 0);
 755 }
 756
 757 /* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
 758 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
 759 {
 760     /* Propagate an opcode prefix, such as P_REXW.  */
 761     int ext = subop & ~0x7;
 762     subop &= 0x7;
 763
 764     tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
 765 }
 766
 767 static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 768 {
 769     int rexw = 0;
 770
 771     if (arg == ret) {
 772         return true;
 773     }
 774     switch (type) {
 775     case TCG_TYPE_I64:
 776         rexw = P_REXW;
 777         /* fallthru */
 778     case TCG_TYPE_I32:
 779         if (ret < 16) {
 780             if (arg < 16) {
 781                 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
 782             } else {
 783                 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
 784             }
 785         } else {
 786             if (arg < 16) {
 787                 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
 788             } else {
 789                 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 790             }
 791         }
 792         break;
 793
 794     case TCG_TYPE_V64:
 795         tcg_debug_assert(ret >= 16 && arg >= 16);
 796         tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 797         break;
 798     case TCG_TYPE_V128:
 799         tcg_debug_assert(ret >= 16 && arg >= 16);
 800         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
 801         break;
 802     case TCG_TYPE_V256:
 803         tcg_debug_assert(ret >= 16 && arg >= 16);
 804         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
 805         break;
 806
 807     default:
 808         g_assert_not_reached();
 809     }
 810     return true;
 811 }
 812
/* AVX2 broadcast opcodes for byte, word, doubleword and quadword
   elements, in that order.  The table is indexed by the vece argument
   of tcg_out_dup_vec() and tcg_out_dupm_vec().  */
static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};
 817
 818 static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
 819                             TCGReg r, TCGReg a)
 820 {
 821     if (have_avx2) {
 822         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 823         tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
 824     } else {
 825         switch (vece) {
 826         case MO_8:
 827             /* ??? With zero in a register, use PSHUFB.  */
 828             tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
 829             a = r;
 830             /* FALLTHRU */
 831         case MO_16:
 832             tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
 833             a = r;
 834             /* FALLTHRU */
 835         case MO_32:
 836             tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
 837             /* imm8 operand: all output lanes selected from input lane 0.  */
 838             tcg_out8(s, 0);
 839             break;
 840         case MO_64:
 841             tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
 842             break;
 843         default:
 844             g_assert_not_reached();
 845         }
 846     }
 847     return true;
 848 }
 849
 850 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
 851                              TCGReg r, TCGReg base, intptr_t offset)
 852 {
 853     if (have_avx2) {
 854         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 855         tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
 856                                  r, 0, base, offset);
 857     } else {
 858         switch (vece) {
 859         case MO_64:
 860             tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
 861             break;
 862         case MO_32:
 863             tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
 864             break;
 865         case MO_16:
 866             tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
 867             tcg_out8(s, 0); /* imm8 */
 868             tcg_out_dup_vec(s, type, vece, r, r);
 869             break;
 870         case MO_8:
 871             tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
 872             tcg_out8(s, 0); /* imm8 */
 873             tcg_out_dup_vec(s, type, vece, r, r);
 874             break;
 875         default:
 876             g_assert_not_reached();
 877         }
 878     }
 879     return true;
 880 }
 881
 882 static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
 883                              TCGReg ret, int64_t arg)
 884 {
 885     int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 886
 887     if (arg == 0) {
 888         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 889         return;
 890     }
 891     if (arg == -1) {
 892         tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
 893         return;
 894     }
 895
 896     if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
 897         if (have_avx2) {
 898             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
 899         } else {
 900             tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
 901         }
 902         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 903     } else {
 904         if (type == TCG_TYPE_V64) {
 905             tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
 906         } else if (have_avx2) {
 907             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
 908         } else {
 909             tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
 910         }
 911         if (TCG_TARGET_REG_BITS == 64) {
 912             new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
 913         } else {
 914             new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
 915         }
 916     }
 917 }
 918
 919 static void tcg_out_movi_vec(TCGContext *s, TCGType type,
 920                              TCGReg ret, tcg_target_long arg)
 921 {
 922     if (arg == 0) {
 923         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 924         return;
 925     }
 926     if (arg == -1) {
 927         tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
 928         return;
 929     }
 930
 931     int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
 932     tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
 933     if (TCG_TARGET_REG_BITS == 64) {
 934         new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
 935     } else {
 936         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 937     }
 938 }
 939
 940 static void tcg_out_movi_int(TCGContext *s, TCGType type,
 941                              TCGReg ret, tcg_target_long arg)
 942 {
 943     tcg_target_long diff;
 944
 945     if (arg == 0) {
 946         tgen_arithr(s, ARITH_XOR, ret, ret);
 947         return;
 948     }
 949     if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
 950         tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
 951         tcg_out32(s, arg);
 952         return;
 953     }
 954     if (arg == (int32_t)arg) {
 955         tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
 956         tcg_out32(s, arg);
 957         return;
 958     }
 959
 960     /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
 961     diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
 962     if (diff == (int32_t)diff) {
 963         tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
 964         tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
 965         tcg_out32(s, diff);
 966         return;
 967     }
 968
 969     tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
 970     tcg_out64(s, arg);
 971 }
 972
 973 static void tcg_out_movi(TCGContext *s, TCGType type,
 974                          TCGReg ret, tcg_target_long arg)
 975 {
 976     switch (type) {
 977     case TCG_TYPE_I32:
 978 #if TCG_TARGET_REG_BITS == 64
 979     case TCG_TYPE_I64:
 980 #endif
 981         if (ret < 16) {
 982             tcg_out_movi_int(s, type, ret, arg);
 983         } else {
 984             tcg_out_movi_vec(s, type, ret, arg);
 985         }
 986         break;
 987     default:
 988         g_assert_not_reached();
 989     }
 990 }
 991
 992 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
 993 {
 994     if (val == (int8_t)val) {
 995         tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
 996         tcg_out8(s, val);
 997     } else if (val == (int32_t)val) {
 998         tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
 999         tcg_out32(s, val);
1000     } else {
1001         tcg_abort();
1002     }
1003 }
1004
1005 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1006 {
1007     /* Given the strength of x86 memory ordering, we only need care for
1008        store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1009        faster than "mfence", so don't bother with the sse insn.  */
1010     if (a0 & TCG_MO_ST_LD) {
1011         tcg_out8(s, 0xf0);
1012         tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1013         tcg_out8(s, 0);
1014     }
1015 }
1016
1017 static inline void tcg_out_push(TCGContext *s, int reg)
1018 {
1019     tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1020 }
1021
1022 static inline void tcg_out_pop(TCGContext *s, int reg)
1023 {
1024     tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1025 }
1026
1027 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1028                        TCGReg arg1, intptr_t arg2)
1029 {
1030     switch (type) {
1031     case TCG_TYPE_I32:
1032         if (ret < 16) {
1033             tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1034         } else {
1035             tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1036         }
1037         break;
1038     case TCG_TYPE_I64:
1039         if (ret < 16) {
1040             tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1041             break;
1042         }
1043         /* FALLTHRU */
1044     case TCG_TYPE_V64:
1045         /* There is no instruction that can validate 8-byte alignment.  */
1046         tcg_debug_assert(ret >= 16);
1047         tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1048         break;
1049     case TCG_TYPE_V128:
1050         /*
1051          * The gvec infrastructure is asserts that v128 vector loads
1052          * and stores use a 16-byte aligned offset.  Validate that the
1053          * final pointer is aligned by using an insn that will SIGSEGV.
1054          */
1055         tcg_debug_assert(ret >= 16);
1056         tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1057         break;
1058     case TCG_TYPE_V256:
1059         /*
1060          * The gvec infrastructure only requires 16-byte alignment,
1061          * so here we must use an unaligned load.
1062          */
1063         tcg_debug_assert(ret >= 16);
1064         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1065                                  ret, 0, arg1, arg2);
1066         break;
1067     default:
1068         g_assert_not_reached();
1069     }
1070 }
1071
1072 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1073                        TCGReg arg1, intptr_t arg2)
1074 {
1075     switch (type) {
1076     case TCG_TYPE_I32:
1077         if (arg < 16) {
1078             tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1079         } else {
1080             tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1081         }
1082         break;
1083     case TCG_TYPE_I64:
1084         if (arg < 16) {
1085             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1086             break;
1087         }
1088         /* FALLTHRU */
1089     case TCG_TYPE_V64:
1090         /* There is no instruction that can validate 8-byte alignment.  */
1091         tcg_debug_assert(arg >= 16);
1092         tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1093         break;
1094     case TCG_TYPE_V128:
1095         /*
1096          * The gvec infrastructure is asserts that v128 vector loads
1097          * and stores use a 16-byte aligned offset.  Validate that the
1098          * final pointer is aligned by using an insn that will SIGSEGV.
1099          */
1100         tcg_debug_assert(arg >= 16);
1101         tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1102         break;
1103     case TCG_TYPE_V256:
1104         /*
1105          * The gvec infrastructure only requires 16-byte alignment,
1106          * so here we must use an unaligned store.
1107          */
1108         tcg_debug_assert(arg >= 16);
1109         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1110                                  arg, 0, arg1, arg2);
1111         break;
1112     default:
1113         g_assert_not_reached();
1114     }
1115 }
1116
1117 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1118                         TCGReg base, intptr_t ofs)
1119 {
1120     int rexw = 0;
1121     if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1122         if (val != (int32_t)val) {
1123             return false;
1124         }
1125         rexw = P_REXW;
1126     } else if (type != TCG_TYPE_I32) {
1127         return false;
1128     }
1129     tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1130     tcg_out32(s, val);
1131     return true;
1132 }
1133
1134 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1135 {
1136     /* Propagate an opcode prefix, such as P_DATA16.  */
1137     int ext = subopc & ~0x7;
1138     subopc &= 0x7;
1139
1140     if (count == 1) {
1141         tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1142     } else {
1143         tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1144         tcg_out8(s, count);
1145     }
1146 }
1147
1148 static inline void tcg_out_bswap32(TCGContext *s, int reg)
1149 {
1150     tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1151 }
1152
1153 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1154 {
1155     tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1156 }
1157
1158 static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
1159 {
1160     /* movzbl */
1161     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1162     tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1163 }
1164
1165 static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
1166 {
1167     /* movsbl */
1168     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1169     tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1170 }
1171
1172 static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
1173 {
1174     /* movzwl */
1175     tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1176 }
1177
1178 static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
1179 {
1180     /* movsw[lq] */
1181     tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1182 }
1183
1184 static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
1185 {
1186     /* 32-bit mov zero extends.  */
1187     tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1188 }
1189
1190 static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
1191 {
1192     tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1193 }
1194
1195 static inline void tcg_out_bswap64(TCGContext *s, int reg)
1196 {
1197     tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1198 }
1199
1200 static void tgen_arithi(TCGContext *s, int c, int r0,
1201                         tcg_target_long val, int cf)
1202 {
1203     int rexw = 0;
1204
1205     if (TCG_TARGET_REG_BITS == 64) {
1206         rexw = c & -8;
1207         c &= 7;
1208     }
1209
1210     /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1211        partial flags update stalls on Pentium4 and are not recommended
1212        by current Intel optimization manuals.  */
1213     if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1214         int is_inc = (c == ARITH_ADD) ^ (val < 0);
1215         if (TCG_TARGET_REG_BITS == 64) {
1216             /* The single-byte increment encodings are re-tasked as the
1217                REX prefixes.  Use the MODRM encoding.  */
1218             tcg_out_modrm(s, OPC_GRP5 + rexw,
1219                           (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1220         } else {
1221             tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1222         }
1223         return;
1224     }
1225
1226     if (c == ARITH_AND) {
1227         if (TCG_TARGET_REG_BITS == 64) {
1228             if (val == 0xffffffffu) {
1229                 tcg_out_ext32u(s, r0, r0);
1230                 return;
1231             }
1232             if (val == (uint32_t)val) {
1233                 /* AND with no high bits set can use a 32-bit operation.  */
1234                 rexw = 0;
1235             }
1236         }
1237         if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1238             tcg_out_ext8u(s, r0, r0);
1239             return;
1240         }
1241         if (val == 0xffffu) {
1242             tcg_out_ext16u(s, r0, r0);
1243             return;
1244         }
1245     }
1246
1247     if (val == (int8_t)val) {
1248         tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1249         tcg_out8(s, val);
1250         return;
1251     }
1252     if (rexw == 0 || val == (int32_t)val) {
1253         tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1254         tcg_out32(s, val);
1255         return;
1256     }
1257
1258     tcg_abort();
1259 }
1260
1261 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1262 {
1263     if (val != 0) {
1264         tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1265     }
1266 }
1267
1268 /* Use SMALL != 0 to force a short forward branch.  */
1269 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
1270 {
1271     int32_t val, val1;
1272
1273     if (l->has_value) {
1274         val = tcg_pcrel_diff(s, l->u.value_ptr);
1275         val1 = val - 2;
1276         if ((int8_t)val1 == val1) {
1277             if (opc == -1) {
1278                 tcg_out8(s, OPC_JMP_short);
1279             } else {
1280                 tcg_out8(s, OPC_JCC_short + opc);
1281             }
1282             tcg_out8(s, val1);
1283         } else {
1284             if (small) {
1285                 tcg_abort();
1286             }
1287             if (opc == -1) {
1288                 tcg_out8(s, OPC_JMP_long);
1289                 tcg_out32(s, val - 5);
1290             } else {
1291                 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1292                 tcg_out32(s, val - 6);
1293             }
1294         }
1295     } else if (small) {
1296         if (opc == -1) {
1297             tcg_out8(s, OPC_JMP_short);
1298         } else {
1299             tcg_out8(s, OPC_JCC_short + opc);
1300         }
1301         tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1302         s->code_ptr += 1;
1303     } else {
1304         if (opc == -1) {
1305             tcg_out8(s, OPC_JMP_long);
1306         } else {
1307             tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1308         }
1309         tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1310         s->code_ptr += 4;
1311     }
1312 }
1313
1314 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1315                         int const_arg2, int rexw)
1316 {
1317     if (const_arg2) {
1318         if (arg2 == 0) {
1319             /* test r, r */
1320             tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1321         } else {
1322             tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1323         }
1324     } else {
1325         tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1326     }
1327 }
1328
1329 static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1330                              TCGArg arg1, TCGArg arg2, int const_arg2,
1331                              TCGLabel *label, int small)
1332 {
1333     tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1334     tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1335 }
1336
1337 #if TCG_TARGET_REG_BITS == 64
1338 static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1339                              TCGArg arg1, TCGArg arg2, int const_arg2,
1340                              TCGLabel *label, int small)
1341 {
1342     tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1343     tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1344 }
1345 #else
1346 /* XXX: we implement it at the target level to avoid having to
1347    handle cross basic blocks temporaries */
1348 static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1349                             const int *const_args, int small)
1350 {
1351     TCGLabel *label_next = gen_new_label();
1352     TCGLabel *label_this = arg_label(args[5]);
1353
1354     switch(args[4]) {
1355     case TCG_COND_EQ:
1356         tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1357                          label_next, 1);
1358         tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1359                          label_this, small);
1360         break;
1361     case TCG_COND_NE:
1362         tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1363                          label_this, small);
1364         tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1365                          label_this, small);
1366         break;
1367     case TCG_COND_LT:
1368         tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1369                          label_this, small);
1370         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1371         tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1372                          label_this, small);
1373         break;
1374     case TCG_COND_LE:
1375         tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1376                          label_this, small);
1377         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1378         tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1379                          label_this, small);
1380         break;
1381     case TCG_COND_GT:
1382         tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1383                          label_this, small);
1384         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1385         tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1386                          label_this, small);
1387         break;
1388     case TCG_COND_GE:
1389         tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1390                          label_this, small);
1391         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1392         tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1393                          label_this, small);
1394         break;
1395     case TCG_COND_LTU:
1396         tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1397                          label_this, small);
1398         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1399         tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1400                          label_this, small);
1401         break;
1402     case TCG_COND_LEU:
1403         tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1404                          label_this, small);
1405         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1406         tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1407                          label_this, small);
1408         break;
1409     case TCG_COND_GTU:
1410         tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1411                          label_this, small);
1412         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1413         tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1414                          label_this, small);
1415         break;
1416     case TCG_COND_GEU:
1417         tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1418                          label_this, small);
1419         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1420         tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1421                          label_this, small);
1422         break;
1423     default:
1424         tcg_abort();
1425     }
1426     tcg_out_label(s, label_next);
1427 }
1428 #endif
1429
1430 static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1431                               TCGArg arg1, TCGArg arg2, int const_arg2)
1432 {
1433     tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1434     tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1435     tcg_out_ext8u(s, dest, dest);
1436 }
1437
1438 #if TCG_TARGET_REG_BITS == 64
1439 static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1440                               TCGArg arg1, TCGArg arg2, int const_arg2)
1441 {
1442     tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1443     tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1444     tcg_out_ext8u(s, dest, dest);
1445 }
1446 #else
1447 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1448                              const int *const_args)
1449 {
1450     TCGArg new_args[6];
1451     TCGLabel *label_true, *label_over;
1452
1453     memcpy(new_args, args+1, 5*sizeof(TCGArg));
1454
1455     if (args[0] == args[1] || args[0] == args[2]
1456         || (!const_args[3] && args[0] == args[3])
1457         || (!const_args[4] && args[0] == args[4])) {
1458         /* When the destination overlaps with one of the argument
1459            registers, don't do anything tricky.  */
1460         label_true = gen_new_label();
1461         label_over = gen_new_label();
1462
1463         new_args[5] = label_arg(label_true);
1464         tcg_out_brcond2(s, new_args, const_args+1, 1);
1465
1466         tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1467         tcg_out_jxx(s, JCC_JMP, label_over, 1);
1468         tcg_out_label(s, label_true);
1469
1470         tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1471         tcg_out_label(s, label_over);
1472     } else {
1473         /* When the destination does not overlap one of the arguments,
1474            clear the destination first, jump if cond false, and emit an
1475            increment in the true case.  This results in smaller code.  */
1476
1477         tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1478
1479         label_over = gen_new_label();
1480         new_args[4] = tcg_invert_cond(new_args[4]);
1481         new_args[5] = label_arg(label_over);
1482         tcg_out_brcond2(s, new_args, const_args+1, 1);
1483
1484         tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1485         tcg_out_label(s, label_over);
1486     }
1487 }
1488 #endif
1489
1490 static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1491                          TCGReg dest, TCGReg v1)
1492 {
1493     if (have_cmov) {
1494         tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1495     } else {
1496         TCGLabel *over = gen_new_label();
1497         tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1498         tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1499         tcg_out_label(s, over);
1500     }
1501 }
1502
1503 static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1504                               TCGReg c1, TCGArg c2, int const_c2,
1505                               TCGReg v1)
1506 {
1507     tcg_out_cmp(s, c1, c2, const_c2, 0);
1508     tcg_out_cmov(s, cond, 0, dest, v1);
1509 }
1510
1511 #if TCG_TARGET_REG_BITS == 64
1512 static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1513                               TCGReg c1, TCGArg c2, int const_c2,
1514                               TCGReg v1)
1515 {
1516     tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1517     tcg_out_cmov(s, cond, P_REXW, dest, v1);
1518 }
1519 #endif
1520
1521 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1522                         TCGArg arg2, bool const_a2)
1523 {
1524     if (have_bmi1) {
1525         tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1526         if (const_a2) {
1527             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1528         } else {
1529             tcg_debug_assert(dest != arg2);
1530             tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1531         }
1532     } else {
1533         tcg_debug_assert(dest != arg2);
1534         tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1535         tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1536     }
1537 }
1538
1539 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1540                         TCGArg arg2, bool const_a2)
1541 {
1542     if (have_lzcnt) {
1543         tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1544         if (const_a2) {
1545             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1546         } else {
1547             tcg_debug_assert(dest != arg2);
1548             tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1549         }
1550     } else {
1551         tcg_debug_assert(!const_a2);
1552         tcg_debug_assert(dest != arg1);
1553         tcg_debug_assert(dest != arg2);
1554
1555         /* Recall that the output of BSR is the index not the count.  */
1556         tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1557         tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1558
1559         /* Since we have destroyed the flags from BSR, we have to re-test.  */
1560         tcg_out_cmp(s, arg1, 0, 1, rexw);
1561         tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1562     }
1563 }
1564
1565 static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1566 {
1567     intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1568
1569     if (disp == (int32_t)disp) {
1570         tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1571         tcg_out32(s, disp);
1572     } else {
1573         /* rip-relative addressing into the constant pool.
1574            This is 6 + 8 = 14 bytes, as compared to using an
1575            an immediate load 10 + 6 = 16 bytes, plus we may
1576            be able to re-use the pool constant for more calls.  */
1577         tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1578         tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1579         new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1580         tcg_out32(s, 0);
1581     }
1582 }
1583
1584 static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest)
1585 {
1586     tcg_out_branch(s, 1, dest);
1587 }
1588
1589 static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1590 {
1591     tcg_out_branch(s, 0, dest);
1592 }
1593
1594 static void tcg_out_nopn(TCGContext *s, int n)
1595 {
1596     int i;
1597     /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1598      * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1599      * duplicate prefix, and all of the interesting recent cores can
1600      * decode and discard the duplicates in a single cycle.
1601      */
1602     tcg_debug_assert(n >= 1);
1603     for (i = 1; i < n; ++i) {
1604         tcg_out8(s, 0x66);
1605     }
1606     tcg_out8(s, 0x90);
1607 }
1608
1609 #if defined(CONFIG_SOFTMMU)
1610 #include "../tcg-ldst.c.inc"
1611
/*
 * Load helper functions, indexed by MemOp size and endianness.  Slots
 * without an explicit initializer are NULL.
 *
 * helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
 *                                     int mmu_idx, uintptr_t ra)
 */
static void * const qemu_ld_helpers[16] = {
    [MO_UB]   = helper_ret_ldub_mmu,
    [MO_LEUW] = helper_le_lduw_mmu,
    [MO_LEUL] = helper_le_ldul_mmu,
    [MO_LEQ]  = helper_le_ldq_mmu,
    [MO_BEUW] = helper_be_lduw_mmu,
    [MO_BEUL] = helper_be_ldul_mmu,
    [MO_BEQ]  = helper_be_ldq_mmu,
};
1624
/*
 * Store helper functions, indexed by MemOp size and endianness.  Slots
 * without an explicit initializer are NULL.
 *
 * helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
 *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
 */
static void * const qemu_st_helpers[16] = {
    [MO_UB]   = helper_ret_stb_mmu,
    [MO_LEUW] = helper_le_stw_mmu,
    [MO_LEUL] = helper_le_stl_mmu,
    [MO_LEQ]  = helper_le_stq_mmu,
    [MO_BEUW] = helper_be_stw_mmu,
    [MO_BEUL] = helper_be_stl_mmu,
    [MO_BEQ]  = helper_be_stq_mmu,
};
1637
/* Perform the TLB load and compare.

   Inputs:
   ADDRLO and ADDRHI contain the low and high part of the address.

   MEM_INDEX is the memory context.  OPC is the memory operation, from
   which the access size and the required alignment are extracted.

   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
   This should be offsetof addr_read or addr_write.

   Outputs:
   LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
   positions of the displacements of forward jumps to the TLB miss case.

   Second argument register is loaded with the low part of the address.
   In the TLB hit case, it has been adjusted as indicated by the TLB
   and so is a host address.  In the TLB miss case, it continues to
   hold a guest address.

   First argument register is clobbered.  */

static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
                                    int mem_index, MemOp opc,
                                    tcg_insn_unit **label_ptr, int which)
{
    const TCGReg r0 = TCG_REG_L0;
    const TCGReg r1 = TCG_REG_L1;
    /* Operand types and REX.W flags for the three kinds of arithmetic
       below: guest addresses (t*), host pointers (h*) and the TLB
       index computation (tlb*).  */
    TCGType ttype = TCG_TYPE_I32;
    TCGType tlbtype = TCG_TYPE_I32;
    int trexw = 0, hrexw = 0, tlbrexw = 0;
    unsigned a_bits = get_alignment_bits(opc);
    unsigned s_bits = opc & MO_SIZE;
    unsigned a_mask = (1 << a_bits) - 1;
    unsigned s_mask = (1 << s_bits) - 1;
    target_ulong tlb_mask;

    if (TCG_TARGET_REG_BITS == 64) {
        if (TARGET_LONG_BITS == 64) {
            ttype = TCG_TYPE_I64;
            trexw = P_REXW;
        }
        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
            hrexw = P_REXW;
            /* The index computation only needs 64 bits when the page
               and TLB index bits together exceed 32.  */
            if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
                tlbtype = TCG_TYPE_I64;
                tlbrexw = P_REXW;
            }
        }
    }

    /* r0 = addrlo >> (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS) */
    tcg_out_mov(s, tlbtype, r0, addrlo);
    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);

    /* r0 &= the "mask" field of this mem_index's CPUTLBDescFast */
    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
                         TLB_MASK_TABLE_OFS(mem_index) +
                         offsetof(CPUTLBDescFast, mask));

    /* r0 += the "table" field; r0 is the base for the entry accesses below */
    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
                         TLB_MASK_TABLE_OFS(mem_index) +
                         offsetof(CPUTLBDescFast, table));

    /* If the required alignment is at least as large as the access, simply
       copy the address and mask.  For lesser alignments, check that we don't
       cross pages for the complete access.  */
    if (a_bits >= s_bits) {
        tcg_out_mov(s, ttype, r1, addrlo);
    } else {
        /* r1 = addrlo + (s_mask - a_mask) */
        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
    }
    /* Keep the page number plus the low bits that alignment requires
       to be zero.  */
    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
    tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);

    /* cmp 0(r0), r1 */
    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);

    /* Prepare for both the fast path add of the tlb addend, and the slow
       path function argument setup.  */
    tcg_out_mov(s, ttype, r1, addrlo);

    /* jne slow_path */
    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
    /* Record where the 4-byte displacement lives and reserve the space;
       nothing is written to it here.  */
    label_ptr[0] = s->code_ptr;
    s->code_ptr += 4;

    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
        /* The guest address is wider than a host register: the high
           part is compared separately.  */
        /* cmp 4(r0), addrhi */
        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);

        /* jne slow_path */
        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
        label_ptr[1] = s->code_ptr;
        s->code_ptr += 4;
    }

    /* TLB Hit.  */

    /* add addend(r0), r1 */
    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
                         offsetof(CPUTLBEntry, addend));
}
1739
1740 /*
1741  * Record the context of a call to the out of line helper code for the slow path
1742  * for a load or store, so that we can later generate the correct helper code
1743  */
1744 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
1745                                 TCGMemOpIdx oi,
1746                                 TCGReg datalo, TCGReg datahi,
1747                                 TCGReg addrlo, TCGReg addrhi,
1748                                 tcg_insn_unit *raddr,
1749                                 tcg_insn_unit **label_ptr)
1750 {
1751     TCGLabelQemuLdst *label = new_ldst_label(s);
1752
1753     label->is_ld = is_ld;
1754     label->oi = oi;
1755     label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
1756     label->datalo_reg = datalo;
1757     label->datahi_reg = datahi;
1758     label->addrlo_reg = addrlo;
1759     label->addrhi_reg = addrhi;
1760     label->raddr = tcg_splitwx_to_rx(raddr);
1761     label->label_ptr[0] = label_ptr[0];
1762     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1763         label->label_ptr[1] = label_ptr[1];
1764     }
1765 }
1766
1767 /*
1768  * Generate code for the slow path for a load at the end of block
1769  */
1770 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1771 {
1772     TCGMemOpIdx oi = l->oi;
1773     MemOp opc = get_memop(oi);
1774     TCGReg data_reg;
1775     tcg_insn_unit **label_ptr = &l->label_ptr[0];
1776     int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
1777
1778     /* resolve label address */
1779     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1780     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1781         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1782     }
1783
1784     if (TCG_TARGET_REG_BITS == 32) {
1785         int ofs = 0;
1786
1787         tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1788         ofs += 4;
1789
1790         tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1791         ofs += 4;
1792
1793         if (TARGET_LONG_BITS == 64) {
1794             tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1795             ofs += 4;
1796         }
1797
1798         tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1799         ofs += 4;
1800
1801         tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1802     } else {
1803         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1804         /* The second argument is already loaded with addrlo.  */
1805         tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1806         tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1807                      (uintptr_t)l->raddr);
1808     }
1809
1810     tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1811
1812     data_reg = l->datalo_reg;
1813     switch (opc & MO_SSIZE) {
1814     case MO_SB:
1815         tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
1816         break;
1817     case MO_SW:
1818         tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
1819         break;
1820 #if TCG_TARGET_REG_BITS == 64
1821     case MO_SL:
1822         tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1823         break;
1824 #endif
1825     case MO_UB:
1826     case MO_UW:
1827         /* Note that the helpers have zero-extended to tcg_target_long.  */
1828     case MO_UL:
1829         tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1830         break;
1831     case MO_Q:
1832         if (TCG_TARGET_REG_BITS == 64) {
1833             tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1834         } else if (data_reg == TCG_REG_EDX) {
1835             /* xchg %edx, %eax */
1836             tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1837             tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1838         } else {
1839             tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1840             tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1841         }
1842         break;
1843     default:
1844         tcg_abort();
1845     }
1846
1847     /* Jump to the code corresponding to next IR of qemu_st */
1848     tcg_out_jmp(s, l->raddr);
1849     return true;
1850 }
1851
1852 /*
1853  * Generate code for the slow path for a store at the end of block
1854  */
1855 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1856 {
1857     TCGMemOpIdx oi = l->oi;
1858     MemOp opc = get_memop(oi);
1859     MemOp s_bits = opc & MO_SIZE;
1860     tcg_insn_unit **label_ptr = &l->label_ptr[0];
1861     TCGReg retaddr;
1862
1863     /* resolve label address */
1864     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1865     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1866         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1867     }
1868
1869     if (TCG_TARGET_REG_BITS == 32) {
1870         int ofs = 0;
1871
1872         tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1873         ofs += 4;
1874
1875         tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1876         ofs += 4;
1877
1878         if (TARGET_LONG_BITS == 64) {
1879             tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1880             ofs += 4;
1881         }
1882
1883         tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1884         ofs += 4;
1885
1886         if (s_bits == MO_64) {
1887             tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1888             ofs += 4;
1889         }
1890
1891         tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1892         ofs += 4;
1893
1894         retaddr = TCG_REG_EAX;
1895         tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1896         tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1897     } else {
1898         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1899         /* The second argument is already loaded with addrlo.  */
1900         tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1901                     tcg_target_call_iarg_regs[2], l->datalo_reg);
1902         tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1903
1904         if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1905             retaddr = tcg_target_call_iarg_regs[4];
1906             tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1907         } else {
1908             retaddr = TCG_REG_RAX;
1909             tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1910             tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1911                        TCG_TARGET_CALL_STACK_OFFSET);
1912         }
1913     }
1914
1915     /* "Tail call" to the helper, with the return address back inline.  */
1916     tcg_out_push(s, retaddr);
1917     tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1918     return true;
1919 }
1920 #elif TCG_TARGET_REG_BITS == 32
1921 # define x86_guest_base_seg     0
1922 # define x86_guest_base_index   -1
1923 # define x86_guest_base_offset  guest_base
1924 #else
1925 static int x86_guest_base_seg;
1926 static int x86_guest_base_index = -1;
1927 static int32_t x86_guest_base_offset;
1928 # if defined(__x86_64__) && defined(__linux__)
1929 #  include <asm/prctl.h>
1930 #  include <sys/prctl.h>
1931 int arch_prctl(int code, unsigned long addr);
1932 static inline int setup_guest_base_seg(void)
1933 {
1934     if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1935         return P_GS;
1936     }
1937     return 0;
1938 }
1939 # elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
1940 #  include <machine/sysarch.h>
1941 static inline int setup_guest_base_seg(void)
1942 {
1943     if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1944         return P_GS;
1945     }
1946     return 0;
1947 }
1948 # else
1949 static inline int setup_guest_base_seg(void)
1950 {
1951     return 0;
1952 }
1953 # endif
1954 #endif /* SOFTMMU */
1955
1956 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1957                                    TCGReg base, int index, intptr_t ofs,
1958                                    int seg, bool is64, MemOp memop)
1959 {
1960     bool use_movbe = false;
1961     int rexw = is64 * P_REXW;
1962     int movop = OPC_MOVL_GvEv;
1963
1964     /* Do big-endian loads with movbe.  */
1965     if (memop & MO_BSWAP) {
1966         tcg_debug_assert(have_movbe);
1967         use_movbe = true;
1968         movop = OPC_MOVBE_GyMy;
1969     }
1970
1971     switch (memop & MO_SSIZE) {
1972     case MO_UB:
1973         tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
1974                                  base, index, 0, ofs);
1975         break;
1976     case MO_SB:
1977         tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
1978                                  base, index, 0, ofs);
1979         break;
1980     case MO_UW:
1981         if (use_movbe) {
1982             /* There is no extending movbe; only low 16-bits are modified.  */
1983             if (datalo != base && datalo != index) {
1984                 /* XOR breaks dependency chains.  */
1985                 tgen_arithr(s, ARITH_XOR, datalo, datalo);
1986                 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1987                                          datalo, base, index, 0, ofs);
1988             } else {
1989                 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1990                                          datalo, base, index, 0, ofs);
1991                 tcg_out_ext16u(s, datalo, datalo);
1992             }
1993         } else {
1994             tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1995                                      base, index, 0, ofs);
1996         }
1997         break;
1998     case MO_SW:
1999         if (use_movbe) {
2000             tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2001                                      datalo, base, index, 0, ofs);
2002             tcg_out_ext16s(s, datalo, datalo, rexw);
2003         } else {
2004             tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
2005                                      datalo, base, index, 0, ofs);
2006         }
2007         break;
2008     case MO_UL:
2009         tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2010         break;
2011 #if TCG_TARGET_REG_BITS == 64
2012     case MO_SL:
2013         if (use_movbe) {
2014             tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + seg, datalo,
2015                                      base, index, 0, ofs);
2016             tcg_out_ext32s(s, datalo, datalo);
2017         } else {
2018             tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
2019                                      base, index, 0, ofs);
2020         }
2021         break;
2022 #endif
2023     case MO_Q:
2024         if (TCG_TARGET_REG_BITS == 64) {
2025             tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2026                                      base, index, 0, ofs);
2027         } else {
2028             if (use_movbe) {
2029                 TCGReg t = datalo;
2030                 datalo = datahi;
2031                 datahi = t;
2032             }
2033             if (base != datalo) {
2034                 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2035                                          base, index, 0, ofs);
2036                 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2037                                          base, index, 0, ofs + 4);
2038             } else {
2039                 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2040                                          base, index, 0, ofs + 4);
2041                 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2042                                          base, index, 0, ofs);
2043             }
2044         }
2045         break;
2046     default:
2047         g_assert_not_reached();
2048     }
2049 }
2050
2051 /* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
2052    EAX. It will be useful once fixed registers globals are less
2053    common. */
2054 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
2055 {
2056     TCGReg datalo, datahi, addrlo;
2057     TCGReg addrhi __attribute__((unused));
2058     TCGMemOpIdx oi;
2059     MemOp opc;
2060 #if defined(CONFIG_SOFTMMU)
2061     int mem_index;
2062     tcg_insn_unit *label_ptr[2];
2063 #endif
2064
2065     datalo = *args++;
2066     datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2067     addrlo = *args++;
2068     addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2069     oi = *args++;
2070     opc = get_memop(oi);
2071
2072 #if defined(CONFIG_SOFTMMU)
2073     mem_index = get_mmuidx(oi);
2074
2075     tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2076                      label_ptr, offsetof(CPUTLBEntry, addr_read));
2077
2078     /* TLB Hit.  */
2079     tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
2080
2081     /* Record the current context of a load into ldst label */
2082     add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2083                         s->code_ptr, label_ptr);
2084 #else
2085     tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2086                            x86_guest_base_offset, x86_guest_base_seg,
2087                            is64, opc);
2088 #endif
2089 }
2090
2091 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2092                                    TCGReg base, int index, intptr_t ofs,
2093                                    int seg, MemOp memop)
2094 {
2095     bool use_movbe = false;
2096     int movop = OPC_MOVL_EvGv;
2097
2098     /*
2099      * Do big-endian stores with movbe or softmmu.
2100      * User-only without movbe will have its swapping done generically.
2101      */
2102     if (memop & MO_BSWAP) {
2103         tcg_debug_assert(have_movbe);
2104         use_movbe = true;
2105         movop = OPC_MOVBE_MyGy;
2106     }
2107
2108     switch (memop & MO_SIZE) {
2109     case MO_8:
2110         /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2111         tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2112         tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2113                                  datalo, base, index, 0, ofs);
2114         break;
2115     case MO_16:
2116         tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
2117                                  base, index, 0, ofs);
2118         break;
2119     case MO_32:
2120         tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2121         break;
2122     case MO_64:
2123         if (TCG_TARGET_REG_BITS == 64) {
2124             tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2125                                      base, index, 0, ofs);
2126         } else {
2127             if (use_movbe) {
2128                 TCGReg t = datalo;
2129                 datalo = datahi;
2130                 datahi = t;
2131             }
2132             tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2133                                      base, index, 0, ofs);
2134             tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2135                                      base, index, 0, ofs + 4);
2136         }
2137         break;
2138     default:
2139         g_assert_not_reached();
2140     }
2141 }
2142
2143 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2144 {
2145     TCGReg datalo, datahi, addrlo;
2146     TCGReg addrhi __attribute__((unused));
2147     TCGMemOpIdx oi;
2148     MemOp opc;
2149 #if defined(CONFIG_SOFTMMU)
2150     int mem_index;
2151     tcg_insn_unit *label_ptr[2];
2152 #endif
2153
2154     datalo = *args++;
2155     datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2156     addrlo = *args++;
2157     addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2158     oi = *args++;
2159     opc = get_memop(oi);
2160
2161 #if defined(CONFIG_SOFTMMU)
2162     mem_index = get_mmuidx(oi);
2163
2164     tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2165                      label_ptr, offsetof(CPUTLBEntry, addr_write));
2166
2167     /* TLB Hit.  */
2168     tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2169
2170     /* Record the current context of a store into ldst label */
2171     add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
2172                         s->code_ptr, label_ptr);
2173 #else
2174     tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2175                            x86_guest_base_offset, x86_guest_base_seg, opc);
2176 #endif
2177 }
2178
2179 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2180                               const TCGArg *args, const int *const_args)
2181 {
2182     TCGArg a0, a1, a2;
2183     int c, const_a2, vexop, rexw = 0;
2184
2185 #if TCG_TARGET_REG_BITS == 64
2186 # define OP_32_64(x) \
2187         case glue(glue(INDEX_op_, x), _i64): \
2188             rexw = P_REXW; /* FALLTHRU */    \
2189         case glue(glue(INDEX_op_, x), _i32)
2190 #else
2191 # define OP_32_64(x) \
2192         case glue(glue(INDEX_op_, x), _i32)
2193 #endif
2194
2195     /* Hoist the loads of the most common arguments.  */
2196     a0 = args[0];
2197     a1 = args[1];
2198     a2 = args[2];
2199     const_a2 = const_args[2];
2200
2201     switch (opc) {
2202     case INDEX_op_exit_tb:
2203         /* Reuse the zeroing that exists for goto_ptr.  */
2204         if (a0 == 0) {
2205             tcg_out_jmp(s, tcg_code_gen_epilogue);
2206         } else {
2207             tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2208             tcg_out_jmp(s, tb_ret_addr);
2209         }
2210         break;
2211     case INDEX_op_goto_tb:
2212         if (s->tb_jmp_insn_offset) {
2213             /* direct jump method */
2214             int gap;
2215             /* jump displacement must be aligned for atomic patching;
2216              * see if we need to add extra nops before jump
2217              */
2218             gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2219             if (gap != 1) {
2220                 tcg_out_nopn(s, gap - 1);
2221             }
2222             tcg_out8(s, OPC_JMP_long); /* jmp im */
2223             s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2224             tcg_out32(s, 0);
2225         } else {
2226             /* indirect jump method */
2227             tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2228                                  (intptr_t)(s->tb_jmp_target_addr + a0));
2229         }
2230         set_jmp_reset_offset(s, a0);
2231         break;
2232     case INDEX_op_goto_ptr:
2233         /* jmp to the given host address (could be epilogue) */
2234         tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2235         break;
2236     case INDEX_op_br:
2237         tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2238         break;
2239     OP_32_64(ld8u):
2240         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2241         tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2242         break;
2243     OP_32_64(ld8s):
2244         tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2245         break;
2246     OP_32_64(ld16u):
2247         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2248         tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2249         break;
2250     OP_32_64(ld16s):
2251         tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2252         break;
2253 #if TCG_TARGET_REG_BITS == 64
2254     case INDEX_op_ld32u_i64:
2255 #endif
2256     case INDEX_op_ld_i32:
2257         tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2258         break;
2259
2260     OP_32_64(st8):
2261         if (const_args[0]) {
2262             tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2263             tcg_out8(s, a0);
2264         } else {
2265             tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2266         }
2267         break;
2268     OP_32_64(st16):
2269         if (const_args[0]) {
2270             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2271             tcg_out16(s, a0);
2272         } else {
2273             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2274         }
2275         break;
2276 #if TCG_TARGET_REG_BITS == 64
2277     case INDEX_op_st32_i64:
2278 #endif
2279     case INDEX_op_st_i32:
2280         if (const_args[0]) {
2281             tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2282             tcg_out32(s, a0);
2283         } else {
2284             tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2285         }
2286         break;
2287
2288     OP_32_64(add):
2289         /* For 3-operand addition, use LEA.  */
2290         if (a0 != a1) {
2291             TCGArg c3 = 0;
2292             if (const_a2) {
2293                 c3 = a2, a2 = -1;
2294             } else if (a0 == a2) {
2295                 /* Watch out for dest = src + dest, since we've removed
2296                    the matching constraint on the add.  */
2297                 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2298                 break;
2299             }
2300
2301             tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2302             break;
2303         }
2304         c = ARITH_ADD;
2305         goto gen_arith;
2306     OP_32_64(sub):
2307         c = ARITH_SUB;
2308         goto gen_arith;
2309     OP_32_64(and):
2310         c = ARITH_AND;
2311         goto gen_arith;
2312     OP_32_64(or):
2313         c = ARITH_OR;
2314         goto gen_arith;
2315     OP_32_64(xor):
2316         c = ARITH_XOR;
2317         goto gen_arith;
2318     gen_arith:
2319         if (const_a2) {
2320             tgen_arithi(s, c + rexw, a0, a2, 0);
2321         } else {
2322             tgen_arithr(s, c + rexw, a0, a2);
2323         }
2324         break;
2325
2326     OP_32_64(andc):
2327         if (const_a2) {
2328             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2329             tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2330         } else {
2331             tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2332         }
2333         break;
2334
2335     OP_32_64(mul):
2336         if (const_a2) {
2337             int32_t val;
2338             val = a2;
2339             if (val == (int8_t)val) {
2340                 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2341                 tcg_out8(s, val);
2342             } else {
2343                 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2344                 tcg_out32(s, val);
2345             }
2346         } else {
2347             tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2348         }
2349         break;
2350
2351     OP_32_64(div2):
2352         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2353         break;
2354     OP_32_64(divu2):
2355         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2356         break;
2357
2358     OP_32_64(shl):
2359         /* For small constant 3-operand shift, use LEA.  */
2360         if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2361             if (a2 - 1 == 0) {
2362                 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2363                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2364             } else {
2365                 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2366                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2367             }
2368             break;
2369         }
2370         c = SHIFT_SHL;
2371         vexop = OPC_SHLX;
2372         goto gen_shift_maybe_vex;
2373     OP_32_64(shr):
2374         c = SHIFT_SHR;
2375         vexop = OPC_SHRX;
2376         goto gen_shift_maybe_vex;
2377     OP_32_64(sar):
2378         c = SHIFT_SAR;
2379         vexop = OPC_SARX;
2380         goto gen_shift_maybe_vex;
2381     OP_32_64(rotl):
2382         c = SHIFT_ROL;
2383         goto gen_shift;
2384     OP_32_64(rotr):
2385         c = SHIFT_ROR;
2386         goto gen_shift;
2387     gen_shift_maybe_vex:
2388         if (have_bmi2) {
2389             if (!const_a2) {
2390                 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2391                 break;
2392             }
2393             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2394         }
2395         /* FALLTHRU */
2396     gen_shift:
2397         if (const_a2) {
2398             tcg_out_shifti(s, c + rexw, a0, a2);
2399         } else {
2400             tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2401         }
2402         break;
2403
2404     OP_32_64(ctz):
2405         tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2406         break;
2407     OP_32_64(clz):
2408         tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2409         break;
2410     OP_32_64(ctpop):
2411         tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2412         break;
2413
2414     case INDEX_op_brcond_i32:
2415         tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2416         break;
2417     case INDEX_op_setcond_i32:
2418         tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2419         break;
2420     case INDEX_op_movcond_i32:
2421         tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2422         break;
2423
2424     OP_32_64(bswap16):
2425         tcg_out_rolw_8(s, a0);
2426         break;
2427     OP_32_64(bswap32):
2428         tcg_out_bswap32(s, a0);
2429         break;
2430
2431     OP_32_64(neg):
2432         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2433         break;
2434     OP_32_64(not):
2435         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2436         break;
2437
2438     OP_32_64(ext8s):
2439         tcg_out_ext8s(s, a0, a1, rexw);
2440         break;
2441     OP_32_64(ext16s):
2442         tcg_out_ext16s(s, a0, a1, rexw);
2443         break;
2444     OP_32_64(ext8u):
2445         tcg_out_ext8u(s, a0, a1);
2446         break;
2447     OP_32_64(ext16u):
2448         tcg_out_ext16u(s, a0, a1);
2449         break;
2450
2451     case INDEX_op_qemu_ld_i32:
2452         tcg_out_qemu_ld(s, args, 0);
2453         break;
2454     case INDEX_op_qemu_ld_i64:
2455         tcg_out_qemu_ld(s, args, 1);
2456         break;
2457     case INDEX_op_qemu_st_i32:
2458     case INDEX_op_qemu_st8_i32:
2459         tcg_out_qemu_st(s, args, 0);
2460         break;
2461     case INDEX_op_qemu_st_i64:
2462         tcg_out_qemu_st(s, args, 1);
2463         break;
2464
2465     OP_32_64(mulu2):
2466         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2467         break;
2468     OP_32_64(muls2):
2469         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2470         break;
2471     OP_32_64(add2):
2472         if (const_args[4]) {
2473             tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2474         } else {
2475             tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2476         }
2477         if (const_args[5]) {
2478             tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2479         } else {
2480             tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2481         }
2482         break;
2483     OP_32_64(sub2):
2484         if (const_args[4]) {
2485             tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2486         } else {
2487             tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2488         }
2489         if (const_args[5]) {
2490             tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2491         } else {
2492             tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2493         }
2494         break;
2495
2496 #if TCG_TARGET_REG_BITS == 32
2497     case INDEX_op_brcond2_i32:
2498         tcg_out_brcond2(s, args, const_args, 0);
2499         break;
2500     case INDEX_op_setcond2_i32:
2501         tcg_out_setcond2(s, args, const_args);
2502         break;
2503 #else /* TCG_TARGET_REG_BITS == 64 */
2504     case INDEX_op_ld32s_i64:
2505         tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2506         break;
2507     case INDEX_op_ld_i64:
2508         tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2509         break;
2510     case INDEX_op_st_i64:
2511         if (const_args[0]) {
2512             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2513             tcg_out32(s, a0);
2514         } else {
2515             tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2516         }
2517         break;
2518
2519     case INDEX_op_brcond_i64:
2520         tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2521         break;
2522     case INDEX_op_setcond_i64:
2523         tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2524         break;
2525     case INDEX_op_movcond_i64:
2526         tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2527         break;
2528
2529     case INDEX_op_bswap64_i64:
2530         tcg_out_bswap64(s, a0);
2531         break;
2532     case INDEX_op_extu_i32_i64:
2533     case INDEX_op_ext32u_i64:
2534     case INDEX_op_extrl_i64_i32:
2535         tcg_out_ext32u(s, a0, a1);
2536         break;
2537     case INDEX_op_ext_i32_i64:
2538     case INDEX_op_ext32s_i64:
2539         tcg_out_ext32s(s, a0, a1);
2540         break;
2541     case INDEX_op_extrh_i64_i32:
2542         tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2543         break;
2544 #endif
2545
2546     OP_32_64(deposit):
2547         if (args[3] == 0 && args[4] == 8) {
2548             /* load bits 0..7 */
2549             tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2550         } else if (args[3] == 8 && args[4] == 8) {
2551             /* load bits 8..15 */
2552             tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2553         } else if (args[3] == 0 && args[4] == 16) {
2554             /* load bits 0..15 */
2555             tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2556         } else {
2557             tcg_abort();
2558         }
2559         break;
2560
2561     case INDEX_op_extract_i64:
2562         if (a2 + args[3] == 32) {
2563             /* This is a 32-bit zero-extending right shift.  */
2564             tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2565             tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2566             break;
2567         }
2568         /* FALLTHRU */
2569     case INDEX_op_extract_i32:
2570         /* On the off-chance that we can use the high-byte registers.
2571            Otherwise we emit the same ext16 + shift pattern that we
2572            would have gotten from the normal tcg-op.c expansion.  */
2573         tcg_debug_assert(a2 == 8 && args[3] == 8);
2574         if (a1 < 4 && a0 < 8) {
2575             tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2576         } else {
2577             tcg_out_ext16u(s, a0, a1);
2578             tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2579         }
2580         break;
2581
2582     case INDEX_op_sextract_i32:
2583         /* We don't implement sextract_i64, as we cannot sign-extend to
2584            64-bits without using the REX prefix that explicitly excludes
2585            access to the high-byte registers.  */
2586         tcg_debug_assert(a2 == 8 && args[3] == 8);
2587         if (a1 < 4 && a0 < 8) {
2588             tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2589         } else {
2590             tcg_out_ext16s(s, a0, a1, 0);
2591             tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2592         }
2593         break;
2594
2595     OP_32_64(extract2):
2596         /* Note that SHRD outputs to the r/m operand.  */
2597         tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2598         tcg_out8(s, args[3]);
2599         break;
2600
2601     case INDEX_op_mb:
2602         tcg_out_mb(s, a0);
2603         break;
2604     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2605     case INDEX_op_mov_i64:
2606     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2607     default:
2608         tcg_abort();
2609     }
2610
2611 #undef OP_32_64
2612 }
2613
2614 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2615                            unsigned vecl, unsigned vece,
2616                            const TCGArg *args, const int *const_args)
2617 {
2618     static int const add_insn[4] = {
2619         OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2620     };
2621     static int const ssadd_insn[4] = {
2622         OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2623     };
2624     static int const usadd_insn[4] = {
2625         OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2626     };
2627     static int const sub_insn[4] = {
2628         OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2629     };
2630     static int const sssub_insn[4] = {
2631         OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2632     };
2633     static int const ussub_insn[4] = {
2634         OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2635     };
2636     static int const mul_insn[4] = {
2637         OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
2638     };
2639     static int const shift_imm_insn[4] = {
2640         OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2641     };
2642     static int const cmpeq_insn[4] = {
2643         OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2644     };
2645     static int const cmpgt_insn[4] = {
2646         OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2647     };
2648     static int const punpckl_insn[4] = {
2649         OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2650     };
2651     static int const punpckh_insn[4] = {
2652         OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2653     };
2654     static int const packss_insn[4] = {
2655         OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2656     };
2657     static int const packus_insn[4] = {
2658         OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2659     };
2660     static int const smin_insn[4] = {
2661         OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
2662     };
2663     static int const smax_insn[4] = {
2664         OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
2665     };
2666     static int const umin_insn[4] = {
2667         OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
2668     };
2669     static int const umax_insn[4] = {
2670         OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
2671     };
2672     static int const shlv_insn[4] = {
2673         /* TODO: AVX512 adds support for MO_16.  */
2674         OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
2675     };
2676     static int const shrv_insn[4] = {
2677         /* TODO: AVX512 adds support for MO_16.  */
2678         OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
2679     };
2680     static int const sarv_insn[4] = {
2681         /* TODO: AVX512 adds support for MO_16, MO_64.  */
2682         OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
2683     };
2684     static int const shls_insn[4] = {
2685         OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2686     };
2687     static int const shrs_insn[4] = {
2688         OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2689     };
2690     static int const sars_insn[4] = {
2691         OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2
2692     };
2693     static int const abs_insn[4] = {
2694         /* TODO: AVX512 adds support for MO_64.  */
2695         OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
2696     };
2697
2698     TCGType type = vecl + TCG_TYPE_V64;
2699     int insn, sub;
2700     TCGArg a0, a1, a2;
2701
2702     a0 = args[0];
2703     a1 = args[1];
2704     a2 = args[2];
2705
2706     switch (opc) {
2707     case INDEX_op_add_vec:
2708         insn = add_insn[vece];
2709         goto gen_simd;
2710     case INDEX_op_ssadd_vec:
2711         insn = ssadd_insn[vece];
2712         goto gen_simd;
2713     case INDEX_op_usadd_vec:
2714         insn = usadd_insn[vece];
2715         goto gen_simd;
2716     case INDEX_op_sub_vec:
2717         insn = sub_insn[vece];
2718         goto gen_simd;
2719     case INDEX_op_sssub_vec:
2720         insn = sssub_insn[vece];
2721         goto gen_simd;
2722     case INDEX_op_ussub_vec:
2723         insn = ussub_insn[vece];
2724         goto gen_simd;
2725     case INDEX_op_mul_vec:
2726         insn = mul_insn[vece];
2727         goto gen_simd;
2728     case INDEX_op_and_vec:
2729         insn = OPC_PAND;
2730         goto gen_simd;
2731     case INDEX_op_or_vec:
2732         insn = OPC_POR;
2733         goto gen_simd;
2734     case INDEX_op_xor_vec:
2735         insn = OPC_PXOR;
2736         goto gen_simd;
2737     case INDEX_op_smin_vec:
2738         insn = smin_insn[vece];
2739         goto gen_simd;
2740     case INDEX_op_umin_vec:
2741         insn = umin_insn[vece];
2742         goto gen_simd;
2743     case INDEX_op_smax_vec:
2744         insn = smax_insn[vece];
2745         goto gen_simd;
2746     case INDEX_op_umax_vec:
2747         insn = umax_insn[vece];
2748         goto gen_simd;
2749     case INDEX_op_shlv_vec:
2750         insn = shlv_insn[vece];
2751         goto gen_simd;
2752     case INDEX_op_shrv_vec:
2753         insn = shrv_insn[vece];
2754         goto gen_simd;
2755     case INDEX_op_sarv_vec:
2756         insn = sarv_insn[vece];
2757         goto gen_simd;
2758     case INDEX_op_shls_vec:
2759         insn = shls_insn[vece];
2760         goto gen_simd;
2761     case INDEX_op_shrs_vec:
2762         insn = shrs_insn[vece];
2763         goto gen_simd;
2764     case INDEX_op_sars_vec:
2765         insn = sars_insn[vece];
2766         goto gen_simd;
2767     case INDEX_op_x86_punpckl_vec:
2768         insn = punpckl_insn[vece];
2769         goto gen_simd;
2770     case INDEX_op_x86_punpckh_vec:
2771         insn = punpckh_insn[vece];
2772         goto gen_simd;
2773     case INDEX_op_x86_packss_vec:
2774         insn = packss_insn[vece];
2775         goto gen_simd;
2776     case INDEX_op_x86_packus_vec:
2777         insn = packus_insn[vece];
2778         goto gen_simd;
2779 #if TCG_TARGET_REG_BITS == 32
2780     case INDEX_op_dup2_vec:
2781         /* First merge the two 32-bit inputs to a single 64-bit element. */
2782         tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
2783         /* Then replicate the 64-bit elements across the rest of the vector. */
2784         if (type != TCG_TYPE_V64) {
2785             tcg_out_dup_vec(s, type, MO_64, a0, a0);
2786         }
2787         break;
2788 #endif
2789     case INDEX_op_abs_vec:
2790         insn = abs_insn[vece];
2791         a2 = a1;
2792         a1 = 0;
2793         goto gen_simd;
2794     gen_simd:
2795         tcg_debug_assert(insn != OPC_UD2);
2796         if (type == TCG_TYPE_V256) {
2797             insn |= P_VEXL;
2798         }
2799         tcg_out_vex_modrm(s, insn, a0, a1, a2);
2800         break;
2801
2802     case INDEX_op_cmp_vec:
2803         sub = args[3];
2804         if (sub == TCG_COND_EQ) {
2805             insn = cmpeq_insn[vece];
2806         } else if (sub == TCG_COND_GT) {
2807             insn = cmpgt_insn[vece];
2808         } else {
2809             g_assert_not_reached();
2810         }
2811         goto gen_simd;
2812
2813     case INDEX_op_andc_vec:
2814         insn = OPC_PANDN;
2815         if (type == TCG_TYPE_V256) {
2816             insn |= P_VEXL;
2817         }
2818         tcg_out_vex_modrm(s, insn, a0, a2, a1);
2819         break;
2820
2821     case INDEX_op_shli_vec:
2822         sub = 6;
2823         goto gen_shift;
2824     case INDEX_op_shri_vec:
2825         sub = 2;
2826         goto gen_shift;
2827     case INDEX_op_sari_vec:
2828         tcg_debug_assert(vece != MO_64);
2829         sub = 4;
2830     gen_shift:
2831         tcg_debug_assert(vece != MO_8);
2832         insn = shift_imm_insn[vece];
2833         if (type == TCG_TYPE_V256) {
2834             insn |= P_VEXL;
2835         }
2836         tcg_out_vex_modrm(s, insn, sub, a0, a1);
2837         tcg_out8(s, a2);
2838         break;
2839
2840     case INDEX_op_ld_vec:
2841         tcg_out_ld(s, type, a0, a1, a2);
2842         break;
2843     case INDEX_op_st_vec:
2844         tcg_out_st(s, type, a0, a1, a2);
2845         break;
2846     case INDEX_op_dupm_vec:
2847         tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
2848         break;
2849
2850     case INDEX_op_x86_shufps_vec:
2851         insn = OPC_SHUFPS;
2852         sub = args[3];
2853         goto gen_simd_imm8;
2854     case INDEX_op_x86_blend_vec:
2855         if (vece == MO_16) {
2856             insn = OPC_PBLENDW;
2857         } else if (vece == MO_32) {
2858             insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2859         } else {
2860             g_assert_not_reached();
2861         }
2862         sub = args[3];
2863         goto gen_simd_imm8;
2864     case INDEX_op_x86_vperm2i128_vec:
2865         insn = OPC_VPERM2I128;
2866         sub = args[3];
2867         goto gen_simd_imm8;
2868     gen_simd_imm8:
2869         if (type == TCG_TYPE_V256) {
2870             insn |= P_VEXL;
2871         }
2872         tcg_out_vex_modrm(s, insn, a0, a1, a2);
2873         tcg_out8(s, sub);
2874         break;
2875
2876     case INDEX_op_x86_vpblendvb_vec:
2877         insn = OPC_VPBLENDVB;
2878         if (type == TCG_TYPE_V256) {
2879             insn |= P_VEXL;
2880         }
2881         tcg_out_vex_modrm(s, insn, a0, a1, a2);
2882         tcg_out8(s, args[3] << 4);
2883         break;
2884
2885     case INDEX_op_x86_psrldq_vec:
2886         tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
2887         tcg_out8(s, a2);
2888         break;
2889
2890     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
2891     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
2892     default:
2893         g_assert_not_reached();
2894     }
2895 }
2896
2897 static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
2898 {
2899     switch (op) {
2900     case INDEX_op_goto_ptr:
2901         return C_O0_I1(r);
2902
2903     case INDEX_op_ld8u_i32:
2904     case INDEX_op_ld8u_i64:
2905     case INDEX_op_ld8s_i32:
2906     case INDEX_op_ld8s_i64:
2907     case INDEX_op_ld16u_i32:
2908     case INDEX_op_ld16u_i64:
2909     case INDEX_op_ld16s_i32:
2910     case INDEX_op_ld16s_i64:
2911     case INDEX_op_ld_i32:
2912     case INDEX_op_ld32u_i64:
2913     case INDEX_op_ld32s_i64:
2914     case INDEX_op_ld_i64:
2915         return C_O1_I1(r, r);
2916
2917     case INDEX_op_st8_i32:
2918     case INDEX_op_st8_i64:
2919         return C_O0_I2(qi, r);
2920
2921     case INDEX_op_st16_i32:
2922     case INDEX_op_st16_i64:
2923     case INDEX_op_st_i32:
2924     case INDEX_op_st32_i64:
2925         return C_O0_I2(ri, r);
2926
2927     case INDEX_op_st_i64:
2928         return C_O0_I2(re, r);
2929
2930     case INDEX_op_add_i32:
2931     case INDEX_op_add_i64:
2932         return C_O1_I2(r, r, re);
2933
2934     case INDEX_op_sub_i32:
2935     case INDEX_op_sub_i64:
2936     case INDEX_op_mul_i32:
2937     case INDEX_op_mul_i64:
2938     case INDEX_op_or_i32:
2939     case INDEX_op_or_i64:
2940     case INDEX_op_xor_i32:
2941     case INDEX_op_xor_i64:
2942         return C_O1_I2(r, 0, re);
2943
2944     case INDEX_op_and_i32:
2945     case INDEX_op_and_i64:
2946         return C_O1_I2(r, 0, reZ);
2947
2948     case INDEX_op_andc_i32:
2949     case INDEX_op_andc_i64:
2950         return C_O1_I2(r, r, rI);
2951
2952     case INDEX_op_shl_i32:
2953     case INDEX_op_shl_i64:
2954     case INDEX_op_shr_i32:
2955     case INDEX_op_shr_i64:
2956     case INDEX_op_sar_i32:
2957     case INDEX_op_sar_i64:
2958         return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
2959
2960     case INDEX_op_rotl_i32:
2961     case INDEX_op_rotl_i64:
2962     case INDEX_op_rotr_i32:
2963     case INDEX_op_rotr_i64:
2964         return C_O1_I2(r, 0, ci);
2965
2966     case INDEX_op_brcond_i32:
2967     case INDEX_op_brcond_i64:
2968         return C_O0_I2(r, re);
2969
2970     case INDEX_op_bswap16_i32:
2971     case INDEX_op_bswap16_i64:
2972     case INDEX_op_bswap32_i32:
2973     case INDEX_op_bswap32_i64:
2974     case INDEX_op_bswap64_i64:
2975     case INDEX_op_neg_i32:
2976     case INDEX_op_neg_i64:
2977     case INDEX_op_not_i32:
2978     case INDEX_op_not_i64:
2979     case INDEX_op_extrh_i64_i32:
2980         return C_O1_I1(r, 0);
2981
2982     case INDEX_op_ext8s_i32:
2983     case INDEX_op_ext8s_i64:
2984     case INDEX_op_ext8u_i32:
2985     case INDEX_op_ext8u_i64:
2986         return C_O1_I1(r, q);
2987
2988     case INDEX_op_ext16s_i32:
2989     case INDEX_op_ext16s_i64:
2990     case INDEX_op_ext16u_i32:
2991     case INDEX_op_ext16u_i64:
2992     case INDEX_op_ext32s_i64:
2993     case INDEX_op_ext32u_i64:
2994     case INDEX_op_ext_i32_i64:
2995     case INDEX_op_extu_i32_i64:
2996     case INDEX_op_extrl_i64_i32:
2997     case INDEX_op_extract_i32:
2998     case INDEX_op_extract_i64:
2999     case INDEX_op_sextract_i32:
3000     case INDEX_op_ctpop_i32:
3001     case INDEX_op_ctpop_i64:
3002         return C_O1_I1(r, r);
3003
3004     case INDEX_op_extract2_i32:
3005     case INDEX_op_extract2_i64:
3006         return C_O1_I2(r, 0, r);
3007
3008     case INDEX_op_deposit_i32:
3009     case INDEX_op_deposit_i64:
3010         return C_O1_I2(Q, 0, Q);
3011
3012     case INDEX_op_setcond_i32:
3013     case INDEX_op_setcond_i64:
3014         return C_O1_I2(q, r, re);
3015
3016     case INDEX_op_movcond_i32:
3017     case INDEX_op_movcond_i64:
3018         return C_O1_I4(r, r, re, r, 0);
3019
3020     case INDEX_op_div2_i32:
3021     case INDEX_op_div2_i64:
3022     case INDEX_op_divu2_i32:
3023     case INDEX_op_divu2_i64:
3024         return C_O2_I3(a, d, 0, 1, r);
3025
3026     case INDEX_op_mulu2_i32:
3027     case INDEX_op_mulu2_i64:
3028     case INDEX_op_muls2_i32:
3029     case INDEX_op_muls2_i64:
3030         return C_O2_I2(a, d, a, r);
3031
3032     case INDEX_op_add2_i32:
3033     case INDEX_op_add2_i64:
3034     case INDEX_op_sub2_i32:
3035     case INDEX_op_sub2_i64:
3036         return C_O2_I4(r, r, 0, 1, re, re);
3037
3038     case INDEX_op_ctz_i32:
3039     case INDEX_op_ctz_i64:
3040         return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3041
3042     case INDEX_op_clz_i32:
3043     case INDEX_op_clz_i64:
3044         return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3045
3046     case INDEX_op_qemu_ld_i32:
3047         return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3048                 ? C_O1_I1(r, L) : C_O1_I2(r, L, L));
3049
3050     case INDEX_op_qemu_st_i32:
3051         return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3052                 ? C_O0_I2(L, L) : C_O0_I3(L, L, L));
3053     case INDEX_op_qemu_st8_i32:
3054         return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3055                 ? C_O0_I2(s, L) : C_O0_I3(s, L, L));
3056
3057     case INDEX_op_qemu_ld_i64:
3058         return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
3059                 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L)
3060                 : C_O2_I2(r, r, L, L));
3061
3062     case INDEX_op_qemu_st_i64:
3063         return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L)
3064                 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(L, L, L)
3065                 : C_O0_I4(L, L, L, L));
3066
3067     case INDEX_op_brcond2_i32:
3068         return C_O0_I4(r, r, ri, ri);
3069
3070     case INDEX_op_setcond2_i32:
3071         return C_O1_I4(r, r, r, ri, ri);
3072
3073     case INDEX_op_ld_vec:
3074     case INDEX_op_dupm_vec:
3075         return C_O1_I1(x, r);
3076
3077     case INDEX_op_st_vec:
3078         return C_O0_I2(x, r);
3079
3080     case INDEX_op_add_vec:
3081     case INDEX_op_sub_vec:
3082     case INDEX_op_mul_vec:
3083     case INDEX_op_and_vec:
3084     case INDEX_op_or_vec:
3085     case INDEX_op_xor_vec:
3086     case INDEX_op_andc_vec:
3087     case INDEX_op_ssadd_vec:
3088     case INDEX_op_usadd_vec:
3089     case INDEX_op_sssub_vec:
3090     case INDEX_op_ussub_vec:
3091     case INDEX_op_smin_vec:
3092     case INDEX_op_umin_vec:
3093     case INDEX_op_smax_vec:
3094     case INDEX_op_umax_vec:
3095     case INDEX_op_shlv_vec:
3096     case INDEX_op_shrv_vec:
3097     case INDEX_op_sarv_vec:
3098     case INDEX_op_shls_vec:
3099     case INDEX_op_shrs_vec:
3100     case INDEX_op_sars_vec:
3101     case INDEX_op_rotls_vec:
3102     case INDEX_op_cmp_vec:
3103     case INDEX_op_x86_shufps_vec:
3104     case INDEX_op_x86_blend_vec:
3105     case INDEX_op_x86_packss_vec:
3106     case INDEX_op_x86_packus_vec:
3107     case INDEX_op_x86_vperm2i128_vec:
3108     case INDEX_op_x86_punpckl_vec:
3109     case INDEX_op_x86_punpckh_vec:
3110 #if TCG_TARGET_REG_BITS == 32
3111     case INDEX_op_dup2_vec:
3112 #endif
3113         return C_O1_I2(x, x, x);
3114
3115     case INDEX_op_abs_vec:
3116     case INDEX_op_dup_vec:
3117     case INDEX_op_shli_vec:
3118     case INDEX_op_shri_vec:
3119     case INDEX_op_sari_vec:
3120     case INDEX_op_x86_psrldq_vec:
3121         return C_O1_I1(x, x);
3122
3123     case INDEX_op_x86_vpblendvb_vec:
3124         return C_O1_I3(x, x, x, x);
3125
3126     default:
3127         g_assert_not_reached();
3128     }
3129 }
3130
3131 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3132 {
3133     switch (opc) {
3134     case INDEX_op_add_vec:
3135     case INDEX_op_sub_vec:
3136     case INDEX_op_and_vec:
3137     case INDEX_op_or_vec:
3138     case INDEX_op_xor_vec:
3139     case INDEX_op_andc_vec:
3140         return 1;
3141     case INDEX_op_rotli_vec:
3142     case INDEX_op_cmp_vec:
3143     case INDEX_op_cmpsel_vec:
3144         return -1;
3145
3146     case INDEX_op_shli_vec:
3147     case INDEX_op_shri_vec:
3148         /* We must expand the operation for MO_8.  */
3149         return vece == MO_8 ? -1 : 1;
3150
3151     case INDEX_op_sari_vec:
3152         /* We must expand the operation for MO_8.  */
3153         if (vece == MO_8) {
3154             return -1;
3155         }
3156         /* We can emulate this for MO_64, but it does not pay off
3157            unless we're producing at least 4 values.  */
3158         if (vece == MO_64) {
3159             return type >= TCG_TYPE_V256 ? -1 : 0;
3160         }
3161         return 1;
3162
3163     case INDEX_op_shls_vec:
3164     case INDEX_op_shrs_vec:
3165         return vece >= MO_16;
3166     case INDEX_op_sars_vec:
3167         return vece >= MO_16 && vece <= MO_32;
3168     case INDEX_op_rotls_vec:
3169         return vece >= MO_16 ? -1 : 0;
3170
3171     case INDEX_op_shlv_vec:
3172     case INDEX_op_shrv_vec:
3173         return have_avx2 && vece >= MO_32;
3174     case INDEX_op_sarv_vec:
3175         return have_avx2 && vece == MO_32;
3176     case INDEX_op_rotlv_vec:
3177     case INDEX_op_rotrv_vec:
3178         return have_avx2 && vece >= MO_32 ? -1 : 0;
3179
3180     case INDEX_op_mul_vec:
3181         if (vece == MO_8) {
3182             /* We can expand the operation for MO_8.  */
3183             return -1;
3184         }
3185         if (vece == MO_64) {
3186             return 0;
3187         }
3188         return 1;
3189
3190     case INDEX_op_ssadd_vec:
3191     case INDEX_op_usadd_vec:
3192     case INDEX_op_sssub_vec:
3193     case INDEX_op_ussub_vec:
3194         return vece <= MO_16;
3195     case INDEX_op_smin_vec:
3196     case INDEX_op_smax_vec:
3197     case INDEX_op_umin_vec:
3198     case INDEX_op_umax_vec:
3199     case INDEX_op_abs_vec:
3200         return vece <= MO_32;
3201
3202     default:
3203         return 0;
3204     }
3205 }
3206
3207 static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3208                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3209 {
3210     TCGv_vec t1, t2;
3211
3212     tcg_debug_assert(vece == MO_8);
3213
3214     t1 = tcg_temp_new_vec(type);
3215     t2 = tcg_temp_new_vec(type);
3216
3217     /*
3218      * Unpack to W, shift, and repack.  Tricky bits:
3219      * (1) Use punpck*bw x,x to produce DDCCBBAA,
3220      *     i.e. duplicate in other half of the 16-bit lane.
3221      * (2) For right-shift, add 8 so that the high half of the lane
3222      *     becomes zero.  For left-shift, and left-rotate, we must
3223      *     shift up and down again.
3224      * (3) Step 2 leaves high half zero such that PACKUSWB
3225      *     (pack with unsigned saturation) does not modify
3226      *     the quantity.
3227      */
3228     vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3229               tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3230     vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3231               tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3232
3233     if (opc != INDEX_op_rotli_vec) {
3234         imm += 8;
3235     }
3236     if (opc == INDEX_op_shri_vec) {
3237         tcg_gen_shri_vec(MO_16, t1, t1, imm);
3238         tcg_gen_shri_vec(MO_16, t2, t2, imm);
3239     } else {
3240         tcg_gen_shli_vec(MO_16, t1, t1, imm);
3241         tcg_gen_shli_vec(MO_16, t2, t2, imm);
3242         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3243         tcg_gen_shri_vec(MO_16, t2, t2, 8);
3244     }
3245
3246     vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3247               tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3248     tcg_temp_free_vec(t1);
3249     tcg_temp_free_vec(t2);
3250 }
3251
3252 static void expand_vec_sari(TCGType type, unsigned vece,
3253                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3254 {
3255     TCGv_vec t1, t2;
3256
3257     switch (vece) {
3258     case MO_8:
3259         /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3260         t1 = tcg_temp_new_vec(type);
3261         t2 = tcg_temp_new_vec(type);
3262         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3263                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3264         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3265                   tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3266         tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3267         tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3268         vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3269                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3270         tcg_temp_free_vec(t1);
3271         tcg_temp_free_vec(t2);
3272         break;
3273
3274     case MO_64:
3275         if (imm <= 32) {
3276             /*
3277              * We can emulate a small sign extend by performing an arithmetic
3278              * 32-bit shift and overwriting the high half of a 64-bit logical
3279              * shift.  Note that the ISA says shift of 32 is valid, but TCG
3280              * does not, so we have to bound the smaller shift -- we get the
3281              * same result in the high half either way.
3282              */
3283             t1 = tcg_temp_new_vec(type);
3284             tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3285             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3286             vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3287                       tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3288                       tcgv_vec_arg(t1), 0xaa);
3289             tcg_temp_free_vec(t1);
3290         } else {
3291             /* Otherwise we will need to use a compare vs 0 to produce
3292              * the sign-extend, shift and merge.
3293              */
3294             t1 = tcg_const_zeros_vec(type);
3295             tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
3296             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3297             tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3298             tcg_gen_or_vec(MO_64, v0, v0, t1);
3299             tcg_temp_free_vec(t1);
3300         }
3301         break;
3302
3303     default:
3304         g_assert_not_reached();
3305     }
3306 }
3307
3308 static void expand_vec_rotli(TCGType type, unsigned vece,
3309                              TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3310 {
3311     TCGv_vec t;
3312
3313     if (vece == MO_8) {
3314         expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3315         return;
3316     }
3317
3318     t = tcg_temp_new_vec(type);
3319     tcg_gen_shli_vec(vece, t, v1, imm);
3320     tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3321     tcg_gen_or_vec(vece, v0, v0, t);
3322     tcg_temp_free_vec(t);
3323 }
3324
3325 static void expand_vec_rotls(TCGType type, unsigned vece,
3326                              TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3327 {
3328     TCGv_i32 rsh;
3329     TCGv_vec t;
3330
3331     tcg_debug_assert(vece != MO_8);
3332
3333     t = tcg_temp_new_vec(type);
3334     rsh = tcg_temp_new_i32();
3335
3336     tcg_gen_neg_i32(rsh, lsh);
3337     tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3338     tcg_gen_shls_vec(vece, t, v1, lsh);
3339     tcg_gen_shrs_vec(vece, v0, v1, rsh);
3340     tcg_gen_or_vec(vece, v0, v0, t);
3341     tcg_temp_free_vec(t);
3342     tcg_temp_free_i32(rsh);
3343 }
3344
3345 static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3346                             TCGv_vec v1, TCGv_vec sh, bool right)
3347 {
3348     TCGv_vec t = tcg_temp_new_vec(type);
3349
3350     tcg_gen_dupi_vec(vece, t, 8 << vece);
3351     tcg_gen_sub_vec(vece, t, t, sh);
3352     if (right) {
3353         tcg_gen_shlv_vec(vece, t, v1, t);
3354         tcg_gen_shrv_vec(vece, v0, v1, sh);
3355     } else {
3356         tcg_gen_shrv_vec(vece, t, v1, t);
3357         tcg_gen_shlv_vec(vece, v0, v1, sh);
3358     }
3359     tcg_gen_or_vec(vece, v0, v0, t);
3360     tcg_temp_free_vec(t);
3361 }
3362
3363 static void expand_vec_mul(TCGType type, unsigned vece,
3364                            TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3365 {
3366     TCGv_vec t1, t2, t3, t4, zero;
3367
3368     tcg_debug_assert(vece == MO_8);
3369
3370     /*
3371      * Unpack v1 bytes to words, 0 | x.
3372      * Unpack v2 bytes to words, y | 0.
3373      * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3374      * Shift logical right by 8 bits to clear the high 8 bytes before
3375      * using an unsigned saturated pack.
3376      *
3377      * The difference between the V64, V128 and V256 cases is merely how
3378      * we distribute the expansion between temporaries.
3379      */
3380     switch (type) {
3381     case TCG_TYPE_V64:
3382         t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3383         t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3384         zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3385         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3386                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3387         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3388                   tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3389         tcg_gen_mul_vec(MO_16, t1, t1, t2);
3390         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3391         vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3392                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3393         tcg_temp_free_vec(t1);
3394         tcg_temp_free_vec(t2);
3395         break;
3396
3397     case TCG_TYPE_V128:
3398     case TCG_TYPE_V256:
3399         t1 = tcg_temp_new_vec(type);
3400         t2 = tcg_temp_new_vec(type);
3401         t3 = tcg_temp_new_vec(type);
3402         t4 = tcg_temp_new_vec(type);
3403         zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3404         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3405                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3406         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3407                   tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3408         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3409                   tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3410         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3411                   tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3412         tcg_gen_mul_vec(MO_16, t1, t1, t2);
3413         tcg_gen_mul_vec(MO_16, t3, t3, t4);
3414         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3415         tcg_gen_shri_vec(MO_16, t3, t3, 8);
3416         vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3417                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3418         tcg_temp_free_vec(t1);
3419         tcg_temp_free_vec(t2);
3420         tcg_temp_free_vec(t3);
3421         tcg_temp_free_vec(t4);
3422         break;
3423
3424     default:
3425         g_assert_not_reached();
3426     }
3427 }
3428
3429 static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3430                                  TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3431 {
3432     enum {
3433         NEED_INV  = 1,
3434         NEED_SWAP = 2,
3435         NEED_BIAS = 4,
3436         NEED_UMIN = 8,
3437         NEED_UMAX = 16,
3438     };
3439     TCGv_vec t1, t2, t3;
3440     uint8_t fixup;
3441
3442     switch (cond) {
3443     case TCG_COND_EQ:
3444     case TCG_COND_GT:
3445         fixup = 0;
3446         break;
3447     case TCG_COND_NE:
3448     case TCG_COND_LE:
3449         fixup = NEED_INV;
3450         break;
3451     case TCG_COND_LT:
3452         fixup = NEED_SWAP;
3453         break;
3454     case TCG_COND_GE:
3455         fixup = NEED_SWAP | NEED_INV;
3456         break;
3457     case TCG_COND_LEU:
3458         if (vece <= MO_32) {
3459             fixup = NEED_UMIN;
3460         } else {
3461             fixup = NEED_BIAS | NEED_INV;
3462         }
3463         break;
3464     case TCG_COND_GTU:
3465         if (vece <= MO_32) {
3466             fixup = NEED_UMIN | NEED_INV;
3467         } else {
3468             fixup = NEED_BIAS;
3469         }
3470         break;
3471     case TCG_COND_GEU:
3472         if (vece <= MO_32) {
3473             fixup = NEED_UMAX;
3474         } else {
3475             fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3476         }
3477         break;
3478     case TCG_COND_LTU:
3479         if (vece <= MO_32) {
3480             fixup = NEED_UMAX | NEED_INV;
3481         } else {
3482             fixup = NEED_BIAS | NEED_SWAP;
3483         }
3484         break;
3485     default:
3486         g_assert_not_reached();
3487     }
3488
3489     if (fixup & NEED_INV) {
3490         cond = tcg_invert_cond(cond);
3491     }
3492     if (fixup & NEED_SWAP) {
3493         t1 = v1, v1 = v2, v2 = t1;
3494         cond = tcg_swap_cond(cond);
3495     }
3496
3497     t1 = t2 = NULL;
3498     if (fixup & (NEED_UMIN | NEED_UMAX)) {
3499         t1 = tcg_temp_new_vec(type);
3500         if (fixup & NEED_UMIN) {
3501             tcg_gen_umin_vec(vece, t1, v1, v2);
3502         } else {
3503             tcg_gen_umax_vec(vece, t1, v1, v2);
3504         }
3505         v2 = t1;
3506         cond = TCG_COND_EQ;
3507     } else if (fixup & NEED_BIAS) {
3508         t1 = tcg_temp_new_vec(type);
3509         t2 = tcg_temp_new_vec(type);
3510         t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
3511         tcg_gen_sub_vec(vece, t1, v1, t3);
3512         tcg_gen_sub_vec(vece, t2, v2, t3);
3513         v1 = t1;
3514         v2 = t2;
3515         cond = tcg_signed_cond(cond);
3516     }
3517
3518     tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3519     /* Expand directly; do not recurse.  */
3520     vec_gen_4(INDEX_op_cmp_vec, type, vece,
3521               tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3522
3523     if (t1) {
3524         tcg_temp_free_vec(t1);
3525         if (t2) {
3526             tcg_temp_free_vec(t2);
3527         }
3528     }
3529     return fixup & NEED_INV;
3530 }
3531
3532 static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3533                            TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3534 {
3535     if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3536         tcg_gen_not_vec(vece, v0, v0);
3537     }
3538 }
3539
3540 static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3541                               TCGv_vec c1, TCGv_vec c2,
3542                               TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3543 {
3544     TCGv_vec t = tcg_temp_new_vec(type);
3545
3546     if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3547         /* Invert the sense of the compare by swapping arguments.  */
3548         TCGv_vec x;
3549         x = v3, v3 = v4, v4 = x;
3550     }
3551     vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3552               tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3553               tcgv_vec_arg(v3), tcgv_vec_arg(t));
3554     tcg_temp_free_vec(t);
3555 }
3556
3557 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3558                        TCGArg a0, ...)
3559 {
3560     va_list va;
3561     TCGArg a2;
3562     TCGv_vec v0, v1, v2, v3, v4;
3563
3564     va_start(va, a0);
3565     v0 = temp_tcgv_vec(arg_temp(a0));
3566     v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3567     a2 = va_arg(va, TCGArg);
3568
3569     switch (opc) {
3570     case INDEX_op_shli_vec:
3571     case INDEX_op_shri_vec:
3572         expand_vec_shi(type, vece, opc, v0, v1, a2);
3573         break;
3574
3575     case INDEX_op_sari_vec:
3576         expand_vec_sari(type, vece, v0, v1, a2);
3577         break;
3578
3579     case INDEX_op_rotli_vec:
3580         expand_vec_rotli(type, vece, v0, v1, a2);
3581         break;
3582
3583     case INDEX_op_rotls_vec:
3584         expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3585         break;
3586
3587     case INDEX_op_rotlv_vec:
3588         v2 = temp_tcgv_vec(arg_temp(a2));
3589         expand_vec_rotv(type, vece, v0, v1, v2, false);
3590         break;
3591     case INDEX_op_rotrv_vec:
3592         v2 = temp_tcgv_vec(arg_temp(a2));
3593         expand_vec_rotv(type, vece, v0, v1, v2, true);
3594         break;
3595
3596     case INDEX_op_mul_vec:
3597         v2 = temp_tcgv_vec(arg_temp(a2));
3598         expand_vec_mul(type, vece, v0, v1, v2);
3599         break;
3600
3601     case INDEX_op_cmp_vec:
3602         v2 = temp_tcgv_vec(arg_temp(a2));
3603         expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3604         break;
3605
3606     case INDEX_op_cmpsel_vec:
3607         v2 = temp_tcgv_vec(arg_temp(a2));
3608         v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3609         v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3610         expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3611         break;
3612
3613     default:
3614         break;
3615     }
3616
3617     va_end(va);
3618 }
3619
3620 static const int tcg_target_callee_save_regs[] = {
3621 #if TCG_TARGET_REG_BITS == 64
3622     TCG_REG_RBP,
3623     TCG_REG_RBX,
3624 #if defined(_WIN64)
3625     TCG_REG_RDI,
3626     TCG_REG_RSI,
3627 #endif
3628     TCG_REG_R12,
3629     TCG_REG_R13,
3630     TCG_REG_R14, /* Currently used for the global env. */
3631     TCG_REG_R15,
3632 #else
3633     TCG_REG_EBP, /* Currently used for the global env. */
3634     TCG_REG_EBX,
3635     TCG_REG_ESI,
3636     TCG_REG_EDI,
3637 #endif
3638 };
3639
/*
 * Frame layout constants.  They are macros so that the same values are
 * used by tcg_target_qemu_prologue and by the unwind data registered
 * through tcg_register_jit.
 */

/*
 * Bytes taken by the callee-saved register pushes plus one additional
 * register-sized slot (presumably the return address).
 */
#define PUSH_SIZE \
    ((TCG_TARGET_REG_BITS / 8) \
     * (ARRAY_SIZE(tcg_target_callee_save_regs) + 1))

/*
 * Whole frame: pushes, static call-argument area and the TCG temp
 * buffer, rounded up to a multiple of TCG_TARGET_STACK_ALIGN.
 */
#define FRAME_SIZE \
    ((PUSH_SIZE + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + (TCG_TARGET_STACK_ALIGN - 1)) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
3653
3654 /* Generate global QEMU prologue and epilogue code */
3655 static void tcg_target_qemu_prologue(TCGContext *s)
3656 {
3657     int i, stack_addend;
3658
3659     /* TB prologue */
3660
3661     /* Reserve some stack space, also for TCG temps.  */
3662     stack_addend = FRAME_SIZE - PUSH_SIZE;
3663     tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3664                   CPU_TEMP_BUF_NLONGS * sizeof(long));
3665
3666     /* Save all callee saved registers.  */
3667     for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3668         tcg_out_push(s, tcg_target_callee_save_regs[i]);
3669     }
3670
3671 #if TCG_TARGET_REG_BITS == 32
3672     tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3673                (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3674     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3675     /* jmp *tb.  */
3676     tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3677                          (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3678                          + stack_addend);
3679 #else
3680 # if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
3681     if (guest_base) {
3682         int seg = setup_guest_base_seg();
3683         if (seg != 0) {
3684             x86_guest_base_seg = seg;
3685         } else if (guest_base == (int32_t)guest_base) {
3686             x86_guest_base_offset = guest_base;
3687         } else {
3688             /* Choose R12 because, as a base, it requires a SIB byte. */
3689             x86_guest_base_index = TCG_REG_R12;
3690             tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
3691             tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
3692         }
3693     }
3694 # endif
3695     tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3696     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3697     /* jmp *tb.  */
3698     tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3699 #endif
3700
3701     /*
3702      * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3703      * and fall through to the rest of the epilogue.
3704      */
3705     tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
3706     tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3707
3708     /* TB epilogue */
3709     tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
3710
3711     tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3712
3713     if (have_avx2) {
3714         tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3715     }
3716     for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3717         tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3718     }
3719     tcg_out_opc(s, OPC_RET, 0, 0, 0);
3720 }
3721
3722 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3723 {
3724     memset(p, 0x90, count);
3725 }
3726
3727 static void tcg_target_init(TCGContext *s)
3728 {
3729 #ifdef CONFIG_CPUID_H
3730     unsigned a, b, c, d, b7 = 0;
3731     int max = __get_cpuid_max(0, 0);
3732
3733     if (max >= 7) {
3734         /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
3735         __cpuid_count(7, 0, a, b7, c, d);
3736         have_bmi1 = (b7 & bit_BMI) != 0;
3737         have_bmi2 = (b7 & bit_BMI2) != 0;
3738     }
3739
3740     if (max >= 1) {
3741         __cpuid(1, a, b, c, d);
3742 #ifndef have_cmov
3743         /* For 32-bit, 99% certainty that we're running on hardware that
3744            supports cmov, but we still need to check.  In case cmov is not
3745            available, we'll use a small forward branch.  */
3746         have_cmov = (d & bit_CMOV) != 0;
3747 #endif
3748
3749         /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
3750            need to probe for it.  */
3751         have_movbe = (c & bit_MOVBE) != 0;
3752         have_popcnt = (c & bit_POPCNT) != 0;
3753
3754         /* There are a number of things we must check before we can be
3755            sure of not hitting invalid opcode.  */
3756         if (c & bit_OSXSAVE) {
3757             unsigned xcrl, xcrh;
3758             /* The xgetbv instruction is not available to older versions of
3759              * the assembler, so we encode the instruction manually.
3760              */
3761             asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
3762             if ((xcrl & 6) == 6) {
3763                 have_avx1 = (c & bit_AVX) != 0;
3764                 have_avx2 = (b7 & bit_AVX2) != 0;
3765             }
3766         }
3767     }
3768
3769     max = __get_cpuid_max(0x8000000, 0);
3770     if (max >= 1) {
3771         __cpuid(0x80000001, a, b, c, d);
3772         /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
3773         have_lzcnt = (c & bit_LZCNT) != 0;
3774     }
3775 #endif /* CONFIG_CPUID_H */
3776
3777     tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3778     if (TCG_TARGET_REG_BITS == 64) {
3779         tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3780     }
3781     if (have_avx1) {
3782         tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3783         tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3784     }
3785     if (have_avx2) {
3786         tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3787     }
3788
3789     tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
3790     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3791     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3792     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3793     if (TCG_TARGET_REG_BITS == 64) {
3794 #if !defined(_WIN64)
3795         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3796         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3797 #endif
3798         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3799         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3800         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3801         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3802     }
3803
3804     s->reserved_regs = 0;
3805     tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
3806 }
3807
3808 typedef struct {
3809     DebugFrameHeader h;
3810     uint8_t fde_def_cfa[4];
3811     uint8_t fde_reg_ofs[14];
3812 } DebugFrame;
3813
3814 /* We're expecting a 2 byte uleb128 encoded value.  */
3815 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
3816
3817 #if !defined(__ELF__)
3818     /* Host machine without ELF. */
3819 #elif TCG_TARGET_REG_BITS == 64
3820 #define ELF_HOST_MACHINE EM_X86_64
3821 static const DebugFrame debug_frame = {
3822     .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3823     .h.cie.id = -1,
3824     .h.cie.version = 1,
3825     .h.cie.code_align = 1,
3826     .h.cie.data_align = 0x78,             /* sleb128 -8 */
3827     .h.cie.return_column = 16,
3828
3829     /* Total FDE size does not include the "len" member.  */
3830     .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3831
3832     .fde_def_cfa = {
3833         12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
3834         (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3835         (FRAME_SIZE >> 7)
3836     },
3837     .fde_reg_ofs = {
3838         0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
3839         /* The following ordering must match tcg_target_callee_save_regs.  */
3840         0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
3841         0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
3842         0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
3843         0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
3844         0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
3845         0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
3846     }
3847 };
3848 #else
3849 #define ELF_HOST_MACHINE EM_386
3850 static const DebugFrame debug_frame = {
3851     .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3852     .h.cie.id = -1,
3853     .h.cie.version = 1,
3854     .h.cie.code_align = 1,
3855     .h.cie.data_align = 0x7c,             /* sleb128 -4 */
3856     .h.cie.return_column = 8,
3857
3858     /* Total FDE size does not include the "len" member.  */
3859     .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3860
3861     .fde_def_cfa = {
3862         12, 4,                          /* DW_CFA_def_cfa %esp, ... */
3863         (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3864         (FRAME_SIZE >> 7)
3865     },
3866     .fde_reg_ofs = {
3867         0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
3868         /* The following ordering must match tcg_target_callee_save_regs.  */
3869         0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
3870         0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
3871         0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
3872         0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
3873     }
3874 };
3875 #endif
3876
#ifdef ELF_HOST_MACHINE
/*
 * Hand the code buffer [BUF, BUF + BUF_SIZE) to tcg_register_jit_int
 * together with the static debug_frame unwind record.
 */
void tcg_register_jit(const void *buf, size_t buf_size)
{
    const DebugFrame *frame = &debug_frame;

    tcg_register_jit_int(buf, buf_size, frame, sizeof(*frame));
}
#endif