tcg/i386/tcg-target.c.inc

   1 /*
   2  * Tiny Code Generator for QEMU
   3  *
   4  * Copyright (c) 2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "../tcg-pool.c.inc"
  26
#ifdef CONFIG_DEBUG_TCG
/*
 * Printable names of the host registers, only built when CONFIG_DEBUG_TCG
 * is defined.  The table lists the eight legacy general registers (64-bit
 * or 32-bit spelling depending on TCG_TARGET_REG_BITS), then %r8-%r15,
 * then the vector registers; %xmm8-%xmm15 are only present for 64-bit
 * hosts.  Presumably indexed by TCGReg value -- confirm against the
 * register enumeration in tcg-target.h.
 */
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif
  42
/*
 * Order in which registers are listed for allocation: general registers
 * first (a 64-bit or a 32-bit list, selected by TCG_TARGET_REG_BITS),
 * followed by the vector registers.  The stack pointer does not appear
 * in either general-register list.
 */
static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI treats xmm6-xmm15 as callee-saved (nonvolatile), and
       we do not save any of them.  Therefore on Win64 only xmm0-xmm5 may
       be allocated; the registers below are listed for other ABIs only.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};
  92
/*
 * Registers that carry integer call arguments, in argument order.
 * 64-bit Windows hosts use RCX, RDX, R8, R9; other 64-bit hosts use
 * RDI, RSI, RDX, RCX, R8, R9.  For 32-bit hosts the list is empty
 * because arguments are passed on the stack.
 */
static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};
 110
/*
 * Registers that carry call return values.  TCG_REG_EAX is always
 * listed; 32-bit hosts additionally list TCG_REG_EDX (presumably for
 * the high half of a 64-bit result -- confirm against the call code).
 */
static const int tcg_target_call_oarg_regs[] = {
    TCG_REG_EAX,
#if TCG_TARGET_REG_BITS == 32
    TCG_REG_EDX
#endif
};
 117
/* Constants we accept.  These are target-specific constraint bits tested
   by tcg_target_const_match():
     S32 - value is representable as a sign-extended 32-bit immediate;
     U32 - value is representable as a zero-extended 32-bit immediate;
     I32 - the bitwise inverse of the value is a sign-extended 32-bit
           immediate;
     WSZ - value equals the operation width in bits (32 or 64).  */
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800

/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   i386. */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

/* Register-set bitmasks, one bit per register number.  General registers
   occupy the low bits and vector registers start at bit 16.
   ALL_BYTEH_REGS covers registers 0-3; on 32-bit hosts the same four
   registers are the only ones in ALL_BYTEL_REGS, whereas on 64-bit hosts
   every general register is included.  */
#define ALL_BYTEH_REGS         0x0000000fu
#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS      0x0000ffffu
# define ALL_VECTOR_REGS       0xffff0000u
# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS      0x000000ffu
# define ALL_VECTOR_REGS       0x00ff0000u
# define ALL_BYTEL_REGS        ALL_BYTEH_REGS
#endif
/* With CONFIG_SOFTMMU the two L-constraint registers form a reserved
   set; otherwise the set is empty.  */
#ifdef CONFIG_SOFTMMU
# define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
#else
# define SOFTMMU_RESERVE_REGS  0
#endif
 150
/* The host compiler should supply <cpuid.h> to enable runtime feature
   detection, as we're not going to go so far as our own inline assembly.
   If not available, default values will be assumed.  */
#if defined(CONFIG_CPUID_H)
#include "qemu/cpuid.h"
#endif

/* For 64-bit, we always know that CMOV is available.  On 32-bit hosts it
   is a variable when CONFIG_CPUID_H is defined and a constant 0
   otherwise.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov 1
#elif defined(CONFIG_CPUID_H)
static bool have_cmov;
#else
# define have_cmov 0
#endif

/* We need these symbols in tcg-target.h, and we can't properly conditionalize
   it there.  Therefore we always define the variable.  */
bool have_bmi1;
bool have_popcnt;
bool have_avx1;
bool have_avx2;
bool have_movbe;

/* Feature flags private to this file.  Without CONFIG_CPUID_H they are
   compile-time zeros, so code guarded by them can be discarded.  */
#ifdef CONFIG_CPUID_H
static bool have_bmi2;
static bool have_lzcnt;
#else
# define have_bmi2 0
# define have_lzcnt 0
#endif
 182
/* NOTE(review): neither assigned nor read within this part of the file;
   presumably the address that generated code returns to when leaving a
   translation block -- confirm against the prologue/exit code.  */
static const tcg_insn_unit *tb_ret_addr;
 184
 185 static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
 186                         intptr_t value, intptr_t addend)
 187 {
 188     value += addend;
 189     switch(type) {
 190     case R_386_PC32:
 191         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 192         if (value != (int32_t)value) {
 193             return false;
 194         }
 195         /* FALLTHRU */
 196     case R_386_32:
 197         tcg_patch32(code_ptr, value);
 198         break;
 199     case R_386_PC8:
 200         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 201         if (value != (int8_t)value) {
 202             return false;
 203         }
 204         tcg_patch8(code_ptr, value);
 205         break;
 206     default:
 207         tcg_abort();
 208     }
 209     return true;
 210 }
 211
 212 /* test if a constant matches the constraint */
 213 static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 214                                          const TCGArgConstraint *arg_ct)
 215 {
 216     int ct = arg_ct->ct;
 217     if (ct & TCG_CT_CONST) {
 218         return 1;
 219     }
 220     if (type == TCG_TYPE_I32) {
 221         if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
 222             return 1;
 223         }
 224     } else {
 225         if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
 226             return 1;
 227         }
 228         if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
 229             return 1;
 230         }
 231         if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
 232             return 1;
 233         }
 234     }
 235     if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
 236         return 1;
 237     }
 238     return 0;
 239 }
 240
/* Low three bits of a register number, i.e. the part that is placed in a
   ModRM or SIB register field.  */
# define LOWREGMASK(x)  ((x) & 7)

/* Flag bits that are ORed into an opcode value above its low 8 bits.
   tcg_out_opc() and tcg_out_vex_opc() turn them into the corresponding
   prefix or escape bytes (or VEX fields).  On 32-bit hosts the REX and
   gs-override flags are defined as 0 so that they drop out of any
   expression that uses them.  */
#define P_EXT           0x100           /* 0x0f opcode prefix */
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400           /* 0x66 opcode prefix */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         0x1000          /* Set REX.W = 1 */
# define P_REXB_R       0x2000          /* REG field as byte register */
# define P_REXB_RM      0x4000          /* R/M field as byte register */
# define P_GS           0x8000          /* gs segment override */
#else
# define P_REXW         0
# define P_REXB_R       0
# define P_REXB_RM      0
# define P_GS           0
#endif
#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
#define P_VEXL          0x80000         /* Set VEX.L = 1 */
/* Opcode definitions.  The low 8 bits hold the final opcode byte; the
   P_* flags ORed in select the prefix and escape bytes that must precede
   it.  Entries marked "plus ..." are base values to which a condition
   code or an ARITH_* selector is added by the user of the macro.  */
#define OPC_ARITH_EvIz  (0x81)
#define OPC_ARITH_EvIb  (0x83)
#define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP       (0xc8 | P_EXT)
#define OPC_CALL_Jz     (0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32     (0x48)
#define OPC_IMUL_GvEv   (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb (0x6b)
#define OPC_IMUL_GvEvIz (0x69)
#define OPC_INC_r32     (0x40)
#define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
#define OPC_JCC_short   (0x70)          /* ... plus condition code */
#define OPC_JMP_long    (0xe9)
#define OPC_JMP_short   (0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
#define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
#define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz   (0xc7)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL      (0xbe | P_EXT)
#define OPC_MOVSWL      (0xbf | P_EXT)
#define OPC_MOVSLQ      (0x63 | P_REXW)
#define OPC_MOVZBL      (0xb6 | P_EXT)
#define OPC_MOVZWL      (0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32     (0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32    (0x50)
#define OPC_PUSH_Iv     (0x68)
#define OPC_PUSH_Ib     (0x6a)
#define OPC_RET         (0xc3)
#define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1     (0xd1)
#define OPC_SHIFT_Ib    (0xc1)
#define OPC_SHIFT_cl    (0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTL       (0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_REXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_REXW)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_REXW)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)

/* Group opcodes: the operation is chosen by an extension value (see the
   EXT3_* and EXT5_* constants) that goes in the ModRM reg field.  */
#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
 428
/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH_GvEv, where the value
   is shifted left by 3 and combined with the base opcode.  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with
   OPC_GRP3_Ev.  */
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev     0
#define EXT5_DEC_Ev     1
#define EXT5_CALLN_Ev   2
#define EXT5_JMPN_Ev    4

/* Condition codes to be added to OPC_JCC_{long,short}.  JCC_JMP is not
   an encodable condition; presumably it requests an unconditional jump
   from code outside this part of the file -- confirm at its users.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf
 479
/*
 * Map a TCG comparison condition to the x86 condition code (JCC_*) that
 * tests it.  Signed orderings use JL/JGE/JLE/JG; unsigned orderings use
 * JB/JAE/JBE/JA.  Conditions without an initializer here map to 0.
 */
static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};
 492
 493 #if TCG_TARGET_REG_BITS == 64
 494 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 495 {
 496     int rex;
 497
 498     if (opc & P_GS) {
 499         tcg_out8(s, 0x65);
 500     }
 501     if (opc & P_DATA16) {
 502         /* We should never be asking for both 16 and 64-bit operation.  */
 503         tcg_debug_assert((opc & P_REXW) == 0);
 504         tcg_out8(s, 0x66);
 505     }
 506     if (opc & P_SIMDF3) {
 507         tcg_out8(s, 0xf3);
 508     } else if (opc & P_SIMDF2) {
 509         tcg_out8(s, 0xf2);
 510     }
 511
 512     rex = 0;
 513     rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
 514     rex |= (r & 8) >> 1;                /* REX.R */
 515     rex |= (x & 8) >> 2;                /* REX.X */
 516     rex |= (rm & 8) >> 3;               /* REX.B */
 517
 518     /* P_REXB_{R,RM} indicates that the given register is the low byte.
 519        For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
 520        as otherwise the encoding indicates %[abcd]h.  Note that the values
 521        that are ORed in merely indicate that the REX byte must be present;
 522        those bits get discarded in output.  */
 523     rex |= opc & (r >= 4 ? P_REXB_R : 0);
 524     rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
 525
 526     if (rex) {
 527         tcg_out8(s, (uint8_t)(rex | 0x40));
 528     }
 529
 530     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 531         tcg_out8(s, 0x0f);
 532         if (opc & P_EXT38) {
 533             tcg_out8(s, 0x38);
 534         } else if (opc & P_EXT3A) {
 535             tcg_out8(s, 0x3a);
 536         }
 537     }
 538
 539     tcg_out8(s, opc);
 540 }
 541 #else
 542 static void tcg_out_opc(TCGContext *s, int opc)
 543 {
 544     if (opc & P_DATA16) {
 545         tcg_out8(s, 0x66);
 546     }
 547     if (opc & P_SIMDF3) {
 548         tcg_out8(s, 0xf3);
 549     } else if (opc & P_SIMDF2) {
 550         tcg_out8(s, 0xf2);
 551     }
 552     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 553         tcg_out8(s, 0x0f);
 554         if (opc & P_EXT38) {
 555             tcg_out8(s, 0x38);
 556         } else if (opc & P_EXT3A) {
 557             tcg_out8(s, 0x3a);
 558         }
 559     }
 560     tcg_out8(s, opc);
 561 }
 562 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
 563    the 32-bit compilation paths.  This method works with all versions of gcc,
 564    whereas relying on optimization may not be able to exclude them.  */
 565 #define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
 566 #endif
 567
 568 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
 569 {
 570     tcg_out_opc(s, opc, r, rm, 0);
 571     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 572 }
 573
 574 static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
 575                             int rm, int index)
 576 {
 577     int tmp;
 578
 579     /* Use the two byte form if possible, which cannot encode
 580        VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
 581     if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
 582         && ((rm | index) & 8) == 0) {
 583         /* Two byte VEX prefix.  */
 584         tcg_out8(s, 0xc5);
 585
 586         tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
 587     } else {
 588         /* Three byte VEX prefix.  */
 589         tcg_out8(s, 0xc4);
 590
 591         /* VEX.m-mmmm */
 592         if (opc & P_EXT3A) {
 593             tmp = 3;
 594         } else if (opc & P_EXT38) {
 595             tmp = 2;
 596         } else if (opc & P_EXT) {
 597             tmp = 1;
 598         } else {
 599             g_assert_not_reached();
 600         }
 601         tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
 602         tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
 603         tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
 604         tcg_out8(s, tmp);
 605
 606         tmp = (opc & P_REXW ? 0x80 : 0);       /* VEX.W */
 607     }
 608
 609     tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
 610     /* VEX.pp */
 611     if (opc & P_DATA16) {
 612         tmp |= 1;                          /* 0x66 */
 613     } else if (opc & P_SIMDF3) {
 614         tmp |= 2;                          /* 0xf3 */
 615     } else if (opc & P_SIMDF2) {
 616         tmp |= 3;                          /* 0xf2 */
 617     }
 618     tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
 619     tcg_out8(s, tmp);
 620     tcg_out8(s, opc);
 621 }
 622
 623 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
 624 {
 625     tcg_out_vex_opc(s, opc, r, v, rm, 0);
 626     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 627 }
 628
 629 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
 630    We handle either RM and INDEX missing with a negative value.  In 64-bit
 631    mode for absolute addresses, ~RM is the size of the immediate operand
 632    that will follow the instruction.  */
 633
 634 static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
 635                                int shift, intptr_t offset)
 636 {
 637     int mod, len;
 638
 639     if (index < 0 && rm < 0) {
 640         if (TCG_TARGET_REG_BITS == 64) {
 641             /* Try for a rip-relative addressing mode.  This has replaced
 642                the 32-bit-mode absolute addressing encoding.  */
 643             intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
 644             intptr_t disp = offset - pc;
 645             if (disp == (int32_t)disp) {
 646                 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
 647                 tcg_out32(s, disp);
 648                 return;
 649             }
 650
 651             /* Try for an absolute address encoding.  This requires the
 652                use of the MODRM+SIB encoding and is therefore larger than
 653                rip-relative addressing.  */
 654             if (offset == (int32_t)offset) {
 655                 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
 656                 tcg_out8(s, (4 << 3) | 5);
 657                 tcg_out32(s, offset);
 658                 return;
 659             }
 660
 661             /* ??? The memory isn't directly addressable.  */
 662             g_assert_not_reached();
 663         } else {
 664             /* Absolute address.  */
 665             tcg_out8(s, (r << 3) | 5);
 666             tcg_out32(s, offset);
 667             return;
 668         }
 669     }
 670
 671     /* Find the length of the immediate addend.  Note that the encoding
 672        that would be used for (%ebp) indicates absolute addressing.  */
 673     if (rm < 0) {
 674         mod = 0, len = 4, rm = 5;
 675     } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
 676         mod = 0, len = 0;
 677     } else if (offset == (int8_t)offset) {
 678         mod = 0x40, len = 1;
 679     } else {
 680         mod = 0x80, len = 4;
 681     }
 682
 683     /* Use a single byte MODRM format if possible.  Note that the encoding
 684        that would be used for %esp is the escape to the two byte form.  */
 685     if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
 686         /* Single byte MODRM format.  */
 687         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 688     } else {
 689         /* Two byte MODRM+SIB format.  */
 690
 691         /* Note that the encoding that would place %esp into the index
 692            field indicates no index register.  In 64-bit mode, the REX.X
 693            bit counts, so %r12 can be used as the index.  */
 694         if (index < 0) {
 695             index = 4;
 696         } else {
 697             tcg_debug_assert(index != TCG_REG_ESP);
 698         }
 699
 700         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
 701         tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
 702     }
 703
 704     if (len == 1) {
 705         tcg_out8(s, offset);
 706     } else if (len == 4) {
 707         tcg_out32(s, offset);
 708     }
 709 }
 710
 711 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
 712                                      int index, int shift, intptr_t offset)
 713 {
 714     tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 715     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 716 }
 717
 718 static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
 719                                          int rm, int index, int shift,
 720                                          intptr_t offset)
 721 {
 722     tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 723     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 724 }
 725
 726 /* A simplification of the above with no index or shift.  */
 727 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
 728                                         int rm, intptr_t offset)
 729 {
 730     tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
 731 }
 732
 733 static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
 734                                             int v, int rm, intptr_t offset)
 735 {
 736     tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
 737 }
 738
 739 /* Output an opcode with an expected reference to the constant pool.  */
 740 static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
 741 {
 742     tcg_out_opc(s, opc, r, 0, 0);
 743     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 744     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 745     tcg_out32(s, 0);
 746 }
 747
 748 /* Output an opcode with an expected reference to the constant pool.  */
 749 static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
 750 {
 751     tcg_out_vex_opc(s, opc, r, 0, 0, 0);
 752     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 753     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 754     tcg_out32(s, 0);
 755 }
 756
 757 /* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
 758 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
 759 {
 760     /* Propagate an opcode prefix, such as P_REXW.  */
 761     int ext = subop & ~0x7;
 762     subop &= 0x7;
 763
 764     tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
 765 }
 766
 767 static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 768 {
 769     int rexw = 0;
 770
 771     if (arg == ret) {
 772         return true;
 773     }
 774     switch (type) {
 775     case TCG_TYPE_I64:
 776         rexw = P_REXW;
 777         /* fallthru */
 778     case TCG_TYPE_I32:
 779         if (ret < 16) {
 780             if (arg < 16) {
 781                 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
 782             } else {
 783                 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
 784             }
 785         } else {
 786             if (arg < 16) {
 787                 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
 788             } else {
 789                 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 790             }
 791         }
 792         break;
 793
 794     case TCG_TYPE_V64:
 795         tcg_debug_assert(ret >= 16 && arg >= 16);
 796         tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 797         break;
 798     case TCG_TYPE_V128:
 799         tcg_debug_assert(ret >= 16 && arg >= 16);
 800         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
 801         break;
 802     case TCG_TYPE_V256:
 803         tcg_debug_assert(ret >= 16 && arg >= 16);
 804         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
 805         break;
 806
 807     default:
 808         g_assert_not_reached();
 809     }
 810     return true;
 811 }
 812
/* AVX2 broadcast opcodes, indexed by the element-size code (vece) that is
   passed to the dup helpers.  The entries are ordered byte, word,
   doubleword, quadword -- presumably matching MO_8..MO_64 being 0..3;
   confirm against the MemOp definitions.  */
static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};
 817
 818 static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
 819                             TCGReg r, TCGReg a)
 820 {
 821     if (have_avx2) {
 822         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 823         tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
 824     } else {
 825         switch (vece) {
 826         case MO_8:
 827             /* ??? With zero in a register, use PSHUFB.  */
 828             tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
 829             a = r;
 830             /* FALLTHRU */
 831         case MO_16:
 832             tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
 833             a = r;
 834             /* FALLTHRU */
 835         case MO_32:
 836             tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
 837             /* imm8 operand: all output lanes selected from input lane 0.  */
 838             tcg_out8(s, 0);
 839             break;
 840         case MO_64:
 841             tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
 842             break;
 843         default:
 844             g_assert_not_reached();
 845         }
 846     }
 847     return true;
 848 }
 849
 850 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
 851                              TCGReg r, TCGReg base, intptr_t offset)
 852 {
 853     if (have_avx2) {
 854         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 855         tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
 856                                  r, 0, base, offset);
 857     } else {
 858         switch (vece) {
 859         case MO_64:
 860             tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
 861             break;
 862         case MO_32:
 863             tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
 864             break;
 865         case MO_16:
 866             tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
 867             tcg_out8(s, 0); /* imm8 */
 868             tcg_out_dup_vec(s, type, vece, r, r);
 869             break;
 870         case MO_8:
 871             tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
 872             tcg_out8(s, 0); /* imm8 */
 873             tcg_out_dup_vec(s, type, vece, r, r);
 874             break;
 875         default:
 876             g_assert_not_reached();
 877         }
 878     }
 879     return true;
 880 }
 881
 882 static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
 883                              TCGReg ret, int64_t arg)
 884 {
 885     int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 886
 887     if (arg == 0) {
 888         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 889         return;
 890     }
 891     if (arg == -1) {
 892         tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
 893         return;
 894     }
 895
 896     if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
 897         if (have_avx2) {
 898             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
 899         } else {
 900             tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
 901         }
 902         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 903     } else {
 904         if (type == TCG_TYPE_V64) {
 905             tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
 906         } else if (have_avx2) {
 907             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
 908         } else {
 909             tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
 910         }
 911         if (TCG_TARGET_REG_BITS == 64) {
 912             new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
 913         } else {
 914             new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
 915         }
 916     }
 917 }
 918
 919 static void tcg_out_movi_vec(TCGContext *s, TCGType type,
 920                              TCGReg ret, tcg_target_long arg)
 921 {
 922     if (arg == 0) {
 923         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 924         return;
 925     }
 926     if (arg == -1) {
 927         tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
 928         return;
 929     }
 930
 931     int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
 932     tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
 933     if (TCG_TARGET_REG_BITS == 64) {
 934         new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
 935     } else {
 936         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 937     }
 938 }
 939
 940 static void tcg_out_movi_int(TCGContext *s, TCGType type,
 941                              TCGReg ret, tcg_target_long arg)
 942 {
 943     tcg_target_long diff;
 944
 945     if (arg == 0) {
 946         tgen_arithr(s, ARITH_XOR, ret, ret);
 947         return;
 948     }
 949     if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
 950         tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
 951         tcg_out32(s, arg);
 952         return;
 953     }
 954     if (arg == (int32_t)arg) {
 955         tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
 956         tcg_out32(s, arg);
 957         return;
 958     }
 959
 960     /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
 961     diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
 962     if (diff == (int32_t)diff) {
 963         tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
 964         tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
 965         tcg_out32(s, diff);
 966         return;
 967     }
 968
 969     tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
 970     tcg_out64(s, arg);
 971 }
 972
 973 static void tcg_out_movi(TCGContext *s, TCGType type,
 974                          TCGReg ret, tcg_target_long arg)
 975 {
 976     switch (type) {
 977     case TCG_TYPE_I32:
 978 #if TCG_TARGET_REG_BITS == 64
 979     case TCG_TYPE_I64:
 980 #endif
 981         if (ret < 16) {
 982             tcg_out_movi_int(s, type, ret, arg);
 983         } else {
 984             tcg_out_movi_vec(s, type, ret, arg);
 985         }
 986         break;
 987     default:
 988         g_assert_not_reached();
 989     }
 990 }
 991
 992 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
 993 {
 994     if (val == (int8_t)val) {
 995         tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
 996         tcg_out8(s, val);
 997     } else if (val == (int32_t)val) {
 998         tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
 999         tcg_out32(s, val);
1000     } else {
1001         tcg_abort();
1002     }
1003 }
1004
1005 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1006 {
1007     /* Given the strength of x86 memory ordering, we only need care for
1008        store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1009        faster than "mfence", so don't bother with the sse insn.  */
1010     if (a0 & TCG_MO_ST_LD) {
1011         tcg_out8(s, 0xf0);
1012         tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1013         tcg_out8(s, 0);
1014     }
1015 }
1016
1017 static inline void tcg_out_push(TCGContext *s, int reg)
1018 {
1019     tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1020 }
1021
1022 static inline void tcg_out_pop(TCGContext *s, int reg)
1023 {
1024     tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1025 }
1026
1027 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1028                        TCGReg arg1, intptr_t arg2)
1029 {
1030     switch (type) {
1031     case TCG_TYPE_I32:
1032         if (ret < 16) {
1033             tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1034         } else {
1035             tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1036         }
1037         break;
1038     case TCG_TYPE_I64:
1039         if (ret < 16) {
1040             tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1041             break;
1042         }
1043         /* FALLTHRU */
1044     case TCG_TYPE_V64:
1045         /* There is no instruction that can validate 8-byte alignment.  */
1046         tcg_debug_assert(ret >= 16);
1047         tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1048         break;
1049     case TCG_TYPE_V128:
1050         /*
1051          * The gvec infrastructure is asserts that v128 vector loads
1052          * and stores use a 16-byte aligned offset.  Validate that the
1053          * final pointer is aligned by using an insn that will SIGSEGV.
1054          */
1055         tcg_debug_assert(ret >= 16);
1056         tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1057         break;
1058     case TCG_TYPE_V256:
1059         /*
1060          * The gvec infrastructure only requires 16-byte alignment,
1061          * so here we must use an unaligned load.
1062          */
1063         tcg_debug_assert(ret >= 16);
1064         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1065                                  ret, 0, arg1, arg2);
1066         break;
1067     default:
1068         g_assert_not_reached();
1069     }
1070 }
1071
1072 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1073                        TCGReg arg1, intptr_t arg2)
1074 {
1075     switch (type) {
1076     case TCG_TYPE_I32:
1077         if (arg < 16) {
1078             tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1079         } else {
1080             tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1081         }
1082         break;
1083     case TCG_TYPE_I64:
1084         if (arg < 16) {
1085             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1086             break;
1087         }
1088         /* FALLTHRU */
1089     case TCG_TYPE_V64:
1090         /* There is no instruction that can validate 8-byte alignment.  */
1091         tcg_debug_assert(arg >= 16);
1092         tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1093         break;
1094     case TCG_TYPE_V128:
1095         /*
1096          * The gvec infrastructure is asserts that v128 vector loads
1097          * and stores use a 16-byte aligned offset.  Validate that the
1098          * final pointer is aligned by using an insn that will SIGSEGV.
1099          */
1100         tcg_debug_assert(arg >= 16);
1101         tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1102         break;
1103     case TCG_TYPE_V256:
1104         /*
1105          * The gvec infrastructure only requires 16-byte alignment,
1106          * so here we must use an unaligned store.
1107          */
1108         tcg_debug_assert(arg >= 16);
1109         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1110                                  arg, 0, arg1, arg2);
1111         break;
1112     default:
1113         g_assert_not_reached();
1114     }
1115 }
1116
1117 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1118                         TCGReg base, intptr_t ofs)
1119 {
1120     int rexw = 0;
1121     if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1122         if (val != (int32_t)val) {
1123             return false;
1124         }
1125         rexw = P_REXW;
1126     } else if (type != TCG_TYPE_I32) {
1127         return false;
1128     }
1129     tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1130     tcg_out32(s, val);
1131     return true;
1132 }
1133
1134 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1135 {
1136     /* Propagate an opcode prefix, such as P_DATA16.  */
1137     int ext = subopc & ~0x7;
1138     subopc &= 0x7;
1139
1140     if (count == 1) {
1141         tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1142     } else {
1143         tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1144         tcg_out8(s, count);
1145     }
1146 }
1147
1148 static inline void tcg_out_bswap32(TCGContext *s, int reg)
1149 {
1150     tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1151 }
1152
1153 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1154 {
1155     tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1156 }
1157
1158 static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
1159 {
1160     /* movzbl */
1161     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1162     tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1163 }
1164
1165 static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
1166 {
1167     /* movsbl */
1168     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1169     tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1170 }
1171
1172 static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
1173 {
1174     /* movzwl */
1175     tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1176 }
1177
1178 static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
1179 {
1180     /* movsw[lq] */
1181     tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1182 }
1183
1184 static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
1185 {
1186     /* 32-bit mov zero extends.  */
1187     tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1188 }
1189
1190 static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
1191 {
1192     tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1193 }
1194
1195 static inline void tcg_out_bswap64(TCGContext *s, int reg)
1196 {
1197     tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1198 }
1199
1200 static void tgen_arithi(TCGContext *s, int c, int r0,
1201                         tcg_target_long val, int cf)
1202 {
1203     int rexw = 0;
1204
1205     if (TCG_TARGET_REG_BITS == 64) {
1206         rexw = c & -8;
1207         c &= 7;
1208     }
1209
1210     /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1211        partial flags update stalls on Pentium4 and are not recommended
1212        by current Intel optimization manuals.  */
1213     if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1214         int is_inc = (c == ARITH_ADD) ^ (val < 0);
1215         if (TCG_TARGET_REG_BITS == 64) {
1216             /* The single-byte increment encodings are re-tasked as the
1217                REX prefixes.  Use the MODRM encoding.  */
1218             tcg_out_modrm(s, OPC_GRP5 + rexw,
1219                           (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1220         } else {
1221             tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1222         }
1223         return;
1224     }
1225
1226     if (c == ARITH_AND) {
1227         if (TCG_TARGET_REG_BITS == 64) {
1228             if (val == 0xffffffffu) {
1229                 tcg_out_ext32u(s, r0, r0);
1230                 return;
1231             }
1232             if (val == (uint32_t)val) {
1233                 /* AND with no high bits set can use a 32-bit operation.  */
1234                 rexw = 0;
1235             }
1236         }
1237         if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1238             tcg_out_ext8u(s, r0, r0);
1239             return;
1240         }
1241         if (val == 0xffffu) {
1242             tcg_out_ext16u(s, r0, r0);
1243             return;
1244         }
1245     }
1246
1247     if (val == (int8_t)val) {
1248         tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1249         tcg_out8(s, val);
1250         return;
1251     }
1252     if (rexw == 0 || val == (int32_t)val) {
1253         tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1254         tcg_out32(s, val);
1255         return;
1256     }
1257
1258     tcg_abort();
1259 }
1260
1261 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1262 {
1263     if (val != 0) {
1264         tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1265     }
1266 }
1267
1268 /* Use SMALL != 0 to force a short forward branch.  */
1269 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
1270 {
1271     int32_t val, val1;
1272
1273     if (l->has_value) {
1274         val = tcg_pcrel_diff(s, l->u.value_ptr);
1275         val1 = val - 2;
1276         if ((int8_t)val1 == val1) {
1277             if (opc == -1) {
1278                 tcg_out8(s, OPC_JMP_short);
1279             } else {
1280                 tcg_out8(s, OPC_JCC_short + opc);
1281             }
1282             tcg_out8(s, val1);
1283         } else {
1284             if (small) {
1285                 tcg_abort();
1286             }
1287             if (opc == -1) {
1288                 tcg_out8(s, OPC_JMP_long);
1289                 tcg_out32(s, val - 5);
1290             } else {
1291                 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1292                 tcg_out32(s, val - 6);
1293             }
1294         }
1295     } else if (small) {
1296         if (opc == -1) {
1297             tcg_out8(s, OPC_JMP_short);
1298         } else {
1299             tcg_out8(s, OPC_JCC_short + opc);
1300         }
1301         tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1302         s->code_ptr += 1;
1303     } else {
1304         if (opc == -1) {
1305             tcg_out8(s, OPC_JMP_long);
1306         } else {
1307             tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1308         }
1309         tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1310         s->code_ptr += 4;
1311     }
1312 }
1313
1314 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1315                         int const_arg2, int rexw)
1316 {
1317     if (const_arg2) {
1318         if (arg2 == 0) {
1319             /* test r, r */
1320             tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1321         } else {
1322             tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1323         }
1324     } else {
1325         tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1326     }
1327 }
1328
1329 static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1330                              TCGArg arg1, TCGArg arg2, int const_arg2,
1331                              TCGLabel *label, int small)
1332 {
1333     tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1334     tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1335 }
1336
1337 #if TCG_TARGET_REG_BITS == 64
1338 static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1339                              TCGArg arg1, TCGArg arg2, int const_arg2,
1340                              TCGLabel *label, int small)
1341 {
1342     tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1343     tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1344 }
1345 #else
1346 /* XXX: we implement it at the target level to avoid having to
1347    handle cross basic blocks temporaries */
1348 static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1349                             const int *const_args, int small)
1350 {
1351     TCGLabel *label_next = gen_new_label();
1352     TCGLabel *label_this = arg_label(args[5]);
1353
1354     switch(args[4]) {
1355     case TCG_COND_EQ:
1356         tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1357                          label_next, 1);
1358         tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1359                          label_this, small);
1360         break;
1361     case TCG_COND_NE:
1362         tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1363                          label_this, small);
1364         tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1365                          label_this, small);
1366         break;
1367     case TCG_COND_LT:
1368         tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1369                          label_this, small);
1370         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1371         tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1372                          label_this, small);
1373         break;
1374     case TCG_COND_LE:
1375         tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1376                          label_this, small);
1377         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1378         tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1379                          label_this, small);
1380         break;
1381     case TCG_COND_GT:
1382         tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1383                          label_this, small);
1384         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1385         tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1386                          label_this, small);
1387         break;
1388     case TCG_COND_GE:
1389         tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1390                          label_this, small);
1391         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1392         tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1393                          label_this, small);
1394         break;
1395     case TCG_COND_LTU:
1396         tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1397                          label_this, small);
1398         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1399         tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1400                          label_this, small);
1401         break;
1402     case TCG_COND_LEU:
1403         tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1404                          label_this, small);
1405         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1406         tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1407                          label_this, small);
1408         break;
1409     case TCG_COND_GTU:
1410         tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1411                          label_this, small);
1412         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1413         tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1414                          label_this, small);
1415         break;
1416     case TCG_COND_GEU:
1417         tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1418                          label_this, small);
1419         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1420         tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1421                          label_this, small);
1422         break;
1423     default:
1424         tcg_abort();
1425     }
1426     tcg_out_label(s, label_next);
1427 }
1428 #endif
1429
1430 static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1431                               TCGArg arg1, TCGArg arg2, int const_arg2)
1432 {
1433     tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1434     tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1435     tcg_out_ext8u(s, dest, dest);
1436 }
1437
1438 #if TCG_TARGET_REG_BITS == 64
1439 static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1440                               TCGArg arg1, TCGArg arg2, int const_arg2)
1441 {
1442     tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1443     tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1444     tcg_out_ext8u(s, dest, dest);
1445 }
1446 #else
1447 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1448                              const int *const_args)
1449 {
1450     TCGArg new_args[6];
1451     TCGLabel *label_true, *label_over;
1452
1453     memcpy(new_args, args+1, 5*sizeof(TCGArg));
1454
1455     if (args[0] == args[1] || args[0] == args[2]
1456         || (!const_args[3] && args[0] == args[3])
1457         || (!const_args[4] && args[0] == args[4])) {
1458         /* When the destination overlaps with one of the argument
1459            registers, don't do anything tricky.  */
1460         label_true = gen_new_label();
1461         label_over = gen_new_label();
1462
1463         new_args[5] = label_arg(label_true);
1464         tcg_out_brcond2(s, new_args, const_args+1, 1);
1465
1466         tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1467         tcg_out_jxx(s, JCC_JMP, label_over, 1);
1468         tcg_out_label(s, label_true);
1469
1470         tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1471         tcg_out_label(s, label_over);
1472     } else {
1473         /* When the destination does not overlap one of the arguments,
1474            clear the destination first, jump if cond false, and emit an
1475            increment in the true case.  This results in smaller code.  */
1476
1477         tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1478
1479         label_over = gen_new_label();
1480         new_args[4] = tcg_invert_cond(new_args[4]);
1481         new_args[5] = label_arg(label_over);
1482         tcg_out_brcond2(s, new_args, const_args+1, 1);
1483
1484         tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1485         tcg_out_label(s, label_over);
1486     }
1487 }
1488 #endif
1489
1490 static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1491                          TCGReg dest, TCGReg v1)
1492 {
1493     if (have_cmov) {
1494         tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1495     } else {
1496         TCGLabel *over = gen_new_label();
1497         tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1498         tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1499         tcg_out_label(s, over);
1500     }
1501 }
1502
1503 static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1504                               TCGReg c1, TCGArg c2, int const_c2,
1505                               TCGReg v1)
1506 {
1507     tcg_out_cmp(s, c1, c2, const_c2, 0);
1508     tcg_out_cmov(s, cond, 0, dest, v1);
1509 }
1510
1511 #if TCG_TARGET_REG_BITS == 64
1512 static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1513                               TCGReg c1, TCGArg c2, int const_c2,
1514                               TCGReg v1)
1515 {
1516     tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1517     tcg_out_cmov(s, cond, P_REXW, dest, v1);
1518 }
1519 #endif
1520
1521 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1522                         TCGArg arg2, bool const_a2)
1523 {
1524     if (have_bmi1) {
1525         tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1526         if (const_a2) {
1527             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1528         } else {
1529             tcg_debug_assert(dest != arg2);
1530             tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1531         }
1532     } else {
1533         tcg_debug_assert(dest != arg2);
1534         tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1535         tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1536     }
1537 }
1538
1539 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1540                         TCGArg arg2, bool const_a2)
1541 {
1542     if (have_lzcnt) {
1543         tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1544         if (const_a2) {
1545             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1546         } else {
1547             tcg_debug_assert(dest != arg2);
1548             tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1549         }
1550     } else {
1551         tcg_debug_assert(!const_a2);
1552         tcg_debug_assert(dest != arg1);
1553         tcg_debug_assert(dest != arg2);
1554
1555         /* Recall that the output of BSR is the index not the count.  */
1556         tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1557         tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1558
1559         /* Since we have destroyed the flags from BSR, we have to re-test.  */
1560         tcg_out_cmp(s, arg1, 0, 1, rexw);
1561         tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1562     }
1563 }
1564
1565 static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1566 {
1567     intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1568
1569     if (disp == (int32_t)disp) {
1570         tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1571         tcg_out32(s, disp);
1572     } else {
1573         /* rip-relative addressing into the constant pool.
1574            This is 6 + 8 = 14 bytes, as compared to using an
1575            an immediate load 10 + 6 = 16 bytes, plus we may
1576            be able to re-use the pool constant for more calls.  */
1577         tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1578         tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1579         new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1580         tcg_out32(s, 0);
1581     }
1582 }
1583
1584 static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest)
1585 {
1586     tcg_out_branch(s, 1, dest);
1587 }
1588
1589 static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1590 {
1591     tcg_out_branch(s, 0, dest);
1592 }
1593
1594 static void tcg_out_nopn(TCGContext *s, int n)
1595 {
1596     int i;
1597     /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1598      * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1599      * duplicate prefix, and all of the interesting recent cores can
1600      * decode and discard the duplicates in a single cycle.
1601      */
1602     tcg_debug_assert(n >= 1);
1603     for (i = 1; i < n; ++i) {
1604         tcg_out8(s, 0x66);
1605     }
1606     tcg_out8(s, 0x90);
1607 }
1608
1609 #if defined(CONFIG_SOFTMMU)
1610 #include "../tcg-ldst.c.inc"
1611
/*
 * Load helper functions, one per memory operation (MO_*) index.
 * Slots that are not listed below are left NULL by the initializer.
 *
 * helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
 *                                     int mmu_idx, uintptr_t ra)
 */
static void * const qemu_ld_helpers[16] = {
    [MO_UB]   = helper_ret_ldub_mmu,
    [MO_LEUW] = helper_le_lduw_mmu,
    [MO_LEUL] = helper_le_ldul_mmu,
    [MO_LEQ]  = helper_le_ldq_mmu,
    [MO_BEUW] = helper_be_lduw_mmu,
    [MO_BEUL] = helper_be_ldul_mmu,
    [MO_BEQ]  = helper_be_ldq_mmu,
};
1624
/*
 * Store helper functions, one per memory operation (MO_*) index.
 * Slots that are not listed below are left NULL by the initializer.
 *
 * helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
 *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
 */
static void * const qemu_st_helpers[16] = {
    [MO_UB]   = helper_ret_stb_mmu,
    [MO_LEUW] = helper_le_stw_mmu,
    [MO_LEUL] = helper_le_stl_mmu,
    [MO_LEQ]  = helper_le_stq_mmu,
    [MO_BEUW] = helper_be_stw_mmu,
    [MO_BEUL] = helper_be_stl_mmu,
    [MO_BEQ]  = helper_be_stq_mmu,
};
1637
1638 /* Perform the TLB load and compare.
1639
1640    Inputs:
1641    ADDRLO and ADDRHI contain the low and high part of the address.
1642
1643    MEM_INDEX and S_BITS are the memory context and log2 size of the load.
1644
1645    WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1646    This should be offsetof addr_read or addr_write.
1647
1648    Outputs:
1649    LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1650    positions of the displacements of forward jumps to the TLB miss case.
1651
1652    Second argument register is loaded with the low part of the address.
1653    In the TLB hit case, it has been adjusted as indicated by the TLB
1654    and so is a host address.  In the TLB miss case, it continues to
1655    hold a guest address.
1656
1657    First argument register is clobbered.  */
1658
1659 static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1660                                     int mem_index, MemOp opc,
1661                                     tcg_insn_unit **label_ptr, int which)
1662 {
1663     const TCGReg r0 = TCG_REG_L0;
1664     const TCGReg r1 = TCG_REG_L1;
1665     TCGType ttype = TCG_TYPE_I32;
1666     TCGType tlbtype = TCG_TYPE_I32;
1667     int trexw = 0, hrexw = 0, tlbrexw = 0;
1668     unsigned a_bits = get_alignment_bits(opc);
1669     unsigned s_bits = opc & MO_SIZE;
1670     unsigned a_mask = (1 << a_bits) - 1;
1671     unsigned s_mask = (1 << s_bits) - 1;
1672     target_ulong tlb_mask;
1673
1674     if (TCG_TARGET_REG_BITS == 64) {
1675         if (TARGET_LONG_BITS == 64) {
1676             ttype = TCG_TYPE_I64;
1677             trexw = P_REXW;
1678         }
1679         if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1680             hrexw = P_REXW;
1681             if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
1682                 tlbtype = TCG_TYPE_I64;
1683                 tlbrexw = P_REXW;
1684             }
1685         }
1686     }
1687
1688     tcg_out_mov(s, tlbtype, r0, addrlo);
1689     tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1690                    TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1691
1692     tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
1693                          TLB_MASK_TABLE_OFS(mem_index) +
1694                          offsetof(CPUTLBDescFast, mask));
1695
1696     tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
1697                          TLB_MASK_TABLE_OFS(mem_index) +
1698                          offsetof(CPUTLBDescFast, table));
1699
1700     /* If the required alignment is at least as large as the access, simply
1701        copy the address and mask.  For lesser alignments, check that we don't
1702        cross pages for the complete access.  */
1703     if (a_bits >= s_bits) {
1704         tcg_out_mov(s, ttype, r1, addrlo);
1705     } else {
1706         tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1707     }
1708     tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1709     tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1710
1711     /* cmp 0(r0), r1 */
1712     tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);
1713
1714     /* Prepare for both the fast path add of the tlb addend, and the slow
1715        path function argument setup.  */
1716     tcg_out_mov(s, ttype, r1, addrlo);
1717
1718     /* jne slow_path */
1719     tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1720     label_ptr[0] = s->code_ptr;
1721     s->code_ptr += 4;
1722
1723     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1724         /* cmp 4(r0), addrhi */
1725         tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);
1726
1727         /* jne slow_path */
1728         tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1729         label_ptr[1] = s->code_ptr;
1730         s->code_ptr += 4;
1731     }
1732
1733     /* TLB Hit.  */
1734
1735     /* add addend(r0), r1 */
1736     tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1737                          offsetof(CPUTLBEntry, addend));
1738 }
1739
1740 /*
1741  * Record the context of a call to the out of line helper code for the slow path
1742  * for a load or store, so that we can later generate the correct helper code
1743  */
1744 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
1745                                 TCGMemOpIdx oi,
1746                                 TCGReg datalo, TCGReg datahi,
1747                                 TCGReg addrlo, TCGReg addrhi,
1748                                 tcg_insn_unit *raddr,
1749                                 tcg_insn_unit **label_ptr)
1750 {
1751     TCGLabelQemuLdst *label = new_ldst_label(s);
1752
1753     label->is_ld = is_ld;
1754     label->oi = oi;
1755     label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
1756     label->datalo_reg = datalo;
1757     label->datahi_reg = datahi;
1758     label->addrlo_reg = addrlo;
1759     label->addrhi_reg = addrhi;
1760     label->raddr = tcg_splitwx_to_rx(raddr);
1761     label->label_ptr[0] = label_ptr[0];
1762     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1763         label->label_ptr[1] = label_ptr[1];
1764     }
1765 }
1766
1767 /*
1768  * Generate code for the slow path for a load at the end of block
1769  */
1770 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1771 {
1772     TCGMemOpIdx oi = l->oi;
1773     MemOp opc = get_memop(oi);
1774     TCGReg data_reg;
1775     tcg_insn_unit **label_ptr = &l->label_ptr[0];
1776     int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
1777
1778     /* resolve label address */
1779     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1780     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1781         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1782     }
1783
1784     if (TCG_TARGET_REG_BITS == 32) {
1785         int ofs = 0;
1786
1787         tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1788         ofs += 4;
1789
1790         tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1791         ofs += 4;
1792
1793         if (TARGET_LONG_BITS == 64) {
1794             tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1795             ofs += 4;
1796         }
1797
1798         tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1799         ofs += 4;
1800
1801         tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1802     } else {
1803         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1804         /* The second argument is already loaded with addrlo.  */
1805         tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1806         tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1807                      (uintptr_t)l->raddr);
1808     }
1809
1810     tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1811
1812     data_reg = l->datalo_reg;
1813     switch (opc & MO_SSIZE) {
1814     case MO_SB:
1815         tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
1816         break;
1817     case MO_SW:
1818         tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
1819         break;
1820 #if TCG_TARGET_REG_BITS == 64
1821     case MO_SL:
1822         tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1823         break;
1824 #endif
1825     case MO_UB:
1826     case MO_UW:
1827         /* Note that the helpers have zero-extended to tcg_target_long.  */
1828     case MO_UL:
1829         tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1830         break;
1831     case MO_Q:
1832         if (TCG_TARGET_REG_BITS == 64) {
1833             tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1834         } else if (data_reg == TCG_REG_EDX) {
1835             /* xchg %edx, %eax */
1836             tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1837             tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1838         } else {
1839             tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1840             tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1841         }
1842         break;
1843     default:
1844         tcg_abort();
1845     }
1846
1847     /* Jump to the code corresponding to next IR of qemu_st */
1848     tcg_out_jmp(s, l->raddr);
1849     return true;
1850 }
1851
1852 /*
1853  * Generate code for the slow path for a store at the end of block
1854  */
1855 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1856 {
1857     TCGMemOpIdx oi = l->oi;
1858     MemOp opc = get_memop(oi);
1859     MemOp s_bits = opc & MO_SIZE;
1860     tcg_insn_unit **label_ptr = &l->label_ptr[0];
1861     TCGReg retaddr;
1862
1863     /* resolve label address */
1864     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1865     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1866         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1867     }
1868
1869     if (TCG_TARGET_REG_BITS == 32) {
1870         int ofs = 0;
1871
1872         tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1873         ofs += 4;
1874
1875         tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1876         ofs += 4;
1877
1878         if (TARGET_LONG_BITS == 64) {
1879             tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1880             ofs += 4;
1881         }
1882
1883         tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1884         ofs += 4;
1885
1886         if (s_bits == MO_64) {
1887             tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1888             ofs += 4;
1889         }
1890
1891         tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1892         ofs += 4;
1893
1894         retaddr = TCG_REG_EAX;
1895         tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1896         tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1897     } else {
1898         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1899         /* The second argument is already loaded with addrlo.  */
1900         tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1901                     tcg_target_call_iarg_regs[2], l->datalo_reg);
1902         tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1903
1904         if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1905             retaddr = tcg_target_call_iarg_regs[4];
1906             tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1907         } else {
1908             retaddr = TCG_REG_RAX;
1909             tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1910             tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1911                        TCG_TARGET_CALL_STACK_OFFSET);
1912         }
1913     }
1914
1915     /* "Tail call" to the helper, with the return address back inline.  */
1916     tcg_out_push(s, retaddr);
1917     tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1918     return true;
1919 }
1920 #elif TCG_TARGET_REG_BITS == 32
1921 # define x86_guest_base_seg     0
1922 # define x86_guest_base_index   -1
1923 # define x86_guest_base_offset  guest_base
1924 #else
1925 static int x86_guest_base_seg;
1926 static int x86_guest_base_index = -1;
1927 static int32_t x86_guest_base_offset;
1928 # if defined(__x86_64__) && defined(__linux__)
1929 #  include <asm/prctl.h>
1930 #  include <sys/prctl.h>
1931 int arch_prctl(int code, unsigned long addr);
1932 static inline int setup_guest_base_seg(void)
1933 {
1934     if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1935         return P_GS;
1936     }
1937     return 0;
1938 }
1939 # elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
1940 #  include <machine/sysarch.h>
1941 static inline int setup_guest_base_seg(void)
1942 {
1943     if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1944         return P_GS;
1945     }
1946     return 0;
1947 }
1948 # else
1949 static inline int setup_guest_base_seg(void)
1950 {
1951     return 0;
1952 }
1953 # endif
1954 #endif /* SOFTMMU */
1955
1956 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1957                                    TCGReg base, int index, intptr_t ofs,
1958                                    int seg, bool is64, MemOp memop)
1959 {
1960     bool use_movbe = false;
1961     int rexw = is64 * P_REXW;
1962     int movop = OPC_MOVL_GvEv;
1963
1964     /* Do big-endian loads with movbe.  */
1965     if (memop & MO_BSWAP) {
1966         tcg_debug_assert(have_movbe);
1967         use_movbe = true;
1968         movop = OPC_MOVBE_GyMy;
1969     }
1970
1971     switch (memop & MO_SSIZE) {
1972     case MO_UB:
1973         tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
1974                                  base, index, 0, ofs);
1975         break;
1976     case MO_SB:
1977         tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
1978                                  base, index, 0, ofs);
1979         break;
1980     case MO_UW:
1981         if (use_movbe) {
1982             /* There is no extending movbe; only low 16-bits are modified.  */
1983             if (datalo != base && datalo != index) {
1984                 /* XOR breaks dependency chains.  */
1985                 tgen_arithr(s, ARITH_XOR, datalo, datalo);
1986                 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1987                                          datalo, base, index, 0, ofs);
1988             } else {
1989                 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1990                                          datalo, base, index, 0, ofs);
1991                 tcg_out_ext16u(s, datalo, datalo);
1992             }
1993         } else {
1994             tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1995                                      base, index, 0, ofs);
1996         }
1997         break;
1998     case MO_SW:
1999         if (use_movbe) {
2000             tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2001                                      datalo, base, index, 0, ofs);
2002             tcg_out_ext16s(s, datalo, datalo, rexw);
2003         } else {
2004             tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
2005                                      datalo, base, index, 0, ofs);
2006         }
2007         break;
2008     case MO_UL:
2009         tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2010         break;
2011 #if TCG_TARGET_REG_BITS == 64
2012     case MO_SL:
2013         if (use_movbe) {
2014             tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + seg, datalo,
2015                                      base, index, 0, ofs);
2016             tcg_out_ext32s(s, datalo, datalo);
2017         } else {
2018             tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
2019                                      base, index, 0, ofs);
2020         }
2021         break;
2022 #endif
2023     case MO_Q:
2024         if (TCG_TARGET_REG_BITS == 64) {
2025             tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2026                                      base, index, 0, ofs);
2027         } else {
2028             if (use_movbe) {
2029                 TCGReg t = datalo;
2030                 datalo = datahi;
2031                 datahi = t;
2032             }
2033             if (base != datalo) {
2034                 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2035                                          base, index, 0, ofs);
2036                 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2037                                          base, index, 0, ofs + 4);
2038             } else {
2039                 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2040                                          base, index, 0, ofs + 4);
2041                 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2042                                          base, index, 0, ofs);
2043             }
2044         }
2045         break;
2046     default:
2047         g_assert_not_reached();
2048     }
2049 }
2050
2051 /* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
2052    EAX. It will be useful once fixed registers globals are less
2053    common. */
2054 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
2055 {
2056     TCGReg datalo, datahi, addrlo;
2057     TCGReg addrhi __attribute__((unused));
2058     TCGMemOpIdx oi;
2059     MemOp opc;
2060 #if defined(CONFIG_SOFTMMU)
2061     int mem_index;
2062     tcg_insn_unit *label_ptr[2];
2063 #endif
2064
2065     datalo = *args++;
2066     datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2067     addrlo = *args++;
2068     addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2069     oi = *args++;
2070     opc = get_memop(oi);
2071
2072 #if defined(CONFIG_SOFTMMU)
2073     mem_index = get_mmuidx(oi);
2074
2075     tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2076                      label_ptr, offsetof(CPUTLBEntry, addr_read));
2077
2078     /* TLB Hit.  */
2079     tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
2080
2081     /* Record the current context of a load into ldst label */
2082     add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2083                         s->code_ptr, label_ptr);
2084 #else
2085     tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2086                            x86_guest_base_offset, x86_guest_base_seg,
2087                            is64, opc);
2088 #endif
2089 }
2090
2091 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2092                                    TCGReg base, int index, intptr_t ofs,
2093                                    int seg, MemOp memop)
2094 {
2095     bool use_movbe = false;
2096     int movop = OPC_MOVL_EvGv;
2097
2098     /*
2099      * Do big-endian stores with movbe or softmmu.
2100      * User-only without movbe will have its swapping done generically.
2101      */
2102     if (memop & MO_BSWAP) {
2103         tcg_debug_assert(have_movbe);
2104         use_movbe = true;
2105         movop = OPC_MOVBE_MyGy;
2106     }
2107
2108     switch (memop & MO_SIZE) {
2109     case MO_8:
2110         /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2111         tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2112         tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2113                                  datalo, base, index, 0, ofs);
2114         break;
2115     case MO_16:
2116         tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
2117                                  base, index, 0, ofs);
2118         break;
2119     case MO_32:
2120         tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2121         break;
2122     case MO_64:
2123         if (TCG_TARGET_REG_BITS == 64) {
2124             tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2125                                      base, index, 0, ofs);
2126         } else {
2127             if (use_movbe) {
2128                 TCGReg t = datalo;
2129                 datalo = datahi;
2130                 datahi = t;
2131             }
2132             tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2133                                      base, index, 0, ofs);
2134             tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2135                                      base, index, 0, ofs + 4);
2136         }
2137         break;
2138     default:
2139         g_assert_not_reached();
2140     }
2141 }
2142
2143 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2144 {
2145     TCGReg datalo, datahi, addrlo;
2146     TCGReg addrhi __attribute__((unused));
2147     TCGMemOpIdx oi;
2148     MemOp opc;
2149 #if defined(CONFIG_SOFTMMU)
2150     int mem_index;
2151     tcg_insn_unit *label_ptr[2];
2152 #endif
2153
2154     datalo = *args++;
2155     datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2156     addrlo = *args++;
2157     addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2158     oi = *args++;
2159     opc = get_memop(oi);
2160
2161 #if defined(CONFIG_SOFTMMU)
2162     mem_index = get_mmuidx(oi);
2163
2164     tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2165                      label_ptr, offsetof(CPUTLBEntry, addr_write));
2166
2167     /* TLB Hit.  */
2168     tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2169
2170     /* Record the current context of a store into ldst label */
2171     add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
2172                         s->code_ptr, label_ptr);
2173 #else
2174     tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2175                            x86_guest_base_offset, x86_guest_base_seg, opc);
2176 #endif
2177 }
2178
2179 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2180                               const TCGArg args[TCG_MAX_OP_ARGS],
2181                               const int const_args[TCG_MAX_OP_ARGS])
2182 {
2183     TCGArg a0, a1, a2;
2184     int c, const_a2, vexop, rexw = 0;
2185
2186 #if TCG_TARGET_REG_BITS == 64
2187 # define OP_32_64(x) \
2188         case glue(glue(INDEX_op_, x), _i64): \
2189             rexw = P_REXW; /* FALLTHRU */    \
2190         case glue(glue(INDEX_op_, x), _i32)
2191 #else
2192 # define OP_32_64(x) \
2193         case glue(glue(INDEX_op_, x), _i32)
2194 #endif
2195
2196     /* Hoist the loads of the most common arguments.  */
2197     a0 = args[0];
2198     a1 = args[1];
2199     a2 = args[2];
2200     const_a2 = const_args[2];
2201
2202     switch (opc) {
2203     case INDEX_op_exit_tb:
2204         /* Reuse the zeroing that exists for goto_ptr.  */
2205         if (a0 == 0) {
2206             tcg_out_jmp(s, tcg_code_gen_epilogue);
2207         } else {
2208             tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2209             tcg_out_jmp(s, tb_ret_addr);
2210         }
2211         break;
2212     case INDEX_op_goto_tb:
2213         if (s->tb_jmp_insn_offset) {
2214             /* direct jump method */
2215             int gap;
2216             /* jump displacement must be aligned for atomic patching;
2217              * see if we need to add extra nops before jump
2218              */
2219             gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2220             if (gap != 1) {
2221                 tcg_out_nopn(s, gap - 1);
2222             }
2223             tcg_out8(s, OPC_JMP_long); /* jmp im */
2224             s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2225             tcg_out32(s, 0);
2226         } else {
2227             /* indirect jump method */
2228             tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2229                                  (intptr_t)(s->tb_jmp_target_addr + a0));
2230         }
2231         set_jmp_reset_offset(s, a0);
2232         break;
2233     case INDEX_op_goto_ptr:
2234         /* jmp to the given host address (could be epilogue) */
2235         tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2236         break;
2237     case INDEX_op_br:
2238         tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2239         break;
2240     OP_32_64(ld8u):
2241         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2242         tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2243         break;
2244     OP_32_64(ld8s):
2245         tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2246         break;
2247     OP_32_64(ld16u):
2248         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2249         tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2250         break;
2251     OP_32_64(ld16s):
2252         tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2253         break;
2254 #if TCG_TARGET_REG_BITS == 64
2255     case INDEX_op_ld32u_i64:
2256 #endif
2257     case INDEX_op_ld_i32:
2258         tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2259         break;
2260
2261     OP_32_64(st8):
2262         if (const_args[0]) {
2263             tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2264             tcg_out8(s, a0);
2265         } else {
2266             tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2267         }
2268         break;
2269     OP_32_64(st16):
2270         if (const_args[0]) {
2271             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2272             tcg_out16(s, a0);
2273         } else {
2274             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2275         }
2276         break;
2277 #if TCG_TARGET_REG_BITS == 64
2278     case INDEX_op_st32_i64:
2279 #endif
2280     case INDEX_op_st_i32:
2281         if (const_args[0]) {
2282             tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2283             tcg_out32(s, a0);
2284         } else {
2285             tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2286         }
2287         break;
2288
2289     OP_32_64(add):
2290         /* For 3-operand addition, use LEA.  */
2291         if (a0 != a1) {
2292             TCGArg c3 = 0;
2293             if (const_a2) {
2294                 c3 = a2, a2 = -1;
2295             } else if (a0 == a2) {
2296                 /* Watch out for dest = src + dest, since we've removed
2297                    the matching constraint on the add.  */
2298                 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2299                 break;
2300             }
2301
2302             tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2303             break;
2304         }
2305         c = ARITH_ADD;
2306         goto gen_arith;
2307     OP_32_64(sub):
2308         c = ARITH_SUB;
2309         goto gen_arith;
2310     OP_32_64(and):
2311         c = ARITH_AND;
2312         goto gen_arith;
2313     OP_32_64(or):
2314         c = ARITH_OR;
2315         goto gen_arith;
2316     OP_32_64(xor):
2317         c = ARITH_XOR;
2318         goto gen_arith;
2319     gen_arith:
2320         if (const_a2) {
2321             tgen_arithi(s, c + rexw, a0, a2, 0);
2322         } else {
2323             tgen_arithr(s, c + rexw, a0, a2);
2324         }
2325         break;
2326
2327     OP_32_64(andc):
2328         if (const_a2) {
2329             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2330             tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2331         } else {
2332             tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2333         }
2334         break;
2335
2336     OP_32_64(mul):
2337         if (const_a2) {
2338             int32_t val;
2339             val = a2;
2340             if (val == (int8_t)val) {
2341                 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2342                 tcg_out8(s, val);
2343             } else {
2344                 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2345                 tcg_out32(s, val);
2346             }
2347         } else {
2348             tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2349         }
2350         break;
2351
2352     OP_32_64(div2):
2353         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2354         break;
2355     OP_32_64(divu2):
2356         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2357         break;
2358
2359     OP_32_64(shl):
2360         /* For small constant 3-operand shift, use LEA.  */
2361         if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2362             if (a2 - 1 == 0) {
2363                 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2364                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2365             } else {
2366                 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2367                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2368             }
2369             break;
2370         }
2371         c = SHIFT_SHL;
2372         vexop = OPC_SHLX;
2373         goto gen_shift_maybe_vex;
2374     OP_32_64(shr):
2375         c = SHIFT_SHR;
2376         vexop = OPC_SHRX;
2377         goto gen_shift_maybe_vex;
2378     OP_32_64(sar):
2379         c = SHIFT_SAR;
2380         vexop = OPC_SARX;
2381         goto gen_shift_maybe_vex;
2382     OP_32_64(rotl):
2383         c = SHIFT_ROL;
2384         goto gen_shift;
2385     OP_32_64(rotr):
2386         c = SHIFT_ROR;
2387         goto gen_shift;
2388     gen_shift_maybe_vex:
2389         if (have_bmi2) {
2390             if (!const_a2) {
2391                 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2392                 break;
2393             }
2394             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2395         }
2396         /* FALLTHRU */
2397     gen_shift:
2398         if (const_a2) {
2399             tcg_out_shifti(s, c + rexw, a0, a2);
2400         } else {
2401             tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2402         }
2403         break;
2404
2405     OP_32_64(ctz):
2406         tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2407         break;
2408     OP_32_64(clz):
2409         tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2410         break;
2411     OP_32_64(ctpop):
2412         tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2413         break;
2414
2415     case INDEX_op_brcond_i32:
2416         tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2417         break;
2418     case INDEX_op_setcond_i32:
2419         tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2420         break;
2421     case INDEX_op_movcond_i32:
2422         tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2423         break;
2424
2425     OP_32_64(bswap16):
2426         tcg_out_rolw_8(s, a0);
2427         break;
2428     OP_32_64(bswap32):
2429         tcg_out_bswap32(s, a0);
2430         break;
2431
2432     OP_32_64(neg):
2433         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2434         break;
2435     OP_32_64(not):
2436         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2437         break;
2438
2439     OP_32_64(ext8s):
2440         tcg_out_ext8s(s, a0, a1, rexw);
2441         break;
2442     OP_32_64(ext16s):
2443         tcg_out_ext16s(s, a0, a1, rexw);
2444         break;
2445     OP_32_64(ext8u):
2446         tcg_out_ext8u(s, a0, a1);
2447         break;
2448     OP_32_64(ext16u):
2449         tcg_out_ext16u(s, a0, a1);
2450         break;
2451
2452     case INDEX_op_qemu_ld_i32:
2453         tcg_out_qemu_ld(s, args, 0);
2454         break;
2455     case INDEX_op_qemu_ld_i64:
2456         tcg_out_qemu_ld(s, args, 1);
2457         break;
2458     case INDEX_op_qemu_st_i32:
2459     case INDEX_op_qemu_st8_i32:
2460         tcg_out_qemu_st(s, args, 0);
2461         break;
2462     case INDEX_op_qemu_st_i64:
2463         tcg_out_qemu_st(s, args, 1);
2464         break;
2465
2466     OP_32_64(mulu2):
2467         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2468         break;
2469     OP_32_64(muls2):
2470         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2471         break;
2472     OP_32_64(add2):
2473         if (const_args[4]) {
2474             tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2475         } else {
2476             tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2477         }
2478         if (const_args[5]) {
2479             tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2480         } else {
2481             tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2482         }
2483         break;
2484     OP_32_64(sub2):
2485         if (const_args[4]) {
2486             tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2487         } else {
2488             tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2489         }
2490         if (const_args[5]) {
2491             tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2492         } else {
2493             tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2494         }
2495         break;
2496
2497 #if TCG_TARGET_REG_BITS == 32
2498     case INDEX_op_brcond2_i32:
2499         tcg_out_brcond2(s, args, const_args, 0);
2500         break;
2501     case INDEX_op_setcond2_i32:
2502         tcg_out_setcond2(s, args, const_args);
2503         break;
2504 #else /* TCG_TARGET_REG_BITS == 64 */
2505     case INDEX_op_ld32s_i64:
2506         tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2507         break;
2508     case INDEX_op_ld_i64:
2509         tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2510         break;
2511     case INDEX_op_st_i64:
2512         if (const_args[0]) {
2513             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2514             tcg_out32(s, a0);
2515         } else {
2516             tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2517         }
2518         break;
2519
2520     case INDEX_op_brcond_i64:
2521         tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2522         break;
2523     case INDEX_op_setcond_i64:
2524         tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2525         break;
2526     case INDEX_op_movcond_i64:
2527         tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2528         break;
2529
2530     case INDEX_op_bswap64_i64:
2531         tcg_out_bswap64(s, a0);
2532         break;
2533     case INDEX_op_extu_i32_i64:
2534     case INDEX_op_ext32u_i64:
2535     case INDEX_op_extrl_i64_i32:
2536         tcg_out_ext32u(s, a0, a1);
2537         break;
2538     case INDEX_op_ext_i32_i64:
2539     case INDEX_op_ext32s_i64:
2540         tcg_out_ext32s(s, a0, a1);
2541         break;
2542     case INDEX_op_extrh_i64_i32:
2543         tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2544         break;
2545 #endif
2546
2547     OP_32_64(deposit):
2548         if (args[3] == 0 && args[4] == 8) {
2549             /* load bits 0..7 */
2550             tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2551         } else if (args[3] == 8 && args[4] == 8) {
2552             /* load bits 8..15 */
2553             tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2554         } else if (args[3] == 0 && args[4] == 16) {
2555             /* load bits 0..15 */
2556             tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2557         } else {
2558             tcg_abort();
2559         }
2560         break;
2561
2562     case INDEX_op_extract_i64:
2563         if (a2 + args[3] == 32) {
2564             /* This is a 32-bit zero-extending right shift.  */
2565             tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2566             tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2567             break;
2568         }
2569         /* FALLTHRU */
2570     case INDEX_op_extract_i32:
2571         /* On the off-chance that we can use the high-byte registers.
2572            Otherwise we emit the same ext16 + shift pattern that we
2573            would have gotten from the normal tcg-op.c expansion.  */
2574         tcg_debug_assert(a2 == 8 && args[3] == 8);
2575         if (a1 < 4 && a0 < 8) {
2576             tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2577         } else {
2578             tcg_out_ext16u(s, a0, a1);
2579             tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2580         }
2581         break;
2582
2583     case INDEX_op_sextract_i32:
2584         /* We don't implement sextract_i64, as we cannot sign-extend to
2585            64-bits without using the REX prefix that explicitly excludes
2586            access to the high-byte registers.  */
2587         tcg_debug_assert(a2 == 8 && args[3] == 8);
2588         if (a1 < 4 && a0 < 8) {
2589             tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2590         } else {
2591             tcg_out_ext16s(s, a0, a1, 0);
2592             tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2593         }
2594         break;
2595
2596     OP_32_64(extract2):
2597         /* Note that SHRD outputs to the r/m operand.  */
2598         tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2599         tcg_out8(s, args[3]);
2600         break;
2601
2602     case INDEX_op_mb:
2603         tcg_out_mb(s, a0);
2604         break;
2605     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2606     case INDEX_op_mov_i64:
2607     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2608     default:
2609         tcg_abort();
2610     }
2611
2612 #undef OP_32_64
2613 }
2614
2615 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2616                            unsigned vecl, unsigned vece,
2617                            const TCGArg args[TCG_MAX_OP_ARGS],
2618                            const int const_args[TCG_MAX_OP_ARGS])
2619 {
2620     static int const add_insn[4] = {
2621         OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2622     };
2623     static int const ssadd_insn[4] = {
2624         OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2625     };
2626     static int const usadd_insn[4] = {
2627         OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2628     };
2629     static int const sub_insn[4] = {
2630         OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2631     };
2632     static int const sssub_insn[4] = {
2633         OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2634     };
2635     static int const ussub_insn[4] = {
2636         OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2637     };
2638     static int const mul_insn[4] = {
2639         OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
2640     };
2641     static int const shift_imm_insn[4] = {
2642         OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2643     };
2644     static int const cmpeq_insn[4] = {
2645         OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2646     };
2647     static int const cmpgt_insn[4] = {
2648         OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2649     };
2650     static int const punpckl_insn[4] = {
2651         OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2652     };
2653     static int const punpckh_insn[4] = {
2654         OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2655     };
2656     static int const packss_insn[4] = {
2657         OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2658     };
2659     static int const packus_insn[4] = {
2660         OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2661     };
2662     static int const smin_insn[4] = {
2663         OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
2664     };
2665     static int const smax_insn[4] = {
2666         OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
2667     };
2668     static int const umin_insn[4] = {
2669         OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
2670     };
2671     static int const umax_insn[4] = {
2672         OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
2673     };
2674     static int const shlv_insn[4] = {
2675         /* TODO: AVX512 adds support for MO_16.  */
2676         OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
2677     };
2678     static int const shrv_insn[4] = {
2679         /* TODO: AVX512 adds support for MO_16.  */
2680         OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
2681     };
2682     static int const sarv_insn[4] = {
2683         /* TODO: AVX512 adds support for MO_16, MO_64.  */
2684         OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
2685     };
2686     static int const shls_insn[4] = {
2687         OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2688     };
2689     static int const shrs_insn[4] = {
2690         OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2691     };
2692     static int const sars_insn[4] = {
2693         OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2
2694     };
2695     static int const abs_insn[4] = {
2696         /* TODO: AVX512 adds support for MO_64.  */
2697         OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
2698     };
2699
2700     TCGType type = vecl + TCG_TYPE_V64;
2701     int insn, sub;
2702     TCGArg a0, a1, a2;
2703
2704     a0 = args[0];
2705     a1 = args[1];
2706     a2 = args[2];
2707
2708     switch (opc) {
2709     case INDEX_op_add_vec:
2710         insn = add_insn[vece];
2711         goto gen_simd;
2712     case INDEX_op_ssadd_vec:
2713         insn = ssadd_insn[vece];
2714         goto gen_simd;
2715     case INDEX_op_usadd_vec:
2716         insn = usadd_insn[vece];
2717         goto gen_simd;
2718     case INDEX_op_sub_vec:
2719         insn = sub_insn[vece];
2720         goto gen_simd;
2721     case INDEX_op_sssub_vec:
2722         insn = sssub_insn[vece];
2723         goto gen_simd;
2724     case INDEX_op_ussub_vec:
2725         insn = ussub_insn[vece];
2726         goto gen_simd;
2727     case INDEX_op_mul_vec:
2728         insn = mul_insn[vece];
2729         goto gen_simd;
2730     case INDEX_op_and_vec:
2731         insn = OPC_PAND;
2732         goto gen_simd;
2733     case INDEX_op_or_vec:
2734         insn = OPC_POR;
2735         goto gen_simd;
2736     case INDEX_op_xor_vec:
2737         insn = OPC_PXOR;
2738         goto gen_simd;
2739     case INDEX_op_smin_vec:
2740         insn = smin_insn[vece];
2741         goto gen_simd;
2742     case INDEX_op_umin_vec:
2743         insn = umin_insn[vece];
2744         goto gen_simd;
2745     case INDEX_op_smax_vec:
2746         insn = smax_insn[vece];
2747         goto gen_simd;
2748     case INDEX_op_umax_vec:
2749         insn = umax_insn[vece];
2750         goto gen_simd;
2751     case INDEX_op_shlv_vec:
2752         insn = shlv_insn[vece];
2753         goto gen_simd;
2754     case INDEX_op_shrv_vec:
2755         insn = shrv_insn[vece];
2756         goto gen_simd;
2757     case INDEX_op_sarv_vec:
2758         insn = sarv_insn[vece];
2759         goto gen_simd;
2760     case INDEX_op_shls_vec:
2761         insn = shls_insn[vece];
2762         goto gen_simd;
2763     case INDEX_op_shrs_vec:
2764         insn = shrs_insn[vece];
2765         goto gen_simd;
2766     case INDEX_op_sars_vec:
2767         insn = sars_insn[vece];
2768         goto gen_simd;
2769     case INDEX_op_x86_punpckl_vec:
2770         insn = punpckl_insn[vece];
2771         goto gen_simd;
2772     case INDEX_op_x86_punpckh_vec:
2773         insn = punpckh_insn[vece];
2774         goto gen_simd;
2775     case INDEX_op_x86_packss_vec:
2776         insn = packss_insn[vece];
2777         goto gen_simd;
2778     case INDEX_op_x86_packus_vec:
2779         insn = packus_insn[vece];
2780         goto gen_simd;
2781 #if TCG_TARGET_REG_BITS == 32
2782     case INDEX_op_dup2_vec:
2783         /* First merge the two 32-bit inputs to a single 64-bit element. */
2784         tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
2785         /* Then replicate the 64-bit elements across the rest of the vector. */
2786         if (type != TCG_TYPE_V64) {
2787             tcg_out_dup_vec(s, type, MO_64, a0, a0);
2788         }
2789         break;
2790 #endif
2791     case INDEX_op_abs_vec:
2792         insn = abs_insn[vece];
2793         a2 = a1;
2794         a1 = 0;
2795         goto gen_simd;
2796     gen_simd:
2797         tcg_debug_assert(insn != OPC_UD2);
2798         if (type == TCG_TYPE_V256) {
2799             insn |= P_VEXL;
2800         }
2801         tcg_out_vex_modrm(s, insn, a0, a1, a2);
2802         break;
2803
2804     case INDEX_op_cmp_vec:
2805         sub = args[3];
2806         if (sub == TCG_COND_EQ) {
2807             insn = cmpeq_insn[vece];
2808         } else if (sub == TCG_COND_GT) {
2809             insn = cmpgt_insn[vece];
2810         } else {
2811             g_assert_not_reached();
2812         }
2813         goto gen_simd;
2814
2815     case INDEX_op_andc_vec:
2816         insn = OPC_PANDN;
2817         if (type == TCG_TYPE_V256) {
2818             insn |= P_VEXL;
2819         }
2820         tcg_out_vex_modrm(s, insn, a0, a2, a1);
2821         break;
2822
2823     case INDEX_op_shli_vec:
2824         sub = 6;
2825         goto gen_shift;
2826     case INDEX_op_shri_vec:
2827         sub = 2;
2828         goto gen_shift;
2829     case INDEX_op_sari_vec:
2830         tcg_debug_assert(vece != MO_64);
2831         sub = 4;
2832     gen_shift:
2833         tcg_debug_assert(vece != MO_8);
2834         insn = shift_imm_insn[vece];
2835         if (type == TCG_TYPE_V256) {
2836             insn |= P_VEXL;
2837         }
2838         tcg_out_vex_modrm(s, insn, sub, a0, a1);
2839         tcg_out8(s, a2);
2840         break;
2841
2842     case INDEX_op_ld_vec:
2843         tcg_out_ld(s, type, a0, a1, a2);
2844         break;
2845     case INDEX_op_st_vec:
2846         tcg_out_st(s, type, a0, a1, a2);
2847         break;
2848     case INDEX_op_dupm_vec:
2849         tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
2850         break;
2851
2852     case INDEX_op_x86_shufps_vec:
2853         insn = OPC_SHUFPS;
2854         sub = args[3];
2855         goto gen_simd_imm8;
2856     case INDEX_op_x86_blend_vec:
2857         if (vece == MO_16) {
2858             insn = OPC_PBLENDW;
2859         } else if (vece == MO_32) {
2860             insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2861         } else {
2862             g_assert_not_reached();
2863         }
2864         sub = args[3];
2865         goto gen_simd_imm8;
2866     case INDEX_op_x86_vperm2i128_vec:
2867         insn = OPC_VPERM2I128;
2868         sub = args[3];
2869         goto gen_simd_imm8;
2870     gen_simd_imm8:
2871         if (type == TCG_TYPE_V256) {
2872             insn |= P_VEXL;
2873         }
2874         tcg_out_vex_modrm(s, insn, a0, a1, a2);
2875         tcg_out8(s, sub);
2876         break;
2877
2878     case INDEX_op_x86_vpblendvb_vec:
2879         insn = OPC_VPBLENDVB;
2880         if (type == TCG_TYPE_V256) {
2881             insn |= P_VEXL;
2882         }
2883         tcg_out_vex_modrm(s, insn, a0, a1, a2);
2884         tcg_out8(s, args[3] << 4);
2885         break;
2886
2887     case INDEX_op_x86_psrldq_vec:
2888         tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
2889         tcg_out8(s, a2);
2890         break;
2891
2892     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
2893     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
2894     default:
2895         g_assert_not_reached();
2896     }
2897 }
2898
2899 static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
2900 {
2901     switch (op) {
2902     case INDEX_op_goto_ptr:
2903         return C_O0_I1(r);
2904
2905     case INDEX_op_ld8u_i32:
2906     case INDEX_op_ld8u_i64:
2907     case INDEX_op_ld8s_i32:
2908     case INDEX_op_ld8s_i64:
2909     case INDEX_op_ld16u_i32:
2910     case INDEX_op_ld16u_i64:
2911     case INDEX_op_ld16s_i32:
2912     case INDEX_op_ld16s_i64:
2913     case INDEX_op_ld_i32:
2914     case INDEX_op_ld32u_i64:
2915     case INDEX_op_ld32s_i64:
2916     case INDEX_op_ld_i64:
2917         return C_O1_I1(r, r);
2918
2919     case INDEX_op_st8_i32:
2920     case INDEX_op_st8_i64:
2921         return C_O0_I2(qi, r);
2922
2923     case INDEX_op_st16_i32:
2924     case INDEX_op_st16_i64:
2925     case INDEX_op_st_i32:
2926     case INDEX_op_st32_i64:
2927         return C_O0_I2(ri, r);
2928
2929     case INDEX_op_st_i64:
2930         return C_O0_I2(re, r);
2931
2932     case INDEX_op_add_i32:
2933     case INDEX_op_add_i64:
2934         return C_O1_I2(r, r, re);
2935
2936     case INDEX_op_sub_i32:
2937     case INDEX_op_sub_i64:
2938     case INDEX_op_mul_i32:
2939     case INDEX_op_mul_i64:
2940     case INDEX_op_or_i32:
2941     case INDEX_op_or_i64:
2942     case INDEX_op_xor_i32:
2943     case INDEX_op_xor_i64:
2944         return C_O1_I2(r, 0, re);
2945
2946     case INDEX_op_and_i32:
2947     case INDEX_op_and_i64:
2948         return C_O1_I2(r, 0, reZ);
2949
2950     case INDEX_op_andc_i32:
2951     case INDEX_op_andc_i64:
2952         return C_O1_I2(r, r, rI);
2953
2954     case INDEX_op_shl_i32:
2955     case INDEX_op_shl_i64:
2956     case INDEX_op_shr_i32:
2957     case INDEX_op_shr_i64:
2958     case INDEX_op_sar_i32:
2959     case INDEX_op_sar_i64:
2960         return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
2961
2962     case INDEX_op_rotl_i32:
2963     case INDEX_op_rotl_i64:
2964     case INDEX_op_rotr_i32:
2965     case INDEX_op_rotr_i64:
2966         return C_O1_I2(r, 0, ci);
2967
2968     case INDEX_op_brcond_i32:
2969     case INDEX_op_brcond_i64:
2970         return C_O0_I2(r, re);
2971
2972     case INDEX_op_bswap16_i32:
2973     case INDEX_op_bswap16_i64:
2974     case INDEX_op_bswap32_i32:
2975     case INDEX_op_bswap32_i64:
2976     case INDEX_op_bswap64_i64:
2977     case INDEX_op_neg_i32:
2978     case INDEX_op_neg_i64:
2979     case INDEX_op_not_i32:
2980     case INDEX_op_not_i64:
2981     case INDEX_op_extrh_i64_i32:
2982         return C_O1_I1(r, 0);
2983
2984     case INDEX_op_ext8s_i32:
2985     case INDEX_op_ext8s_i64:
2986     case INDEX_op_ext8u_i32:
2987     case INDEX_op_ext8u_i64:
2988         return C_O1_I1(r, q);
2989
2990     case INDEX_op_ext16s_i32:
2991     case INDEX_op_ext16s_i64:
2992     case INDEX_op_ext16u_i32:
2993     case INDEX_op_ext16u_i64:
2994     case INDEX_op_ext32s_i64:
2995     case INDEX_op_ext32u_i64:
2996     case INDEX_op_ext_i32_i64:
2997     case INDEX_op_extu_i32_i64:
2998     case INDEX_op_extrl_i64_i32:
2999     case INDEX_op_extract_i32:
3000     case INDEX_op_extract_i64:
3001     case INDEX_op_sextract_i32:
3002     case INDEX_op_ctpop_i32:
3003     case INDEX_op_ctpop_i64:
3004         return C_O1_I1(r, r);
3005
3006     case INDEX_op_extract2_i32:
3007     case INDEX_op_extract2_i64:
3008         return C_O1_I2(r, 0, r);
3009
3010     case INDEX_op_deposit_i32:
3011     case INDEX_op_deposit_i64:
3012         return C_O1_I2(Q, 0, Q);
3013
3014     case INDEX_op_setcond_i32:
3015     case INDEX_op_setcond_i64:
3016         return C_O1_I2(q, r, re);
3017
3018     case INDEX_op_movcond_i32:
3019     case INDEX_op_movcond_i64:
3020         return C_O1_I4(r, r, re, r, 0);
3021
3022     case INDEX_op_div2_i32:
3023     case INDEX_op_div2_i64:
3024     case INDEX_op_divu2_i32:
3025     case INDEX_op_divu2_i64:
3026         return C_O2_I3(a, d, 0, 1, r);
3027
3028     case INDEX_op_mulu2_i32:
3029     case INDEX_op_mulu2_i64:
3030     case INDEX_op_muls2_i32:
3031     case INDEX_op_muls2_i64:
3032         return C_O2_I2(a, d, a, r);
3033
3034     case INDEX_op_add2_i32:
3035     case INDEX_op_add2_i64:
3036     case INDEX_op_sub2_i32:
3037     case INDEX_op_sub2_i64:
3038         return C_O2_I4(r, r, 0, 1, re, re);
3039
3040     case INDEX_op_ctz_i32:
3041     case INDEX_op_ctz_i64:
3042         return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3043
3044     case INDEX_op_clz_i32:
3045     case INDEX_op_clz_i64:
3046         return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3047
3048     case INDEX_op_qemu_ld_i32:
3049         return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3050                 ? C_O1_I1(r, L) : C_O1_I2(r, L, L));
3051
3052     case INDEX_op_qemu_st_i32:
3053         return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3054                 ? C_O0_I2(L, L) : C_O0_I3(L, L, L));
3055     case INDEX_op_qemu_st8_i32:
3056         return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3057                 ? C_O0_I2(s, L) : C_O0_I3(s, L, L));
3058
3059     case INDEX_op_qemu_ld_i64:
3060         return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
3061                 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L)
3062                 : C_O2_I2(r, r, L, L));
3063
3064     case INDEX_op_qemu_st_i64:
3065         return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L)
3066                 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(L, L, L)
3067                 : C_O0_I4(L, L, L, L));
3068
3069     case INDEX_op_brcond2_i32:
3070         return C_O0_I4(r, r, ri, ri);
3071
3072     case INDEX_op_setcond2_i32:
3073         return C_O1_I4(r, r, r, ri, ri);
3074
3075     case INDEX_op_ld_vec:
3076     case INDEX_op_dupm_vec:
3077         return C_O1_I1(x, r);
3078
3079     case INDEX_op_st_vec:
3080         return C_O0_I2(x, r);
3081
3082     case INDEX_op_add_vec:
3083     case INDEX_op_sub_vec:
3084     case INDEX_op_mul_vec:
3085     case INDEX_op_and_vec:
3086     case INDEX_op_or_vec:
3087     case INDEX_op_xor_vec:
3088     case INDEX_op_andc_vec:
3089     case INDEX_op_ssadd_vec:
3090     case INDEX_op_usadd_vec:
3091     case INDEX_op_sssub_vec:
3092     case INDEX_op_ussub_vec:
3093     case INDEX_op_smin_vec:
3094     case INDEX_op_umin_vec:
3095     case INDEX_op_smax_vec:
3096     case INDEX_op_umax_vec:
3097     case INDEX_op_shlv_vec:
3098     case INDEX_op_shrv_vec:
3099     case INDEX_op_sarv_vec:
3100     case INDEX_op_shls_vec:
3101     case INDEX_op_shrs_vec:
3102     case INDEX_op_sars_vec:
3103     case INDEX_op_rotls_vec:
3104     case INDEX_op_cmp_vec:
3105     case INDEX_op_x86_shufps_vec:
3106     case INDEX_op_x86_blend_vec:
3107     case INDEX_op_x86_packss_vec:
3108     case INDEX_op_x86_packus_vec:
3109     case INDEX_op_x86_vperm2i128_vec:
3110     case INDEX_op_x86_punpckl_vec:
3111     case INDEX_op_x86_punpckh_vec:
3112 #if TCG_TARGET_REG_BITS == 32
3113     case INDEX_op_dup2_vec:
3114 #endif
3115         return C_O1_I2(x, x, x);
3116
3117     case INDEX_op_abs_vec:
3118     case INDEX_op_dup_vec:
3119     case INDEX_op_shli_vec:
3120     case INDEX_op_shri_vec:
3121     case INDEX_op_sari_vec:
3122     case INDEX_op_x86_psrldq_vec:
3123         return C_O1_I1(x, x);
3124
3125     case INDEX_op_x86_vpblendvb_vec:
3126         return C_O1_I3(x, x, x, x);
3127
3128     default:
3129         g_assert_not_reached();
3130     }
3131 }
3132
3133 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3134 {
3135     switch (opc) {
3136     case INDEX_op_add_vec:
3137     case INDEX_op_sub_vec:
3138     case INDEX_op_and_vec:
3139     case INDEX_op_or_vec:
3140     case INDEX_op_xor_vec:
3141     case INDEX_op_andc_vec:
3142         return 1;
3143     case INDEX_op_rotli_vec:
3144     case INDEX_op_cmp_vec:
3145     case INDEX_op_cmpsel_vec:
3146         return -1;
3147
3148     case INDEX_op_shli_vec:
3149     case INDEX_op_shri_vec:
3150         /* We must expand the operation for MO_8.  */
3151         return vece == MO_8 ? -1 : 1;
3152
3153     case INDEX_op_sari_vec:
3154         /* We must expand the operation for MO_8.  */
3155         if (vece == MO_8) {
3156             return -1;
3157         }
3158         /* We can emulate this for MO_64, but it does not pay off
3159            unless we're producing at least 4 values.  */
3160         if (vece == MO_64) {
3161             return type >= TCG_TYPE_V256 ? -1 : 0;
3162         }
3163         return 1;
3164
3165     case INDEX_op_shls_vec:
3166     case INDEX_op_shrs_vec:
3167         return vece >= MO_16;
3168     case INDEX_op_sars_vec:
3169         return vece >= MO_16 && vece <= MO_32;
3170     case INDEX_op_rotls_vec:
3171         return vece >= MO_16 ? -1 : 0;
3172
3173     case INDEX_op_shlv_vec:
3174     case INDEX_op_shrv_vec:
3175         return have_avx2 && vece >= MO_32;
3176     case INDEX_op_sarv_vec:
3177         return have_avx2 && vece == MO_32;
3178     case INDEX_op_rotlv_vec:
3179     case INDEX_op_rotrv_vec:
3180         return have_avx2 && vece >= MO_32 ? -1 : 0;
3181
3182     case INDEX_op_mul_vec:
3183         if (vece == MO_8) {
3184             /* We can expand the operation for MO_8.  */
3185             return -1;
3186         }
3187         if (vece == MO_64) {
3188             return 0;
3189         }
3190         return 1;
3191
3192     case INDEX_op_ssadd_vec:
3193     case INDEX_op_usadd_vec:
3194     case INDEX_op_sssub_vec:
3195     case INDEX_op_ussub_vec:
3196         return vece <= MO_16;
3197     case INDEX_op_smin_vec:
3198     case INDEX_op_smax_vec:
3199     case INDEX_op_umin_vec:
3200     case INDEX_op_umax_vec:
3201     case INDEX_op_abs_vec:
3202         return vece <= MO_32;
3203
3204     default:
3205         return 0;
3206     }
3207 }
3208
3209 static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3210                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3211 {
3212     TCGv_vec t1, t2;
3213
3214     tcg_debug_assert(vece == MO_8);
3215
3216     t1 = tcg_temp_new_vec(type);
3217     t2 = tcg_temp_new_vec(type);
3218
3219     /*
3220      * Unpack to W, shift, and repack.  Tricky bits:
3221      * (1) Use punpck*bw x,x to produce DDCCBBAA,
3222      *     i.e. duplicate in other half of the 16-bit lane.
3223      * (2) For right-shift, add 8 so that the high half of the lane
3224      *     becomes zero.  For left-shift, and left-rotate, we must
3225      *     shift up and down again.
3226      * (3) Step 2 leaves high half zero such that PACKUSWB
3227      *     (pack with unsigned saturation) does not modify
3228      *     the quantity.
3229      */
3230     vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3231               tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3232     vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3233               tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3234
3235     if (opc != INDEX_op_rotli_vec) {
3236         imm += 8;
3237     }
3238     if (opc == INDEX_op_shri_vec) {
3239         tcg_gen_shri_vec(MO_16, t1, t1, imm);
3240         tcg_gen_shri_vec(MO_16, t2, t2, imm);
3241     } else {
3242         tcg_gen_shli_vec(MO_16, t1, t1, imm);
3243         tcg_gen_shli_vec(MO_16, t2, t2, imm);
3244         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3245         tcg_gen_shri_vec(MO_16, t2, t2, 8);
3246     }
3247
3248     vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3249               tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3250     tcg_temp_free_vec(t1);
3251     tcg_temp_free_vec(t2);
3252 }
3253
3254 static void expand_vec_sari(TCGType type, unsigned vece,
3255                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3256 {
3257     TCGv_vec t1, t2;
3258
3259     switch (vece) {
3260     case MO_8:
3261         /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3262         t1 = tcg_temp_new_vec(type);
3263         t2 = tcg_temp_new_vec(type);
3264         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3265                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3266         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3267                   tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3268         tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3269         tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3270         vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3271                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3272         tcg_temp_free_vec(t1);
3273         tcg_temp_free_vec(t2);
3274         break;
3275
3276     case MO_64:
3277         if (imm <= 32) {
3278             /*
3279              * We can emulate a small sign extend by performing an arithmetic
3280              * 32-bit shift and overwriting the high half of a 64-bit logical
3281              * shift.  Note that the ISA says shift of 32 is valid, but TCG
3282              * does not, so we have to bound the smaller shift -- we get the
3283              * same result in the high half either way.
3284              */
3285             t1 = tcg_temp_new_vec(type);
3286             tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3287             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3288             vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3289                       tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3290                       tcgv_vec_arg(t1), 0xaa);
3291             tcg_temp_free_vec(t1);
3292         } else {
3293             /* Otherwise we will need to use a compare vs 0 to produce
3294              * the sign-extend, shift and merge.
3295              */
3296             t1 = tcg_const_zeros_vec(type);
3297             tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
3298             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3299             tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3300             tcg_gen_or_vec(MO_64, v0, v0, t1);
3301             tcg_temp_free_vec(t1);
3302         }
3303         break;
3304
3305     default:
3306         g_assert_not_reached();
3307     }
3308 }
3309
3310 static void expand_vec_rotli(TCGType type, unsigned vece,
3311                              TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3312 {
3313     TCGv_vec t;
3314
3315     if (vece == MO_8) {
3316         expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3317         return;
3318     }
3319
3320     t = tcg_temp_new_vec(type);
3321     tcg_gen_shli_vec(vece, t, v1, imm);
3322     tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3323     tcg_gen_or_vec(vece, v0, v0, t);
3324     tcg_temp_free_vec(t);
3325 }
3326
3327 static void expand_vec_rotls(TCGType type, unsigned vece,
3328                              TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3329 {
3330     TCGv_i32 rsh;
3331     TCGv_vec t;
3332
3333     tcg_debug_assert(vece != MO_8);
3334
3335     t = tcg_temp_new_vec(type);
3336     rsh = tcg_temp_new_i32();
3337
3338     tcg_gen_neg_i32(rsh, lsh);
3339     tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3340     tcg_gen_shls_vec(vece, t, v1, lsh);
3341     tcg_gen_shrs_vec(vece, v0, v1, rsh);
3342     tcg_gen_or_vec(vece, v0, v0, t);
3343     tcg_temp_free_vec(t);
3344     tcg_temp_free_i32(rsh);
3345 }
3346
3347 static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3348                             TCGv_vec v1, TCGv_vec sh, bool right)
3349 {
3350     TCGv_vec t = tcg_temp_new_vec(type);
3351
3352     tcg_gen_dupi_vec(vece, t, 8 << vece);
3353     tcg_gen_sub_vec(vece, t, t, sh);
3354     if (right) {
3355         tcg_gen_shlv_vec(vece, t, v1, t);
3356         tcg_gen_shrv_vec(vece, v0, v1, sh);
3357     } else {
3358         tcg_gen_shrv_vec(vece, t, v1, t);
3359         tcg_gen_shlv_vec(vece, v0, v1, sh);
3360     }
3361     tcg_gen_or_vec(vece, v0, v0, t);
3362     tcg_temp_free_vec(t);
3363 }
3364
3365 static void expand_vec_mul(TCGType type, unsigned vece,
3366                            TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3367 {
3368     TCGv_vec t1, t2, t3, t4, zero;
3369
3370     tcg_debug_assert(vece == MO_8);
3371
3372     /*
3373      * Unpack v1 bytes to words, 0 | x.
3374      * Unpack v2 bytes to words, y | 0.
3375      * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3376      * Shift logical right by 8 bits to clear the high 8 bytes before
3377      * using an unsigned saturated pack.
3378      *
3379      * The difference between the V64, V128 and V256 cases is merely how
3380      * we distribute the expansion between temporaries.
3381      */
3382     switch (type) {
3383     case TCG_TYPE_V64:
3384         t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3385         t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3386         zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3387         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3388                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3389         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3390                   tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3391         tcg_gen_mul_vec(MO_16, t1, t1, t2);
3392         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3393         vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3394                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3395         tcg_temp_free_vec(t1);
3396         tcg_temp_free_vec(t2);
3397         break;
3398
3399     case TCG_TYPE_V128:
3400     case TCG_TYPE_V256:
3401         t1 = tcg_temp_new_vec(type);
3402         t2 = tcg_temp_new_vec(type);
3403         t3 = tcg_temp_new_vec(type);
3404         t4 = tcg_temp_new_vec(type);
3405         zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3406         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3407                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3408         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3409                   tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3410         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3411                   tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3412         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3413                   tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3414         tcg_gen_mul_vec(MO_16, t1, t1, t2);
3415         tcg_gen_mul_vec(MO_16, t3, t3, t4);
3416         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3417         tcg_gen_shri_vec(MO_16, t3, t3, 8);
3418         vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3419                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3420         tcg_temp_free_vec(t1);
3421         tcg_temp_free_vec(t2);
3422         tcg_temp_free_vec(t3);
3423         tcg_temp_free_vec(t4);
3424         break;
3425
3426     default:
3427         g_assert_not_reached();
3428     }
3429 }
3430
3431 static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3432                                  TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3433 {
3434     enum {
3435         NEED_INV  = 1,
3436         NEED_SWAP = 2,
3437         NEED_BIAS = 4,
3438         NEED_UMIN = 8,
3439         NEED_UMAX = 16,
3440     };
3441     TCGv_vec t1, t2, t3;
3442     uint8_t fixup;
3443
3444     switch (cond) {
3445     case TCG_COND_EQ:
3446     case TCG_COND_GT:
3447         fixup = 0;
3448         break;
3449     case TCG_COND_NE:
3450     case TCG_COND_LE:
3451         fixup = NEED_INV;
3452         break;
3453     case TCG_COND_LT:
3454         fixup = NEED_SWAP;
3455         break;
3456     case TCG_COND_GE:
3457         fixup = NEED_SWAP | NEED_INV;
3458         break;
3459     case TCG_COND_LEU:
3460         if (vece <= MO_32) {
3461             fixup = NEED_UMIN;
3462         } else {
3463             fixup = NEED_BIAS | NEED_INV;
3464         }
3465         break;
3466     case TCG_COND_GTU:
3467         if (vece <= MO_32) {
3468             fixup = NEED_UMIN | NEED_INV;
3469         } else {
3470             fixup = NEED_BIAS;
3471         }
3472         break;
3473     case TCG_COND_GEU:
3474         if (vece <= MO_32) {
3475             fixup = NEED_UMAX;
3476         } else {
3477             fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3478         }
3479         break;
3480     case TCG_COND_LTU:
3481         if (vece <= MO_32) {
3482             fixup = NEED_UMAX | NEED_INV;
3483         } else {
3484             fixup = NEED_BIAS | NEED_SWAP;
3485         }
3486         break;
3487     default:
3488         g_assert_not_reached();
3489     }
3490
3491     if (fixup & NEED_INV) {
3492         cond = tcg_invert_cond(cond);
3493     }
3494     if (fixup & NEED_SWAP) {
3495         t1 = v1, v1 = v2, v2 = t1;
3496         cond = tcg_swap_cond(cond);
3497     }
3498
3499     t1 = t2 = NULL;
3500     if (fixup & (NEED_UMIN | NEED_UMAX)) {
3501         t1 = tcg_temp_new_vec(type);
3502         if (fixup & NEED_UMIN) {
3503             tcg_gen_umin_vec(vece, t1, v1, v2);
3504         } else {
3505             tcg_gen_umax_vec(vece, t1, v1, v2);
3506         }
3507         v2 = t1;
3508         cond = TCG_COND_EQ;
3509     } else if (fixup & NEED_BIAS) {
3510         t1 = tcg_temp_new_vec(type);
3511         t2 = tcg_temp_new_vec(type);
3512         t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
3513         tcg_gen_sub_vec(vece, t1, v1, t3);
3514         tcg_gen_sub_vec(vece, t2, v2, t3);
3515         v1 = t1;
3516         v2 = t2;
3517         cond = tcg_signed_cond(cond);
3518     }
3519
3520     tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3521     /* Expand directly; do not recurse.  */
3522     vec_gen_4(INDEX_op_cmp_vec, type, vece,
3523               tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3524
3525     if (t1) {
3526         tcg_temp_free_vec(t1);
3527         if (t2) {
3528             tcg_temp_free_vec(t2);
3529         }
3530     }
3531     return fixup & NEED_INV;
3532 }
3533
3534 static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3535                            TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3536 {
3537     if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3538         tcg_gen_not_vec(vece, v0, v0);
3539     }
3540 }
3541
3542 static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3543                               TCGv_vec c1, TCGv_vec c2,
3544                               TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3545 {
3546     TCGv_vec t = tcg_temp_new_vec(type);
3547
3548     if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3549         /* Invert the sense of the compare by swapping arguments.  */
3550         TCGv_vec x;
3551         x = v3, v3 = v4, v4 = x;
3552     }
3553     vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3554               tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3555               tcgv_vec_arg(v3), tcgv_vec_arg(t));
3556     tcg_temp_free_vec(t);
3557 }
3558
3559 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3560                        TCGArg a0, ...)
3561 {
3562     va_list va;
3563     TCGArg a2;
3564     TCGv_vec v0, v1, v2, v3, v4;
3565
3566     va_start(va, a0);
3567     v0 = temp_tcgv_vec(arg_temp(a0));
3568     v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3569     a2 = va_arg(va, TCGArg);
3570
3571     switch (opc) {
3572     case INDEX_op_shli_vec:
3573     case INDEX_op_shri_vec:
3574         expand_vec_shi(type, vece, opc, v0, v1, a2);
3575         break;
3576
3577     case INDEX_op_sari_vec:
3578         expand_vec_sari(type, vece, v0, v1, a2);
3579         break;
3580
3581     case INDEX_op_rotli_vec:
3582         expand_vec_rotli(type, vece, v0, v1, a2);
3583         break;
3584
3585     case INDEX_op_rotls_vec:
3586         expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3587         break;
3588
3589     case INDEX_op_rotlv_vec:
3590         v2 = temp_tcgv_vec(arg_temp(a2));
3591         expand_vec_rotv(type, vece, v0, v1, v2, false);
3592         break;
3593     case INDEX_op_rotrv_vec:
3594         v2 = temp_tcgv_vec(arg_temp(a2));
3595         expand_vec_rotv(type, vece, v0, v1, v2, true);
3596         break;
3597
3598     case INDEX_op_mul_vec:
3599         v2 = temp_tcgv_vec(arg_temp(a2));
3600         expand_vec_mul(type, vece, v0, v1, v2);
3601         break;
3602
3603     case INDEX_op_cmp_vec:
3604         v2 = temp_tcgv_vec(arg_temp(a2));
3605         expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3606         break;
3607
3608     case INDEX_op_cmpsel_vec:
3609         v2 = temp_tcgv_vec(arg_temp(a2));
3610         v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3611         v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3612         expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3613         break;
3614
3615     default:
3616         break;
3617     }
3618
3619     va_end(va);
3620 }
3621
3622 static const int tcg_target_callee_save_regs[] = {
3623 #if TCG_TARGET_REG_BITS == 64
3624     TCG_REG_RBP,
3625     TCG_REG_RBX,
3626 #if defined(_WIN64)
3627     TCG_REG_RDI,
3628     TCG_REG_RSI,
3629 #endif
3630     TCG_REG_R12,
3631     TCG_REG_R13,
3632     TCG_REG_R14, /* Currently used for the global env. */
3633     TCG_REG_R15,
3634 #else
3635     TCG_REG_EBP, /* Currently used for the global env. */
3636     TCG_REG_EBX,
3637     TCG_REG_ESI,
3638     TCG_REG_EDI,
3639 #endif
3640 };
3641
/*
 * The frame size is computed with macros so that the same constants are
 * shared between tcg_target_qemu_prologue and tcg_register_jit.
 */

/*
 * Bytes taken by the callee-saved registers plus one more register-sized
 * slot (going by the debug frame offsets, the return address).
 */
#define PUSH_SIZE \
    ((ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * (TCG_TARGET_REG_BITS / 8))

/*
 * Whole frame: pushed words, static call-argument area and the TCG
 * temporary buffer, rounded up to a multiple of TCG_TARGET_STACK_ALIGN.
 */
#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + (TCG_TARGET_STACK_ALIGN - 1)) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
3655
3656 /* Generate global QEMU prologue and epilogue code */
3657 static void tcg_target_qemu_prologue(TCGContext *s)
3658 {
3659     int i, stack_addend;
3660
3661     /* TB prologue */
3662
3663     /* Reserve some stack space, also for TCG temps.  */
3664     stack_addend = FRAME_SIZE - PUSH_SIZE;
3665     tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3666                   CPU_TEMP_BUF_NLONGS * sizeof(long));
3667
3668     /* Save all callee saved registers.  */
3669     for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3670         tcg_out_push(s, tcg_target_callee_save_regs[i]);
3671     }
3672
3673 #if TCG_TARGET_REG_BITS == 32
3674     tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3675                (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3676     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3677     /* jmp *tb.  */
3678     tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3679                          (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3680                          + stack_addend);
3681 #else
3682 # if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
3683     if (guest_base) {
3684         int seg = setup_guest_base_seg();
3685         if (seg != 0) {
3686             x86_guest_base_seg = seg;
3687         } else if (guest_base == (int32_t)guest_base) {
3688             x86_guest_base_offset = guest_base;
3689         } else {
3690             /* Choose R12 because, as a base, it requires a SIB byte. */
3691             x86_guest_base_index = TCG_REG_R12;
3692             tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
3693             tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
3694         }
3695     }
3696 # endif
3697     tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3698     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3699     /* jmp *tb.  */
3700     tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3701 #endif
3702
3703     /*
3704      * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3705      * and fall through to the rest of the epilogue.
3706      */
3707     tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
3708     tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3709
3710     /* TB epilogue */
3711     tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
3712
3713     tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3714
3715     if (have_avx2) {
3716         tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3717     }
3718     for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3719         tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3720     }
3721     tcg_out_opc(s, OPC_RET, 0, 0, 0);
3722 }
3723
3724 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3725 {
3726     memset(p, 0x90, count);
3727 }
3728
3729 static void tcg_target_init(TCGContext *s)
3730 {
3731 #ifdef CONFIG_CPUID_H
3732     unsigned a, b, c, d, b7 = 0;
3733     int max = __get_cpuid_max(0, 0);
3734
3735     if (max >= 7) {
3736         /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
3737         __cpuid_count(7, 0, a, b7, c, d);
3738         have_bmi1 = (b7 & bit_BMI) != 0;
3739         have_bmi2 = (b7 & bit_BMI2) != 0;
3740     }
3741
3742     if (max >= 1) {
3743         __cpuid(1, a, b, c, d);
3744 #ifndef have_cmov
3745         /* For 32-bit, 99% certainty that we're running on hardware that
3746            supports cmov, but we still need to check.  In case cmov is not
3747            available, we'll use a small forward branch.  */
3748         have_cmov = (d & bit_CMOV) != 0;
3749 #endif
3750
3751         /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
3752            need to probe for it.  */
3753         have_movbe = (c & bit_MOVBE) != 0;
3754         have_popcnt = (c & bit_POPCNT) != 0;
3755
3756         /* There are a number of things we must check before we can be
3757            sure of not hitting invalid opcode.  */
3758         if (c & bit_OSXSAVE) {
3759             unsigned xcrl, xcrh;
3760             /* The xgetbv instruction is not available to older versions of
3761              * the assembler, so we encode the instruction manually.
3762              */
3763             asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
3764             if ((xcrl & 6) == 6) {
3765                 have_avx1 = (c & bit_AVX) != 0;
3766                 have_avx2 = (b7 & bit_AVX2) != 0;
3767             }
3768         }
3769     }
3770
3771     max = __get_cpuid_max(0x8000000, 0);
3772     if (max >= 1) {
3773         __cpuid(0x80000001, a, b, c, d);
3774         /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
3775         have_lzcnt = (c & bit_LZCNT) != 0;
3776     }
3777 #endif /* CONFIG_CPUID_H */
3778
3779     tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3780     if (TCG_TARGET_REG_BITS == 64) {
3781         tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3782     }
3783     if (have_avx1) {
3784         tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3785         tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3786     }
3787     if (have_avx2) {
3788         tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3789     }
3790
3791     tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
3792     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3793     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3794     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3795     if (TCG_TARGET_REG_BITS == 64) {
3796 #if !defined(_WIN64)
3797         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3798         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3799 #endif
3800         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3801         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3802         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3803         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3804     }
3805
3806     s->reserved_regs = 0;
3807     tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
3808 }
3809
3810 typedef struct {
3811     DebugFrameHeader h;
3812     uint8_t fde_def_cfa[4];
3813     uint8_t fde_reg_ofs[14];
3814 } DebugFrame;
3815
3816 /* We're expecting a 2 byte uleb128 encoded value.  */
3817 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
3818
3819 #if !defined(__ELF__)
3820     /* Host machine without ELF. */
3821 #elif TCG_TARGET_REG_BITS == 64
3822 #define ELF_HOST_MACHINE EM_X86_64
3823 static const DebugFrame debug_frame = {
3824     .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3825     .h.cie.id = -1,
3826     .h.cie.version = 1,
3827     .h.cie.code_align = 1,
3828     .h.cie.data_align = 0x78,             /* sleb128 -8 */
3829     .h.cie.return_column = 16,
3830
3831     /* Total FDE size does not include the "len" member.  */
3832     .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3833
3834     .fde_def_cfa = {
3835         12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
3836         (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3837         (FRAME_SIZE >> 7)
3838     },
3839     .fde_reg_ofs = {
3840         0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
3841         /* The following ordering must match tcg_target_callee_save_regs.  */
3842         0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
3843         0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
3844         0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
3845         0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
3846         0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
3847         0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
3848     }
3849 };
3850 #else
3851 #define ELF_HOST_MACHINE EM_386
3852 static const DebugFrame debug_frame = {
3853     .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3854     .h.cie.id = -1,
3855     .h.cie.version = 1,
3856     .h.cie.code_align = 1,
3857     .h.cie.data_align = 0x7c,             /* sleb128 -4 */
3858     .h.cie.return_column = 8,
3859
3860     /* Total FDE size does not include the "len" member.  */
3861     .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3862
3863     .fde_def_cfa = {
3864         12, 4,                          /* DW_CFA_def_cfa %esp, ... */
3865         (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3866         (FRAME_SIZE >> 7)
3867     },
3868     .fde_reg_ofs = {
3869         0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
3870         /* The following ordering must match tcg_target_callee_save_regs.  */
3871         0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
3872         0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
3873         0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
3874         0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
3875     }
3876 };
3877 #endif
3878
#ifdef ELF_HOST_MACHINE
/*
 * Register the code buffer BUF of BUF_SIZE bytes together with this
 * backend's static unwind description.  Only built when an ELF host
 * machine type was selected above.
 */
void tcg_register_jit(const void *buf, size_t buf_size)
{
    const DebugFrame *frame = &debug_frame;

    tcg_register_jit_int(buf, buf_size, frame, sizeof(*frame));
}
#endif