tcg/i386/tcg-target.inc.c

   1 /*
   2  * Tiny Code Generator for QEMU
   3  *
   4  * Copyright (c) 2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "tcg-pool.inc.c"
  26
   27 #ifdef CONFIG_DEBUG_TCG
      /* Printable names for the host registers; only built when
         CONFIG_DEBUG_TCG is defined.  The first eight names depend on the
         host word size, the %r8-%r15 names are listed in both configurations,
         and the %xmm8-%xmm15 names exist only for 64-bit hosts.
         NOTE(review): presumably indexed by TCGReg value -- confirm.  */
   28 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
   29 #if TCG_TARGET_REG_BITS == 64
   30     "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
   31 #else
   32     "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
   33 #endif
   34     "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
   35     "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
   36 #if TCG_TARGET_REG_BITS == 64
   37     "%xmm8", "%xmm9", "%xmm10", "%xmm11",
   38     "%xmm12", "%xmm13", "%xmm14", "%xmm15",
   39 #endif
   40 };
   41 #endif
  42
   43 static const int tcg_target_reg_alloc_order[] = {
          /* General registers come first, then the vector registers.
             NOTE(review): presumably the order in which the register
             allocator prefers to hand registers out -- confirm in the
             generic TCG code.  */
   44 #if TCG_TARGET_REG_BITS == 64
   45     TCG_REG_RBP,
   46     TCG_REG_RBX,
   47     TCG_REG_R12,
   48     TCG_REG_R13,
   49     TCG_REG_R14,
   50     TCG_REG_R15,
   51     TCG_REG_R10,
   52     TCG_REG_R11,
   53     TCG_REG_R9,
   54     TCG_REG_R8,
   55     TCG_REG_RCX,
   56     TCG_REG_RDX,
   57     TCG_REG_RSI,
   58     TCG_REG_RDI,
   59     TCG_REG_RAX,
   60 #else
   61     TCG_REG_EBX,
   62     TCG_REG_ESI,
   63     TCG_REG_EDI,
   64     TCG_REG_EBP,
   65     TCG_REG_ECX,
   66     TCG_REG_EDX,
   67     TCG_REG_EAX,
   68 #endif
   69     TCG_REG_XMM0,
   70     TCG_REG_XMM1,
   71     TCG_REG_XMM2,
   72     TCG_REG_XMM3,
   73     TCG_REG_XMM4,
   74     TCG_REG_XMM5,
   75 #ifndef _WIN64
   76     /* The Win64 ABI has xmm6-xmm15 as callee-saved, and we do not save
   77        any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
   78     TCG_REG_XMM6,
   79     TCG_REG_XMM7,
   80 #if TCG_TARGET_REG_BITS == 64
   81     TCG_REG_XMM8,
   82     TCG_REG_XMM9,
   83     TCG_REG_XMM10,
   84     TCG_REG_XMM11,
   85     TCG_REG_XMM12,
   86     TCG_REG_XMM13,
   87     TCG_REG_XMM14,
   88     TCG_REG_XMM15,
   89 #endif
   90 #endif
   91 };
  92
   93 static const int tcg_target_call_iarg_regs[] = {
          /* Registers listed for call input arguments, in order.  On 64-bit
             hosts: RCX, RDX, R8, R9 when _WIN64 is defined, otherwise
             RDI, RSI, RDX, RCX, R8, R9.  The list is empty on 32-bit
             hosts.  */
   94 #if TCG_TARGET_REG_BITS == 64
   95 #if defined(_WIN64)
   96     TCG_REG_RCX,
   97     TCG_REG_RDX,
   98 #else
   99     TCG_REG_RDI,
  100     TCG_REG_RSI,
  101     TCG_REG_RDX,
  102     TCG_REG_RCX,
  103 #endif
  104     TCG_REG_R8,
  105     TCG_REG_R9,
  106 #else
  107     /* 32 bit mode uses stack based calling convention (GCC default). */
  108 #endif
  109 };
 110
  111 static const int tcg_target_call_oarg_regs[] = {
          /* Registers listed for call output values: EAX always, plus EDX
             on 32-bit hosts.  */
  112     TCG_REG_EAX,
  113 #if TCG_TARGET_REG_BITS == 32
  114     TCG_REG_EDX
  115 #endif
  116 };
 117
  118 /* Constants we accept.  Each flag is tested in tcg_target_const_match.  */
  119 #define TCG_CT_CONST_S32 0x100   /* value equals its own (int32_t) cast */
  120 #define TCG_CT_CONST_U32 0x200   /* value equals its own (uint32_t) cast */
  121 #define TCG_CT_CONST_I32 0x400   /* ~value equals its own (int32_t) cast */
  122 #define TCG_CT_CONST_WSZ 0x800   /* value is 32 for TCG_TYPE_I32, else 64 */
  123
  124 /* Registers removed from the set by the L constraint, which are the
  125    first argument registers on x86_64, and two random call clobbered
  126    registers on i386. */
  127 #if TCG_TARGET_REG_BITS == 64
  128 # define TCG_REG_L0 tcg_target_call_iarg_regs[0]
  129 # define TCG_REG_L1 tcg_target_call_iarg_regs[1]
  130 #else
  131 # define TCG_REG_L0 TCG_REG_EAX
  132 # define TCG_REG_L1 TCG_REG_EDX
  133 #endif
 134
  135 /* The host compiler should supply <cpuid.h> to enable runtime feature
  136    detection, as we're not going to go so far as our own inline assembly.
  137    If not available, default values will be assumed.  */
  138 #if defined(CONFIG_CPUID_H)
  139 #include "qemu/cpuid.h"
  140 #endif
  141
  142 /* For 64-bit, we always know that CMOV is available.  For 32-bit it is
         a runtime flag when CONFIG_CPUID_H is defined, and constant 0
         otherwise.  */
  143 #if TCG_TARGET_REG_BITS == 64
  144 # define have_cmov 1
  145 #elif defined(CONFIG_CPUID_H)
  146 static bool have_cmov;
  147 #else
  148 # define have_cmov 0
  149 #endif
  150
  151 /* We need these symbols in tcg-target.h, and we can't properly conditionalize
  152    them there.  Therefore we always define the variables.  */
  153 bool have_bmi1;
  154 bool have_popcnt;
  155 bool have_avx1;
  156 bool have_avx2;
  157
      /* File-local feature flags: variables when CONFIG_CPUID_H is defined,
         constant 0 otherwise.  */
  158 #ifdef CONFIG_CPUID_H
  159 static bool have_movbe;
  160 static bool have_bmi2;
  161 static bool have_lzcnt;
  162 #else
  163 # define have_movbe 0
  164 # define have_bmi2 0
  165 # define have_lzcnt 0
  166 #endif
  167
  168 static tcg_insn_unit *tb_ret_addr;
 169
 170 static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
 171                         intptr_t value, intptr_t addend)
 172 {
 173     value += addend;
 174     switch(type) {
 175     case R_386_PC32:
 176         value -= (uintptr_t)code_ptr;
 177         if (value != (int32_t)value) {
 178             return false;
 179         }
 180         /* FALLTHRU */
 181     case R_386_32:
 182         tcg_patch32(code_ptr, value);
 183         break;
 184     case R_386_PC8:
 185         value -= (uintptr_t)code_ptr;
 186         if (value != (int8_t)value) {
 187             return false;
 188         }
 189         tcg_patch8(code_ptr, value);
 190         break;
 191     default:
 192         tcg_abort();
 193     }
 194     return true;
 195 }
 196
  197 #if TCG_TARGET_REG_BITS == 64
      /* Register-set bit masks: general registers occupy the low bits (16 on
         64-bit hosts, 8 on 32-bit hosts) and vector registers start at
         bit 16 in both configurations.  */
  198 #define ALL_GENERAL_REGS   0x0000ffffu
  199 #define ALL_VECTOR_REGS    0xffff0000u
  200 #else
  201 #define ALL_GENERAL_REGS   0x000000ffu
  202 #define ALL_VECTOR_REGS    0x00ff0000u
  203 #endif
 204
 205 /* parse target specific constraints */
 206 static const char *target_parse_constraint(TCGArgConstraint *ct,
 207                                            const char *ct_str, TCGType type)
 208 {
 209     switch(*ct_str++) {
 210     case 'a':
 211         ct->ct |= TCG_CT_REG;
 212         tcg_regset_set_reg(ct->u.regs, TCG_REG_EAX);
 213         break;
 214     case 'b':
 215         ct->ct |= TCG_CT_REG;
 216         tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
 217         break;
 218     case 'c':
 219         ct->ct |= TCG_CT_REG;
 220         tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
 221         break;
 222     case 'd':
 223         ct->ct |= TCG_CT_REG;
 224         tcg_regset_set_reg(ct->u.regs, TCG_REG_EDX);
 225         break;
 226     case 'S':
 227         ct->ct |= TCG_CT_REG;
 228         tcg_regset_set_reg(ct->u.regs, TCG_REG_ESI);
 229         break;
 230     case 'D':
 231         ct->ct |= TCG_CT_REG;
 232         tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
 233         break;
 234     case 'q':
 235         /* A register that can be used as a byte operand.  */
 236         ct->ct |= TCG_CT_REG;
 237         ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
 238         break;
 239     case 'Q':
 240         /* A register with an addressable second byte (e.g. %ah).  */
 241         ct->ct |= TCG_CT_REG;
 242         ct->u.regs = 0xf;
 243         break;
 244     case 'r':
 245         /* A general register.  */
 246         ct->ct |= TCG_CT_REG;
 247         ct->u.regs |= ALL_GENERAL_REGS;
 248         break;
 249     case 'W':
 250         /* With TZCNT/LZCNT, we can have operand-size as an input.  */
 251         ct->ct |= TCG_CT_CONST_WSZ;
 252         break;
 253     case 'x':
 254         /* A vector register.  */
 255         ct->ct |= TCG_CT_REG;
 256         ct->u.regs |= ALL_VECTOR_REGS;
 257         break;
 258
 259         /* qemu_ld/st address constraint */
 260     case 'L':
 261         ct->ct |= TCG_CT_REG;
 262         ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
 263         tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
 264         tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
 265         break;
 266
 267     case 'e':
 268         ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
 269         break;
 270     case 'Z':
 271         ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
 272         break;
 273     case 'I':
 274         ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
 275         break;
 276
 277     default:
 278         return NULL;
 279     }
 280     return ct_str;
 281 }
 282
 283 /* test if a constant matches the constraint */
 284 static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 285                                          const TCGArgConstraint *arg_ct)
 286 {
 287     int ct = arg_ct->ct;
 288     if (ct & TCG_CT_CONST) {
 289         return 1;
 290     }
 291     if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
 292         return 1;
 293     }
 294     if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
 295         return 1;
 296     }
 297     if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
 298         return 1;
 299     }
 300     if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
 301         return 1;
 302     }
 303     return 0;
 304 }
 305
  306 # define LOWREGMASK(x)  ((x) & 7)  /* low 3 bits, as used in ModRM/SIB fields */
  307
      /* Flag bits carried above the low opcode byte of an OPC_* value.  The
         opcode emitters (tcg_out_opc, tcg_out_vex_opc) turn them into prefix
         bytes, escape bytes or REX/VEX fields.  The flags in the #if below
         are defined as 0 on 32-bit hosts, so tests of them are always
         false there.  */
  308 #define P_EXT           0x100           /* 0x0f opcode prefix */
  309 #define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
  310 #define P_DATA16        0x400           /* 0x66 opcode prefix */
  311 #if TCG_TARGET_REG_BITS == 64
  312 # define P_REXW         0x1000          /* Set REX.W = 1 */
  313 # define P_REXB_R       0x2000          /* REG field as byte register */
  314 # define P_REXB_RM      0x4000          /* R/M field as byte register */
  315 # define P_GS           0x8000          /* gs segment override */
  316 #else
  317 # define P_REXW         0
  318 # define P_REXB_R       0
  319 # define P_REXB_RM      0
  320 # define P_GS           0
  321 #endif
  322 #define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
  323 #define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
  324 #define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
  325 #define P_VEXL          0x80000         /* Set VEX.L = 1 */
  326
      /* Opcode table.  Each OPC_* value is the final opcode byte OR'ed with
         the P_* flags that select its prefixes and escape bytes.  Entries
         marked "plus ..." are a base value to which the caller adds a
         condition code or an ARITH_* sub-operation.  */
  327 #define OPC_ARITH_EvIz  (0x81)
  328 #define OPC_ARITH_EvIb  (0x83)
  329 #define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
  330 #define OPC_ANDN        (0xf2 | P_EXT38)
  331 #define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
  332 #define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
  333 #define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
  334 #define OPC_BSF         (0xbc | P_EXT)
  335 #define OPC_BSR         (0xbd | P_EXT)
  336 #define OPC_BSWAP       (0xc8 | P_EXT)
  337 #define OPC_CALL_Jz     (0xe8)
  338 #define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
  339 #define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
  340 #define OPC_DEC_r32     (0x48)
  341 #define OPC_IMUL_GvEv   (0xaf | P_EXT)
  342 #define OPC_IMUL_GvEvIb (0x6b)
  343 #define OPC_IMUL_GvEvIz (0x69)
  344 #define OPC_INC_r32     (0x40)
  345 #define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
  346 #define OPC_JCC_short   (0x70)          /* ... plus condition code */
  347 #define OPC_JMP_long    (0xe9)
  348 #define OPC_JMP_short   (0xeb)
  349 #define OPC_LEA         (0x8d)
  350 #define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
  351 #define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
  352 #define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
  353 #define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
  354 #define OPC_MOVB_EvIz   (0xc6)
  355 #define OPC_MOVL_EvIz   (0xc7)
  356 #define OPC_MOVL_Iv     (0xb8)
  357 #define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
  358 #define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
  359 #define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
  360 #define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
  361 #define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
  362 #define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
  363 #define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
  364 #define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
  365 #define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
  366 #define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
  367 #define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
  368 #define OPC_MOVSBL      (0xbe | P_EXT)
  369 #define OPC_MOVSWL      (0xbf | P_EXT)
  370 #define OPC_MOVSLQ      (0x63 | P_REXW)
  371 #define OPC_MOVZBL      (0xb6 | P_EXT)
  372 #define OPC_MOVZWL      (0xb7 | P_EXT)
  373 #define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
  374 #define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
  375 #define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
  376 #define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
  377 #define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
  378 #define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
  379 #define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
  380 #define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
  381 #define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
  382 #define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
  383 #define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
  384 #define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
  385 #define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
  386 #define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
  387 #define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
  388 #define OPC_PAND        (0xdb | P_EXT | P_DATA16)
  389 #define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
  390 #define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
  391 #define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
  392 #define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
  393 #define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
  394 #define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
  395 #define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
  396 #define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
  397 #define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
  398 #define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
  399 #define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
  400 #define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
  401 #define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
  402 #define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
  403 #define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
  404 #define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
  405 #define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
  406 #define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
  407 #define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
  408 #define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
  409 #define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
  410 #define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
  411 #define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
  412 #define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
  413 #define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
  414 #define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
  415 #define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
  416 #define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
  417 #define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
  418 #define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
  419 #define OPC_POR         (0xeb | P_EXT | P_DATA16)
  420 #define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
  421 #define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
  422 #define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
  423 #define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
  424 #define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
  425 #define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
  426 #define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
  427 #define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
  428 #define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
  429 #define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
  430 #define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
  431 #define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
  432 #define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
  433 #define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
  434 #define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
  435 #define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
  436 #define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
  437 #define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
  438 #define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
  439 #define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
  440 #define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
  441 #define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
  442 #define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
  443 #define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
  444 #define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
  445 #define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
  446 #define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
  447 #define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
  448 #define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
  449 #define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
  450 #define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
  451 #define OPC_PXOR        (0xef | P_EXT | P_DATA16)
  452 #define OPC_POP_r32     (0x58)
  453 #define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
  454 #define OPC_PUSH_r32    (0x50)
  455 #define OPC_PUSH_Iv     (0x68)
  456 #define OPC_PUSH_Ib     (0x6a)
  457 #define OPC_RET         (0xc3)
  458 #define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
  459 #define OPC_SHIFT_1     (0xd1)
  460 #define OPC_SHIFT_Ib    (0xc1)
  461 #define OPC_SHIFT_cl    (0xd3)
  462 #define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
  463 #define OPC_SHUFPS      (0xc6 | P_EXT)
  464 #define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
  465 #define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
  466 #define OPC_SHRD_Ib     (0xac | P_EXT)
  467 #define OPC_TESTL       (0x85)
  468 #define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
  469 #define OPC_UD2         (0x0b | P_EXT)
  470 #define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
  471 #define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
  472 #define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
  473 #define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
  474 #define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
  475 #define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
  476 #define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
  477 #define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
  478 #define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
  479 #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
  480 #define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_REXW)
  481 #define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
  482 #define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
  483 #define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_REXW)
  484 #define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
  485 #define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
  486 #define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_REXW)
  487 #define OPC_VZEROUPPER  (0x77 | P_EXT)
  488 #define OPC_XCHG_ax_r32 (0x90)
  489
  490 #define OPC_GRP3_Ev     (0xf7)
  491 #define OPC_GRP5        (0xff)
  492 #define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
 493
  494 /* Group 1 opcode extensions for 0x80-0x83.
  495    These are also used as modifiers for OPC_ARITH.  */
  496 #define ARITH_ADD 0
  497 #define ARITH_OR  1
  498 #define ARITH_ADC 2
  499 #define ARITH_SBB 3
  500 #define ARITH_AND 4
  501 #define ARITH_SUB 5
  502 #define ARITH_XOR 6
  503 #define ARITH_CMP 7
  504
  505 /* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
  506 #define SHIFT_ROL 0
  507 #define SHIFT_ROR 1
  508 #define SHIFT_SHL 4
  509 #define SHIFT_SHR 5
  510 #define SHIFT_SAR 7
  511
  512 /* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3_Ev.  */
  513 #define EXT3_NOT   2
  514 #define EXT3_NEG   3
  515 #define EXT3_MUL   4
  516 #define EXT3_IMUL  5
  517 #define EXT3_DIV   6
  518 #define EXT3_IDIV  7
  519
  520 /* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
  521 #define EXT5_INC_Ev     0
  522 #define EXT5_DEC_Ev     1
  523 #define EXT5_CALLN_Ev   2
  524 #define EXT5_JMPN_Ev    4
  525
  526 /* Condition codes to be added to OPC_JCC_{long,short}.  JCC_JMP is the
         only negative value and is not a condition code to be added.
         NOTE(review): presumably it requests an unconditional jump --
         confirm at its users.  */
  527 #define JCC_JMP (-1)
  528 #define JCC_JO  0x0
  529 #define JCC_JNO 0x1
  530 #define JCC_JB  0x2
  531 #define JCC_JAE 0x3
  532 #define JCC_JE  0x4
  533 #define JCC_JNE 0x5
  534 #define JCC_JBE 0x6
  535 #define JCC_JA  0x7
  536 #define JCC_JS  0x8
  537 #define JCC_JNS 0x9
  538 #define JCC_JP  0xa
  539 #define JCC_JNP 0xb
  540 #define JCC_JL  0xc
  541 #define JCC_JGE 0xd
  542 #define JCC_JLE 0xe
  543 #define JCC_JG  0xf
  544
      /* Map each TCG comparison condition to the JCC_* condition code that
         tests it: the signed orderings use JL/JGE/JLE/JG and the unsigned
         (..U) orderings use JB/JAE/JBE/JA.  */
  545 static const uint8_t tcg_cond_to_jcc[] = {
  546     [TCG_COND_EQ] = JCC_JE,
  547     [TCG_COND_NE] = JCC_JNE,
  548     [TCG_COND_LT] = JCC_JL,
  549     [TCG_COND_GE] = JCC_JGE,
  550     [TCG_COND_LE] = JCC_JLE,
  551     [TCG_COND_GT] = JCC_JG,
  552     [TCG_COND_LTU] = JCC_JB,
  553     [TCG_COND_GEU] = JCC_JAE,
  554     [TCG_COND_LEU] = JCC_JBE,
  555     [TCG_COND_GTU] = JCC_JA,
  556 };
 557
 558 #if TCG_TARGET_REG_BITS == 64
 559 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 560 {
 561     int rex;
 562
 563     if (opc & P_GS) {
 564         tcg_out8(s, 0x65);
 565     }
 566     if (opc & P_DATA16) {
 567         /* We should never be asking for both 16 and 64-bit operation.  */
 568         tcg_debug_assert((opc & P_REXW) == 0);
 569         tcg_out8(s, 0x66);
 570     }
 571     if (opc & P_SIMDF3) {
 572         tcg_out8(s, 0xf3);
 573     } else if (opc & P_SIMDF2) {
 574         tcg_out8(s, 0xf2);
 575     }
 576
 577     rex = 0;
 578     rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
 579     rex |= (r & 8) >> 1;                /* REX.R */
 580     rex |= (x & 8) >> 2;                /* REX.X */
 581     rex |= (rm & 8) >> 3;               /* REX.B */
 582
 583     /* P_REXB_{R,RM} indicates that the given register is the low byte.
 584        For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
 585        as otherwise the encoding indicates %[abcd]h.  Note that the values
 586        that are ORed in merely indicate that the REX byte must be present;
 587        those bits get discarded in output.  */
 588     rex |= opc & (r >= 4 ? P_REXB_R : 0);
 589     rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
 590
 591     if (rex) {
 592         tcg_out8(s, (uint8_t)(rex | 0x40));
 593     }
 594
 595     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 596         tcg_out8(s, 0x0f);
 597         if (opc & P_EXT38) {
 598             tcg_out8(s, 0x38);
 599         } else if (opc & P_EXT3A) {
 600             tcg_out8(s, 0x3a);
 601         }
 602     }
 603
 604     tcg_out8(s, opc);
 605 }
 606 #else
 607 static void tcg_out_opc(TCGContext *s, int opc)
 608 {
 609     if (opc & P_DATA16) {
 610         tcg_out8(s, 0x66);
 611     }
 612     if (opc & P_SIMDF3) {
 613         tcg_out8(s, 0xf3);
 614     } else if (opc & P_SIMDF2) {
 615         tcg_out8(s, 0xf2);
 616     }
 617     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 618         tcg_out8(s, 0x0f);
 619         if (opc & P_EXT38) {
 620             tcg_out8(s, 0x38);
 621         } else if (opc & P_EXT3A) {
 622             tcg_out8(s, 0x3a);
 623         }
 624     }
 625     tcg_out8(s, opc);
 626 }
 627 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
 628    the 32-bit compilation paths.  This method works with all versions of gcc,
 629    whereas relying on optimization may not be able to exclude them.  */
 630 #define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
 631 #endif
 632
 633 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
 634 {
 635     tcg_out_opc(s, opc, r, rm, 0);
 636     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 637 }
 638
 639 static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
 640                             int rm, int index)
 641 {
 642     int tmp;
 643
 644     /* Use the two byte form if possible, which cannot encode
 645        VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
 646     if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
 647         && ((rm | index) & 8) == 0) {
 648         /* Two byte VEX prefix.  */
 649         tcg_out8(s, 0xc5);
 650
 651         tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
 652     } else {
 653         /* Three byte VEX prefix.  */
 654         tcg_out8(s, 0xc4);
 655
 656         /* VEX.m-mmmm */
 657         if (opc & P_EXT3A) {
 658             tmp = 3;
 659         } else if (opc & P_EXT38) {
 660             tmp = 2;
 661         } else if (opc & P_EXT) {
 662             tmp = 1;
 663         } else {
 664             g_assert_not_reached();
 665         }
 666         tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
 667         tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
 668         tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
 669         tcg_out8(s, tmp);
 670
 671         tmp = (opc & P_REXW ? 0x80 : 0);       /* VEX.W */
 672     }
 673
 674     tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
 675     /* VEX.pp */
 676     if (opc & P_DATA16) {
 677         tmp |= 1;                          /* 0x66 */
 678     } else if (opc & P_SIMDF3) {
 679         tmp |= 2;                          /* 0xf3 */
 680     } else if (opc & P_SIMDF2) {
 681         tmp |= 3;                          /* 0xf2 */
 682     }
 683     tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
 684     tcg_out8(s, tmp);
 685     tcg_out8(s, opc);
 686 }
 687
 688 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
 689 {
 690     tcg_out_vex_opc(s, opc, r, v, rm, 0);
 691     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 692 }
 693
 694 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
 695    We handle either RM and INDEX missing with a negative value.  In 64-bit
 696    mode for absolute addresses, ~RM is the size of the immediate operand
 697    that will follow the instruction.  */
 698
 699 static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
 700                                int shift, intptr_t offset)
 701 {
 702     int mod, len;
 703
 704     if (index < 0 && rm < 0) {
 705         if (TCG_TARGET_REG_BITS == 64) {
 706             /* Try for a rip-relative addressing mode.  This has replaced
 707                the 32-bit-mode absolute addressing encoding.  */
 708             intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
 709             intptr_t disp = offset - pc;
 710             if (disp == (int32_t)disp) {
 711                 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
 712                 tcg_out32(s, disp);
 713                 return;
 714             }
 715
 716             /* Try for an absolute address encoding.  This requires the
 717                use of the MODRM+SIB encoding and is therefore larger than
 718                rip-relative addressing.  */
 719             if (offset == (int32_t)offset) {
 720                 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
 721                 tcg_out8(s, (4 << 3) | 5);
 722                 tcg_out32(s, offset);
 723                 return;
 724             }
 725
 726             /* ??? The memory isn't directly addressable.  */
 727             g_assert_not_reached();
 728         } else {
 729             /* Absolute address.  */
 730             tcg_out8(s, (r << 3) | 5);
 731             tcg_out32(s, offset);
 732             return;
 733         }
 734     }
 735
 736     /* Find the length of the immediate addend.  Note that the encoding
 737        that would be used for (%ebp) indicates absolute addressing.  */
 738     if (rm < 0) {
 739         mod = 0, len = 4, rm = 5;
 740     } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
 741         mod = 0, len = 0;
 742     } else if (offset == (int8_t)offset) {
 743         mod = 0x40, len = 1;
 744     } else {
 745         mod = 0x80, len = 4;
 746     }
 747
 748     /* Use a single byte MODRM format if possible.  Note that the encoding
 749        that would be used for %esp is the escape to the two byte form.  */
 750     if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
 751         /* Single byte MODRM format.  */
 752         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 753     } else {
 754         /* Two byte MODRM+SIB format.  */
 755
 756         /* Note that the encoding that would place %esp into the index
 757            field indicates no index register.  In 64-bit mode, the REX.X
 758            bit counts, so %r12 can be used as the index.  */
 759         if (index < 0) {
 760             index = 4;
 761         } else {
 762             tcg_debug_assert(index != TCG_REG_ESP);
 763         }
 764
 765         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
 766         tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
 767     }
 768
 769     if (len == 1) {
 770         tcg_out8(s, offset);
 771     } else if (len == 4) {
 772         tcg_out32(s, offset);
 773     }
 774 }
 775
 776 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
 777                                      int index, int shift, intptr_t offset)
 778 {
 779     tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 780     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 781 }
 782
 783 static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
 784                                          int rm, int index, int shift,
 785                                          intptr_t offset)
 786 {
 787     tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 788     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 789 }
 790
 791 /* A simplification of the above with no index or shift.  */
 792 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
 793                                         int rm, intptr_t offset)
 794 {
 795     tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
 796 }
 797
 798 static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
 799                                             int v, int rm, intptr_t offset)
 800 {
 801     tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
 802 }
 803
 804 /* Output an opcode with an expected reference to the constant pool.  */
 805 static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
 806 {
 807     tcg_out_opc(s, opc, r, 0, 0);
 808     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 809     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 810     tcg_out32(s, 0);
 811 }
 812
 813 /* Output an opcode with an expected reference to the constant pool.  */
 814 static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
 815 {
 816     tcg_out_vex_opc(s, opc, r, 0, 0, 0);
 817     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 818     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 819     tcg_out32(s, 0);
 820 }
 821
 822 /* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
 823 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
 824 {
 825     /* Propagate an opcode prefix, such as P_REXW.  */
 826     int ext = subop & ~0x7;
 827     subop &= 0x7;
 828
 829     tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
 830 }
 831
 832 static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 833 {
 834     int rexw = 0;
 835
 836     if (arg == ret) {
 837         return true;
 838     }
 839     switch (type) {
 840     case TCG_TYPE_I64:
 841         rexw = P_REXW;
 842         /* fallthru */
 843     case TCG_TYPE_I32:
 844         if (ret < 16) {
 845             if (arg < 16) {
 846                 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
 847             } else {
 848                 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
 849             }
 850         } else {
 851             if (arg < 16) {
 852                 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
 853             } else {
 854                 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 855             }
 856         }
 857         break;
 858
 859     case TCG_TYPE_V64:
 860         tcg_debug_assert(ret >= 16 && arg >= 16);
 861         tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 862         break;
 863     case TCG_TYPE_V128:
 864         tcg_debug_assert(ret >= 16 && arg >= 16);
 865         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
 866         break;
 867     case TCG_TYPE_V256:
 868         tcg_debug_assert(ret >= 16 && arg >= 16);
 869         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
 870         break;
 871
 872     default:
 873         g_assert_not_reached();
 874     }
 875     return true;
 876 }
 877
 878 static const int avx2_dup_insn[4] = {
 879     OPC_VPBROADCASTB, OPC_VPBROADCASTW,
 880     OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
 881 };
 882
 883 static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
 884                             TCGReg r, TCGReg a)
 885 {
 886     if (have_avx2) {
 887         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 888         tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
 889     } else {
 890         switch (vece) {
 891         case MO_8:
 892             /* ??? With zero in a register, use PSHUFB.  */
 893             tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
 894             a = r;
 895             /* FALLTHRU */
 896         case MO_16:
 897             tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
 898             a = r;
 899             /* FALLTHRU */
 900         case MO_32:
 901             tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
 902             /* imm8 operand: all output lanes selected from input lane 0.  */
 903             tcg_out8(s, 0);
 904             break;
 905         case MO_64:
 906             tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
 907             break;
 908         default:
 909             g_assert_not_reached();
 910         }
 911     }
 912     return true;
 913 }
 914
 915 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
 916                              TCGReg r, TCGReg base, intptr_t offset)
 917 {
 918     if (have_avx2) {
 919         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 920         tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
 921                                  r, 0, base, offset);
 922     } else {
 923         switch (vece) {
 924         case MO_64:
 925             tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
 926             break;
 927         case MO_32:
 928             tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
 929             break;
 930         case MO_16:
 931             tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
 932             tcg_out8(s, 0); /* imm8 */
 933             tcg_out_dup_vec(s, type, vece, r, r);
 934             break;
 935         case MO_8:
 936             tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
 937             tcg_out8(s, 0); /* imm8 */
 938             tcg_out_dup_vec(s, type, vece, r, r);
 939             break;
 940         default:
 941             g_assert_not_reached();
 942         }
 943     }
 944     return true;
 945 }
 946
 947 static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
 948                              TCGReg ret, tcg_target_long arg)
 949 {
 950     int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 951
 952     if (arg == 0) {
 953         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 954         return;
 955     }
 956     if (arg == -1) {
 957         tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
 958         return;
 959     }
 960
 961     if (TCG_TARGET_REG_BITS == 64) {
 962         if (type == TCG_TYPE_V64) {
 963             tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
 964         } else if (have_avx2) {
 965             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
 966         } else {
 967             tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
 968         }
 969         new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
 970     } else {
 971         if (have_avx2) {
 972             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTW + vex_l, ret);
 973         } else {
 974             tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
 975         }
 976         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 977     }
 978 }
 979
 980 static void tcg_out_movi(TCGContext *s, TCGType type,
 981                          TCGReg ret, tcg_target_long arg)
 982 {
 983     tcg_target_long diff;
 984
 985     switch (type) {
 986     case TCG_TYPE_I32:
 987 #if TCG_TARGET_REG_BITS == 64
 988     case TCG_TYPE_I64:
 989 #endif
 990         if (ret < 16) {
 991             break;
 992         }
 993         /* fallthru */
 994     case TCG_TYPE_V64:
 995     case TCG_TYPE_V128:
 996     case TCG_TYPE_V256:
 997         tcg_debug_assert(ret >= 16);
 998         tcg_out_dupi_vec(s, type, ret, arg);
 999         return;
1000     default:
1001         g_assert_not_reached();
1002     }
1003
1004     if (arg == 0) {
1005         tgen_arithr(s, ARITH_XOR, ret, ret);
1006         return;
1007     }
1008     if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1009         tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1010         tcg_out32(s, arg);
1011         return;
1012     }
1013     if (arg == (int32_t)arg) {
1014         tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1015         tcg_out32(s, arg);
1016         return;
1017     }
1018
1019     /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1020     diff = arg - ((uintptr_t)s->code_ptr + 7);
1021     if (diff == (int32_t)diff) {
1022         tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1023         tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1024         tcg_out32(s, diff);
1025         return;
1026     }
1027
1028     tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1029     tcg_out64(s, arg);
1030 }
1031
1032 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1033 {
1034     if (val == (int8_t)val) {
1035         tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1036         tcg_out8(s, val);
1037     } else if (val == (int32_t)val) {
1038         tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1039         tcg_out32(s, val);
1040     } else {
1041         tcg_abort();
1042     }
1043 }
1044
1045 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1046 {
1047     /* Given the strength of x86 memory ordering, we only need care for
1048        store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1049        faster than "mfence", so don't bother with the sse insn.  */
1050     if (a0 & TCG_MO_ST_LD) {
1051         tcg_out8(s, 0xf0);
1052         tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1053         tcg_out8(s, 0);
1054     }
1055 }
1056
1057 static inline void tcg_out_push(TCGContext *s, int reg)
1058 {
1059     tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1060 }
1061
1062 static inline void tcg_out_pop(TCGContext *s, int reg)
1063 {
1064     tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1065 }
1066
1067 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1068                        TCGReg arg1, intptr_t arg2)
1069 {
1070     switch (type) {
1071     case TCG_TYPE_I32:
1072         if (ret < 16) {
1073             tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1074         } else {
1075             tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1076         }
1077         break;
1078     case TCG_TYPE_I64:
1079         if (ret < 16) {
1080             tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1081             break;
1082         }
1083         /* FALLTHRU */
1084     case TCG_TYPE_V64:
1085         /* There is no instruction that can validate 8-byte alignment.  */
1086         tcg_debug_assert(ret >= 16);
1087         tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1088         break;
1089     case TCG_TYPE_V128:
1090         /*
1091          * The gvec infrastructure is asserts that v128 vector loads
1092          * and stores use a 16-byte aligned offset.  Validate that the
1093          * final pointer is aligned by using an insn that will SIGSEGV.
1094          */
1095         tcg_debug_assert(ret >= 16);
1096         tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1097         break;
1098     case TCG_TYPE_V256:
1099         /*
1100          * The gvec infrastructure only requires 16-byte alignment,
1101          * so here we must use an unaligned load.
1102          */
1103         tcg_debug_assert(ret >= 16);
1104         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1105                                  ret, 0, arg1, arg2);
1106         break;
1107     default:
1108         g_assert_not_reached();
1109     }
1110 }
1111
1112 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1113                        TCGReg arg1, intptr_t arg2)
1114 {
1115     switch (type) {
1116     case TCG_TYPE_I32:
1117         if (arg < 16) {
1118             tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1119         } else {
1120             tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1121         }
1122         break;
1123     case TCG_TYPE_I64:
1124         if (arg < 16) {
1125             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1126             break;
1127         }
1128         /* FALLTHRU */
1129     case TCG_TYPE_V64:
1130         /* There is no instruction that can validate 8-byte alignment.  */
1131         tcg_debug_assert(arg >= 16);
1132         tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1133         break;
1134     case TCG_TYPE_V128:
1135         /*
1136          * The gvec infrastructure is asserts that v128 vector loads
1137          * and stores use a 16-byte aligned offset.  Validate that the
1138          * final pointer is aligned by using an insn that will SIGSEGV.
1139          */
1140         tcg_debug_assert(arg >= 16);
1141         tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1142         break;
1143     case TCG_TYPE_V256:
1144         /*
1145          * The gvec infrastructure only requires 16-byte alignment,
1146          * so here we must use an unaligned store.
1147          */
1148         tcg_debug_assert(arg >= 16);
1149         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1150                                  arg, 0, arg1, arg2);
1151         break;
1152     default:
1153         g_assert_not_reached();
1154     }
1155 }
1156
1157 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1158                         TCGReg base, intptr_t ofs)
1159 {
1160     int rexw = 0;
1161     if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1162         if (val != (int32_t)val) {
1163             return false;
1164         }
1165         rexw = P_REXW;
1166     } else if (type != TCG_TYPE_I32) {
1167         return false;
1168     }
1169     tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1170     tcg_out32(s, val);
1171     return true;
1172 }
1173
1174 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1175 {
1176     /* Propagate an opcode prefix, such as P_DATA16.  */
1177     int ext = subopc & ~0x7;
1178     subopc &= 0x7;
1179
1180     if (count == 1) {
1181         tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1182     } else {
1183         tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1184         tcg_out8(s, count);
1185     }
1186 }
1187
1188 static inline void tcg_out_bswap32(TCGContext *s, int reg)
1189 {
1190     tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1191 }
1192
1193 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1194 {
1195     tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1196 }
1197
1198 static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
1199 {
1200     /* movzbl */
1201     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1202     tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1203 }
1204
1205 static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
1206 {
1207     /* movsbl */
1208     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1209     tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1210 }
1211
1212 static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
1213 {
1214     /* movzwl */
1215     tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1216 }
1217
1218 static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
1219 {
1220     /* movsw[lq] */
1221     tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1222 }
1223
1224 static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
1225 {
1226     /* 32-bit mov zero extends.  */
1227     tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1228 }
1229
1230 static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
1231 {
1232     tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1233 }
1234
1235 static inline void tcg_out_bswap64(TCGContext *s, int reg)
1236 {
1237     tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1238 }
1239
1240 static void tgen_arithi(TCGContext *s, int c, int r0,
1241                         tcg_target_long val, int cf)
1242 {
1243     int rexw = 0;
1244
1245     if (TCG_TARGET_REG_BITS == 64) {
1246         rexw = c & -8;
1247         c &= 7;
1248     }
1249
1250     /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1251        partial flags update stalls on Pentium4 and are not recommended
1252        by current Intel optimization manuals.  */
1253     if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1254         int is_inc = (c == ARITH_ADD) ^ (val < 0);
1255         if (TCG_TARGET_REG_BITS == 64) {
1256             /* The single-byte increment encodings are re-tasked as the
1257                REX prefixes.  Use the MODRM encoding.  */
1258             tcg_out_modrm(s, OPC_GRP5 + rexw,
1259                           (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1260         } else {
1261             tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1262         }
1263         return;
1264     }
1265
1266     if (c == ARITH_AND) {
1267         if (TCG_TARGET_REG_BITS == 64) {
1268             if (val == 0xffffffffu) {
1269                 tcg_out_ext32u(s, r0, r0);
1270                 return;
1271             }
1272             if (val == (uint32_t)val) {
1273                 /* AND with no high bits set can use a 32-bit operation.  */
1274                 rexw = 0;
1275             }
1276         }
1277         if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1278             tcg_out_ext8u(s, r0, r0);
1279             return;
1280         }
1281         if (val == 0xffffu) {
1282             tcg_out_ext16u(s, r0, r0);
1283             return;
1284         }
1285     }
1286
1287     if (val == (int8_t)val) {
1288         tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1289         tcg_out8(s, val);
1290         return;
1291     }
1292     if (rexw == 0 || val == (int32_t)val) {
1293         tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1294         tcg_out32(s, val);
1295         return;
1296     }
1297
1298     tcg_abort();
1299 }
1300
1301 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1302 {
1303     if (val != 0) {
1304         tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1305     }
1306 }
1307
1308 /* Use SMALL != 0 to force a short forward branch.  */
1309 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
1310 {
1311     int32_t val, val1;
1312
1313     if (l->has_value) {
1314         val = tcg_pcrel_diff(s, l->u.value_ptr);
1315         val1 = val - 2;
1316         if ((int8_t)val1 == val1) {
1317             if (opc == -1) {
1318                 tcg_out8(s, OPC_JMP_short);
1319             } else {
1320                 tcg_out8(s, OPC_JCC_short + opc);
1321             }
1322             tcg_out8(s, val1);
1323         } else {
1324             if (small) {
1325                 tcg_abort();
1326             }
1327             if (opc == -1) {
1328                 tcg_out8(s, OPC_JMP_long);
1329                 tcg_out32(s, val - 5);
1330             } else {
1331                 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1332                 tcg_out32(s, val - 6);
1333             }
1334         }
1335     } else if (small) {
1336         if (opc == -1) {
1337             tcg_out8(s, OPC_JMP_short);
1338         } else {
1339             tcg_out8(s, OPC_JCC_short + opc);
1340         }
1341         tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1342         s->code_ptr += 1;
1343     } else {
1344         if (opc == -1) {
1345             tcg_out8(s, OPC_JMP_long);
1346         } else {
1347             tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1348         }
1349         tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1350         s->code_ptr += 4;
1351     }
1352 }
1353
1354 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1355                         int const_arg2, int rexw)
1356 {
1357     if (const_arg2) {
1358         if (arg2 == 0) {
1359             /* test r, r */
1360             tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1361         } else {
1362             tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1363         }
1364     } else {
1365         tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1366     }
1367 }
1368
1369 static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1370                              TCGArg arg1, TCGArg arg2, int const_arg2,
1371                              TCGLabel *label, int small)
1372 {
1373     tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1374     tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1375 }
1376
1377 #if TCG_TARGET_REG_BITS == 64
1378 static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1379                              TCGArg arg1, TCGArg arg2, int const_arg2,
1380                              TCGLabel *label, int small)
1381 {
1382     tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1383     tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1384 }
1385 #else
1386 /* XXX: we implement it at the target level to avoid having to
1387    handle cross basic blocks temporaries */
1388 static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1389                             const int *const_args, int small)
1390 {
1391     TCGLabel *label_next = gen_new_label();
1392     TCGLabel *label_this = arg_label(args[5]);
1393
1394     switch(args[4]) {
1395     case TCG_COND_EQ:
1396         tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1397                          label_next, 1);
1398         tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1399                          label_this, small);
1400         break;
1401     case TCG_COND_NE:
1402         tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1403                          label_this, small);
1404         tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1405                          label_this, small);
1406         break;
1407     case TCG_COND_LT:
1408         tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1409                          label_this, small);
1410         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1411         tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1412                          label_this, small);
1413         break;
1414     case TCG_COND_LE:
1415         tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1416                          label_this, small);
1417         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1418         tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1419                          label_this, small);
1420         break;
1421     case TCG_COND_GT:
1422         tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1423                          label_this, small);
1424         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1425         tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1426                          label_this, small);
1427         break;
1428     case TCG_COND_GE:
1429         tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1430                          label_this, small);
1431         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1432         tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1433                          label_this, small);
1434         break;
1435     case TCG_COND_LTU:
1436         tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1437                          label_this, small);
1438         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1439         tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1440                          label_this, small);
1441         break;
1442     case TCG_COND_LEU:
1443         tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1444                          label_this, small);
1445         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1446         tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1447                          label_this, small);
1448         break;
1449     case TCG_COND_GTU:
1450         tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1451                          label_this, small);
1452         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1453         tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1454                          label_this, small);
1455         break;
1456     case TCG_COND_GEU:
1457         tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1458                          label_this, small);
1459         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1460         tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1461                          label_this, small);
1462         break;
1463     default:
1464         tcg_abort();
1465     }
1466     tcg_out_label(s, label_next, s->code_ptr);
1467 }
1468 #endif
1469
1470 static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1471                               TCGArg arg1, TCGArg arg2, int const_arg2)
1472 {
1473     tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1474     tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1475     tcg_out_ext8u(s, dest, dest);
1476 }
1477
1478 #if TCG_TARGET_REG_BITS == 64
1479 static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1480                               TCGArg arg1, TCGArg arg2, int const_arg2)
1481 {
1482     tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1483     tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1484     tcg_out_ext8u(s, dest, dest);
1485 }
1486 #else
1487 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1488                              const int *const_args)
1489 {
1490     TCGArg new_args[6];
1491     TCGLabel *label_true, *label_over;
1492
1493     memcpy(new_args, args+1, 5*sizeof(TCGArg));
1494
1495     if (args[0] == args[1] || args[0] == args[2]
1496         || (!const_args[3] && args[0] == args[3])
1497         || (!const_args[4] && args[0] == args[4])) {
1498         /* When the destination overlaps with one of the argument
1499            registers, don't do anything tricky.  */
1500         label_true = gen_new_label();
1501         label_over = gen_new_label();
1502
1503         new_args[5] = label_arg(label_true);
1504         tcg_out_brcond2(s, new_args, const_args+1, 1);
1505
1506         tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1507         tcg_out_jxx(s, JCC_JMP, label_over, 1);
1508         tcg_out_label(s, label_true, s->code_ptr);
1509
1510         tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1511         tcg_out_label(s, label_over, s->code_ptr);
1512     } else {
1513         /* When the destination does not overlap one of the arguments,
1514            clear the destination first, jump if cond false, and emit an
1515            increment in the true case.  This results in smaller code.  */
1516
1517         tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1518
1519         label_over = gen_new_label();
1520         new_args[4] = tcg_invert_cond(new_args[4]);
1521         new_args[5] = label_arg(label_over);
1522         tcg_out_brcond2(s, new_args, const_args+1, 1);
1523
1524         tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1525         tcg_out_label(s, label_over, s->code_ptr);
1526     }
1527 }
1528 #endif
1529
1530 static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1531                          TCGReg dest, TCGReg v1)
1532 {
1533     if (have_cmov) {
1534         tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1535     } else {
1536         TCGLabel *over = gen_new_label();
1537         tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1538         tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1539         tcg_out_label(s, over, s->code_ptr);
1540     }
1541 }
1542
1543 static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1544                               TCGReg c1, TCGArg c2, int const_c2,
1545                               TCGReg v1)
1546 {
1547     tcg_out_cmp(s, c1, c2, const_c2, 0);
1548     tcg_out_cmov(s, cond, 0, dest, v1);
1549 }
1550
1551 #if TCG_TARGET_REG_BITS == 64
1552 static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1553                               TCGReg c1, TCGArg c2, int const_c2,
1554                               TCGReg v1)
1555 {
1556     tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1557     tcg_out_cmov(s, cond, P_REXW, dest, v1);
1558 }
1559 #endif
1560
1561 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1562                         TCGArg arg2, bool const_a2)
1563 {
1564     if (have_bmi1) {
1565         tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1566         if (const_a2) {
1567             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1568         } else {
1569             tcg_debug_assert(dest != arg2);
1570             tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1571         }
1572     } else {
1573         tcg_debug_assert(dest != arg2);
1574         tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1575         tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1576     }
1577 }
1578
1579 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1580                         TCGArg arg2, bool const_a2)
1581 {
1582     if (have_lzcnt) {
1583         tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1584         if (const_a2) {
1585             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1586         } else {
1587             tcg_debug_assert(dest != arg2);
1588             tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1589         }
1590     } else {
1591         tcg_debug_assert(!const_a2);
1592         tcg_debug_assert(dest != arg1);
1593         tcg_debug_assert(dest != arg2);
1594
1595         /* Recall that the output of BSR is the index not the count.  */
1596         tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1597         tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1598
1599         /* Since we have destroyed the flags from BSR, we have to re-test.  */
1600         tcg_out_cmp(s, arg1, 0, 1, rexw);
1601         tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1602     }
1603 }
1604
1605 static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
1606 {
1607     intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1608
1609     if (disp == (int32_t)disp) {
1610         tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1611         tcg_out32(s, disp);
1612     } else {
1613         /* rip-relative addressing into the constant pool.
1614            This is 6 + 8 = 14 bytes, as compared to using an
1615            an immediate load 10 + 6 = 16 bytes, plus we may
1616            be able to re-use the pool constant for more calls.  */
1617         tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1618         tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1619         new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1620         tcg_out32(s, 0);
1621     }
1622 }
1623
1624 static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *dest)
1625 {
1626     tcg_out_branch(s, 1, dest);
1627 }
1628
1629 static void tcg_out_jmp(TCGContext *s, tcg_insn_unit *dest)
1630 {
1631     tcg_out_branch(s, 0, dest);
1632 }
1633
1634 static void tcg_out_nopn(TCGContext *s, int n)
1635 {
1636     int i;
1637     /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1638      * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1639      * duplicate prefix, and all of the interesting recent cores can
1640      * decode and discard the duplicates in a single cycle.
1641      */
1642     tcg_debug_assert(n >= 1);
1643     for (i = 1; i < n; ++i) {
1644         tcg_out8(s, 0x66);
1645     }
1646     tcg_out8(s, 0x90);
1647 }
1648
1649 #if defined(CONFIG_SOFTMMU)
1650 #include "tcg-ldst.inc.c"
1651
1652 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1653  *                                     int mmu_idx, uintptr_t ra)
1654  */
1655 static void * const qemu_ld_helpers[16] = {
1656     [MO_UB]   = helper_ret_ldub_mmu,
1657     [MO_LEUW] = helper_le_lduw_mmu,
1658     [MO_LEUL] = helper_le_ldul_mmu,
1659     [MO_LEQ]  = helper_le_ldq_mmu,
1660     [MO_BEUW] = helper_be_lduw_mmu,
1661     [MO_BEUL] = helper_be_ldul_mmu,
1662     [MO_BEQ]  = helper_be_ldq_mmu,
1663 };
1664
1665 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1666  *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
1667  */
1668 static void * const qemu_st_helpers[16] = {
1669     [MO_UB]   = helper_ret_stb_mmu,
1670     [MO_LEUW] = helper_le_stw_mmu,
1671     [MO_LEUL] = helper_le_stl_mmu,
1672     [MO_LEQ]  = helper_le_stq_mmu,
1673     [MO_BEUW] = helper_be_stw_mmu,
1674     [MO_BEUL] = helper_be_stl_mmu,
1675     [MO_BEQ]  = helper_be_stq_mmu,
1676 };
1677
1678 /* Perform the TLB load and compare.
1679
1680    Inputs:
1681    ADDRLO and ADDRHI contain the low and high part of the address.
1682
1683    MEM_INDEX and S_BITS are the memory context and log2 size of the load.
1684
1685    WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1686    This should be offsetof addr_read or addr_write.
1687
1688    Outputs:
1689    LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1690    positions of the displacements of forward jumps to the TLB miss case.
1691
1692    Second argument register is loaded with the low part of the address.
1693    In the TLB hit case, it has been adjusted as indicated by the TLB
1694    and so is a host address.  In the TLB miss case, it continues to
1695    hold a guest address.
1696
1697    First argument register is clobbered.  */
1698
1699 static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1700                                     int mem_index, TCGMemOp opc,
1701                                     tcg_insn_unit **label_ptr, int which)
1702 {
1703     const TCGReg r0 = TCG_REG_L0;
1704     const TCGReg r1 = TCG_REG_L1;
1705     TCGType ttype = TCG_TYPE_I32;
1706     TCGType tlbtype = TCG_TYPE_I32;
1707     int trexw = 0, hrexw = 0, tlbrexw = 0;
1708     unsigned a_bits = get_alignment_bits(opc);
1709     unsigned s_bits = opc & MO_SIZE;
1710     unsigned a_mask = (1 << a_bits) - 1;
1711     unsigned s_mask = (1 << s_bits) - 1;
1712     target_ulong tlb_mask;
1713
1714     if (TCG_TARGET_REG_BITS == 64) {
1715         if (TARGET_LONG_BITS == 64) {
1716             ttype = TCG_TYPE_I64;
1717             trexw = P_REXW;
1718         }
1719         if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1720             hrexw = P_REXW;
1721             if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
1722                 tlbtype = TCG_TYPE_I64;
1723                 tlbrexw = P_REXW;
1724             }
1725         }
1726     }
1727
1728     tcg_out_mov(s, tlbtype, r0, addrlo);
1729     tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1730                    TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1731
1732     tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
1733                          offsetof(CPUArchState, tlb_mask[mem_index]));
1734
1735     tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
1736                          offsetof(CPUArchState, tlb_table[mem_index]));
1737
1738     /* If the required alignment is at least as large as the access, simply
1739        copy the address and mask.  For lesser alignments, check that we don't
1740        cross pages for the complete access.  */
1741     if (a_bits >= s_bits) {
1742         tcg_out_mov(s, ttype, r1, addrlo);
1743     } else {
1744         tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1745     }
1746     tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1747     tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1748
1749     /* cmp 0(r0), r1 */
1750     tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);
1751
1752     /* Prepare for both the fast path add of the tlb addend, and the slow
1753        path function argument setup.  */
1754     tcg_out_mov(s, ttype, r1, addrlo);
1755
1756     /* jne slow_path */
1757     tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1758     label_ptr[0] = s->code_ptr;
1759     s->code_ptr += 4;
1760
1761     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1762         /* cmp 4(r0), addrhi */
1763         tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);
1764
1765         /* jne slow_path */
1766         tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1767         label_ptr[1] = s->code_ptr;
1768         s->code_ptr += 4;
1769     }
1770
1771     /* TLB Hit.  */
1772
1773     /* add addend(r0), r1 */
1774     tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1775                          offsetof(CPUTLBEntry, addend));
1776 }
1777
1778 /*
1779  * Record the context of a call to the out of line helper code for the slow path
1780  * for a load or store, so that we can later generate the correct helper code
1781  */
1782 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
1783                                 TCGMemOpIdx oi,
1784                                 TCGReg datalo, TCGReg datahi,
1785                                 TCGReg addrlo, TCGReg addrhi,
1786                                 tcg_insn_unit *raddr,
1787                                 tcg_insn_unit **label_ptr)
1788 {
1789     TCGLabelQemuLdst *label = new_ldst_label(s);
1790
1791     label->is_ld = is_ld;
1792     label->oi = oi;
1793     label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
1794     label->datalo_reg = datalo;
1795     label->datahi_reg = datahi;
1796     label->addrlo_reg = addrlo;
1797     label->addrhi_reg = addrhi;
1798     label->raddr = raddr;
1799     label->label_ptr[0] = label_ptr[0];
1800     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1801         label->label_ptr[1] = label_ptr[1];
1802     }
1803 }
1804
1805 /*
1806  * Generate code for the slow path for a load at the end of block
1807  */
1808 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1809 {
1810     TCGMemOpIdx oi = l->oi;
1811     TCGMemOp opc = get_memop(oi);
1812     TCGReg data_reg;
1813     tcg_insn_unit **label_ptr = &l->label_ptr[0];
1814     int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
1815
1816     /* resolve label address */
1817     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1818     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1819         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1820     }
1821
1822     if (TCG_TARGET_REG_BITS == 32) {
1823         int ofs = 0;
1824
1825         tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1826         ofs += 4;
1827
1828         tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1829         ofs += 4;
1830
1831         if (TARGET_LONG_BITS == 64) {
1832             tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1833             ofs += 4;
1834         }
1835
1836         tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1837         ofs += 4;
1838
1839         tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1840     } else {
1841         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1842         /* The second argument is already loaded with addrlo.  */
1843         tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1844         tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1845                      (uintptr_t)l->raddr);
1846     }
1847
1848     tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1849
1850     data_reg = l->datalo_reg;
1851     switch (opc & MO_SSIZE) {
1852     case MO_SB:
1853         tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
1854         break;
1855     case MO_SW:
1856         tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
1857         break;
1858 #if TCG_TARGET_REG_BITS == 64
1859     case MO_SL:
1860         tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1861         break;
1862 #endif
1863     case MO_UB:
1864     case MO_UW:
1865         /* Note that the helpers have zero-extended to tcg_target_long.  */
1866     case MO_UL:
1867         tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1868         break;
1869     case MO_Q:
1870         if (TCG_TARGET_REG_BITS == 64) {
1871             tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1872         } else if (data_reg == TCG_REG_EDX) {
1873             /* xchg %edx, %eax */
1874             tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1875             tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1876         } else {
1877             tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1878             tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1879         }
1880         break;
1881     default:
1882         tcg_abort();
1883     }
1884
1885     /* Jump to the code corresponding to next IR of qemu_st */
1886     tcg_out_jmp(s, l->raddr);
1887     return true;
1888 }
1889
1890 /*
1891  * Generate code for the slow path for a store at the end of block
1892  */
1893 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1894 {
1895     TCGMemOpIdx oi = l->oi;
1896     TCGMemOp opc = get_memop(oi);
1897     TCGMemOp s_bits = opc & MO_SIZE;
1898     tcg_insn_unit **label_ptr = &l->label_ptr[0];
1899     TCGReg retaddr;
1900
1901     /* resolve label address */
1902     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1903     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1904         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1905     }
1906
1907     if (TCG_TARGET_REG_BITS == 32) {
1908         int ofs = 0;
1909
1910         tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1911         ofs += 4;
1912
1913         tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1914         ofs += 4;
1915
1916         if (TARGET_LONG_BITS == 64) {
1917             tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1918             ofs += 4;
1919         }
1920
1921         tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1922         ofs += 4;
1923
1924         if (s_bits == MO_64) {
1925             tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1926             ofs += 4;
1927         }
1928
1929         tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1930         ofs += 4;
1931
1932         retaddr = TCG_REG_EAX;
1933         tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1934         tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1935     } else {
1936         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1937         /* The second argument is already loaded with addrlo.  */
1938         tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1939                     tcg_target_call_iarg_regs[2], l->datalo_reg);
1940         tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1941
1942         if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1943             retaddr = tcg_target_call_iarg_regs[4];
1944             tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1945         } else {
1946             retaddr = TCG_REG_RAX;
1947             tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1948             tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1949                        TCG_TARGET_CALL_STACK_OFFSET);
1950         }
1951     }
1952
1953     /* "Tail call" to the helper, with the return address back inline.  */
1954     tcg_out_push(s, retaddr);
1955     tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1956     return true;
1957 }
1958 #elif TCG_TARGET_REG_BITS == 32
/*
 * 32-bit hosts: the guest-base addressing parameters are compile-time
 * constants.  They are passed as the seg, index and ofs arguments of
 * tcg_out_qemu_ld_direct()/tcg_out_qemu_st_direct().  The negative
 * replacement list is parenthesized so that it expands safely in any
 * expression context.
 */
# define x86_guest_base_seg     0
# define x86_guest_base_index   (-1)
# define x86_guest_base_offset  guest_base
1962 #else
/*
 * 64-bit hosts: guest-base addressing parameters.  They are passed as the
 * index, seg and ofs arguments of tcg_out_qemu_ld_direct() and
 * tcg_out_qemu_st_direct().
 */
/* Index operand; starts out as -1 (presumably "no index register"). */
static int x86_guest_base_index = -1;
/* Segment flag added to the opcode; starts out as 0. */
static int x86_guest_base_seg;
/* Constant displacement; starts out as 0. */
static int32_t x86_guest_base_offset;
1966 # if defined(__x86_64__) && defined(__linux__)
1967 #  include <asm/prctl.h>
1968 #  include <sys/prctl.h>
1969 int arch_prctl(int code, unsigned long addr);
1970 static inline int setup_guest_base_seg(void)
1971 {
1972     if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1973         return P_GS;
1974     }
1975     return 0;
1976 }
1977 # elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
1978 #  include <machine/sysarch.h>
1979 static inline int setup_guest_base_seg(void)
1980 {
1981     if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1982         return P_GS;
1983     }
1984     return 0;
1985 }
1986 # else
1987 static inline int setup_guest_base_seg(void)
1988 {
1989     return 0;
1990 }
1991 # endif
1992 #endif /* SOFTMMU */
1993
1994 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1995                                    TCGReg base, int index, intptr_t ofs,
1996                                    int seg, bool is64, TCGMemOp memop)
1997 {
1998     const TCGMemOp real_bswap = memop & MO_BSWAP;
1999     TCGMemOp bswap = real_bswap;
2000     int rexw = is64 * P_REXW;
2001     int movop = OPC_MOVL_GvEv;
2002
2003     if (have_movbe && real_bswap) {
2004         bswap = 0;
2005         movop = OPC_MOVBE_GyMy;
2006     }
2007
2008     switch (memop & MO_SSIZE) {
2009     case MO_UB:
2010         tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
2011                                  base, index, 0, ofs);
2012         break;
2013     case MO_SB:
2014         tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
2015                                  base, index, 0, ofs);
2016         break;
2017     case MO_UW:
2018         tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
2019                                  base, index, 0, ofs);
2020         if (real_bswap) {
2021             tcg_out_rolw_8(s, datalo);
2022         }
2023         break;
2024     case MO_SW:
2025         if (real_bswap) {
2026             if (have_movbe) {
2027                 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2028                                          datalo, base, index, 0, ofs);
2029             } else {
2030                 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
2031                                          base, index, 0, ofs);
2032                 tcg_out_rolw_8(s, datalo);
2033             }
2034             tcg_out_modrm(s, OPC_MOVSWL + rexw, datalo, datalo);
2035         } else {
2036             tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
2037                                      datalo, base, index, 0, ofs);
2038         }
2039         break;
2040     case MO_UL:
2041         tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2042         if (bswap) {
2043             tcg_out_bswap32(s, datalo);
2044         }
2045         break;
2046 #if TCG_TARGET_REG_BITS == 64
2047     case MO_SL:
2048         if (real_bswap) {
2049             tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2050                                      base, index, 0, ofs);
2051             if (bswap) {
2052                 tcg_out_bswap32(s, datalo);
2053             }
2054             tcg_out_ext32s(s, datalo, datalo);
2055         } else {
2056             tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
2057                                      base, index, 0, ofs);
2058         }
2059         break;
2060 #endif
2061     case MO_Q:
2062         if (TCG_TARGET_REG_BITS == 64) {
2063             tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2064                                      base, index, 0, ofs);
2065             if (bswap) {
2066                 tcg_out_bswap64(s, datalo);
2067             }
2068         } else {
2069             if (real_bswap) {
2070                 int t = datalo;
2071                 datalo = datahi;
2072                 datahi = t;
2073             }
2074             if (base != datalo) {
2075                 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2076                                          base, index, 0, ofs);
2077                 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2078                                          base, index, 0, ofs + 4);
2079             } else {
2080                 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2081                                          base, index, 0, ofs + 4);
2082                 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2083                                          base, index, 0, ofs);
2084             }
2085             if (bswap) {
2086                 tcg_out_bswap32(s, datalo);
2087                 tcg_out_bswap32(s, datahi);
2088             }
2089         }
2090         break;
2091     default:
2092         tcg_abort();
2093     }
2094 }
2095
2096 /* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
2097    EAX. It will be useful once fixed registers globals are less
2098    common. */
2099 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
2100 {
2101     TCGReg datalo, datahi, addrlo;
2102     TCGReg addrhi __attribute__((unused));
2103     TCGMemOpIdx oi;
2104     TCGMemOp opc;
2105 #if defined(CONFIG_SOFTMMU)
2106     int mem_index;
2107     tcg_insn_unit *label_ptr[2];
2108 #endif
2109
2110     datalo = *args++;
2111     datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2112     addrlo = *args++;
2113     addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2114     oi = *args++;
2115     opc = get_memop(oi);
2116
2117 #if defined(CONFIG_SOFTMMU)
2118     mem_index = get_mmuidx(oi);
2119
2120     tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2121                      label_ptr, offsetof(CPUTLBEntry, addr_read));
2122
2123     /* TLB Hit.  */
2124     tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
2125
2126     /* Record the current context of a load into ldst label */
2127     add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2128                         s->code_ptr, label_ptr);
2129 #else
2130     tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2131                            x86_guest_base_offset, x86_guest_base_seg,
2132                            is64, opc);
2133 #endif
2134 }
2135
2136 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2137                                    TCGReg base, int index, intptr_t ofs,
2138                                    int seg, TCGMemOp memop)
2139 {
2140     /* ??? Ideally we wouldn't need a scratch register.  For user-only,
2141        we could perform the bswap twice to restore the original value
2142        instead of moving to the scratch.  But as it is, the L constraint
2143        means that TCG_REG_L0 is definitely free here.  */
2144     const TCGReg scratch = TCG_REG_L0;
2145     const TCGMemOp real_bswap = memop & MO_BSWAP;
2146     TCGMemOp bswap = real_bswap;
2147     int movop = OPC_MOVL_EvGv;
2148
2149     if (have_movbe && real_bswap) {
2150         bswap = 0;
2151         movop = OPC_MOVBE_MyGy;
2152     }
2153
2154     switch (memop & MO_SIZE) {
2155     case MO_8:
2156         /* In 32-bit mode, 8-bit stores can only happen from [abcd]x.
2157            Use the scratch register if necessary.  */
2158         if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
2159             tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2160             datalo = scratch;
2161         }
2162         tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2163                                  datalo, base, index, 0, ofs);
2164         break;
2165     case MO_16:
2166         if (bswap) {
2167             tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2168             tcg_out_rolw_8(s, scratch);
2169             datalo = scratch;
2170         }
2171         tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
2172                                  base, index, 0, ofs);
2173         break;
2174     case MO_32:
2175         if (bswap) {
2176             tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2177             tcg_out_bswap32(s, scratch);
2178             datalo = scratch;
2179         }
2180         tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2181         break;
2182     case MO_64:
2183         if (TCG_TARGET_REG_BITS == 64) {
2184             if (bswap) {
2185                 tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo);
2186                 tcg_out_bswap64(s, scratch);
2187                 datalo = scratch;
2188             }
2189             tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2190                                      base, index, 0, ofs);
2191         } else if (bswap) {
2192             tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
2193             tcg_out_bswap32(s, scratch);
2194             tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
2195                                      base, index, 0, ofs);
2196             tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2197             tcg_out_bswap32(s, scratch);
2198             tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
2199                                      base, index, 0, ofs + 4);
2200         } else {
2201             if (real_bswap) {
2202                 int t = datalo;
2203                 datalo = datahi;
2204                 datahi = t;
2205             }
2206             tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2207                                      base, index, 0, ofs);
2208             tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2209                                      base, index, 0, ofs + 4);
2210         }
2211         break;
2212     default:
2213         tcg_abort();
2214     }
2215 }
2216
2217 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2218 {
2219     TCGReg datalo, datahi, addrlo;
2220     TCGReg addrhi __attribute__((unused));
2221     TCGMemOpIdx oi;
2222     TCGMemOp opc;
2223 #if defined(CONFIG_SOFTMMU)
2224     int mem_index;
2225     tcg_insn_unit *label_ptr[2];
2226 #endif
2227
2228     datalo = *args++;
2229     datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2230     addrlo = *args++;
2231     addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2232     oi = *args++;
2233     opc = get_memop(oi);
2234
2235 #if defined(CONFIG_SOFTMMU)
2236     mem_index = get_mmuidx(oi);
2237
2238     tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2239                      label_ptr, offsetof(CPUTLBEntry, addr_write));
2240
2241     /* TLB Hit.  */
2242     tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2243
2244     /* Record the current context of a store into ldst label */
2245     add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
2246                         s->code_ptr, label_ptr);
2247 #else
2248     tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2249                            x86_guest_base_offset, x86_guest_base_seg, opc);
2250 #endif
2251 }
2252
2253 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2254                               const TCGArg *args, const int *const_args)
2255 {
2256     TCGArg a0, a1, a2;
2257     int c, const_a2, vexop, rexw = 0;
2258
2259 #if TCG_TARGET_REG_BITS == 64
2260 # define OP_32_64(x) \
2261         case glue(glue(INDEX_op_, x), _i64): \
2262             rexw = P_REXW; /* FALLTHRU */    \
2263         case glue(glue(INDEX_op_, x), _i32)
2264 #else
2265 # define OP_32_64(x) \
2266         case glue(glue(INDEX_op_, x), _i32)
2267 #endif
2268
2269     /* Hoist the loads of the most common arguments.  */
2270     a0 = args[0];
2271     a1 = args[1];
2272     a2 = args[2];
2273     const_a2 = const_args[2];
2274
2275     switch (opc) {
2276     case INDEX_op_exit_tb:
2277         /* Reuse the zeroing that exists for goto_ptr.  */
2278         if (a0 == 0) {
2279             tcg_out_jmp(s, s->code_gen_epilogue);
2280         } else {
2281             tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2282             tcg_out_jmp(s, tb_ret_addr);
2283         }
2284         break;
2285     case INDEX_op_goto_tb:
2286         if (s->tb_jmp_insn_offset) {
2287             /* direct jump method */
2288             int gap;
2289             /* jump displacement must be aligned for atomic patching;
2290              * see if we need to add extra nops before jump
2291              */
2292             gap = tcg_pcrel_diff(s, QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4));
2293             if (gap != 1) {
2294                 tcg_out_nopn(s, gap - 1);
2295             }
2296             tcg_out8(s, OPC_JMP_long); /* jmp im */
2297             s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2298             tcg_out32(s, 0);
2299         } else {
2300             /* indirect jump method */
2301             tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2302                                  (intptr_t)(s->tb_jmp_target_addr + a0));
2303         }
2304         set_jmp_reset_offset(s, a0);
2305         break;
2306     case INDEX_op_goto_ptr:
2307         /* jmp to the given host address (could be epilogue) */
2308         tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2309         break;
2310     case INDEX_op_br:
2311         tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2312         break;
2313     OP_32_64(ld8u):
2314         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2315         tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2316         break;
2317     OP_32_64(ld8s):
2318         tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2319         break;
2320     OP_32_64(ld16u):
2321         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2322         tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2323         break;
2324     OP_32_64(ld16s):
2325         tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2326         break;
2327 #if TCG_TARGET_REG_BITS == 64
2328     case INDEX_op_ld32u_i64:
2329 #endif
2330     case INDEX_op_ld_i32:
2331         tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2332         break;
2333
2334     OP_32_64(st8):
2335         if (const_args[0]) {
2336             tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2337             tcg_out8(s, a0);
2338         } else {
2339             tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2340         }
2341         break;
2342     OP_32_64(st16):
2343         if (const_args[0]) {
2344             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2345             tcg_out16(s, a0);
2346         } else {
2347             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2348         }
2349         break;
2350 #if TCG_TARGET_REG_BITS == 64
2351     case INDEX_op_st32_i64:
2352 #endif
2353     case INDEX_op_st_i32:
2354         if (const_args[0]) {
2355             tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2356             tcg_out32(s, a0);
2357         } else {
2358             tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2359         }
2360         break;
2361
2362     OP_32_64(add):
2363         /* For 3-operand addition, use LEA.  */
2364         if (a0 != a1) {
2365             TCGArg c3 = 0;
2366             if (const_a2) {
2367                 c3 = a2, a2 = -1;
2368             } else if (a0 == a2) {
2369                 /* Watch out for dest = src + dest, since we've removed
2370                    the matching constraint on the add.  */
2371                 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2372                 break;
2373             }
2374
2375             tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2376             break;
2377         }
2378         c = ARITH_ADD;
2379         goto gen_arith;
2380     OP_32_64(sub):
2381         c = ARITH_SUB;
2382         goto gen_arith;
2383     OP_32_64(and):
2384         c = ARITH_AND;
2385         goto gen_arith;
2386     OP_32_64(or):
2387         c = ARITH_OR;
2388         goto gen_arith;
2389     OP_32_64(xor):
2390         c = ARITH_XOR;
2391         goto gen_arith;
2392     gen_arith:
2393         if (const_a2) {
2394             tgen_arithi(s, c + rexw, a0, a2, 0);
2395         } else {
2396             tgen_arithr(s, c + rexw, a0, a2);
2397         }
2398         break;
2399
2400     OP_32_64(andc):
2401         if (const_a2) {
2402             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2403             tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2404         } else {
2405             tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2406         }
2407         break;
2408
2409     OP_32_64(mul):
2410         if (const_a2) {
2411             int32_t val;
2412             val = a2;
2413             if (val == (int8_t)val) {
2414                 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2415                 tcg_out8(s, val);
2416             } else {
2417                 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2418                 tcg_out32(s, val);
2419             }
2420         } else {
2421             tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2422         }
2423         break;
2424
2425     OP_32_64(div2):
2426         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2427         break;
2428     OP_32_64(divu2):
2429         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2430         break;
2431
2432     OP_32_64(shl):
2433         /* For small constant 3-operand shift, use LEA.  */
2434         if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2435             if (a2 - 1 == 0) {
2436                 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2437                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2438             } else {
2439                 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2440                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2441             }
2442             break;
2443         }
2444         c = SHIFT_SHL;
2445         vexop = OPC_SHLX;
2446         goto gen_shift_maybe_vex;
2447     OP_32_64(shr):
2448         c = SHIFT_SHR;
2449         vexop = OPC_SHRX;
2450         goto gen_shift_maybe_vex;
2451     OP_32_64(sar):
2452         c = SHIFT_SAR;
2453         vexop = OPC_SARX;
2454         goto gen_shift_maybe_vex;
2455     OP_32_64(rotl):
2456         c = SHIFT_ROL;
2457         goto gen_shift;
2458     OP_32_64(rotr):
2459         c = SHIFT_ROR;
2460         goto gen_shift;
2461     gen_shift_maybe_vex:
2462         if (have_bmi2) {
2463             if (!const_a2) {
2464                 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2465                 break;
2466             }
2467             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2468         }
2469         /* FALLTHRU */
2470     gen_shift:
2471         if (const_a2) {
2472             tcg_out_shifti(s, c + rexw, a0, a2);
2473         } else {
2474             tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2475         }
2476         break;
2477
2478     OP_32_64(ctz):
2479         tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2480         break;
2481     OP_32_64(clz):
2482         tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2483         break;
2484     OP_32_64(ctpop):
2485         tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2486         break;
2487
2488     case INDEX_op_brcond_i32:
2489         tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2490         break;
2491     case INDEX_op_setcond_i32:
2492         tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2493         break;
2494     case INDEX_op_movcond_i32:
2495         tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2496         break;
2497
2498     OP_32_64(bswap16):
2499         tcg_out_rolw_8(s, a0);
2500         break;
2501     OP_32_64(bswap32):
2502         tcg_out_bswap32(s, a0);
2503         break;
2504
2505     OP_32_64(neg):
2506         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2507         break;
2508     OP_32_64(not):
2509         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2510         break;
2511
2512     OP_32_64(ext8s):
2513         tcg_out_ext8s(s, a0, a1, rexw);
2514         break;
2515     OP_32_64(ext16s):
2516         tcg_out_ext16s(s, a0, a1, rexw);
2517         break;
2518     OP_32_64(ext8u):
2519         tcg_out_ext8u(s, a0, a1);
2520         break;
2521     OP_32_64(ext16u):
2522         tcg_out_ext16u(s, a0, a1);
2523         break;
2524
2525     case INDEX_op_qemu_ld_i32:
2526         tcg_out_qemu_ld(s, args, 0);
2527         break;
2528     case INDEX_op_qemu_ld_i64:
2529         tcg_out_qemu_ld(s, args, 1);
2530         break;
2531     case INDEX_op_qemu_st_i32:
2532         tcg_out_qemu_st(s, args, 0);
2533         break;
2534     case INDEX_op_qemu_st_i64:
2535         tcg_out_qemu_st(s, args, 1);
2536         break;
2537
2538     OP_32_64(mulu2):
2539         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2540         break;
2541     OP_32_64(muls2):
2542         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2543         break;
2544     OP_32_64(add2):
2545         if (const_args[4]) {
2546             tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2547         } else {
2548             tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2549         }
2550         if (const_args[5]) {
2551             tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2552         } else {
2553             tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2554         }
2555         break;
2556     OP_32_64(sub2):
2557         if (const_args[4]) {
2558             tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2559         } else {
2560             tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2561         }
2562         if (const_args[5]) {
2563             tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2564         } else {
2565             tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2566         }
2567         break;
2568
2569 #if TCG_TARGET_REG_BITS == 32
2570     case INDEX_op_brcond2_i32:
2571         tcg_out_brcond2(s, args, const_args, 0);
2572         break;
2573     case INDEX_op_setcond2_i32:
2574         tcg_out_setcond2(s, args, const_args);
2575         break;
2576 #else /* TCG_TARGET_REG_BITS == 64 */
2577     case INDEX_op_ld32s_i64:
2578         tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2579         break;
2580     case INDEX_op_ld_i64:
2581         tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2582         break;
2583     case INDEX_op_st_i64:
2584         if (const_args[0]) {
2585             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2586             tcg_out32(s, a0);
2587         } else {
2588             tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2589         }
2590         break;
2591
2592     case INDEX_op_brcond_i64:
2593         tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2594         break;
2595     case INDEX_op_setcond_i64:
2596         tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2597         break;
2598     case INDEX_op_movcond_i64:
2599         tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2600         break;
2601
2602     case INDEX_op_bswap64_i64:
2603         tcg_out_bswap64(s, a0);
2604         break;
2605     case INDEX_op_extu_i32_i64:
2606     case INDEX_op_ext32u_i64:
2607     case INDEX_op_extrl_i64_i32:
2608         tcg_out_ext32u(s, a0, a1);
2609         break;
2610     case INDEX_op_ext_i32_i64:
2611     case INDEX_op_ext32s_i64:
2612         tcg_out_ext32s(s, a0, a1);
2613         break;
2614     case INDEX_op_extrh_i64_i32:
2615         tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2616         break;
2617 #endif
2618
2619     OP_32_64(deposit):
2620         if (args[3] == 0 && args[4] == 8) {
2621             /* load bits 0..7 */
2622             tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2623         } else if (args[3] == 8 && args[4] == 8) {
2624             /* load bits 8..15 */
2625             tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2626         } else if (args[3] == 0 && args[4] == 16) {
2627             /* load bits 0..15 */
2628             tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2629         } else {
2630             tcg_abort();
2631         }
2632         break;
2633
2634     case INDEX_op_extract_i64:
2635         if (a2 + args[3] == 32) {
2636             /* This is a 32-bit zero-extending right shift.  */
2637             tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2638             tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2639             break;
2640         }
2641         /* FALLTHRU */
2642     case INDEX_op_extract_i32:
2643         /* On the off-chance that we can use the high-byte registers.
2644            Otherwise we emit the same ext16 + shift pattern that we
2645            would have gotten from the normal tcg-op.c expansion.  */
2646         tcg_debug_assert(a2 == 8 && args[3] == 8);
2647         if (a1 < 4 && a0 < 8) {
2648             tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2649         } else {
2650             tcg_out_ext16u(s, a0, a1);
2651             tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2652         }
2653         break;
2654
2655     case INDEX_op_sextract_i32:
2656         /* We don't implement sextract_i64, as we cannot sign-extend to
2657            64-bits without using the REX prefix that explicitly excludes
2658            access to the high-byte registers.  */
2659         tcg_debug_assert(a2 == 8 && args[3] == 8);
2660         if (a1 < 4 && a0 < 8) {
2661             tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2662         } else {
2663             tcg_out_ext16s(s, a0, a1, 0);
2664             tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2665         }
2666         break;
2667
2668     OP_32_64(extract2):
2669         /* Note that SHRD outputs to the r/m operand.  */
2670         tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2671         tcg_out8(s, args[3]);
2672         break;
2673
2674     case INDEX_op_mb:
2675         tcg_out_mb(s, a0);
2676         break;
2677     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2678     case INDEX_op_mov_i64:
2679     case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
2680     case INDEX_op_movi_i64:
2681     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2682     default:
2683         tcg_abort();
2684     }
2685
2686 #undef OP_32_64
2687 }
2688
2689 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2690                            unsigned vecl, unsigned vece,
2691                            const TCGArg *args, const int *const_args)
2692 {
2693     static int const add_insn[4] = {
2694         OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2695     };
2696     static int const ssadd_insn[4] = {
2697         OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2698     };
2699     static int const usadd_insn[4] = {
2700         OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2701     };
2702     static int const sub_insn[4] = {
2703         OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2704     };
2705     static int const sssub_insn[4] = {
2706         OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2707     };
2708     static int const ussub_insn[4] = {
2709         OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2710     };
2711     static int const mul_insn[4] = {
2712         OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
2713     };
2714     static int const shift_imm_insn[4] = {
2715         OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2716     };
2717     static int const cmpeq_insn[4] = {
2718         OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2719     };
2720     static int const cmpgt_insn[4] = {
2721         OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2722     };
2723     static int const punpckl_insn[4] = {
2724         OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2725     };
2726     static int const punpckh_insn[4] = {
2727         OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2728     };
2729     static int const packss_insn[4] = {
2730         OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2731     };
2732     static int const packus_insn[4] = {
2733         OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2734     };
2735     static int const smin_insn[4] = {
2736         OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
2737     };
2738     static int const smax_insn[4] = {
2739         OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
2740     };
2741     static int const umin_insn[4] = {
2742         OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
2743     };
2744     static int const umax_insn[4] = {
2745         OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
2746     };
2747     static int const shlv_insn[4] = {
2748         /* TODO: AVX512 adds support for MO_16.  */
2749         OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
2750     };
2751     static int const shrv_insn[4] = {
2752         /* TODO: AVX512 adds support for MO_16.  */
2753         OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
2754     };
2755     static int const sarv_insn[4] = {
2756         /* TODO: AVX512 adds support for MO_16, MO_64.  */
2757         OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
2758     };
2759     static int const shls_insn[4] = {
2760         OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2761     };
2762     static int const shrs_insn[4] = {
2763         OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2764     };
2765     static int const sars_insn[4] = {
2766         OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2
2767     };
2768     static int const abs_insn[4] = {
2769         /* TODO: AVX512 adds support for MO_64.  */
2770         OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
2771     };
2772
2773     TCGType type = vecl + TCG_TYPE_V64;
2774     int insn, sub;
2775     TCGArg a0, a1, a2;
2776
2777     a0 = args[0];
2778     a1 = args[1];
2779     a2 = args[2];
2780
2781     switch (opc) {
2782     case INDEX_op_add_vec:
2783         insn = add_insn[vece];
2784         goto gen_simd;
2785     case INDEX_op_ssadd_vec:
2786         insn = ssadd_insn[vece];
2787         goto gen_simd;
2788     case INDEX_op_usadd_vec:
2789         insn = usadd_insn[vece];
2790         goto gen_simd;
2791     case INDEX_op_sub_vec:
2792         insn = sub_insn[vece];
2793         goto gen_simd;
2794     case INDEX_op_sssub_vec:
2795         insn = sssub_insn[vece];
2796         goto gen_simd;
2797     case INDEX_op_ussub_vec:
2798         insn = ussub_insn[vece];
2799         goto gen_simd;
2800     case INDEX_op_mul_vec:
2801         insn = mul_insn[vece];
2802         goto gen_simd;
2803     case INDEX_op_and_vec:
2804         insn = OPC_PAND;
2805         goto gen_simd;
2806     case INDEX_op_or_vec:
2807         insn = OPC_POR;
2808         goto gen_simd;
2809     case INDEX_op_xor_vec:
2810         insn = OPC_PXOR;
2811         goto gen_simd;
2812     case INDEX_op_smin_vec:
2813         insn = smin_insn[vece];
2814         goto gen_simd;
2815     case INDEX_op_umin_vec:
2816         insn = umin_insn[vece];
2817         goto gen_simd;
2818     case INDEX_op_smax_vec:
2819         insn = smax_insn[vece];
2820         goto gen_simd;
2821     case INDEX_op_umax_vec:
2822         insn = umax_insn[vece];
2823         goto gen_simd;
2824     case INDEX_op_shlv_vec:
2825         insn = shlv_insn[vece];
2826         goto gen_simd;
2827     case INDEX_op_shrv_vec:
2828         insn = shrv_insn[vece];
2829         goto gen_simd;
2830     case INDEX_op_sarv_vec:
2831         insn = sarv_insn[vece];
2832         goto gen_simd;
2833     case INDEX_op_shls_vec:
2834         insn = shls_insn[vece];
2835         goto gen_simd;
2836     case INDEX_op_shrs_vec:
2837         insn = shrs_insn[vece];
2838         goto gen_simd;
2839     case INDEX_op_sars_vec:
2840         insn = sars_insn[vece];
2841         goto gen_simd;
2842     case INDEX_op_x86_punpckl_vec:
2843         insn = punpckl_insn[vece];
2844         goto gen_simd;
2845     case INDEX_op_x86_punpckh_vec:
2846         insn = punpckh_insn[vece];
2847         goto gen_simd;
2848     case INDEX_op_x86_packss_vec:
2849         insn = packss_insn[vece];
2850         goto gen_simd;
2851     case INDEX_op_x86_packus_vec:
2852         insn = packus_insn[vece];
2853         goto gen_simd;
2854 #if TCG_TARGET_REG_BITS == 32
2855     case INDEX_op_dup2_vec:
2856         /* Constraints have already placed both 32-bit inputs in xmm regs.  */
2857         insn = OPC_PUNPCKLDQ;
2858         goto gen_simd;
2859 #endif
2860     case INDEX_op_abs_vec:
2861         insn = abs_insn[vece];
2862         a2 = a1;
2863         a1 = 0;
2864         goto gen_simd;
2865     gen_simd:
2866         tcg_debug_assert(insn != OPC_UD2);
2867         if (type == TCG_TYPE_V256) {
2868             insn |= P_VEXL;
2869         }
2870         tcg_out_vex_modrm(s, insn, a0, a1, a2);
2871         break;
2872
2873     case INDEX_op_cmp_vec:
2874         sub = args[3];
2875         if (sub == TCG_COND_EQ) {
2876             insn = cmpeq_insn[vece];
2877         } else if (sub == TCG_COND_GT) {
2878             insn = cmpgt_insn[vece];
2879         } else {
2880             g_assert_not_reached();
2881         }
2882         goto gen_simd;
2883
2884     case INDEX_op_andc_vec:
2885         insn = OPC_PANDN;
2886         if (type == TCG_TYPE_V256) {
2887             insn |= P_VEXL;
2888         }
2889         tcg_out_vex_modrm(s, insn, a0, a2, a1);
2890         break;
2891
2892     case INDEX_op_shli_vec:
2893         sub = 6;
2894         goto gen_shift;
2895     case INDEX_op_shri_vec:
2896         sub = 2;
2897         goto gen_shift;
2898     case INDEX_op_sari_vec:
2899         tcg_debug_assert(vece != MO_64);
2900         sub = 4;
2901     gen_shift:
2902         tcg_debug_assert(vece != MO_8);
2903         insn = shift_imm_insn[vece];
2904         if (type == TCG_TYPE_V256) {
2905             insn |= P_VEXL;
2906         }
2907         tcg_out_vex_modrm(s, insn, sub, a0, a1);
2908         tcg_out8(s, a2);
2909         break;
2910
2911     case INDEX_op_ld_vec:
2912         tcg_out_ld(s, type, a0, a1, a2);
2913         break;
2914     case INDEX_op_st_vec:
2915         tcg_out_st(s, type, a0, a1, a2);
2916         break;
2917     case INDEX_op_dupm_vec:
2918         tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
2919         break;
2920
2921     case INDEX_op_x86_shufps_vec:
2922         insn = OPC_SHUFPS;
2923         sub = args[3];
2924         goto gen_simd_imm8;
2925     case INDEX_op_x86_blend_vec:
2926         if (vece == MO_16) {
2927             insn = OPC_PBLENDW;
2928         } else if (vece == MO_32) {
2929             insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2930         } else {
2931             g_assert_not_reached();
2932         }
2933         sub = args[3];
2934         goto gen_simd_imm8;
2935     case INDEX_op_x86_vperm2i128_vec:
2936         insn = OPC_VPERM2I128;
2937         sub = args[3];
2938         goto gen_simd_imm8;
2939     gen_simd_imm8:
2940         if (type == TCG_TYPE_V256) {
2941             insn |= P_VEXL;
2942         }
2943         tcg_out_vex_modrm(s, insn, a0, a1, a2);
2944         tcg_out8(s, sub);
2945         break;
2946
2947     case INDEX_op_x86_vpblendvb_vec:
2948         insn = OPC_VPBLENDVB;
2949         if (type == TCG_TYPE_V256) {
2950             insn |= P_VEXL;
2951         }
2952         tcg_out_vex_modrm(s, insn, a0, a1, a2);
2953         tcg_out8(s, args[3] << 4);
2954         break;
2955
2956     case INDEX_op_x86_psrldq_vec:
2957         tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
2958         tcg_out8(s, a2);
2959         break;
2960
2961     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
2962     case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi.  */
2963     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
2964     default:
2965         g_assert_not_reached();
2966     }
2967 }
2968
2969 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
2970 {
2971     static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
2972     static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
2973     static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
2974     static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
2975     static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
2976     static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
2977     static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
2978     static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
2979     static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
2980     static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
2981     static const TCGTargetOpDef r_0_r = { .args_ct_str = { "r", "0", "r" } };
2982     static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
2983     static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
2984     static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
2985     static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
2986     static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
2987     static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
2988     static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
2989     static const TCGTargetOpDef r_r_L_L
2990         = { .args_ct_str = { "r", "r", "L", "L" } };
2991     static const TCGTargetOpDef L_L_L_L
2992         = { .args_ct_str = { "L", "L", "L", "L" } };
2993     static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
2994     static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
2995     static const TCGTargetOpDef x_x_x_x
2996         = { .args_ct_str = { "x", "x", "x", "x" } };
2997     static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };
2998
2999     switch (op) {
3000     case INDEX_op_goto_ptr:
3001         return &r;
3002
3003     case INDEX_op_ld8u_i32:
3004     case INDEX_op_ld8u_i64:
3005     case INDEX_op_ld8s_i32:
3006     case INDEX_op_ld8s_i64:
3007     case INDEX_op_ld16u_i32:
3008     case INDEX_op_ld16u_i64:
3009     case INDEX_op_ld16s_i32:
3010     case INDEX_op_ld16s_i64:
3011     case INDEX_op_ld_i32:
3012     case INDEX_op_ld32u_i64:
3013     case INDEX_op_ld32s_i64:
3014     case INDEX_op_ld_i64:
3015         return &r_r;
3016
3017     case INDEX_op_st8_i32:
3018     case INDEX_op_st8_i64:
3019         return &qi_r;
3020     case INDEX_op_st16_i32:
3021     case INDEX_op_st16_i64:
3022     case INDEX_op_st_i32:
3023     case INDEX_op_st32_i64:
3024         return &ri_r;
3025     case INDEX_op_st_i64:
3026         return &re_r;
3027
3028     case INDEX_op_add_i32:
3029     case INDEX_op_add_i64:
3030         return &r_r_re;
3031     case INDEX_op_sub_i32:
3032     case INDEX_op_sub_i64:
3033     case INDEX_op_mul_i32:
3034     case INDEX_op_mul_i64:
3035     case INDEX_op_or_i32:
3036     case INDEX_op_or_i64:
3037     case INDEX_op_xor_i32:
3038     case INDEX_op_xor_i64:
3039         return &r_0_re;
3040
3041     case INDEX_op_and_i32:
3042     case INDEX_op_and_i64:
3043         {
3044             static const TCGTargetOpDef and
3045                 = { .args_ct_str = { "r", "0", "reZ" } };
3046             return &and;
3047         }
3048         break;
3049     case INDEX_op_andc_i32:
3050     case INDEX_op_andc_i64:
3051         {
3052             static const TCGTargetOpDef andc
3053                 = { .args_ct_str = { "r", "r", "rI" } };
3054             return &andc;
3055         }
3056         break;
3057
3058     case INDEX_op_shl_i32:
3059     case INDEX_op_shl_i64:
3060     case INDEX_op_shr_i32:
3061     case INDEX_op_shr_i64:
3062     case INDEX_op_sar_i32:
3063     case INDEX_op_sar_i64:
3064         return have_bmi2 ? &r_r_ri : &r_0_ci;
3065     case INDEX_op_rotl_i32:
3066     case INDEX_op_rotl_i64:
3067     case INDEX_op_rotr_i32:
3068     case INDEX_op_rotr_i64:
3069         return &r_0_ci;
3070
3071     case INDEX_op_brcond_i32:
3072     case INDEX_op_brcond_i64:
3073         return &r_re;
3074
3075     case INDEX_op_bswap16_i32:
3076     case INDEX_op_bswap16_i64:
3077     case INDEX_op_bswap32_i32:
3078     case INDEX_op_bswap32_i64:
3079     case INDEX_op_bswap64_i64:
3080     case INDEX_op_neg_i32:
3081     case INDEX_op_neg_i64:
3082     case INDEX_op_not_i32:
3083     case INDEX_op_not_i64:
3084     case INDEX_op_extrh_i64_i32:
3085         return &r_0;
3086
3087     case INDEX_op_ext8s_i32:
3088     case INDEX_op_ext8s_i64:
3089     case INDEX_op_ext8u_i32:
3090     case INDEX_op_ext8u_i64:
3091         return &r_q;
3092     case INDEX_op_ext16s_i32:
3093     case INDEX_op_ext16s_i64:
3094     case INDEX_op_ext16u_i32:
3095     case INDEX_op_ext16u_i64:
3096     case INDEX_op_ext32s_i64:
3097     case INDEX_op_ext32u_i64:
3098     case INDEX_op_ext_i32_i64:
3099     case INDEX_op_extu_i32_i64:
3100     case INDEX_op_extrl_i64_i32:
3101     case INDEX_op_extract_i32:
3102     case INDEX_op_extract_i64:
3103     case INDEX_op_sextract_i32:
3104     case INDEX_op_ctpop_i32:
3105     case INDEX_op_ctpop_i64:
3106         return &r_r;
3107     case INDEX_op_extract2_i32:
3108     case INDEX_op_extract2_i64:
3109         return &r_0_r;
3110
3111     case INDEX_op_deposit_i32:
3112     case INDEX_op_deposit_i64:
3113         {
3114             static const TCGTargetOpDef dep
3115                 = { .args_ct_str = { "Q", "0", "Q" } };
3116             return &dep;
3117         }
3118     case INDEX_op_setcond_i32:
3119     case INDEX_op_setcond_i64:
3120         {
3121             static const TCGTargetOpDef setc
3122                 = { .args_ct_str = { "q", "r", "re" } };
3123             return &setc;
3124         }
3125     case INDEX_op_movcond_i32:
3126     case INDEX_op_movcond_i64:
3127         {
3128             static const TCGTargetOpDef movc
3129                 = { .args_ct_str = { "r", "r", "re", "r", "0" } };
3130             return &movc;
3131         }
3132     case INDEX_op_div2_i32:
3133     case INDEX_op_div2_i64:
3134     case INDEX_op_divu2_i32:
3135     case INDEX_op_divu2_i64:
3136         {
3137             static const TCGTargetOpDef div2
3138                 = { .args_ct_str = { "a", "d", "0", "1", "r" } };
3139             return &div2;
3140         }
3141     case INDEX_op_mulu2_i32:
3142     case INDEX_op_mulu2_i64:
3143     case INDEX_op_muls2_i32:
3144     case INDEX_op_muls2_i64:
3145         {
3146             static const TCGTargetOpDef mul2
3147                 = { .args_ct_str = { "a", "d", "a", "r" } };
3148             return &mul2;
3149         }
3150     case INDEX_op_add2_i32:
3151     case INDEX_op_add2_i64:
3152     case INDEX_op_sub2_i32:
3153     case INDEX_op_sub2_i64:
3154         {
3155             static const TCGTargetOpDef arith2
3156                 = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
3157             return &arith2;
3158         }
3159     case INDEX_op_ctz_i32:
3160     case INDEX_op_ctz_i64:
3161         {
3162             static const TCGTargetOpDef ctz[2] = {
3163                 { .args_ct_str = { "&r", "r", "r" } },
3164                 { .args_ct_str = { "&r", "r", "rW" } },
3165             };
3166             return &ctz[have_bmi1];
3167         }
3168     case INDEX_op_clz_i32:
3169     case INDEX_op_clz_i64:
3170         {
3171             static const TCGTargetOpDef clz[2] = {
3172                 { .args_ct_str = { "&r", "r", "r" } },
3173                 { .args_ct_str = { "&r", "r", "rW" } },
3174             };
3175             return &clz[have_lzcnt];
3176         }
3177
3178     case INDEX_op_qemu_ld_i32:
3179         return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
3180     case INDEX_op_qemu_st_i32:
3181         return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
3182     case INDEX_op_qemu_ld_i64:
3183         return (TCG_TARGET_REG_BITS == 64 ? &r_L
3184                 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
3185                 : &r_r_L_L);
3186     case INDEX_op_qemu_st_i64:
3187         return (TCG_TARGET_REG_BITS == 64 ? &L_L
3188                 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
3189                 : &L_L_L_L);
3190
3191     case INDEX_op_brcond2_i32:
3192         {
3193             static const TCGTargetOpDef b2
3194                 = { .args_ct_str = { "r", "r", "ri", "ri" } };
3195             return &b2;
3196         }
3197     case INDEX_op_setcond2_i32:
3198         {
3199             static const TCGTargetOpDef s2
3200                 = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
3201             return &s2;
3202         }
3203
3204     case INDEX_op_ld_vec:
3205     case INDEX_op_st_vec:
3206     case INDEX_op_dupm_vec:
3207         return &x_r;
3208
3209     case INDEX_op_add_vec:
3210     case INDEX_op_sub_vec:
3211     case INDEX_op_mul_vec:
3212     case INDEX_op_and_vec:
3213     case INDEX_op_or_vec:
3214     case INDEX_op_xor_vec:
3215     case INDEX_op_andc_vec:
3216     case INDEX_op_ssadd_vec:
3217     case INDEX_op_usadd_vec:
3218     case INDEX_op_sssub_vec:
3219     case INDEX_op_ussub_vec:
3220     case INDEX_op_smin_vec:
3221     case INDEX_op_umin_vec:
3222     case INDEX_op_smax_vec:
3223     case INDEX_op_umax_vec:
3224     case INDEX_op_shlv_vec:
3225     case INDEX_op_shrv_vec:
3226     case INDEX_op_sarv_vec:
3227     case INDEX_op_shls_vec:
3228     case INDEX_op_shrs_vec:
3229     case INDEX_op_sars_vec:
3230     case INDEX_op_cmp_vec:
3231     case INDEX_op_x86_shufps_vec:
3232     case INDEX_op_x86_blend_vec:
3233     case INDEX_op_x86_packss_vec:
3234     case INDEX_op_x86_packus_vec:
3235     case INDEX_op_x86_vperm2i128_vec:
3236     case INDEX_op_x86_punpckl_vec:
3237     case INDEX_op_x86_punpckh_vec:
3238 #if TCG_TARGET_REG_BITS == 32
3239     case INDEX_op_dup2_vec:
3240 #endif
3241         return &x_x_x;
3242     case INDEX_op_abs_vec:
3243     case INDEX_op_dup_vec:
3244     case INDEX_op_shli_vec:
3245     case INDEX_op_shri_vec:
3246     case INDEX_op_sari_vec:
3247     case INDEX_op_x86_psrldq_vec:
3248         return &x_x;
3249     case INDEX_op_x86_vpblendvb_vec:
3250         return &x_x_x_x;
3251
3252     default:
3253         break;
3254     }
3255     return NULL;
3256 }
3257
3258 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3259 {
3260     switch (opc) {
3261     case INDEX_op_add_vec:
3262     case INDEX_op_sub_vec:
3263     case INDEX_op_and_vec:
3264     case INDEX_op_or_vec:
3265     case INDEX_op_xor_vec:
3266     case INDEX_op_andc_vec:
3267         return 1;
3268     case INDEX_op_cmp_vec:
3269     case INDEX_op_cmpsel_vec:
3270         return -1;
3271
3272     case INDEX_op_shli_vec:
3273     case INDEX_op_shri_vec:
3274         /* We must expand the operation for MO_8.  */
3275         return vece == MO_8 ? -1 : 1;
3276
3277     case INDEX_op_sari_vec:
3278         /* We must expand the operation for MO_8.  */
3279         if (vece == MO_8) {
3280             return -1;
3281         }
3282         /* We can emulate this for MO_64, but it does not pay off
3283            unless we're producing at least 4 values.  */
3284         if (vece == MO_64) {
3285             return type >= TCG_TYPE_V256 ? -1 : 0;
3286         }
3287         return 1;
3288
3289     case INDEX_op_shls_vec:
3290     case INDEX_op_shrs_vec:
3291         return vece >= MO_16;
3292     case INDEX_op_sars_vec:
3293         return vece >= MO_16 && vece <= MO_32;
3294
3295     case INDEX_op_shlv_vec:
3296     case INDEX_op_shrv_vec:
3297         return have_avx2 && vece >= MO_32;
3298     case INDEX_op_sarv_vec:
3299         return have_avx2 && vece == MO_32;
3300
3301     case INDEX_op_mul_vec:
3302         if (vece == MO_8) {
3303             /* We can expand the operation for MO_8.  */
3304             return -1;
3305         }
3306         if (vece == MO_64) {
3307             return 0;
3308         }
3309         return 1;
3310
3311     case INDEX_op_ssadd_vec:
3312     case INDEX_op_usadd_vec:
3313     case INDEX_op_sssub_vec:
3314     case INDEX_op_ussub_vec:
3315         return vece <= MO_16;
3316     case INDEX_op_smin_vec:
3317     case INDEX_op_smax_vec:
3318     case INDEX_op_umin_vec:
3319     case INDEX_op_umax_vec:
3320     case INDEX_op_abs_vec:
3321         return vece <= MO_32;
3322
3323     default:
3324         return 0;
3325     }
3326 }
3327
3328 static void expand_vec_shi(TCGType type, unsigned vece, bool shr,
3329                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3330 {
3331     TCGv_vec t1, t2;
3332
3333     tcg_debug_assert(vece == MO_8);
3334
3335     t1 = tcg_temp_new_vec(type);
3336     t2 = tcg_temp_new_vec(type);
3337
3338     /* Unpack to W, shift, and repack.  Tricky bits:
3339        (1) Use punpck*bw x,x to produce DDCCBBAA,
3340            i.e. duplicate in other half of the 16-bit lane.
3341        (2) For right-shift, add 8 so that the high half of
3342            the lane becomes zero.  For left-shift, we must
3343            shift up and down again.
3344        (3) Step 2 leaves high half zero such that PACKUSWB
3345            (pack with unsigned saturation) does not modify
3346            the quantity.  */
3347     vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3348               tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3349     vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3350               tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3351
3352     if (shr) {
3353         tcg_gen_shri_vec(MO_16, t1, t1, imm + 8);
3354         tcg_gen_shri_vec(MO_16, t2, t2, imm + 8);
3355     } else {
3356         tcg_gen_shli_vec(MO_16, t1, t1, imm + 8);
3357         tcg_gen_shli_vec(MO_16, t2, t2, imm + 8);
3358         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3359         tcg_gen_shri_vec(MO_16, t2, t2, 8);
3360     }
3361
3362     vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3363               tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3364     tcg_temp_free_vec(t1);
3365     tcg_temp_free_vec(t2);
3366 }
3367
3368 static void expand_vec_sari(TCGType type, unsigned vece,
3369                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3370 {
3371     TCGv_vec t1, t2;
3372
3373     switch (vece) {
3374     case MO_8:
3375         /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3376         t1 = tcg_temp_new_vec(type);
3377         t2 = tcg_temp_new_vec(type);
3378         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3379                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3380         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3381                   tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3382         tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3383         tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3384         vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3385                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3386         tcg_temp_free_vec(t1);
3387         tcg_temp_free_vec(t2);
3388         break;
3389
3390     case MO_64:
3391         if (imm <= 32) {
3392             /* We can emulate a small sign extend by performing an arithmetic
3393              * 32-bit shift and overwriting the high half of a 64-bit logical
3394              * shift (note that the ISA says shift of 32 is valid).
3395              */
3396             t1 = tcg_temp_new_vec(type);
3397             tcg_gen_sari_vec(MO_32, t1, v1, imm);
3398             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3399             vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3400                       tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3401                       tcgv_vec_arg(t1), 0xaa);
3402             tcg_temp_free_vec(t1);
3403         } else {
3404             /* Otherwise we will need to use a compare vs 0 to produce
3405              * the sign-extend, shift and merge.
3406              */
3407             t1 = tcg_const_zeros_vec(type);
3408             tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
3409             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3410             tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3411             tcg_gen_or_vec(MO_64, v0, v0, t1);
3412             tcg_temp_free_vec(t1);
3413         }
3414         break;
3415
3416     default:
3417         g_assert_not_reached();
3418     }
3419 }
3420
3421 static void expand_vec_mul(TCGType type, unsigned vece,
3422                            TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3423 {
3424     TCGv_vec t1, t2, t3, t4;
3425
3426     tcg_debug_assert(vece == MO_8);
3427
3428     /*
3429      * Unpack v1 bytes to words, 0 | x.
3430      * Unpack v2 bytes to words, y | 0.
3431      * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3432      * Shift logical right by 8 bits to clear the high 8 bytes before
3433      * using an unsigned saturated pack.
3434      *
3435      * The difference between the V64, V128 and V256 cases is merely how
3436      * we distribute the expansion between temporaries.
3437      */
3438     switch (type) {
3439     case TCG_TYPE_V64:
3440         t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3441         t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3442         tcg_gen_dup16i_vec(t2, 0);
3443         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3444                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t2));
3445         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3446                   tcgv_vec_arg(t2), tcgv_vec_arg(t2), tcgv_vec_arg(v2));
3447         tcg_gen_mul_vec(MO_16, t1, t1, t2);
3448         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3449         vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3450                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3451         tcg_temp_free_vec(t1);
3452         tcg_temp_free_vec(t2);
3453         break;
3454
3455     case TCG_TYPE_V128:
3456     case TCG_TYPE_V256:
3457         t1 = tcg_temp_new_vec(type);
3458         t2 = tcg_temp_new_vec(type);
3459         t3 = tcg_temp_new_vec(type);
3460         t4 = tcg_temp_new_vec(type);
3461         tcg_gen_dup16i_vec(t4, 0);
3462         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3463                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
3464         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3465                   tcgv_vec_arg(t2), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
3466         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3467                   tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
3468         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3469                   tcgv_vec_arg(t4), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
3470         tcg_gen_mul_vec(MO_16, t1, t1, t2);
3471         tcg_gen_mul_vec(MO_16, t3, t3, t4);
3472         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3473         tcg_gen_shri_vec(MO_16, t3, t3, 8);
3474         vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3475                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3476         tcg_temp_free_vec(t1);
3477         tcg_temp_free_vec(t2);
3478         tcg_temp_free_vec(t3);
3479         tcg_temp_free_vec(t4);
3480         break;
3481
3482     default:
3483         g_assert_not_reached();
3484     }
3485 }
3486
3487 static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3488                                  TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3489 {
3490     enum {
3491         NEED_INV  = 1,
3492         NEED_SWAP = 2,
3493         NEED_BIAS = 4,
3494         NEED_UMIN = 8,
3495         NEED_UMAX = 16,
3496     };
3497     TCGv_vec t1, t2;
3498     uint8_t fixup;
3499
3500     switch (cond) {
3501     case TCG_COND_EQ:
3502     case TCG_COND_GT:
3503         fixup = 0;
3504         break;
3505     case TCG_COND_NE:
3506     case TCG_COND_LE:
3507         fixup = NEED_INV;
3508         break;
3509     case TCG_COND_LT:
3510         fixup = NEED_SWAP;
3511         break;
3512     case TCG_COND_GE:
3513         fixup = NEED_SWAP | NEED_INV;
3514         break;
3515     case TCG_COND_LEU:
3516         if (vece <= MO_32) {
3517             fixup = NEED_UMIN;
3518         } else {
3519             fixup = NEED_BIAS | NEED_INV;
3520         }
3521         break;
3522     case TCG_COND_GTU:
3523         if (vece <= MO_32) {
3524             fixup = NEED_UMIN | NEED_INV;
3525         } else {
3526             fixup = NEED_BIAS;
3527         }
3528         break;
3529     case TCG_COND_GEU:
3530         if (vece <= MO_32) {
3531             fixup = NEED_UMAX;
3532         } else {
3533             fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3534         }
3535         break;
3536     case TCG_COND_LTU:
3537         if (vece <= MO_32) {
3538             fixup = NEED_UMAX | NEED_INV;
3539         } else {
3540             fixup = NEED_BIAS | NEED_SWAP;
3541         }
3542         break;
3543     default:
3544         g_assert_not_reached();
3545     }
3546
3547     if (fixup & NEED_INV) {
3548         cond = tcg_invert_cond(cond);
3549     }
3550     if (fixup & NEED_SWAP) {
3551         t1 = v1, v1 = v2, v2 = t1;
3552         cond = tcg_swap_cond(cond);
3553     }
3554
3555     t1 = t2 = NULL;
3556     if (fixup & (NEED_UMIN | NEED_UMAX)) {
3557         t1 = tcg_temp_new_vec(type);
3558         if (fixup & NEED_UMIN) {
3559             tcg_gen_umin_vec(vece, t1, v1, v2);
3560         } else {
3561             tcg_gen_umax_vec(vece, t1, v1, v2);
3562         }
3563         v2 = t1;
3564         cond = TCG_COND_EQ;
3565     } else if (fixup & NEED_BIAS) {
3566         t1 = tcg_temp_new_vec(type);
3567         t2 = tcg_temp_new_vec(type);
3568         tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
3569         tcg_gen_sub_vec(vece, t1, v1, t2);
3570         tcg_gen_sub_vec(vece, t2, v2, t2);
3571         v1 = t1;
3572         v2 = t2;
3573         cond = tcg_signed_cond(cond);
3574     }
3575
3576     tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3577     /* Expand directly; do not recurse.  */
3578     vec_gen_4(INDEX_op_cmp_vec, type, vece,
3579               tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3580
3581     if (t1) {
3582         tcg_temp_free_vec(t1);
3583         if (t2) {
3584             tcg_temp_free_vec(t2);
3585         }
3586     }
3587     return fixup & NEED_INV;
3588 }
3589
3590 static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3591                            TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3592 {
3593     if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3594         tcg_gen_not_vec(vece, v0, v0);
3595     }
3596 }
3597
3598 static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3599                               TCGv_vec c1, TCGv_vec c2,
3600                               TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3601 {
3602     TCGv_vec t = tcg_temp_new_vec(type);
3603
3604     if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3605         /* Invert the sense of the compare by swapping arguments.  */
3606         TCGv_vec x;
3607         x = v3, v3 = v4, v4 = x;
3608     }
3609     vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3610               tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3611               tcgv_vec_arg(v3), tcgv_vec_arg(t));
3612     tcg_temp_free_vec(t);
3613 }
3614
3615 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3616                        TCGArg a0, ...)
3617 {
3618     va_list va;
3619     TCGArg a2;
3620     TCGv_vec v0, v1, v2, v3, v4;
3621
3622     va_start(va, a0);
3623     v0 = temp_tcgv_vec(arg_temp(a0));
3624     v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3625     a2 = va_arg(va, TCGArg);
3626
3627     switch (opc) {
3628     case INDEX_op_shli_vec:
3629     case INDEX_op_shri_vec:
3630         expand_vec_shi(type, vece, opc == INDEX_op_shri_vec, v0, v1, a2);
3631         break;
3632
3633     case INDEX_op_sari_vec:
3634         expand_vec_sari(type, vece, v0, v1, a2);
3635         break;
3636
3637     case INDEX_op_mul_vec:
3638         v2 = temp_tcgv_vec(arg_temp(a2));
3639         expand_vec_mul(type, vece, v0, v1, v2);
3640         break;
3641
3642     case INDEX_op_cmp_vec:
3643         v2 = temp_tcgv_vec(arg_temp(a2));
3644         expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3645         break;
3646
3647     case INDEX_op_cmpsel_vec:
3648         v2 = temp_tcgv_vec(arg_temp(a2));
3649         v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3650         v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3651         expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3652         break;
3653
3654     default:
3655         break;
3656     }
3657
3658     va_end(va);
3659 }
3660
/*
 * Registers saved by the prologue and restored by the epilogue.
 *
 * The prologue pushes them in array order and the epilogue pops them in
 * reverse.  The order is significant: the debug_frame unwind tables in
 * this file are written to match it, and PUSH_SIZE is derived from the
 * number of entries.
 */
static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
#if defined(_WIN64)
    /* Only listed as callee-saved when building for _WIN64. */
    TCG_REG_RDI,
    TCG_REG_RSI,
#endif
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14, /* Currently used for the global env. */
    TCG_REG_R15,
#else
    TCG_REG_EBP, /* Currently used for the global env. */
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
#endif
};
3680
/* Compute frame size via macros, to share between tcg_target_qemu_prologue
   and tcg_register_jit.  */

/*
 * Bytes taken by the pushed callee-saved registers plus one additional
 * register-sized slot (presumably the return address -- confirm).
 */
#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
     * (TCG_TARGET_REG_BITS / 8))

/*
 * Total frame size: PUSH_SIZE plus the static call-argument area and the
 * TCG temporary buffer, rounded up to a multiple of TCG_TARGET_STACK_ALIGN
 * (the mask form assumes that alignment is a power of two).
 */
#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
3694
3695 /* Generate global QEMU prologue and epilogue code */
3696 static void tcg_target_qemu_prologue(TCGContext *s)
3697 {
3698     int i, stack_addend;
3699
3700     /* TB prologue */
3701
3702     /* Reserve some stack space, also for TCG temps.  */
3703     stack_addend = FRAME_SIZE - PUSH_SIZE;
3704     tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3705                   CPU_TEMP_BUF_NLONGS * sizeof(long));
3706
3707     /* Save all callee saved registers.  */
3708     for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3709         tcg_out_push(s, tcg_target_callee_save_regs[i]);
3710     }
3711
3712 #if TCG_TARGET_REG_BITS == 32
3713     tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3714                (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3715     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3716     /* jmp *tb.  */
3717     tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3718                          (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3719                          + stack_addend);
3720 #else
3721 # if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
3722     if (guest_base) {
3723         int seg = setup_guest_base_seg();
3724         if (seg != 0) {
3725             x86_guest_base_seg = seg;
3726         } else if (guest_base == (int32_t)guest_base) {
3727             x86_guest_base_offset = guest_base;
3728         } else {
3729             /* Choose R12 because, as a base, it requires a SIB byte. */
3730             x86_guest_base_index = TCG_REG_R12;
3731             tcg_out_mov(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
3732             tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
3733         }
3734     }
3735 # endif
3736     tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3737     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3738     /* jmp *tb.  */
3739     tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3740 #endif
3741
3742     /*
3743      * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3744      * and fall through to the rest of the epilogue.
3745      */
3746     s->code_gen_epilogue = s->code_ptr;
3747     tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3748
3749     /* TB epilogue */
3750     tb_ret_addr = s->code_ptr;
3751
3752     tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3753
3754     if (have_avx2) {
3755         tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3756     }
3757     for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3758         tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3759     }
3760     tcg_out_opc(s, OPC_RET, 0, 0, 0);
3761 }
3762
3763 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3764 {
3765     memset(p, 0x90, count);
3766 }
3767
3768 static void tcg_target_init(TCGContext *s)
3769 {
3770 #ifdef CONFIG_CPUID_H
3771     unsigned a, b, c, d, b7 = 0;
3772     int max = __get_cpuid_max(0, 0);
3773
3774     if (max >= 7) {
3775         /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
3776         __cpuid_count(7, 0, a, b7, c, d);
3777         have_bmi1 = (b7 & bit_BMI) != 0;
3778         have_bmi2 = (b7 & bit_BMI2) != 0;
3779     }
3780
3781     if (max >= 1) {
3782         __cpuid(1, a, b, c, d);
3783 #ifndef have_cmov
3784         /* For 32-bit, 99% certainty that we're running on hardware that
3785            supports cmov, but we still need to check.  In case cmov is not
3786            available, we'll use a small forward branch.  */
3787         have_cmov = (d & bit_CMOV) != 0;
3788 #endif
3789
3790         /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
3791            need to probe for it.  */
3792         have_movbe = (c & bit_MOVBE) != 0;
3793         have_popcnt = (c & bit_POPCNT) != 0;
3794
3795         /* There are a number of things we must check before we can be
3796            sure of not hitting invalid opcode.  */
3797         if (c & bit_OSXSAVE) {
3798             unsigned xcrl, xcrh;
3799             /* The xgetbv instruction is not available to older versions of
3800              * the assembler, so we encode the instruction manually.
3801              */
3802             asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
3803             if ((xcrl & 6) == 6) {
3804                 have_avx1 = (c & bit_AVX) != 0;
3805                 have_avx2 = (b7 & bit_AVX2) != 0;
3806             }
3807         }
3808     }
3809
3810     max = __get_cpuid_max(0x8000000, 0);
3811     if (max >= 1) {
3812         __cpuid(0x80000001, a, b, c, d);
3813         /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
3814         have_lzcnt = (c & bit_LZCNT) != 0;
3815     }
3816 #endif /* CONFIG_CPUID_H */
3817
3818     tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3819     if (TCG_TARGET_REG_BITS == 64) {
3820         tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3821     }
3822     if (have_avx1) {
3823         tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3824         tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3825     }
3826     if (have_avx2) {
3827         tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3828     }
3829
3830     tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
3831     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3832     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3833     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3834     if (TCG_TARGET_REG_BITS == 64) {
3835 #if !defined(_WIN64)
3836         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3837         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3838 #endif
3839         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3840         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3841         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3842         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3843     }
3844
3845     s->reserved_regs = 0;
3846     tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
3847 }
3848
/*
 * Unwind information handed to tcg_register_jit_int(): the common
 * header followed by the raw bytes of the frame description entry.
 */
typedef struct {
    DebugFrameHeader h;
    uint8_t fde_def_cfa[4];     /* DW_CFA_def_cfa: opcode, reg, 2-byte uleb128 */
    uint8_t fde_reg_ofs[14];    /* DW_CFA_offset pairs, 2 bytes per register */
} DebugFrame;

/* We're expecting a 2 byte uleb128 encoded value.  */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
3857
/*
 * Static unwind tables for the generated code, one variant per host word
 * size.  Nothing is defined for hosts without ELF, which also leaves
 * ELF_HOST_MACHINE undefined.
 *
 * In each variant fde_def_cfa encodes FRAME_SIZE as a two-byte uleb128
 * and fde_reg_ofs lists the return address followed by the callee-saved
 * registers, each with its slot index in units of the data alignment.
 */
#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif
3917
#ifdef ELF_HOST_MACHINE
/*
 * Register the code buffer [buf, buf + buf_size) together with the
 * static debug_frame unwind description.  Only built when an ELF host
 * machine has been selected above.
 */
void tcg_register_jit(void *buf, size_t buf_size)
{
    const size_t frame_bytes = sizeof(debug_frame);

    tcg_register_jit_int(buf, buf_size, &debug_frame, frame_bytes);
}
#endif /* ELF_HOST_MACHINE */