/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "../tcg-pool.c.inc"

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    /* The Win64 ABI has xmm6-xmm15 as caller-saves, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
#if TCG_TARGET_REG_BITS == 64

static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    /* 32-bit mode uses a stack-based calling convention (GCC default). */

static const int tcg_target_call_oarg_regs[] = {
    TCG_REG_EAX,
#if TCG_TARGET_REG_BITS == 32
    TCG_REG_EDX
#endif
};

/* Constants we accept.  */
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800

/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call-clobbered registers on
   i386.  */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

#define ALL_BYTEH_REGS         0x0000000fu
#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS      0x0000ffffu
# define ALL_VECTOR_REGS       0xffff0000u
# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS      0x000000ffu
# define ALL_VECTOR_REGS       0x00ff0000u
# define ALL_BYTEL_REGS        ALL_BYTEH_REGS
#endif
#ifdef CONFIG_SOFTMMU
# define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
#else
# define SOFTMMU_RESERVE_REGS  0
#endif

/* The host compiler should supply <cpuid.h> to enable runtime feature
   detection, as we're not going to go so far as our own inline assembly.
   If not available, default values will be assumed.  */
#if defined(CONFIG_CPUID_H)
#include "qemu/cpuid.h"
#endif

/* For 64-bit, we always know that CMOV is available.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov 1
#elif defined(CONFIG_CPUID_H)
static bool have_cmov;
#else
# define have_cmov 0
#endif

/* We need these symbols in tcg-target.h, and we can't properly conditionalize
   it there.  Therefore we always define the variable.  */

#ifdef CONFIG_CPUID_H
static bool have_bmi2;
static bool have_lzcnt;
#else
# define have_bmi2 0
# define have_lzcnt 0
#endif

static const tcg_insn_unit *tb_ret_addr;

static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
                        intptr_t value, intptr_t addend)
{
    value += addend;
    switch (type) {
    case R_386_PC32:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int32_t)value) {
            return false;
        }
        /* FALLTHRU */
    case R_386_32:
        tcg_patch32(code_ptr, value);
        break;
    case R_386_PC8:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int8_t)value) {
            return false;
        }
        tcg_patch8(code_ptr, value);
        break;
    default:
        g_assert_not_reached();
    }
    return true;
}

/* test if a constant matches the constraint */
static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
{
    if (ct & TCG_CT_CONST) {
        return 1;
    }
    if (type == TCG_TYPE_I32) {
        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
            return 1;
        }
    } else {
        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
            return 1;
        }
    }
    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
        return 1;
    }
    return 0;
}
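/*
 * Worked example (editor's illustration, not from the original source):
 * for a 64-bit operand, val = 0x80000000 satisfies TCG_CT_CONST_U32
 * (it zero-extends from 32 bits) but not TCG_CT_CONST_S32, since
 * (int32_t)0x80000000 sign-extends to 0xffffffff80000000.  Conversely,
 * val = 0xffffffffffffff00 satisfies TCG_CT_CONST_I32 because its
 * complement, 0xff, fits in a signed 32-bit immediate.
 */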
# define LOWREGMASK(x)  ((x) & 7)

#define P_EXT           0x100   /* 0x0f opcode prefix */
#define P_EXT38         0x200   /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400   /* 0x66 opcode prefix */
#define P_VEXW          0x1000  /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         P_VEXW  /* Set REX.W = 1; match VEXW */
# define P_REXB_R       0x2000  /* REG field as byte register */
# define P_REXB_RM      0x4000  /* R/M field as byte register */
# define P_GS           0x8000  /* gs segment override */
#else
# define P_REXW         0
# define P_REXB_R       0
# define P_REXB_RM      0
# define P_GS           0
#endif
#define P_EXT3A         0x10000 /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000 /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000 /* 0xf2 opcode prefix */
#define P_VEXL          0x80000 /* Set VEX.L = 1 */

#define OPC_ARITH_EvIz  (0x81)
#define OPC_ARITH_EvIb  (0x83)
#define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP       (0xc8 | P_EXT)
#define OPC_CALL_Jz     (0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32     (0x48)
#define OPC_IMUL_GvEv   (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb (0x6b)
#define OPC_IMUL_GvEvIz (0x69)
#define OPC_INC_r32     (0x40)
#define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
#define OPC_JCC_short   (0x70)          /* ... plus condition code */
#define OPC_JMP_long    (0xe9)
#define OPC_JMP_short   (0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
#define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
#define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz   (0xc7)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL      (0xbe | P_EXT)
#define OPC_MOVSWL      (0xbf | P_EXT)
#define OPC_MOVSLQ      (0x63 | P_REXW)
#define OPC_MOVZBL      (0xb6 | P_EXT)
#define OPC_MOVZWL      (0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32     (0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32    (0x50)
#define OPC_PUSH_Iv     (0x68)
#define OPC_PUSH_Ib     (0x6a)
#define OPC_RET         (0xc3)
#define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1     (0xd1)
#define OPC_SHIFT_Ib    (0xc1)
#define OPC_SHIFT_cl    (0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTL       (0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)

#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
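/*
 * Worked example (editor's illustration): OPC_PMULLD is
 * (0x40 | P_EXT38 | P_DATA16), so the emitter below produces the 0x66
 * data-size prefix, the 0x0f 0x38 escape bytes, and then opcode byte
 * 0x40 -- i.e. the SSE4.1 encoding "66 0f 38 40 /r" for pmulld.
 */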
/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH.  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev    0
#define EXT5_DEC_Ev    1
#define EXT5_CALLN_Ev  2
#define EXT5_JMPN_Ev   4

/* Condition codes to be added to OPC_JCC_{long,short}.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf

static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
{
    int rex;

    if (opc & P_GS) {
        tcg_out8(s, 0x65);
    }
    /* The 0x66 prefix must come before the REX prefix.  */
    if (opc & P_DATA16) {
        tcg_out8(s, 0x66);
        /* We should never be asking for both 16 and 64-bit operation.  */
        tcg_debug_assert((opc & P_REXW) == 0);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }

    rex = 0;
    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
    rex |= (r & 8) >> 1;                /* REX.R */
    rex |= (x & 8) >> 2;                /* REX.X */
    rex |= (rm & 8) >> 3;               /* REX.B */

    /* P_REXB_{R,RM} indicates that the given register is the low byte.
       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
       as otherwise the encoding indicates %[abcd]h.  Note that the values
       that are ORed in merely indicate that the REX byte must be present;
       those bits get discarded in output.  */
    rex |= opc & (r >= 4 ? P_REXB_R : 0);
    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);

    if (rex) {
        tcg_out8(s, (uint8_t)(rex | 0x40));
    }

    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }

    tcg_out8(s, opc);
}
#else
static void tcg_out_opc(TCGContext *s, int opc)
{
    if (opc & P_DATA16) {
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }
    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }
    tcg_out8(s, opc);
}
/* Discard the register arguments to tcg_out_opc early, so as not to penalize
   the 32-bit compilation paths.  This method works with all versions of gcc,
   whereas relying on optimization may not be able to exclude them.  */
#define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc)
#endif

static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
{
    tcg_out_opc(s, opc, r, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
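/*
 * Worked example (editor's illustration): tcg_out_modrm(s,
 * OPC_ADD_GvEv | P_REXW, TCG_REG_RAX, TCG_REG_R12) emits REX byte 0x49
 * (0x40 | REX.W | REX.B), opcode 0x03, and ModRM 0xc4 (mod=11, reg=0,
 * rm=4), i.e. "49 03 c4" = addq %r12, %rax.
 */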
static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
                            int rm, int index)
{
    int tmp;

    /* Use the two byte form if possible, which cannot encode
       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
        && ((rm | index) & 8) == 0) {
        /* Two byte VEX prefix.  */
        tcg_out8(s, 0xc5);

        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
    } else {
        /* Three byte VEX prefix.  */
        tcg_out8(s, 0xc4);

        /* VEX.m-mmmm */
        if (opc & P_EXT3A) {
            tmp = 3;
        } else if (opc & P_EXT38) {
            tmp = 2;
        } else if (opc & P_EXT) {
            tmp = 1;
        } else {
            g_assert_not_reached();
        }
        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
        tcg_out8(s, tmp);

        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
    }

    tmp |= (opc & P_VEXL ? 0x04 : 0);          /* VEX.L */
    /* VEX.pp */
    if (opc & P_DATA16) {
        tmp |= 1;                              /* 0x66 */
    } else if (opc & P_SIMDF3) {
        tmp |= 2;                              /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        tmp |= 3;                              /* 0xf2 */
    }
    tmp |= (~v & 15) << 3;                     /* VEX.vvvv */
    tcg_out8(s, tmp);
    tcg_out8(s, opc);
}

static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
{
    tcg_out_vex_opc(s, opc, r, v, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
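/*
 * Worked example (editor's illustration): tcg_out_vex_modrm(s, OPC_PXOR,
 * 0, 0, 0) qualifies for the two-byte form: 0xc5, then 0xf9 (VEX.R set,
 * inverted vvvv = 1111 selecting xmm0, L = 0, pp = 01 for the implied
 * 0x66 prefix), opcode 0xef, ModRM 0xc0 -- "c5 f9 ef c0" =
 * vpxor %xmm0, %xmm0, %xmm0.
 */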
/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
   We handle either RM and INDEX missing with a negative value.  In 64-bit
   mode for absolute addresses, ~RM is the size of the immediate operand
   that will follow the instruction.  */

static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
                               int shift, intptr_t offset)
{
    int mod, len;

    if (index < 0 && rm < 0) {
        if (TCG_TARGET_REG_BITS == 64) {
            /* Try for a rip-relative addressing mode.  This has replaced
               the 32-bit-mode absolute addressing encoding.  */
            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
            intptr_t disp = offset - pc;
            if (disp == (int32_t)disp) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
                tcg_out32(s, disp);
                return;
            }

            /* Try for an absolute address encoding.  This requires the
               use of the MODRM+SIB encoding and is therefore larger than
               rip-relative addressing.  */
            if (offset == (int32_t)offset) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
                tcg_out8(s, (4 << 3) | 5);
                tcg_out32(s, offset);
                return;
            }

            /* ??? The memory isn't directly addressable.  */
            g_assert_not_reached();
        } else {
            /* Absolute address.  */
            tcg_out8(s, (r << 3) | 5);
            tcg_out32(s, offset);
            return;
        }
    }

    /* Find the length of the immediate addend.  Note that the encoding
       that would be used for (%ebp) indicates absolute addressing.  */
    if (rm < 0) {
        mod = 0, len = 4, rm = 5;
    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
        mod = 0, len = 0;
    } else if (offset == (int8_t)offset) {
        mod = 0x40, len = 1;
    } else {
        mod = 0x80, len = 4;
    }

    /* Use a single byte MODRM format if possible.  Note that the encoding
       that would be used for %esp is the escape to the two byte form.  */
    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
        /* Single byte MODRM format.  */
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
    } else {
        /* Two byte MODRM+SIB format.  */

        /* Note that the encoding that would place %esp into the index
           field indicates no index register.  In 64-bit mode, the REX.X
           bit counts, so %r12 can be used as the index.  */
        if (index < 0) {
            index = 4;
        } else {
            tcg_debug_assert(index != TCG_REG_ESP);
        }

        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
    }

    if (len == 1) {
        tcg_out8(s, offset);
    } else if (len == 4) {
        tcg_out32(s, offset);
    }
}

static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
                                     int index, int shift, intptr_t offset)
{
    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
                                         int rm, int index, int shift,
                                         intptr_t offset)
{
    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}
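/*
 * Worked examples (editor's illustration): a load from (%ebp) cannot use
 * mod=0 (that encoding means disp32 absolute), so tcg_out_modrm_offset(s,
 * OPC_MOVL_GvEv, TCG_REG_EAX, TCG_REG_EBP, 0) emits "8b 45 00" with a
 * forced zero disp8.  Likewise (%esp) requires the SIB escape:
 * movl (%esp), %eax is "8b 04 24".
 */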
/* A simplification of the above with no index or shift.  */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
                                        int rm, intptr_t offset)
{
    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
}

static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
                                            int v, int rm, intptr_t offset)
{
    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
}

/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_opc(s, opc, r, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
}

/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
}

/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
{
    /* Propagate an opcode prefix, such as P_REXW.  */
    int ext = subop & ~0x7;
    subop &= 0x7;

    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
}

static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
    int rexw = 0;

    if (arg == ret) {
        return true;
    }
    switch (type) {
    case TCG_TYPE_I64:
        rexw = P_REXW;
        /* fallthru */
    case TCG_TYPE_I32:
        if (ret < 16) {
            if (arg < 16) {
                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
            }
        } else {
            if (arg < 16) {
                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
            }
        }
        break;

    case TCG_TYPE_V64:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
        break;
    case TCG_TYPE_V128:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
        break;
    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
        break;

    default:
        g_assert_not_reached();
    }
    return true;
}

static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};

static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                            TCGReg r, TCGReg a)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
    } else {
        switch (vece) {
        case MO_8:
            /* ??? With zero in a register, use PSHUFB.  */
            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_16:
            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_32:
            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
            /* imm8 operand: all output lanes selected from input lane 0.  */
            tcg_out8(s, 0);
            break;
        case MO_64:
            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg r, TCGReg base, intptr_t offset)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
                                 r, 0, base, offset);
    } else {
        switch (vece) {
        case MO_64:
            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
            break;
        case MO_32:
            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
            break;
        case MO_16:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        case MO_8:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg ret, int64_t arg)
{
    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);

    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
        return;
    }

    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
        if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
        }
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    } else {
        if (type == TCG_TYPE_V64) {
            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
        } else if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
        }
        if (TCG_TARGET_REG_BITS == 64) {
            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
        } else {
            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
        }
    }
}

static void tcg_out_movi_vec(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
        return;
    }

    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
    if (TCG_TARGET_REG_BITS == 64) {
        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
    } else {
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    }
}

static void tcg_out_movi_int(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    tcg_target_long diff;

    if (arg == 0) {
        tgen_arithr(s, ARITH_XOR, ret, ret);
        return;
    }
    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
        tcg_out32(s, arg);
        return;
    }
    if (arg == (int32_t)arg) {
        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
        tcg_out32(s, arg);
        return;
    }

    /* Try a 7-byte pc-relative lea before the 10-byte movq.  */
    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
    if (diff == (int32_t)diff) {
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
        tcg_out32(s, diff);
        return;
    }

    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
    tcg_out64(s, arg);
}
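/*
 * Size comparison (editor's illustration): the ladder above prefers the
 * 2-byte "33 c0" xorl for zero, the 5-byte "b8 imm32" movl for values
 * that zero-extend from 32 bits, the 7-byte "48 c7 c0 imm32" movq for
 * values that sign-extend, a 7-byte rip-relative lea when the constant
 * lies within +/-2GB of the code, and only then the full 10-byte movabsq.
 */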
static void tcg_out_movi(TCGContext *s, TCGType type,
                         TCGReg ret, tcg_target_long arg)
{
    switch (type) {
    case TCG_TYPE_I32:
#if TCG_TARGET_REG_BITS == 64
    case TCG_TYPE_I64:
#endif
        if (ret < 16) {
            tcg_out_movi_int(s, type, ret, arg);
        } else {
            tcg_out_movi_vec(s, type, ret, arg);
        }
        break;
    default:
        g_assert_not_reached();
    }
}

static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        tcg_abort();
    }
}

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* Given the strength of x86 memory ordering, we only need care for
       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
       faster than "mfence", so don't bother with the SSE insn.  */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);   /* lock prefix */
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}
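/*
 * Encoding note (editor's illustration): the sequence above is
 * "f0 83 0c 24 00" -- lock orl $0, (%esp) -- a locked RMW on the stack
 * that drains the store buffer just like the 3-byte "0f ae f0" mfence,
 * but, as the comment says, measured faster on common cores.
 */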
static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned store.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                 arg, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                        TCGReg base, intptr_t ofs)
{
    int rexw = 0;

    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
        if (val != (int32_t)val) {
            return false;
        }
        rexw = P_REXW;
    } else if (type != TCG_TYPE_I32) {
        return false;
    }
    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
    tcg_out32(s, val);
    return true;
}

static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
{
    /* Propagate an opcode prefix, such as P_DATA16.  */
    int ext = subopc & ~0x7;
    subopc &= 0x7;

    if (count == 1) {
        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
    } else {
        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
        tcg_out8(s, count);
    }
}

static inline void tcg_out_bswap32(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_rolw_8(TCGContext *s, int reg)
{
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
}

static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
{
    /* movzbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
}

static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
{
    /* movsbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
}

static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
{
    /* movzwl */
    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
}

static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
{
    /* movsw[lq] */
    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
}

static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
{
    /* 32-bit mov zero extends.  */
    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
}

static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
{
    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
}

static inline void tcg_out_bswap64(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
}

static void tgen_arithi(TCGContext *s, int c, int r0,
                        tcg_target_long val, int cf)
{
    int rexw = 0;

    if (TCG_TARGET_REG_BITS == 64) {
        rexw = c & -8;
        c &= 7;
    }

    /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
       partial flags update stalls on Pentium4 and are not recommended
       by current Intel optimization manuals.  */
    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
        int is_inc = (c == ARITH_ADD) ^ (val < 0);
        if (TCG_TARGET_REG_BITS == 64) {
            /* The single-byte increment encodings are re-tasked as the
               REX prefixes.  Use the MODRM encoding.  */
            tcg_out_modrm(s, OPC_GRP5 + rexw,
                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
        } else {
            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
        }
        return;
    }

    if (c == ARITH_AND) {
        if (TCG_TARGET_REG_BITS == 64) {
            if (val == 0xffffffffu) {
                tcg_out_ext32u(s, r0, r0);
                return;
            }
            if (val == (uint32_t)val) {
                /* AND with no high bits set can use a 32-bit operation.  */
                rexw = 0;
            }
        }
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_ext8u(s, r0, r0);
            return;
        }
        if (val == 0xffffu) {
            tcg_out_ext16u(s, r0, r0);
            return;
        }
    }

    if (val == (int8_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
        tcg_out8(s, val);
        return;
    }
    if (rexw == 0 || val == (int32_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
        tcg_out32(s, val);
        return;
    }

    tcg_abort();
}
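/*
 * Worked example (editor's illustration): on a 32-bit host,
 * tgen_arithi(s, ARITH_ADD, TCG_REG_EAX, 1, 0) emits the single byte
 * 0x40 (incl %eax); on x86-64 that byte is a REX prefix, so the same
 * request emits "ff c0" via the group-5 /0 encoding instead.
 */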
static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
{
    if (val != 0) {
        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
    }
}

/* Use SMALL != 0 to force a short forward branch.  */
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
{
    int32_t val, val1;

    if (l->has_value) {
        val = tcg_pcrel_diff(s, l->u.value_ptr);
        val1 = val - 2;
        if ((int8_t)val1 == val1) {
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_short);
            } else {
                tcg_out8(s, OPC_JCC_short + opc);
            }
            tcg_out8(s, val1);
        } else {
            if (small) {
                tcg_abort();
            }
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_long);
                tcg_out32(s, val - 5);
            } else {
                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
                tcg_out32(s, val - 6);
            }
        }
    } else if (small) {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_short);
        } else {
            tcg_out8(s, OPC_JCC_short + opc);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
        s->code_ptr += 1;
    } else {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_long);
        } else {
            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
        s->code_ptr += 4;
    }
}
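/*
 * Encoding note (editor's illustration): the "- 2", "- 5" and "- 6"
 * adjustments above subtract the instruction length from the label
 * displacement: jcc/jmp short are 2 bytes (70+cc / eb rel8), jmp long
 * is 5 bytes (e9 rel32), and jcc long is 6 bytes (0f 80+cc rel32).
 */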
static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
                        int const_arg2, int rexw)
{
    if (const_arg2) {
        if (arg2 == 0) {
            /* test r, r */
            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
        } else {
            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
        }
    } else {
        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
    }
}

static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}
#else
/* XXX: we implement it at the target level to avoid having to
   handle cross-basic-block temporaries */
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, int small)
{
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);

    switch (args[4]) {
    case TCG_COND_EQ:
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_next, 1);
        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_NE:
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_this, small);
        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_LT:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LE:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GT:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GE:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LTU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LEU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GTU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GEU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    default:
        tcg_abort();
    }
    tcg_out_label(s, label_next);
}
#endif
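/*
 * Strategy note (editor's illustration): each inequality above is
 * decomposed as "branch if the high words compare true; skip everything
 * if they are unequal (the JNE to label_next); otherwise decide on an
 * unsigned comparison of the low words".  E.g. a 64-bit signed LT on a
 * 32-bit host becomes: cmp-high + jl taken, jne next, cmp-low + jb taken.
 */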
static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    tcg_out_ext8u(s, dest, dest);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    tcg_out_ext8u(s, dest, dest);
}
#else
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg new_args[6];
    TCGLabel *label_true, *label_over;

    memcpy(new_args, args+1, 5*sizeof(TCGArg));

    if (args[0] == args[1] || args[0] == args[2]
        || (!const_args[3] && args[0] == args[3])
        || (!const_args[4] && args[0] == args[4])) {
        /* When the destination overlaps with one of the argument
           registers, don't do anything tricky.  */
        label_true = gen_new_label();
        label_over = gen_new_label();

        new_args[5] = label_arg(label_true);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
        tcg_out_jxx(s, JCC_JMP, label_over, 1);
        tcg_out_label(s, label_true);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
        tcg_out_label(s, label_over);
    } else {
        /* When the destination does not overlap one of the arguments,
           clear the destination first, jump if cond false, and emit an
           increment in the true case.  This results in smaller code.  */
        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);

        label_over = gen_new_label();
        new_args[4] = tcg_invert_cond(new_args[4]);
        new_args[5] = label_arg(label_over);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
        tcg_out_label(s, label_over);
    }
}
#endif

static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
                         TCGReg dest, TCGReg v1)
{
    if (have_cmov) {
        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
    } else {
        TCGLabel *over = gen_new_label();
        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
        tcg_out_label(s, over);
    }
}

static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, 0);
    tcg_out_cmov(s, cond, 0, dest, v1);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
    tcg_out_cmov(s, cond, P_REXW, dest, v1);
}
#endif

static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_bmi1) {
        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(dest != arg2);
        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}

static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_lzcnt) {
        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(!const_a2);
        tcg_debug_assert(dest != arg1);
        tcg_debug_assert(dest != arg2);

        /* Recall that the output of BSR is the index not the count.  */
        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);

        /* Since we have destroyed the flags from BSR, we have to re-test.  */
        tcg_out_cmp(s, arg1, 0, 1, rexw);
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}
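/*
 * Worked example (editor's illustration): for arg1 = 0x00f00000 the
 * highest set bit is bit 23, so BSR returns 23 and the XOR with 31
 * yields 23 ^ 31 = 8, which is exactly clz32(0x00f00000).  The XOR
 * trick works because for any index i in [0, 31], 31 - i == 31 ^ i.
 */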
static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
{
    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;

    if (disp == (int32_t)disp) {
        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
        tcg_out32(s, disp);
    } else {
        /* rip-relative addressing into the constant pool.
           This is 6 + 8 = 14 bytes, as compared to using an
           immediate load, 10 + 6 = 16 bytes, plus we may
           be able to re-use the pool constant for more calls.  */
        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
        tcg_out32(s, 0);
    }
}

static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest)
{
    tcg_out_branch(s, 1, dest);
}

static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
{
    tcg_out_branch(s, 0, dest);
}

static void tcg_out_nopn(TCGContext *s, int n)
{
    int i;
    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
     * "xchg %eax,%eax", forming "xchg %ax,%ax".  All cores accept the
     * duplicate prefix, and all of the interesting recent cores can
     * decode and discard the duplicates in a single cycle.
     */
    tcg_debug_assert(n >= 1);
    for (i = 1; i < n; ++i) {
        tcg_out8(s, 0x66);
    }
    tcg_out8(s, 0x90);
}

#if defined(CONFIG_SOFTMMU)
#include "../tcg-ldst.c.inc"

/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
 *                                     int mmu_idx, uintptr_t ra)
 */
static void * const qemu_ld_helpers[16] = {
    [MO_UB]   = helper_ret_ldub_mmu,
    [MO_LEUW] = helper_le_lduw_mmu,
    [MO_LEUL] = helper_le_ldul_mmu,
    [MO_LEQ]  = helper_le_ldq_mmu,
    [MO_BEUW] = helper_be_lduw_mmu,
    [MO_BEUL] = helper_be_ldul_mmu,
    [MO_BEQ]  = helper_be_ldq_mmu,
};

/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
 *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
 */
static void * const qemu_st_helpers[16] = {
    [MO_UB]   = helper_ret_stb_mmu,
    [MO_LEUW] = helper_le_stw_mmu,
    [MO_LEUL] = helper_le_stl_mmu,
    [MO_LEQ]  = helper_le_stq_mmu,
    [MO_BEUW] = helper_be_stw_mmu,
    [MO_BEUL] = helper_be_stl_mmu,
    [MO_BEQ]  = helper_be_stq_mmu,
};

/* Perform the TLB load and compare.

   Inputs:
   ADDRLO and ADDRHI contain the low and high part of the address.

   MEM_INDEX and S_BITS are the memory context and log2 size of the load.

   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
   This should be offsetof addr_read or addr_write.

   Outputs:
   LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
   positions of the displacements of forward jumps to the TLB miss case.

   Second argument register is loaded with the low part of the address.
   In the TLB hit case, it has been adjusted as indicated by the TLB
   and so is a host address.  In the TLB miss case, it continues to
   hold a guest address.

   First argument register is clobbered.  */

static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
                                    int mem_index, MemOp opc,
                                    tcg_insn_unit **label_ptr, int which)
{
    const TCGReg r0 = TCG_REG_L0;
    const TCGReg r1 = TCG_REG_L1;
    TCGType ttype = TCG_TYPE_I32;
    TCGType tlbtype = TCG_TYPE_I32;
    int trexw = 0, hrexw = 0, tlbrexw = 0;
    unsigned a_bits = get_alignment_bits(opc);
    unsigned s_bits = opc & MO_SIZE;
    unsigned a_mask = (1 << a_bits) - 1;
    unsigned s_mask = (1 << s_bits) - 1;
    target_ulong tlb_mask;

    if (TCG_TARGET_REG_BITS == 64) {
        if (TARGET_LONG_BITS == 64) {
            ttype = TCG_TYPE_I64;
            trexw = P_REXW;
        }
        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
            hrexw = P_REXW;
            if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
                tlbtype = TCG_TYPE_I64;
                tlbrexw = P_REXW;
            }
        }
    }

    tcg_out_mov(s, tlbtype, r0, addrlo);
    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);

    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
                         TLB_MASK_TABLE_OFS(mem_index) +
                         offsetof(CPUTLBDescFast, mask));

    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
                         TLB_MASK_TABLE_OFS(mem_index) +
                         offsetof(CPUTLBDescFast, table));

    /* If the required alignment is at least as large as the access, simply
       copy the address and mask.  For lesser alignments, check that we don't
       cross pages for the complete access.  */
    if (a_bits >= s_bits) {
        tcg_out_mov(s, ttype, r1, addrlo);
    } else {
        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
    }
    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
    tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);

    /* cmp 0(r0), r1 */
    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);

    /* Prepare for both the fast path add of the tlb addend, and the slow
       path function argument setup.  */
    tcg_out_mov(s, ttype, r1, addrlo);

    /* jne slow_path */
    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
    label_ptr[0] = s->code_ptr;
    s->code_ptr += 4;

    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
        /* cmp 4(r0), addrhi */
        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);

        /* jne slow_path */
        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
        label_ptr[1] = s->code_ptr;
        s->code_ptr += 4;
    }

    /* TLB Hit.  */

    /* add addend(r0), r1 */
    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
                         offsetof(CPUTLBEntry, addend));
}
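/*
 * Shape of the emitted fast path (editor's sketch, for a 64-bit guest on
 * a 64-bit host; labels are informal):
 *
 *   movq  addrlo, %r0
 *   shrq  $(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS), %r0
 *   andq  tlb_mask(env), %r0          # mask down to a valid TLB index
 *   addq  tlb_table(env), %r0         # r0 = &tlb_entry[index]
 *   leaq  (s_mask - a_mask)(addrlo), %r1
 *   andq  $(TARGET_PAGE_MASK | a_mask), %r1
 *   cmpq  which(%r0), %r1             # page and alignment check
 *   movq  addrlo, %r1
 *   jne   slow_path
 *   addq  addend(%r0), %r1            # r1 = host address
 */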
1740 * Record the context of a call to the out of line helper code for the slow path
1741 * for a load or store, so that we can later generate the correct helper code
1743 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
1745 TCGReg datalo, TCGReg datahi,
1746 TCGReg addrlo, TCGReg addrhi,
1747 tcg_insn_unit *raddr,
1748 tcg_insn_unit **label_ptr)
1750 TCGLabelQemuLdst *label = new_ldst_label(s);
1752 label->is_ld = is_ld;
1754 label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
1755 label->datalo_reg = datalo;
1756 label->datahi_reg = datahi;
1757 label->addrlo_reg = addrlo;
1758 label->addrhi_reg = addrhi;
1759 label->raddr = tcg_splitwx_to_rx(raddr);
1760 label->label_ptr[0] = label_ptr[0];
1761 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1762 label->label_ptr[1] = label_ptr[1];
1767 * Generate code for the slow path for a load at the end of block
1769 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1771 TCGMemOpIdx oi = l->oi;
1772 MemOp opc = get_memop(oi);
1774 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1775 int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
1777 /* resolve label address */
1778 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1779 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1780 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1783 if (TCG_TARGET_REG_BITS == 32) {
1786 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1789 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1792 if (TARGET_LONG_BITS == 64) {
1793 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1797 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1800 tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1802 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1803 /* The second argument is already loaded with addrlo. */
1804 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1805 tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1806 (uintptr_t)l->raddr);
1809 tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1811 data_reg = l->datalo_reg;
1812 switch (opc & MO_SSIZE) {
1814 tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
1817 tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
1819 #if TCG_TARGET_REG_BITS == 64
1821 tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1826 /* Note that the helpers have zero-extended to tcg_target_long. */
1828 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1831 if (TCG_TARGET_REG_BITS == 64) {
1832 tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1833 } else if (data_reg == TCG_REG_EDX) {
1834 /* xchg %edx, %eax */
1835 tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1836 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1838 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1839 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1846 /* Jump to the code corresponding to next IR of qemu_st */
1847 tcg_out_jmp(s, l->raddr);
1852 * Generate code for the slow path for a store at the end of block
1854 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1856 TCGMemOpIdx oi = l->oi;
1857 MemOp opc = get_memop(oi);
1858 MemOp s_bits = opc & MO_SIZE;
1859 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1862 /* resolve label address */
1863 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1864 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1865 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1868 if (TCG_TARGET_REG_BITS == 32) {
1871 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1874 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1877 if (TARGET_LONG_BITS == 64) {
1878 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1882 tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1885 if (s_bits == MO_64) {
1886 tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1890 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1893 retaddr = TCG_REG_EAX;
1894 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1895 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1897 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1898 /* The second argument is already loaded with addrlo. */
1899 tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1900 tcg_target_call_iarg_regs[2], l->datalo_reg);
1901 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1903 if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1904 retaddr = tcg_target_call_iarg_regs[4];
1905 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1907 retaddr = TCG_REG_RAX;
1908 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1909 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1910 TCG_TARGET_CALL_STACK_OFFSET);
1914 /* "Tail call" to the helper, with the return address back inline. */
1915 tcg_out_push(s, retaddr);
1916 tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1919 #elif TCG_TARGET_REG_BITS == 32
1920 # define x86_guest_base_seg 0
1921 # define x86_guest_base_index -1
1922 # define x86_guest_base_offset guest_base
1924 static int x86_guest_base_seg;
1925 static int x86_guest_base_index = -1;
1926 static int32_t x86_guest_base_offset;
1927 # if defined(__x86_64__) && defined(__linux__)
1928 # include <asm/prctl.h>
1929 # include <sys/prctl.h>
1930 int arch_prctl(int code, unsigned long addr);
1931 static inline int setup_guest_base_seg(void)
1933 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1938 # elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
1939 # include <machine/sysarch.h>
1940 static inline int setup_guest_base_seg(void)
1942 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1948 static inline int setup_guest_base_seg(void)
1953 #endif /* SOFTMMU */
1955 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1956 TCGReg base, int index, intptr_t ofs,
1957 int seg, bool is64, MemOp memop)
1959 bool use_movbe = false;
1960 int rexw = is64 * P_REXW;
1961 int movop = OPC_MOVL_GvEv;
1963 /* Do big-endian loads with movbe. */
1964 if (memop & MO_BSWAP) {
1965 tcg_debug_assert(have_movbe);
1967 movop = OPC_MOVBE_GyMy;
1970 switch (memop & MO_SSIZE) {
1972 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
1973 base, index, 0, ofs);
1976 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
1977 base, index, 0, ofs);
1981 /* There is no extending movbe; only low 16-bits are modified. */
1982 if (datalo != base && datalo != index) {
1983 /* XOR breaks dependency chains. */
1984 tgen_arithr(s, ARITH_XOR, datalo, datalo);
1985 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1986 datalo, base, index, 0, ofs);
1988 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1989 datalo, base, index, 0, ofs);
1990 tcg_out_ext16u(s, datalo, datalo);
1993 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1994 base, index, 0, ofs);
1999 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2000 datalo, base, index, 0, ofs);
2001 tcg_out_ext16s(s, datalo, datalo, rexw);
2003 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
2004 datalo, base, index, 0, ofs);
2008 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2010 #if TCG_TARGET_REG_BITS == 64
2013 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + seg, datalo,
2014 base, index, 0, ofs);
2015 tcg_out_ext32s(s, datalo, datalo);
2017 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
2018 base, index, 0, ofs);
2023 if (TCG_TARGET_REG_BITS == 64) {
2024 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2025 base, index, 0, ofs);
2032 if (base != datalo) {
2033 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2034 base, index, 0, ofs);
2035 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2036 base, index, 0, ofs + 4);
2038 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2039 base, index, 0, ofs + 4);
2040 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2041 base, index, 0, ofs);
2046 g_assert_not_reached();
2050 /* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
2051 EAX. It will be useful once fixed registers globals are less
2053 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
2055 TCGReg datalo, datahi, addrlo;
2056 TCGReg addrhi __attribute__((unused));
2059 #if defined(CONFIG_SOFTMMU)
2061 tcg_insn_unit *label_ptr[2];
2065 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2067 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2069 opc = get_memop(oi);
2071 #if defined(CONFIG_SOFTMMU)
2072 mem_index = get_mmuidx(oi);
2074 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2075 label_ptr, offsetof(CPUTLBEntry, addr_read));
2078 tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
2080 /* Record the current context of a load into ldst label */
2081 add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2082 s->code_ptr, label_ptr);
2084 tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2085 x86_guest_base_offset, x86_guest_base_seg,
2090 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2091 TCGReg base, int index, intptr_t ofs,
2092 int seg, MemOp memop)
2094 bool use_movbe = false;
2095 int movop = OPC_MOVL_EvGv;
2098 * Do big-endian stores with movbe or softmmu.
2099 * User-only without movbe will have its swapping done generically.
2101 if (memop & MO_BSWAP) {
2102 tcg_debug_assert(have_movbe);
2104 movop = OPC_MOVBE_MyGy;
2107 switch (memop & MO_SIZE) {
2109 /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2110 tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2111 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2112 datalo, base, index, 0, ofs);
2115 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
2116 base, index, 0, ofs);
2119 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2122 if (TCG_TARGET_REG_BITS == 64) {
2123 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2124 base, index, 0, ofs);
2131 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2132 base, index, 0, ofs);
2133 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2134 base, index, 0, ofs + 4);
2138 g_assert_not_reached();
2142 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2144 TCGReg datalo, datahi, addrlo;
2145 TCGReg addrhi __attribute__((unused));
2148 #if defined(CONFIG_SOFTMMU)
2150 tcg_insn_unit *label_ptr[2];
2154 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2156 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2158 opc = get_memop(oi);
2160 #if defined(CONFIG_SOFTMMU)
2161 mem_index = get_mmuidx(oi);
2163 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2164 label_ptr, offsetof(CPUTLBEntry, addr_write));
2167 tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2169 /* Record the current context of a store into ldst label */
2170 add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
2171 s->code_ptr, label_ptr);
2173 tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2174 x86_guest_base_offset, x86_guest_base_seg, opc);
2178 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2179 const TCGArg args[TCG_MAX_OP_ARGS],
2180 const int const_args[TCG_MAX_OP_ARGS])
2183 int c, const_a2, vexop, rexw = 0;
2185 #if TCG_TARGET_REG_BITS == 64
2186 # define OP_32_64(x) \
2187 case glue(glue(INDEX_op_, x), _i64): \
2188 rexw = P_REXW; /* FALLTHRU */ \
2189 case glue(glue(INDEX_op_, x), _i32)
2191 # define OP_32_64(x) \
2192 case glue(glue(INDEX_op_, x), _i32)
2195 /* Hoist the loads of the most common arguments. */
2199 const_a2 = const_args[2];
2202 case INDEX_op_exit_tb:
2203 /* Reuse the zeroing that exists for goto_ptr. */
2205 tcg_out_jmp(s, tcg_code_gen_epilogue);
2207 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2208 tcg_out_jmp(s, tb_ret_addr);
2211 case INDEX_op_goto_tb:
2212 if (s->tb_jmp_insn_offset) {
2213 /* direct jump method */
2215 /* jump displacement must be aligned for atomic patching;
2216 * see if we need to add extra nops before jump
2218 gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2220 tcg_out_nopn(s, gap - 1);
2222 tcg_out8(s, OPC_JMP_long); /* jmp im */
2223 s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2226 /* indirect jump method */
2227 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2228 (intptr_t)(s->tb_jmp_target_addr + a0));
2230 set_jmp_reset_offset(s, a0);
2232 case INDEX_op_goto_ptr:
2233 /* jmp to the given host address (could be epilogue) */
2234 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2237 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2240 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
2241 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2244 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2247 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
2248 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2251 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2253 #if TCG_TARGET_REG_BITS == 64
2254 case INDEX_op_ld32u_i64:
2256 case INDEX_op_ld_i32:
2257 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2261 if (const_args[0]) {
2262 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2265 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2269 if (const_args[0]) {
2270 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2273 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2276 #if TCG_TARGET_REG_BITS == 64
2277 case INDEX_op_st32_i64:
2279 case INDEX_op_st_i32:
2280 if (const_args[0]) {
2281 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2284 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2289 /* For 3-operand addition, use LEA. */
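/* e.g. add a0,a1,a2 -> lea (a1,a2),a0; add a0,a1,$c -> lea $c(a1),a0 */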
2294 } else if (a0 == a2) {
2295 /* Watch out for dest = src + dest, since we've removed
2296 the matching constraint on the add. */
2297 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2301 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2320 tgen_arithi(s, c + rexw, a0, a2, 0);
2322 tgen_arithr(s, c + rexw, a0, a2);
2328 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2329 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2331 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2339 if (val == (int8_t)val) {
2340 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2343 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2347 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2352 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2355 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2359 /* For small constant 3-operand shift, use LEA. */
2360 if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2362 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2363 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2365 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2366 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2372 goto gen_shift_maybe_vex;
2376 goto gen_shift_maybe_vex;
2380 goto gen_shift_maybe_vex;
2387 gen_shift_maybe_vex:
2390 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2393 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2398 tcg_out_shifti(s, c + rexw, a0, a2);
2400 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2405 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2408 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2411 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2414 case INDEX_op_brcond_i32:
2415 tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2417 case INDEX_op_setcond_i32:
2418 tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2420 case INDEX_op_movcond_i32:
2421 tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
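/*
 * For bswap16, a2 holds the TCG_BSWAP_* flags: OS requires a
 * sign-extended result, OZ a zero-extended one, and IZ promises a
 * zero-extended input.  Swapping all 32 (or 64) bits and shifting
 * the value back down arithmetically or logically satisfies OS and
 * OZ; when the input is already zero-extended, or no extension is
 * required at all, the single rolw $8 rotate suffices.
 */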
2425 if (a2 & TCG_BSWAP_OS) {
2426 /* Output must be sign-extended. */
2428 tcg_out_bswap64(s, a0);
2429 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2431 tcg_out_bswap32(s, a0);
2432 tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2434 } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2435 /* Output must be zero-extended, but input isn't. */
2436 tcg_out_bswap32(s, a0);
2437 tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2439 tcg_out_rolw_8(s, a0);
2443 tcg_out_bswap32(s, a0);
2444 if (rexw && (a2 & TCG_BSWAP_OS)) {
2445 tcg_out_ext32s(s, a0, a0);
2450 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2453 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2457 tcg_out_ext8s(s, a0, a1, rexw);
2460 tcg_out_ext16s(s, a0, a1, rexw);
2463 tcg_out_ext8u(s, a0, a1);
2466 tcg_out_ext16u(s, a0, a1);
2469 case INDEX_op_qemu_ld_i32:
2470 tcg_out_qemu_ld(s, args, 0);
2472 case INDEX_op_qemu_ld_i64:
2473 tcg_out_qemu_ld(s, args, 1);
2475 case INDEX_op_qemu_st_i32:
2476 case INDEX_op_qemu_st8_i32:
2477 tcg_out_qemu_st(s, args, 0);
2479 case INDEX_op_qemu_st_i64:
2480 tcg_out_qemu_st(s, args, 1);
2484 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2487 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2490 if (const_args[4]) {
2491 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2493 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2495 if (const_args[5]) {
2496 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2498 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2502 if (const_args[4]) {
2503 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2505 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2507 if (const_args[5]) {
2508 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2510 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2514 #if TCG_TARGET_REG_BITS == 32
2515 case INDEX_op_brcond2_i32:
2516 tcg_out_brcond2(s, args, const_args, 0);
2518 case INDEX_op_setcond2_i32:
2519 tcg_out_setcond2(s, args, const_args);
2521 #else /* TCG_TARGET_REG_BITS == 64 */
2522 case INDEX_op_ld32s_i64:
2523 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2525 case INDEX_op_ld_i64:
2526 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2528 case INDEX_op_st_i64:
2529 if (const_args[0]) {
2530 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2533 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2537 case INDEX_op_brcond_i64:
2538 tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2540 case INDEX_op_setcond_i64:
2541 tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2543 case INDEX_op_movcond_i64:
2544 tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2547 case INDEX_op_bswap64_i64:
2548 tcg_out_bswap64(s, a0);
2550 case INDEX_op_extu_i32_i64:
2551 case INDEX_op_ext32u_i64:
2552 case INDEX_op_extrl_i64_i32:
2553 tcg_out_ext32u(s, a0, a1);
2555 case INDEX_op_ext_i32_i64:
2556 case INDEX_op_ext32s_i64:
2557 tcg_out_ext32s(s, a0, a1);
2559 case INDEX_op_extrh_i64_i32:
2560 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2565 if (args[3] == 0 && args[4] == 8) {
2566 /* load bits 0..7 */
2567 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2568 } else if (args[3] == 8 && args[4] == 8) {
2569 /* load bits 8..15 */
2570 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2571 } else if (args[3] == 0 && args[4] == 16) {
2572 /* load bits 0..15 */
2573 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2579 case INDEX_op_extract_i64:
2580 if (a2 + args[3] == 32) {
2581 /* This is a 32-bit zero-extending right shift. */
2582 tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2583 tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2587 case INDEX_op_extract_i32:
/* On the off-chance that we can use the high-byte registers, do so.
   Otherwise we emit the same ext16 + shift pattern that we would
   have gotten from the normal tcg-op.c expansion. */
2591 tcg_debug_assert(a2 == 8 && args[3] == 8);
2592 if (a1 < 4 && a0 < 8) {
2593 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2595 tcg_out_ext16u(s, a0, a1);
2596 tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2600 case INDEX_op_sextract_i32:
/* We don't implement sextract_i64, as we cannot sign-extend to
   64 bits without using the REX prefix, which explicitly excludes
   access to the high-byte registers. */
2604 tcg_debug_assert(a2 == 8 && args[3] == 8);
2605 if (a1 < 4 && a0 < 8) {
2606 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2608 tcg_out_ext16s(s, a0, a1, 0);
2609 tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2614 /* Note that SHRD outputs to the r/m operand. */
2615 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2616 tcg_out8(s, args[3]);
2622 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
2623 case INDEX_op_mov_i64:
2624 case INDEX_op_call: /* Always emitted via tcg_out_call. */
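/*
 * Emit host code for a single vector opcode.  In the lookup tables
 * below, indexed by element size (vece), OPC_UD2 marks element
 * sizes for which no instruction exists; tcg_can_emit_vec_op()
 * keeps those combinations from reaching this function, and the
 * tcg_debug_assert below double-checks it.
 */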
2632 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2633 unsigned vecl, unsigned vece,
2634 const TCGArg args[TCG_MAX_OP_ARGS],
2635 const int const_args[TCG_MAX_OP_ARGS])
2637 static int const add_insn[4] = {
2638 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2640 static int const ssadd_insn[4] = {
2641 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2643 static int const usadd_insn[4] = {
2644 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2646 static int const sub_insn[4] = {
2647 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2649 static int const sssub_insn[4] = {
2650 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2652 static int const ussub_insn[4] = {
2653 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2655 static int const mul_insn[4] = {
2656 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
2658 static int const shift_imm_insn[4] = {
2659 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2661 static int const cmpeq_insn[4] = {
2662 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2664 static int const cmpgt_insn[4] = {
2665 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2667 static int const punpckl_insn[4] = {
2668 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2670 static int const punpckh_insn[4] = {
2671 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2673 static int const packss_insn[4] = {
2674 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2676 static int const packus_insn[4] = {
2677 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2679 static int const smin_insn[4] = {
2680 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
2682 static int const smax_insn[4] = {
2683 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
2685 static int const umin_insn[4] = {
2686 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
2688 static int const umax_insn[4] = {
2689 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
2691 static int const shlv_insn[4] = {
2692 /* TODO: AVX512 adds support for MO_16. */
2693 OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
2695 static int const shrv_insn[4] = {
2696 /* TODO: AVX512 adds support for MO_16. */
2697 OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
2699 static int const sarv_insn[4] = {
2700 /* TODO: AVX512 adds support for MO_16, MO_64. */
2701 OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
2703 static int const shls_insn[4] = {
2704 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2706 static int const shrs_insn[4] = {
2707 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2709 static int const sars_insn[4] = {
2710 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2
2712 static int const abs_insn[4] = {
2713 /* TODO: AVX512 adds support for MO_64. */
2714 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
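/* vecl 0/1/2 selects a 64/128/256-bit vector type, in order. */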
2717 TCGType type = vecl + TCG_TYPE_V64;
2726 case INDEX_op_add_vec:
2727 insn = add_insn[vece];
2729 case INDEX_op_ssadd_vec:
2730 insn = ssadd_insn[vece];
2732 case INDEX_op_usadd_vec:
2733 insn = usadd_insn[vece];
2735 case INDEX_op_sub_vec:
2736 insn = sub_insn[vece];
2738 case INDEX_op_sssub_vec:
2739 insn = sssub_insn[vece];
2741 case INDEX_op_ussub_vec:
2742 insn = ussub_insn[vece];
2744 case INDEX_op_mul_vec:
2745 insn = mul_insn[vece];
2747 case INDEX_op_and_vec:
2750 case INDEX_op_or_vec:
2753 case INDEX_op_xor_vec:
2756 case INDEX_op_smin_vec:
2757 insn = smin_insn[vece];
2759 case INDEX_op_umin_vec:
2760 insn = umin_insn[vece];
2762 case INDEX_op_smax_vec:
2763 insn = smax_insn[vece];
2765 case INDEX_op_umax_vec:
2766 insn = umax_insn[vece];
2768 case INDEX_op_shlv_vec:
2769 insn = shlv_insn[vece];
2771 case INDEX_op_shrv_vec:
2772 insn = shrv_insn[vece];
2774 case INDEX_op_sarv_vec:
2775 insn = sarv_insn[vece];
2777 case INDEX_op_shls_vec:
2778 insn = shls_insn[vece];
2780 case INDEX_op_shrs_vec:
2781 insn = shrs_insn[vece];
2783 case INDEX_op_sars_vec:
2784 insn = sars_insn[vece];
2786 case INDEX_op_x86_punpckl_vec:
2787 insn = punpckl_insn[vece];
2789 case INDEX_op_x86_punpckh_vec:
2790 insn = punpckh_insn[vece];
2792 case INDEX_op_x86_packss_vec:
2793 insn = packss_insn[vece];
2795 case INDEX_op_x86_packus_vec:
2796 insn = packus_insn[vece];
2798 #if TCG_TARGET_REG_BITS == 32
2799 case INDEX_op_dup2_vec:
2800 /* First merge the two 32-bit inputs to a single 64-bit element. */
2801 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
2802 /* Then replicate the 64-bit elements across the rest of the vector. */
2803 if (type != TCG_TYPE_V64) {
2804 tcg_out_dup_vec(s, type, MO_64, a0, a0);
2808 case INDEX_op_abs_vec:
2809 insn = abs_insn[vece];
2814 tcg_debug_assert(insn != OPC_UD2);
2815 if (type == TCG_TYPE_V256) {
2818 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2821 case INDEX_op_cmp_vec:
2823 if (sub == TCG_COND_EQ) {
2824 insn = cmpeq_insn[vece];
2825 } else if (sub == TCG_COND_GT) {
2826 insn = cmpgt_insn[vece];
2828 g_assert_not_reached();
2832 case INDEX_op_andc_vec:
2834 if (type == TCG_TYPE_V256) {
2837 tcg_out_vex_modrm(s, insn, a0, a2, a1);
2840 case INDEX_op_shli_vec:
2843 case INDEX_op_shri_vec:
2846 case INDEX_op_sari_vec:
2847 tcg_debug_assert(vece != MO_64);
2850 tcg_debug_assert(vece != MO_8);
2851 insn = shift_imm_insn[vece];
2852 if (type == TCG_TYPE_V256) {
2855 tcg_out_vex_modrm(s, insn, sub, a0, a1);
2859 case INDEX_op_ld_vec:
2860 tcg_out_ld(s, type, a0, a1, a2);
2862 case INDEX_op_st_vec:
2863 tcg_out_st(s, type, a0, a1, a2);
2865 case INDEX_op_dupm_vec:
2866 tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
2869 case INDEX_op_x86_shufps_vec:
2873 case INDEX_op_x86_blend_vec:
2874 if (vece == MO_16) {
2876 } else if (vece == MO_32) {
2877 insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2879 g_assert_not_reached();
2883 case INDEX_op_x86_vperm2i128_vec:
2884 insn = OPC_VPERM2I128;
2888 if (type == TCG_TYPE_V256) {
2891 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2895 case INDEX_op_x86_vpblendvb_vec:
2896 insn = OPC_VPBLENDVB;
2897 if (type == TCG_TYPE_V256) {
2900 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2901 tcg_out8(s, args[3] << 4);
2904 case INDEX_op_x86_psrldq_vec:
2905 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
2909 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */
2910 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */
2912 g_assert_not_reached();
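/*
 * Return the register-constraint set for each opcode.  The
 * C_Ox_Iy() forms name x output and y input constraints: r is any
 * general register, q one with a byte-addressable low part, x a
 * vector register, L a general register excluding those reserved
 * for the softmmu slow path, and a digit ties an input to the
 * same register as that output.
 */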
2916 static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
2919 case INDEX_op_goto_ptr:
2922 case INDEX_op_ld8u_i32:
2923 case INDEX_op_ld8u_i64:
2924 case INDEX_op_ld8s_i32:
2925 case INDEX_op_ld8s_i64:
2926 case INDEX_op_ld16u_i32:
2927 case INDEX_op_ld16u_i64:
2928 case INDEX_op_ld16s_i32:
2929 case INDEX_op_ld16s_i64:
2930 case INDEX_op_ld_i32:
2931 case INDEX_op_ld32u_i64:
2932 case INDEX_op_ld32s_i64:
2933 case INDEX_op_ld_i64:
2934 return C_O1_I1(r, r);
2936 case INDEX_op_st8_i32:
2937 case INDEX_op_st8_i64:
2938 return C_O0_I2(qi, r);
2940 case INDEX_op_st16_i32:
2941 case INDEX_op_st16_i64:
2942 case INDEX_op_st_i32:
2943 case INDEX_op_st32_i64:
2944 return C_O0_I2(ri, r);
2946 case INDEX_op_st_i64:
2947 return C_O0_I2(re, r);
2949 case INDEX_op_add_i32:
2950 case INDEX_op_add_i64:
2951 return C_O1_I2(r, r, re);
2953 case INDEX_op_sub_i32:
2954 case INDEX_op_sub_i64:
2955 case INDEX_op_mul_i32:
2956 case INDEX_op_mul_i64:
2957 case INDEX_op_or_i32:
2958 case INDEX_op_or_i64:
2959 case INDEX_op_xor_i32:
2960 case INDEX_op_xor_i64:
2961 return C_O1_I2(r, 0, re);
2963 case INDEX_op_and_i32:
2964 case INDEX_op_and_i64:
2965 return C_O1_I2(r, 0, reZ);
2967 case INDEX_op_andc_i32:
2968 case INDEX_op_andc_i64:
2969 return C_O1_I2(r, r, rI);
2971 case INDEX_op_shl_i32:
2972 case INDEX_op_shl_i64:
2973 case INDEX_op_shr_i32:
2974 case INDEX_op_shr_i64:
2975 case INDEX_op_sar_i32:
2976 case INDEX_op_sar_i64:
2977 return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
2979 case INDEX_op_rotl_i32:
2980 case INDEX_op_rotl_i64:
2981 case INDEX_op_rotr_i32:
2982 case INDEX_op_rotr_i64:
2983 return C_O1_I2(r, 0, ci);
2985 case INDEX_op_brcond_i32:
2986 case INDEX_op_brcond_i64:
2987 return C_O0_I2(r, re);
2989 case INDEX_op_bswap16_i32:
2990 case INDEX_op_bswap16_i64:
2991 case INDEX_op_bswap32_i32:
2992 case INDEX_op_bswap32_i64:
2993 case INDEX_op_bswap64_i64:
2994 case INDEX_op_neg_i32:
2995 case INDEX_op_neg_i64:
2996 case INDEX_op_not_i32:
2997 case INDEX_op_not_i64:
2998 case INDEX_op_extrh_i64_i32:
2999 return C_O1_I1(r, 0);
3001 case INDEX_op_ext8s_i32:
3002 case INDEX_op_ext8s_i64:
3003 case INDEX_op_ext8u_i32:
3004 case INDEX_op_ext8u_i64:
3005 return C_O1_I1(r, q);
3007 case INDEX_op_ext16s_i32:
3008 case INDEX_op_ext16s_i64:
3009 case INDEX_op_ext16u_i32:
3010 case INDEX_op_ext16u_i64:
3011 case INDEX_op_ext32s_i64:
3012 case INDEX_op_ext32u_i64:
3013 case INDEX_op_ext_i32_i64:
3014 case INDEX_op_extu_i32_i64:
3015 case INDEX_op_extrl_i64_i32:
3016 case INDEX_op_extract_i32:
3017 case INDEX_op_extract_i64:
3018 case INDEX_op_sextract_i32:
3019 case INDEX_op_ctpop_i32:
3020 case INDEX_op_ctpop_i64:
3021 return C_O1_I1(r, r);
3023 case INDEX_op_extract2_i32:
3024 case INDEX_op_extract2_i64:
3025 return C_O1_I2(r, 0, r);
3027 case INDEX_op_deposit_i32:
3028 case INDEX_op_deposit_i64:
3029 return C_O1_I2(Q, 0, Q);
3031 case INDEX_op_setcond_i32:
3032 case INDEX_op_setcond_i64:
3033 return C_O1_I2(q, r, re);
3035 case INDEX_op_movcond_i32:
3036 case INDEX_op_movcond_i64:
3037 return C_O1_I4(r, r, re, r, 0);
3039 case INDEX_op_div2_i32:
3040 case INDEX_op_div2_i64:
3041 case INDEX_op_divu2_i32:
3042 case INDEX_op_divu2_i64:
3043 return C_O2_I3(a, d, 0, 1, r);
3045 case INDEX_op_mulu2_i32:
3046 case INDEX_op_mulu2_i64:
3047 case INDEX_op_muls2_i32:
3048 case INDEX_op_muls2_i64:
3049 return C_O2_I2(a, d, a, r);
3051 case INDEX_op_add2_i32:
3052 case INDEX_op_add2_i64:
3053 case INDEX_op_sub2_i32:
3054 case INDEX_op_sub2_i64:
3055 return C_O2_I4(r, r, 0, 1, re, re);
3057 case INDEX_op_ctz_i32:
3058 case INDEX_op_ctz_i64:
3059 return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3061 case INDEX_op_clz_i32:
3062 case INDEX_op_clz_i64:
3063 return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3065 case INDEX_op_qemu_ld_i32:
3066 return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3067 ? C_O1_I1(r, L) : C_O1_I2(r, L, L));
3069 case INDEX_op_qemu_st_i32:
3070 return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3071 ? C_O0_I2(L, L) : C_O0_I3(L, L, L));
3072 case INDEX_op_qemu_st8_i32:
3073 return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3074 ? C_O0_I2(s, L) : C_O0_I3(s, L, L));
3076 case INDEX_op_qemu_ld_i64:
3077 return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
3078 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L)
3079 : C_O2_I2(r, r, L, L));
3081 case INDEX_op_qemu_st_i64:
3082 return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L)
3083 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(L, L, L)
3084 : C_O0_I4(L, L, L, L));
3086 case INDEX_op_brcond2_i32:
3087 return C_O0_I4(r, r, ri, ri);
3089 case INDEX_op_setcond2_i32:
3090 return C_O1_I4(r, r, r, ri, ri);
3092 case INDEX_op_ld_vec:
3093 case INDEX_op_dupm_vec:
3094 return C_O1_I1(x, r);
3096 case INDEX_op_st_vec:
3097 return C_O0_I2(x, r);
3099 case INDEX_op_add_vec:
3100 case INDEX_op_sub_vec:
3101 case INDEX_op_mul_vec:
3102 case INDEX_op_and_vec:
3103 case INDEX_op_or_vec:
3104 case INDEX_op_xor_vec:
3105 case INDEX_op_andc_vec:
3106 case INDEX_op_ssadd_vec:
3107 case INDEX_op_usadd_vec:
3108 case INDEX_op_sssub_vec:
3109 case INDEX_op_ussub_vec:
3110 case INDEX_op_smin_vec:
3111 case INDEX_op_umin_vec:
3112 case INDEX_op_smax_vec:
3113 case INDEX_op_umax_vec:
3114 case INDEX_op_shlv_vec:
3115 case INDEX_op_shrv_vec:
3116 case INDEX_op_sarv_vec:
3117 case INDEX_op_shls_vec:
3118 case INDEX_op_shrs_vec:
3119 case INDEX_op_sars_vec:
3120 case INDEX_op_rotls_vec:
3121 case INDEX_op_cmp_vec:
3122 case INDEX_op_x86_shufps_vec:
3123 case INDEX_op_x86_blend_vec:
3124 case INDEX_op_x86_packss_vec:
3125 case INDEX_op_x86_packus_vec:
3126 case INDEX_op_x86_vperm2i128_vec:
3127 case INDEX_op_x86_punpckl_vec:
3128 case INDEX_op_x86_punpckh_vec:
3129 #if TCG_TARGET_REG_BITS == 32
3130 case INDEX_op_dup2_vec:
3132 return C_O1_I2(x, x, x);
3134 case INDEX_op_abs_vec:
3135 case INDEX_op_dup_vec:
3136 case INDEX_op_shli_vec:
3137 case INDEX_op_shri_vec:
3138 case INDEX_op_sari_vec:
3139 case INDEX_op_x86_psrldq_vec:
3140 return C_O1_I1(x, x);
3142 case INDEX_op_x86_vpblendvb_vec:
3143 return C_O1_I3(x, x, x, x);
3146 g_assert_not_reached();
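/*
 * Report what the backend can do with a given vector opcode:
 * 1 if it is supported directly, 0 if not at all, and -1 if it
 * is supported only via expansion through tcg_expand_vec_op().
 */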
3150 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3153 case INDEX_op_add_vec:
3154 case INDEX_op_sub_vec:
3155 case INDEX_op_and_vec:
3156 case INDEX_op_or_vec:
3157 case INDEX_op_xor_vec:
3158 case INDEX_op_andc_vec:
3160 case INDEX_op_rotli_vec:
3161 case INDEX_op_cmp_vec:
3162 case INDEX_op_cmpsel_vec:
3165 case INDEX_op_shli_vec:
3166 case INDEX_op_shri_vec:
3167 /* We must expand the operation for MO_8. */
3168 return vece == MO_8 ? -1 : 1;
3170 case INDEX_op_sari_vec:
3171 /* We must expand the operation for MO_8. */
3175 /* We can emulate this for MO_64, but it does not pay off
3176 unless we're producing at least 4 values. */
3177 if (vece == MO_64) {
3178 return type >= TCG_TYPE_V256 ? -1 : 0;
3182 case INDEX_op_shls_vec:
3183 case INDEX_op_shrs_vec:
3184 return vece >= MO_16;
3185 case INDEX_op_sars_vec:
3186 return vece >= MO_16 && vece <= MO_32;
3187 case INDEX_op_rotls_vec:
3188 return vece >= MO_16 ? -1 : 0;
3190 case INDEX_op_shlv_vec:
3191 case INDEX_op_shrv_vec:
3192 return have_avx2 && vece >= MO_32;
3193 case INDEX_op_sarv_vec:
3194 return have_avx2 && vece == MO_32;
3195 case INDEX_op_rotlv_vec:
3196 case INDEX_op_rotrv_vec:
3197 return have_avx2 && vece >= MO_32 ? -1 : 0;
3199 case INDEX_op_mul_vec:
3201 /* We can expand the operation for MO_8. */
3204 if (vece == MO_64) {
3209 case INDEX_op_ssadd_vec:
3210 case INDEX_op_usadd_vec:
3211 case INDEX_op_sssub_vec:
3212 case INDEX_op_ussub_vec:
3213 return vece <= MO_16;
3214 case INDEX_op_smin_vec:
3215 case INDEX_op_smax_vec:
3216 case INDEX_op_umin_vec:
3217 case INDEX_op_umax_vec:
3218 case INDEX_op_abs_vec:
3219 return vece <= MO_32;
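/*
 * Expand a shift-by-immediate of byte elements, which has no
 * direct SSE/AVX encoding, in terms of 16-bit element shifts.
 */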
3226 static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3227 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3231 tcg_debug_assert(vece == MO_8);
3233 t1 = tcg_temp_new_vec(type);
3234 t2 = tcg_temp_new_vec(type);
3237 * Unpack to W, shift, and repack. Tricky bits:
3238 * (1) Use punpck*bw x,x to produce DDCCBBAA,
3239 * i.e. duplicate in other half of the 16-bit lane.
3240 * (2) For right-shift, add 8 so that the high half of the lane
3241 * becomes zero. For left-shift, and left-rotate, we must
3242 * shift up and down again.
3243 * (3) Step 2 leaves high half zero such that PACKUSWB
 *     (pack with unsigned saturation) does not modify the quantity.
 */
3247 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3248 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3249 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3250 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3252 if (opc != INDEX_op_rotli_vec) {
3255 if (opc == INDEX_op_shri_vec) {
3256 tcg_gen_shri_vec(MO_16, t1, t1, imm);
3257 tcg_gen_shri_vec(MO_16, t2, t2, imm);
3259 tcg_gen_shli_vec(MO_16, t1, t1, imm);
3260 tcg_gen_shli_vec(MO_16, t2, t2, imm);
3261 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3262 tcg_gen_shri_vec(MO_16, t2, t2, 8);
3265 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3266 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3267 tcg_temp_free_vec(t1);
3268 tcg_temp_free_vec(t2);
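/*
 * Expand arithmetic right shift by immediate for the element sizes
 * with no direct encoding: widen MO_8 to 16-bit lanes, and
 * synthesize MO_64 (PSRAQ is AVX512-only) from the available 32-bit
 * arithmetic and 64-bit logical shifts, or from a compare against
 * zero.
 */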
3271 static void expand_vec_sari(TCGType type, unsigned vece,
3272 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3278 /* Unpack to W, shift, and repack, as in expand_vec_shi. */
3279 t1 = tcg_temp_new_vec(type);
3280 t2 = tcg_temp_new_vec(type);
3281 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3282 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3283 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3284 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3285 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3286 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3287 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3288 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3289 tcg_temp_free_vec(t1);
3290 tcg_temp_free_vec(t2);
3296 * We can emulate a small sign extend by performing an arithmetic
3297 * 32-bit shift and overwriting the high half of a 64-bit logical
3298 * shift. Note that the ISA says shift of 32 is valid, but TCG
3299 * does not, so we have to bound the smaller shift -- we get the
3300 * same result in the high half either way.
3302 t1 = tcg_temp_new_vec(type);
3303 tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3304 tcg_gen_shri_vec(MO_64, v0, v1, imm);
3305 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3306 tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3307 tcgv_vec_arg(t1), 0xaa);
3308 tcg_temp_free_vec(t1);
3310 /* Otherwise we will need to use a compare vs 0 to produce
3311 * the sign-extend, shift and merge.
3313 t1 = tcg_const_zeros_vec(type);
3314 tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
3315 tcg_gen_shri_vec(MO_64, v0, v1, imm);
3316 tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3317 tcg_gen_or_vec(MO_64, v0, v0, t1);
3318 tcg_temp_free_vec(t1);
3323 g_assert_not_reached();
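/*
 * Expand rotate-left by immediate.  MO_8 reuses the byte shift
 * expansion above; larger elements use the identity
 * rotl(x, i) = (x << i) | (x >> (width - i)).
 */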
3327 static void expand_vec_rotli(TCGType type, unsigned vece,
3328 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3333 expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3337 t = tcg_temp_new_vec(type);
3338 tcg_gen_shli_vec(vece, t, v1, imm);
3339 tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3340 tcg_gen_or_vec(vece, v0, v0, t);
3341 tcg_temp_free_vec(t);
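/*
 * Expand rotate-left by a scalar count: combine a left shift by
 * lsh with a right shift by (-lsh) mod width.  The mask keeps the
 * right-shift count within the range TCG requires.
 */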
3344 static void expand_vec_rotls(TCGType type, unsigned vece,
3345 TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3350 tcg_debug_assert(vece != MO_8);
3352 t = tcg_temp_new_vec(type);
3353 rsh = tcg_temp_new_i32();
3355 tcg_gen_neg_i32(rsh, lsh);
3356 tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3357 tcg_gen_shls_vec(vece, t, v1, lsh);
3358 tcg_gen_shrs_vec(vece, v0, v1, rsh);
3359 tcg_gen_or_vec(vece, v0, v0, t);
3360 tcg_temp_free_vec(t);
3361 tcg_temp_free_i32(rsh);
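/*
 * Expand a rotate by per-element counts:
 * rotl: v0 = (v1 << sh) | (v1 >> (width - sh)), and the mirror
 * image for rotr.  Relies on the AVX2 variable shifts, per
 * tcg_can_emit_vec_op() above.
 */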
3364 static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3365 TCGv_vec v1, TCGv_vec sh, bool right)
3367 TCGv_vec t = tcg_temp_new_vec(type);
3369 tcg_gen_dupi_vec(vece, t, 8 << vece);
3370 tcg_gen_sub_vec(vece, t, t, sh);
3372 tcg_gen_shlv_vec(vece, t, v1, t);
3373 tcg_gen_shrv_vec(vece, v0, v1, sh);
3375 tcg_gen_shrv_vec(vece, t, v1, t);
3376 tcg_gen_shlv_vec(vece, v0, v1, sh);
3378 tcg_gen_or_vec(vece, v0, v0, t);
3379 tcg_temp_free_vec(t);
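/* Expand a multiply of byte elements by widening to 16-bit lanes. */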
3382 static void expand_vec_mul(TCGType type, unsigned vece,
3383 TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3385 TCGv_vec t1, t2, t3, t4, zero;
3387 tcg_debug_assert(vece == MO_8);
3390 * Unpack v1 bytes to words, 0 | x.
3391 * Unpack v2 bytes to words, y | 0.
3392 * This leaves the 8-bit result, x * y, with 8 bits of right padding.
 * Shift logical right by 8 bits to clear the high 8 bits before
3394 * using an unsigned saturated pack.
3396 * The difference between the V64, V128 and V256 cases is merely how
3397 * we distribute the expansion between temporaries.
3401 t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3402 t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3403 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3404 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3405 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3406 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3407 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3408 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3409 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3410 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3411 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3412 tcg_temp_free_vec(t1);
3413 tcg_temp_free_vec(t2);
3418 t1 = tcg_temp_new_vec(type);
3419 t2 = tcg_temp_new_vec(type);
3420 t3 = tcg_temp_new_vec(type);
3421 t4 = tcg_temp_new_vec(type);
3422 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3423 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3424 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3425 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3426 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3427 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3428 tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3429 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3430 tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3431 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3432 tcg_gen_mul_vec(MO_16, t3, t3, t4);
3433 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3434 tcg_gen_shri_vec(MO_16, t3, t3, 8);
3435 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3436 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3437 tcg_temp_free_vec(t1);
3438 tcg_temp_free_vec(t2);
3439 tcg_temp_free_vec(t3);
3440 tcg_temp_free_vec(t4);
3444 g_assert_not_reached();
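/*
 * Lower a vector comparison to the two predicates the ISA provides,
 * PCMPEQ and signed PCMPGT.  The NEED_* fixup flags rewrite the
 * remaining conditions: swap the operands, invert the result, bias
 * both operands by the sign bit so that an unsigned compare becomes
 * a signed one, or compute umin/umax so that an equality test
 * expresses LEU/GEU.  Returns true if the caller must still invert
 * the computed result.
 */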
3448 static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3449 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3458 TCGv_vec t1, t2, t3;
3474 fixup = NEED_SWAP | NEED_INV;
3477 if (vece <= MO_32) {
3480 fixup = NEED_BIAS | NEED_INV;
3484 if (vece <= MO_32) {
3485 fixup = NEED_UMIN | NEED_INV;
3491 if (vece <= MO_32) {
3494 fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3498 if (vece <= MO_32) {
3499 fixup = NEED_UMAX | NEED_INV;
3501 fixup = NEED_BIAS | NEED_SWAP;
3505 g_assert_not_reached();
3508 if (fixup & NEED_INV) {
3509 cond = tcg_invert_cond(cond);
3511 if (fixup & NEED_SWAP) {
3512 t1 = v1, v1 = v2, v2 = t1;
3513 cond = tcg_swap_cond(cond);
3517 if (fixup & (NEED_UMIN | NEED_UMAX)) {
3518 t1 = tcg_temp_new_vec(type);
3519 if (fixup & NEED_UMIN) {
3520 tcg_gen_umin_vec(vece, t1, v1, v2);
3522 tcg_gen_umax_vec(vece, t1, v1, v2);
3526 } else if (fixup & NEED_BIAS) {
3527 t1 = tcg_temp_new_vec(type);
3528 t2 = tcg_temp_new_vec(type);
3529 t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
3530 tcg_gen_sub_vec(vece, t1, v1, t3);
3531 tcg_gen_sub_vec(vece, t2, v2, t3);
3534 cond = tcg_signed_cond(cond);
3537 tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3538 /* Expand directly; do not recurse. */
3539 vec_gen_4(INDEX_op_cmp_vec, type, vece,
3540 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3543 tcg_temp_free_vec(t1);
3545 tcg_temp_free_vec(t2);
3548 return fixup & NEED_INV;
3551 static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3552 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3554 if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3555 tcg_gen_not_vec(vece, v0, v0);
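/*
 * Expand cmpsel as a comparison feeding VPBLENDVB: the compare
 * yields all-ones or all-zero elements, which the blend then uses
 * as a byte mask to select between v3 and v4.
 */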
3559 static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3560 TCGv_vec c1, TCGv_vec c2,
3561 TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3563 TCGv_vec t = tcg_temp_new_vec(type);
3565 if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
/* The compare result must be inverted; compensate by swapping the
   data arguments of the select. */
3568 x = v3, v3 = v4, v4 = x;
3570 vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3571 tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3572 tcgv_vec_arg(v3), tcgv_vec_arg(t));
3573 tcg_temp_free_vec(t);
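/*
 * Entry point from the generic expander for every opcode that
 * tcg_can_emit_vec_op() reported as -1: unpack the operands from
 * the va_list and dispatch to the expanders above.
 */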
3576 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3581 TCGv_vec v0, v1, v2, v3, v4;
3584 v0 = temp_tcgv_vec(arg_temp(a0));
3585 v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3586 a2 = va_arg(va, TCGArg);
3589 case INDEX_op_shli_vec:
3590 case INDEX_op_shri_vec:
3591 expand_vec_shi(type, vece, opc, v0, v1, a2);
3594 case INDEX_op_sari_vec:
3595 expand_vec_sari(type, vece, v0, v1, a2);
3598 case INDEX_op_rotli_vec:
3599 expand_vec_rotli(type, vece, v0, v1, a2);
3602 case INDEX_op_rotls_vec:
3603 expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3606 case INDEX_op_rotlv_vec:
3607 v2 = temp_tcgv_vec(arg_temp(a2));
3608 expand_vec_rotv(type, vece, v0, v1, v2, false);
3610 case INDEX_op_rotrv_vec:
3611 v2 = temp_tcgv_vec(arg_temp(a2));
3612 expand_vec_rotv(type, vece, v0, v1, v2, true);
3615 case INDEX_op_mul_vec:
3616 v2 = temp_tcgv_vec(arg_temp(a2));
3617 expand_vec_mul(type, vece, v0, v1, v2);
3620 case INDEX_op_cmp_vec:
3621 v2 = temp_tcgv_vec(arg_temp(a2));
3622 expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3625 case INDEX_op_cmpsel_vec:
3626 v2 = temp_tcgv_vec(arg_temp(a2));
3627 v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3628 v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3629 expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3639 static const int tcg_target_callee_save_regs[] = {
3640 #if TCG_TARGET_REG_BITS == 64
3649 TCG_REG_R14, /* Currently used for the global env. */
3652 TCG_REG_EBP, /* Currently used for the global env. */
3659 /* Compute frame size via macros, to share between tcg_target_qemu_prologue
3660 and tcg_register_jit. */
3663 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
3664 * (TCG_TARGET_REG_BITS / 8))
3666 #define FRAME_SIZE \
3668 + TCG_STATIC_CALL_ARGS_SIZE \
3669 + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3670 + TCG_TARGET_STACK_ALIGN - 1) \
3671 & ~(TCG_TARGET_STACK_ALIGN - 1))
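/*
 * From high to low addresses, the resulting frame is: the return
 * address and pushed callee-saved registers (PUSH_SIZE bytes),
 * alignment padding, CPU_TEMP_BUF_NLONGS longs of TCG temporary
 * storage, and TCG_STATIC_CALL_ARGS_SIZE bytes for helper-call
 * arguments at the stack pointer.
 */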
3673 /* Generate global QEMU prologue and epilogue code */
3674 static void tcg_target_qemu_prologue(TCGContext *s)
3676 int i, stack_addend;
3680 /* Reserve some stack space, also for TCG temps. */
3681 stack_addend = FRAME_SIZE - PUSH_SIZE;
3682 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3683 CPU_TEMP_BUF_NLONGS * sizeof(long));
3685 /* Save all callee saved registers. */
3686 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3687 tcg_out_push(s, tcg_target_callee_save_regs[i]);
3690 #if TCG_TARGET_REG_BITS == 32
3691 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3692 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3693 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3695 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3696 (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3699 # if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
3701 int seg = setup_guest_base_seg();
3703 x86_guest_base_seg = seg;
3704 } else if (guest_base == (int32_t)guest_base) {
3705 x86_guest_base_offset = guest_base;
3707 /* Choose R12 because, as a base, it requires a SIB byte. */
3708 x86_guest_base_index = TCG_REG_R12;
3709 tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
3710 tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
3714 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3715 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3717 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3721 * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3722 * and fall through to the rest of the epilogue.
3724 tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
3725 tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3728 tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
3730 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3733 tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3735 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3736 tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3738 tcg_out_opc(s, OPC_RET, 0, 0, 0);
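/* Pad with the one-byte NOP, opcode 0x90. */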
3741 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3743 memset(p, 0x90, count);
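/*
 * Probe host CPU features and initialize the register sets.  Each
 * have_* flag probed here gates the use of the corresponding
 * instruction-set extension during code generation.
 */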
3746 static void tcg_target_init(TCGContext *s)
3748 #ifdef CONFIG_CPUID_H
3749 unsigned a, b, c, d, b7 = 0;
3750 int max = __get_cpuid_max(0, 0);
3753 /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */
3754 __cpuid_count(7, 0, a, b7, c, d);
3755 have_bmi1 = (b7 & bit_BMI) != 0;
3756 have_bmi2 = (b7 & bit_BMI2) != 0;
3760 __cpuid(1, a, b, c, d);
/* For 32-bit, it is 99% certain that we're running on hardware
   that supports cmov, but we still need to check.  If cmov is not
   available, we use a small forward branch instead. */
3765 have_cmov = (d & bit_CMOV) != 0;
3768 /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
3769 need to probe for it. */
3770 have_movbe = (c & bit_MOVBE) != 0;
3771 have_popcnt = (c & bit_POPCNT) != 0;
/* There are a number of things we must check before we can be
   sure of not hitting an invalid opcode. */
3775 if (c & bit_OSXSAVE) {
3776 unsigned xcrl, xcrh;
3777 /* The xgetbv instruction is not available to older versions of
3778 * the assembler, so we encode the instruction manually.
3780 asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
3781 if ((xcrl & 6) == 6) {
3782 have_avx1 = (c & bit_AVX) != 0;
3783 have_avx2 = (b7 & bit_AVX2) != 0;
max = __get_cpuid_max(0x80000000, 0);
3790 __cpuid(0x80000001, a, b, c, d);
3791 /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs. */
3792 have_lzcnt = (c & bit_LZCNT) != 0;
3794 #endif /* CONFIG_CPUID_H */
3796 tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3797 if (TCG_TARGET_REG_BITS == 64) {
3798 tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3801 tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3802 tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3805 tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3808 tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
3809 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3810 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3811 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3812 if (TCG_TARGET_REG_BITS == 64) {
3813 #if !defined(_WIN64)
3814 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3815 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3817 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3818 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3819 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3820 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3823 s->reserved_regs = 0;
3824 tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
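/*
 * DWARF CFI description of the frame built by the prologue, so that
 * debuggers can unwind from JIT-generated code back into QEMU;
 * passed to tcg_register_jit() below.
 */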
3829 uint8_t fde_def_cfa[4];
3830 uint8_t fde_reg_ofs[14];
/* We're expecting a 2-byte uleb128 encoded value. */
3834 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
3836 #if !defined(__ELF__)
3837 /* Host machine without ELF. */
3838 #elif TCG_TARGET_REG_BITS == 64
3839 #define ELF_HOST_MACHINE EM_X86_64
3840 static const DebugFrame debug_frame = {
3841 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3844 .h.cie.code_align = 1,
3845 .h.cie.data_align = 0x78, /* sleb128 -8 */
3846 .h.cie.return_column = 16,
3848 /* Total FDE size does not include the "len" member. */
3849 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3852 12, 7, /* DW_CFA_def_cfa %rsp, ... */
3853 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
3857 0x90, 1, /* DW_CFA_offset, %rip, -8 */
3858 /* The following ordering must match tcg_target_callee_save_regs. */
3859 0x86, 2, /* DW_CFA_offset, %rbp, -16 */
3860 0x83, 3, /* DW_CFA_offset, %rbx, -24 */
3861 0x8c, 4, /* DW_CFA_offset, %r12, -32 */
3862 0x8d, 5, /* DW_CFA_offset, %r13, -40 */
3863 0x8e, 6, /* DW_CFA_offset, %r14, -48 */
3864 0x8f, 7, /* DW_CFA_offset, %r15, -56 */
3868 #define ELF_HOST_MACHINE EM_386
3869 static const DebugFrame debug_frame = {
3870 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3873 .h.cie.code_align = 1,
3874 .h.cie.data_align = 0x7c, /* sleb128 -4 */
3875 .h.cie.return_column = 8,
3877 /* Total FDE size does not include the "len" member. */
3878 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3881 12, 4, /* DW_CFA_def_cfa %esp, ... */
3882 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
3886 0x88, 1, /* DW_CFA_offset, %eip, -4 */
3887 /* The following ordering must match tcg_target_callee_save_regs. */
3888 0x85, 2, /* DW_CFA_offset, %ebp, -8 */
3889 0x83, 3, /* DW_CFA_offset, %ebx, -12 */
3890 0x86, 4, /* DW_CFA_offset, %esi, -16 */
3891 0x87, 5, /* DW_CFA_offset, %edi, -20 */
3896 #if defined(ELF_HOST_MACHINE)
3897 void tcg_register_jit(const void *buf, size_t buf_size)
3899 tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));