tcg/i386/tcg-target.c.inc

   1 /*
   2  * Tiny Code Generator for QEMU
   3  *
   4  * Copyright (c) 2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "../tcg-pool.c.inc"
  26
  27 #ifdef CONFIG_DEBUG_TCG
  28 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
  29 #if TCG_TARGET_REG_BITS == 64
  30     "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
  31 #else
  32     "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
  33 #endif
  34     "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
  35     "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
  36 #if TCG_TARGET_REG_BITS == 64
  37     "%xmm8", "%xmm9", "%xmm10", "%xmm11",
  38     "%xmm12", "%xmm13", "%xmm14", "%xmm15",
  39 #endif
  40 };
  41 #endif
  42
  43 static const int tcg_target_reg_alloc_order[] = {
  44 #if TCG_TARGET_REG_BITS == 64
  45     TCG_REG_RBP,
  46     TCG_REG_RBX,
  47     TCG_REG_R12,
  48     TCG_REG_R13,
  49     TCG_REG_R14,
  50     TCG_REG_R15,
  51     TCG_REG_R10,
  52     TCG_REG_R11,
  53     TCG_REG_R9,
  54     TCG_REG_R8,
  55     TCG_REG_RCX,
  56     TCG_REG_RDX,
  57     TCG_REG_RSI,
  58     TCG_REG_RDI,
  59     TCG_REG_RAX,
  60 #else
  61     TCG_REG_EBX,
  62     TCG_REG_ESI,
  63     TCG_REG_EDI,
  64     TCG_REG_EBP,
  65     TCG_REG_ECX,
  66     TCG_REG_EDX,
  67     TCG_REG_EAX,
  68 #endif
  69     TCG_REG_XMM0,
  70     TCG_REG_XMM1,
  71     TCG_REG_XMM2,
  72     TCG_REG_XMM3,
  73     TCG_REG_XMM4,
  74     TCG_REG_XMM5,
  75 #ifndef _WIN64
  76     /* The Win64 ABI has xmm6-xmm15 as caller-saves, and we do not save
  77        any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
  78     TCG_REG_XMM6,
  79     TCG_REG_XMM7,
  80 #if TCG_TARGET_REG_BITS == 64
  81     TCG_REG_XMM8,
  82     TCG_REG_XMM9,
  83     TCG_REG_XMM10,
  84     TCG_REG_XMM11,
  85     TCG_REG_XMM12,
  86     TCG_REG_XMM13,
  87     TCG_REG_XMM14,
  88     TCG_REG_XMM15,
  89 #endif
  90 #endif
  91 };
  92
  93 static const int tcg_target_call_iarg_regs[] = {
  94 #if TCG_TARGET_REG_BITS == 64
  95 #if defined(_WIN64)
  96     TCG_REG_RCX,
  97     TCG_REG_RDX,
  98 #else
  99     TCG_REG_RDI,
 100     TCG_REG_RSI,
 101     TCG_REG_RDX,
 102     TCG_REG_RCX,
 103 #endif
 104     TCG_REG_R8,
 105     TCG_REG_R9,
 106 #else
 107     /* 32 bit mode uses stack based calling convention (GCC default). */
 108 #endif
 109 };
 110
 111 static const int tcg_target_call_oarg_regs[] = {
 112     TCG_REG_EAX,
 113 #if TCG_TARGET_REG_BITS == 32
 114     TCG_REG_EDX
 115 #endif
 116 };
 117
 118 /* Constants we accept.  */
 119 #define TCG_CT_CONST_S32 0x100
 120 #define TCG_CT_CONST_U32 0x200
 121 #define TCG_CT_CONST_I32 0x400
 122 #define TCG_CT_CONST_WSZ 0x800
 123
 124 /* Registers used with L constraint, which are the first argument
 125    registers on x86_64, and two random call clobbered registers on
 126    i386. */
 127 #if TCG_TARGET_REG_BITS == 64
 128 # define TCG_REG_L0 tcg_target_call_iarg_regs[0]
 129 # define TCG_REG_L1 tcg_target_call_iarg_regs[1]
 130 #else
 131 # define TCG_REG_L0 TCG_REG_EAX
 132 # define TCG_REG_L1 TCG_REG_EDX
 133 #endif
 134
 135 #define ALL_BYTEH_REGS         0x0000000fu
 136 #if TCG_TARGET_REG_BITS == 64
 137 # define ALL_GENERAL_REGS      0x0000ffffu
 138 # define ALL_VECTOR_REGS       0xffff0000u
 139 # define ALL_BYTEL_REGS        ALL_GENERAL_REGS
 140 #else
 141 # define ALL_GENERAL_REGS      0x000000ffu
 142 # define ALL_VECTOR_REGS       0x00ff0000u
 143 # define ALL_BYTEL_REGS        ALL_BYTEH_REGS
 144 #endif
 145 #ifdef CONFIG_SOFTMMU
 146 # define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
 147 #else
 148 # define SOFTMMU_RESERVE_REGS  0
 149 #endif
 150
 151 /* The host compiler should supply <cpuid.h> to enable runtime features
 152    detection, as we're not going to go so far as our own inline assembly.
 153    If not available, default values will be assumed.  */
 154 #if defined(CONFIG_CPUID_H)
 155 #include "qemu/cpuid.h"
 156 #endif
 157
 158 /* For 64-bit, we always know that CMOV is available.  */
 159 #if TCG_TARGET_REG_BITS == 64
 160 # define have_cmov 1
 161 #elif defined(CONFIG_CPUID_H)
 162 static bool have_cmov;
 163 #else
 164 # define have_cmov 0
 165 #endif
 166
 167 /* We need these symbols in tcg-target.h, and we can't properly conditionalize
 168    it there.  Therefore we always define the variable.  */
 169 bool have_bmi1;
 170 bool have_popcnt;
 171 bool have_avx1;
 172 bool have_avx2;
 173 bool have_movbe;
 174
 175 #ifdef CONFIG_CPUID_H
 176 static bool have_bmi2;
 177 static bool have_lzcnt;
 178 #else
 179 # define have_bmi2 0
 180 # define have_lzcnt 0
 181 #endif
 182
 183 static const tcg_insn_unit *tb_ret_addr;
 184
 185 static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
 186                         intptr_t value, intptr_t addend)
 187 {
 188     value += addend;
 189     switch(type) {
 190     case R_386_PC32:
 191         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 192         if (value != (int32_t)value) {
 193             return false;
 194         }
 195         /* FALLTHRU */
 196     case R_386_32:
 197         tcg_patch32(code_ptr, value);
 198         break;
 199     case R_386_PC8:
 200         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 201         if (value != (int8_t)value) {
 202             return false;
 203         }
 204         tcg_patch8(code_ptr, value);
 205         break;
 206     default:
 207         tcg_abort();
 208     }
 209     return true;
 210 }
 211
 212 /* test if a constant matches the constraint */
 213 static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 214 {
 215     if (ct & TCG_CT_CONST) {
 216         return 1;
 217     }
 218     if (type == TCG_TYPE_I32) {
 219         if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
 220             return 1;
 221         }
 222     } else {
 223         if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
 224             return 1;
 225         }
 226         if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
 227             return 1;
 228         }
 229         if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
 230             return 1;
 231         }
 232     }
 233     if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
 234         return 1;
 235     }
 236     return 0;
 237 }
 238
 239 # define LOWREGMASK(x)  ((x) & 7)
 240
 241 #define P_EXT           0x100           /* 0x0f opcode prefix */
 242 #define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
 243 #define P_DATA16        0x400           /* 0x66 opcode prefix */
 244 #if TCG_TARGET_REG_BITS == 64
 245 # define P_REXW         0x1000          /* Set REX.W = 1 */
 246 # define P_REXB_R       0x2000          /* REG field as byte register */
 247 # define P_REXB_RM      0x4000          /* R/M field as byte register */
 248 # define P_GS           0x8000          /* gs segment override */
 249 #else
 250 # define P_REXW         0
 251 # define P_REXB_R       0
 252 # define P_REXB_RM      0
 253 # define P_GS           0
 254 #endif
 255 #define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
 256 #define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
 257 #define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
 258 #define P_VEXL          0x80000         /* Set VEX.L = 1 */
 259
 260 #define OPC_ARITH_EvIz  (0x81)
 261 #define OPC_ARITH_EvIb  (0x83)
 262 #define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
 263 #define OPC_ANDN        (0xf2 | P_EXT38)
 264 #define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
 265 #define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
 266 #define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
 267 #define OPC_BSF         (0xbc | P_EXT)
 268 #define OPC_BSR         (0xbd | P_EXT)
 269 #define OPC_BSWAP       (0xc8 | P_EXT)
 270 #define OPC_CALL_Jz     (0xe8)
 271 #define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
 272 #define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
 273 #define OPC_DEC_r32     (0x48)
 274 #define OPC_IMUL_GvEv   (0xaf | P_EXT)
 275 #define OPC_IMUL_GvEvIb (0x6b)
 276 #define OPC_IMUL_GvEvIz (0x69)
 277 #define OPC_INC_r32     (0x40)
 278 #define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
 279 #define OPC_JCC_short   (0x70)          /* ... plus condition code */
 280 #define OPC_JMP_long    (0xe9)
 281 #define OPC_JMP_short   (0xeb)
 282 #define OPC_LEA         (0x8d)
 283 #define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
 284 #define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
 285 #define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
 286 #define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
 287 #define OPC_MOVB_EvIz   (0xc6)
 288 #define OPC_MOVL_EvIz   (0xc7)
 289 #define OPC_MOVL_Iv     (0xb8)
 290 #define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
 291 #define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
 292 #define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
 293 #define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
 294 #define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
 295 #define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
 296 #define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
 297 #define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
 298 #define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
 299 #define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
 300 #define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
 301 #define OPC_MOVSBL      (0xbe | P_EXT)
 302 #define OPC_MOVSWL      (0xbf | P_EXT)
 303 #define OPC_MOVSLQ      (0x63 | P_REXW)
 304 #define OPC_MOVZBL      (0xb6 | P_EXT)
 305 #define OPC_MOVZWL      (0xb7 | P_EXT)
 306 #define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
 307 #define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
 308 #define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
 309 #define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
 310 #define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
 311 #define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
 312 #define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
 313 #define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
 314 #define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
 315 #define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
 316 #define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
 317 #define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
 318 #define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
 319 #define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
 320 #define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
 321 #define OPC_PAND        (0xdb | P_EXT | P_DATA16)
 322 #define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
 323 #define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
 324 #define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
 325 #define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
 326 #define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
 327 #define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
 328 #define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
 329 #define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
 330 #define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
 331 #define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
 332 #define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
 333 #define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
 334 #define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
 335 #define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
 336 #define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
 337 #define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
 338 #define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
 339 #define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
 340 #define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
 341 #define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
 342 #define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
 343 #define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
 344 #define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
 345 #define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
 346 #define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
 347 #define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
 348 #define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
 349 #define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
 350 #define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
 351 #define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
 352 #define OPC_POR         (0xeb | P_EXT | P_DATA16)
 353 #define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
 354 #define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
 355 #define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
 356 #define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
 357 #define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
 358 #define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
 359 #define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
 360 #define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
 361 #define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
 362 #define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
 363 #define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
 364 #define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
 365 #define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
 366 #define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
 367 #define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
 368 #define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
 369 #define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
 370 #define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
 371 #define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
 372 #define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
 373 #define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
 374 #define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
 375 #define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
 376 #define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
 377 #define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
 378 #define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
 379 #define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
 380 #define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
 381 #define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
 382 #define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
 383 #define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
 384 #define OPC_PXOR        (0xef | P_EXT | P_DATA16)
 385 #define OPC_POP_r32     (0x58)
 386 #define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
 387 #define OPC_PUSH_r32    (0x50)
 388 #define OPC_PUSH_Iv     (0x68)
 389 #define OPC_PUSH_Ib     (0x6a)
 390 #define OPC_RET         (0xc3)
 391 #define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
 392 #define OPC_SHIFT_1     (0xd1)
 393 #define OPC_SHIFT_Ib    (0xc1)
 394 #define OPC_SHIFT_cl    (0xd3)
 395 #define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
 396 #define OPC_SHUFPS      (0xc6 | P_EXT)
 397 #define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
 398 #define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
 399 #define OPC_SHRD_Ib     (0xac | P_EXT)
 400 #define OPC_TESTL       (0x85)
 401 #define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
 402 #define OPC_UD2         (0x0b | P_EXT)
 403 #define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
 404 #define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
 405 #define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
 406 #define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
 407 #define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
 408 #define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
 409 #define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
 410 #define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
 411 #define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
 412 #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
 413 #define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_REXW)
 414 #define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
 415 #define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
 416 #define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_REXW)
 417 #define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
 418 #define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
 419 #define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_REXW)
 420 #define OPC_VZEROUPPER  (0x77 | P_EXT)
 421 #define OPC_XCHG_ax_r32 (0x90)
 422
 423 #define OPC_GRP3_Ev     (0xf7)
 424 #define OPC_GRP5        (0xff)
 425 #define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
 426
 427 /* Group 1 opcode extensions for 0x80-0x83.
 428    These are also used as modifiers for OPC_ARITH.  */
 429 #define ARITH_ADD 0
 430 #define ARITH_OR  1
 431 #define ARITH_ADC 2
 432 #define ARITH_SBB 3
 433 #define ARITH_AND 4
 434 #define ARITH_SUB 5
 435 #define ARITH_XOR 6
 436 #define ARITH_CMP 7
 437
 438 /* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
 439 #define SHIFT_ROL 0
 440 #define SHIFT_ROR 1
 441 #define SHIFT_SHL 4
 442 #define SHIFT_SHR 5
 443 #define SHIFT_SAR 7
 444
 445 /* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
 446 #define EXT3_NOT   2
 447 #define EXT3_NEG   3
 448 #define EXT3_MUL   4
 449 #define EXT3_IMUL  5
 450 #define EXT3_DIV   6
 451 #define EXT3_IDIV  7
 452
 453 /* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
 454 #define EXT5_INC_Ev     0
 455 #define EXT5_DEC_Ev     1
 456 #define EXT5_CALLN_Ev   2
 457 #define EXT5_JMPN_Ev    4
 458
 459 /* Condition codes to be added to OPC_JCC_{long,short}.  */
 460 #define JCC_JMP (-1)
 461 #define JCC_JO  0x0
 462 #define JCC_JNO 0x1
 463 #define JCC_JB  0x2
 464 #define JCC_JAE 0x3
 465 #define JCC_JE  0x4
 466 #define JCC_JNE 0x5
 467 #define JCC_JBE 0x6
 468 #define JCC_JA  0x7
 469 #define JCC_JS  0x8
 470 #define JCC_JNS 0x9
 471 #define JCC_JP  0xa
 472 #define JCC_JNP 0xb
 473 #define JCC_JL  0xc
 474 #define JCC_JGE 0xd
 475 #define JCC_JLE 0xe
 476 #define JCC_JG  0xf
 477
 478 static const uint8_t tcg_cond_to_jcc[] = {
 479     [TCG_COND_EQ] = JCC_JE,
 480     [TCG_COND_NE] = JCC_JNE,
 481     [TCG_COND_LT] = JCC_JL,
 482     [TCG_COND_GE] = JCC_JGE,
 483     [TCG_COND_LE] = JCC_JLE,
 484     [TCG_COND_GT] = JCC_JG,
 485     [TCG_COND_LTU] = JCC_JB,
 486     [TCG_COND_GEU] = JCC_JAE,
 487     [TCG_COND_LEU] = JCC_JBE,
 488     [TCG_COND_GTU] = JCC_JA,
 489 };
 490
 491 #if TCG_TARGET_REG_BITS == 64
 492 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 493 {
 494     int rex;
 495
 496     if (opc & P_GS) {
 497         tcg_out8(s, 0x65);
 498     }
 499     if (opc & P_DATA16) {
 500         /* We should never be asking for both 16 and 64-bit operation.  */
 501         tcg_debug_assert((opc & P_REXW) == 0);
 502         tcg_out8(s, 0x66);
 503     }
 504     if (opc & P_SIMDF3) {
 505         tcg_out8(s, 0xf3);
 506     } else if (opc & P_SIMDF2) {
 507         tcg_out8(s, 0xf2);
 508     }
 509
 510     rex = 0;
 511     rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
 512     rex |= (r & 8) >> 1;                /* REX.R */
 513     rex |= (x & 8) >> 2;                /* REX.X */
 514     rex |= (rm & 8) >> 3;               /* REX.B */
 515
 516     /* P_REXB_{R,RM} indicates that the given register is the low byte.
 517        For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
 518        as otherwise the encoding indicates %[abcd]h.  Note that the values
 519        that are ORed in merely indicate that the REX byte must be present;
 520        those bits get discarded in output.  */
 521     rex |= opc & (r >= 4 ? P_REXB_R : 0);
 522     rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
 523
 524     if (rex) {
 525         tcg_out8(s, (uint8_t)(rex | 0x40));
 526     }
 527
 528     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 529         tcg_out8(s, 0x0f);
 530         if (opc & P_EXT38) {
 531             tcg_out8(s, 0x38);
 532         } else if (opc & P_EXT3A) {
 533             tcg_out8(s, 0x3a);
 534         }
 535     }
 536
 537     tcg_out8(s, opc);
 538 }
 539 #else
 540 static void tcg_out_opc(TCGContext *s, int opc)
 541 {
 542     if (opc & P_DATA16) {
 543         tcg_out8(s, 0x66);
 544     }
 545     if (opc & P_SIMDF3) {
 546         tcg_out8(s, 0xf3);
 547     } else if (opc & P_SIMDF2) {
 548         tcg_out8(s, 0xf2);
 549     }
 550     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 551         tcg_out8(s, 0x0f);
 552         if (opc & P_EXT38) {
 553             tcg_out8(s, 0x38);
 554         } else if (opc & P_EXT3A) {
 555             tcg_out8(s, 0x3a);
 556         }
 557     }
 558     tcg_out8(s, opc);
 559 }
 560 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
 561    the 32-bit compilation paths.  This method works with all versions of gcc,
 562    whereas relying on optimization may not be able to exclude them.  */
 563 #define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
 564 #endif
 565
 566 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
 567 {
 568     tcg_out_opc(s, opc, r, rm, 0);
 569     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 570 }
 571
 572 static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
 573                             int rm, int index)
 574 {
 575     int tmp;
 576
 577     /* Use the two byte form if possible, which cannot encode
 578        VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
 579     if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
 580         && ((rm | index) & 8) == 0) {
 581         /* Two byte VEX prefix.  */
 582         tcg_out8(s, 0xc5);
 583
 584         tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
 585     } else {
 586         /* Three byte VEX prefix.  */
 587         tcg_out8(s, 0xc4);
 588
 589         /* VEX.m-mmmm */
 590         if (opc & P_EXT3A) {
 591             tmp = 3;
 592         } else if (opc & P_EXT38) {
 593             tmp = 2;
 594         } else if (opc & P_EXT) {
 595             tmp = 1;
 596         } else {
 597             g_assert_not_reached();
 598         }
 599         tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
 600         tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
 601         tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
 602         tcg_out8(s, tmp);
 603
 604         tmp = (opc & P_REXW ? 0x80 : 0);       /* VEX.W */
 605     }
 606
 607     tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
 608     /* VEX.pp */
 609     if (opc & P_DATA16) {
 610         tmp |= 1;                          /* 0x66 */
 611     } else if (opc & P_SIMDF3) {
 612         tmp |= 2;                          /* 0xf3 */
 613     } else if (opc & P_SIMDF2) {
 614         tmp |= 3;                          /* 0xf2 */
 615     }
 616     tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
 617     tcg_out8(s, tmp);
 618     tcg_out8(s, opc);
 619 }
 620
 621 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
 622 {
 623     tcg_out_vex_opc(s, opc, r, v, rm, 0);
 624     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 625 }
 626
 627 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
 628    We handle either RM and INDEX missing with a negative value.  In 64-bit
 629    mode for absolute addresses, ~RM is the size of the immediate operand
 630    that will follow the instruction.  */
 631
 632 static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
 633                                int shift, intptr_t offset)
 634 {
 635     int mod, len;
 636
 637     if (index < 0 && rm < 0) {
 638         if (TCG_TARGET_REG_BITS == 64) {
 639             /* Try for a rip-relative addressing mode.  This has replaced
 640                the 32-bit-mode absolute addressing encoding.  */
 641             intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
 642             intptr_t disp = offset - pc;
 643             if (disp == (int32_t)disp) {
 644                 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
 645                 tcg_out32(s, disp);
 646                 return;
 647             }
 648
 649             /* Try for an absolute address encoding.  This requires the
 650                use of the MODRM+SIB encoding and is therefore larger than
 651                rip-relative addressing.  */
 652             if (offset == (int32_t)offset) {
 653                 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
 654                 tcg_out8(s, (4 << 3) | 5);
 655                 tcg_out32(s, offset);
 656                 return;
 657             }
 658
 659             /* ??? The memory isn't directly addressable.  */
 660             g_assert_not_reached();
 661         } else {
 662             /* Absolute address.  */
 663             tcg_out8(s, (r << 3) | 5);
 664             tcg_out32(s, offset);
 665             return;
 666         }
 667     }
 668
 669     /* Find the length of the immediate addend.  Note that the encoding
 670        that would be used for (%ebp) indicates absolute addressing.  */
 671     if (rm < 0) {
 672         mod = 0, len = 4, rm = 5;
 673     } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
 674         mod = 0, len = 0;
 675     } else if (offset == (int8_t)offset) {
 676         mod = 0x40, len = 1;
 677     } else {
 678         mod = 0x80, len = 4;
 679     }
 680
 681     /* Use a single byte MODRM format if possible.  Note that the encoding
 682        that would be used for %esp is the escape to the two byte form.  */
 683     if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
 684         /* Single byte MODRM format.  */
 685         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 686     } else {
 687         /* Two byte MODRM+SIB format.  */
 688
 689         /* Note that the encoding that would place %esp into the index
 690            field indicates no index register.  In 64-bit mode, the REX.X
 691            bit counts, so %r12 can be used as the index.  */
 692         if (index < 0) {
 693             index = 4;
 694         } else {
 695             tcg_debug_assert(index != TCG_REG_ESP);
 696         }
 697
 698         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
 699         tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
 700     }
 701
 702     if (len == 1) {
 703         tcg_out8(s, offset);
 704     } else if (len == 4) {
 705         tcg_out32(s, offset);
 706     }
 707 }
 708
 709 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
 710                                      int index, int shift, intptr_t offset)
 711 {
 712     tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 713     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 714 }
 715
 716 static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
 717                                          int rm, int index, int shift,
 718                                          intptr_t offset)
 719 {
 720     tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 721     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 722 }
 723
 724 /* A simplification of the above with no index or shift.  */
 725 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
 726                                         int rm, intptr_t offset)
 727 {
 728     tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
 729 }
 730
 731 static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
 732                                             int v, int rm, intptr_t offset)
 733 {
 734     tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
 735 }
 736
 737 /* Output an opcode with an expected reference to the constant pool.  */
 738 static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
 739 {
 740     tcg_out_opc(s, opc, r, 0, 0);
 741     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 742     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 743     tcg_out32(s, 0);
 744 }
 745
 746 /* Output an opcode with an expected reference to the constant pool.  */
 747 static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
 748 {
 749     tcg_out_vex_opc(s, opc, r, 0, 0, 0);
 750     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 751     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 752     tcg_out32(s, 0);
 753 }
 754
 755 /* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
 756 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
 757 {
 758     /* Propagate an opcode prefix, such as P_REXW.  */
 759     int ext = subop & ~0x7;
 760     subop &= 0x7;
 761
 762     tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
 763 }
 764
 765 static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 766 {
 767     int rexw = 0;
 768
 769     if (arg == ret) {
 770         return true;
 771     }
 772     switch (type) {
 773     case TCG_TYPE_I64:
 774         rexw = P_REXW;
 775         /* fallthru */
 776     case TCG_TYPE_I32:
 777         if (ret < 16) {
 778             if (arg < 16) {
 779                 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
 780             } else {
 781                 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
 782             }
 783         } else {
 784             if (arg < 16) {
 785                 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
 786             } else {
 787                 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 788             }
 789         }
 790         break;
 791
 792     case TCG_TYPE_V64:
 793         tcg_debug_assert(ret >= 16 && arg >= 16);
 794         tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 795         break;
 796     case TCG_TYPE_V128:
 797         tcg_debug_assert(ret >= 16 && arg >= 16);
 798         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
 799         break;
 800     case TCG_TYPE_V256:
 801         tcg_debug_assert(ret >= 16 && arg >= 16);
 802         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
 803         break;
 804
 805     default:
 806         g_assert_not_reached();
 807     }
 808     return true;
 809 }
 810
 811 static const int avx2_dup_insn[4] = {
 812     OPC_VPBROADCASTB, OPC_VPBROADCASTW,
 813     OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
 814 };
 815
 816 static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
 817                             TCGReg r, TCGReg a)
 818 {
 819     if (have_avx2) {
 820         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 821         tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
 822     } else {
 823         switch (vece) {
 824         case MO_8:
 825             /* ??? With zero in a register, use PSHUFB.  */
 826             tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
 827             a = r;
 828             /* FALLTHRU */
 829         case MO_16:
 830             tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
 831             a = r;
 832             /* FALLTHRU */
 833         case MO_32:
 834             tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
 835             /* imm8 operand: all output lanes selected from input lane 0.  */
 836             tcg_out8(s, 0);
 837             break;
 838         case MO_64:
 839             tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
 840             break;
 841         default:
 842             g_assert_not_reached();
 843         }
 844     }
 845     return true;
 846 }
 847
 848 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
 849                              TCGReg r, TCGReg base, intptr_t offset)
 850 {
 851     if (have_avx2) {
 852         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 853         tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
 854                                  r, 0, base, offset);
 855     } else {
 856         switch (vece) {
 857         case MO_64:
 858             tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
 859             break;
 860         case MO_32:
 861             tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
 862             break;
 863         case MO_16:
 864             tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
 865             tcg_out8(s, 0); /* imm8 */
 866             tcg_out_dup_vec(s, type, vece, r, r);
 867             break;
 868         case MO_8:
 869             tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
 870             tcg_out8(s, 0); /* imm8 */
 871             tcg_out_dup_vec(s, type, vece, r, r);
 872             break;
 873         default:
 874             g_assert_not_reached();
 875         }
 876     }
 877     return true;
 878 }
 879
 880 static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
 881                              TCGReg ret, int64_t arg)
 882 {
 883     int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 884
 885     if (arg == 0) {
 886         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 887         return;
 888     }
 889     if (arg == -1) {
 890         tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
 891         return;
 892     }
 893
 894     if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
 895         if (have_avx2) {
 896             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
 897         } else {
 898             tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
 899         }
 900         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 901     } else {
 902         if (type == TCG_TYPE_V64) {
 903             tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
 904         } else if (have_avx2) {
 905             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
 906         } else {
 907             tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
 908         }
 909         if (TCG_TARGET_REG_BITS == 64) {
 910             new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
 911         } else {
 912             new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
 913         }
 914     }
 915 }
 916
 917 static void tcg_out_movi_vec(TCGContext *s, TCGType type,
 918                              TCGReg ret, tcg_target_long arg)
 919 {
 920     if (arg == 0) {
 921         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 922         return;
 923     }
 924     if (arg == -1) {
 925         tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
 926         return;
 927     }
 928
 929     int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
 930     tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
 931     if (TCG_TARGET_REG_BITS == 64) {
 932         new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
 933     } else {
 934         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 935     }
 936 }
 937
 938 static void tcg_out_movi_int(TCGContext *s, TCGType type,
 939                              TCGReg ret, tcg_target_long arg)
 940 {
 941     tcg_target_long diff;
 942
 943     if (arg == 0) {
 944         tgen_arithr(s, ARITH_XOR, ret, ret);
 945         return;
 946     }
 947     if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
 948         tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
 949         tcg_out32(s, arg);
 950         return;
 951     }
 952     if (arg == (int32_t)arg) {
 953         tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
 954         tcg_out32(s, arg);
 955         return;
 956     }
 957
 958     /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
 959     diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
 960     if (diff == (int32_t)diff) {
 961         tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
 962         tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
 963         tcg_out32(s, diff);
 964         return;
 965     }
 966
 967     tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
 968     tcg_out64(s, arg);
 969 }
 970
 971 static void tcg_out_movi(TCGContext *s, TCGType type,
 972                          TCGReg ret, tcg_target_long arg)
 973 {
 974     switch (type) {
 975     case TCG_TYPE_I32:
 976 #if TCG_TARGET_REG_BITS == 64
 977     case TCG_TYPE_I64:
 978 #endif
 979         if (ret < 16) {
 980             tcg_out_movi_int(s, type, ret, arg);
 981         } else {
 982             tcg_out_movi_vec(s, type, ret, arg);
 983         }
 984         break;
 985     default:
 986         g_assert_not_reached();
 987     }
 988 }
 989
 990 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
 991 {
 992     if (val == (int8_t)val) {
 993         tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
 994         tcg_out8(s, val);
 995     } else if (val == (int32_t)val) {
 996         tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
 997         tcg_out32(s, val);
 998     } else {
 999         tcg_abort();
1000     }
1001 }
1002
1003 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1004 {
1005     /* Given the strength of x86 memory ordering, we only need care for
1006        store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1007        faster than "mfence", so don't bother with the sse insn.  */
1008     if (a0 & TCG_MO_ST_LD) {
1009         tcg_out8(s, 0xf0);
1010         tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1011         tcg_out8(s, 0);
1012     }
1013 }
1014
1015 static inline void tcg_out_push(TCGContext *s, int reg)
1016 {
1017     tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1018 }
1019
1020 static inline void tcg_out_pop(TCGContext *s, int reg)
1021 {
1022     tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1023 }
1024
1025 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1026                        TCGReg arg1, intptr_t arg2)
1027 {
1028     switch (type) {
1029     case TCG_TYPE_I32:
1030         if (ret < 16) {
1031             tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1032         } else {
1033             tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1034         }
1035         break;
1036     case TCG_TYPE_I64:
1037         if (ret < 16) {
1038             tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1039             break;
1040         }
1041         /* FALLTHRU */
1042     case TCG_TYPE_V64:
1043         /* There is no instruction that can validate 8-byte alignment.  */
1044         tcg_debug_assert(ret >= 16);
1045         tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1046         break;
1047     case TCG_TYPE_V128:
1048         /*
1049          * The gvec infrastructure is asserts that v128 vector loads
1050          * and stores use a 16-byte aligned offset.  Validate that the
1051          * final pointer is aligned by using an insn that will SIGSEGV.
1052          */
1053         tcg_debug_assert(ret >= 16);
1054         tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1055         break;
1056     case TCG_TYPE_V256:
1057         /*
1058          * The gvec infrastructure only requires 16-byte alignment,
1059          * so here we must use an unaligned load.
1060          */
1061         tcg_debug_assert(ret >= 16);
1062         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1063                                  ret, 0, arg1, arg2);
1064         break;
1065     default:
1066         g_assert_not_reached();
1067     }
1068 }
1069
1070 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1071                        TCGReg arg1, intptr_t arg2)
1072 {
1073     switch (type) {
1074     case TCG_TYPE_I32:
1075         if (arg < 16) {
1076             tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1077         } else {
1078             tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1079         }
1080         break;
1081     case TCG_TYPE_I64:
1082         if (arg < 16) {
1083             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1084             break;
1085         }
1086         /* FALLTHRU */
1087     case TCG_TYPE_V64:
1088         /* There is no instruction that can validate 8-byte alignment.  */
1089         tcg_debug_assert(arg >= 16);
1090         tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1091         break;
1092     case TCG_TYPE_V128:
1093         /*
1094          * The gvec infrastructure is asserts that v128 vector loads
1095          * and stores use a 16-byte aligned offset.  Validate that the
1096          * final pointer is aligned by using an insn that will SIGSEGV.
1097          */
1098         tcg_debug_assert(arg >= 16);
1099         tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1100         break;
1101     case TCG_TYPE_V256:
1102         /*
1103          * The gvec infrastructure only requires 16-byte alignment,
1104          * so here we must use an unaligned store.
1105          */
1106         tcg_debug_assert(arg >= 16);
1107         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1108                                  arg, 0, arg1, arg2);
1109         break;
1110     default:
1111         g_assert_not_reached();
1112     }
1113 }
1114
1115 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1116                         TCGReg base, intptr_t ofs)
1117 {
1118     int rexw = 0;
1119     if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1120         if (val != (int32_t)val) {
1121             return false;
1122         }
1123         rexw = P_REXW;
1124     } else if (type != TCG_TYPE_I32) {
1125         return false;
1126     }
1127     tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1128     tcg_out32(s, val);
1129     return true;
1130 }
1131
1132 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1133 {
1134     /* Propagate an opcode prefix, such as P_DATA16.  */
1135     int ext = subopc & ~0x7;
1136     subopc &= 0x7;
1137
1138     if (count == 1) {
1139         tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1140     } else {
1141         tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1142         tcg_out8(s, count);
1143     }
1144 }
1145
1146 static inline void tcg_out_bswap32(TCGContext *s, int reg)
1147 {
1148     tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1149 }
1150
1151 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1152 {
1153     tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1154 }
1155
1156 static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
1157 {
1158     /* movzbl */
1159     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1160     tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1161 }
1162
1163 static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
1164 {
1165     /* movsbl */
1166     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1167     tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1168 }
1169
1170 static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
1171 {
1172     /* movzwl */
1173     tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1174 }
1175
1176 static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
1177 {
1178     /* movsw[lq] */
1179     tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1180 }
1181
1182 static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
1183 {
1184     /* 32-bit mov zero extends.  */
1185     tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1186 }
1187
1188 static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
1189 {
1190     tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1191 }
1192
1193 static inline void tcg_out_bswap64(TCGContext *s, int reg)
1194 {
1195     tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1196 }
1197
1198 static void tgen_arithi(TCGContext *s, int c, int r0,
1199                         tcg_target_long val, int cf)
1200 {
1201     int rexw = 0;
1202
1203     if (TCG_TARGET_REG_BITS == 64) {
1204         rexw = c & -8;
1205         c &= 7;
1206     }
1207
1208     /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1209        partial flags update stalls on Pentium4 and are not recommended
1210        by current Intel optimization manuals.  */
1211     if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1212         int is_inc = (c == ARITH_ADD) ^ (val < 0);
1213         if (TCG_TARGET_REG_BITS == 64) {
1214             /* The single-byte increment encodings are re-tasked as the
1215                REX prefixes.  Use the MODRM encoding.  */
1216             tcg_out_modrm(s, OPC_GRP5 + rexw,
1217                           (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1218         } else {
1219             tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1220         }
1221         return;
1222     }
1223
1224     if (c == ARITH_AND) {
1225         if (TCG_TARGET_REG_BITS == 64) {
1226             if (val == 0xffffffffu) {
1227                 tcg_out_ext32u(s, r0, r0);
1228                 return;
1229             }
1230             if (val == (uint32_t)val) {
1231                 /* AND with no high bits set can use a 32-bit operation.  */
1232                 rexw = 0;
1233             }
1234         }
1235         if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1236             tcg_out_ext8u(s, r0, r0);
1237             return;
1238         }
1239         if (val == 0xffffu) {
1240             tcg_out_ext16u(s, r0, r0);
1241             return;
1242         }
1243     }
1244
1245     if (val == (int8_t)val) {
1246         tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1247         tcg_out8(s, val);
1248         return;
1249     }
1250     if (rexw == 0 || val == (int32_t)val) {
1251         tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1252         tcg_out32(s, val);
1253         return;
1254     }
1255
1256     tcg_abort();
1257 }
1258
1259 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1260 {
1261     if (val != 0) {
1262         tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1263     }
1264 }
1265
1266 /* Use SMALL != 0 to force a short forward branch.  */
1267 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
1268 {
1269     int32_t val, val1;
1270
1271     if (l->has_value) {
1272         val = tcg_pcrel_diff(s, l->u.value_ptr);
1273         val1 = val - 2;
1274         if ((int8_t)val1 == val1) {
1275             if (opc == -1) {
1276                 tcg_out8(s, OPC_JMP_short);
1277             } else {
1278                 tcg_out8(s, OPC_JCC_short + opc);
1279             }
1280             tcg_out8(s, val1);
1281         } else {
1282             if (small) {
1283                 tcg_abort();
1284             }
1285             if (opc == -1) {
1286                 tcg_out8(s, OPC_JMP_long);
1287                 tcg_out32(s, val - 5);
1288             } else {
1289                 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1290                 tcg_out32(s, val - 6);
1291             }
1292         }
1293     } else if (small) {
1294         if (opc == -1) {
1295             tcg_out8(s, OPC_JMP_short);
1296         } else {
1297             tcg_out8(s, OPC_JCC_short + opc);
1298         }
1299         tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1300         s->code_ptr += 1;
1301     } else {
1302         if (opc == -1) {
1303             tcg_out8(s, OPC_JMP_long);
1304         } else {
1305             tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1306         }
1307         tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1308         s->code_ptr += 4;
1309     }
1310 }
1311
1312 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1313                         int const_arg2, int rexw)
1314 {
1315     if (const_arg2) {
1316         if (arg2 == 0) {
1317             /* test r, r */
1318             tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1319         } else {
1320             tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1321         }
1322     } else {
1323         tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1324     }
1325 }
1326
1327 static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1328                              TCGArg arg1, TCGArg arg2, int const_arg2,
1329                              TCGLabel *label, int small)
1330 {
1331     tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1332     tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1333 }
1334
1335 #if TCG_TARGET_REG_BITS == 64
1336 static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1337                              TCGArg arg1, TCGArg arg2, int const_arg2,
1338                              TCGLabel *label, int small)
1339 {
1340     tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1341     tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1342 }
1343 #else
1344 /* XXX: we implement it at the target level to avoid having to
1345    handle cross basic blocks temporaries */
1346 static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1347                             const int *const_args, int small)
1348 {
1349     TCGLabel *label_next = gen_new_label();
1350     TCGLabel *label_this = arg_label(args[5]);
1351
1352     switch(args[4]) {
1353     case TCG_COND_EQ:
1354         tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1355                          label_next, 1);
1356         tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1357                          label_this, small);
1358         break;
1359     case TCG_COND_NE:
1360         tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1361                          label_this, small);
1362         tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1363                          label_this, small);
1364         break;
1365     case TCG_COND_LT:
1366         tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1367                          label_this, small);
1368         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1369         tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1370                          label_this, small);
1371         break;
1372     case TCG_COND_LE:
1373         tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1374                          label_this, small);
1375         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1376         tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1377                          label_this, small);
1378         break;
1379     case TCG_COND_GT:
1380         tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1381                          label_this, small);
1382         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1383         tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1384                          label_this, small);
1385         break;
1386     case TCG_COND_GE:
1387         tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1388                          label_this, small);
1389         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1390         tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1391                          label_this, small);
1392         break;
1393     case TCG_COND_LTU:
1394         tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1395                          label_this, small);
1396         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1397         tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1398                          label_this, small);
1399         break;
1400     case TCG_COND_LEU:
1401         tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1402                          label_this, small);
1403         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1404         tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1405                          label_this, small);
1406         break;
1407     case TCG_COND_GTU:
1408         tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1409                          label_this, small);
1410         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1411         tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1412                          label_this, small);
1413         break;
1414     case TCG_COND_GEU:
1415         tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1416                          label_this, small);
1417         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1418         tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1419                          label_this, small);
1420         break;
1421     default:
1422         tcg_abort();
1423     }
1424     tcg_out_label(s, label_next);
1425 }
1426 #endif
1427
1428 static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1429                               TCGArg arg1, TCGArg arg2, int const_arg2)
1430 {
1431     tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1432     tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1433     tcg_out_ext8u(s, dest, dest);
1434 }
1435
1436 #if TCG_TARGET_REG_BITS == 64
1437 static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1438                               TCGArg arg1, TCGArg arg2, int const_arg2)
1439 {
1440     tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1441     tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1442     tcg_out_ext8u(s, dest, dest);
1443 }
1444 #else
1445 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1446                              const int *const_args)
1447 {
1448     TCGArg new_args[6];
1449     TCGLabel *label_true, *label_over;
1450
1451     memcpy(new_args, args+1, 5*sizeof(TCGArg));
1452
1453     if (args[0] == args[1] || args[0] == args[2]
1454         || (!const_args[3] && args[0] == args[3])
1455         || (!const_args[4] && args[0] == args[4])) {
1456         /* When the destination overlaps with one of the argument
1457            registers, don't do anything tricky.  */
1458         label_true = gen_new_label();
1459         label_over = gen_new_label();
1460
1461         new_args[5] = label_arg(label_true);
1462         tcg_out_brcond2(s, new_args, const_args+1, 1);
1463
1464         tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1465         tcg_out_jxx(s, JCC_JMP, label_over, 1);
1466         tcg_out_label(s, label_true);
1467
1468         tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1469         tcg_out_label(s, label_over);
1470     } else {
1471         /* When the destination does not overlap one of the arguments,
1472            clear the destination first, jump if cond false, and emit an
1473            increment in the true case.  This results in smaller code.  */
1474
1475         tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1476
1477         label_over = gen_new_label();
1478         new_args[4] = tcg_invert_cond(new_args[4]);
1479         new_args[5] = label_arg(label_over);
1480         tcg_out_brcond2(s, new_args, const_args+1, 1);
1481
1482         tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1483         tcg_out_label(s, label_over);
1484     }
1485 }
1486 #endif
1487
1488 static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1489                          TCGReg dest, TCGReg v1)
1490 {
1491     if (have_cmov) {
1492         tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1493     } else {
1494         TCGLabel *over = gen_new_label();
1495         tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1496         tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1497         tcg_out_label(s, over);
1498     }
1499 }
1500
1501 static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1502                               TCGReg c1, TCGArg c2, int const_c2,
1503                               TCGReg v1)
1504 {
1505     tcg_out_cmp(s, c1, c2, const_c2, 0);
1506     tcg_out_cmov(s, cond, 0, dest, v1);
1507 }
1508
1509 #if TCG_TARGET_REG_BITS == 64
1510 static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1511                               TCGReg c1, TCGArg c2, int const_c2,
1512                               TCGReg v1)
1513 {
1514     tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1515     tcg_out_cmov(s, cond, P_REXW, dest, v1);
1516 }
1517 #endif
1518
1519 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1520                         TCGArg arg2, bool const_a2)
1521 {
1522     if (have_bmi1) {
1523         tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1524         if (const_a2) {
1525             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1526         } else {
1527             tcg_debug_assert(dest != arg2);
1528             tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1529         }
1530     } else {
1531         tcg_debug_assert(dest != arg2);
1532         tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1533         tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1534     }
1535 }
1536
1537 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1538                         TCGArg arg2, bool const_a2)
1539 {
1540     if (have_lzcnt) {
1541         tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1542         if (const_a2) {
1543             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1544         } else {
1545             tcg_debug_assert(dest != arg2);
1546             tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1547         }
1548     } else {
1549         tcg_debug_assert(!const_a2);
1550         tcg_debug_assert(dest != arg1);
1551         tcg_debug_assert(dest != arg2);
1552
1553         /* Recall that the output of BSR is the index not the count.  */
1554         tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1555         tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1556
1557         /* Since we have destroyed the flags from BSR, we have to re-test.  */
1558         tcg_out_cmp(s, arg1, 0, 1, rexw);
1559         tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1560     }
1561 }
1562
1563 static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1564 {
1565     intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1566
1567     if (disp == (int32_t)disp) {
1568         tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1569         tcg_out32(s, disp);
1570     } else {
1571         /* rip-relative addressing into the constant pool.
1572            This is 6 + 8 = 14 bytes, as compared to using an
1573            an immediate load 10 + 6 = 16 bytes, plus we may
1574            be able to re-use the pool constant for more calls.  */
1575         tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1576         tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1577         new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1578         tcg_out32(s, 0);
1579     }
1580 }
1581
1582 static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest)
1583 {
1584     tcg_out_branch(s, 1, dest);
1585 }
1586
1587 static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1588 {
1589     tcg_out_branch(s, 0, dest);
1590 }
1591
1592 static void tcg_out_nopn(TCGContext *s, int n)
1593 {
1594     int i;
1595     /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1596      * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1597      * duplicate prefix, and all of the interesting recent cores can
1598      * decode and discard the duplicates in a single cycle.
1599      */
1600     tcg_debug_assert(n >= 1);
1601     for (i = 1; i < n; ++i) {
1602         tcg_out8(s, 0x66);
1603     }
1604     tcg_out8(s, 0x90);
1605 }
1606
1607 #if defined(CONFIG_SOFTMMU)
1608 #include "../tcg-ldst.c.inc"
1609
1610 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1611  *                                     int mmu_idx, uintptr_t ra)
1612  */
1613 static void * const qemu_ld_helpers[16] = {
1614     [MO_UB]   = helper_ret_ldub_mmu,
1615     [MO_LEUW] = helper_le_lduw_mmu,
1616     [MO_LEUL] = helper_le_ldul_mmu,
1617     [MO_LEQ]  = helper_le_ldq_mmu,
1618     [MO_BEUW] = helper_be_lduw_mmu,
1619     [MO_BEUL] = helper_be_ldul_mmu,
1620     [MO_BEQ]  = helper_be_ldq_mmu,
1621 };
1622
1623 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1624  *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
1625  */
1626 static void * const qemu_st_helpers[16] = {
1627     [MO_UB]   = helper_ret_stb_mmu,
1628     [MO_LEUW] = helper_le_stw_mmu,
1629     [MO_LEUL] = helper_le_stl_mmu,
1630     [MO_LEQ]  = helper_le_stq_mmu,
1631     [MO_BEUW] = helper_be_stw_mmu,
1632     [MO_BEUL] = helper_be_stl_mmu,
1633     [MO_BEQ]  = helper_be_stq_mmu,
1634 };
1635
1636 /* Perform the TLB load and compare.
1637
1638    Inputs:
1639    ADDRLO and ADDRHI contain the low and high part of the address.
1640
1641    MEM_INDEX and S_BITS are the memory context and log2 size of the load.
1642
1643    WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1644    This should be offsetof addr_read or addr_write.
1645
1646    Outputs:
1647    LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1648    positions of the displacements of forward jumps to the TLB miss case.
1649
1650    Second argument register is loaded with the low part of the address.
1651    In the TLB hit case, it has been adjusted as indicated by the TLB
1652    and so is a host address.  In the TLB miss case, it continues to
1653    hold a guest address.
1654
1655    First argument register is clobbered.  */
1656
1657 static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1658                                     int mem_index, MemOp opc,
1659                                     tcg_insn_unit **label_ptr, int which)
1660 {
1661     const TCGReg r0 = TCG_REG_L0;
1662     const TCGReg r1 = TCG_REG_L1;
1663     TCGType ttype = TCG_TYPE_I32;
1664     TCGType tlbtype = TCG_TYPE_I32;
1665     int trexw = 0, hrexw = 0, tlbrexw = 0;
1666     unsigned a_bits = get_alignment_bits(opc);
1667     unsigned s_bits = opc & MO_SIZE;
1668     unsigned a_mask = (1 << a_bits) - 1;
1669     unsigned s_mask = (1 << s_bits) - 1;
1670     target_ulong tlb_mask;
1671
1672     if (TCG_TARGET_REG_BITS == 64) {
1673         if (TARGET_LONG_BITS == 64) {
1674             ttype = TCG_TYPE_I64;
1675             trexw = P_REXW;
1676         }
1677         if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1678             hrexw = P_REXW;
1679             if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
1680                 tlbtype = TCG_TYPE_I64;
1681                 tlbrexw = P_REXW;
1682             }
1683         }
1684     }
1685
1686     tcg_out_mov(s, tlbtype, r0, addrlo);
1687     tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1688                    TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1689
1690     tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
1691                          TLB_MASK_TABLE_OFS(mem_index) +
1692                          offsetof(CPUTLBDescFast, mask));
1693
1694     tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
1695                          TLB_MASK_TABLE_OFS(mem_index) +
1696                          offsetof(CPUTLBDescFast, table));
1697
1698     /* If the required alignment is at least as large as the access, simply
1699        copy the address and mask.  For lesser alignments, check that we don't
1700        cross pages for the complete access.  */
1701     if (a_bits >= s_bits) {
1702         tcg_out_mov(s, ttype, r1, addrlo);
1703     } else {
1704         tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1705     }
1706     tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1707     tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1708
1709     /* cmp 0(r0), r1 */
1710     tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);
1711
1712     /* Prepare for both the fast path add of the tlb addend, and the slow
1713        path function argument setup.  */
1714     tcg_out_mov(s, ttype, r1, addrlo);
1715
1716     /* jne slow_path */
1717     tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1718     label_ptr[0] = s->code_ptr;
1719     s->code_ptr += 4;
1720
1721     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1722         /* cmp 4(r0), addrhi */
1723         tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);
1724
1725         /* jne slow_path */
1726         tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1727         label_ptr[1] = s->code_ptr;
1728         s->code_ptr += 4;
1729     }
1730
1731     /* TLB Hit.  */
1732
1733     /* add addend(r0), r1 */
1734     tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1735                          offsetof(CPUTLBEntry, addend));
1736 }
1737
1738 /*
1739  * Record the context of a call to the out of line helper code for the slow path
1740  * for a load or store, so that we can later generate the correct helper code
1741  */
1742 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
1743                                 TCGMemOpIdx oi,
1744                                 TCGReg datalo, TCGReg datahi,
1745                                 TCGReg addrlo, TCGReg addrhi,
1746                                 tcg_insn_unit *raddr,
1747                                 tcg_insn_unit **label_ptr)
1748 {
1749     TCGLabelQemuLdst *label = new_ldst_label(s);
1750
1751     label->is_ld = is_ld;
1752     label->oi = oi;
1753     label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
1754     label->datalo_reg = datalo;
1755     label->datahi_reg = datahi;
1756     label->addrlo_reg = addrlo;
1757     label->addrhi_reg = addrhi;
1758     label->raddr = tcg_splitwx_to_rx(raddr);
1759     label->label_ptr[0] = label_ptr[0];
1760     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1761         label->label_ptr[1] = label_ptr[1];
1762     }
1763 }
1764
1765 /*
1766  * Generate code for the slow path for a load at the end of block
1767  */
1768 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1769 {
1770     TCGMemOpIdx oi = l->oi;
1771     MemOp opc = get_memop(oi);
1772     TCGReg data_reg;
1773     tcg_insn_unit **label_ptr = &l->label_ptr[0];
1774     int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
1775
1776     /* resolve label address */
1777     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1778     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1779         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1780     }
1781
1782     if (TCG_TARGET_REG_BITS == 32) {
1783         int ofs = 0;
1784
1785         tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1786         ofs += 4;
1787
1788         tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1789         ofs += 4;
1790
1791         if (TARGET_LONG_BITS == 64) {
1792             tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1793             ofs += 4;
1794         }
1795
1796         tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1797         ofs += 4;
1798
1799         tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1800     } else {
1801         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1802         /* The second argument is already loaded with addrlo.  */
1803         tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1804         tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1805                      (uintptr_t)l->raddr);
1806     }
1807
1808     tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1809
1810     data_reg = l->datalo_reg;
1811     switch (opc & MO_SSIZE) {
1812     case MO_SB:
1813         tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
1814         break;
1815     case MO_SW:
1816         tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
1817         break;
1818 #if TCG_TARGET_REG_BITS == 64
1819     case MO_SL:
1820         tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1821         break;
1822 #endif
1823     case MO_UB:
1824     case MO_UW:
1825         /* Note that the helpers have zero-extended to tcg_target_long.  */
1826     case MO_UL:
1827         tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1828         break;
1829     case MO_Q:
1830         if (TCG_TARGET_REG_BITS == 64) {
1831             tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1832         } else if (data_reg == TCG_REG_EDX) {
1833             /* xchg %edx, %eax */
1834             tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1835             tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1836         } else {
1837             tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1838             tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1839         }
1840         break;
1841     default:
1842         tcg_abort();
1843     }
1844
1845     /* Jump to the code corresponding to next IR of qemu_st */
1846     tcg_out_jmp(s, l->raddr);
1847     return true;
1848 }
1849
1850 /*
1851  * Generate code for the slow path for a store at the end of block
1852  */
1853 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1854 {
1855     TCGMemOpIdx oi = l->oi;
1856     MemOp opc = get_memop(oi);
1857     MemOp s_bits = opc & MO_SIZE;
1858     tcg_insn_unit **label_ptr = &l->label_ptr[0];
1859     TCGReg retaddr;
1860
1861     /* resolve label address */
1862     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1863     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1864         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1865     }
1866
1867     if (TCG_TARGET_REG_BITS == 32) {
1868         int ofs = 0;
1869
1870         tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1871         ofs += 4;
1872
1873         tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1874         ofs += 4;
1875
1876         if (TARGET_LONG_BITS == 64) {
1877             tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1878             ofs += 4;
1879         }
1880
1881         tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1882         ofs += 4;
1883
1884         if (s_bits == MO_64) {
1885             tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1886             ofs += 4;
1887         }
1888
1889         tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1890         ofs += 4;
1891
1892         retaddr = TCG_REG_EAX;
1893         tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1894         tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1895     } else {
1896         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1897         /* The second argument is already loaded with addrlo.  */
1898         tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1899                     tcg_target_call_iarg_regs[2], l->datalo_reg);
1900         tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1901
1902         if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1903             retaddr = tcg_target_call_iarg_regs[4];
1904             tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1905         } else {
1906             retaddr = TCG_REG_RAX;
1907             tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1908             tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1909                        TCG_TARGET_CALL_STACK_OFFSET);
1910         }
1911     }
1912
1913     /* "Tail call" to the helper, with the return address back inline.  */
1914     tcg_out_push(s, retaddr);
1915     tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1916     return true;
1917 }
1918 #elif TCG_TARGET_REG_BITS == 32
1919 # define x86_guest_base_seg     0
1920 # define x86_guest_base_index   -1
1921 # define x86_guest_base_offset  guest_base
1922 #else
1923 static int x86_guest_base_seg;
1924 static int x86_guest_base_index = -1;
1925 static int32_t x86_guest_base_offset;
1926 # if defined(__x86_64__) && defined(__linux__)
1927 #  include <asm/prctl.h>
1928 #  include <sys/prctl.h>
1929 int arch_prctl(int code, unsigned long addr);
1930 static inline int setup_guest_base_seg(void)
1931 {
1932     if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1933         return P_GS;
1934     }
1935     return 0;
1936 }
1937 # elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
1938 #  include <machine/sysarch.h>
1939 static inline int setup_guest_base_seg(void)
1940 {
1941     if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1942         return P_GS;
1943     }
1944     return 0;
1945 }
1946 # else
1947 static inline int setup_guest_base_seg(void)
1948 {
1949     return 0;
1950 }
1951 # endif
1952 #endif /* SOFTMMU */
1953
1954 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1955                                    TCGReg base, int index, intptr_t ofs,
1956                                    int seg, bool is64, MemOp memop)
1957 {
1958     bool use_movbe = false;
1959     int rexw = is64 * P_REXW;
1960     int movop = OPC_MOVL_GvEv;
1961
1962     /* Do big-endian loads with movbe.  */
1963     if (memop & MO_BSWAP) {
1964         tcg_debug_assert(have_movbe);
1965         use_movbe = true;
1966         movop = OPC_MOVBE_GyMy;
1967     }
1968
1969     switch (memop & MO_SSIZE) {
1970     case MO_UB:
1971         tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
1972                                  base, index, 0, ofs);
1973         break;
1974     case MO_SB:
1975         tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
1976                                  base, index, 0, ofs);
1977         break;
1978     case MO_UW:
1979         if (use_movbe) {
1980             /* There is no extending movbe; only low 16-bits are modified.  */
1981             if (datalo != base && datalo != index) {
1982                 /* XOR breaks dependency chains.  */
1983                 tgen_arithr(s, ARITH_XOR, datalo, datalo);
1984                 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1985                                          datalo, base, index, 0, ofs);
1986             } else {
1987                 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1988                                          datalo, base, index, 0, ofs);
1989                 tcg_out_ext16u(s, datalo, datalo);
1990             }
1991         } else {
1992             tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1993                                      base, index, 0, ofs);
1994         }
1995         break;
1996     case MO_SW:
1997         if (use_movbe) {
1998             tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1999                                      datalo, base, index, 0, ofs);
2000             tcg_out_ext16s(s, datalo, datalo, rexw);
2001         } else {
2002             tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
2003                                      datalo, base, index, 0, ofs);
2004         }
2005         break;
2006     case MO_UL:
2007         tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2008         break;
2009 #if TCG_TARGET_REG_BITS == 64
2010     case MO_SL:
2011         if (use_movbe) {
2012             tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + seg, datalo,
2013                                      base, index, 0, ofs);
2014             tcg_out_ext32s(s, datalo, datalo);
2015         } else {
2016             tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
2017                                      base, index, 0, ofs);
2018         }
2019         break;
2020 #endif
2021     case MO_Q:
2022         if (TCG_TARGET_REG_BITS == 64) {
2023             tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2024                                      base, index, 0, ofs);
2025         } else {
2026             if (use_movbe) {
2027                 TCGReg t = datalo;
2028                 datalo = datahi;
2029                 datahi = t;
2030             }
2031             if (base != datalo) {
2032                 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2033                                          base, index, 0, ofs);
2034                 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2035                                          base, index, 0, ofs + 4);
2036             } else {
2037                 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2038                                          base, index, 0, ofs + 4);
2039                 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2040                                          base, index, 0, ofs);
2041             }
2042         }
2043         break;
2044     default:
2045         g_assert_not_reached();
2046     }
2047 }
2048
2049 /* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
2050    EAX. It will be useful once fixed registers globals are less
2051    common. */
2052 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
2053 {
2054     TCGReg datalo, datahi, addrlo;
2055     TCGReg addrhi __attribute__((unused));
2056     TCGMemOpIdx oi;
2057     MemOp opc;
2058 #if defined(CONFIG_SOFTMMU)
2059     int mem_index;
2060     tcg_insn_unit *label_ptr[2];
2061 #endif
2062
2063     datalo = *args++;
2064     datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2065     addrlo = *args++;
2066     addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2067     oi = *args++;
2068     opc = get_memop(oi);
2069
2070 #if defined(CONFIG_SOFTMMU)
2071     mem_index = get_mmuidx(oi);
2072
2073     tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2074                      label_ptr, offsetof(CPUTLBEntry, addr_read));
2075
2076     /* TLB Hit.  */
2077     tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
2078
2079     /* Record the current context of a load into ldst label */
2080     add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2081                         s->code_ptr, label_ptr);
2082 #else
2083     tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2084                            x86_guest_base_offset, x86_guest_base_seg,
2085                            is64, opc);
2086 #endif
2087 }
2088
2089 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2090                                    TCGReg base, int index, intptr_t ofs,
2091                                    int seg, MemOp memop)
2092 {
2093     bool use_movbe = false;
2094     int movop = OPC_MOVL_EvGv;
2095
2096     /*
2097      * Do big-endian stores with movbe or softmmu.
2098      * User-only without movbe will have its swapping done generically.
2099      */
2100     if (memop & MO_BSWAP) {
2101         tcg_debug_assert(have_movbe);
2102         use_movbe = true;
2103         movop = OPC_MOVBE_MyGy;
2104     }
2105
2106     switch (memop & MO_SIZE) {
2107     case MO_8:
2108         /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2109         tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2110         tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2111                                  datalo, base, index, 0, ofs);
2112         break;
2113     case MO_16:
2114         tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
2115                                  base, index, 0, ofs);
2116         break;
2117     case MO_32:
2118         tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2119         break;
2120     case MO_64:
2121         if (TCG_TARGET_REG_BITS == 64) {
2122             tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2123                                      base, index, 0, ofs);
2124         } else {
2125             if (use_movbe) {
2126                 TCGReg t = datalo;
2127                 datalo = datahi;
2128                 datahi = t;
2129             }
2130             tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2131                                      base, index, 0, ofs);
2132             tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2133                                      base, index, 0, ofs + 4);
2134         }
2135         break;
2136     default:
2137         g_assert_not_reached();
2138     }
2139 }
2140
2141 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2142 {
2143     TCGReg datalo, datahi, addrlo;
2144     TCGReg addrhi __attribute__((unused));
2145     TCGMemOpIdx oi;
2146     MemOp opc;
2147 #if defined(CONFIG_SOFTMMU)
2148     int mem_index;
2149     tcg_insn_unit *label_ptr[2];
2150 #endif
2151
2152     datalo = *args++;
2153     datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2154     addrlo = *args++;
2155     addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2156     oi = *args++;
2157     opc = get_memop(oi);
2158
2159 #if defined(CONFIG_SOFTMMU)
2160     mem_index = get_mmuidx(oi);
2161
2162     tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2163                      label_ptr, offsetof(CPUTLBEntry, addr_write));
2164
2165     /* TLB Hit.  */
2166     tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2167
2168     /* Record the current context of a store into ldst label */
2169     add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
2170                         s->code_ptr, label_ptr);
2171 #else
2172     tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2173                            x86_guest_base_offset, x86_guest_base_seg, opc);
2174 #endif
2175 }
2176
2177 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2178                               const TCGArg args[TCG_MAX_OP_ARGS],
2179                               const int const_args[TCG_MAX_OP_ARGS])
2180 {
2181     TCGArg a0, a1, a2;
2182     int c, const_a2, vexop, rexw = 0;
2183
2184 #if TCG_TARGET_REG_BITS == 64
2185 # define OP_32_64(x) \
2186         case glue(glue(INDEX_op_, x), _i64): \
2187             rexw = P_REXW; /* FALLTHRU */    \
2188         case glue(glue(INDEX_op_, x), _i32)
2189 #else
2190 # define OP_32_64(x) \
2191         case glue(glue(INDEX_op_, x), _i32)
2192 #endif
2193
2194     /* Hoist the loads of the most common arguments.  */
2195     a0 = args[0];
2196     a1 = args[1];
2197     a2 = args[2];
2198     const_a2 = const_args[2];
2199
2200     switch (opc) {
2201     case INDEX_op_exit_tb:
2202         /* Reuse the zeroing that exists for goto_ptr.  */
2203         if (a0 == 0) {
2204             tcg_out_jmp(s, tcg_code_gen_epilogue);
2205         } else {
2206             tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2207             tcg_out_jmp(s, tb_ret_addr);
2208         }
2209         break;
2210     case INDEX_op_goto_tb:
2211         if (s->tb_jmp_insn_offset) {
2212             /* direct jump method */
2213             int gap;
2214             /* jump displacement must be aligned for atomic patching;
2215              * see if we need to add extra nops before jump
2216              */
2217             gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2218             if (gap != 1) {
2219                 tcg_out_nopn(s, gap - 1);
2220             }
2221             tcg_out8(s, OPC_JMP_long); /* jmp im */
2222             s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2223             tcg_out32(s, 0);
2224         } else {
2225             /* indirect jump method */
2226             tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2227                                  (intptr_t)(s->tb_jmp_target_addr + a0));
2228         }
2229         set_jmp_reset_offset(s, a0);
2230         break;
2231     case INDEX_op_goto_ptr:
2232         /* jmp to the given host address (could be epilogue) */
2233         tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2234         break;
2235     case INDEX_op_br:
2236         tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2237         break;
2238     OP_32_64(ld8u):
2239         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2240         tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2241         break;
2242     OP_32_64(ld8s):
2243         tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2244         break;
2245     OP_32_64(ld16u):
2246         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2247         tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2248         break;
2249     OP_32_64(ld16s):
2250         tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2251         break;
2252 #if TCG_TARGET_REG_BITS == 64
2253     case INDEX_op_ld32u_i64:
2254 #endif
2255     case INDEX_op_ld_i32:
2256         tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2257         break;
2258
2259     OP_32_64(st8):
2260         if (const_args[0]) {
2261             tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2262             tcg_out8(s, a0);
2263         } else {
2264             tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2265         }
2266         break;
2267     OP_32_64(st16):
2268         if (const_args[0]) {
2269             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2270             tcg_out16(s, a0);
2271         } else {
2272             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2273         }
2274         break;
2275 #if TCG_TARGET_REG_BITS == 64
2276     case INDEX_op_st32_i64:
2277 #endif
2278     case INDEX_op_st_i32:
2279         if (const_args[0]) {
2280             tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2281             tcg_out32(s, a0);
2282         } else {
2283             tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2284         }
2285         break;
2286
2287     OP_32_64(add):
2288         /* For 3-operand addition, use LEA.  */
2289         if (a0 != a1) {
2290             TCGArg c3 = 0;
2291             if (const_a2) {
2292                 c3 = a2, a2 = -1;
2293             } else if (a0 == a2) {
2294                 /* Watch out for dest = src + dest, since we've removed
2295                    the matching constraint on the add.  */
2296                 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2297                 break;
2298             }
2299
2300             tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2301             break;
2302         }
2303         c = ARITH_ADD;
2304         goto gen_arith;
2305     OP_32_64(sub):
2306         c = ARITH_SUB;
2307         goto gen_arith;
2308     OP_32_64(and):
2309         c = ARITH_AND;
2310         goto gen_arith;
2311     OP_32_64(or):
2312         c = ARITH_OR;
2313         goto gen_arith;
2314     OP_32_64(xor):
2315         c = ARITH_XOR;
2316         goto gen_arith;
2317     gen_arith:
2318         if (const_a2) {
2319             tgen_arithi(s, c + rexw, a0, a2, 0);
2320         } else {
2321             tgen_arithr(s, c + rexw, a0, a2);
2322         }
2323         break;
2324
2325     OP_32_64(andc):
2326         if (const_a2) {
2327             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2328             tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2329         } else {
2330             tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2331         }
2332         break;
2333
2334     OP_32_64(mul):
2335         if (const_a2) {
2336             int32_t val;
2337             val = a2;
2338             if (val == (int8_t)val) {
2339                 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2340                 tcg_out8(s, val);
2341             } else {
2342                 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2343                 tcg_out32(s, val);
2344             }
2345         } else {
2346             tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2347         }
2348         break;
2349
2350     OP_32_64(div2):
2351         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2352         break;
2353     OP_32_64(divu2):
2354         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2355         break;
2356
2357     OP_32_64(shl):
2358         /* For small constant 3-operand shift, use LEA.  */
2359         if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2360             if (a2 - 1 == 0) {
2361                 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2362                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2363             } else {
2364                 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2365                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2366             }
2367             break;
2368         }
2369         c = SHIFT_SHL;
2370         vexop = OPC_SHLX;
2371         goto gen_shift_maybe_vex;
2372     OP_32_64(shr):
2373         c = SHIFT_SHR;
2374         vexop = OPC_SHRX;
2375         goto gen_shift_maybe_vex;
2376     OP_32_64(sar):
2377         c = SHIFT_SAR;
2378         vexop = OPC_SARX;
2379         goto gen_shift_maybe_vex;
2380     OP_32_64(rotl):
2381         c = SHIFT_ROL;
2382         goto gen_shift;
2383     OP_32_64(rotr):
2384         c = SHIFT_ROR;
2385         goto gen_shift;
2386     gen_shift_maybe_vex:
2387         if (have_bmi2) {
2388             if (!const_a2) {
2389                 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2390                 break;
2391             }
2392             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2393         }
2394         /* FALLTHRU */
2395     gen_shift:
2396         if (const_a2) {
2397             tcg_out_shifti(s, c + rexw, a0, a2);
2398         } else {
2399             tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2400         }
2401         break;
2402
2403     OP_32_64(ctz):
2404         tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2405         break;
2406     OP_32_64(clz):
2407         tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2408         break;
2409     OP_32_64(ctpop):
2410         tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2411         break;
2412
2413     case INDEX_op_brcond_i32:
2414         tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2415         break;
2416     case INDEX_op_setcond_i32:
2417         tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2418         break;
2419     case INDEX_op_movcond_i32:
2420         tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2421         break;
2422
2423     OP_32_64(bswap16):
2424         if (a2 & TCG_BSWAP_OS) {
2425             /* Output must be sign-extended. */
2426             if (rexw) {
2427                 tcg_out_bswap64(s, a0);
2428                 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2429             } else {
2430                 tcg_out_bswap32(s, a0);
2431                 tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2432             }
2433         } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2434             /* Output must be zero-extended, but input isn't. */
2435             tcg_out_bswap32(s, a0);
2436             tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2437         } else {
2438             tcg_out_rolw_8(s, a0);
2439         }
2440         break;
2441     OP_32_64(bswap32):
2442         tcg_out_bswap32(s, a0);
2443         if (rexw && (a2 & TCG_BSWAP_OS)) {
2444             tcg_out_ext32s(s, a0, a0);
2445         }
2446         break;
2447
2448     OP_32_64(neg):
2449         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2450         break;
2451     OP_32_64(not):
2452         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2453         break;
2454
2455     OP_32_64(ext8s):
2456         tcg_out_ext8s(s, a0, a1, rexw);
2457         break;
2458     OP_32_64(ext16s):
2459         tcg_out_ext16s(s, a0, a1, rexw);
2460         break;
2461     OP_32_64(ext8u):
2462         tcg_out_ext8u(s, a0, a1);
2463         break;
2464     OP_32_64(ext16u):
2465         tcg_out_ext16u(s, a0, a1);
2466         break;
2467
2468     case INDEX_op_qemu_ld_i32:
2469         tcg_out_qemu_ld(s, args, 0);
2470         break;
2471     case INDEX_op_qemu_ld_i64:
2472         tcg_out_qemu_ld(s, args, 1);
2473         break;
2474     case INDEX_op_qemu_st_i32:
2475     case INDEX_op_qemu_st8_i32:
2476         tcg_out_qemu_st(s, args, 0);
2477         break;
2478     case INDEX_op_qemu_st_i64:
2479         tcg_out_qemu_st(s, args, 1);
2480         break;
2481
2482     OP_32_64(mulu2):
2483         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2484         break;
2485     OP_32_64(muls2):
2486         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2487         break;
2488     OP_32_64(add2):
2489         if (const_args[4]) {
2490             tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2491         } else {
2492             tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2493         }
2494         if (const_args[5]) {
2495             tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2496         } else {
2497             tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2498         }
2499         break;
2500     OP_32_64(sub2):
2501         if (const_args[4]) {
2502             tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2503         } else {
2504             tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2505         }
2506         if (const_args[5]) {
2507             tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2508         } else {
2509             tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2510         }
2511         break;
2512
2513 #if TCG_TARGET_REG_BITS == 32
2514     case INDEX_op_brcond2_i32:
2515         tcg_out_brcond2(s, args, const_args, 0);
2516         break;
2517     case INDEX_op_setcond2_i32:
2518         tcg_out_setcond2(s, args, const_args);
2519         break;
2520 #else /* TCG_TARGET_REG_BITS == 64 */
2521     case INDEX_op_ld32s_i64:
2522         tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2523         break;
2524     case INDEX_op_ld_i64:
2525         tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2526         break;
2527     case INDEX_op_st_i64:
2528         if (const_args[0]) {
2529             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2530             tcg_out32(s, a0);
2531         } else {
2532             tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2533         }
2534         break;
2535
2536     case INDEX_op_brcond_i64:
2537         tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2538         break;
2539     case INDEX_op_setcond_i64:
2540         tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2541         break;
2542     case INDEX_op_movcond_i64:
2543         tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2544         break;
2545
2546     case INDEX_op_bswap64_i64:
2547         tcg_out_bswap64(s, a0);
2548         break;
2549     case INDEX_op_extu_i32_i64:
2550     case INDEX_op_ext32u_i64:
2551     case INDEX_op_extrl_i64_i32:
2552         tcg_out_ext32u(s, a0, a1);
2553         break;
2554     case INDEX_op_ext_i32_i64:
2555     case INDEX_op_ext32s_i64:
2556         tcg_out_ext32s(s, a0, a1);
2557         break;
2558     case INDEX_op_extrh_i64_i32:
2559         tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2560         break;
2561 #endif
2562
2563     OP_32_64(deposit):
2564         if (args[3] == 0 && args[4] == 8) {
2565             /* load bits 0..7 */
2566             tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2567         } else if (args[3] == 8 && args[4] == 8) {
2568             /* load bits 8..15 */
2569             tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2570         } else if (args[3] == 0 && args[4] == 16) {
2571             /* load bits 0..15 */
2572             tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2573         } else {
2574             tcg_abort();
2575         }
2576         break;
2577
2578     case INDEX_op_extract_i64:
2579         if (a2 + args[3] == 32) {
2580             /* This is a 32-bit zero-extending right shift.  */
2581             tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2582             tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2583             break;
2584         }
2585         /* FALLTHRU */
2586     case INDEX_op_extract_i32:
2587         /* On the off-chance that we can use the high-byte registers.
2588            Otherwise we emit the same ext16 + shift pattern that we
2589            would have gotten from the normal tcg-op.c expansion.  */
2590         tcg_debug_assert(a2 == 8 && args[3] == 8);
2591         if (a1 < 4 && a0 < 8) {
2592             tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2593         } else {
2594             tcg_out_ext16u(s, a0, a1);
2595             tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2596         }
2597         break;
2598
2599     case INDEX_op_sextract_i32:
2600         /* We don't implement sextract_i64, as we cannot sign-extend to
2601            64-bits without using the REX prefix that explicitly excludes
2602            access to the high-byte registers.  */
2603         tcg_debug_assert(a2 == 8 && args[3] == 8);
2604         if (a1 < 4 && a0 < 8) {
2605             tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2606         } else {
2607             tcg_out_ext16s(s, a0, a1, 0);
2608             tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2609         }
2610         break;
2611
2612     OP_32_64(extract2):
2613         /* Note that SHRD outputs to the r/m operand.  */
2614         tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2615         tcg_out8(s, args[3]);
2616         break;
2617
2618     case INDEX_op_mb:
2619         tcg_out_mb(s, a0);
2620         break;
2621     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2622     case INDEX_op_mov_i64:
2623     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2624     default:
2625         tcg_abort();
2626     }
2627
2628 #undef OP_32_64
2629 }
2630
2631 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2632                            unsigned vecl, unsigned vece,
2633                            const TCGArg args[TCG_MAX_OP_ARGS],
2634                            const int const_args[TCG_MAX_OP_ARGS])
2635 {
2636     static int const add_insn[4] = {
2637         OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2638     };
2639     static int const ssadd_insn[4] = {
2640         OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2641     };
2642     static int const usadd_insn[4] = {
2643         OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2644     };
2645     static int const sub_insn[4] = {
2646         OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2647     };
2648     static int const sssub_insn[4] = {
2649         OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2650     };
2651     static int const ussub_insn[4] = {
2652         OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2653     };
2654     static int const mul_insn[4] = {
2655         OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
2656     };
2657     static int const shift_imm_insn[4] = {
2658         OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2659     };
2660     static int const cmpeq_insn[4] = {
2661         OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2662     };
2663     static int const cmpgt_insn[4] = {
2664         OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2665     };
2666     static int const punpckl_insn[4] = {
2667         OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2668     };
2669     static int const punpckh_insn[4] = {
2670         OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2671     };
2672     static int const packss_insn[4] = {
2673         OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2674     };
2675     static int const packus_insn[4] = {
2676         OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2677     };
2678     static int const smin_insn[4] = {
2679         OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
2680     };
2681     static int const smax_insn[4] = {
2682         OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
2683     };
2684     static int const umin_insn[4] = {
2685         OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
2686     };
2687     static int const umax_insn[4] = {
2688         OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
2689     };
2690     static int const shlv_insn[4] = {
2691         /* TODO: AVX512 adds support for MO_16.  */
2692         OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
2693     };
2694     static int const shrv_insn[4] = {
2695         /* TODO: AVX512 adds support for MO_16.  */
2696         OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
2697     };
2698     static int const sarv_insn[4] = {
2699         /* TODO: AVX512 adds support for MO_16, MO_64.  */
2700         OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
2701     };
2702     static int const shls_insn[4] = {
2703         OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2704     };
2705     static int const shrs_insn[4] = {
2706         OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2707     };
2708     static int const sars_insn[4] = {
2709         OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2
2710     };
2711     static int const abs_insn[4] = {
2712         /* TODO: AVX512 adds support for MO_64.  */
2713         OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
2714     };
2715
2716     TCGType type = vecl + TCG_TYPE_V64;
2717     int insn, sub;
2718     TCGArg a0, a1, a2;
2719
2720     a0 = args[0];
2721     a1 = args[1];
2722     a2 = args[2];
2723
2724     switch (opc) {
2725     case INDEX_op_add_vec:
2726         insn = add_insn[vece];
2727         goto gen_simd;
2728     case INDEX_op_ssadd_vec:
2729         insn = ssadd_insn[vece];
2730         goto gen_simd;
2731     case INDEX_op_usadd_vec:
2732         insn = usadd_insn[vece];
2733         goto gen_simd;
2734     case INDEX_op_sub_vec:
2735         insn = sub_insn[vece];
2736         goto gen_simd;
2737     case INDEX_op_sssub_vec:
2738         insn = sssub_insn[vece];
2739         goto gen_simd;
2740     case INDEX_op_ussub_vec:
2741         insn = ussub_insn[vece];
2742         goto gen_simd;
2743     case INDEX_op_mul_vec:
2744         insn = mul_insn[vece];
2745         goto gen_simd;
2746     case INDEX_op_and_vec:
2747         insn = OPC_PAND;
2748         goto gen_simd;
2749     case INDEX_op_or_vec:
2750         insn = OPC_POR;
2751         goto gen_simd;
2752     case INDEX_op_xor_vec:
2753         insn = OPC_PXOR;
2754         goto gen_simd;
2755     case INDEX_op_smin_vec:
2756         insn = smin_insn[vece];
2757         goto gen_simd;
2758     case INDEX_op_umin_vec:
2759         insn = umin_insn[vece];
2760         goto gen_simd;
2761     case INDEX_op_smax_vec:
2762         insn = smax_insn[vece];
2763         goto gen_simd;
2764     case INDEX_op_umax_vec:
2765         insn = umax_insn[vece];
2766         goto gen_simd;
2767     case INDEX_op_shlv_vec:
2768         insn = shlv_insn[vece];
2769         goto gen_simd;
2770     case INDEX_op_shrv_vec:
2771         insn = shrv_insn[vece];
2772         goto gen_simd;
2773     case INDEX_op_sarv_vec:
2774         insn = sarv_insn[vece];
2775         goto gen_simd;
2776     case INDEX_op_shls_vec:
2777         insn = shls_insn[vece];
2778         goto gen_simd;
2779     case INDEX_op_shrs_vec:
2780         insn = shrs_insn[vece];
2781         goto gen_simd;
2782     case INDEX_op_sars_vec:
2783         insn = sars_insn[vece];
2784         goto gen_simd;
2785     case INDEX_op_x86_punpckl_vec:
2786         insn = punpckl_insn[vece];
2787         goto gen_simd;
2788     case INDEX_op_x86_punpckh_vec:
2789         insn = punpckh_insn[vece];
2790         goto gen_simd;
2791     case INDEX_op_x86_packss_vec:
2792         insn = packss_insn[vece];
2793         goto gen_simd;
2794     case INDEX_op_x86_packus_vec:
2795         insn = packus_insn[vece];
2796         goto gen_simd;
2797 #if TCG_TARGET_REG_BITS == 32
2798     case INDEX_op_dup2_vec:
2799         /* First merge the two 32-bit inputs to a single 64-bit element. */
2800         tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
2801         /* Then replicate the 64-bit elements across the rest of the vector. */
2802         if (type != TCG_TYPE_V64) {
2803             tcg_out_dup_vec(s, type, MO_64, a0, a0);
2804         }
2805         break;
2806 #endif
2807     case INDEX_op_abs_vec:
2808         insn = abs_insn[vece];
2809         a2 = a1;
2810         a1 = 0;
2811         goto gen_simd;
2812     gen_simd:
2813         tcg_debug_assert(insn != OPC_UD2);
2814         if (type == TCG_TYPE_V256) {
2815             insn |= P_VEXL;
2816         }
2817         tcg_out_vex_modrm(s, insn, a0, a1, a2);
2818         break;
2819
2820     case INDEX_op_cmp_vec:
2821         sub = args[3];
2822         if (sub == TCG_COND_EQ) {
2823             insn = cmpeq_insn[vece];
2824         } else if (sub == TCG_COND_GT) {
2825             insn = cmpgt_insn[vece];
2826         } else {
2827             g_assert_not_reached();
2828         }
2829         goto gen_simd;
2830
2831     case INDEX_op_andc_vec:
2832         insn = OPC_PANDN;
2833         if (type == TCG_TYPE_V256) {
2834             insn |= P_VEXL;
2835         }
2836         tcg_out_vex_modrm(s, insn, a0, a2, a1);
2837         break;
2838
2839     case INDEX_op_shli_vec:
2840         sub = 6;
2841         goto gen_shift;
2842     case INDEX_op_shri_vec:
2843         sub = 2;
2844         goto gen_shift;
2845     case INDEX_op_sari_vec:
2846         tcg_debug_assert(vece != MO_64);
2847         sub = 4;
2848     gen_shift:
2849         tcg_debug_assert(vece != MO_8);
2850         insn = shift_imm_insn[vece];
2851         if (type == TCG_TYPE_V256) {
2852             insn |= P_VEXL;
2853         }
2854         tcg_out_vex_modrm(s, insn, sub, a0, a1);
2855         tcg_out8(s, a2);
2856         break;
2857
2858     case INDEX_op_ld_vec:
2859         tcg_out_ld(s, type, a0, a1, a2);
2860         break;
2861     case INDEX_op_st_vec:
2862         tcg_out_st(s, type, a0, a1, a2);
2863         break;
2864     case INDEX_op_dupm_vec:
2865         tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
2866         break;
2867
2868     case INDEX_op_x86_shufps_vec:
2869         insn = OPC_SHUFPS;
2870         sub = args[3];
2871         goto gen_simd_imm8;
2872     case INDEX_op_x86_blend_vec:
2873         if (vece == MO_16) {
2874             insn = OPC_PBLENDW;
2875         } else if (vece == MO_32) {
2876             insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2877         } else {
2878             g_assert_not_reached();
2879         }
2880         sub = args[3];
2881         goto gen_simd_imm8;
2882     case INDEX_op_x86_vperm2i128_vec:
2883         insn = OPC_VPERM2I128;
2884         sub = args[3];
2885         goto gen_simd_imm8;
2886     gen_simd_imm8:
2887         if (type == TCG_TYPE_V256) {
2888             insn |= P_VEXL;
2889         }
2890         tcg_out_vex_modrm(s, insn, a0, a1, a2);
2891         tcg_out8(s, sub);
2892         break;
2893
2894     case INDEX_op_x86_vpblendvb_vec:
2895         insn = OPC_VPBLENDVB;
2896         if (type == TCG_TYPE_V256) {
2897             insn |= P_VEXL;
2898         }
2899         tcg_out_vex_modrm(s, insn, a0, a1, a2);
2900         tcg_out8(s, args[3] << 4);
2901         break;
2902
2903     case INDEX_op_x86_psrldq_vec:
2904         tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
2905         tcg_out8(s, a2);
2906         break;
2907
2908     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
2909     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
2910     default:
2911         g_assert_not_reached();
2912     }
2913 }
2914
2915 static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
2916 {
2917     switch (op) {
2918     case INDEX_op_goto_ptr:
2919         return C_O0_I1(r);
2920
2921     case INDEX_op_ld8u_i32:
2922     case INDEX_op_ld8u_i64:
2923     case INDEX_op_ld8s_i32:
2924     case INDEX_op_ld8s_i64:
2925     case INDEX_op_ld16u_i32:
2926     case INDEX_op_ld16u_i64:
2927     case INDEX_op_ld16s_i32:
2928     case INDEX_op_ld16s_i64:
2929     case INDEX_op_ld_i32:
2930     case INDEX_op_ld32u_i64:
2931     case INDEX_op_ld32s_i64:
2932     case INDEX_op_ld_i64:
2933         return C_O1_I1(r, r);
2934
2935     case INDEX_op_st8_i32:
2936     case INDEX_op_st8_i64:
2937         return C_O0_I2(qi, r);
2938
2939     case INDEX_op_st16_i32:
2940     case INDEX_op_st16_i64:
2941     case INDEX_op_st_i32:
2942     case INDEX_op_st32_i64:
2943         return C_O0_I2(ri, r);
2944
2945     case INDEX_op_st_i64:
2946         return C_O0_I2(re, r);
2947
2948     case INDEX_op_add_i32:
2949     case INDEX_op_add_i64:
2950         return C_O1_I2(r, r, re);
2951
2952     case INDEX_op_sub_i32:
2953     case INDEX_op_sub_i64:
2954     case INDEX_op_mul_i32:
2955     case INDEX_op_mul_i64:
2956     case INDEX_op_or_i32:
2957     case INDEX_op_or_i64:
2958     case INDEX_op_xor_i32:
2959     case INDEX_op_xor_i64:
2960         return C_O1_I2(r, 0, re);
2961
2962     case INDEX_op_and_i32:
2963     case INDEX_op_and_i64:
2964         return C_O1_I2(r, 0, reZ);
2965
2966     case INDEX_op_andc_i32:
2967     case INDEX_op_andc_i64:
2968         return C_O1_I2(r, r, rI);
2969
2970     case INDEX_op_shl_i32:
2971     case INDEX_op_shl_i64:
2972     case INDEX_op_shr_i32:
2973     case INDEX_op_shr_i64:
2974     case INDEX_op_sar_i32:
2975     case INDEX_op_sar_i64:
2976         return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
2977
2978     case INDEX_op_rotl_i32:
2979     case INDEX_op_rotl_i64:
2980     case INDEX_op_rotr_i32:
2981     case INDEX_op_rotr_i64:
2982         return C_O1_I2(r, 0, ci);
2983
2984     case INDEX_op_brcond_i32:
2985     case INDEX_op_brcond_i64:
2986         return C_O0_I2(r, re);
2987
2988     case INDEX_op_bswap16_i32:
2989     case INDEX_op_bswap16_i64:
2990     case INDEX_op_bswap32_i32:
2991     case INDEX_op_bswap32_i64:
2992     case INDEX_op_bswap64_i64:
2993     case INDEX_op_neg_i32:
2994     case INDEX_op_neg_i64:
2995     case INDEX_op_not_i32:
2996     case INDEX_op_not_i64:
2997     case INDEX_op_extrh_i64_i32:
2998         return C_O1_I1(r, 0);
2999
3000     case INDEX_op_ext8s_i32:
3001     case INDEX_op_ext8s_i64:
3002     case INDEX_op_ext8u_i32:
3003     case INDEX_op_ext8u_i64:
3004         return C_O1_I1(r, q);
3005
3006     case INDEX_op_ext16s_i32:
3007     case INDEX_op_ext16s_i64:
3008     case INDEX_op_ext16u_i32:
3009     case INDEX_op_ext16u_i64:
3010     case INDEX_op_ext32s_i64:
3011     case INDEX_op_ext32u_i64:
3012     case INDEX_op_ext_i32_i64:
3013     case INDEX_op_extu_i32_i64:
3014     case INDEX_op_extrl_i64_i32:
3015     case INDEX_op_extract_i32:
3016     case INDEX_op_extract_i64:
3017     case INDEX_op_sextract_i32:
3018     case INDEX_op_ctpop_i32:
3019     case INDEX_op_ctpop_i64:
3020         return C_O1_I1(r, r);
3021
3022     case INDEX_op_extract2_i32:
3023     case INDEX_op_extract2_i64:
3024         return C_O1_I2(r, 0, r);
3025
3026     case INDEX_op_deposit_i32:
3027     case INDEX_op_deposit_i64:
3028         return C_O1_I2(Q, 0, Q);
3029
3030     case INDEX_op_setcond_i32:
3031     case INDEX_op_setcond_i64:
3032         return C_O1_I2(q, r, re);
3033
3034     case INDEX_op_movcond_i32:
3035     case INDEX_op_movcond_i64:
3036         return C_O1_I4(r, r, re, r, 0);
3037
3038     case INDEX_op_div2_i32:
3039     case INDEX_op_div2_i64:
3040     case INDEX_op_divu2_i32:
3041     case INDEX_op_divu2_i64:
3042         return C_O2_I3(a, d, 0, 1, r);
3043
3044     case INDEX_op_mulu2_i32:
3045     case INDEX_op_mulu2_i64:
3046     case INDEX_op_muls2_i32:
3047     case INDEX_op_muls2_i64:
3048         return C_O2_I2(a, d, a, r);
3049
3050     case INDEX_op_add2_i32:
3051     case INDEX_op_add2_i64:
3052     case INDEX_op_sub2_i32:
3053     case INDEX_op_sub2_i64:
3054         return C_O2_I4(r, r, 0, 1, re, re);
3055
3056     case INDEX_op_ctz_i32:
3057     case INDEX_op_ctz_i64:
3058         return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3059
3060     case INDEX_op_clz_i32:
3061     case INDEX_op_clz_i64:
3062         return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3063
3064     case INDEX_op_qemu_ld_i32:
3065         return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3066                 ? C_O1_I1(r, L) : C_O1_I2(r, L, L));
3067
3068     case INDEX_op_qemu_st_i32:
3069         return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3070                 ? C_O0_I2(L, L) : C_O0_I3(L, L, L));
3071     case INDEX_op_qemu_st8_i32:
3072         return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3073                 ? C_O0_I2(s, L) : C_O0_I3(s, L, L));
3074
3075     case INDEX_op_qemu_ld_i64:
3076         return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
3077                 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L)
3078                 : C_O2_I2(r, r, L, L));
3079
3080     case INDEX_op_qemu_st_i64:
3081         return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L)
3082                 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(L, L, L)
3083                 : C_O0_I4(L, L, L, L));
3084
3085     case INDEX_op_brcond2_i32:
3086         return C_O0_I4(r, r, ri, ri);
3087
3088     case INDEX_op_setcond2_i32:
3089         return C_O1_I4(r, r, r, ri, ri);
3090
3091     case INDEX_op_ld_vec:
3092     case INDEX_op_dupm_vec:
3093         return C_O1_I1(x, r);
3094
3095     case INDEX_op_st_vec:
3096         return C_O0_I2(x, r);
3097
3098     case INDEX_op_add_vec:
3099     case INDEX_op_sub_vec:
3100     case INDEX_op_mul_vec:
3101     case INDEX_op_and_vec:
3102     case INDEX_op_or_vec:
3103     case INDEX_op_xor_vec:
3104     case INDEX_op_andc_vec:
3105     case INDEX_op_ssadd_vec:
3106     case INDEX_op_usadd_vec:
3107     case INDEX_op_sssub_vec:
3108     case INDEX_op_ussub_vec:
3109     case INDEX_op_smin_vec:
3110     case INDEX_op_umin_vec:
3111     case INDEX_op_smax_vec:
3112     case INDEX_op_umax_vec:
3113     case INDEX_op_shlv_vec:
3114     case INDEX_op_shrv_vec:
3115     case INDEX_op_sarv_vec:
3116     case INDEX_op_shls_vec:
3117     case INDEX_op_shrs_vec:
3118     case INDEX_op_sars_vec:
3119     case INDEX_op_rotls_vec:
3120     case INDEX_op_cmp_vec:
3121     case INDEX_op_x86_shufps_vec:
3122     case INDEX_op_x86_blend_vec:
3123     case INDEX_op_x86_packss_vec:
3124     case INDEX_op_x86_packus_vec:
3125     case INDEX_op_x86_vperm2i128_vec:
3126     case INDEX_op_x86_punpckl_vec:
3127     case INDEX_op_x86_punpckh_vec:
3128 #if TCG_TARGET_REG_BITS == 32
3129     case INDEX_op_dup2_vec:
3130 #endif
3131         return C_O1_I2(x, x, x);
3132
3133     case INDEX_op_abs_vec:
3134     case INDEX_op_dup_vec:
3135     case INDEX_op_shli_vec:
3136     case INDEX_op_shri_vec:
3137     case INDEX_op_sari_vec:
3138     case INDEX_op_x86_psrldq_vec:
3139         return C_O1_I1(x, x);
3140
3141     case INDEX_op_x86_vpblendvb_vec:
3142         return C_O1_I3(x, x, x, x);
3143
3144     default:
3145         g_assert_not_reached();
3146     }
3147 }
3148
3149 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3150 {
3151     switch (opc) {
3152     case INDEX_op_add_vec:
3153     case INDEX_op_sub_vec:
3154     case INDEX_op_and_vec:
3155     case INDEX_op_or_vec:
3156     case INDEX_op_xor_vec:
3157     case INDEX_op_andc_vec:
3158         return 1;
3159     case INDEX_op_rotli_vec:
3160     case INDEX_op_cmp_vec:
3161     case INDEX_op_cmpsel_vec:
3162         return -1;
3163
3164     case INDEX_op_shli_vec:
3165     case INDEX_op_shri_vec:
3166         /* We must expand the operation for MO_8.  */
3167         return vece == MO_8 ? -1 : 1;
3168
3169     case INDEX_op_sari_vec:
3170         /* We must expand the operation for MO_8.  */
3171         if (vece == MO_8) {
3172             return -1;
3173         }
3174         /* We can emulate this for MO_64, but it does not pay off
3175            unless we're producing at least 4 values.  */
3176         if (vece == MO_64) {
3177             return type >= TCG_TYPE_V256 ? -1 : 0;
3178         }
3179         return 1;
3180
3181     case INDEX_op_shls_vec:
3182     case INDEX_op_shrs_vec:
3183         return vece >= MO_16;
3184     case INDEX_op_sars_vec:
3185         return vece >= MO_16 && vece <= MO_32;
3186     case INDEX_op_rotls_vec:
3187         return vece >= MO_16 ? -1 : 0;
3188
3189     case INDEX_op_shlv_vec:
3190     case INDEX_op_shrv_vec:
3191         return have_avx2 && vece >= MO_32;
3192     case INDEX_op_sarv_vec:
3193         return have_avx2 && vece == MO_32;
3194     case INDEX_op_rotlv_vec:
3195     case INDEX_op_rotrv_vec:
3196         return have_avx2 && vece >= MO_32 ? -1 : 0;
3197
3198     case INDEX_op_mul_vec:
3199         if (vece == MO_8) {
3200             /* We can expand the operation for MO_8.  */
3201             return -1;
3202         }
3203         if (vece == MO_64) {
3204             return 0;
3205         }
3206         return 1;
3207
3208     case INDEX_op_ssadd_vec:
3209     case INDEX_op_usadd_vec:
3210     case INDEX_op_sssub_vec:
3211     case INDEX_op_ussub_vec:
3212         return vece <= MO_16;
3213     case INDEX_op_smin_vec:
3214     case INDEX_op_smax_vec:
3215     case INDEX_op_umin_vec:
3216     case INDEX_op_umax_vec:
3217     case INDEX_op_abs_vec:
3218         return vece <= MO_32;
3219
3220     default:
3221         return 0;
3222     }
3223 }
3224
3225 static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3226                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3227 {
3228     TCGv_vec t1, t2;
3229
3230     tcg_debug_assert(vece == MO_8);
3231
3232     t1 = tcg_temp_new_vec(type);
3233     t2 = tcg_temp_new_vec(type);
3234
3235     /*
3236      * Unpack to W, shift, and repack.  Tricky bits:
3237      * (1) Use punpck*bw x,x to produce DDCCBBAA,
3238      *     i.e. duplicate in other half of the 16-bit lane.
3239      * (2) For right-shift, add 8 so that the high half of the lane
3240      *     becomes zero.  For left-shift, and left-rotate, we must
3241      *     shift up and down again.
3242      * (3) Step 2 leaves high half zero such that PACKUSWB
3243      *     (pack with unsigned saturation) does not modify
3244      *     the quantity.
3245      */
3246     vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3247               tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3248     vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3249               tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3250
3251     if (opc != INDEX_op_rotli_vec) {
3252         imm += 8;
3253     }
3254     if (opc == INDEX_op_shri_vec) {
3255         tcg_gen_shri_vec(MO_16, t1, t1, imm);
3256         tcg_gen_shri_vec(MO_16, t2, t2, imm);
3257     } else {
3258         tcg_gen_shli_vec(MO_16, t1, t1, imm);
3259         tcg_gen_shli_vec(MO_16, t2, t2, imm);
3260         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3261         tcg_gen_shri_vec(MO_16, t2, t2, 8);
3262     }
3263
3264     vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3265               tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3266     tcg_temp_free_vec(t1);
3267     tcg_temp_free_vec(t2);
3268 }
3269
3270 static void expand_vec_sari(TCGType type, unsigned vece,
3271                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3272 {
3273     TCGv_vec t1, t2;
3274
3275     switch (vece) {
3276     case MO_8:
3277         /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3278         t1 = tcg_temp_new_vec(type);
3279         t2 = tcg_temp_new_vec(type);
3280         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3281                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3282         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3283                   tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3284         tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3285         tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3286         vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3287                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3288         tcg_temp_free_vec(t1);
3289         tcg_temp_free_vec(t2);
3290         break;
3291
3292     case MO_64:
3293         if (imm <= 32) {
3294             /*
3295              * We can emulate a small sign extend by performing an arithmetic
3296              * 32-bit shift and overwriting the high half of a 64-bit logical
3297              * shift.  Note that the ISA says shift of 32 is valid, but TCG
3298              * does not, so we have to bound the smaller shift -- we get the
3299              * same result in the high half either way.
3300              */
3301             t1 = tcg_temp_new_vec(type);
3302             tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3303             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3304             vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3305                       tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3306                       tcgv_vec_arg(t1), 0xaa);
3307             tcg_temp_free_vec(t1);
3308         } else {
3309             /* Otherwise we will need to use a compare vs 0 to produce
3310              * the sign-extend, shift and merge.
3311              */
3312             t1 = tcg_const_zeros_vec(type);
3313             tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
3314             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3315             tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3316             tcg_gen_or_vec(MO_64, v0, v0, t1);
3317             tcg_temp_free_vec(t1);
3318         }
3319         break;
3320
3321     default:
3322         g_assert_not_reached();
3323     }
3324 }
3325
3326 static void expand_vec_rotli(TCGType type, unsigned vece,
3327                              TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3328 {
3329     TCGv_vec t;
3330
3331     if (vece == MO_8) {
3332         expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3333         return;
3334     }
3335
3336     t = tcg_temp_new_vec(type);
3337     tcg_gen_shli_vec(vece, t, v1, imm);
3338     tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3339     tcg_gen_or_vec(vece, v0, v0, t);
3340     tcg_temp_free_vec(t);
3341 }
3342
3343 static void expand_vec_rotls(TCGType type, unsigned vece,
3344                              TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3345 {
3346     TCGv_i32 rsh;
3347     TCGv_vec t;
3348
3349     tcg_debug_assert(vece != MO_8);
3350
3351     t = tcg_temp_new_vec(type);
3352     rsh = tcg_temp_new_i32();
3353
3354     tcg_gen_neg_i32(rsh, lsh);
3355     tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3356     tcg_gen_shls_vec(vece, t, v1, lsh);
3357     tcg_gen_shrs_vec(vece, v0, v1, rsh);
3358     tcg_gen_or_vec(vece, v0, v0, t);
3359     tcg_temp_free_vec(t);
3360     tcg_temp_free_i32(rsh);
3361 }
3362
3363 static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3364                             TCGv_vec v1, TCGv_vec sh, bool right)
3365 {
3366     TCGv_vec t = tcg_temp_new_vec(type);
3367
3368     tcg_gen_dupi_vec(vece, t, 8 << vece);
3369     tcg_gen_sub_vec(vece, t, t, sh);
3370     if (right) {
3371         tcg_gen_shlv_vec(vece, t, v1, t);
3372         tcg_gen_shrv_vec(vece, v0, v1, sh);
3373     } else {
3374         tcg_gen_shrv_vec(vece, t, v1, t);
3375         tcg_gen_shlv_vec(vece, v0, v1, sh);
3376     }
3377     tcg_gen_or_vec(vece, v0, v0, t);
3378     tcg_temp_free_vec(t);
3379 }
3380
3381 static void expand_vec_mul(TCGType type, unsigned vece,
3382                            TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3383 {
3384     TCGv_vec t1, t2, t3, t4, zero;
3385
3386     tcg_debug_assert(vece == MO_8);
3387
3388     /*
3389      * Unpack v1 bytes to words, 0 | x.
3390      * Unpack v2 bytes to words, y | 0.
3391      * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3392      * Shift logical right by 8 bits to clear the high 8 bytes before
3393      * using an unsigned saturated pack.
3394      *
3395      * The difference between the V64, V128 and V256 cases is merely how
3396      * we distribute the expansion between temporaries.
3397      */
3398     switch (type) {
3399     case TCG_TYPE_V64:
3400         t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3401         t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3402         zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3403         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3404                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3405         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3406                   tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3407         tcg_gen_mul_vec(MO_16, t1, t1, t2);
3408         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3409         vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3410                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3411         tcg_temp_free_vec(t1);
3412         tcg_temp_free_vec(t2);
3413         break;
3414
3415     case TCG_TYPE_V128:
3416     case TCG_TYPE_V256:
3417         t1 = tcg_temp_new_vec(type);
3418         t2 = tcg_temp_new_vec(type);
3419         t3 = tcg_temp_new_vec(type);
3420         t4 = tcg_temp_new_vec(type);
3421         zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3422         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3423                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3424         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3425                   tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3426         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3427                   tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3428         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3429                   tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3430         tcg_gen_mul_vec(MO_16, t1, t1, t2);
3431         tcg_gen_mul_vec(MO_16, t3, t3, t4);
3432         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3433         tcg_gen_shri_vec(MO_16, t3, t3, 8);
3434         vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3435                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3436         tcg_temp_free_vec(t1);
3437         tcg_temp_free_vec(t2);
3438         tcg_temp_free_vec(t3);
3439         tcg_temp_free_vec(t4);
3440         break;
3441
3442     default:
3443         g_assert_not_reached();
3444     }
3445 }
3446
3447 static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3448                                  TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3449 {
3450     enum {
3451         NEED_INV  = 1,
3452         NEED_SWAP = 2,
3453         NEED_BIAS = 4,
3454         NEED_UMIN = 8,
3455         NEED_UMAX = 16,
3456     };
3457     TCGv_vec t1, t2, t3;
3458     uint8_t fixup;
3459
3460     switch (cond) {
3461     case TCG_COND_EQ:
3462     case TCG_COND_GT:
3463         fixup = 0;
3464         break;
3465     case TCG_COND_NE:
3466     case TCG_COND_LE:
3467         fixup = NEED_INV;
3468         break;
3469     case TCG_COND_LT:
3470         fixup = NEED_SWAP;
3471         break;
3472     case TCG_COND_GE:
3473         fixup = NEED_SWAP | NEED_INV;
3474         break;
3475     case TCG_COND_LEU:
3476         if (vece <= MO_32) {
3477             fixup = NEED_UMIN;
3478         } else {
3479             fixup = NEED_BIAS | NEED_INV;
3480         }
3481         break;
3482     case TCG_COND_GTU:
3483         if (vece <= MO_32) {
3484             fixup = NEED_UMIN | NEED_INV;
3485         } else {
3486             fixup = NEED_BIAS;
3487         }
3488         break;
3489     case TCG_COND_GEU:
3490         if (vece <= MO_32) {
3491             fixup = NEED_UMAX;
3492         } else {
3493             fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3494         }
3495         break;
3496     case TCG_COND_LTU:
3497         if (vece <= MO_32) {
3498             fixup = NEED_UMAX | NEED_INV;
3499         } else {
3500             fixup = NEED_BIAS | NEED_SWAP;
3501         }
3502         break;
3503     default:
3504         g_assert_not_reached();
3505     }
3506
3507     if (fixup & NEED_INV) {
3508         cond = tcg_invert_cond(cond);
3509     }
3510     if (fixup & NEED_SWAP) {
3511         t1 = v1, v1 = v2, v2 = t1;
3512         cond = tcg_swap_cond(cond);
3513     }
3514
3515     t1 = t2 = NULL;
3516     if (fixup & (NEED_UMIN | NEED_UMAX)) {
3517         t1 = tcg_temp_new_vec(type);
3518         if (fixup & NEED_UMIN) {
3519             tcg_gen_umin_vec(vece, t1, v1, v2);
3520         } else {
3521             tcg_gen_umax_vec(vece, t1, v1, v2);
3522         }
3523         v2 = t1;
3524         cond = TCG_COND_EQ;
3525     } else if (fixup & NEED_BIAS) {
3526         t1 = tcg_temp_new_vec(type);
3527         t2 = tcg_temp_new_vec(type);
3528         t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
3529         tcg_gen_sub_vec(vece, t1, v1, t3);
3530         tcg_gen_sub_vec(vece, t2, v2, t3);
3531         v1 = t1;
3532         v2 = t2;
3533         cond = tcg_signed_cond(cond);
3534     }
3535
3536     tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3537     /* Expand directly; do not recurse.  */
3538     vec_gen_4(INDEX_op_cmp_vec, type, vece,
3539               tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3540
3541     if (t1) {
3542         tcg_temp_free_vec(t1);
3543         if (t2) {
3544             tcg_temp_free_vec(t2);
3545         }
3546     }
3547     return fixup & NEED_INV;
3548 }
3549
3550 static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3551                            TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3552 {
3553     if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3554         tcg_gen_not_vec(vece, v0, v0);
3555     }
3556 }
3557
3558 static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3559                               TCGv_vec c1, TCGv_vec c2,
3560                               TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3561 {
3562     TCGv_vec t = tcg_temp_new_vec(type);
3563
3564     if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3565         /* Invert the sense of the compare by swapping arguments.  */
3566         TCGv_vec x;
3567         x = v3, v3 = v4, v4 = x;
3568     }
3569     vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3570               tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3571               tcgv_vec_arg(v3), tcgv_vec_arg(t));
3572     tcg_temp_free_vec(t);
3573 }
3574
3575 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3576                        TCGArg a0, ...)
3577 {
3578     va_list va;
3579     TCGArg a2;
3580     TCGv_vec v0, v1, v2, v3, v4;
3581
3582     va_start(va, a0);
3583     v0 = temp_tcgv_vec(arg_temp(a0));
3584     v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3585     a2 = va_arg(va, TCGArg);
3586
3587     switch (opc) {
3588     case INDEX_op_shli_vec:
3589     case INDEX_op_shri_vec:
3590         expand_vec_shi(type, vece, opc, v0, v1, a2);
3591         break;
3592
3593     case INDEX_op_sari_vec:
3594         expand_vec_sari(type, vece, v0, v1, a2);
3595         break;
3596
3597     case INDEX_op_rotli_vec:
3598         expand_vec_rotli(type, vece, v0, v1, a2);
3599         break;
3600
3601     case INDEX_op_rotls_vec:
3602         expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3603         break;
3604
3605     case INDEX_op_rotlv_vec:
3606         v2 = temp_tcgv_vec(arg_temp(a2));
3607         expand_vec_rotv(type, vece, v0, v1, v2, false);
3608         break;
3609     case INDEX_op_rotrv_vec:
3610         v2 = temp_tcgv_vec(arg_temp(a2));
3611         expand_vec_rotv(type, vece, v0, v1, v2, true);
3612         break;
3613
3614     case INDEX_op_mul_vec:
3615         v2 = temp_tcgv_vec(arg_temp(a2));
3616         expand_vec_mul(type, vece, v0, v1, v2);
3617         break;
3618
3619     case INDEX_op_cmp_vec:
3620         v2 = temp_tcgv_vec(arg_temp(a2));
3621         expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3622         break;
3623
3624     case INDEX_op_cmpsel_vec:
3625         v2 = temp_tcgv_vec(arg_temp(a2));
3626         v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3627         v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3628         expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3629         break;
3630
3631     default:
3632         break;
3633     }
3634
3635     va_end(va);
3636 }
3637
3638 static const int tcg_target_callee_save_regs[] = {
3639 #if TCG_TARGET_REG_BITS == 64
3640     TCG_REG_RBP,
3641     TCG_REG_RBX,
3642 #if defined(_WIN64)
3643     TCG_REG_RDI,
3644     TCG_REG_RSI,
3645 #endif
3646     TCG_REG_R12,
3647     TCG_REG_R13,
3648     TCG_REG_R14, /* Currently used for the global env. */
3649     TCG_REG_R15,
3650 #else
3651     TCG_REG_EBP, /* Currently used for the global env. */
3652     TCG_REG_EBX,
3653     TCG_REG_ESI,
3654     TCG_REG_EDI,
3655 #endif
3656 };
3657
3658 /* Compute frame size via macros, to share between tcg_target_qemu_prologue
3659    and tcg_register_jit.  */
3660
3661 #define PUSH_SIZE \
3662     ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
3663      * (TCG_TARGET_REG_BITS / 8))
3664
3665 #define FRAME_SIZE \
3666     ((PUSH_SIZE \
3667       + TCG_STATIC_CALL_ARGS_SIZE \
3668       + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3669       + TCG_TARGET_STACK_ALIGN - 1) \
3670      & ~(TCG_TARGET_STACK_ALIGN - 1))
3671
3672 /* Generate global QEMU prologue and epilogue code */
3673 static void tcg_target_qemu_prologue(TCGContext *s)
3674 {
3675     int i, stack_addend;
3676
3677     /* TB prologue */
3678
3679     /* Reserve some stack space, also for TCG temps.  */
3680     stack_addend = FRAME_SIZE - PUSH_SIZE;
3681     tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3682                   CPU_TEMP_BUF_NLONGS * sizeof(long));
3683
3684     /* Save all callee saved registers.  */
3685     for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3686         tcg_out_push(s, tcg_target_callee_save_regs[i]);
3687     }
3688
3689 #if TCG_TARGET_REG_BITS == 32
3690     tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3691                (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3692     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3693     /* jmp *tb.  */
3694     tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3695                          (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3696                          + stack_addend);
3697 #else
3698 # if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
3699     if (guest_base) {
3700         int seg = setup_guest_base_seg();
3701         if (seg != 0) {
3702             x86_guest_base_seg = seg;
3703         } else if (guest_base == (int32_t)guest_base) {
3704             x86_guest_base_offset = guest_base;
3705         } else {
3706             /* Choose R12 because, as a base, it requires a SIB byte. */
3707             x86_guest_base_index = TCG_REG_R12;
3708             tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
3709             tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
3710         }
3711     }
3712 # endif
3713     tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3714     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3715     /* jmp *tb.  */
3716     tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3717 #endif
3718
3719     /*
3720      * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3721      * and fall through to the rest of the epilogue.
3722      */
3723     tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
3724     tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3725
3726     /* TB epilogue */
3727     tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
3728
3729     tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3730
3731     if (have_avx2) {
3732         tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3733     }
3734     for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3735         tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3736     }
3737     tcg_out_opc(s, OPC_RET, 0, 0, 0);
3738 }
3739
3740 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3741 {
3742     memset(p, 0x90, count);
3743 }
3744
3745 static void tcg_target_init(TCGContext *s)
3746 {
3747 #ifdef CONFIG_CPUID_H
3748     unsigned a, b, c, d, b7 = 0;
3749     int max = __get_cpuid_max(0, 0);
3750
3751     if (max >= 7) {
3752         /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
3753         __cpuid_count(7, 0, a, b7, c, d);
3754         have_bmi1 = (b7 & bit_BMI) != 0;
3755         have_bmi2 = (b7 & bit_BMI2) != 0;
3756     }
3757
3758     if (max >= 1) {
3759         __cpuid(1, a, b, c, d);
3760 #ifndef have_cmov
3761         /* For 32-bit, 99% certainty that we're running on hardware that
3762            supports cmov, but we still need to check.  In case cmov is not
3763            available, we'll use a small forward branch.  */
3764         have_cmov = (d & bit_CMOV) != 0;
3765 #endif
3766
3767         /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
3768            need to probe for it.  */
3769         have_movbe = (c & bit_MOVBE) != 0;
3770         have_popcnt = (c & bit_POPCNT) != 0;
3771
3772         /* There are a number of things we must check before we can be
3773            sure of not hitting invalid opcode.  */
3774         if (c & bit_OSXSAVE) {
3775             unsigned xcrl, xcrh;
3776             /* The xgetbv instruction is not available to older versions of
3777              * the assembler, so we encode the instruction manually.
3778              */
3779             asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
3780             if ((xcrl & 6) == 6) {
3781                 have_avx1 = (c & bit_AVX) != 0;
3782                 have_avx2 = (b7 & bit_AVX2) != 0;
3783             }
3784         }
3785     }
3786
3787     max = __get_cpuid_max(0x8000000, 0);
3788     if (max >= 1) {
3789         __cpuid(0x80000001, a, b, c, d);
3790         /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
3791         have_lzcnt = (c & bit_LZCNT) != 0;
3792     }
3793 #endif /* CONFIG_CPUID_H */
3794
3795     tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3796     if (TCG_TARGET_REG_BITS == 64) {
3797         tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3798     }
3799     if (have_avx1) {
3800         tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3801         tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3802     }
3803     if (have_avx2) {
3804         tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3805     }
3806
3807     tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
3808     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3809     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3810     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3811     if (TCG_TARGET_REG_BITS == 64) {
3812 #if !defined(_WIN64)
3813         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3814         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3815 #endif
3816         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3817         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3818         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3819         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3820     }
3821
3822     s->reserved_regs = 0;
3823     tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
3824 }
3825
3826 typedef struct {
3827     DebugFrameHeader h;
3828     uint8_t fde_def_cfa[4];
3829     uint8_t fde_reg_ofs[14];
3830 } DebugFrame;
3831
3832 /* We're expecting a 2 byte uleb128 encoded value.  */
3833 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
3834
3835 #if !defined(__ELF__)
3836     /* Host machine without ELF. */
3837 #elif TCG_TARGET_REG_BITS == 64
3838 #define ELF_HOST_MACHINE EM_X86_64
3839 static const DebugFrame debug_frame = {
3840     .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3841     .h.cie.id = -1,
3842     .h.cie.version = 1,
3843     .h.cie.code_align = 1,
3844     .h.cie.data_align = 0x78,             /* sleb128 -8 */
3845     .h.cie.return_column = 16,
3846
3847     /* Total FDE size does not include the "len" member.  */
3848     .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3849
3850     .fde_def_cfa = {
3851         12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
3852         (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3853         (FRAME_SIZE >> 7)
3854     },
3855     .fde_reg_ofs = {
3856         0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
3857         /* The following ordering must match tcg_target_callee_save_regs.  */
3858         0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
3859         0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
3860         0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
3861         0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
3862         0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
3863         0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
3864     }
3865 };
3866 #else
3867 #define ELF_HOST_MACHINE EM_386
3868 static const DebugFrame debug_frame = {
3869     .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3870     .h.cie.id = -1,
3871     .h.cie.version = 1,
3872     .h.cie.code_align = 1,
3873     .h.cie.data_align = 0x7c,             /* sleb128 -4 */
3874     .h.cie.return_column = 8,
3875
3876     /* Total FDE size does not include the "len" member.  */
3877     .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3878
3879     .fde_def_cfa = {
3880         12, 4,                          /* DW_CFA_def_cfa %esp, ... */
3881         (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3882         (FRAME_SIZE >> 7)
3883     },
3884     .fde_reg_ofs = {
3885         0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
3886         /* The following ordering must match tcg_target_callee_save_regs.  */
3887         0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
3888         0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
3889         0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
3890         0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
3891     }
3892 };
3893 #endif
3894
3895 #if defined(ELF_HOST_MACHINE)
3896 void tcg_register_jit(const void *buf, size_t buf_size)
3897 {
3898     tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
3899 }
3900 #endif