tcg/i386/tcg-target.c.inc

   1 /*
   2  * Tiny Code Generator for QEMU
   3  *
   4  * Copyright (c) 2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "../tcg-ldst.c.inc"
  26 #include "../tcg-pool.c.inc"
  27
  28 #ifdef CONFIG_DEBUG_TCG
  29 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
  30 #if TCG_TARGET_REG_BITS == 64
  31     "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
  32 #else
  33     "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
  34 #endif
  35     "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
  36     "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
  37 #if TCG_TARGET_REG_BITS == 64
  38     "%xmm8", "%xmm9", "%xmm10", "%xmm11",
  39     "%xmm12", "%xmm13", "%xmm14", "%xmm15",
  40 #endif
  41 };
  42 #endif
  43
  44 static const int tcg_target_reg_alloc_order[] = {
  45 #if TCG_TARGET_REG_BITS == 64
  46     TCG_REG_RBP,
  47     TCG_REG_RBX,
  48     TCG_REG_R12,
  49     TCG_REG_R13,
  50     TCG_REG_R14,
  51     TCG_REG_R15,
  52     TCG_REG_R10,
  53     TCG_REG_R11,
  54     TCG_REG_R9,
  55     TCG_REG_R8,
  56     TCG_REG_RCX,
  57     TCG_REG_RDX,
  58     TCG_REG_RSI,
  59     TCG_REG_RDI,
  60     TCG_REG_RAX,
  61 #else
  62     TCG_REG_EBX,
  63     TCG_REG_ESI,
  64     TCG_REG_EDI,
  65     TCG_REG_EBP,
  66     TCG_REG_ECX,
  67     TCG_REG_EDX,
  68     TCG_REG_EAX,
  69 #endif
  70     TCG_REG_XMM0,
  71     TCG_REG_XMM1,
  72     TCG_REG_XMM2,
  73     TCG_REG_XMM3,
  74     TCG_REG_XMM4,
  75     TCG_REG_XMM5,
  76 #ifndef _WIN64
  77     /* The Win64 ABI has xmm6-xmm15 as caller-saves, and we do not save
  78        any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
  79     TCG_REG_XMM6,
  80     TCG_REG_XMM7,
  81 #if TCG_TARGET_REG_BITS == 64
  82     TCG_REG_XMM8,
  83     TCG_REG_XMM9,
  84     TCG_REG_XMM10,
  85     TCG_REG_XMM11,
  86     TCG_REG_XMM12,
  87     TCG_REG_XMM13,
  88     TCG_REG_XMM14,
  89     TCG_REG_XMM15,
  90 #endif
  91 #endif
  92 };
  93
  94 #define TCG_TMP_VEC  TCG_REG_XMM5
  95
  96 static const int tcg_target_call_iarg_regs[] = {
  97 #if TCG_TARGET_REG_BITS == 64
  98 #if defined(_WIN64)
  99     TCG_REG_RCX,
 100     TCG_REG_RDX,
 101 #else
 102     TCG_REG_RDI,
 103     TCG_REG_RSI,
 104     TCG_REG_RDX,
 105     TCG_REG_RCX,
 106 #endif
 107     TCG_REG_R8,
 108     TCG_REG_R9,
 109 #else
 110     /* 32 bit mode uses stack based calling convention (GCC default). */
 111 #endif
 112 };
 113
 114 static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
 115 {
 116     switch (kind) {
 117     case TCG_CALL_RET_NORMAL:
 118         tcg_debug_assert(slot >= 0 && slot <= 1);
 119         return slot ? TCG_REG_EDX : TCG_REG_EAX;
 120 #ifdef _WIN64
 121     case TCG_CALL_RET_BY_VEC:
 122         tcg_debug_assert(slot == 0);
 123         return TCG_REG_XMM0;
 124 #endif
 125     default:
 126         g_assert_not_reached();
 127     }
 128 }
 129
 130 /* Constants we accept.  */
 131 #define TCG_CT_CONST_S32 0x100
 132 #define TCG_CT_CONST_U32 0x200
 133 #define TCG_CT_CONST_I32 0x400
 134 #define TCG_CT_CONST_WSZ 0x800
 135 #define TCG_CT_CONST_TST 0x1000
 136
 137 /* Registers used with L constraint, which are the first argument
 138    registers on x86_64, and two random call clobbered registers on
 139    i386. */
 140 #if TCG_TARGET_REG_BITS == 64
 141 # define TCG_REG_L0 tcg_target_call_iarg_regs[0]
 142 # define TCG_REG_L1 tcg_target_call_iarg_regs[1]
 143 #else
 144 # define TCG_REG_L0 TCG_REG_EAX
 145 # define TCG_REG_L1 TCG_REG_EDX
 146 #endif
 147
 148 #if TCG_TARGET_REG_BITS == 64
 149 # define ALL_GENERAL_REGS      0x0000ffffu
 150 # define ALL_VECTOR_REGS       0xffff0000u
 151 # define ALL_BYTEL_REGS        ALL_GENERAL_REGS
 152 #else
 153 # define ALL_GENERAL_REGS      0x000000ffu
 154 # define ALL_VECTOR_REGS       0x00ff0000u
 155 # define ALL_BYTEL_REGS        0x0000000fu
 156 #endif
 157 #define SOFTMMU_RESERVE_REGS \
 158     (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0)
 159
 160 /* For 64-bit, we always know that CMOV is available.  */
 161 #if TCG_TARGET_REG_BITS == 64
 162 # define have_cmov      true
 163 #else
 164 # define have_cmov      (cpuinfo & CPUINFO_CMOV)
 165 #endif
 166 #define have_bmi2       (cpuinfo & CPUINFO_BMI2)
 167 #define have_lzcnt      (cpuinfo & CPUINFO_LZCNT)
 168
 169 static const tcg_insn_unit *tb_ret_addr;
 170
 171 static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
 172                         intptr_t value, intptr_t addend)
 173 {
 174     value += addend;
 175     switch(type) {
 176     case R_386_PC32:
 177         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 178         if (value != (int32_t)value) {
 179             return false;
 180         }
 181         /* FALLTHRU */
 182     case R_386_32:
 183         tcg_patch32(code_ptr, value);
 184         break;
 185     case R_386_PC8:
 186         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 187         if (value != (int8_t)value) {
 188             return false;
 189         }
 190         tcg_patch8(code_ptr, value);
 191         break;
 192     default:
 193         g_assert_not_reached();
 194     }
 195     return true;
 196 }
 197
 198 /* test if a constant matches the constraint */
 199 static bool tcg_target_const_match(int64_t val, int ct,
 200                                    TCGType type, TCGCond cond, int vece)
 201 {
 202     if (ct & TCG_CT_CONST) {
 203         return 1;
 204     }
 205     if (type == TCG_TYPE_I32) {
 206         if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 |
 207                   TCG_CT_CONST_I32 | TCG_CT_CONST_TST)) {
 208             return 1;
 209         }
 210     } else {
 211         if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
 212             return 1;
 213         }
 214         if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
 215             return 1;
 216         }
 217         if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
 218             return 1;
 219         }
 220         /*
 221          * This will be used in combination with TCG_CT_CONST_S32,
 222          * so "normal" TESTQ is already matched.  Also accept:
 223          *    TESTQ -> TESTL   (uint32_t)
 224          *    TESTQ -> BT      (is_power_of_2)
 225          */
 226         if ((ct & TCG_CT_CONST_TST)
 227             && is_tst_cond(cond)
 228             && (val == (uint32_t)val || is_power_of_2(val))) {
 229             return 1;
 230         }
 231     }
 232     if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
 233         return 1;
 234     }
 235     return 0;
 236 }
 237
 238 # define LOWREGMASK(x)  ((x) & 7)
 239
 240 #define P_EXT           0x100           /* 0x0f opcode prefix */
 241 #define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
 242 #define P_DATA16        0x400           /* 0x66 opcode prefix */
 243 #define P_VEXW          0x1000          /* Set VEX.W = 1 */
 244 #if TCG_TARGET_REG_BITS == 64
 245 # define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
 246 # define P_REXB_R       0x2000          /* REG field as byte register */
 247 # define P_REXB_RM      0x4000          /* R/M field as byte register */
 248 # define P_GS           0x8000          /* gs segment override */
 249 #else
 250 # define P_REXW         0
 251 # define P_REXB_R       0
 252 # define P_REXB_RM      0
 253 # define P_GS           0
 254 #endif
 255 #define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
 256 #define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
 257 #define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
 258 #define P_VEXL          0x80000         /* Set VEX.L = 1 */
 259 #define P_EVEX          0x100000        /* Requires EVEX encoding */
 260
 261 #define OPC_ARITH_EbIb  (0x80)
 262 #define OPC_ARITH_EvIz  (0x81)
 263 #define OPC_ARITH_EvIb  (0x83)
 264 #define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
 265 #define OPC_ANDN        (0xf2 | P_EXT38)
 266 #define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
 267 #define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
 268 #define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
 269 #define OPC_BSF         (0xbc | P_EXT)
 270 #define OPC_BSR         (0xbd | P_EXT)
 271 #define OPC_BSWAP       (0xc8 | P_EXT)
 272 #define OPC_CALL_Jz     (0xe8)
 273 #define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
 274 #define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
 275 #define OPC_DEC_r32     (0x48)
 276 #define OPC_IMUL_GvEv   (0xaf | P_EXT)
 277 #define OPC_IMUL_GvEvIb (0x6b)
 278 #define OPC_IMUL_GvEvIz (0x69)
 279 #define OPC_INC_r32     (0x40)
 280 #define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
 281 #define OPC_JCC_short   (0x70)          /* ... plus condition code */
 282 #define OPC_JMP_long    (0xe9)
 283 #define OPC_JMP_short   (0xeb)
 284 #define OPC_LEA         (0x8d)
 285 #define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
 286 #define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
 287 #define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
 288 #define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
 289 #define OPC_MOVB_EvIz   (0xc6)
 290 #define OPC_MOVL_EvIz   (0xc7)
 291 #define OPC_MOVB_Ib     (0xb0)
 292 #define OPC_MOVL_Iv     (0xb8)
 293 #define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
 294 #define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
 295 #define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
 296 #define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
 297 #define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
 298 #define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
 299 #define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
 300 #define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
 301 #define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
 302 #define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
 303 #define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
 304 #define OPC_MOVSBL      (0xbe | P_EXT)
 305 #define OPC_MOVSWL      (0xbf | P_EXT)
 306 #define OPC_MOVSLQ      (0x63 | P_REXW)
 307 #define OPC_MOVZBL      (0xb6 | P_EXT)
 308 #define OPC_MOVZWL      (0xb7 | P_EXT)
 309 #define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
 310 #define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
 311 #define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
 312 #define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 313 #define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
 314 #define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
 315 #define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
 316 #define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
 317 #define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
 318 #define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
 319 #define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
 320 #define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
 321 #define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
 322 #define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
 323 #define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
 324 #define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
 325 #define OPC_PAND        (0xdb | P_EXT | P_DATA16)
 326 #define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
 327 #define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
 328 #define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
 329 #define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
 330 #define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
 331 #define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
 332 #define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
 333 #define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
 334 #define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
 335 #define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
 336 #define OPC_PEXTRD      (0x16 | P_EXT3A | P_DATA16)
 337 #define OPC_PINSRD      (0x22 | P_EXT3A | P_DATA16)
 338 #define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
 339 #define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
 340 #define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
 341 #define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 342 #define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
 343 #define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
 344 #define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
 345 #define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 346 #define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
 347 #define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
 348 #define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
 349 #define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 350 #define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
 351 #define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
 352 #define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
 353 #define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 354 #define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
 355 #define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
 356 #define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
 357 #define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
 358 #define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
 359 #define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
 360 #define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
 361 #define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
 362 #define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 363 #define OPC_POR         (0xeb | P_EXT | P_DATA16)
 364 #define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
 365 #define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
 366 #define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
 367 #define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
 368 #define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
 369 #define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
 370 #define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
 371 #define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
 372 #define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
 373 #define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
 374 #define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
 375 #define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
 376 #define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
 377 #define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
 378 #define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
 379 #define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
 380 #define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
 381 #define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
 382 #define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
 383 #define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
 384 #define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
 385 #define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
 386 #define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
 387 #define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
 388 #define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
 389 #define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
 390 #define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
 391 #define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
 392 #define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
 393 #define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
 394 #define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
 395 #define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
 396 #define OPC_PXOR        (0xef | P_EXT | P_DATA16)
 397 #define OPC_POP_r32     (0x58)
 398 #define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
 399 #define OPC_PUSH_r32    (0x50)
 400 #define OPC_PUSH_Iv     (0x68)
 401 #define OPC_PUSH_Ib     (0x6a)
 402 #define OPC_RET         (0xc3)
 403 #define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
 404 #define OPC_SHIFT_1     (0xd1)
 405 #define OPC_SHIFT_Ib    (0xc1)
 406 #define OPC_SHIFT_cl    (0xd3)
 407 #define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
 408 #define OPC_SHUFPS      (0xc6 | P_EXT)
 409 #define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
 410 #define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
 411 #define OPC_SHRD_Ib     (0xac | P_EXT)
 412 #define OPC_TESTB       (0x84)
 413 #define OPC_TESTL       (0x85)
 414 #define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
 415 #define OPC_UD2         (0x0b | P_EXT)
 416 #define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
 417 #define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
 418 #define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
 419 #define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
 420 #define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
 421 #define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
 422 #define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
 423 #define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
 424 #define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
 425 #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
 426 #define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
 427 #define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
 428 #define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
 429 #define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 430 #define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
 431 #define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 432 #define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
 433 #define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
 434 #define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
 435 #define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 436 #define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
 437 #define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 438 #define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 439 #define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
 440 #define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 441 #define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 442 #define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
 443 #define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
 444 #define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 445 #define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
 446 #define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 447 #define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 448 #define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
 449 #define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
 450 #define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
 451 #define OPC_VZEROUPPER  (0x77 | P_EXT)
 452 #define OPC_XCHG_ax_r32 (0x90)
 453 #define OPC_XCHG_EvGv   (0x87)
 454
 455 #define OPC_GRP3_Eb     (0xf6)
 456 #define OPC_GRP3_Ev     (0xf7)
 457 #define OPC_GRP5        (0xff)
 458 #define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
 459 #define OPC_GRPBT       (0xba | P_EXT)
 460
 461 #define OPC_GRPBT_BT    4
 462 #define OPC_GRPBT_BTS   5
 463 #define OPC_GRPBT_BTR   6
 464 #define OPC_GRPBT_BTC   7
 465
 466 /* Group 1 opcode extensions for 0x80-0x83.
 467    These are also used as modifiers for OPC_ARITH.  */
 468 #define ARITH_ADD 0
 469 #define ARITH_OR  1
 470 #define ARITH_ADC 2
 471 #define ARITH_SBB 3
 472 #define ARITH_AND 4
 473 #define ARITH_SUB 5
 474 #define ARITH_XOR 6
 475 #define ARITH_CMP 7
 476
 477 /* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
 478 #define SHIFT_ROL 0
 479 #define SHIFT_ROR 1
 480 #define SHIFT_SHL 4
 481 #define SHIFT_SHR 5
 482 #define SHIFT_SAR 7
 483
 484 /* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
 485 #define EXT3_TESTi 0
 486 #define EXT3_NOT   2
 487 #define EXT3_NEG   3
 488 #define EXT3_MUL   4
 489 #define EXT3_IMUL  5
 490 #define EXT3_DIV   6
 491 #define EXT3_IDIV  7
 492
 493 /* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
 494 #define EXT5_INC_Ev     0
 495 #define EXT5_DEC_Ev     1
 496 #define EXT5_CALLN_Ev   2
 497 #define EXT5_JMPN_Ev    4
 498
 499 /* Condition codes to be added to OPC_JCC_{long,short}.  */
 500 #define JCC_JMP (-1)
 501 #define JCC_JO  0x0
 502 #define JCC_JNO 0x1
 503 #define JCC_JB  0x2
 504 #define JCC_JAE 0x3
 505 #define JCC_JE  0x4
 506 #define JCC_JNE 0x5
 507 #define JCC_JBE 0x6
 508 #define JCC_JA  0x7
 509 #define JCC_JS  0x8
 510 #define JCC_JNS 0x9
 511 #define JCC_JP  0xa
 512 #define JCC_JNP 0xb
 513 #define JCC_JL  0xc
 514 #define JCC_JGE 0xd
 515 #define JCC_JLE 0xe
 516 #define JCC_JG  0xf
 517
 518 static const uint8_t tcg_cond_to_jcc[] = {
 519     [TCG_COND_EQ] = JCC_JE,
 520     [TCG_COND_NE] = JCC_JNE,
 521     [TCG_COND_LT] = JCC_JL,
 522     [TCG_COND_GE] = JCC_JGE,
 523     [TCG_COND_LE] = JCC_JLE,
 524     [TCG_COND_GT] = JCC_JG,
 525     [TCG_COND_LTU] = JCC_JB,
 526     [TCG_COND_GEU] = JCC_JAE,
 527     [TCG_COND_LEU] = JCC_JBE,
 528     [TCG_COND_GTU] = JCC_JA,
 529     [TCG_COND_TSTEQ] = JCC_JE,
 530     [TCG_COND_TSTNE] = JCC_JNE,
 531 };
 532
 533 #if TCG_TARGET_REG_BITS == 64
 534 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 535 {
 536     int rex;
 537
 538     if (opc & P_GS) {
 539         tcg_out8(s, 0x65);
 540     }
 541     if (opc & P_DATA16) {
 542         /* We should never be asking for both 16 and 64-bit operation.  */
 543         tcg_debug_assert((opc & P_REXW) == 0);
 544         tcg_out8(s, 0x66);
 545     }
 546     if (opc & P_SIMDF3) {
 547         tcg_out8(s, 0xf3);
 548     } else if (opc & P_SIMDF2) {
 549         tcg_out8(s, 0xf2);
 550     }
 551
 552     rex = 0;
 553     rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
 554     rex |= (r & 8) >> 1;                /* REX.R */
 555     rex |= (x & 8) >> 2;                /* REX.X */
 556     rex |= (rm & 8) >> 3;               /* REX.B */
 557
 558     /* P_REXB_{R,RM} indicates that the given register is the low byte.
 559        For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
 560        as otherwise the encoding indicates %[abcd]h.  Note that the values
 561        that are ORed in merely indicate that the REX byte must be present;
 562        those bits get discarded in output.  */
 563     rex |= opc & (r >= 4 ? P_REXB_R : 0);
 564     rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
 565
 566     if (rex) {
 567         tcg_out8(s, (uint8_t)(rex | 0x40));
 568     }
 569
 570     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 571         tcg_out8(s, 0x0f);
 572         if (opc & P_EXT38) {
 573             tcg_out8(s, 0x38);
 574         } else if (opc & P_EXT3A) {
 575             tcg_out8(s, 0x3a);
 576         }
 577     }
 578
 579     tcg_out8(s, opc);
 580 }
 581 #else
 582 static void tcg_out_opc(TCGContext *s, int opc)
 583 {
 584     if (opc & P_DATA16) {
 585         tcg_out8(s, 0x66);
 586     }
 587     if (opc & P_SIMDF3) {
 588         tcg_out8(s, 0xf3);
 589     } else if (opc & P_SIMDF2) {
 590         tcg_out8(s, 0xf2);
 591     }
 592     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 593         tcg_out8(s, 0x0f);
 594         if (opc & P_EXT38) {
 595             tcg_out8(s, 0x38);
 596         } else if (opc & P_EXT3A) {
 597             tcg_out8(s, 0x3a);
 598         }
 599     }
 600     tcg_out8(s, opc);
 601 }
 602 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
 603    the 32-bit compilation paths.  This method works with all versions of gcc,
 604    whereas relying on optimization may not be able to exclude them.  */
 605 #define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
 606 #endif
 607
 608 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
 609 {
 610     tcg_out_opc(s, opc, r, rm, 0);
 611     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 612 }
 613
 614 static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
 615                             int rm, int index)
 616 {
 617     int tmp;
 618
 619     if (opc & P_GS) {
 620         tcg_out8(s, 0x65);
 621     }
 622     /* Use the two byte form if possible, which cannot encode
 623        VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
 624     if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
 625         && ((rm | index) & 8) == 0) {
 626         /* Two byte VEX prefix.  */
 627         tcg_out8(s, 0xc5);
 628
 629         tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
 630     } else {
 631         /* Three byte VEX prefix.  */
 632         tcg_out8(s, 0xc4);
 633
 634         /* VEX.m-mmmm */
 635         if (opc & P_EXT3A) {
 636             tmp = 3;
 637         } else if (opc & P_EXT38) {
 638             tmp = 2;
 639         } else if (opc & P_EXT) {
 640             tmp = 1;
 641         } else {
 642             g_assert_not_reached();
 643         }
 644         tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
 645         tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
 646         tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
 647         tcg_out8(s, tmp);
 648
 649         tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
 650     }
 651
 652     tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
 653     /* VEX.pp */
 654     if (opc & P_DATA16) {
 655         tmp |= 1;                          /* 0x66 */
 656     } else if (opc & P_SIMDF3) {
 657         tmp |= 2;                          /* 0xf3 */
 658     } else if (opc & P_SIMDF2) {
 659         tmp |= 3;                          /* 0xf2 */
 660     }
 661     tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
 662     tcg_out8(s, tmp);
 663     tcg_out8(s, opc);
 664 }
 665
 666 static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
 667                              int rm, int index)
 668 {
 669     /* The entire 4-byte evex prefix; with R' and V' set. */
 670     uint32_t p = 0x08041062;
 671     int mm, pp;
 672
 673     tcg_debug_assert(have_avx512vl);
 674
 675     /* EVEX.mm */
 676     if (opc & P_EXT3A) {
 677         mm = 3;
 678     } else if (opc & P_EXT38) {
 679         mm = 2;
 680     } else if (opc & P_EXT) {
 681         mm = 1;
 682     } else {
 683         g_assert_not_reached();
 684     }
 685
 686     /* EVEX.pp */
 687     if (opc & P_DATA16) {
 688         pp = 1;                          /* 0x66 */
 689     } else if (opc & P_SIMDF3) {
 690         pp = 2;                          /* 0xf3 */
 691     } else if (opc & P_SIMDF2) {
 692         pp = 3;                          /* 0xf2 */
 693     } else {
 694         pp = 0;
 695     }
 696
 697     p = deposit32(p, 8, 2, mm);
 698     p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
 699     p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
 700     p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
 701     p = deposit32(p, 16, 2, pp);
 702     p = deposit32(p, 19, 4, ~v);
 703     p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
 704     p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
 705
 706     tcg_out32(s, p);
 707     tcg_out8(s, opc);
 708 }
 709
 710 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
 711 {
 712     if (opc & P_EVEX) {
 713         tcg_out_evex_opc(s, opc, r, v, rm, 0);
 714     } else {
 715         tcg_out_vex_opc(s, opc, r, v, rm, 0);
 716     }
 717     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 718 }
 719
 720 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
 721    We handle either RM and INDEX missing with a negative value.  In 64-bit
 722    mode for absolute addresses, ~RM is the size of the immediate operand
 723    that will follow the instruction.  */
 724
 725 static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
 726                                int shift, intptr_t offset)
 727 {
 728     int mod, len;
 729
 730     if (index < 0 && rm < 0) {
 731         if (TCG_TARGET_REG_BITS == 64) {
 732             /* Try for a rip-relative addressing mode.  This has replaced
 733                the 32-bit-mode absolute addressing encoding.  */
 734             intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
 735             intptr_t disp = offset - pc;
 736             if (disp == (int32_t)disp) {
 737                 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
 738                 tcg_out32(s, disp);
 739                 return;
 740             }
 741
 742             /* Try for an absolute address encoding.  This requires the
 743                use of the MODRM+SIB encoding and is therefore larger than
 744                rip-relative addressing.  */
 745             if (offset == (int32_t)offset) {
 746                 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
 747                 tcg_out8(s, (4 << 3) | 5);
 748                 tcg_out32(s, offset);
 749                 return;
 750             }
 751
 752             /* ??? The memory isn't directly addressable.  */
 753             g_assert_not_reached();
 754         } else {
 755             /* Absolute address.  */
 756             tcg_out8(s, (r << 3) | 5);
 757             tcg_out32(s, offset);
 758             return;
 759         }
 760     }
 761
 762     /* Find the length of the immediate addend.  Note that the encoding
 763        that would be used for (%ebp) indicates absolute addressing.  */
 764     if (rm < 0) {
 765         mod = 0, len = 4, rm = 5;
 766     } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
 767         mod = 0, len = 0;
 768     } else if (offset == (int8_t)offset) {
 769         mod = 0x40, len = 1;
 770     } else {
 771         mod = 0x80, len = 4;
 772     }
 773
 774     /* Use a single byte MODRM format if possible.  Note that the encoding
 775        that would be used for %esp is the escape to the two byte form.  */
 776     if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
 777         /* Single byte MODRM format.  */
 778         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 779     } else {
 780         /* Two byte MODRM+SIB format.  */
 781
 782         /* Note that the encoding that would place %esp into the index
 783            field indicates no index register.  In 64-bit mode, the REX.X
 784            bit counts, so %r12 can be used as the index.  */
 785         if (index < 0) {
 786             index = 4;
 787         } else {
 788             tcg_debug_assert(index != TCG_REG_ESP);
 789         }
 790
 791         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
 792         tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
 793     }
 794
 795     if (len == 1) {
 796         tcg_out8(s, offset);
 797     } else if (len == 4) {
 798         tcg_out32(s, offset);
 799     }
 800 }
 801
 802 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
 803                                      int index, int shift, intptr_t offset)
 804 {
 805     tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 806     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 807 }
 808
 809 static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
 810                                          int rm, int index, int shift,
 811                                          intptr_t offset)
 812 {
 813     tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 814     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 815 }
 816
 817 /* A simplification of the above with no index or shift.  */
 818 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
 819                                         int rm, intptr_t offset)
 820 {
 821     tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
 822 }
 823
 824 static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
 825                                             int v, int rm, intptr_t offset)
 826 {
 827     tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
 828 }
 829
 830 /* Output an opcode with an expected reference to the constant pool.  */
 831 static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
 832 {
 833     tcg_out_opc(s, opc, r, 0, 0);
 834     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 835     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 836     tcg_out32(s, 0);
 837 }
 838
 839 /* Output an opcode with an expected reference to the constant pool.  */
 840 static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
 841 {
 842     tcg_out_vex_opc(s, opc, r, 0, 0, 0);
 843     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 844     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 845     tcg_out32(s, 0);
 846 }
 847
 848 /* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
 849 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
 850 {
 851     /* Propagate an opcode prefix, such as P_REXW.  */
 852     int ext = subop & ~0x7;
 853     subop &= 0x7;
 854
 855     tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
 856 }
 857
 858 static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 859 {
 860     int rexw = 0;
 861
 862     if (arg == ret) {
 863         return true;
 864     }
 865     switch (type) {
 866     case TCG_TYPE_I64:
 867         rexw = P_REXW;
 868         /* fallthru */
 869     case TCG_TYPE_I32:
 870         if (ret < 16) {
 871             if (arg < 16) {
 872                 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
 873             } else {
 874                 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
 875             }
 876         } else {
 877             if (arg < 16) {
 878                 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
 879             } else {
 880                 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 881             }
 882         }
 883         break;
 884
 885     case TCG_TYPE_V64:
 886         tcg_debug_assert(ret >= 16 && arg >= 16);
 887         tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 888         break;
 889     case TCG_TYPE_V128:
 890         tcg_debug_assert(ret >= 16 && arg >= 16);
 891         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
 892         break;
 893     case TCG_TYPE_V256:
 894         tcg_debug_assert(ret >= 16 && arg >= 16);
 895         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
 896         break;
 897
 898     default:
 899         g_assert_not_reached();
 900     }
 901     return true;
 902 }
 903
 904 static const int avx2_dup_insn[4] = {
 905     OPC_VPBROADCASTB, OPC_VPBROADCASTW,
 906     OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
 907 };
 908
 909 static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
 910                             TCGReg r, TCGReg a)
 911 {
 912     if (have_avx2) {
 913         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 914         tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
 915     } else {
 916         switch (vece) {
 917         case MO_8:
 918             /* ??? With zero in a register, use PSHUFB.  */
 919             tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
 920             a = r;
 921             /* FALLTHRU */
 922         case MO_16:
 923             tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
 924             a = r;
 925             /* FALLTHRU */
 926         case MO_32:
 927             tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
 928             /* imm8 operand: all output lanes selected from input lane 0.  */
 929             tcg_out8(s, 0);
 930             break;
 931         case MO_64:
 932             tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
 933             break;
 934         default:
 935             g_assert_not_reached();
 936         }
 937     }
 938     return true;
 939 }
 940
 941 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
 942                              TCGReg r, TCGReg base, intptr_t offset)
 943 {
 944     if (have_avx2) {
 945         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 946         tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
 947                                  r, 0, base, offset);
 948     } else {
 949         switch (vece) {
 950         case MO_64:
 951             tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
 952             break;
 953         case MO_32:
 954             tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
 955             break;
 956         case MO_16:
 957             tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
 958             tcg_out8(s, 0); /* imm8 */
 959             tcg_out_dup_vec(s, type, vece, r, r);
 960             break;
 961         case MO_8:
 962             tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
 963             tcg_out8(s, 0); /* imm8 */
 964             tcg_out_dup_vec(s, type, vece, r, r);
 965             break;
 966         default:
 967             g_assert_not_reached();
 968         }
 969     }
 970     return true;
 971 }
 972
 973 static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
 974                              TCGReg ret, int64_t arg)
 975 {
 976     int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 977
 978     if (arg == 0) {
 979         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 980         return;
 981     }
 982     if (arg == -1) {
 983         tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
 984         return;
 985     }
 986
 987     if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
 988         if (have_avx2) {
 989             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
 990         } else {
 991             tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
 992         }
 993         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 994     } else {
 995         if (type == TCG_TYPE_V64) {
 996             tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
 997         } else if (have_avx2) {
 998             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
 999         } else {
1000             tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
1001         }
1002         if (TCG_TARGET_REG_BITS == 64) {
1003             new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1004         } else {
1005             new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
1006         }
1007     }
1008 }
1009
1010 static void tcg_out_movi_vec(TCGContext *s, TCGType type,
1011                              TCGReg ret, tcg_target_long arg)
1012 {
1013     if (arg == 0) {
1014         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
1015         return;
1016     }
1017     if (arg == -1) {
1018         tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
1019         return;
1020     }
1021
1022     int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
1023     tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1024     if (TCG_TARGET_REG_BITS == 64) {
1025         new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1026     } else {
1027         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1028     }
1029 }
1030
1031 static void tcg_out_movi_int(TCGContext *s, TCGType type,
1032                              TCGReg ret, tcg_target_long arg)
1033 {
1034     tcg_target_long diff;
1035
1036     if (arg == 0) {
1037         tgen_arithr(s, ARITH_XOR, ret, ret);
1038         return;
1039     }
1040     if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1041         tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1042         tcg_out32(s, arg);
1043         return;
1044     }
1045     if (arg == (int32_t)arg) {
1046         tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1047         tcg_out32(s, arg);
1048         return;
1049     }
1050
1051     /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1052     diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1053     if (diff == (int32_t)diff) {
1054         tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1055         tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1056         tcg_out32(s, diff);
1057         return;
1058     }
1059
1060     tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1061     tcg_out64(s, arg);
1062 }
1063
1064 static void tcg_out_movi(TCGContext *s, TCGType type,
1065                          TCGReg ret, tcg_target_long arg)
1066 {
1067     switch (type) {
1068     case TCG_TYPE_I32:
1069 #if TCG_TARGET_REG_BITS == 64
1070     case TCG_TYPE_I64:
1071 #endif
1072         if (ret < 16) {
1073             tcg_out_movi_int(s, type, ret, arg);
1074         } else {
1075             tcg_out_movi_vec(s, type, ret, arg);
1076         }
1077         break;
1078     default:
1079         g_assert_not_reached();
1080     }
1081 }
1082
1083 static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1084 {
1085     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1086     tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
1087     return true;
1088 }
1089
1090 static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1091                              tcg_target_long imm)
1092 {
1093     /* This function is only used for passing structs by reference. */
1094     tcg_debug_assert(imm == (int32_t)imm);
1095     tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
1096 }
1097
1098 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1099 {
1100     if (val == (int8_t)val) {
1101         tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1102         tcg_out8(s, val);
1103     } else if (val == (int32_t)val) {
1104         tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1105         tcg_out32(s, val);
1106     } else {
1107         g_assert_not_reached();
1108     }
1109 }
1110
1111 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1112 {
1113     /* Given the strength of x86 memory ordering, we only need care for
1114        store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1115        faster than "mfence", so don't bother with the sse insn.  */
1116     if (a0 & TCG_MO_ST_LD) {
1117         tcg_out8(s, 0xf0);
1118         tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1119         tcg_out8(s, 0);
1120     }
1121 }
1122
1123 static inline void tcg_out_push(TCGContext *s, int reg)
1124 {
1125     tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1126 }
1127
1128 static inline void tcg_out_pop(TCGContext *s, int reg)
1129 {
1130     tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1131 }
1132
1133 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1134                        TCGReg arg1, intptr_t arg2)
1135 {
1136     switch (type) {
1137     case TCG_TYPE_I32:
1138         if (ret < 16) {
1139             tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1140         } else {
1141             tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1142         }
1143         break;
1144     case TCG_TYPE_I64:
1145         if (ret < 16) {
1146             tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1147             break;
1148         }
1149         /* FALLTHRU */
1150     case TCG_TYPE_V64:
1151         /* There is no instruction that can validate 8-byte alignment.  */
1152         tcg_debug_assert(ret >= 16);
1153         tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1154         break;
1155     case TCG_TYPE_V128:
1156         /*
1157          * The gvec infrastructure is asserts that v128 vector loads
1158          * and stores use a 16-byte aligned offset.  Validate that the
1159          * final pointer is aligned by using an insn that will SIGSEGV.
1160          */
1161         tcg_debug_assert(ret >= 16);
1162         tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1163         break;
1164     case TCG_TYPE_V256:
1165         /*
1166          * The gvec infrastructure only requires 16-byte alignment,
1167          * so here we must use an unaligned load.
1168          */
1169         tcg_debug_assert(ret >= 16);
1170         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1171                                  ret, 0, arg1, arg2);
1172         break;
1173     default:
1174         g_assert_not_reached();
1175     }
1176 }
1177
1178 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1179                        TCGReg arg1, intptr_t arg2)
1180 {
1181     switch (type) {
1182     case TCG_TYPE_I32:
1183         if (arg < 16) {
1184             tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1185         } else {
1186             tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1187         }
1188         break;
1189     case TCG_TYPE_I64:
1190         if (arg < 16) {
1191             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1192             break;
1193         }
1194         /* FALLTHRU */
1195     case TCG_TYPE_V64:
1196         /* There is no instruction that can validate 8-byte alignment.  */
1197         tcg_debug_assert(arg >= 16);
1198         tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1199         break;
1200     case TCG_TYPE_V128:
1201         /*
1202          * The gvec infrastructure is asserts that v128 vector loads
1203          * and stores use a 16-byte aligned offset.  Validate that the
1204          * final pointer is aligned by using an insn that will SIGSEGV.
1205          *
1206          * This specific instance is also used by TCG_CALL_RET_BY_VEC,
1207          * for _WIN64, which must have SSE2 but may not have AVX.
1208          */
1209         tcg_debug_assert(arg >= 16);
1210         if (have_avx1) {
1211             tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1212         } else {
1213             tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
1214         }
1215         break;
1216     case TCG_TYPE_V256:
1217         /*
1218          * The gvec infrastructure only requires 16-byte alignment,
1219          * so here we must use an unaligned store.
1220          */
1221         tcg_debug_assert(arg >= 16);
1222         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1223                                  arg, 0, arg1, arg2);
1224         break;
1225     default:
1226         g_assert_not_reached();
1227     }
1228 }
1229
1230 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1231                         TCGReg base, intptr_t ofs)
1232 {
1233     int rexw = 0;
1234     if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1235         if (val != (int32_t)val) {
1236             return false;
1237         }
1238         rexw = P_REXW;
1239     } else if (type != TCG_TYPE_I32) {
1240         return false;
1241     }
1242     tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1243     tcg_out32(s, val);
1244     return true;
1245 }
1246
1247 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1248 {
1249     /* Propagate an opcode prefix, such as P_DATA16.  */
1250     int ext = subopc & ~0x7;
1251     subopc &= 0x7;
1252
1253     if (count == 1) {
1254         tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1255     } else {
1256         tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1257         tcg_out8(s, count);
1258     }
1259 }
1260
1261 static inline void tcg_out_bswap32(TCGContext *s, int reg)
1262 {
1263     tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1264 }
1265
1266 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1267 {
1268     tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1269 }
1270
1271 static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
1272 {
1273     /* movzbl */
1274     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1275     tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1276 }
1277
1278 static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1279 {
1280     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1281     /* movsbl */
1282     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1283     tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1284 }
1285
1286 static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
1287 {
1288     /* movzwl */
1289     tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1290 }
1291
1292 static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1293 {
1294     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1295     /* movsw[lq] */
1296     tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1297 }
1298
1299 static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
1300 {
1301     /* 32-bit mov zero extends.  */
1302     tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1303 }
1304
1305 static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
1306 {
1307     tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
1308     tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1309 }
1310
1311 static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1312 {
1313     tcg_out_ext32s(s, dest, src);
1314 }
1315
1316 static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1317 {
1318     if (dest != src) {
1319         tcg_out_ext32u(s, dest, src);
1320     }
1321 }
1322
1323 static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
1324 {
1325     tcg_out_ext32u(s, dest, src);
1326 }
1327
1328 static inline void tcg_out_bswap64(TCGContext *s, int reg)
1329 {
1330     tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1331 }
1332
1333 static void tgen_arithi(TCGContext *s, int c, int r0,
1334                         tcg_target_long val, int cf)
1335 {
1336     int rexw = 0;
1337
1338     if (TCG_TARGET_REG_BITS == 64) {
1339         rexw = c & -8;
1340         c &= 7;
1341     }
1342
1343     switch (c) {
1344     case ARITH_ADD:
1345     case ARITH_SUB:
1346         if (!cf) {
1347             /*
1348              * ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1349              * partial flags update stalls on Pentium4 and are not recommended
1350              * by current Intel optimization manuals.
1351              */
1352             if (val == 1 || val == -1) {
1353                 int is_inc = (c == ARITH_ADD) ^ (val < 0);
1354                 if (TCG_TARGET_REG_BITS == 64) {
1355                     /*
1356                      * The single-byte increment encodings are re-tasked
1357                      * as the REX prefixes.  Use the MODRM encoding.
1358                      */
1359                     tcg_out_modrm(s, OPC_GRP5 + rexw,
1360                                   (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1361                 } else {
1362                     tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1363                 }
1364                 return;
1365             }
1366             if (val == 128) {
1367                 /*
1368                  * Facilitate using an 8-bit immediate.  Carry is inverted
1369                  * by this transformation, so do it only if cf == 0.
1370                  */
1371                 c ^= ARITH_ADD ^ ARITH_SUB;
1372                 val = -128;
1373             }
1374         }
1375         break;
1376
1377     case ARITH_AND:
1378         if (TCG_TARGET_REG_BITS == 64) {
1379             if (val == 0xffffffffu) {
1380                 tcg_out_ext32u(s, r0, r0);
1381                 return;
1382             }
1383             if (val == (uint32_t)val) {
1384                 /* AND with no high bits set can use a 32-bit operation.  */
1385                 rexw = 0;
1386             }
1387         }
1388         if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1389             tcg_out_ext8u(s, r0, r0);
1390             return;
1391         }
1392         if (val == 0xffffu) {
1393             tcg_out_ext16u(s, r0, r0);
1394             return;
1395         }
1396         break;
1397
1398     case ARITH_OR:
1399     case ARITH_XOR:
1400         if (val >= 0x80 && val <= 0xff
1401             && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1402             tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0);
1403             tcg_out8(s, val);
1404             return;
1405         }
1406         break;
1407     }
1408
1409     if (val == (int8_t)val) {
1410         tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1411         tcg_out8(s, val);
1412         return;
1413     }
1414     if (rexw == 0 || val == (int32_t)val) {
1415         tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1416         tcg_out32(s, val);
1417         return;
1418     }
1419
1420     g_assert_not_reached();
1421 }
1422
1423 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1424 {
1425     if (val != 0) {
1426         tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1427     }
1428 }
1429
1430 /* Set SMALL to force a short forward branch.  */
1431 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
1432 {
1433     int32_t val, val1;
1434
1435     if (l->has_value) {
1436         val = tcg_pcrel_diff(s, l->u.value_ptr);
1437         val1 = val - 2;
1438         if ((int8_t)val1 == val1) {
1439             if (opc == -1) {
1440                 tcg_out8(s, OPC_JMP_short);
1441             } else {
1442                 tcg_out8(s, OPC_JCC_short + opc);
1443             }
1444             tcg_out8(s, val1);
1445         } else {
1446             tcg_debug_assert(!small);
1447             if (opc == -1) {
1448                 tcg_out8(s, OPC_JMP_long);
1449                 tcg_out32(s, val - 5);
1450             } else {
1451                 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1452                 tcg_out32(s, val - 6);
1453             }
1454         }
1455     } else if (small) {
1456         if (opc == -1) {
1457             tcg_out8(s, OPC_JMP_short);
1458         } else {
1459             tcg_out8(s, OPC_JCC_short + opc);
1460         }
1461         tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1462         s->code_ptr += 1;
1463     } else {
1464         if (opc == -1) {
1465             tcg_out8(s, OPC_JMP_long);
1466         } else {
1467             tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1468         }
1469         tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1470         s->code_ptr += 4;
1471     }
1472 }
1473
1474 static int tcg_out_cmp(TCGContext *s, TCGCond cond, TCGArg arg1,
1475                        TCGArg arg2, int const_arg2, int rexw)
1476 {
1477     int jz, js;
1478
1479     if (!is_tst_cond(cond)) {
1480         if (!const_arg2) {
1481             tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1482         } else if (arg2 == 0) {
1483             tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1484         } else {
1485             tcg_debug_assert(!rexw || arg2 == (int32_t)arg2);
1486             tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1487         }
1488         return tcg_cond_to_jcc[cond];
1489     }
1490
1491     jz = tcg_cond_to_jcc[cond];
1492     js = (cond == TCG_COND_TSTNE ? JCC_JS : JCC_JNS);
1493
1494     if (!const_arg2) {
1495         tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg2);
1496         return jz;
1497     }
1498
1499     if (arg2 <= 0xff && (TCG_TARGET_REG_BITS == 64 || arg1 < 4)) {
1500         if (arg2 == 0x80) {
1501             tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
1502             return js;
1503         }
1504         if (arg2 == 0xff) {
1505             tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
1506             return jz;
1507         }
1508         tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, arg1);
1509         tcg_out8(s, arg2);
1510         return jz;
1511     }
1512
1513     if ((arg2 & ~0xff00) == 0 && arg1 < 4) {
1514         if (arg2 == 0x8000) {
1515             tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
1516             return js;
1517         }
1518         if (arg2 == 0xff00) {
1519             tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
1520             return jz;
1521         }
1522         tcg_out_modrm(s, OPC_GRP3_Eb, EXT3_TESTi, arg1 + 4);
1523         tcg_out8(s, arg2 >> 8);
1524         return jz;
1525     }
1526
1527     if (arg2 == 0xffff) {
1528         tcg_out_modrm(s, OPC_TESTL | P_DATA16, arg1, arg1);
1529         return jz;
1530     }
1531     if (arg2 == 0xffffffffu) {
1532         tcg_out_modrm(s, OPC_TESTL, arg1, arg1);
1533         return jz;
1534     }
1535
1536     if (is_power_of_2(rexw ? arg2 : (uint32_t)arg2)) {
1537         int jc = (cond == TCG_COND_TSTNE ? JCC_JB : JCC_JAE);
1538         int sh = ctz64(arg2);
1539
1540         rexw = (sh & 32 ? P_REXW : 0);
1541         if ((sh & 31) == 31) {
1542             tcg_out_modrm(s, OPC_TESTL | rexw, arg1, arg1);
1543             return js;
1544         } else {
1545             tcg_out_modrm(s, OPC_GRPBT | rexw, OPC_GRPBT_BT, arg1);
1546             tcg_out8(s, sh);
1547             return jc;
1548         }
1549     }
1550
1551     if (rexw) {
1552         if (arg2 == (uint32_t)arg2) {
1553             rexw = 0;
1554         } else {
1555             tcg_debug_assert(arg2 == (int32_t)arg2);
1556         }
1557     }
1558     tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_TESTi, arg1);
1559     tcg_out32(s, arg2);
1560     return jz;
1561 }
1562
1563 static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond,
1564                            TCGArg arg1, TCGArg arg2, int const_arg2,
1565                            TCGLabel *label, bool small)
1566 {
1567     int jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw);
1568     tcg_out_jxx(s, jcc, label, small);
1569 }
1570
1571 #if TCG_TARGET_REG_BITS == 32
1572 static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1573                             const int *const_args, bool small)
1574 {
1575     TCGLabel *label_next = gen_new_label();
1576     TCGLabel *label_this = arg_label(args[5]);
1577     TCGCond cond = args[4];
1578
1579     switch (cond) {
1580     case TCG_COND_EQ:
1581     case TCG_COND_TSTEQ:
1582         tcg_out_brcond(s, 0, tcg_invert_cond(cond),
1583                        args[0], args[2], const_args[2], label_next, 1);
1584         tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3],
1585                        label_this, small);
1586         break;
1587     case TCG_COND_NE:
1588     case TCG_COND_TSTNE:
1589         tcg_out_brcond(s, 0, cond, args[0], args[2], const_args[2],
1590                        label_this, small);
1591         tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3],
1592                        label_this, small);
1593         break;
1594     case TCG_COND_LT:
1595         tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
1596                        label_this, small);
1597         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1598         tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
1599                        label_this, small);
1600         break;
1601     case TCG_COND_LE:
1602         tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
1603                        label_this, small);
1604         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1605         tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
1606                        label_this, small);
1607         break;
1608     case TCG_COND_GT:
1609         tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
1610                        label_this, small);
1611         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1612         tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
1613                        label_this, small);
1614         break;
1615     case TCG_COND_GE:
1616         tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
1617                        label_this, small);
1618         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1619         tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
1620                        label_this, small);
1621         break;
1622     case TCG_COND_LTU:
1623         tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
1624                        label_this, small);
1625         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1626         tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
1627                        label_this, small);
1628         break;
1629     case TCG_COND_LEU:
1630         tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
1631                        label_this, small);
1632         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1633         tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
1634                        label_this, small);
1635         break;
1636     case TCG_COND_GTU:
1637         tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
1638                        label_this, small);
1639         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1640         tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
1641                        label_this, small);
1642         break;
1643     case TCG_COND_GEU:
1644         tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
1645                        label_this, small);
1646         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1647         tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
1648                        label_this, small);
1649         break;
1650     default:
1651         g_assert_not_reached();
1652     }
1653     tcg_out_label(s, label_next);
1654 }
1655 #endif
1656
1657 static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
1658                             TCGArg dest, TCGArg arg1, TCGArg arg2,
1659                             int const_arg2, bool neg)
1660 {
1661     bool inv = false;
1662     bool cleared;
1663     int jcc;
1664
1665     switch (cond) {
1666     case TCG_COND_NE:
1667         inv = true;
1668         /* fall through */
1669     case TCG_COND_EQ:
1670         /* If arg2 is 0, convert to LTU/GEU vs 1. */
1671         if (const_arg2 && arg2 == 0) {
1672             arg2 = 1;
1673             goto do_ltu;
1674         }
1675         break;
1676
1677     case TCG_COND_LEU:
1678         inv = true;
1679         /* fall through */
1680     case TCG_COND_GTU:
1681         /* If arg2 is a register, swap for LTU/GEU. */
1682         if (!const_arg2) {
1683             TCGReg t = arg1;
1684             arg1 = arg2;
1685             arg2 = t;
1686             goto do_ltu;
1687         }
1688         break;
1689
1690     case TCG_COND_GEU:
1691         inv = true;
1692         /* fall through */
1693     case TCG_COND_LTU:
1694     do_ltu:
1695         /*
1696          * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
1697          * We can then use NEG or INC to produce the desired result.
1698          * This is always smaller than the SETCC expansion.
1699          */
1700         tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, rexw);
1701
1702         /* X - X - C = -C = (C ? -1 : 0) */
1703         tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest);
1704         if (inv && neg) {
1705             /* ~(C ? -1 : 0) = (C ? 0 : -1) */
1706             tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1707         } else if (inv) {
1708             /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
1709             tgen_arithi(s, ARITH_ADD, dest, 1, 0);
1710         } else if (!neg) {
1711             /* -(C ? -1 : 0) = (C ? 1 : 0) */
1712             tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest);
1713         }
1714         return;
1715
1716     case TCG_COND_GE:
1717         inv = true;
1718         /* fall through */
1719     case TCG_COND_LT:
1720         /* If arg2 is 0, extract the sign bit. */
1721         if (const_arg2 && arg2 == 0) {
1722             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1);
1723             if (inv) {
1724                 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1725             }
1726             tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw,
1727                            dest, rexw ? 63 : 31);
1728             return;
1729         }
1730         break;
1731
1732     default:
1733         break;
1734     }
1735
1736     /*
1737      * If dest does not overlap the inputs, clearing it first is preferred.
1738      * The XOR breaks any false dependency for the low-byte write to dest,
1739      * and is also one byte smaller than MOVZBL.
1740      */
1741     cleared = false;
1742     if (dest != arg1 && (const_arg2 || dest != arg2)) {
1743         tgen_arithr(s, ARITH_XOR, dest, dest);
1744         cleared = true;
1745     }
1746
1747     jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw);
1748     tcg_out_modrm(s, OPC_SETCC | jcc, 0, dest);
1749
1750     if (!cleared) {
1751         tcg_out_ext8u(s, dest, dest);
1752     }
1753     if (neg) {
1754         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest);
1755     }
1756 }
1757
1758 #if TCG_TARGET_REG_BITS == 32
1759 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1760                              const int *const_args)
1761 {
1762     TCGArg new_args[6];
1763     TCGLabel *label_true, *label_over;
1764
1765     memcpy(new_args, args+1, 5*sizeof(TCGArg));
1766
1767     if (args[0] == args[1] || args[0] == args[2]
1768         || (!const_args[3] && args[0] == args[3])
1769         || (!const_args[4] && args[0] == args[4])) {
1770         /* When the destination overlaps with one of the argument
1771            registers, don't do anything tricky.  */
1772         label_true = gen_new_label();
1773         label_over = gen_new_label();
1774
1775         new_args[5] = label_arg(label_true);
1776         tcg_out_brcond2(s, new_args, const_args+1, 1);
1777
1778         tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1779         tcg_out_jxx(s, JCC_JMP, label_over, 1);
1780         tcg_out_label(s, label_true);
1781
1782         tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1783         tcg_out_label(s, label_over);
1784     } else {
1785         /* When the destination does not overlap one of the arguments,
1786            clear the destination first, jump if cond false, and emit an
1787            increment in the true case.  This results in smaller code.  */
1788
1789         tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1790
1791         label_over = gen_new_label();
1792         new_args[4] = tcg_invert_cond(new_args[4]);
1793         new_args[5] = label_arg(label_over);
1794         tcg_out_brcond2(s, new_args, const_args+1, 1);
1795
1796         tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1797         tcg_out_label(s, label_over);
1798     }
1799 }
1800 #endif
1801
1802 static void tcg_out_cmov(TCGContext *s, int jcc, int rexw,
1803                          TCGReg dest, TCGReg v1)
1804 {
1805     if (have_cmov) {
1806         tcg_out_modrm(s, OPC_CMOVCC | jcc | rexw, dest, v1);
1807     } else {
1808         TCGLabel *over = gen_new_label();
1809         tcg_out_jxx(s, jcc ^ 1, over, 1);
1810         tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1811         tcg_out_label(s, over);
1812     }
1813 }
1814
1815 static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond,
1816                             TCGReg dest, TCGReg c1, TCGArg c2, int const_c2,
1817                             TCGReg v1)
1818 {
1819     int jcc = tcg_out_cmp(s, cond, c1, c2, const_c2, rexw);
1820     tcg_out_cmov(s, jcc, rexw, dest, v1);
1821 }
1822
1823 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1824                         TCGArg arg2, bool const_a2)
1825 {
1826     if (have_bmi1) {
1827         tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1828         if (const_a2) {
1829             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1830         } else {
1831             tcg_debug_assert(dest != arg2);
1832             tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
1833         }
1834     } else {
1835         tcg_debug_assert(dest != arg2);
1836         tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1837         tcg_out_cmov(s, JCC_JE, rexw, dest, arg2);
1838     }
1839 }
1840
1841 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1842                         TCGArg arg2, bool const_a2)
1843 {
1844     if (have_lzcnt) {
1845         tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1846         if (const_a2) {
1847             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1848         } else {
1849             tcg_debug_assert(dest != arg2);
1850             tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
1851         }
1852     } else {
1853         tcg_debug_assert(!const_a2);
1854         tcg_debug_assert(dest != arg1);
1855         tcg_debug_assert(dest != arg2);
1856
1857         /* Recall that the output of BSR is the index not the count.  */
1858         tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1859         tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1860
1861         /* Since we have destroyed the flags from BSR, we have to re-test.  */
1862         int jcc = tcg_out_cmp(s, TCG_COND_EQ, arg1, 0, 1, rexw);
1863         tcg_out_cmov(s, jcc, rexw, dest, arg2);
1864     }
1865 }
1866
1867 static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1868 {
1869     intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1870
1871     if (disp == (int32_t)disp) {
1872         tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1873         tcg_out32(s, disp);
1874     } else {
1875         /* rip-relative addressing into the constant pool.
1876            This is 6 + 8 = 14 bytes, as compared to using an
1877            immediate load 10 + 6 = 16 bytes, plus we may
1878            be able to re-use the pool constant for more calls.  */
1879         tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1880         tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1881         new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1882         tcg_out32(s, 0);
1883     }
1884 }
1885
1886 static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1887                          const TCGHelperInfo *info)
1888 {
1889     tcg_out_branch(s, 1, dest);
1890
1891 #ifndef _WIN32
1892     if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1893         /*
1894          * The sysv i386 abi for struct return places a reference as the
1895          * first argument of the stack, and pops that argument with the
1896          * return statement.  Since we want to retain the aligned stack
1897          * pointer for the callee, we do not want to actually push that
1898          * argument before the call but rely on the normal store to the
1899          * stack slot.  But we do need to compensate for the pop in order
1900          * to reset our correct stack pointer value.
1901          * Pushing a garbage value back onto the stack is quickest.
1902          */
1903         tcg_out_push(s, TCG_REG_EAX);
1904     }
1905 #endif
1906 }
1907
1908 static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1909 {
1910     tcg_out_branch(s, 0, dest);
1911 }
1912
1913 static void tcg_out_nopn(TCGContext *s, int n)
1914 {
1915     int i;
1916     /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1917      * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1918      * duplicate prefix, and all of the interesting recent cores can
1919      * decode and discard the duplicates in a single cycle.
1920      */
1921     tcg_debug_assert(n >= 1);
1922     for (i = 1; i < n; ++i) {
1923         tcg_out8(s, 0x66);
1924     }
1925     tcg_out8(s, 0x90);
1926 }
1927
1928 typedef struct {
1929     TCGReg base;
1930     int index;
1931     int ofs;
1932     int seg;
1933     TCGAtomAlign aa;
1934 } HostAddress;
1935
1936 bool tcg_target_has_memory_bswap(MemOp memop)
1937 {
1938     TCGAtomAlign aa;
1939
1940     if (!have_movbe) {
1941         return false;
1942     }
1943     if ((memop & MO_SIZE) < MO_128) {
1944         return true;
1945     }
1946
1947     /*
1948      * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
1949      * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
1950      */
1951     aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
1952     return aa.atom < MO_128;
1953 }
1954
1955 /*
1956  * Because i686 has no register parameters and because x86_64 has xchg
1957  * to handle addr/data register overlap, we have placed all input arguments
1958  * before we need might need a scratch reg.
1959  *
1960  * Even then, a scratch is only needed for l->raddr.  Rather than expose
1961  * a general-purpose scratch when we don't actually know it's available,
1962  * use the ra_gen hook to load into RAX if needed.
1963  */
1964 #if TCG_TARGET_REG_BITS == 64
1965 static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
1966 {
1967     if (arg < 0) {
1968         arg = TCG_REG_RAX;
1969     }
1970     tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
1971     return arg;
1972 }
1973 static const TCGLdstHelperParam ldst_helper_param = {
1974     .ra_gen = ldst_ra_gen
1975 };
1976 #else
1977 static const TCGLdstHelperParam ldst_helper_param = { };
1978 #endif
1979
1980 static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
1981                                 TCGReg l, TCGReg h, TCGReg v)
1982 {
1983     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1984
1985     /* vpmov{d,q} %v, %l */
1986     tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
1987     /* vpextr{d,q} $1, %v, %h */
1988     tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
1989     tcg_out8(s, 1);
1990 }
1991
1992 static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
1993                                 TCGReg v, TCGReg l, TCGReg h)
1994 {
1995     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1996
1997     /* vmov{d,q} %l, %v */
1998     tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
1999     /* vpinsr{d,q} $1, %h, %v, %v */
2000     tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
2001     tcg_out8(s, 1);
2002 }
2003
2004 /*
2005  * Generate code for the slow path for a load at the end of block
2006  */
2007 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
2008 {
2009     MemOp opc = get_memop(l->oi);
2010     tcg_insn_unit **label_ptr = &l->label_ptr[0];
2011
2012     /* resolve label address */
2013     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
2014     if (label_ptr[1]) {
2015         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
2016     }
2017
2018     tcg_out_ld_helper_args(s, l, &ldst_helper_param);
2019     tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
2020     tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
2021
2022     tcg_out_jmp(s, l->raddr);
2023     return true;
2024 }
2025
2026 /*
2027  * Generate code for the slow path for a store at the end of block
2028  */
2029 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
2030 {
2031     MemOp opc = get_memop(l->oi);
2032     tcg_insn_unit **label_ptr = &l->label_ptr[0];
2033
2034     /* resolve label address */
2035     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
2036     if (label_ptr[1]) {
2037         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
2038     }
2039
2040     tcg_out_st_helper_args(s, l, &ldst_helper_param);
2041     tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);
2042
2043     tcg_out_jmp(s, l->raddr);
2044     return true;
2045 }
2046
2047 #ifdef CONFIG_USER_ONLY
2048 static HostAddress x86_guest_base = {
2049     .index = -1
2050 };
2051
2052 #if defined(__x86_64__) && defined(__linux__)
2053 # include <asm/prctl.h>
2054 # include <sys/prctl.h>
2055 int arch_prctl(int code, unsigned long addr);
2056 static inline int setup_guest_base_seg(void)
2057 {
2058     if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
2059         return P_GS;
2060     }
2061     return 0;
2062 }
2063 #define setup_guest_base_seg  setup_guest_base_seg
2064 #elif defined(__x86_64__) && \
2065       (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
2066 # include <machine/sysarch.h>
2067 static inline int setup_guest_base_seg(void)
2068 {
2069     if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
2070         return P_GS;
2071     }
2072     return 0;
2073 }
2074 #define setup_guest_base_seg  setup_guest_base_seg
2075 #endif
2076 #else
2077 # define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; }))
2078 #endif /* CONFIG_USER_ONLY */
2079 #ifndef setup_guest_base_seg
2080 # define setup_guest_base_seg()  0
2081 #endif
2082
2083 #define MIN_TLB_MASK_TABLE_OFS  INT_MIN
2084
2085 /*
2086  * For softmmu, perform the TLB load and compare.
2087  * For useronly, perform any required alignment tests.
2088  * In both cases, return a TCGLabelQemuLdst structure if the slow path
2089  * is required and fill in @h with the host address for the fast path.
2090  */
2091 static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
2092                                            TCGReg addrlo, TCGReg addrhi,
2093                                            MemOpIdx oi, bool is_ld)
2094 {
2095     TCGLabelQemuLdst *ldst = NULL;
2096     MemOp opc = get_memop(oi);
2097     MemOp s_bits = opc & MO_SIZE;
2098     unsigned a_mask;
2099
2100     if (tcg_use_softmmu) {
2101         h->index = TCG_REG_L0;
2102         h->ofs = 0;
2103         h->seg = 0;
2104     } else {
2105         *h = x86_guest_base;
2106     }
2107     h->base = addrlo;
2108     h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
2109     a_mask = (1 << h->aa.align) - 1;
2110
2111     if (tcg_use_softmmu) {
2112         int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
2113                             : offsetof(CPUTLBEntry, addr_write);
2114         TCGType ttype = TCG_TYPE_I32;
2115         TCGType tlbtype = TCG_TYPE_I32;
2116         int trexw = 0, hrexw = 0, tlbrexw = 0;
2117         unsigned mem_index = get_mmuidx(oi);
2118         unsigned s_mask = (1 << s_bits) - 1;
2119         int fast_ofs = tlb_mask_table_ofs(s, mem_index);
2120         int tlb_mask;
2121
2122         ldst = new_ldst_label(s);
2123         ldst->is_ld = is_ld;
2124         ldst->oi = oi;
2125         ldst->addrlo_reg = addrlo;
2126         ldst->addrhi_reg = addrhi;
2127
2128         if (TCG_TARGET_REG_BITS == 64) {
2129             ttype = s->addr_type;
2130             trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
2131             if (TCG_TYPE_PTR == TCG_TYPE_I64) {
2132                 hrexw = P_REXW;
2133                 if (s->page_bits + s->tlb_dyn_max_bits > 32) {
2134                     tlbtype = TCG_TYPE_I64;
2135                     tlbrexw = P_REXW;
2136                 }
2137             }
2138         }
2139
2140         tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
2141         tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
2142                        s->page_bits - CPU_TLB_ENTRY_BITS);
2143
2144         tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
2145                              fast_ofs + offsetof(CPUTLBDescFast, mask));
2146
2147         tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
2148                              fast_ofs + offsetof(CPUTLBDescFast, table));
2149
2150         /*
2151          * If the required alignment is at least as large as the access,
2152          * simply copy the address and mask.  For lesser alignments,
2153          * check that we don't cross pages for the complete access.
2154          */
2155         if (a_mask >= s_mask) {
2156             tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
2157         } else {
2158             tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
2159                                  addrlo, s_mask - a_mask);
2160         }
2161         tlb_mask = s->page_mask | a_mask;
2162         tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);
2163
2164         /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
2165         tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
2166                              TCG_REG_L1, TCG_REG_L0, cmp_ofs);
2167
2168         /* jne slow_path */
2169         tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2170         ldst->label_ptr[0] = s->code_ptr;
2171         s->code_ptr += 4;
2172
2173         if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) {
2174             /* cmp 4(TCG_REG_L0), addrhi */
2175             tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi,
2176                                  TCG_REG_L0, cmp_ofs + 4);
2177
2178             /* jne slow_path */
2179             tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2180             ldst->label_ptr[1] = s->code_ptr;
2181             s->code_ptr += 4;
2182         }
2183
2184         /* TLB Hit.  */
2185         tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
2186                    offsetof(CPUTLBEntry, addend));
2187     } else if (a_mask) {
2188         int jcc;
2189
2190         ldst = new_ldst_label(s);
2191         ldst->is_ld = is_ld;
2192         ldst->oi = oi;
2193         ldst->addrlo_reg = addrlo;
2194         ldst->addrhi_reg = addrhi;
2195
2196         /* jne slow_path */
2197         jcc = tcg_out_cmp(s, TCG_COND_TSTNE, addrlo, a_mask, true, false);
2198         tcg_out_opc(s, OPC_JCC_long + jcc, 0, 0, 0);
2199         ldst->label_ptr[0] = s->code_ptr;
2200         s->code_ptr += 4;
2201     }
2202
2203     return ldst;
2204 }
2205
2206 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2207                                    HostAddress h, TCGType type, MemOp memop)
2208 {
2209     bool use_movbe = false;
2210     int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
2211     int movop = OPC_MOVL_GvEv;
2212
2213     /* Do big-endian loads with movbe.  */
2214     if (memop & MO_BSWAP) {
2215         tcg_debug_assert(have_movbe);
2216         use_movbe = true;
2217         movop = OPC_MOVBE_GyMy;
2218     }
2219
2220     switch (memop & MO_SSIZE) {
2221     case MO_UB:
2222         tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
2223                                  h.base, h.index, 0, h.ofs);
2224         break;
2225     case MO_SB:
2226         tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
2227                                  h.base, h.index, 0, h.ofs);
2228         break;
2229     case MO_UW:
2230         if (use_movbe) {
2231             /* There is no extending movbe; only low 16-bits are modified.  */
2232             if (datalo != h.base && datalo != h.index) {
2233                 /* XOR breaks dependency chains.  */
2234                 tgen_arithr(s, ARITH_XOR, datalo, datalo);
2235                 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2236                                          datalo, h.base, h.index, 0, h.ofs);
2237             } else {
2238                 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2239                                          datalo, h.base, h.index, 0, h.ofs);
2240                 tcg_out_ext16u(s, datalo, datalo);
2241             }
2242         } else {
2243             tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
2244                                      h.base, h.index, 0, h.ofs);
2245         }
2246         break;
2247     case MO_SW:
2248         if (use_movbe) {
2249             tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2250                                      datalo, h.base, h.index, 0, h.ofs);
2251             tcg_out_ext16s(s, type, datalo, datalo);
2252         } else {
2253             tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
2254                                      datalo, h.base, h.index, 0, h.ofs);
2255         }
2256         break;
2257     case MO_UL:
2258         tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2259                                  h.base, h.index, 0, h.ofs);
2260         break;
2261 #if TCG_TARGET_REG_BITS == 64
2262     case MO_SL:
2263         if (use_movbe) {
2264             tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
2265                                      h.base, h.index, 0, h.ofs);
2266             tcg_out_ext32s(s, datalo, datalo);
2267         } else {
2268             tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
2269                                      h.base, h.index, 0, h.ofs);
2270         }
2271         break;
2272 #endif
2273     case MO_UQ:
2274         if (TCG_TARGET_REG_BITS == 64) {
2275             tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2276                                      h.base, h.index, 0, h.ofs);
2277             break;
2278         }
2279         if (use_movbe) {
2280             TCGReg t = datalo;
2281             datalo = datahi;
2282             datahi = t;
2283         }
2284         if (h.base == datalo || h.index == datalo) {
2285             tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
2286                                      h.base, h.index, 0, h.ofs);
2287             tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
2288             tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
2289         } else {
2290             tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2291                                      h.base, h.index, 0, h.ofs);
2292             tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2293                                      h.base, h.index, 0, h.ofs + 4);
2294         }
2295         break;
2296
2297     case MO_128:
2298         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2299
2300         /*
2301          * Without 16-byte atomicity, use integer regs.
2302          * That is where we want the data, and it allows bswaps.
2303          */
2304         if (h.aa.atom < MO_128) {
2305             if (use_movbe) {
2306                 TCGReg t = datalo;
2307                 datalo = datahi;
2308                 datahi = t;
2309             }
2310             if (h.base == datalo || h.index == datalo) {
2311                 tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
2312                                          h.base, h.index, 0, h.ofs);
2313                 tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2314                                      datalo, datahi, 0);
2315                 tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2316                                      datahi, datahi, 8);
2317             } else {
2318                 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2319                                          h.base, h.index, 0, h.ofs);
2320                 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2321                                          h.base, h.index, 0, h.ofs + 8);
2322             }
2323             break;
2324         }
2325
2326         /*
2327          * With 16-byte atomicity, a vector load is required.
2328          * If we already have 16-byte alignment, then VMOVDQA always works.
2329          * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2330          * Else use we require a runtime test for alignment for VMOVDQA;
2331          * use VMOVDQU on the unaligned nonatomic path for simplicity.
2332          */
2333         if (h.aa.align >= MO_128) {
2334             tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2335                                          TCG_TMP_VEC, 0,
2336                                          h.base, h.index, 0, h.ofs);
2337         } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2338             tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2339                                          TCG_TMP_VEC, 0,
2340                                          h.base, h.index, 0, h.ofs);
2341         } else {
2342             TCGLabel *l1 = gen_new_label();
2343             TCGLabel *l2 = gen_new_label();
2344             int jcc;
2345
2346             jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false);
2347             tcg_out_jxx(s, jcc, l1, true);
2348
2349             tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2350                                          TCG_TMP_VEC, 0,
2351                                          h.base, h.index, 0, h.ofs);
2352             tcg_out_jxx(s, JCC_JMP, l2, true);
2353
2354             tcg_out_label(s, l1);
2355             tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2356                                          TCG_TMP_VEC, 0,
2357                                          h.base, h.index, 0, h.ofs);
2358             tcg_out_label(s, l2);
2359         }
2360         tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
2361         break;
2362
2363     default:
2364         g_assert_not_reached();
2365     }
2366 }
2367
2368 static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2369                             TCGReg addrlo, TCGReg addrhi,
2370                             MemOpIdx oi, TCGType data_type)
2371 {
2372     TCGLabelQemuLdst *ldst;
2373     HostAddress h;
2374
2375     ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
2376     tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));
2377
2378     if (ldst) {
2379         ldst->type = data_type;
2380         ldst->datalo_reg = datalo;
2381         ldst->datahi_reg = datahi;
2382         ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2383     }
2384 }
2385
2386 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2387                                    HostAddress h, MemOp memop)
2388 {
2389     bool use_movbe = false;
2390     int movop = OPC_MOVL_EvGv;
2391
2392     /*
2393      * Do big-endian stores with movbe or system-mode.
2394      * User-only without movbe will have its swapping done generically.
2395      */
2396     if (memop & MO_BSWAP) {
2397         tcg_debug_assert(have_movbe);
2398         use_movbe = true;
2399         movop = OPC_MOVBE_MyGy;
2400     }
2401
2402     switch (memop & MO_SIZE) {
2403     case MO_8:
2404         /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2405         tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2406         tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
2407                                  datalo, h.base, h.index, 0, h.ofs);
2408         break;
2409     case MO_16:
2410         tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
2411                                  h.base, h.index, 0, h.ofs);
2412         break;
2413     case MO_32:
2414         tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2415                                  h.base, h.index, 0, h.ofs);
2416         break;
2417     case MO_64:
2418         if (TCG_TARGET_REG_BITS == 64) {
2419             tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2420                                      h.base, h.index, 0, h.ofs);
2421         } else {
2422             if (use_movbe) {
2423                 TCGReg t = datalo;
2424                 datalo = datahi;
2425                 datahi = t;
2426             }
2427             tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2428                                      h.base, h.index, 0, h.ofs);
2429             tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2430                                      h.base, h.index, 0, h.ofs + 4);
2431         }
2432         break;
2433
2434     case MO_128:
2435         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2436
2437         /*
2438          * Without 16-byte atomicity, use integer regs.
2439          * That is where we have the data, and it allows bswaps.
2440          */
2441         if (h.aa.atom < MO_128) {
2442             if (use_movbe) {
2443                 TCGReg t = datalo;
2444                 datalo = datahi;
2445                 datahi = t;
2446             }
2447             tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2448                                      h.base, h.index, 0, h.ofs);
2449             tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2450                                      h.base, h.index, 0, h.ofs + 8);
2451             break;
2452         }
2453
2454         /*
2455          * With 16-byte atomicity, a vector store is required.
2456          * If we already have 16-byte alignment, then VMOVDQA always works.
2457          * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2458          * Else use we require a runtime test for alignment for VMOVDQA;
2459          * use VMOVDQU on the unaligned nonatomic path for simplicity.
2460          */
2461         tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
2462         if (h.aa.align >= MO_128) {
2463             tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2464                                          TCG_TMP_VEC, 0,
2465                                          h.base, h.index, 0, h.ofs);
2466         } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2467             tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2468                                          TCG_TMP_VEC, 0,
2469                                          h.base, h.index, 0, h.ofs);
2470         } else {
2471             TCGLabel *l1 = gen_new_label();
2472             TCGLabel *l2 = gen_new_label();
2473             int jcc;
2474
2475             jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false);
2476             tcg_out_jxx(s, jcc, l1, true);
2477
2478             tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2479                                          TCG_TMP_VEC, 0,
2480                                          h.base, h.index, 0, h.ofs);
2481             tcg_out_jxx(s, JCC_JMP, l2, true);
2482
2483             tcg_out_label(s, l1);
2484             tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2485                                          TCG_TMP_VEC, 0,
2486                                          h.base, h.index, 0, h.ofs);
2487             tcg_out_label(s, l2);
2488         }
2489         break;
2490
2491     default:
2492         g_assert_not_reached();
2493     }
2494 }
2495
2496 static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2497                             TCGReg addrlo, TCGReg addrhi,
2498                             MemOpIdx oi, TCGType data_type)
2499 {
2500     TCGLabelQemuLdst *ldst;
2501     HostAddress h;
2502
2503     ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
2504     tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));
2505
2506     if (ldst) {
2507         ldst->type = data_type;
2508         ldst->datalo_reg = datalo;
2509         ldst->datahi_reg = datahi;
2510         ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2511     }
2512 }
2513
2514 static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2515 {
2516     /* Reuse the zeroing that exists for goto_ptr.  */
2517     if (a0 == 0) {
2518         tcg_out_jmp(s, tcg_code_gen_epilogue);
2519     } else {
2520         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2521         tcg_out_jmp(s, tb_ret_addr);
2522     }
2523 }
2524
2525 static void tcg_out_goto_tb(TCGContext *s, int which)
2526 {
2527     /*
2528      * Jump displacement must be aligned for atomic patching;
2529      * see if we need to add extra nops before jump
2530      */
2531     int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2532     if (gap != 1) {
2533         tcg_out_nopn(s, gap - 1);
2534     }
2535     tcg_out8(s, OPC_JMP_long); /* jmp im */
2536     set_jmp_insn_offset(s, which);
2537     tcg_out32(s, 0);
2538     set_jmp_reset_offset(s, which);
2539 }
2540
2541 void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2542                               uintptr_t jmp_rx, uintptr_t jmp_rw)
2543 {
2544     /* patch the branch destination */
2545     uintptr_t addr = tb->jmp_target_addr[n];
2546     qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2547     /* no need to flush icache explicitly */
2548 }
2549
2550 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2551                               const TCGArg args[TCG_MAX_OP_ARGS],
2552                               const int const_args[TCG_MAX_OP_ARGS])
2553 {
2554     TCGArg a0, a1, a2;
2555     int c, const_a2, vexop, rexw = 0;
2556
2557 #if TCG_TARGET_REG_BITS == 64
2558 # define OP_32_64(x) \
2559         case glue(glue(INDEX_op_, x), _i64): \
2560             rexw = P_REXW; /* FALLTHRU */    \
2561         case glue(glue(INDEX_op_, x), _i32)
2562 #else
2563 # define OP_32_64(x) \
2564         case glue(glue(INDEX_op_, x), _i32)
2565 #endif
2566
2567     /* Hoist the loads of the most common arguments.  */
2568     a0 = args[0];
2569     a1 = args[1];
2570     a2 = args[2];
2571     const_a2 = const_args[2];
2572
2573     switch (opc) {
2574     case INDEX_op_goto_ptr:
2575         /* jmp to the given host address (could be epilogue) */
2576         tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2577         break;
2578     case INDEX_op_br:
2579         tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2580         break;
2581     OP_32_64(ld8u):
2582         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2583         tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2584         break;
2585     OP_32_64(ld8s):
2586         tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2587         break;
2588     OP_32_64(ld16u):
2589         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2590         tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2591         break;
2592     OP_32_64(ld16s):
2593         tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2594         break;
2595 #if TCG_TARGET_REG_BITS == 64
2596     case INDEX_op_ld32u_i64:
2597 #endif
2598     case INDEX_op_ld_i32:
2599         tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2600         break;
2601
2602     OP_32_64(st8):
2603         if (const_args[0]) {
2604             tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2605             tcg_out8(s, a0);
2606         } else {
2607             tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2608         }
2609         break;
2610     OP_32_64(st16):
2611         if (const_args[0]) {
2612             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2613             tcg_out16(s, a0);
2614         } else {
2615             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2616         }
2617         break;
2618 #if TCG_TARGET_REG_BITS == 64
2619     case INDEX_op_st32_i64:
2620 #endif
2621     case INDEX_op_st_i32:
2622         if (const_args[0]) {
2623             tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2624             tcg_out32(s, a0);
2625         } else {
2626             tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2627         }
2628         break;
2629
2630     OP_32_64(add):
2631         /* For 3-operand addition, use LEA.  */
2632         if (a0 != a1) {
2633             TCGArg c3 = 0;
2634             if (const_a2) {
2635                 c3 = a2, a2 = -1;
2636             } else if (a0 == a2) {
2637                 /* Watch out for dest = src + dest, since we've removed
2638                    the matching constraint on the add.  */
2639                 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2640                 break;
2641             }
2642
2643             tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2644             break;
2645         }
2646         c = ARITH_ADD;
2647         goto gen_arith;
2648     OP_32_64(sub):
2649         c = ARITH_SUB;
2650         goto gen_arith;
2651     OP_32_64(and):
2652         c = ARITH_AND;
2653         goto gen_arith;
2654     OP_32_64(or):
2655         c = ARITH_OR;
2656         goto gen_arith;
2657     OP_32_64(xor):
2658         c = ARITH_XOR;
2659         goto gen_arith;
2660     gen_arith:
2661         if (const_a2) {
2662             tgen_arithi(s, c + rexw, a0, a2, 0);
2663         } else {
2664             tgen_arithr(s, c + rexw, a0, a2);
2665         }
2666         break;
2667
2668     OP_32_64(andc):
2669         if (const_a2) {
2670             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2671             tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2672         } else {
2673             tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2674         }
2675         break;
2676
2677     OP_32_64(mul):
2678         if (const_a2) {
2679             int32_t val;
2680             val = a2;
2681             if (val == (int8_t)val) {
2682                 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2683                 tcg_out8(s, val);
2684             } else {
2685                 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2686                 tcg_out32(s, val);
2687             }
2688         } else {
2689             tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2690         }
2691         break;
2692
2693     OP_32_64(div2):
2694         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2695         break;
2696     OP_32_64(divu2):
2697         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2698         break;
2699
2700     OP_32_64(shl):
2701         /* For small constant 3-operand shift, use LEA.  */
2702         if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2703             if (a2 - 1 == 0) {
2704                 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2705                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2706             } else {
2707                 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2708                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2709             }
2710             break;
2711         }
2712         c = SHIFT_SHL;
2713         vexop = OPC_SHLX;
2714         goto gen_shift_maybe_vex;
2715     OP_32_64(shr):
2716         c = SHIFT_SHR;
2717         vexop = OPC_SHRX;
2718         goto gen_shift_maybe_vex;
2719     OP_32_64(sar):
2720         c = SHIFT_SAR;
2721         vexop = OPC_SARX;
2722         goto gen_shift_maybe_vex;
2723     OP_32_64(rotl):
2724         c = SHIFT_ROL;
2725         goto gen_shift;
2726     OP_32_64(rotr):
2727         c = SHIFT_ROR;
2728         goto gen_shift;
2729     gen_shift_maybe_vex:
2730         if (have_bmi2) {
2731             if (!const_a2) {
2732                 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2733                 break;
2734             }
2735             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2736         }
2737         /* FALLTHRU */
2738     gen_shift:
2739         if (const_a2) {
2740             tcg_out_shifti(s, c + rexw, a0, a2);
2741         } else {
2742             tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2743         }
2744         break;
2745
2746     OP_32_64(ctz):
2747         tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2748         break;
2749     OP_32_64(clz):
2750         tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2751         break;
2752     OP_32_64(ctpop):
2753         tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2754         break;
2755
2756     OP_32_64(brcond):
2757         tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1],
2758                        arg_label(args[3]), 0);
2759         break;
2760     OP_32_64(setcond):
2761         tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false);
2762         break;
2763     OP_32_64(negsetcond):
2764         tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true);
2765         break;
2766     OP_32_64(movcond):
2767         tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]);
2768         break;
2769
2770     OP_32_64(bswap16):
2771         if (a2 & TCG_BSWAP_OS) {
2772             /* Output must be sign-extended. */
2773             if (rexw) {
2774                 tcg_out_bswap64(s, a0);
2775                 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2776             } else {
2777                 tcg_out_bswap32(s, a0);
2778                 tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2779             }
2780         } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2781             /* Output must be zero-extended, but input isn't. */
2782             tcg_out_bswap32(s, a0);
2783             tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2784         } else {
2785             tcg_out_rolw_8(s, a0);
2786         }
2787         break;
2788     OP_32_64(bswap32):
2789         tcg_out_bswap32(s, a0);
2790         if (rexw && (a2 & TCG_BSWAP_OS)) {
2791             tcg_out_ext32s(s, a0, a0);
2792         }
2793         break;
2794
2795     OP_32_64(neg):
2796         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2797         break;
2798     OP_32_64(not):
2799         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2800         break;
2801
2802     case INDEX_op_qemu_ld_a64_i32:
2803         if (TCG_TARGET_REG_BITS == 32) {
2804             tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2805             break;
2806         }
2807         /* fall through */
2808     case INDEX_op_qemu_ld_a32_i32:
2809         tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2810         break;
2811     case INDEX_op_qemu_ld_a32_i64:
2812         if (TCG_TARGET_REG_BITS == 64) {
2813             tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2814         } else {
2815             tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2816         }
2817         break;
2818     case INDEX_op_qemu_ld_a64_i64:
2819         if (TCG_TARGET_REG_BITS == 64) {
2820             tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2821         } else {
2822             tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2823         }
2824         break;
2825     case INDEX_op_qemu_ld_a32_i128:
2826     case INDEX_op_qemu_ld_a64_i128:
2827         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2828         tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2829         break;
2830
2831     case INDEX_op_qemu_st_a64_i32:
2832     case INDEX_op_qemu_st8_a64_i32:
2833         if (TCG_TARGET_REG_BITS == 32) {
2834             tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2835             break;
2836         }
2837         /* fall through */
2838     case INDEX_op_qemu_st_a32_i32:
2839     case INDEX_op_qemu_st8_a32_i32:
2840         tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2841         break;
2842     case INDEX_op_qemu_st_a32_i64:
2843         if (TCG_TARGET_REG_BITS == 64) {
2844             tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2845         } else {
2846             tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2847         }
2848         break;
2849     case INDEX_op_qemu_st_a64_i64:
2850         if (TCG_TARGET_REG_BITS == 64) {
2851             tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2852         } else {
2853             tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2854         }
2855         break;
2856     case INDEX_op_qemu_st_a32_i128:
2857     case INDEX_op_qemu_st_a64_i128:
2858         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2859         tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2860         break;
2861
2862     OP_32_64(mulu2):
2863         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2864         break;
2865     OP_32_64(muls2):
2866         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2867         break;
2868     OP_32_64(add2):
2869         if (const_args[4]) {
2870             tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2871         } else {
2872             tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2873         }
2874         if (const_args[5]) {
2875             tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2876         } else {
2877             tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2878         }
2879         break;
2880     OP_32_64(sub2):
2881         if (const_args[4]) {
2882             tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2883         } else {
2884             tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2885         }
2886         if (const_args[5]) {
2887             tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2888         } else {
2889             tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2890         }
2891         break;
2892
2893 #if TCG_TARGET_REG_BITS == 32
2894     case INDEX_op_brcond2_i32:
2895         tcg_out_brcond2(s, args, const_args, 0);
2896         break;
2897     case INDEX_op_setcond2_i32:
2898         tcg_out_setcond2(s, args, const_args);
2899         break;
2900 #else /* TCG_TARGET_REG_BITS == 64 */
2901     case INDEX_op_ld32s_i64:
2902         tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2903         break;
2904     case INDEX_op_ld_i64:
2905         tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2906         break;
2907     case INDEX_op_st_i64:
2908         if (const_args[0]) {
2909             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2910             tcg_out32(s, a0);
2911         } else {
2912             tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2913         }
2914         break;
2915
2916     case INDEX_op_bswap64_i64:
2917         tcg_out_bswap64(s, a0);
2918         break;
2919     case INDEX_op_extrh_i64_i32:
2920         tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2921         break;
2922 #endif
2923
2924     OP_32_64(deposit):
2925         if (args[3] == 0 && args[4] == 8) {
2926             /* load bits 0..7 */
2927             if (const_a2) {
2928                 tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0),
2929                             0, a0, 0);
2930                 tcg_out8(s, a2);
2931             } else {
2932                 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2933             }
2934         } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) {
2935             /* load bits 8..15 */
2936             if (const_a2) {
2937                 tcg_out8(s, OPC_MOVB_Ib + a0 + 4);
2938                 tcg_out8(s, a2);
2939             } else {
2940                 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2941             }
2942         } else if (args[3] == 0 && args[4] == 16) {
2943             /* load bits 0..15 */
2944             if (const_a2) {
2945                 tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0),
2946                             0, a0, 0);
2947                 tcg_out16(s, a2);
2948             } else {
2949                 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2950             }
2951         } else {
2952             g_assert_not_reached();
2953         }
2954         break;
2955
2956     case INDEX_op_extract_i64:
2957         if (a2 + args[3] == 32) {
2958             /* This is a 32-bit zero-extending right shift.  */
2959             tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2960             tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2961             break;
2962         }
2963         /* FALLTHRU */
2964     case INDEX_op_extract_i32:
2965         /* On the off-chance that we can use the high-byte registers.
2966            Otherwise we emit the same ext16 + shift pattern that we
2967            would have gotten from the normal tcg-op.c expansion.  */
2968         tcg_debug_assert(a2 == 8 && args[3] == 8);
2969         if (a1 < 4 && a0 < 8) {
2970             tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2971         } else {
2972             tcg_out_ext16u(s, a0, a1);
2973             tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2974         }
2975         break;
2976
2977     case INDEX_op_sextract_i32:
2978         /* We don't implement sextract_i64, as we cannot sign-extend to
2979            64-bits without using the REX prefix that explicitly excludes
2980            access to the high-byte registers.  */
2981         tcg_debug_assert(a2 == 8 && args[3] == 8);
2982         if (a1 < 4 && a0 < 8) {
2983             tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2984         } else {
2985             tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
2986             tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2987         }
2988         break;
2989
2990     OP_32_64(extract2):
2991         /* Note that SHRD outputs to the r/m operand.  */
2992         tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2993         tcg_out8(s, args[3]);
2994         break;
2995
2996     case INDEX_op_mb:
2997         tcg_out_mb(s, a0);
2998         break;
2999     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
3000     case INDEX_op_mov_i64:
3001     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
3002     case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
3003     case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
3004     case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
3005     case INDEX_op_ext8s_i64:
3006     case INDEX_op_ext8u_i32:
3007     case INDEX_op_ext8u_i64:
3008     case INDEX_op_ext16s_i32:
3009     case INDEX_op_ext16s_i64:
3010     case INDEX_op_ext16u_i32:
3011     case INDEX_op_ext16u_i64:
3012     case INDEX_op_ext32s_i64:
3013     case INDEX_op_ext32u_i64:
3014     case INDEX_op_ext_i32_i64:
3015     case INDEX_op_extu_i32_i64:
3016     case INDEX_op_extrl_i64_i32:
3017     default:
3018         g_assert_not_reached();
3019     }
3020
3021 #undef OP_32_64
3022 }
3023
3024 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
3025                            unsigned vecl, unsigned vece,
3026                            const TCGArg args[TCG_MAX_OP_ARGS],
3027                            const int const_args[TCG_MAX_OP_ARGS])
3028 {
3029     static int const add_insn[4] = {
3030         OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
3031     };
3032     static int const ssadd_insn[4] = {
3033         OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
3034     };
3035     static int const usadd_insn[4] = {
3036         OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
3037     };
3038     static int const sub_insn[4] = {
3039         OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
3040     };
3041     static int const sssub_insn[4] = {
3042         OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
3043     };
3044     static int const ussub_insn[4] = {
3045         OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
3046     };
3047     static int const mul_insn[4] = {
3048         OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
3049     };
3050     static int const shift_imm_insn[4] = {
3051         OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
3052     };
3053     static int const cmpeq_insn[4] = {
3054         OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
3055     };
3056     static int const cmpgt_insn[4] = {
3057         OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
3058     };
3059     static int const punpckl_insn[4] = {
3060         OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
3061     };
3062     static int const punpckh_insn[4] = {
3063         OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
3064     };
3065     static int const packss_insn[4] = {
3066         OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
3067     };
3068     static int const packus_insn[4] = {
3069         OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
3070     };
3071     static int const smin_insn[4] = {
3072         OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
3073     };
3074     static int const smax_insn[4] = {
3075         OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
3076     };
3077     static int const umin_insn[4] = {
3078         OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
3079     };
3080     static int const umax_insn[4] = {
3081         OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
3082     };
3083     static int const rotlv_insn[4] = {
3084         OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
3085     };
3086     static int const rotrv_insn[4] = {
3087         OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
3088     };
3089     static int const shlv_insn[4] = {
3090         OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
3091     };
3092     static int const shrv_insn[4] = {
3093         OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
3094     };
3095     static int const sarv_insn[4] = {
3096         OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
3097     };
3098     static int const shls_insn[4] = {
3099         OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
3100     };
3101     static int const shrs_insn[4] = {
3102         OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
3103     };
3104     static int const sars_insn[4] = {
3105         OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
3106     };
3107     static int const vpshldi_insn[4] = {
3108         OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
3109     };
3110     static int const vpshldv_insn[4] = {
3111         OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
3112     };
3113     static int const vpshrdv_insn[4] = {
3114         OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
3115     };
3116     static int const abs_insn[4] = {
3117         OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
3118     };
3119
3120     TCGType type = vecl + TCG_TYPE_V64;
3121     int insn, sub;
3122     TCGArg a0, a1, a2, a3;
3123
3124     a0 = args[0];
3125     a1 = args[1];
3126     a2 = args[2];
3127
3128     switch (opc) {
3129     case INDEX_op_add_vec:
3130         insn = add_insn[vece];
3131         goto gen_simd;
3132     case INDEX_op_ssadd_vec:
3133         insn = ssadd_insn[vece];
3134         goto gen_simd;
3135     case INDEX_op_usadd_vec:
3136         insn = usadd_insn[vece];
3137         goto gen_simd;
3138     case INDEX_op_sub_vec:
3139         insn = sub_insn[vece];
3140         goto gen_simd;
3141     case INDEX_op_sssub_vec:
3142         insn = sssub_insn[vece];
3143         goto gen_simd;
3144     case INDEX_op_ussub_vec:
3145         insn = ussub_insn[vece];
3146         goto gen_simd;
3147     case INDEX_op_mul_vec:
3148         insn = mul_insn[vece];
3149         goto gen_simd;
3150     case INDEX_op_and_vec:
3151         insn = OPC_PAND;
3152         goto gen_simd;
3153     case INDEX_op_or_vec:
3154         insn = OPC_POR;
3155         goto gen_simd;
3156     case INDEX_op_xor_vec:
3157         insn = OPC_PXOR;
3158         goto gen_simd;
3159     case INDEX_op_smin_vec:
3160         insn = smin_insn[vece];
3161         goto gen_simd;
3162     case INDEX_op_umin_vec:
3163         insn = umin_insn[vece];
3164         goto gen_simd;
3165     case INDEX_op_smax_vec:
3166         insn = smax_insn[vece];
3167         goto gen_simd;
3168     case INDEX_op_umax_vec:
3169         insn = umax_insn[vece];
3170         goto gen_simd;
3171     case INDEX_op_shlv_vec:
3172         insn = shlv_insn[vece];
3173         goto gen_simd;
3174     case INDEX_op_shrv_vec:
3175         insn = shrv_insn[vece];
3176         goto gen_simd;
3177     case INDEX_op_sarv_vec:
3178         insn = sarv_insn[vece];
3179         goto gen_simd;
3180     case INDEX_op_rotlv_vec:
3181         insn = rotlv_insn[vece];
3182         goto gen_simd;
3183     case INDEX_op_rotrv_vec:
3184         insn = rotrv_insn[vece];
3185         goto gen_simd;
3186     case INDEX_op_shls_vec:
3187         insn = shls_insn[vece];
3188         goto gen_simd;
3189     case INDEX_op_shrs_vec:
3190         insn = shrs_insn[vece];
3191         goto gen_simd;
3192     case INDEX_op_sars_vec:
3193         insn = sars_insn[vece];
3194         goto gen_simd;
3195     case INDEX_op_x86_punpckl_vec:
3196         insn = punpckl_insn[vece];
3197         goto gen_simd;
3198     case INDEX_op_x86_punpckh_vec:
3199         insn = punpckh_insn[vece];
3200         goto gen_simd;
3201     case INDEX_op_x86_packss_vec:
3202         insn = packss_insn[vece];
3203         goto gen_simd;
3204     case INDEX_op_x86_packus_vec:
3205         insn = packus_insn[vece];
3206         goto gen_simd;
3207     case INDEX_op_x86_vpshldv_vec:
3208         insn = vpshldv_insn[vece];
3209         a1 = a2;
3210         a2 = args[3];
3211         goto gen_simd;
3212     case INDEX_op_x86_vpshrdv_vec:
3213         insn = vpshrdv_insn[vece];
3214         a1 = a2;
3215         a2 = args[3];
3216         goto gen_simd;
3217 #if TCG_TARGET_REG_BITS == 32
3218     case INDEX_op_dup2_vec:
3219         /* First merge the two 32-bit inputs to a single 64-bit element. */
3220         tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
3221         /* Then replicate the 64-bit elements across the rest of the vector. */
3222         if (type != TCG_TYPE_V64) {
3223             tcg_out_dup_vec(s, type, MO_64, a0, a0);
3224         }
3225         break;
3226 #endif
3227     case INDEX_op_abs_vec:
3228         insn = abs_insn[vece];
3229         a2 = a1;
3230         a1 = 0;
3231         goto gen_simd;
3232     gen_simd:
3233         tcg_debug_assert(insn != OPC_UD2);
3234         if (type == TCG_TYPE_V256) {
3235             insn |= P_VEXL;
3236         }
3237         tcg_out_vex_modrm(s, insn, a0, a1, a2);
3238         break;
3239
3240     case INDEX_op_cmp_vec:
3241         sub = args[3];
3242         if (sub == TCG_COND_EQ) {
3243             insn = cmpeq_insn[vece];
3244         } else if (sub == TCG_COND_GT) {
3245             insn = cmpgt_insn[vece];
3246         } else {
3247             g_assert_not_reached();
3248         }
3249         goto gen_simd;
3250
3251     case INDEX_op_andc_vec:
3252         insn = OPC_PANDN;
3253         if (type == TCG_TYPE_V256) {
3254             insn |= P_VEXL;
3255         }
3256         tcg_out_vex_modrm(s, insn, a0, a2, a1);
3257         break;
3258
3259     case INDEX_op_shli_vec:
3260         insn = shift_imm_insn[vece];
3261         sub = 6;
3262         goto gen_shift;
3263     case INDEX_op_shri_vec:
3264         insn = shift_imm_insn[vece];
3265         sub = 2;
3266         goto gen_shift;
3267     case INDEX_op_sari_vec:
3268         if (vece == MO_64) {
3269             insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
3270         } else {
3271             insn = shift_imm_insn[vece];
3272         }
3273         sub = 4;
3274         goto gen_shift;
3275     case INDEX_op_rotli_vec:
3276         insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
3277         if (vece == MO_64) {
3278             insn |= P_VEXW;
3279         }
3280         sub = 1;
3281         goto gen_shift;
3282     gen_shift:
3283         tcg_debug_assert(vece != MO_8);
3284         if (type == TCG_TYPE_V256) {
3285             insn |= P_VEXL;
3286         }
3287         tcg_out_vex_modrm(s, insn, sub, a0, a1);
3288         tcg_out8(s, a2);
3289         break;
3290
3291     case INDEX_op_ld_vec:
3292         tcg_out_ld(s, type, a0, a1, a2);
3293         break;
3294     case INDEX_op_st_vec:
3295         tcg_out_st(s, type, a0, a1, a2);
3296         break;
3297     case INDEX_op_dupm_vec:
3298         tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3299         break;
3300
3301     case INDEX_op_x86_shufps_vec:
3302         insn = OPC_SHUFPS;
3303         sub = args[3];
3304         goto gen_simd_imm8;
3305     case INDEX_op_x86_blend_vec:
3306         if (vece == MO_16) {
3307             insn = OPC_PBLENDW;
3308         } else if (vece == MO_32) {
3309             insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3310         } else {
3311             g_assert_not_reached();
3312         }
3313         sub = args[3];
3314         goto gen_simd_imm8;
3315     case INDEX_op_x86_vperm2i128_vec:
3316         insn = OPC_VPERM2I128;
3317         sub = args[3];
3318         goto gen_simd_imm8;
3319     case INDEX_op_x86_vpshldi_vec:
3320         insn = vpshldi_insn[vece];
3321         sub = args[3];
3322         goto gen_simd_imm8;
3323
3324     case INDEX_op_not_vec:
3325         insn = OPC_VPTERNLOGQ;
3326         a2 = a1;
3327         sub = 0x33; /* !B */
3328         goto gen_simd_imm8;
3329     case INDEX_op_nor_vec:
3330         insn = OPC_VPTERNLOGQ;
3331         sub = 0x11; /* norCB */
3332         goto gen_simd_imm8;
3333     case INDEX_op_nand_vec:
3334         insn = OPC_VPTERNLOGQ;
3335         sub = 0x77; /* nandCB */
3336         goto gen_simd_imm8;
3337     case INDEX_op_eqv_vec:
3338         insn = OPC_VPTERNLOGQ;
3339         sub = 0x99; /* xnorCB */
3340         goto gen_simd_imm8;
3341     case INDEX_op_orc_vec:
3342         insn = OPC_VPTERNLOGQ;
3343         sub = 0xdd; /* orB!C */
3344         goto gen_simd_imm8;
3345
3346     case INDEX_op_bitsel_vec:
3347         insn = OPC_VPTERNLOGQ;
3348         a3 = args[3];
3349         if (a0 == a1) {
3350             a1 = a2;
3351             a2 = a3;
3352             sub = 0xca; /* A?B:C */
3353         } else if (a0 == a2) {
3354             a2 = a3;
3355             sub = 0xe2; /* B?A:C */
3356         } else {
3357             tcg_out_mov(s, type, a0, a3);
3358             sub = 0xb8; /* B?C:A */
3359         }
3360         goto gen_simd_imm8;
3361
3362     gen_simd_imm8:
3363         tcg_debug_assert(insn != OPC_UD2);
3364         if (type == TCG_TYPE_V256) {
3365             insn |= P_VEXL;
3366         }
3367         tcg_out_vex_modrm(s, insn, a0, a1, a2);
3368         tcg_out8(s, sub);
3369         break;
3370
3371     case INDEX_op_x86_vpblendvb_vec:
3372         insn = OPC_VPBLENDVB;
3373         if (type == TCG_TYPE_V256) {
3374             insn |= P_VEXL;
3375         }
3376         tcg_out_vex_modrm(s, insn, a0, a1, a2);
3377         tcg_out8(s, args[3] << 4);
3378         break;
3379
3380     case INDEX_op_x86_psrldq_vec:
3381         tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3382         tcg_out8(s, a2);
3383         break;
3384
3385     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3386     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3387     default:
3388         g_assert_not_reached();
3389     }
3390 }
3391
3392 static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3393 {
3394     switch (op) {
3395     case INDEX_op_goto_ptr:
3396         return C_O0_I1(r);
3397
3398     case INDEX_op_ld8u_i32:
3399     case INDEX_op_ld8u_i64:
3400     case INDEX_op_ld8s_i32:
3401     case INDEX_op_ld8s_i64:
3402     case INDEX_op_ld16u_i32:
3403     case INDEX_op_ld16u_i64:
3404     case INDEX_op_ld16s_i32:
3405     case INDEX_op_ld16s_i64:
3406     case INDEX_op_ld_i32:
3407     case INDEX_op_ld32u_i64:
3408     case INDEX_op_ld32s_i64:
3409     case INDEX_op_ld_i64:
3410         return C_O1_I1(r, r);
3411
3412     case INDEX_op_st8_i32:
3413     case INDEX_op_st8_i64:
3414         return C_O0_I2(qi, r);
3415
3416     case INDEX_op_st16_i32:
3417     case INDEX_op_st16_i64:
3418     case INDEX_op_st_i32:
3419     case INDEX_op_st32_i64:
3420         return C_O0_I2(ri, r);
3421
3422     case INDEX_op_st_i64:
3423         return C_O0_I2(re, r);
3424
3425     case INDEX_op_add_i32:
3426     case INDEX_op_add_i64:
3427         return C_O1_I2(r, r, re);
3428
3429     case INDEX_op_sub_i32:
3430     case INDEX_op_sub_i64:
3431     case INDEX_op_mul_i32:
3432     case INDEX_op_mul_i64:
3433     case INDEX_op_or_i32:
3434     case INDEX_op_or_i64:
3435     case INDEX_op_xor_i32:
3436     case INDEX_op_xor_i64:
3437         return C_O1_I2(r, 0, re);
3438
3439     case INDEX_op_and_i32:
3440     case INDEX_op_and_i64:
3441         return C_O1_I2(r, 0, reZ);
3442
3443     case INDEX_op_andc_i32:
3444     case INDEX_op_andc_i64:
3445         return C_O1_I2(r, r, rI);
3446
3447     case INDEX_op_shl_i32:
3448     case INDEX_op_shl_i64:
3449     case INDEX_op_shr_i32:
3450     case INDEX_op_shr_i64:
3451     case INDEX_op_sar_i32:
3452     case INDEX_op_sar_i64:
3453         return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3454
3455     case INDEX_op_rotl_i32:
3456     case INDEX_op_rotl_i64:
3457     case INDEX_op_rotr_i32:
3458     case INDEX_op_rotr_i64:
3459         return C_O1_I2(r, 0, ci);
3460
3461     case INDEX_op_brcond_i32:
3462     case INDEX_op_brcond_i64:
3463         return C_O0_I2(r, reT);
3464
3465     case INDEX_op_bswap16_i32:
3466     case INDEX_op_bswap16_i64:
3467     case INDEX_op_bswap32_i32:
3468     case INDEX_op_bswap32_i64:
3469     case INDEX_op_bswap64_i64:
3470     case INDEX_op_neg_i32:
3471     case INDEX_op_neg_i64:
3472     case INDEX_op_not_i32:
3473     case INDEX_op_not_i64:
3474     case INDEX_op_extrh_i64_i32:
3475         return C_O1_I1(r, 0);
3476
3477     case INDEX_op_ext8s_i32:
3478     case INDEX_op_ext8s_i64:
3479     case INDEX_op_ext8u_i32:
3480     case INDEX_op_ext8u_i64:
3481         return C_O1_I1(r, q);
3482
3483     case INDEX_op_ext16s_i32:
3484     case INDEX_op_ext16s_i64:
3485     case INDEX_op_ext16u_i32:
3486     case INDEX_op_ext16u_i64:
3487     case INDEX_op_ext32s_i64:
3488     case INDEX_op_ext32u_i64:
3489     case INDEX_op_ext_i32_i64:
3490     case INDEX_op_extu_i32_i64:
3491     case INDEX_op_extrl_i64_i32:
3492     case INDEX_op_extract_i32:
3493     case INDEX_op_extract_i64:
3494     case INDEX_op_sextract_i32:
3495     case INDEX_op_ctpop_i32:
3496     case INDEX_op_ctpop_i64:
3497         return C_O1_I1(r, r);
3498
3499     case INDEX_op_extract2_i32:
3500     case INDEX_op_extract2_i64:
3501         return C_O1_I2(r, 0, r);
3502
3503     case INDEX_op_deposit_i32:
3504     case INDEX_op_deposit_i64:
3505         return C_O1_I2(q, 0, qi);
3506
3507     case INDEX_op_setcond_i32:
3508     case INDEX_op_setcond_i64:
3509     case INDEX_op_negsetcond_i32:
3510     case INDEX_op_negsetcond_i64:
3511         return C_O1_I2(q, r, reT);
3512
3513     case INDEX_op_movcond_i32:
3514     case INDEX_op_movcond_i64:
3515         return C_O1_I4(r, r, reT, r, 0);
3516
3517     case INDEX_op_div2_i32:
3518     case INDEX_op_div2_i64:
3519     case INDEX_op_divu2_i32:
3520     case INDEX_op_divu2_i64:
3521         return C_O2_I3(a, d, 0, 1, r);
3522
3523     case INDEX_op_mulu2_i32:
3524     case INDEX_op_mulu2_i64:
3525     case INDEX_op_muls2_i32:
3526     case INDEX_op_muls2_i64:
3527         return C_O2_I2(a, d, a, r);
3528
3529     case INDEX_op_add2_i32:
3530     case INDEX_op_add2_i64:
3531     case INDEX_op_sub2_i32:
3532     case INDEX_op_sub2_i64:
3533         return C_N1_O1_I4(r, r, 0, 1, re, re);
3534
3535     case INDEX_op_ctz_i32:
3536     case INDEX_op_ctz_i64:
3537         return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3538
3539     case INDEX_op_clz_i32:
3540     case INDEX_op_clz_i64:
3541         return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3542
3543     case INDEX_op_qemu_ld_a32_i32:
3544         return C_O1_I1(r, L);
3545     case INDEX_op_qemu_ld_a64_i32:
3546         return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L);
3547
3548     case INDEX_op_qemu_st_a32_i32:
3549         return C_O0_I2(L, L);
3550     case INDEX_op_qemu_st_a64_i32:
3551         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3552     case INDEX_op_qemu_st8_a32_i32:
3553         return C_O0_I2(s, L);
3554     case INDEX_op_qemu_st8_a64_i32:
3555         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L);
3556
3557     case INDEX_op_qemu_ld_a32_i64:
3558         return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);
3559     case INDEX_op_qemu_ld_a64_i64:
3560         return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L);
3561
3562     case INDEX_op_qemu_st_a32_i64:
3563         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3564     case INDEX_op_qemu_st_a64_i64:
3565         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
3566
3567     case INDEX_op_qemu_ld_a32_i128:
3568     case INDEX_op_qemu_ld_a64_i128:
3569         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3570         return C_O2_I1(r, r, L);
3571     case INDEX_op_qemu_st_a32_i128:
3572     case INDEX_op_qemu_st_a64_i128:
3573         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3574         return C_O0_I3(L, L, L);
3575
3576     case INDEX_op_brcond2_i32:
3577         return C_O0_I4(r, r, ri, ri);
3578
3579     case INDEX_op_setcond2_i32:
3580         return C_O1_I4(r, r, r, ri, ri);
3581
3582     case INDEX_op_ld_vec:
3583     case INDEX_op_dupm_vec:
3584         return C_O1_I1(x, r);
3585
3586     case INDEX_op_st_vec:
3587         return C_O0_I2(x, r);
3588
3589     case INDEX_op_add_vec:
3590     case INDEX_op_sub_vec:
3591     case INDEX_op_mul_vec:
3592     case INDEX_op_and_vec:
3593     case INDEX_op_or_vec:
3594     case INDEX_op_xor_vec:
3595     case INDEX_op_andc_vec:
3596     case INDEX_op_orc_vec:
3597     case INDEX_op_nand_vec:
3598     case INDEX_op_nor_vec:
3599     case INDEX_op_eqv_vec:
3600     case INDEX_op_ssadd_vec:
3601     case INDEX_op_usadd_vec:
3602     case INDEX_op_sssub_vec:
3603     case INDEX_op_ussub_vec:
3604     case INDEX_op_smin_vec:
3605     case INDEX_op_umin_vec:
3606     case INDEX_op_smax_vec:
3607     case INDEX_op_umax_vec:
3608     case INDEX_op_shlv_vec:
3609     case INDEX_op_shrv_vec:
3610     case INDEX_op_sarv_vec:
3611     case INDEX_op_rotlv_vec:
3612     case INDEX_op_rotrv_vec:
3613     case INDEX_op_shls_vec:
3614     case INDEX_op_shrs_vec:
3615     case INDEX_op_sars_vec:
3616     case INDEX_op_cmp_vec:
3617     case INDEX_op_x86_shufps_vec:
3618     case INDEX_op_x86_blend_vec:
3619     case INDEX_op_x86_packss_vec:
3620     case INDEX_op_x86_packus_vec:
3621     case INDEX_op_x86_vperm2i128_vec:
3622     case INDEX_op_x86_punpckl_vec:
3623     case INDEX_op_x86_punpckh_vec:
3624     case INDEX_op_x86_vpshldi_vec:
3625 #if TCG_TARGET_REG_BITS == 32
3626     case INDEX_op_dup2_vec:
3627 #endif
3628         return C_O1_I2(x, x, x);
3629
3630     case INDEX_op_abs_vec:
3631     case INDEX_op_dup_vec:
3632     case INDEX_op_not_vec:
3633     case INDEX_op_shli_vec:
3634     case INDEX_op_shri_vec:
3635     case INDEX_op_sari_vec:
3636     case INDEX_op_rotli_vec:
3637     case INDEX_op_x86_psrldq_vec:
3638         return C_O1_I1(x, x);
3639
3640     case INDEX_op_x86_vpshldv_vec:
3641     case INDEX_op_x86_vpshrdv_vec:
3642         return C_O1_I3(x, 0, x, x);
3643
3644     case INDEX_op_bitsel_vec:
3645     case INDEX_op_x86_vpblendvb_vec:
3646         return C_O1_I3(x, x, x, x);
3647
3648     default:
3649         g_assert_not_reached();
3650     }
3651 }
3652
3653 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3654 {
3655     switch (opc) {
3656     case INDEX_op_add_vec:
3657     case INDEX_op_sub_vec:
3658     case INDEX_op_and_vec:
3659     case INDEX_op_or_vec:
3660     case INDEX_op_xor_vec:
3661     case INDEX_op_andc_vec:
3662     case INDEX_op_orc_vec:
3663     case INDEX_op_nand_vec:
3664     case INDEX_op_nor_vec:
3665     case INDEX_op_eqv_vec:
3666     case INDEX_op_not_vec:
3667     case INDEX_op_bitsel_vec:
3668         return 1;
3669     case INDEX_op_cmp_vec:
3670     case INDEX_op_cmpsel_vec:
3671         return -1;
3672
3673     case INDEX_op_rotli_vec:
3674         return have_avx512vl && vece >= MO_32 ? 1 : -1;
3675
3676     case INDEX_op_shli_vec:
3677     case INDEX_op_shri_vec:
3678         /* We must expand the operation for MO_8.  */
3679         return vece == MO_8 ? -1 : 1;
3680
3681     case INDEX_op_sari_vec:
3682         switch (vece) {
3683         case MO_8:
3684             return -1;
3685         case MO_16:
3686         case MO_32:
3687             return 1;
3688         case MO_64:
3689             if (have_avx512vl) {
3690                 return 1;
3691             }
3692             /*
3693              * We can emulate this for MO_64, but it does not pay off
3694              * unless we're producing at least 4 values.
3695              */
3696             return type >= TCG_TYPE_V256 ? -1 : 0;
3697         }
3698         return 0;
3699
3700     case INDEX_op_shls_vec:
3701     case INDEX_op_shrs_vec:
3702         return vece >= MO_16;
3703     case INDEX_op_sars_vec:
3704         switch (vece) {
3705         case MO_16:
3706         case MO_32:
3707             return 1;
3708         case MO_64:
3709             return have_avx512vl;
3710         }
3711         return 0;
3712     case INDEX_op_rotls_vec:
3713         return vece >= MO_16 ? -1 : 0;
3714
3715     case INDEX_op_shlv_vec:
3716     case INDEX_op_shrv_vec:
3717         switch (vece) {
3718         case MO_16:
3719             return have_avx512bw;
3720         case MO_32:
3721         case MO_64:
3722             return have_avx2;
3723         }
3724         return 0;
3725     case INDEX_op_sarv_vec:
3726         switch (vece) {
3727         case MO_16:
3728             return have_avx512bw;
3729         case MO_32:
3730             return have_avx2;
3731         case MO_64:
3732             return have_avx512vl;
3733         }
3734         return 0;
3735     case INDEX_op_rotlv_vec:
3736     case INDEX_op_rotrv_vec:
3737         switch (vece) {
3738         case MO_16:
3739             return have_avx512vbmi2 ? -1 : 0;
3740         case MO_32:
3741         case MO_64:
3742             return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3743         }
3744         return 0;
3745
3746     case INDEX_op_mul_vec:
3747         switch (vece) {
3748         case MO_8:
3749             return -1;
3750         case MO_64:
3751             return have_avx512dq;
3752         }
3753         return 1;
3754
3755     case INDEX_op_ssadd_vec:
3756     case INDEX_op_usadd_vec:
3757     case INDEX_op_sssub_vec:
3758     case INDEX_op_ussub_vec:
3759         return vece <= MO_16;
3760     case INDEX_op_smin_vec:
3761     case INDEX_op_smax_vec:
3762     case INDEX_op_umin_vec:
3763     case INDEX_op_umax_vec:
3764     case INDEX_op_abs_vec:
3765         return vece <= MO_32 || have_avx512vl;
3766
3767     default:
3768         return 0;
3769     }
3770 }
3771
3772 static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3773                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3774 {
3775     TCGv_vec t1, t2;
3776
3777     tcg_debug_assert(vece == MO_8);
3778
3779     t1 = tcg_temp_new_vec(type);
3780     t2 = tcg_temp_new_vec(type);
3781
3782     /*
3783      * Unpack to W, shift, and repack.  Tricky bits:
3784      * (1) Use punpck*bw x,x to produce DDCCBBAA,
3785      *     i.e. duplicate in other half of the 16-bit lane.
3786      * (2) For right-shift, add 8 so that the high half of the lane
3787      *     becomes zero.  For left-shift, and left-rotate, we must
3788      *     shift up and down again.
3789      * (3) Step 2 leaves high half zero such that PACKUSWB
3790      *     (pack with unsigned saturation) does not modify
3791      *     the quantity.
3792      */
3793     vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3794               tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3795     vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3796               tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3797
3798     if (opc != INDEX_op_rotli_vec) {
3799         imm += 8;
3800     }
3801     if (opc == INDEX_op_shri_vec) {
3802         tcg_gen_shri_vec(MO_16, t1, t1, imm);
3803         tcg_gen_shri_vec(MO_16, t2, t2, imm);
3804     } else {
3805         tcg_gen_shli_vec(MO_16, t1, t1, imm);
3806         tcg_gen_shli_vec(MO_16, t2, t2, imm);
3807         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3808         tcg_gen_shri_vec(MO_16, t2, t2, 8);
3809     }
3810
3811     vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3812               tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3813     tcg_temp_free_vec(t1);
3814     tcg_temp_free_vec(t2);
3815 }
3816
3817 static void expand_vec_sari(TCGType type, unsigned vece,
3818                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3819 {
3820     TCGv_vec t1, t2;
3821
3822     switch (vece) {
3823     case MO_8:
3824         /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3825         t1 = tcg_temp_new_vec(type);
3826         t2 = tcg_temp_new_vec(type);
3827         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3828                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3829         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3830                   tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3831         tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3832         tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3833         vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3834                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3835         tcg_temp_free_vec(t1);
3836         tcg_temp_free_vec(t2);
3837         break;
3838
3839     case MO_64:
3840         t1 = tcg_temp_new_vec(type);
3841         if (imm <= 32) {
3842             /*
3843              * We can emulate a small sign extend by performing an arithmetic
3844              * 32-bit shift and overwriting the high half of a 64-bit logical
3845              * shift.  Note that the ISA says shift of 32 is valid, but TCG
3846              * does not, so we have to bound the smaller shift -- we get the
3847              * same result in the high half either way.
3848              */
3849             tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3850             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3851             vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3852                       tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3853                       tcgv_vec_arg(t1), 0xaa);
3854         } else {
3855             /* Otherwise we will need to use a compare vs 0 to produce
3856              * the sign-extend, shift and merge.
3857              */
3858             tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
3859                             tcg_constant_vec(type, MO_64, 0), v1);
3860             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3861             tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3862             tcg_gen_or_vec(MO_64, v0, v0, t1);
3863         }
3864         tcg_temp_free_vec(t1);
3865         break;
3866
3867     default:
3868         g_assert_not_reached();
3869     }
3870 }
3871
3872 static void expand_vec_rotli(TCGType type, unsigned vece,
3873                              TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3874 {
3875     TCGv_vec t;
3876
3877     if (vece == MO_8) {
3878         expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3879         return;
3880     }
3881
3882     if (have_avx512vbmi2) {
3883         vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3884                   tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3885         return;
3886     }
3887
3888     t = tcg_temp_new_vec(type);
3889     tcg_gen_shli_vec(vece, t, v1, imm);
3890     tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3891     tcg_gen_or_vec(vece, v0, v0, t);
3892     tcg_temp_free_vec(t);
3893 }
3894
3895 static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3896                             TCGv_vec v1, TCGv_vec sh, bool right)
3897 {
3898     TCGv_vec t;
3899
3900     if (have_avx512vbmi2) {
3901         vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3902                   type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3903                   tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3904         return;
3905     }
3906
3907     t = tcg_temp_new_vec(type);
3908     tcg_gen_dupi_vec(vece, t, 8 << vece);
3909     tcg_gen_sub_vec(vece, t, t, sh);
3910     if (right) {
3911         tcg_gen_shlv_vec(vece, t, v1, t);
3912         tcg_gen_shrv_vec(vece, v0, v1, sh);
3913     } else {
3914         tcg_gen_shrv_vec(vece, t, v1, t);
3915         tcg_gen_shlv_vec(vece, v0, v1, sh);
3916     }
3917     tcg_gen_or_vec(vece, v0, v0, t);
3918     tcg_temp_free_vec(t);
3919 }
3920
3921 static void expand_vec_rotls(TCGType type, unsigned vece,
3922                              TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3923 {
3924     TCGv_vec t = tcg_temp_new_vec(type);
3925
3926     tcg_debug_assert(vece != MO_8);
3927
3928     if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3929         tcg_gen_dup_i32_vec(vece, t, lsh);
3930         if (vece >= MO_32) {
3931             tcg_gen_rotlv_vec(vece, v0, v1, t);
3932         } else {
3933             expand_vec_rotv(type, vece, v0, v1, t, false);
3934         }
3935     } else {
3936         TCGv_i32 rsh = tcg_temp_new_i32();
3937
3938         tcg_gen_neg_i32(rsh, lsh);
3939         tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3940         tcg_gen_shls_vec(vece, t, v1, lsh);
3941         tcg_gen_shrs_vec(vece, v0, v1, rsh);
3942         tcg_gen_or_vec(vece, v0, v0, t);
3943
3944         tcg_temp_free_i32(rsh);
3945     }
3946
3947     tcg_temp_free_vec(t);
3948 }
3949
3950 static void expand_vec_mul(TCGType type, unsigned vece,
3951                            TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3952 {
3953     TCGv_vec t1, t2, t3, t4, zero;
3954
3955     tcg_debug_assert(vece == MO_8);
3956
3957     /*
3958      * Unpack v1 bytes to words, 0 | x.
3959      * Unpack v2 bytes to words, y | 0.
3960      * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3961      * Shift logical right by 8 bits to clear the high 8 bytes before
3962      * using an unsigned saturated pack.
3963      *
3964      * The difference between the V64, V128 and V256 cases is merely how
3965      * we distribute the expansion between temporaries.
3966      */
3967     switch (type) {
3968     case TCG_TYPE_V64:
3969         t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3970         t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3971         zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3972         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3973                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3974         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3975                   tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3976         tcg_gen_mul_vec(MO_16, t1, t1, t2);
3977         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3978         vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3979                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3980         tcg_temp_free_vec(t1);
3981         tcg_temp_free_vec(t2);
3982         break;
3983
3984     case TCG_TYPE_V128:
3985     case TCG_TYPE_V256:
3986         t1 = tcg_temp_new_vec(type);
3987         t2 = tcg_temp_new_vec(type);
3988         t3 = tcg_temp_new_vec(type);
3989         t4 = tcg_temp_new_vec(type);
3990         zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3991         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3992                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3993         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3994                   tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3995         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3996                   tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3997         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3998                   tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3999         tcg_gen_mul_vec(MO_16, t1, t1, t2);
4000         tcg_gen_mul_vec(MO_16, t3, t3, t4);
4001         tcg_gen_shri_vec(MO_16, t1, t1, 8);
4002         tcg_gen_shri_vec(MO_16, t3, t3, 8);
4003         vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
4004                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
4005         tcg_temp_free_vec(t1);
4006         tcg_temp_free_vec(t2);
4007         tcg_temp_free_vec(t3);
4008         tcg_temp_free_vec(t4);
4009         break;
4010
4011     default:
4012         g_assert_not_reached();
4013     }
4014 }
4015
4016 static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
4017                                  TCGv_vec v1, TCGv_vec v2, TCGCond cond)
4018 {
4019     enum {
4020         NEED_INV  = 1,
4021         NEED_SWAP = 2,
4022         NEED_BIAS = 4,
4023         NEED_UMIN = 8,
4024         NEED_UMAX = 16,
4025     };
4026     TCGv_vec t1, t2, t3;
4027     uint8_t fixup;
4028
4029     switch (cond) {
4030     case TCG_COND_EQ:
4031     case TCG_COND_GT:
4032         fixup = 0;
4033         break;
4034     case TCG_COND_NE:
4035     case TCG_COND_LE:
4036         fixup = NEED_INV;
4037         break;
4038     case TCG_COND_LT:
4039         fixup = NEED_SWAP;
4040         break;
4041     case TCG_COND_GE:
4042         fixup = NEED_SWAP | NEED_INV;
4043         break;
4044     case TCG_COND_LEU:
4045         if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
4046             fixup = NEED_UMIN;
4047         } else {
4048             fixup = NEED_BIAS | NEED_INV;
4049         }
4050         break;
4051     case TCG_COND_GTU:
4052         if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
4053             fixup = NEED_UMIN | NEED_INV;
4054         } else {
4055             fixup = NEED_BIAS;
4056         }
4057         break;
4058     case TCG_COND_GEU:
4059         if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
4060             fixup = NEED_UMAX;
4061         } else {
4062             fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
4063         }
4064         break;
4065     case TCG_COND_LTU:
4066         if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
4067             fixup = NEED_UMAX | NEED_INV;
4068         } else {
4069             fixup = NEED_BIAS | NEED_SWAP;
4070         }
4071         break;
4072     default:
4073         g_assert_not_reached();
4074     }
4075
4076     if (fixup & NEED_INV) {
4077         cond = tcg_invert_cond(cond);
4078     }
4079     if (fixup & NEED_SWAP) {
4080         t1 = v1, v1 = v2, v2 = t1;
4081         cond = tcg_swap_cond(cond);
4082     }
4083
4084     t1 = t2 = NULL;
4085     if (fixup & (NEED_UMIN | NEED_UMAX)) {
4086         t1 = tcg_temp_new_vec(type);
4087         if (fixup & NEED_UMIN) {
4088             tcg_gen_umin_vec(vece, t1, v1, v2);
4089         } else {
4090             tcg_gen_umax_vec(vece, t1, v1, v2);
4091         }
4092         v2 = t1;
4093         cond = TCG_COND_EQ;
4094     } else if (fixup & NEED_BIAS) {
4095         t1 = tcg_temp_new_vec(type);
4096         t2 = tcg_temp_new_vec(type);
4097         t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
4098         tcg_gen_sub_vec(vece, t1, v1, t3);
4099         tcg_gen_sub_vec(vece, t2, v2, t3);
4100         v1 = t1;
4101         v2 = t2;
4102         cond = tcg_signed_cond(cond);
4103     }
4104
4105     tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
4106     /* Expand directly; do not recurse.  */
4107     vec_gen_4(INDEX_op_cmp_vec, type, vece,
4108               tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
4109
4110     if (t1) {
4111         tcg_temp_free_vec(t1);
4112         if (t2) {
4113             tcg_temp_free_vec(t2);
4114         }
4115     }
4116     return fixup & NEED_INV;
4117 }
4118
4119 static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
4120                            TCGv_vec v1, TCGv_vec v2, TCGCond cond)
4121 {
4122     if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
4123         tcg_gen_not_vec(vece, v0, v0);
4124     }
4125 }
4126
4127 static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
4128                               TCGv_vec c1, TCGv_vec c2,
4129                               TCGv_vec v3, TCGv_vec v4, TCGCond cond)
4130 {
4131     TCGv_vec t = tcg_temp_new_vec(type);
4132
4133     if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
4134         /* Invert the sense of the compare by swapping arguments.  */
4135         TCGv_vec x;
4136         x = v3, v3 = v4, v4 = x;
4137     }
4138     vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
4139               tcgv_vec_arg(v0), tcgv_vec_arg(v4),
4140               tcgv_vec_arg(v3), tcgv_vec_arg(t));
4141     tcg_temp_free_vec(t);
4142 }
4143
4144 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
4145                        TCGArg a0, ...)
4146 {
4147     va_list va;
4148     TCGArg a2;
4149     TCGv_vec v0, v1, v2, v3, v4;
4150
4151     va_start(va, a0);
4152     v0 = temp_tcgv_vec(arg_temp(a0));
4153     v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4154     a2 = va_arg(va, TCGArg);
4155
4156     switch (opc) {
4157     case INDEX_op_shli_vec:
4158     case INDEX_op_shri_vec:
4159         expand_vec_shi(type, vece, opc, v0, v1, a2);
4160         break;
4161
4162     case INDEX_op_sari_vec:
4163         expand_vec_sari(type, vece, v0, v1, a2);
4164         break;
4165
4166     case INDEX_op_rotli_vec:
4167         expand_vec_rotli(type, vece, v0, v1, a2);
4168         break;
4169
4170     case INDEX_op_rotls_vec:
4171         expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
4172         break;
4173
4174     case INDEX_op_rotlv_vec:
4175         v2 = temp_tcgv_vec(arg_temp(a2));
4176         expand_vec_rotv(type, vece, v0, v1, v2, false);
4177         break;
4178     case INDEX_op_rotrv_vec:
4179         v2 = temp_tcgv_vec(arg_temp(a2));
4180         expand_vec_rotv(type, vece, v0, v1, v2, true);
4181         break;
4182
4183     case INDEX_op_mul_vec:
4184         v2 = temp_tcgv_vec(arg_temp(a2));
4185         expand_vec_mul(type, vece, v0, v1, v2);
4186         break;
4187
4188     case INDEX_op_cmp_vec:
4189         v2 = temp_tcgv_vec(arg_temp(a2));
4190         expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
4191         break;
4192
4193     case INDEX_op_cmpsel_vec:
4194         v2 = temp_tcgv_vec(arg_temp(a2));
4195         v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4196         v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4197         expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
4198         break;
4199
4200     default:
4201         break;
4202     }
4203
4204     va_end(va);
4205 }
4206
4207 static const int tcg_target_callee_save_regs[] = {
4208 #if TCG_TARGET_REG_BITS == 64
4209     TCG_REG_RBP,
4210     TCG_REG_RBX,
4211 #if defined(_WIN64)
4212     TCG_REG_RDI,
4213     TCG_REG_RSI,
4214 #endif
4215     TCG_REG_R12,
4216     TCG_REG_R13,
4217     TCG_REG_R14, /* Currently used for the global env. */
4218     TCG_REG_R15,
4219 #if defined(_WIN64)
4220     TCG_REG_RDI,
4221     TCG_REG_RSI
4222 #endif
4223 #else
4224     TCG_REG_EBP, /* Currently used for the global env. */
4225     TCG_REG_EBX,
4226     TCG_REG_ESI,
4227     TCG_REG_EDI,
4228 #endif
4229 };
4230
4231 /* Compute frame size via macros, to share between tcg_target_qemu_prologue
4232    and tcg_register_jit.  */
4233
4234 #define PUSH_SIZE \
4235     ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
4236      * (TCG_TARGET_REG_BITS / 8))
4237
4238 #define FRAME_SIZE \
4239     ((PUSH_SIZE \
4240       + TCG_STATIC_CALL_ARGS_SIZE \
4241       + CPU_TEMP_BUF_NLONGS * sizeof(long) \
4242       + TCG_TARGET_STACK_ALIGN - 1) \
4243      & ~(TCG_TARGET_STACK_ALIGN - 1))
4244
4245 /* Generate global QEMU prologue and epilogue code */
4246 static void tcg_target_qemu_prologue(TCGContext *s)
4247 {
4248     int i, stack_addend;
4249
4250     /* TB prologue */
4251
4252     /* Reserve some stack space, also for TCG temps.  */
4253     stack_addend = FRAME_SIZE - PUSH_SIZE;
4254     tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
4255                   CPU_TEMP_BUF_NLONGS * sizeof(long));
4256
4257     /* Save all callee saved registers.  */
4258     for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
4259         tcg_out_push(s, tcg_target_callee_save_regs[i]);
4260     }
4261
4262     if (!tcg_use_softmmu && guest_base) {
4263         int seg = setup_guest_base_seg();
4264         if (seg != 0) {
4265             x86_guest_base.seg = seg;
4266         } else if (guest_base == (int32_t)guest_base) {
4267             x86_guest_base.ofs = guest_base;
4268         } else {
4269             assert(TCG_TARGET_REG_BITS == 64);
4270             /* Choose R12 because, as a base, it requires a SIB byte. */
4271             x86_guest_base.index = TCG_REG_R12;
4272             tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
4273             tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
4274         }
4275     }
4276
4277     if (TCG_TARGET_REG_BITS == 32) {
4278         tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
4279                    (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
4280         tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4281         /* jmp *tb.  */
4282         tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
4283                              (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
4284                              + stack_addend);
4285     } else {
4286         tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
4287         tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4288         /* jmp *tb.  */
4289         tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
4290     }
4291
4292     /*
4293      * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
4294      * and fall through to the rest of the epilogue.
4295      */
4296     tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
4297     tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
4298
4299     /* TB epilogue */
4300     tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
4301
4302     tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
4303
4304     if (have_avx2) {
4305         tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
4306     }
4307     for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
4308         tcg_out_pop(s, tcg_target_callee_save_regs[i]);
4309     }
4310     tcg_out_opc(s, OPC_RET, 0, 0, 0);
4311 }
4312
4313 static void tcg_out_tb_start(TCGContext *s)
4314 {
4315     /* nothing to do */
4316 }
4317
4318 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
4319 {
4320     memset(p, 0x90, count);
4321 }
4322
4323 static void tcg_target_init(TCGContext *s)
4324 {
4325     tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4326     if (TCG_TARGET_REG_BITS == 64) {
4327         tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4328     }
4329     if (have_avx1) {
4330         tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4331         tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4332     }
4333     if (have_avx2) {
4334         tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4335     }
4336
4337     tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4338     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4339     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4340     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4341     if (TCG_TARGET_REG_BITS == 64) {
4342 #if !defined(_WIN64)
4343         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4344         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4345 #endif
4346         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4347         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4348         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4349         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4350     }
4351
4352     s->reserved_regs = 0;
4353     tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4354     tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
4355 #ifdef _WIN64
4356     /* These are call saved, and we don't save them, so don't use them. */
4357     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4358     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4359     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4360     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4361     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4362     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4363     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4364     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4365     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4366     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4367 #endif
4368 }
4369
4370 typedef struct {
4371     DebugFrameHeader h;
4372     uint8_t fde_def_cfa[4];
4373     uint8_t fde_reg_ofs[14];
4374 } DebugFrame;
4375
4376 /* We're expecting a 2 byte uleb128 encoded value.  */
4377 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
4378
4379 #if !defined(__ELF__)
4380     /* Host machine without ELF. */
4381 #elif TCG_TARGET_REG_BITS == 64
4382 #define ELF_HOST_MACHINE EM_X86_64
4383 static const DebugFrame debug_frame = {
4384     .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4385     .h.cie.id = -1,
4386     .h.cie.version = 1,
4387     .h.cie.code_align = 1,
4388     .h.cie.data_align = 0x78,             /* sleb128 -8 */
4389     .h.cie.return_column = 16,
4390
4391     /* Total FDE size does not include the "len" member.  */
4392     .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4393
4394     .fde_def_cfa = {
4395         12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
4396         (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4397         (FRAME_SIZE >> 7)
4398     },
4399     .fde_reg_ofs = {
4400         0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
4401         /* The following ordering must match tcg_target_callee_save_regs.  */
4402         0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
4403         0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
4404         0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
4405         0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
4406         0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
4407         0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
4408     }
4409 };
4410 #else
4411 #define ELF_HOST_MACHINE EM_386
4412 static const DebugFrame debug_frame = {
4413     .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4414     .h.cie.id = -1,
4415     .h.cie.version = 1,
4416     .h.cie.code_align = 1,
4417     .h.cie.data_align = 0x7c,             /* sleb128 -4 */
4418     .h.cie.return_column = 8,
4419
4420     /* Total FDE size does not include the "len" member.  */
4421     .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4422
4423     .fde_def_cfa = {
4424         12, 4,                          /* DW_CFA_def_cfa %esp, ... */
4425         (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4426         (FRAME_SIZE >> 7)
4427     },
4428     .fde_reg_ofs = {
4429         0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
4430         /* The following ordering must match tcg_target_callee_save_regs.  */
4431         0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
4432         0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
4433         0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
4434         0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
4435     }
4436 };
4437 #endif
4438
4439 #if defined(ELF_HOST_MACHINE)
4440 void tcg_register_jit(const void *buf, size_t buf_size)
4441 {
4442     tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
4443 }
4444 #endif