tcg/i386/tcg-target.inc.c
1 /*
2 * Tiny Code Generator for QEMU
4 * Copyright (c) 2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
25 #include "tcg-pool.inc.c"
27 #ifdef CONFIG_DEBUG_TCG
28 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
29 #if TCG_TARGET_REG_BITS == 64
30 "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
31 #else
32 "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
33 #endif
34 "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
35 "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
36 #if TCG_TARGET_REG_BITS == 64
37 "%xmm8", "%xmm9", "%xmm10", "%xmm11",
38 "%xmm12", "%xmm13", "%xmm14", "%xmm15",
39 #endif
41 #endif
43 static const int tcg_target_reg_alloc_order[] = {
44 #if TCG_TARGET_REG_BITS == 64
45 TCG_REG_RBP,
46 TCG_REG_RBX,
47 TCG_REG_R12,
48 TCG_REG_R13,
49 TCG_REG_R14,
50 TCG_REG_R15,
51 TCG_REG_R10,
52 TCG_REG_R11,
53 TCG_REG_R9,
54 TCG_REG_R8,
55 TCG_REG_RCX,
56 TCG_REG_RDX,
57 TCG_REG_RSI,
58 TCG_REG_RDI,
59 TCG_REG_RAX,
60 #else
61 TCG_REG_EBX,
62 TCG_REG_ESI,
63 TCG_REG_EDI,
64 TCG_REG_EBP,
65 TCG_REG_ECX,
66 TCG_REG_EDX,
67 TCG_REG_EAX,
68 #endif
69 TCG_REG_XMM0,
70 TCG_REG_XMM1,
71 TCG_REG_XMM2,
72 TCG_REG_XMM3,
73 TCG_REG_XMM4,
74 TCG_REG_XMM5,
75 #ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saved registers, and we do
       not save any of them in the prologue.  Therefore only allow
       xmm0-xmm5 to be allocated.  */
78 TCG_REG_XMM6,
79 TCG_REG_XMM7,
80 #if TCG_TARGET_REG_BITS == 64
81 TCG_REG_XMM8,
82 TCG_REG_XMM9,
83 TCG_REG_XMM10,
84 TCG_REG_XMM11,
85 TCG_REG_XMM12,
86 TCG_REG_XMM13,
87 TCG_REG_XMM14,
88 TCG_REG_XMM15,
89 #endif
90 #endif
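/* Note: the order above is an allocation preference.  The call-saved
   registers (%rbp, %rbx, %r12-%r15 on x86_64; %ebx, %esi, %edi, %ebp on
   i386) come first so that long-lived values can survive calls to helper
   functions; the call-clobbered and argument registers, and finally
   %eax/%rax, are listed last.  */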
93 static const int tcg_target_call_iarg_regs[] = {
94 #if TCG_TARGET_REG_BITS == 64
95 #if defined(_WIN64)
96 TCG_REG_RCX,
97 TCG_REG_RDX,
98 #else
99 TCG_REG_RDI,
100 TCG_REG_RSI,
101 TCG_REG_RDX,
102 TCG_REG_RCX,
103 #endif
104 TCG_REG_R8,
105 TCG_REG_R9,
106 #else
107 /* 32 bit mode uses stack based calling convention (GCC default). */
108 #endif
111 static const int tcg_target_call_oarg_regs[] = {
112 TCG_REG_EAX,
113 #if TCG_TARGET_REG_BITS == 32
114 TCG_REG_EDX
115 #endif
118 /* Constants we accept. */
119 #define TCG_CT_CONST_S32 0x100
120 #define TCG_CT_CONST_U32 0x200
121 #define TCG_CT_CONST_I32 0x400
122 #define TCG_CT_CONST_WSZ 0x800
124 /* Registers used with L constraint, which are the first argument
125 registers on x86_64, and two random call clobbered registers on
126 i386. */
127 #if TCG_TARGET_REG_BITS == 64
128 # define TCG_REG_L0 tcg_target_call_iarg_regs[0]
129 # define TCG_REG_L1 tcg_target_call_iarg_regs[1]
130 #else
131 # define TCG_REG_L0 TCG_REG_EAX
132 # define TCG_REG_L1 TCG_REG_EDX
133 #endif
/* The host compiler should supply <cpuid.h> to enable runtime feature
   detection, as we're not going to go so far as our own inline assembly.
   If it is not available, default values will be assumed.  */
138 #if defined(CONFIG_CPUID_H)
139 #include "qemu/cpuid.h"
140 #endif
142 /* For 64-bit, we always know that CMOV is available. */
143 #if TCG_TARGET_REG_BITS == 64
144 # define have_cmov 1
145 #elif defined(CONFIG_CPUID_H)
146 static bool have_cmov;
147 #else
148 # define have_cmov 0
149 #endif
/* We need these symbols in tcg-target.h, and we can't properly conditionalize
   it there.  Therefore we always define the variables.  */
153 bool have_bmi1;
154 bool have_popcnt;
155 bool have_avx1;
156 bool have_avx2;
158 #ifdef CONFIG_CPUID_H
159 static bool have_movbe;
160 static bool have_bmi2;
161 static bool have_lzcnt;
162 #else
163 # define have_movbe 0
164 # define have_bmi2 0
165 # define have_lzcnt 0
166 #endif
168 static tcg_insn_unit *tb_ret_addr;
170 static void patch_reloc(tcg_insn_unit *code_ptr, int type,
171 intptr_t value, intptr_t addend)
173 value += addend;
174 switch(type) {
175 case R_386_PC32:
176 value -= (uintptr_t)code_ptr;
177 if (value != (int32_t)value) {
178 tcg_abort();
180 /* FALLTHRU */
181 case R_386_32:
182 tcg_patch32(code_ptr, value);
183 break;
184 case R_386_PC8:
185 value -= (uintptr_t)code_ptr;
186 if (value != (int8_t)value) {
187 tcg_abort();
189 tcg_patch8(code_ptr, value);
190 break;
191 default:
192 tcg_abort();
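/* Note: for pc-relative relocations the callers pass an addend of -4
   (R_386_PC32) or -1 (R_386_PC8), so that the patched displacement ends
   up relative to the end of the displacement field -- i.e. the start of
   the next instruction -- rather than to code_ptr itself, which is what
   the processor expects for rel32/rel8 branches.  */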
196 #if TCG_TARGET_REG_BITS == 64
197 #define ALL_GENERAL_REGS 0x0000ffffu
198 #define ALL_VECTOR_REGS 0xffff0000u
199 #else
200 #define ALL_GENERAL_REGS 0x000000ffu
201 #define ALL_VECTOR_REGS 0x00ff0000u
202 #endif
204 /* parse target specific constraints */
205 static const char *target_parse_constraint(TCGArgConstraint *ct,
206 const char *ct_str, TCGType type)
208 switch(*ct_str++) {
209 case 'a':
210 ct->ct |= TCG_CT_REG;
211 tcg_regset_set_reg(ct->u.regs, TCG_REG_EAX);
212 break;
213 case 'b':
214 ct->ct |= TCG_CT_REG;
215 tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
216 break;
217 case 'c':
218 ct->ct |= TCG_CT_REG;
219 tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
220 break;
221 case 'd':
222 ct->ct |= TCG_CT_REG;
223 tcg_regset_set_reg(ct->u.regs, TCG_REG_EDX);
224 break;
225 case 'S':
226 ct->ct |= TCG_CT_REG;
227 tcg_regset_set_reg(ct->u.regs, TCG_REG_ESI);
228 break;
229 case 'D':
230 ct->ct |= TCG_CT_REG;
231 tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
232 break;
233 case 'q':
234 /* A register that can be used as a byte operand. */
235 ct->ct |= TCG_CT_REG;
236 ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
237 break;
238 case 'Q':
239 /* A register with an addressable second byte (e.g. %ah). */
240 ct->ct |= TCG_CT_REG;
241 ct->u.regs = 0xf;
242 break;
243 case 'r':
244 /* A general register. */
245 ct->ct |= TCG_CT_REG;
246 ct->u.regs |= ALL_GENERAL_REGS;
247 break;
248 case 'W':
249 /* With TZCNT/LZCNT, we can have operand-size as an input. */
250 ct->ct |= TCG_CT_CONST_WSZ;
251 break;
252 case 'x':
253 /* A vector register. */
254 ct->ct |= TCG_CT_REG;
255 ct->u.regs |= ALL_VECTOR_REGS;
256 break;
258 /* qemu_ld/st address constraint */
259 case 'L':
260 ct->ct |= TCG_CT_REG;
261 ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
262 tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
263 tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
264 break;
266 case 'e':
267 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
268 break;
269 case 'Z':
270 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
271 break;
272 case 'I':
273 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
274 break;
276 default:
277 return NULL;
279 return ct_str;
282 /* test if a constant matches the constraint */
283 static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
284 const TCGArgConstraint *arg_ct)
286 int ct = arg_ct->ct;
287 if (ct & TCG_CT_CONST) {
288 return 1;
290 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
291 return 1;
293 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
294 return 1;
296 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
297 return 1;
299 if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
300 return 1;
302 return 0;
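/* Note: the extra constants accepted here back the 'e', 'Z', 'I' and 'W'
   constraint letters parsed above: S32 means the value fits in a
   sign-extended 32-bit immediate, U32 that it fits zero-extended, I32
   that its bitwise inverse fits sign-extended, and WSZ that the value
   equals the operation's word size (32 or 64).  */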
305 #if TCG_TARGET_REG_BITS == 64
306 # define LOWREGMASK(x) ((x) & 7)
307 #else
308 # define LOWREGMASK(x) (x)
309 #endif
311 #define P_EXT 0x100 /* 0x0f opcode prefix */
312 #define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */
313 #define P_DATA16 0x400 /* 0x66 opcode prefix */
314 #if TCG_TARGET_REG_BITS == 64
315 # define P_ADDR32 0x800 /* 0x67 opcode prefix */
316 # define P_REXW 0x1000 /* Set REX.W = 1 */
317 # define P_REXB_R 0x2000 /* REG field as byte register */
318 # define P_REXB_RM 0x4000 /* R/M field as byte register */
319 # define P_GS 0x8000 /* gs segment override */
320 #else
321 # define P_ADDR32 0
322 # define P_REXW 0
323 # define P_REXB_R 0
324 # define P_REXB_RM 0
325 # define P_GS 0
326 #endif
327 #define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */
328 #define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */
329 #define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */
330 #define P_VEXL 0x80000 /* Set VEX.L = 1 */
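/* For illustration: each opcode constant below packs the final opcode byte
   in the low 8 bits and the required prefixes in the P_* flags, e.g.
       OPC_MOVZWL = 0xb7 | P_EXT             ->  0f b7 /r
       OPC_PADDW  = 0xfd | P_EXT | P_DATA16  ->  66 0f fd /r
       OPC_LZCNT  = 0xbd | P_EXT | P_SIMDF3  ->  f3 0f bd /r
   tcg_out_opc() below emits the prefix bytes implied by the flags before
   the opcode byte itself.  */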
332 #define OPC_ARITH_EvIz (0x81)
333 #define OPC_ARITH_EvIb (0x83)
334 #define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */
335 #define OPC_ANDN (0xf2 | P_EXT38)
336 #define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3))
337 #define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16)
338 #define OPC_BSF (0xbc | P_EXT)
339 #define OPC_BSR (0xbd | P_EXT)
340 #define OPC_BSWAP (0xc8 | P_EXT)
341 #define OPC_CALL_Jz (0xe8)
342 #define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */
343 #define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3))
344 #define OPC_DEC_r32 (0x48)
345 #define OPC_IMUL_GvEv (0xaf | P_EXT)
346 #define OPC_IMUL_GvEvIb (0x6b)
347 #define OPC_IMUL_GvEvIz (0x69)
348 #define OPC_INC_r32 (0x40)
349 #define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */
350 #define OPC_JCC_short (0x70) /* ... plus condition code */
351 #define OPC_JMP_long (0xe9)
352 #define OPC_JMP_short (0xeb)
353 #define OPC_LEA (0x8d)
354 #define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3)
355 #define OPC_MOVB_EvGv (0x88) /* stores, more or less */
356 #define OPC_MOVL_EvGv (0x89) /* stores, more or less */
357 #define OPC_MOVL_GvEv (0x8b) /* loads, more or less */
358 #define OPC_MOVB_EvIz (0xc6)
359 #define OPC_MOVL_EvIz (0xc7)
360 #define OPC_MOVL_Iv (0xb8)
361 #define OPC_MOVBE_GyMy (0xf0 | P_EXT38)
362 #define OPC_MOVBE_MyGy (0xf1 | P_EXT38)
363 #define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16)
364 #define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16)
365 #define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2)
366 #define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
367 #define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
368 #define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
369 #define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
370 #define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3)
371 #define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16)
372 #define OPC_MOVSBL (0xbe | P_EXT)
373 #define OPC_MOVSWL (0xbf | P_EXT)
374 #define OPC_MOVSLQ (0x63 | P_REXW)
375 #define OPC_MOVZBL (0xb6 | P_EXT)
376 #define OPC_MOVZWL (0xb7 | P_EXT)
377 #define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16)
378 #define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16)
379 #define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16)
380 #define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16)
381 #define OPC_PADDB (0xfc | P_EXT | P_DATA16)
382 #define OPC_PADDW (0xfd | P_EXT | P_DATA16)
383 #define OPC_PADDD (0xfe | P_EXT | P_DATA16)
384 #define OPC_PADDQ (0xd4 | P_EXT | P_DATA16)
385 #define OPC_PAND (0xdb | P_EXT | P_DATA16)
386 #define OPC_PANDN (0xdf | P_EXT | P_DATA16)
387 #define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16)
388 #define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16)
389 #define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16)
390 #define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16)
391 #define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16)
392 #define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16)
393 #define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16)
394 #define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16)
395 #define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16)
396 #define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16)
397 #define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16)
398 #define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16)
399 #define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16)
400 #define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16)
401 #define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16)
402 #define OPC_PMULLW (0xd5 | P_EXT | P_DATA16)
403 #define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16)
404 #define OPC_POR (0xeb | P_EXT | P_DATA16)
405 #define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16)
406 #define OPC_PSHUFD (0x70 | P_EXT | P_DATA16)
407 #define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2)
408 #define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3)
409 #define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
410 #define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
411 #define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
412 #define OPC_PSUBB (0xf8 | P_EXT | P_DATA16)
413 #define OPC_PSUBW (0xf9 | P_EXT | P_DATA16)
414 #define OPC_PSUBD (0xfa | P_EXT | P_DATA16)
415 #define OPC_PSUBQ (0xfb | P_EXT | P_DATA16)
416 #define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16)
417 #define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16)
418 #define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16)
419 #define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16)
420 #define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16)
421 #define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16)
422 #define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16)
423 #define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16)
424 #define OPC_PXOR (0xef | P_EXT | P_DATA16)
425 #define OPC_POP_r32 (0x58)
426 #define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3)
427 #define OPC_PUSH_r32 (0x50)
428 #define OPC_PUSH_Iv (0x68)
429 #define OPC_PUSH_Ib (0x6a)
430 #define OPC_RET (0xc3)
431 #define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
432 #define OPC_SHIFT_1 (0xd1)
433 #define OPC_SHIFT_Ib (0xc1)
434 #define OPC_SHIFT_cl (0xd3)
435 #define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3)
436 #define OPC_SHUFPS (0xc6 | P_EXT)
437 #define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16)
438 #define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2)
439 #define OPC_TESTL (0x85)
440 #define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3)
441 #define OPC_UD2 (0x0b | P_EXT)
442 #define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16)
443 #define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16)
444 #define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
445 #define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
446 #define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
447 #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
448 #define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_REXW)
449 #define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
450 #define OPC_VZEROUPPER (0x77 | P_EXT)
451 #define OPC_XCHG_ax_r32 (0x90)
453 #define OPC_GRP3_Ev (0xf7)
454 #define OPC_GRP5 (0xff)
455 #define OPC_GRP14 (0x73 | P_EXT | P_DATA16)
457 /* Group 1 opcode extensions for 0x80-0x83.
458 These are also used as modifiers for OPC_ARITH. */
459 #define ARITH_ADD 0
460 #define ARITH_OR 1
461 #define ARITH_ADC 2
462 #define ARITH_SBB 3
463 #define ARITH_AND 4
464 #define ARITH_SUB 5
465 #define ARITH_XOR 6
466 #define ARITH_CMP 7
468 /* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */
469 #define SHIFT_ROL 0
470 #define SHIFT_ROR 1
471 #define SHIFT_SHL 4
472 #define SHIFT_SHR 5
473 #define SHIFT_SAR 7
475 /* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. */
476 #define EXT3_NOT 2
477 #define EXT3_NEG 3
478 #define EXT3_MUL 4
479 #define EXT3_IMUL 5
480 #define EXT3_DIV 6
481 #define EXT3_IDIV 7
483 /* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */
484 #define EXT5_INC_Ev 0
485 #define EXT5_DEC_Ev 1
486 #define EXT5_CALLN_Ev 2
487 #define EXT5_JMPN_Ev 4
489 /* Condition codes to be added to OPC_JCC_{long,short}. */
490 #define JCC_JMP (-1)
491 #define JCC_JO 0x0
492 #define JCC_JNO 0x1
493 #define JCC_JB 0x2
494 #define JCC_JAE 0x3
495 #define JCC_JE 0x4
496 #define JCC_JNE 0x5
497 #define JCC_JBE 0x6
498 #define JCC_JA 0x7
499 #define JCC_JS 0x8
500 #define JCC_JNS 0x9
501 #define JCC_JP 0xa
502 #define JCC_JNP 0xb
503 #define JCC_JL 0xc
504 #define JCC_JGE 0xd
505 #define JCC_JLE 0xe
506 #define JCC_JG 0xf
508 static const uint8_t tcg_cond_to_jcc[] = {
509 [TCG_COND_EQ] = JCC_JE,
510 [TCG_COND_NE] = JCC_JNE,
511 [TCG_COND_LT] = JCC_JL,
512 [TCG_COND_GE] = JCC_JGE,
513 [TCG_COND_LE] = JCC_JLE,
514 [TCG_COND_GT] = JCC_JG,
515 [TCG_COND_LTU] = JCC_JB,
516 [TCG_COND_GEU] = JCC_JAE,
517 [TCG_COND_LEU] = JCC_JBE,
518 [TCG_COND_GTU] = JCC_JA,
521 #if TCG_TARGET_REG_BITS == 64
522 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
524 int rex;
526 if (opc & P_GS) {
527 tcg_out8(s, 0x65);
529 if (opc & P_DATA16) {
530 /* We should never be asking for both 16 and 64-bit operation. */
531 tcg_debug_assert((opc & P_REXW) == 0);
532 tcg_out8(s, 0x66);
534 if (opc & P_ADDR32) {
535 tcg_out8(s, 0x67);
537 if (opc & P_SIMDF3) {
538 tcg_out8(s, 0xf3);
539 } else if (opc & P_SIMDF2) {
540 tcg_out8(s, 0xf2);
543 rex = 0;
544 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */
545 rex |= (r & 8) >> 1; /* REX.R */
546 rex |= (x & 8) >> 2; /* REX.X */
547 rex |= (rm & 8) >> 3; /* REX.B */
549 /* P_REXB_{R,RM} indicates that the given register is the low byte.
550 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
551 as otherwise the encoding indicates %[abcd]h. Note that the values
552 that are ORed in merely indicate that the REX byte must be present;
553 those bits get discarded in output. */
554 rex |= opc & (r >= 4 ? P_REXB_R : 0);
555 rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
557 if (rex) {
558 tcg_out8(s, (uint8_t)(rex | 0x40));
561 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
562 tcg_out8(s, 0x0f);
563 if (opc & P_EXT38) {
564 tcg_out8(s, 0x38);
565 } else if (opc & P_EXT3A) {
566 tcg_out8(s, 0x3a);
570 tcg_out8(s, opc);
572 #else
573 static void tcg_out_opc(TCGContext *s, int opc)
575 if (opc & P_DATA16) {
576 tcg_out8(s, 0x66);
578 if (opc & P_SIMDF3) {
579 tcg_out8(s, 0xf3);
580 } else if (opc & P_SIMDF2) {
581 tcg_out8(s, 0xf2);
583 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
584 tcg_out8(s, 0x0f);
585 if (opc & P_EXT38) {
586 tcg_out8(s, 0x38);
587 } else if (opc & P_EXT3A) {
588 tcg_out8(s, 0x3a);
591 tcg_out8(s, opc);
593 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
594 the 32-bit compilation paths. This method works with all versions of gcc,
595 whereas relying on optimization may not be able to exclude them. */
596 #define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc)
597 #endif
599 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
601 tcg_out_opc(s, opc, r, rm, 0);
602 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
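/* For illustration (on an x86_64 host):
       tcg_out_modrm(s, OPC_MOVL_GvEv | P_REXW, TCG_REG_RAX, TCG_REG_R12)
   emits 49 8b c4, i.e. "movq %r12, %rax": REX.W+B = 0x49, opcode 0x8b,
   and ModRM 0xc4 (mod=3, reg=rax, rm=r12 low bits).  */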
605 static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
606 int rm, int index)
608 int tmp;
610 /* Use the two byte form if possible, which cannot encode
611 VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */
612 if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
613 && ((rm | index) & 8) == 0) {
614 /* Two byte VEX prefix. */
615 tcg_out8(s, 0xc5);
617 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */
618 } else {
619 /* Three byte VEX prefix. */
620 tcg_out8(s, 0xc4);
622 /* VEX.m-mmmm */
623 if (opc & P_EXT3A) {
624 tmp = 3;
625 } else if (opc & P_EXT38) {
626 tmp = 2;
627 } else if (opc & P_EXT) {
628 tmp = 1;
629 } else {
630 g_assert_not_reached();
632 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */
633 tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */
634 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */
635 tcg_out8(s, tmp);
637 tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */
640 tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */
641 /* VEX.pp */
642 if (opc & P_DATA16) {
643 tmp |= 1; /* 0x66 */
644 } else if (opc & P_SIMDF3) {
645 tmp |= 2; /* 0xf3 */
646 } else if (opc & P_SIMDF2) {
647 tmp |= 3; /* 0xf2 */
649 tmp |= (~v & 15) << 3; /* VEX.vvvv */
650 tcg_out8(s, tmp);
651 tcg_out8(s, opc);
654 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
656 tcg_out_vex_opc(s, opc, r, v, rm, 0);
657 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
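/* For illustration: tcg_out_vex_modrm(s, OPC_PXOR, TCG_REG_XMM1,
   TCG_REG_XMM2, TCG_REG_XMM3) qualifies for the two-byte VEX form and
   emits c5 e9 ef cb, i.e. "vpxor %xmm3, %xmm2, %xmm1"; VEX.vvvv carries
   the extra (non-destructive) source register %xmm2, stored inverted.  */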
/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
   A missing RM or INDEX is indicated with a negative value.  In 64-bit
   mode for absolute addresses, ~RM is the size of the immediate operand
   that will follow the instruction.  */
665 static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
666 int shift, intptr_t offset)
668 int mod, len;
670 if (index < 0 && rm < 0) {
671 if (TCG_TARGET_REG_BITS == 64) {
672 /* Try for a rip-relative addressing mode. This has replaced
673 the 32-bit-mode absolute addressing encoding. */
674 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
675 intptr_t disp = offset - pc;
676 if (disp == (int32_t)disp) {
677 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
678 tcg_out32(s, disp);
679 return;
682 /* Try for an absolute address encoding. This requires the
683 use of the MODRM+SIB encoding and is therefore larger than
684 rip-relative addressing. */
685 if (offset == (int32_t)offset) {
686 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
687 tcg_out8(s, (4 << 3) | 5);
688 tcg_out32(s, offset);
689 return;
692 /* ??? The memory isn't directly addressable. */
693 g_assert_not_reached();
694 } else {
695 /* Absolute address. */
696 tcg_out8(s, (r << 3) | 5);
697 tcg_out32(s, offset);
698 return;
702 /* Find the length of the immediate addend. Note that the encoding
703 that would be used for (%ebp) indicates absolute addressing. */
704 if (rm < 0) {
705 mod = 0, len = 4, rm = 5;
706 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
707 mod = 0, len = 0;
708 } else if (offset == (int8_t)offset) {
709 mod = 0x40, len = 1;
710 } else {
711 mod = 0x80, len = 4;
714 /* Use a single byte MODRM format if possible. Note that the encoding
715 that would be used for %esp is the escape to the two byte form. */
716 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
717 /* Single byte MODRM format. */
718 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
719 } else {
720 /* Two byte MODRM+SIB format. */
722 /* Note that the encoding that would place %esp into the index
723 field indicates no index register. In 64-bit mode, the REX.X
724 bit counts, so %r12 can be used as the index. */
725 if (index < 0) {
726 index = 4;
727 } else {
728 tcg_debug_assert(index != TCG_REG_ESP);
731 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
732 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
735 if (len == 1) {
736 tcg_out8(s, offset);
737 } else if (len == 4) {
738 tcg_out32(s, offset);
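/* For illustration: with a plain base register and a small offset, e.g.
       tcg_out_modrm_offset(s, OPC_MOVL_GvEv, TCG_REG_EAX, TCG_REG_EBX, 8)
   this produces 8b 43 08, i.e. "movl 8(%ebx), %eax" -- a single-byte ModRM
   with mod=01 and an 8-bit displacement, no SIB byte needed.  */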
742 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
743 int index, int shift, intptr_t offset)
745 tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
746 tcg_out_sib_offset(s, r, rm, index, shift, offset);
749 static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
750 int rm, int index, int shift,
751 intptr_t offset)
753 tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
754 tcg_out_sib_offset(s, r, rm, index, shift, offset);
757 /* A simplification of the above with no index or shift. */
758 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
759 int rm, intptr_t offset)
761 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
764 static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
765 int v, int rm, intptr_t offset)
767 tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
770 /* Output an opcode with an expected reference to the constant pool. */
771 static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
773 tcg_out_opc(s, opc, r, 0, 0);
774 /* Absolute for 32-bit, pc-relative for 64-bit. */
775 tcg_out8(s, LOWREGMASK(r) << 3 | 5);
776 tcg_out32(s, 0);
779 /* Output an opcode with an expected reference to the constant pool. */
780 static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
782 tcg_out_vex_opc(s, opc, r, 0, 0, 0);
783 /* Absolute for 32-bit, pc-relative for 64-bit. */
784 tcg_out8(s, LOWREGMASK(r) << 3 | 5);
785 tcg_out32(s, 0);
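/* Note: both pool helpers above emit a ModRM byte with mod=0, rm=5 and a
   zero 32-bit displacement.  That encoding is an absolute disp32 on i386
   and rip-relative on x86_64; the zero placeholder is later patched to
   point at the constant pool entry via new_pool_label() and
   patch_reloc().  */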
788 /* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */
789 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
791 /* Propagate an opcode prefix, such as P_REXW. */
792 int ext = subop & ~0x7;
793 subop &= 0x7;
795 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
798 static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
800 int rexw = 0;
802 if (arg == ret) {
803 return;
805 switch (type) {
806 case TCG_TYPE_I64:
807 rexw = P_REXW;
808 /* fallthru */
809 case TCG_TYPE_I32:
810 if (ret < 16) {
811 if (arg < 16) {
812 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
813 } else {
814 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
816 } else {
817 if (arg < 16) {
818 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
819 } else {
820 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
823 break;
825 case TCG_TYPE_V64:
826 tcg_debug_assert(ret >= 16 && arg >= 16);
827 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
828 break;
829 case TCG_TYPE_V128:
830 tcg_debug_assert(ret >= 16 && arg >= 16);
831 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
832 break;
833 case TCG_TYPE_V256:
834 tcg_debug_assert(ret >= 16 && arg >= 16);
835 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
836 break;
838 default:
839 g_assert_not_reached();
843 static void tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
844 TCGReg r, TCGReg a)
846 if (have_avx2) {
847 static const int dup_insn[4] = {
848 OPC_VPBROADCASTB, OPC_VPBROADCASTW,
849 OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
851 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
852 tcg_out_vex_modrm(s, dup_insn[vece] + vex_l, r, 0, a);
853 } else {
854 switch (vece) {
855 case MO_8:
856 /* ??? With zero in a register, use PSHUFB. */
857 tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, 0, a);
858 a = r;
859 /* FALLTHRU */
860 case MO_16:
861 tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, 0, a);
862 a = r;
863 /* FALLTHRU */
864 case MO_32:
865 tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
866 /* imm8 operand: all output lanes selected from input lane 0. */
867 tcg_out8(s, 0);
868 break;
869 case MO_64:
870 tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, 0, a);
871 break;
872 default:
873 g_assert_not_reached();
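/* Note: without AVX2 the broadcast is built up stepwise -- PUNPCKLBW
   doubles bytes into words, PUNPCKLWD doubles words into dwords, and then
   PSHUFD (or PUNPCKLQDQ for 64-bit elements) replicates the low element
   across the vector; the switch above falls through to widen as needed.  */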
878 static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
879 TCGReg ret, tcg_target_long arg)
881 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
883 if (arg == 0) {
884 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
885 return;
887 if (arg == -1) {
888 tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
889 return;
892 if (TCG_TARGET_REG_BITS == 64) {
893 if (type == TCG_TYPE_V64) {
894 tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
895 } else if (have_avx2) {
896 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
897 } else {
898 tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
900 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
901 } else if (have_avx2) {
902 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
903 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
904 } else {
905 tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy, ret);
906 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
907 tcg_out_dup_vec(s, type, MO_32, ret, ret);
911 static void tcg_out_movi(TCGContext *s, TCGType type,
912 TCGReg ret, tcg_target_long arg)
914 tcg_target_long diff;
916 switch (type) {
917 case TCG_TYPE_I32:
918 #if TCG_TARGET_REG_BITS == 64
919 case TCG_TYPE_I64:
920 #endif
921 if (ret < 16) {
922 break;
924 /* fallthru */
925 case TCG_TYPE_V64:
926 case TCG_TYPE_V128:
927 case TCG_TYPE_V256:
928 tcg_debug_assert(ret >= 16);
929 tcg_out_dupi_vec(s, type, ret, arg);
930 return;
931 default:
932 g_assert_not_reached();
935 if (arg == 0) {
936 tgen_arithr(s, ARITH_XOR, ret, ret);
937 return;
939 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
940 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
941 tcg_out32(s, arg);
942 return;
944 if (arg == (int32_t)arg) {
945 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
946 tcg_out32(s, arg);
947 return;
950 /* Try a 7 byte pc-relative lea before the 10 byte movq. */
951 diff = arg - ((uintptr_t)s->code_ptr + 7);
952 if (diff == (int32_t)diff) {
953 tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
954 tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
955 tcg_out32(s, diff);
956 return;
959 tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
960 tcg_out64(s, arg);
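/* Note: the cases above are tried roughly in order of code size -- xor
   reg,reg for zero, "movl $imm32" for values that fit zero-extended,
   a sign-extended "movq $imm32", a 7-byte rip-relative LEA when the value
   is within +/-2GB of the code, and finally the 10-byte movabs.  */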
963 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
965 if (val == (int8_t)val) {
966 tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
967 tcg_out8(s, val);
968 } else if (val == (int32_t)val) {
969 tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
970 tcg_out32(s, val);
971 } else {
972 tcg_abort();
976 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
    /* Given the strength of x86 memory ordering, we only need to care
       about store-load ordering.  Experimentally, "lock orl $0,0(%esp)"
       is faster than "mfence", so don't bother with the sse insn.  */
981 if (a0 & TCG_MO_ST_LD) {
982 tcg_out8(s, 0xf0);
983 tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
984 tcg_out8(s, 0);
988 static inline void tcg_out_push(TCGContext *s, int reg)
990 tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
993 static inline void tcg_out_pop(TCGContext *s, int reg)
995 tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
998 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
999 TCGReg arg1, intptr_t arg2)
1001 switch (type) {
1002 case TCG_TYPE_I32:
1003 if (ret < 16) {
1004 tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1005 } else {
1006 tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1008 break;
1009 case TCG_TYPE_I64:
1010 if (ret < 16) {
1011 tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1012 break;
1014 /* FALLTHRU */
1015 case TCG_TYPE_V64:
1016 tcg_debug_assert(ret >= 16);
1017 tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1018 break;
1019 case TCG_TYPE_V128:
1020 tcg_debug_assert(ret >= 16);
1021 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx, ret, 0, arg1, arg2);
1022 break;
1023 case TCG_TYPE_V256:
1024 tcg_debug_assert(ret >= 16);
1025 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1026 ret, 0, arg1, arg2);
1027 break;
1028 default:
1029 g_assert_not_reached();
1033 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1034 TCGReg arg1, intptr_t arg2)
1036 switch (type) {
1037 case TCG_TYPE_I32:
1038 if (arg < 16) {
1039 tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1040 } else {
1041 tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1043 break;
1044 case TCG_TYPE_I64:
1045 if (arg < 16) {
1046 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1047 break;
1049 /* FALLTHRU */
1050 case TCG_TYPE_V64:
1051 tcg_debug_assert(arg >= 16);
1052 tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1053 break;
1054 case TCG_TYPE_V128:
1055 tcg_debug_assert(arg >= 16);
1056 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx, arg, 0, arg1, arg2);
1057 break;
1058 case TCG_TYPE_V256:
1059 tcg_debug_assert(arg >= 16);
1060 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1061 arg, 0, arg1, arg2);
1062 break;
1063 default:
1064 g_assert_not_reached();
1068 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1069 TCGReg base, intptr_t ofs)
1071 int rexw = 0;
1072 if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1073 if (val != (int32_t)val) {
1074 return false;
1076 rexw = P_REXW;
1077 } else if (type != TCG_TYPE_I32) {
1078 return false;
1080 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1081 tcg_out32(s, val);
1082 return true;
1085 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1087 /* Propagate an opcode prefix, such as P_DATA16. */
1088 int ext = subopc & ~0x7;
1089 subopc &= 0x7;
1091 if (count == 1) {
1092 tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1093 } else {
1094 tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1095 tcg_out8(s, count);
1099 static inline void tcg_out_bswap32(TCGContext *s, int reg)
1101 tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1104 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1106 tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1109 static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
1111 /* movzbl */
1112 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1113 tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1116 static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
1118 /* movsbl */
1119 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1120 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1123 static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
1125 /* movzwl */
1126 tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1129 static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
1131 /* movsw[lq] */
1132 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1135 static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
1137 /* 32-bit mov zero extends. */
1138 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1141 static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
1143 tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1146 static inline void tcg_out_bswap64(TCGContext *s, int reg)
1148 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1151 static void tgen_arithi(TCGContext *s, int c, int r0,
1152 tcg_target_long val, int cf)
1154 int rexw = 0;
1156 if (TCG_TARGET_REG_BITS == 64) {
1157 rexw = c & -8;
1158 c &= 7;
1161 /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1162 partial flags update stalls on Pentium4 and are not recommended
1163 by current Intel optimization manuals. */
1164 if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1165 int is_inc = (c == ARITH_ADD) ^ (val < 0);
1166 if (TCG_TARGET_REG_BITS == 64) {
1167 /* The single-byte increment encodings are re-tasked as the
1168 REX prefixes. Use the MODRM encoding. */
1169 tcg_out_modrm(s, OPC_GRP5 + rexw,
1170 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1171 } else {
1172 tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1174 return;
1177 if (c == ARITH_AND) {
1178 if (TCG_TARGET_REG_BITS == 64) {
1179 if (val == 0xffffffffu) {
1180 tcg_out_ext32u(s, r0, r0);
1181 return;
1183 if (val == (uint32_t)val) {
1184 /* AND with no high bits set can use a 32-bit operation. */
1185 rexw = 0;
1188 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1189 tcg_out_ext8u(s, r0, r0);
1190 return;
1192 if (val == 0xffffu) {
1193 tcg_out_ext16u(s, r0, r0);
1194 return;
1198 if (val == (int8_t)val) {
1199 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1200 tcg_out8(s, val);
1201 return;
1203 if (rexw == 0 || val == (int32_t)val) {
1204 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1205 tcg_out32(s, val);
1206 return;
1209 tcg_abort();
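/* Note: a few masks get special treatment above -- AND with 0xff, 0xffff
   or 0xffffffff becomes a zero-extending move -- and immediates that fit
   in a signed byte use the sign-extended imm8 form (opcode 0x83), which
   is three bytes shorter than the imm32 form (opcode 0x81).  */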
1212 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1214 if (val != 0) {
1215 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1219 /* Use SMALL != 0 to force a short forward branch. */
1220 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
1222 int32_t val, val1;
1224 if (l->has_value) {
1225 val = tcg_pcrel_diff(s, l->u.value_ptr);
1226 val1 = val - 2;
1227 if ((int8_t)val1 == val1) {
1228 if (opc == -1) {
1229 tcg_out8(s, OPC_JMP_short);
1230 } else {
1231 tcg_out8(s, OPC_JCC_short + opc);
1233 tcg_out8(s, val1);
1234 } else {
1235 if (small) {
1236 tcg_abort();
1238 if (opc == -1) {
1239 tcg_out8(s, OPC_JMP_long);
1240 tcg_out32(s, val - 5);
1241 } else {
1242 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1243 tcg_out32(s, val - 6);
1246 } else if (small) {
1247 if (opc == -1) {
1248 tcg_out8(s, OPC_JMP_short);
1249 } else {
1250 tcg_out8(s, OPC_JCC_short + opc);
1252 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1253 s->code_ptr += 1;
1254 } else {
1255 if (opc == -1) {
1256 tcg_out8(s, OPC_JMP_long);
1257 } else {
1258 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1260 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1261 s->code_ptr += 4;
1265 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1266 int const_arg2, int rexw)
1268 if (const_arg2) {
1269 if (arg2 == 0) {
1270 /* test r, r */
1271 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1272 } else {
1273 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1275 } else {
1276 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1280 static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1281 TCGArg arg1, TCGArg arg2, int const_arg2,
1282 TCGLabel *label, int small)
1284 tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1285 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1288 #if TCG_TARGET_REG_BITS == 64
1289 static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1290 TCGArg arg1, TCGArg arg2, int const_arg2,
1291 TCGLabel *label, int small)
1293 tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1294 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1296 #else
/* XXX: we implement it at the target level to avoid having to
   handle temporaries that live across basic blocks.  */
1299 static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1300 const int *const_args, int small)
1302 TCGLabel *label_next = gen_new_label();
1303 TCGLabel *label_this = arg_label(args[5]);
1305 switch(args[4]) {
1306 case TCG_COND_EQ:
1307 tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1308 label_next, 1);
1309 tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1310 label_this, small);
1311 break;
1312 case TCG_COND_NE:
1313 tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1314 label_this, small);
1315 tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1316 label_this, small);
1317 break;
1318 case TCG_COND_LT:
1319 tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1320 label_this, small);
1321 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1322 tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1323 label_this, small);
1324 break;
1325 case TCG_COND_LE:
1326 tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1327 label_this, small);
1328 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1329 tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1330 label_this, small);
1331 break;
1332 case TCG_COND_GT:
1333 tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1334 label_this, small);
1335 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1336 tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1337 label_this, small);
1338 break;
1339 case TCG_COND_GE:
1340 tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1341 label_this, small);
1342 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1343 tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1344 label_this, small);
1345 break;
1346 case TCG_COND_LTU:
1347 tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1348 label_this, small);
1349 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1350 tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1351 label_this, small);
1352 break;
1353 case TCG_COND_LEU:
1354 tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1355 label_this, small);
1356 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1357 tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1358 label_this, small);
1359 break;
1360 case TCG_COND_GTU:
1361 tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1362 label_this, small);
1363 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1364 tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1365 label_this, small);
1366 break;
1367 case TCG_COND_GEU:
1368 tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1369 label_this, small);
1370 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1371 tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1372 label_this, small);
1373 break;
1374 default:
1375 tcg_abort();
1377 tcg_out_label(s, label_next, s->code_ptr);
1379 #endif
1381 static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1382 TCGArg arg1, TCGArg arg2, int const_arg2)
1384 tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1385 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1386 tcg_out_ext8u(s, dest, dest);
1389 #if TCG_TARGET_REG_BITS == 64
1390 static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1391 TCGArg arg1, TCGArg arg2, int const_arg2)
1393 tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1394 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1395 tcg_out_ext8u(s, dest, dest);
1397 #else
1398 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1399 const int *const_args)
1401 TCGArg new_args[6];
1402 TCGLabel *label_true, *label_over;
1404 memcpy(new_args, args+1, 5*sizeof(TCGArg));
1406 if (args[0] == args[1] || args[0] == args[2]
1407 || (!const_args[3] && args[0] == args[3])
1408 || (!const_args[4] && args[0] == args[4])) {
1409 /* When the destination overlaps with one of the argument
1410 registers, don't do anything tricky. */
1411 label_true = gen_new_label();
1412 label_over = gen_new_label();
1414 new_args[5] = label_arg(label_true);
1415 tcg_out_brcond2(s, new_args, const_args+1, 1);
1417 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1418 tcg_out_jxx(s, JCC_JMP, label_over, 1);
1419 tcg_out_label(s, label_true, s->code_ptr);
1421 tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1422 tcg_out_label(s, label_over, s->code_ptr);
1423 } else {
1424 /* When the destination does not overlap one of the arguments,
1425 clear the destination first, jump if cond false, and emit an
1426 increment in the true case. This results in smaller code. */
1428 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1430 label_over = gen_new_label();
1431 new_args[4] = tcg_invert_cond(new_args[4]);
1432 new_args[5] = label_arg(label_over);
1433 tcg_out_brcond2(s, new_args, const_args+1, 1);
1435 tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1436 tcg_out_label(s, label_over, s->code_ptr);
1439 #endif
1441 static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1442 TCGReg dest, TCGReg v1)
1444 if (have_cmov) {
1445 tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1446 } else {
1447 TCGLabel *over = gen_new_label();
1448 tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1449 tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1450 tcg_out_label(s, over, s->code_ptr);
1454 static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1455 TCGReg c1, TCGArg c2, int const_c2,
1456 TCGReg v1)
1458 tcg_out_cmp(s, c1, c2, const_c2, 0);
1459 tcg_out_cmov(s, cond, 0, dest, v1);
1462 #if TCG_TARGET_REG_BITS == 64
1463 static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1464 TCGReg c1, TCGArg c2, int const_c2,
1465 TCGReg v1)
1467 tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1468 tcg_out_cmov(s, cond, P_REXW, dest, v1);
1470 #endif
1472 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1473 TCGArg arg2, bool const_a2)
1475 if (have_bmi1) {
1476 tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1477 if (const_a2) {
1478 tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1479 } else {
1480 tcg_debug_assert(dest != arg2);
1481 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1483 } else {
1484 tcg_debug_assert(dest != arg2);
1485 tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1486 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1490 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1491 TCGArg arg2, bool const_a2)
1493 if (have_lzcnt) {
1494 tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1495 if (const_a2) {
1496 tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1497 } else {
1498 tcg_debug_assert(dest != arg2);
1499 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1501 } else {
1502 tcg_debug_assert(!const_a2);
1503 tcg_debug_assert(dest != arg1);
1504 tcg_debug_assert(dest != arg2);
1506 /* Recall that the output of BSR is the index not the count. */
1507 tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1508 tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1510 /* Since we have destroyed the flags from BSR, we have to re-test. */
1511 tcg_out_cmp(s, arg1, 0, 1, rexw);
1512 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
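/* Note: BSR returns the bit index of the most significant set bit, so the
   XOR with 31 (or 63) above converts that index into a leading-zero count;
   e.g. an input of 1 gives index 0, and 0 ^ 31 = 31 leading zeros.  */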
1516 static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
1518 intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1520 if (disp == (int32_t)disp) {
1521 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1522 tcg_out32(s, disp);
1523 } else {
        /* rip-relative addressing into the constant pool.
           This is 6 + 8 = 14 bytes, as compared to using an
           immediate load of 10 + 6 = 16 bytes, plus we may
           be able to re-use the pool constant for more calls.  */
1528 tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1529 tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1530 new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1531 tcg_out32(s, 0);
1535 static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *dest)
1537 tcg_out_branch(s, 1, dest);
1540 static void tcg_out_jmp(TCGContext *s, tcg_insn_unit *dest)
1542 tcg_out_branch(s, 0, dest);
1545 static void tcg_out_nopn(TCGContext *s, int n)
1547 int i;
1548 /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1549 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1550 * duplicate prefix, and all of the interesting recent cores can
1551 * decode and discard the duplicates in a single cycle.
1553 tcg_debug_assert(n >= 1);
1554 for (i = 1; i < n; ++i) {
1555 tcg_out8(s, 0x66);
1557 tcg_out8(s, 0x90);
1560 #if defined(CONFIG_SOFTMMU)
1561 #include "tcg-ldst.inc.c"
1563 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1564 * int mmu_idx, uintptr_t ra)
1566 static void * const qemu_ld_helpers[16] = {
1567 [MO_UB] = helper_ret_ldub_mmu,
1568 [MO_LEUW] = helper_le_lduw_mmu,
1569 [MO_LEUL] = helper_le_ldul_mmu,
1570 [MO_LEQ] = helper_le_ldq_mmu,
1571 [MO_BEUW] = helper_be_lduw_mmu,
1572 [MO_BEUL] = helper_be_ldul_mmu,
1573 [MO_BEQ] = helper_be_ldq_mmu,
1576 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1577 * uintxx_t val, int mmu_idx, uintptr_t ra)
1579 static void * const qemu_st_helpers[16] = {
1580 [MO_UB] = helper_ret_stb_mmu,
1581 [MO_LEUW] = helper_le_stw_mmu,
1582 [MO_LEUL] = helper_le_stl_mmu,
1583 [MO_LEQ] = helper_le_stq_mmu,
1584 [MO_BEUW] = helper_be_stw_mmu,
1585 [MO_BEUL] = helper_be_stl_mmu,
1586 [MO_BEQ] = helper_be_stq_mmu,
1589 /* Perform the TLB load and compare.
1591 Inputs:
1592 ADDRLO and ADDRHI contain the low and high part of the address.
1594 MEM_INDEX and S_BITS are the memory context and log2 size of the load.
1596 WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1597 This should be offsetof addr_read or addr_write.
1599 Outputs:
1600 LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1601 positions of the displacements of forward jumps to the TLB miss case.
1603 Second argument register is loaded with the low part of the address.
1604 In the TLB hit case, it has been adjusted as indicated by the TLB
1605 and so is a host address. In the TLB miss case, it continues to
1606 hold a guest address.
1608 First argument register is clobbered. */
1610 static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1611 int mem_index, TCGMemOp opc,
1612 tcg_insn_unit **label_ptr, int which)
1614 const TCGReg r0 = TCG_REG_L0;
1615 const TCGReg r1 = TCG_REG_L1;
1616 TCGType ttype = TCG_TYPE_I32;
1617 TCGType tlbtype = TCG_TYPE_I32;
1618 int trexw = 0, hrexw = 0, tlbrexw = 0;
1619 unsigned a_bits = get_alignment_bits(opc);
1620 unsigned s_bits = opc & MO_SIZE;
1621 unsigned a_mask = (1 << a_bits) - 1;
1622 unsigned s_mask = (1 << s_bits) - 1;
1623 target_ulong tlb_mask;
1625 if (TCG_TARGET_REG_BITS == 64) {
1626 if (TARGET_LONG_BITS == 64) {
1627 ttype = TCG_TYPE_I64;
1628 trexw = P_REXW;
1630 if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1631 hrexw = P_REXW;
1632 if (TARGET_PAGE_BITS + CPU_TLB_BITS > 32) {
1633 tlbtype = TCG_TYPE_I64;
1634 tlbrexw = P_REXW;
1639 tcg_out_mov(s, tlbtype, r0, addrlo);
1640 /* If the required alignment is at least as large as the access, simply
1641 copy the address and mask. For lesser alignments, check that we don't
1642 cross pages for the complete access. */
1643 if (a_bits >= s_bits) {
1644 tcg_out_mov(s, ttype, r1, addrlo);
1645 } else {
1646 tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1648 tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1650 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1651 TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1653 tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1654 tgen_arithi(s, ARITH_AND + tlbrexw, r0,
1655 (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);
1657 tcg_out_modrm_sib_offset(s, OPC_LEA + hrexw, r0, TCG_AREG0, r0, 0,
1658 offsetof(CPUArchState, tlb_table[mem_index][0])
1659 + which);
1661 /* cmp 0(r0), r1 */
1662 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, 0);
    /* Prepare for both the fast path add of the tlb addend, and the slow
       path function argument setup.  There are two cases worth noting:
       for a 32-bit guest and x86_64 host, MOVL zero-extends the guest
       address before the fastpath ADDQ below; for a 64-bit guest and x32
       host, MOVQ copies the entire guest address for the slow path, while
       truncation for the 32-bit host happens with the fastpath ADDL below.  */
1670 tcg_out_mov(s, ttype, r1, addrlo);
1672 /* jne slow_path */
1673 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1674 label_ptr[0] = s->code_ptr;
1675 s->code_ptr += 4;
1677 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1678 /* cmp 4(r0), addrhi */
1679 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, 4);
1681 /* jne slow_path */
1682 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1683 label_ptr[1] = s->code_ptr;
1684 s->code_ptr += 4;
1687 /* TLB Hit. */
1689 /* add addend(r0), r1 */
1690 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1691 offsetof(CPUTLBEntry, addend) - which);
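/* Summary of the fast-path sequence emitted above: copy the guest address
   into both scratch registers, shift one copy right to form the TLB index,
   mask the other copy with the page/alignment mask, mask the index into
   range, LEA the address of the CPUTLBEntry, compare the masked address
   against the stored tag (and, for a wide guest address, the high word),
   and branch to the slow path on mismatch; on a hit, add the entry's
   addend to turn the guest address into a host address.  */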
1695 * Record the context of a call to the out of line helper code for the slow path
1696 * for a load or store, so that we can later generate the correct helper code
1698 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
1699 TCGReg datalo, TCGReg datahi,
1700 TCGReg addrlo, TCGReg addrhi,
1701 tcg_insn_unit *raddr,
1702 tcg_insn_unit **label_ptr)
1704 TCGLabelQemuLdst *label = new_ldst_label(s);
1706 label->is_ld = is_ld;
1707 label->oi = oi;
1708 label->datalo_reg = datalo;
1709 label->datahi_reg = datahi;
1710 label->addrlo_reg = addrlo;
1711 label->addrhi_reg = addrhi;
1712 label->raddr = raddr;
1713 label->label_ptr[0] = label_ptr[0];
1714 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1715 label->label_ptr[1] = label_ptr[1];
1720 * Generate code for the slow path for a load at the end of block
1722 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1724 TCGMemOpIdx oi = l->oi;
1725 TCGMemOp opc = get_memop(oi);
1726 TCGReg data_reg;
1727 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1729 /* resolve label address */
1730 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1731 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1732 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1735 if (TCG_TARGET_REG_BITS == 32) {
1736 int ofs = 0;
1738 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1739 ofs += 4;
1741 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1742 ofs += 4;
1744 if (TARGET_LONG_BITS == 64) {
1745 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1746 ofs += 4;
1749 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1750 ofs += 4;
1752 tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1753 } else {
1754 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1755 /* The second argument is already loaded with addrlo. */
1756 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1757 tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1758 (uintptr_t)l->raddr);
1761 tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1763 data_reg = l->datalo_reg;
1764 switch (opc & MO_SSIZE) {
1765 case MO_SB:
1766 tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
1767 break;
1768 case MO_SW:
1769 tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
1770 break;
1771 #if TCG_TARGET_REG_BITS == 64
1772 case MO_SL:
1773 tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1774 break;
1775 #endif
1776 case MO_UB:
1777 case MO_UW:
1778 /* Note that the helpers have zero-extended to tcg_target_long. */
1779 case MO_UL:
1780 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1781 break;
1782 case MO_Q:
1783 if (TCG_TARGET_REG_BITS == 64) {
1784 tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1785 } else if (data_reg == TCG_REG_EDX) {
1786 /* xchg %edx, %eax */
1787 tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1788 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1789 } else {
1790 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1791 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1793 break;
1794 default:
1795 tcg_abort();
    /* Jump back to the code following the qemu_ld.  */
1799 tcg_out_jmp(s, l->raddr);
1803 * Generate code for the slow path for a store at the end of block
1805 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1807 TCGMemOpIdx oi = l->oi;
1808 TCGMemOp opc = get_memop(oi);
1809 TCGMemOp s_bits = opc & MO_SIZE;
1810 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1811 TCGReg retaddr;
1813 /* resolve label address */
1814 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1815 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1816 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1819 if (TCG_TARGET_REG_BITS == 32) {
1820 int ofs = 0;
1822 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1823 ofs += 4;
1825 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1826 ofs += 4;
1828 if (TARGET_LONG_BITS == 64) {
1829 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1830 ofs += 4;
1833 tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1834 ofs += 4;
1836 if (s_bits == MO_64) {
1837 tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1838 ofs += 4;
1841 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1842 ofs += 4;
1844 retaddr = TCG_REG_EAX;
1845 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1846 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1847 } else {
1848 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1849 /* The second argument is already loaded with addrlo. */
1850 tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1851 tcg_target_call_iarg_regs[2], l->datalo_reg);
1852 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1854 if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1855 retaddr = tcg_target_call_iarg_regs[4];
1856 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1857 } else {
1858 retaddr = TCG_REG_RAX;
1859 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1860 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1861 TCG_TARGET_CALL_STACK_OFFSET);
1865 /* "Tail call" to the helper, with the return address back inline. */
1866 tcg_out_push(s, retaddr);
1867 tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
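/* Note: pushing the return address and then jumping means the helper's own
   RET transfers control straight back to l->raddr, as if the helper had
   been called from the original code site -- hence the "tail call" with
   the return address placed back inline.  */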
1869 #elif defined(__x86_64__) && defined(__linux__)
1870 # include <asm/prctl.h>
1871 # include <sys/prctl.h>
1873 int arch_prctl(int code, unsigned long addr);
1875 static int guest_base_flags;
1876 static inline void setup_guest_base_seg(void)
1878 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1879 guest_base_flags = P_GS;
1882 #else
1883 # define guest_base_flags 0
1884 static inline void setup_guest_base_seg(void) { }
1885 #endif /* SOFTMMU */
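/* Note: in x86_64 Linux user-mode builds, a successful
   arch_prctl(ARCH_SET_GS, guest_base) lets the qemu_ld/st fast path use a
   %gs segment override (P_GS) instead of adding guest_base explicitly;
   if the call fails, guest_base_flags stays 0 and guest_base is folded
   into the addressing mode as usual.  */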
1887 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1888 TCGReg base, int index, intptr_t ofs,
1889 int seg, TCGMemOp memop)
1891 const TCGMemOp real_bswap = memop & MO_BSWAP;
1892 TCGMemOp bswap = real_bswap;
1893 int movop = OPC_MOVL_GvEv;
1895 if (have_movbe && real_bswap) {
1896 bswap = 0;
1897 movop = OPC_MOVBE_GyMy;
1900 switch (memop & MO_SSIZE) {
1901 case MO_UB:
1902 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
1903 base, index, 0, ofs);
1904 break;
1905 case MO_SB:
1906 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + P_REXW + seg, datalo,
1907 base, index, 0, ofs);
1908 break;
1909 case MO_UW:
1910 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1911 base, index, 0, ofs);
1912 if (real_bswap) {
1913 tcg_out_rolw_8(s, datalo);
1915 break;
1916 case MO_SW:
1917 if (real_bswap) {
1918 if (have_movbe) {
1919 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1920 datalo, base, index, 0, ofs);
1921 } else {
1922 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1923 base, index, 0, ofs);
1924 tcg_out_rolw_8(s, datalo);
1926 tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
1927 } else {
1928 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + P_REXW + seg,
1929 datalo, base, index, 0, ofs);
1931 break;
1932 case MO_UL:
1933 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
1934 if (bswap) {
1935 tcg_out_bswap32(s, datalo);
1937 break;
1938 #if TCG_TARGET_REG_BITS == 64
1939 case MO_SL:
1940 if (real_bswap) {
1941 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1942 base, index, 0, ofs);
1943 if (bswap) {
1944 tcg_out_bswap32(s, datalo);
1946 tcg_out_ext32s(s, datalo, datalo);
1947 } else {
1948 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
1949 base, index, 0, ofs);
1951 break;
1952 #endif
1953 case MO_Q:
1954 if (TCG_TARGET_REG_BITS == 64) {
1955 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
1956 base, index, 0, ofs);
1957 if (bswap) {
1958 tcg_out_bswap64(s, datalo);
1960 } else {
1961 if (real_bswap) {
1962 int t = datalo;
1963 datalo = datahi;
1964 datahi = t;
1966 if (base != datalo) {
1967 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1968 base, index, 0, ofs);
1969 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
1970 base, index, 0, ofs + 4);
1971 } else {
1972 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
1973 base, index, 0, ofs + 4);
1974 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1975 base, index, 0, ofs);
1977 if (bswap) {
1978 tcg_out_bswap32(s, datalo);
1979 tcg_out_bswap32(s, datahi);
1982 break;
1983 default:
1984 tcg_abort();
1988 /* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
1989 EAX. It will be useful once fixed-register globals are less
1990 common. */
1991 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
1993 TCGReg datalo, datahi, addrlo;
1994 TCGReg addrhi __attribute__((unused));
1995 TCGMemOpIdx oi;
1996 TCGMemOp opc;
1997 #if defined(CONFIG_SOFTMMU)
1998 int mem_index;
1999 tcg_insn_unit *label_ptr[2];
2000 #endif
2002 datalo = *args++;
2003 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2004 addrlo = *args++;
2005 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2006 oi = *args++;
2007 opc = get_memop(oi);
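    /* The TCGMemOpIdx packs the access size/sign/endianness together with
       the mmu index; get_memop/get_mmuidx pull the pieces back out.  */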
2009 #if defined(CONFIG_SOFTMMU)
2010 mem_index = get_mmuidx(oi);
2012 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2013 label_ptr, offsetof(CPUTLBEntry, addr_read));
2015 /* TLB Hit. */
2016 tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2018 /* Record the current context of a load into ldst label */
2019 add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
2020 s->code_ptr, label_ptr);
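    /* The label records the data/address registers and the return address
       so that tcg_out_qemu_ld_slow_path can call the helper on a TLB miss.  */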
2021 #else
2023 int32_t offset = guest_base;
2024 TCGReg base = addrlo;
2025 int index = -1;
2026 int seg = 0;
2028 /* For a 32-bit guest, the high 32 bits of the address register may
2029 contain garbage. We can ignore them via the ADDR32 prefix if we're
2030 not using a guest base, or when using segmentation. Otherwise we
2031 need to zero-extend manually. */
2032 if (guest_base == 0 || guest_base_flags) {
2033 seg = guest_base_flags;
2034 offset = 0;
2035 if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
2036 seg |= P_ADDR32;
2038 } else if (TCG_TARGET_REG_BITS == 64) {
2039 if (TARGET_LONG_BITS == 32) {
2040 tcg_out_ext32u(s, TCG_REG_L0, base);
2041 base = TCG_REG_L0;
2043 if (offset != guest_base) {
2044 tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
2045 index = TCG_REG_L1;
2046 offset = 0;
2050 tcg_out_qemu_ld_direct(s, datalo, datahi,
2051 base, index, offset, seg, opc);
2053 #endif
2056 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2057 TCGReg base, intptr_t ofs, int seg,
2058 TCGMemOp memop)
2060 /* ??? Ideally we wouldn't need a scratch register. For user-only,
2061 we could perform the bswap twice to restore the original value
2062 instead of moving to the scratch. But as it is, the L constraint
2063 means that TCG_REG_L0 is definitely free here. */
2064 const TCGReg scratch = TCG_REG_L0;
2065 const TCGMemOp real_bswap = memop & MO_BSWAP;
2066 TCGMemOp bswap = real_bswap;
2067 int movop = OPC_MOVL_EvGv;
2069 if (have_movbe && real_bswap) {
2070 bswap = 0;
2071 movop = OPC_MOVBE_MyGy;
2074 switch (memop & MO_SIZE) {
2075 case MO_8:
2076 /* In 32-bit mode, 8-bit stores can only happen from [abcd]x.
2077 Use the scratch register if necessary. */
2078 if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
2079 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2080 datalo = scratch;
2082 tcg_out_modrm_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2083 datalo, base, ofs);
2084 break;
2085 case MO_16:
2086 if (bswap) {
2087 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2088 tcg_out_rolw_8(s, scratch);
2089 datalo = scratch;
2091 tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
2092 break;
2093 case MO_32:
2094 if (bswap) {
2095 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2096 tcg_out_bswap32(s, scratch);
2097 datalo = scratch;
2099 tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
2100 break;
2101 case MO_64:
2102 if (TCG_TARGET_REG_BITS == 64) {
2103 if (bswap) {
2104 tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo);
2105 tcg_out_bswap64(s, scratch);
2106 datalo = scratch;
2108 tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
2109 } else if (bswap) {
2110 tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
2111 tcg_out_bswap32(s, scratch);
2112 tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs);
2113 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2114 tcg_out_bswap32(s, scratch);
2115 tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4);
2116 } else {
2117 if (real_bswap) {
2118 int t = datalo;
2119 datalo = datahi;
2120 datahi = t;
2122 tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
2123 tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
2125 break;
2126 default:
2127 tcg_abort();
2131 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2133 TCGReg datalo, datahi, addrlo;
2134 TCGReg addrhi __attribute__((unused));
2135 TCGMemOpIdx oi;
2136 TCGMemOp opc;
2137 #if defined(CONFIG_SOFTMMU)
2138 int mem_index;
2139 tcg_insn_unit *label_ptr[2];
2140 #endif
2142 datalo = *args++;
2143 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2144 addrlo = *args++;
2145 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2146 oi = *args++;
2147 opc = get_memop(oi);
2149 #if defined(CONFIG_SOFTMMU)
2150 mem_index = get_mmuidx(oi);
2152 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2153 label_ptr, offsetof(CPUTLBEntry, addr_write));
2155 /* TLB Hit. */
2156 tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, 0, 0, opc);
2158 /* Record the current context of a store into ldst label */
2159 add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
2160 s->code_ptr, label_ptr);
2161 #else
2163 int32_t offset = guest_base;
2164 TCGReg base = addrlo;
2165 int seg = 0;
2167 /* See comment in tcg_out_qemu_ld re zero-extension of addrlo. */
2168 if (guest_base == 0 || guest_base_flags) {
2169 seg = guest_base_flags;
2170 offset = 0;
2171 if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
2172 seg |= P_ADDR32;
2174 } else if (TCG_TARGET_REG_BITS == 64) {
2175 /* ??? Note that we can't use the same SIB addressing scheme
2176 as for loads, since we require L0 free for bswap. */
2177 if (offset != guest_base) {
2178 if (TARGET_LONG_BITS == 32) {
2179 tcg_out_ext32u(s, TCG_REG_L0, base);
2180 base = TCG_REG_L0;
2182 tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
2183 tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
2184 base = TCG_REG_L1;
2185 offset = 0;
2186 } else if (TARGET_LONG_BITS == 32) {
2187 tcg_out_ext32u(s, TCG_REG_L1, base);
2188 base = TCG_REG_L1;
2192 tcg_out_qemu_st_direct(s, datalo, datahi, base, offset, seg, opc);
2194 #endif
2197 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2198 const TCGArg *args, const int *const_args)
2200 TCGArg a0, a1, a2;
2201 int c, const_a2, vexop, rexw = 0;
2203 #if TCG_TARGET_REG_BITS == 64
2204 # define OP_32_64(x) \
2205 case glue(glue(INDEX_op_, x), _i64): \
2206 rexw = P_REXW; /* FALLTHRU */ \
2207 case glue(glue(INDEX_op_, x), _i32)
2208 #else
2209 # define OP_32_64(x) \
2210 case glue(glue(INDEX_op_, x), _i32)
2211 #endif
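/* OP_32_64(x) expands to the _i32 case label and, on 64-bit hosts, also the
   _i64 label with rexw set, so one emitter handles both operand widths.  */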
2213 /* Hoist the loads of the most common arguments. */
2214 a0 = args[0];
2215 a1 = args[1];
2216 a2 = args[2];
2217 const_a2 = const_args[2];
2219 switch (opc) {
2220 case INDEX_op_exit_tb:
2221 /* Reuse the zeroing that exists for goto_ptr. */
2222 if (a0 == 0) {
2223 tcg_out_jmp(s, s->code_gen_epilogue);
2224 } else {
2225 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2226 tcg_out_jmp(s, tb_ret_addr);
2228 break;
2229 case INDEX_op_goto_tb:
2230 if (s->tb_jmp_insn_offset) {
2231 /* direct jump method */
2232 int gap;
2233 /* jump displacement must be aligned for atomic patching;
2234 * see if we need to add extra nops before jump
2236 gap = tcg_pcrel_diff(s, QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4));
2237 if (gap != 1) {
2238 tcg_out_nopn(s, gap - 1);
2240 tcg_out8(s, OPC_JMP_long); /* jmp im */
2241 s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2242 tcg_out32(s, 0);
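            /* The displacement was emitted as zero; it is patched when this
               TB is later chained to its successor.  */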
2243 } else {
2244 /* indirect jump method */
2245 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2246 (intptr_t)(s->tb_jmp_target_addr + a0));
2248 s->tb_jmp_reset_offset[a0] = tcg_current_code_size(s);
2249 break;
2250 case INDEX_op_goto_ptr:
2251 /* jmp to the given host address (could be epilogue) */
2252 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2253 break;
2254 case INDEX_op_br:
2255 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2256 break;
2257 OP_32_64(ld8u):
2258 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
2259 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2260 break;
2261 OP_32_64(ld8s):
2262 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2263 break;
2264 OP_32_64(ld16u):
2265 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
2266 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2267 break;
2268 OP_32_64(ld16s):
2269 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2270 break;
2271 #if TCG_TARGET_REG_BITS == 64
2272 case INDEX_op_ld32u_i64:
2273 #endif
2274 case INDEX_op_ld_i32:
2275 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2276 break;
2278 OP_32_64(st8):
2279 if (const_args[0]) {
2280 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2281 tcg_out8(s, a0);
2282 } else {
2283 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2285 break;
2286 OP_32_64(st16):
2287 if (const_args[0]) {
2288 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2289 tcg_out16(s, a0);
2290 } else {
2291 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2293 break;
2294 #if TCG_TARGET_REG_BITS == 64
2295 case INDEX_op_st32_i64:
2296 #endif
2297 case INDEX_op_st_i32:
2298 if (const_args[0]) {
2299 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2300 tcg_out32(s, a0);
2301 } else {
2302 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2304 break;
2306 OP_32_64(add):
2307 /* For 3-operand addition, use LEA. */
2308 if (a0 != a1) {
2309 TCGArg c3 = 0;
2310 if (const_a2) {
2311 c3 = a2, a2 = -1;
2312 } else if (a0 == a2) {
2313 /* Watch out for dest = src + dest, since we've removed
2314 the matching constraint on the add. */
2315 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2316 break;
2319 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2320 break;
2322 c = ARITH_ADD;
2323 goto gen_arith;
2324 OP_32_64(sub):
2325 c = ARITH_SUB;
2326 goto gen_arith;
2327 OP_32_64(and):
2328 c = ARITH_AND;
2329 goto gen_arith;
2330 OP_32_64(or):
2331 c = ARITH_OR;
2332 goto gen_arith;
2333 OP_32_64(xor):
2334 c = ARITH_XOR;
2335 goto gen_arith;
2336 gen_arith:
2337 if (const_a2) {
2338 tgen_arithi(s, c + rexw, a0, a2, 0);
2339 } else {
2340 tgen_arithr(s, c + rexw, a0, a2);
2342 break;
2344 OP_32_64(andc):
2345 if (const_a2) {
2346 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2347 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2348 } else {
2349 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2351 break;
2353 OP_32_64(mul):
2354 if (const_a2) {
2355 int32_t val;
2356 val = a2;
2357 if (val == (int8_t)val) {
2358 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2359 tcg_out8(s, val);
2360 } else {
2361 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2362 tcg_out32(s, val);
2364 } else {
2365 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2367 break;
2369 OP_32_64(div2):
2370 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2371 break;
2372 OP_32_64(divu2):
2373 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2374 break;
2376 OP_32_64(shl):
2377 /* For small constant 3-operand shift, use LEA. */
2378 if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2379 if (a2 - 1 == 0) {
2380 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2381 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2382 } else {
2383 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2384 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2386 break;
2388 c = SHIFT_SHL;
2389 vexop = OPC_SHLX;
2390 goto gen_shift_maybe_vex;
2391 OP_32_64(shr):
2392 c = SHIFT_SHR;
2393 vexop = OPC_SHRX;
2394 goto gen_shift_maybe_vex;
2395 OP_32_64(sar):
2396 c = SHIFT_SAR;
2397 vexop = OPC_SARX;
2398 goto gen_shift_maybe_vex;
2399 OP_32_64(rotl):
2400 c = SHIFT_ROL;
2401 goto gen_shift;
2402 OP_32_64(rotr):
2403 c = SHIFT_ROR;
2404 goto gen_shift;
2405 gen_shift_maybe_vex:
2406 if (have_bmi2) {
2407 if (!const_a2) {
2408 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2409 break;
2411 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2413 /* FALLTHRU */
2414 gen_shift:
2415 if (const_a2) {
2416 tcg_out_shifti(s, c + rexw, a0, a2);
2417 } else {
2418 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2420 break;
2422 OP_32_64(ctz):
2423 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2424 break;
2425 OP_32_64(clz):
2426 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2427 break;
2428 OP_32_64(ctpop):
2429 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2430 break;
2432 case INDEX_op_brcond_i32:
2433 tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2434 break;
2435 case INDEX_op_setcond_i32:
2436 tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2437 break;
2438 case INDEX_op_movcond_i32:
2439 tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2440 break;
2442 OP_32_64(bswap16):
2443 tcg_out_rolw_8(s, a0);
2444 break;
2445 OP_32_64(bswap32):
2446 tcg_out_bswap32(s, a0);
2447 break;
2449 OP_32_64(neg):
2450 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2451 break;
2452 OP_32_64(not):
2453 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2454 break;
2456 OP_32_64(ext8s):
2457 tcg_out_ext8s(s, a0, a1, rexw);
2458 break;
2459 OP_32_64(ext16s):
2460 tcg_out_ext16s(s, a0, a1, rexw);
2461 break;
2462 OP_32_64(ext8u):
2463 tcg_out_ext8u(s, a0, a1);
2464 break;
2465 OP_32_64(ext16u):
2466 tcg_out_ext16u(s, a0, a1);
2467 break;
2469 case INDEX_op_qemu_ld_i32:
2470 tcg_out_qemu_ld(s, args, 0);
2471 break;
2472 case INDEX_op_qemu_ld_i64:
2473 tcg_out_qemu_ld(s, args, 1);
2474 break;
2475 case INDEX_op_qemu_st_i32:
2476 tcg_out_qemu_st(s, args, 0);
2477 break;
2478 case INDEX_op_qemu_st_i64:
2479 tcg_out_qemu_st(s, args, 1);
2480 break;
2482 OP_32_64(mulu2):
2483 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2484 break;
2485 OP_32_64(muls2):
2486 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2487 break;
2488 OP_32_64(add2):
2489 if (const_args[4]) {
2490 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2491 } else {
2492 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2494 if (const_args[5]) {
2495 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2496 } else {
2497 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2499 break;
2500 OP_32_64(sub2):
2501 if (const_args[4]) {
2502 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2503 } else {
2504 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2506 if (const_args[5]) {
2507 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2508 } else {
2509 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2511 break;
2513 #if TCG_TARGET_REG_BITS == 32
2514 case INDEX_op_brcond2_i32:
2515 tcg_out_brcond2(s, args, const_args, 0);
2516 break;
2517 case INDEX_op_setcond2_i32:
2518 tcg_out_setcond2(s, args, const_args);
2519 break;
2520 #else /* TCG_TARGET_REG_BITS == 64 */
2521 case INDEX_op_ld32s_i64:
2522 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2523 break;
2524 case INDEX_op_ld_i64:
2525 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2526 break;
2527 case INDEX_op_st_i64:
2528 if (const_args[0]) {
2529 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2530 tcg_out32(s, a0);
2531 } else {
2532 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2534 break;
2536 case INDEX_op_brcond_i64:
2537 tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2538 break;
2539 case INDEX_op_setcond_i64:
2540 tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2541 break;
2542 case INDEX_op_movcond_i64:
2543 tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2544 break;
2546 case INDEX_op_bswap64_i64:
2547 tcg_out_bswap64(s, a0);
2548 break;
2549 case INDEX_op_extu_i32_i64:
2550 case INDEX_op_ext32u_i64:
2551 tcg_out_ext32u(s, a0, a1);
2552 break;
2553 case INDEX_op_ext_i32_i64:
2554 case INDEX_op_ext32s_i64:
2555 tcg_out_ext32s(s, a0, a1);
2556 break;
2557 #endif
2559 OP_32_64(deposit):
2560 if (args[3] == 0 && args[4] == 8) {
2561 /* load bits 0..7 */
2562 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2563 } else if (args[3] == 8 && args[4] == 8) {
2564 /* load bits 8..15 */
2565 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2566 } else if (args[3] == 0 && args[4] == 16) {
2567 /* load bits 0..15 */
2568 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2569 } else {
2570 tcg_abort();
2572 break;
2574 case INDEX_op_extract_i64:
2575 if (a2 + args[3] == 32) {
2576 /* This is a 32-bit zero-extending right shift. */
2577 tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2578 tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2579 break;
2581 /* FALLTHRU */
2582 case INDEX_op_extract_i32:
2583 /* Use the high-byte registers when we can; otherwise emit the
2584 same ext16 + shift pattern that we would have gotten from the
2585 normal tcg-op.c expansion. */
2586 tcg_debug_assert(a2 == 8 && args[3] == 8);
2587 if (a1 < 4 && a0 < 8) {
2588 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2589 } else {
2590 tcg_out_ext16u(s, a0, a1);
2591 tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2593 break;
2595 case INDEX_op_sextract_i32:
2596 /* We don't implement sextract_i64, as we cannot sign-extend to
2597 64-bits without using the REX prefix that explicitly excludes
2598 access to the high-byte registers. */
2599 tcg_debug_assert(a2 == 8 && args[3] == 8);
2600 if (a1 < 4 && a0 < 8) {
2601 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2602 } else {
2603 tcg_out_ext16s(s, a0, a1, 0);
2604 tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2606 break;
2608 case INDEX_op_mb:
2609 tcg_out_mb(s, a0);
2610 break;
2611 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
2612 case INDEX_op_mov_i64:
2613 case INDEX_op_mov_vec:
2614 case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
2615 case INDEX_op_movi_i64:
2616 case INDEX_op_dupi_vec:
2617 case INDEX_op_call: /* Always emitted via tcg_out_call. */
2618 default:
2619 tcg_abort();
2622 #undef OP_32_64
2625 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2626 unsigned vecl, unsigned vece,
2627 const TCGArg *args, const int *const_args)
2629 static int const add_insn[4] = {
2630 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2632 static int const sub_insn[4] = {
2633 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2635 static int const mul_insn[4] = {
2636 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
2638 static int const shift_imm_insn[4] = {
2639 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2641 static int const cmpeq_insn[4] = {
2642 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2644 static int const cmpgt_insn[4] = {
2645 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2647 static int const punpckl_insn[4] = {
2648 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2650 static int const punpckh_insn[4] = {
2651 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2653 static int const packss_insn[4] = {
2654 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2656 static int const packus_insn[4] = {
2657 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
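    /* OPC_UD2 marks element sizes with no direct SSE encoding; gen_simd
       asserts below that such an entry is never selected.  */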
2660 TCGType type = vecl + TCG_TYPE_V64;
2661 int insn, sub;
2662 TCGArg a0, a1, a2;
2664 a0 = args[0];
2665 a1 = args[1];
2666 a2 = args[2];
2668 switch (opc) {
2669 case INDEX_op_add_vec:
2670 insn = add_insn[vece];
2671 goto gen_simd;
2672 case INDEX_op_sub_vec:
2673 insn = sub_insn[vece];
2674 goto gen_simd;
2675 case INDEX_op_mul_vec:
2676 insn = mul_insn[vece];
2677 goto gen_simd;
2678 case INDEX_op_and_vec:
2679 insn = OPC_PAND;
2680 goto gen_simd;
2681 case INDEX_op_or_vec:
2682 insn = OPC_POR;
2683 goto gen_simd;
2684 case INDEX_op_xor_vec:
2685 insn = OPC_PXOR;
2686 goto gen_simd;
2687 case INDEX_op_x86_punpckl_vec:
2688 insn = punpckl_insn[vece];
2689 goto gen_simd;
2690 case INDEX_op_x86_punpckh_vec:
2691 insn = punpckh_insn[vece];
2692 goto gen_simd;
2693 case INDEX_op_x86_packss_vec:
2694 insn = packss_insn[vece];
2695 goto gen_simd;
2696 case INDEX_op_x86_packus_vec:
2697 insn = packus_insn[vece];
2698 goto gen_simd;
2699 gen_simd:
2700 tcg_debug_assert(insn != OPC_UD2);
2701 if (type == TCG_TYPE_V256) {
2702 insn |= P_VEXL;
2704 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2705 break;
2707 case INDEX_op_cmp_vec:
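        /* Only EQ and GT reach the backend; tcg_expand_vec_op rewrites all
           other conditions in terms of these two.  */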
2708 sub = args[3];
2709 if (sub == TCG_COND_EQ) {
2710 insn = cmpeq_insn[vece];
2711 } else if (sub == TCG_COND_GT) {
2712 insn = cmpgt_insn[vece];
2713 } else {
2714 g_assert_not_reached();
2716 goto gen_simd;
2718 case INDEX_op_andc_vec:
2719 insn = OPC_PANDN;
2720 if (type == TCG_TYPE_V256) {
2721 insn |= P_VEXL;
2723 tcg_out_vex_modrm(s, insn, a0, a2, a1);
2724 break;
2726 case INDEX_op_shli_vec:
2727 sub = 6;
2728 goto gen_shift;
2729 case INDEX_op_shri_vec:
2730 sub = 2;
2731 goto gen_shift;
2732 case INDEX_op_sari_vec:
2733 tcg_debug_assert(vece != MO_64);
2734 sub = 4;
2735 gen_shift:
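        /* The immediate shifts share one opcode group; 'sub' is the ModRM
           extension: /2 logical right, /4 arithmetic right, /6 left.  */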
2736 tcg_debug_assert(vece != MO_8);
2737 insn = shift_imm_insn[vece];
2738 if (type == TCG_TYPE_V256) {
2739 insn |= P_VEXL;
2741 tcg_out_vex_modrm(s, insn, sub, a0, a1);
2742 tcg_out8(s, a2);
2743 break;
2745 case INDEX_op_ld_vec:
2746 tcg_out_ld(s, type, a0, a1, a2);
2747 break;
2748 case INDEX_op_st_vec:
2749 tcg_out_st(s, type, a0, a1, a2);
2750 break;
2751 case INDEX_op_dup_vec:
2752 tcg_out_dup_vec(s, type, vece, a0, a1);
2753 break;
2755 case INDEX_op_x86_shufps_vec:
2756 insn = OPC_SHUFPS;
2757 sub = args[3];
2758 goto gen_simd_imm8;
2759 case INDEX_op_x86_blend_vec:
2760 if (vece == MO_16) {
2761 insn = OPC_PBLENDW;
2762 } else if (vece == MO_32) {
2763 insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2764 } else {
2765 g_assert_not_reached();
2767 sub = args[3];
2768 goto gen_simd_imm8;
2769 case INDEX_op_x86_vperm2i128_vec:
2770 insn = OPC_VPERM2I128;
2771 sub = args[3];
2772 goto gen_simd_imm8;
2773 gen_simd_imm8:
2774 if (type == TCG_TYPE_V256) {
2775 insn |= P_VEXL;
2777 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2778 tcg_out8(s, sub);
2779 break;
2781 case INDEX_op_x86_vpblendvb_vec:
2782 insn = OPC_VPBLENDVB;
2783 if (type == TCG_TYPE_V256) {
2784 insn |= P_VEXL;
2786 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2787 tcg_out8(s, args[3] << 4);
2788 break;
2790 case INDEX_op_x86_psrldq_vec:
2791 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
2792 tcg_out8(s, a2);
2793 break;
2795 default:
2796 g_assert_not_reached();
2800 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
2802 static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
2803 static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
2804 static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
2805 static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
2806 static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
2807 static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
2808 static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
2809 static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
2810 static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
2811 static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
2812 static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
2813 static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
2814 static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
2815 static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
2816 static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
2817 static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
2818 static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
2819 static const TCGTargetOpDef r_r_L_L
2820 = { .args_ct_str = { "r", "r", "L", "L" } };
2821 static const TCGTargetOpDef L_L_L_L
2822 = { .args_ct_str = { "L", "L", "L", "L" } };
2823 static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
2824 static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
2825 static const TCGTargetOpDef x_x_x_x
2826 = { .args_ct_str = { "x", "x", "x", "x" } };
2827 static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };
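    /* Reminder of the constraint letters used below: 'r' any GPR, 'q' a GPR
       with a byte encoding, 'L' GPRs usable for qemu_ld/st addresses, 'x' a
       vector register, 'a'/'d' fixed EAX/EDX, 'e' a signed 32-bit immediate,
       'i' any immediate, and '0'/'1' alias an earlier operand.  */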
2829 switch (op) {
2830 case INDEX_op_goto_ptr:
2831 return &r;
2833 case INDEX_op_ld8u_i32:
2834 case INDEX_op_ld8u_i64:
2835 case INDEX_op_ld8s_i32:
2836 case INDEX_op_ld8s_i64:
2837 case INDEX_op_ld16u_i32:
2838 case INDEX_op_ld16u_i64:
2839 case INDEX_op_ld16s_i32:
2840 case INDEX_op_ld16s_i64:
2841 case INDEX_op_ld_i32:
2842 case INDEX_op_ld32u_i64:
2843 case INDEX_op_ld32s_i64:
2844 case INDEX_op_ld_i64:
2845 return &r_r;
2847 case INDEX_op_st8_i32:
2848 case INDEX_op_st8_i64:
2849 return &qi_r;
2850 case INDEX_op_st16_i32:
2851 case INDEX_op_st16_i64:
2852 case INDEX_op_st_i32:
2853 case INDEX_op_st32_i64:
2854 return &ri_r;
2855 case INDEX_op_st_i64:
2856 return &re_r;
2858 case INDEX_op_add_i32:
2859 case INDEX_op_add_i64:
2860 return &r_r_re;
2861 case INDEX_op_sub_i32:
2862 case INDEX_op_sub_i64:
2863 case INDEX_op_mul_i32:
2864 case INDEX_op_mul_i64:
2865 case INDEX_op_or_i32:
2866 case INDEX_op_or_i64:
2867 case INDEX_op_xor_i32:
2868 case INDEX_op_xor_i64:
2869 return &r_0_re;
2871 case INDEX_op_and_i32:
2872 case INDEX_op_and_i64:
2874 static const TCGTargetOpDef and
2875 = { .args_ct_str = { "r", "0", "reZ" } };
2876 return &and;
2878 break;
2879 case INDEX_op_andc_i32:
2880 case INDEX_op_andc_i64:
2882 static const TCGTargetOpDef andc
2883 = { .args_ct_str = { "r", "r", "rI" } };
2884 return &andc;
2886 break;
2888 case INDEX_op_shl_i32:
2889 case INDEX_op_shl_i64:
2890 case INDEX_op_shr_i32:
2891 case INDEX_op_shr_i64:
2892 case INDEX_op_sar_i32:
2893 case INDEX_op_sar_i64:
2894 return have_bmi2 ? &r_r_ri : &r_0_ci;
2895 case INDEX_op_rotl_i32:
2896 case INDEX_op_rotl_i64:
2897 case INDEX_op_rotr_i32:
2898 case INDEX_op_rotr_i64:
2899 return &r_0_ci;
2901 case INDEX_op_brcond_i32:
2902 case INDEX_op_brcond_i64:
2903 return &r_re;
2905 case INDEX_op_bswap16_i32:
2906 case INDEX_op_bswap16_i64:
2907 case INDEX_op_bswap32_i32:
2908 case INDEX_op_bswap32_i64:
2909 case INDEX_op_bswap64_i64:
2910 case INDEX_op_neg_i32:
2911 case INDEX_op_neg_i64:
2912 case INDEX_op_not_i32:
2913 case INDEX_op_not_i64:
2914 return &r_0;
2916 case INDEX_op_ext8s_i32:
2917 case INDEX_op_ext8s_i64:
2918 case INDEX_op_ext8u_i32:
2919 case INDEX_op_ext8u_i64:
2920 return &r_q;
2921 case INDEX_op_ext16s_i32:
2922 case INDEX_op_ext16s_i64:
2923 case INDEX_op_ext16u_i32:
2924 case INDEX_op_ext16u_i64:
2925 case INDEX_op_ext32s_i64:
2926 case INDEX_op_ext32u_i64:
2927 case INDEX_op_ext_i32_i64:
2928 case INDEX_op_extu_i32_i64:
2929 case INDEX_op_extract_i32:
2930 case INDEX_op_extract_i64:
2931 case INDEX_op_sextract_i32:
2932 case INDEX_op_ctpop_i32:
2933 case INDEX_op_ctpop_i64:
2934 return &r_r;
2936 case INDEX_op_deposit_i32:
2937 case INDEX_op_deposit_i64:
2939 static const TCGTargetOpDef dep
2940 = { .args_ct_str = { "Q", "0", "Q" } };
2941 return &dep;
2943 case INDEX_op_setcond_i32:
2944 case INDEX_op_setcond_i64:
2946 static const TCGTargetOpDef setc
2947 = { .args_ct_str = { "q", "r", "re" } };
2948 return &setc;
2950 case INDEX_op_movcond_i32:
2951 case INDEX_op_movcond_i64:
2953 static const TCGTargetOpDef movc
2954 = { .args_ct_str = { "r", "r", "re", "r", "0" } };
2955 return &movc;
2957 case INDEX_op_div2_i32:
2958 case INDEX_op_div2_i64:
2959 case INDEX_op_divu2_i32:
2960 case INDEX_op_divu2_i64:
2962 static const TCGTargetOpDef div2
2963 = { .args_ct_str = { "a", "d", "0", "1", "r" } };
2964 return &div2;
2966 case INDEX_op_mulu2_i32:
2967 case INDEX_op_mulu2_i64:
2968 case INDEX_op_muls2_i32:
2969 case INDEX_op_muls2_i64:
2971 static const TCGTargetOpDef mul2
2972 = { .args_ct_str = { "a", "d", "a", "r" } };
2973 return &mul2;
2975 case INDEX_op_add2_i32:
2976 case INDEX_op_add2_i64:
2977 case INDEX_op_sub2_i32:
2978 case INDEX_op_sub2_i64:
2980 static const TCGTargetOpDef arith2
2981 = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
2982 return &arith2;
2984 case INDEX_op_ctz_i32:
2985 case INDEX_op_ctz_i64:
2987 static const TCGTargetOpDef ctz[2] = {
2988 { .args_ct_str = { "&r", "r", "r" } },
2989 { .args_ct_str = { "&r", "r", "rW" } },
2991 return &ctz[have_bmi1];
2993 case INDEX_op_clz_i32:
2994 case INDEX_op_clz_i64:
2996 static const TCGTargetOpDef clz[2] = {
2997 { .args_ct_str = { "&r", "r", "r" } },
2998 { .args_ct_str = { "&r", "r", "rW" } },
3000 return &clz[have_lzcnt];
3003 case INDEX_op_qemu_ld_i32:
3004 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
3005 case INDEX_op_qemu_st_i32:
3006 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
3007 case INDEX_op_qemu_ld_i64:
3008 return (TCG_TARGET_REG_BITS == 64 ? &r_L
3009 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
3010 : &r_r_L_L);
3011 case INDEX_op_qemu_st_i64:
3012 return (TCG_TARGET_REG_BITS == 64 ? &L_L
3013 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
3014 : &L_L_L_L);
3016 case INDEX_op_brcond2_i32:
3018 static const TCGTargetOpDef b2
3019 = { .args_ct_str = { "r", "r", "ri", "ri" } };
3020 return &b2;
3022 case INDEX_op_setcond2_i32:
3024 static const TCGTargetOpDef s2
3025 = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
3026 return &s2;
3029 case INDEX_op_ld_vec:
3030 case INDEX_op_st_vec:
3031 return &x_r;
3033 case INDEX_op_add_vec:
3034 case INDEX_op_sub_vec:
3035 case INDEX_op_mul_vec:
3036 case INDEX_op_and_vec:
3037 case INDEX_op_or_vec:
3038 case INDEX_op_xor_vec:
3039 case INDEX_op_andc_vec:
3040 case INDEX_op_cmp_vec:
3041 case INDEX_op_x86_shufps_vec:
3042 case INDEX_op_x86_blend_vec:
3043 case INDEX_op_x86_packss_vec:
3044 case INDEX_op_x86_packus_vec:
3045 case INDEX_op_x86_vperm2i128_vec:
3046 case INDEX_op_x86_punpckl_vec:
3047 case INDEX_op_x86_punpckh_vec:
3048 return &x_x_x;
3049 case INDEX_op_dup_vec:
3050 case INDEX_op_shli_vec:
3051 case INDEX_op_shri_vec:
3052 case INDEX_op_sari_vec:
3053 case INDEX_op_x86_psrldq_vec:
3054 return &x_x;
3055 case INDEX_op_x86_vpblendvb_vec:
3056 return &x_x_x_x;
3058 default:
3059 break;
3061 return NULL;
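/* Return 1 if the operation is supported natively for this type and element
   size, 0 if it is not, and -1 if it can be implemented via expansion in
   tcg_expand_vec_op below.  */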
3064 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3066 switch (opc) {
3067 case INDEX_op_add_vec:
3068 case INDEX_op_sub_vec:
3069 case INDEX_op_and_vec:
3070 case INDEX_op_or_vec:
3071 case INDEX_op_xor_vec:
3072 case INDEX_op_andc_vec:
3073 return 1;
3074 case INDEX_op_cmp_vec:
3075 return -1;
3077 case INDEX_op_shli_vec:
3078 case INDEX_op_shri_vec:
3079 /* We must expand the operation for MO_8. */
3080 return vece == MO_8 ? -1 : 1;
3082 case INDEX_op_sari_vec:
3083 /* We must expand the operation for MO_8. */
3084 if (vece == MO_8) {
3085 return -1;
3087 /* We can emulate this for MO_64, but it does not pay off
3088 unless we're producing at least 4 values. */
3089 if (vece == MO_64) {
3090 return type >= TCG_TYPE_V256 ? -1 : 0;
3092 return 1;
3094 case INDEX_op_mul_vec:
3095 if (vece == MO_8) {
3096 /* We can expand the operation for MO_8. */
3097 return -1;
3099 if (vece == MO_64) {
3100 return 0;
3102 return 1;
3104 default:
3105 return 0;
3109 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3110 TCGArg a0, ...)
3112 va_list va;
3113 TCGArg a1, a2;
3114 TCGv_vec v0, t1, t2, t3, t4;
3116 va_start(va, a0);
3117 v0 = temp_tcgv_vec(arg_temp(a0));
3119 switch (opc) {
3120 case INDEX_op_shli_vec:
3121 case INDEX_op_shri_vec:
3122 tcg_debug_assert(vece == MO_8);
3123 a1 = va_arg(va, TCGArg);
3124 a2 = va_arg(va, TCGArg);
3125 /* Unpack to W, shift, and repack. Tricky bits:
3126 (1) Use punpck*bw x,x to produce DDCCBBAA,
3127 i.e. duplicate in other half of the 16-bit lane.
3128 (2) For right-shift, add 8 so that the high half of
3129 the lane becomes zero. For left-shift, we must
3130 shift up and down again.
3131 (3) Step 2 leaves high half zero such that PACKUSWB
3132 (pack with unsigned saturation) does not modify
3133 the quantity. */
3134 t1 = tcg_temp_new_vec(type);
3135 t2 = tcg_temp_new_vec(type);
3136 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3137 tcgv_vec_arg(t1), a1, a1);
3138 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3139 tcgv_vec_arg(t2), a1, a1);
3140 if (opc == INDEX_op_shri_vec) {
3141 vec_gen_3(INDEX_op_shri_vec, type, MO_16,
3142 tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
3143 vec_gen_3(INDEX_op_shri_vec, type, MO_16,
3144 tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
3145 } else {
3146 vec_gen_3(INDEX_op_shli_vec, type, MO_16,
3147 tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
3148 vec_gen_3(INDEX_op_shli_vec, type, MO_16,
3149 tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
3150 vec_gen_3(INDEX_op_shri_vec, type, MO_16,
3151 tcgv_vec_arg(t1), tcgv_vec_arg(t1), 8);
3152 vec_gen_3(INDEX_op_shri_vec, type, MO_16,
3153 tcgv_vec_arg(t2), tcgv_vec_arg(t2), 8);
3155 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3156 a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3157 tcg_temp_free_vec(t1);
3158 tcg_temp_free_vec(t2);
3159 break;
3161 case INDEX_op_sari_vec:
3162 a1 = va_arg(va, TCGArg);
3163 a2 = va_arg(va, TCGArg);
3164 if (vece == MO_8) {
3165 /* Unpack to W, shift, and repack, as above. */
3166 t1 = tcg_temp_new_vec(type);
3167 t2 = tcg_temp_new_vec(type);
3168 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3169 tcgv_vec_arg(t1), a1, a1);
3170 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3171 tcgv_vec_arg(t2), a1, a1);
3172 vec_gen_3(INDEX_op_sari_vec, type, MO_16,
3173 tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
3174 vec_gen_3(INDEX_op_sari_vec, type, MO_16,
3175 tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
3176 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3177 a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3178 tcg_temp_free_vec(t1);
3179 tcg_temp_free_vec(t2);
3180 break;
3182 tcg_debug_assert(vece == MO_64);
3183 /* MO_64: If the shift is <= 32, we can emulate the sign extend by
3184 performing an arithmetic 32-bit shift and overwriting the high
3185 half of the result (note that the ISA says shift of 32 is valid). */
3186 if (a2 <= 32) {
3187 t1 = tcg_temp_new_vec(type);
3188 vec_gen_3(INDEX_op_sari_vec, type, MO_32, tcgv_vec_arg(t1), a1, a2);
3189 vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
3190 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3191 a0, a0, tcgv_vec_arg(t1), 0xaa);
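            /* The 0xaa mask selects the odd 32-bit elements, i.e. the high
               half of each 64-bit lane, from the arithmetically shifted t1.  */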
3192 tcg_temp_free_vec(t1);
3193 break;
3195 /* Otherwise we will need to use a compare vs 0 to produce the
3196 sign-extend, shift and merge. */
3197 t1 = tcg_temp_new_vec(type);
3198 t2 = tcg_const_zeros_vec(type);
3199 vec_gen_4(INDEX_op_cmp_vec, type, MO_64,
3200 tcgv_vec_arg(t1), tcgv_vec_arg(t2), a1, TCG_COND_GT);
3201 tcg_temp_free_vec(t2);
3202 vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
3203 vec_gen_3(INDEX_op_shli_vec, type, MO_64,
3204 tcgv_vec_arg(t1), tcgv_vec_arg(t1), 64 - a2);
3205 vec_gen_3(INDEX_op_or_vec, type, MO_64, a0, a0, tcgv_vec_arg(t1));
3206 tcg_temp_free_vec(t1);
3207 break;
3209 case INDEX_op_mul_vec:
3210 tcg_debug_assert(vece == MO_8);
3211 a1 = va_arg(va, TCGArg);
3212 a2 = va_arg(va, TCGArg);
3213 switch (type) {
3214 case TCG_TYPE_V64:
3215 t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3216 t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3217 tcg_gen_dup16i_vec(t2, 0);
3218 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3219 tcgv_vec_arg(t1), a1, tcgv_vec_arg(t2));
3220 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3221 tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2);
3222 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3223 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3224 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3225 a0, tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3226 tcg_temp_free_vec(t1);
3227 tcg_temp_free_vec(t2);
3228 break;
3230 case TCG_TYPE_V128:
3231 t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3232 t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3233 t3 = tcg_temp_new_vec(TCG_TYPE_V128);
3234 t4 = tcg_temp_new_vec(TCG_TYPE_V128);
3235 tcg_gen_dup16i_vec(t4, 0);
3236 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3237 tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
3238 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3239 tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
3240 vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
3241 tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
3242 vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
3243 tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
3244 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3245 tcg_gen_mul_vec(MO_16, t3, t3, t4);
3246 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3247 tcg_gen_shri_vec(MO_16, t3, t3, 8);
3248 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3249 a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3250 tcg_temp_free_vec(t1);
3251 tcg_temp_free_vec(t2);
3252 tcg_temp_free_vec(t3);
3253 tcg_temp_free_vec(t4);
3254 break;
3256 case TCG_TYPE_V256:
3257 t1 = tcg_temp_new_vec(TCG_TYPE_V256);
3258 t2 = tcg_temp_new_vec(TCG_TYPE_V256);
3259 t3 = tcg_temp_new_vec(TCG_TYPE_V256);
3260 t4 = tcg_temp_new_vec(TCG_TYPE_V256);
3261 tcg_gen_dup16i_vec(t4, 0);
3262 /* a1: A[0-7] ... D[0-7]; a2: W[0-7] ... Z[0-7]
3263 t1: extends of B[0-7], D[0-7]
3264 t2: extends of X[0-7], Z[0-7]
3265 t3: extends of A[0-7], C[0-7]
3266 t4: extends of W[0-7], Y[0-7]. */
3267 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
3268 tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
3269 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
3270 tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
3271 vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
3272 tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
3273 vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
3274 tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
3275 /* t1: BX DZ; t2: AW CY. */
3276 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3277 tcg_gen_mul_vec(MO_16, t3, t3, t4);
3278 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3279 tcg_gen_shri_vec(MO_16, t3, t3, 8);
3280 /* a0: AW BX CY DZ. */
3281 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V256, MO_8,
3282 a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3283 tcg_temp_free_vec(t1);
3284 tcg_temp_free_vec(t2);
3285 tcg_temp_free_vec(t3);
3286 tcg_temp_free_vec(t4);
3287 break;
3289 default:
3290 g_assert_not_reached();
3292 break;
3294 case INDEX_op_cmp_vec:
3296 enum {
3297 NEED_SWAP = 1,
3298 NEED_INV = 2,
3299 NEED_BIAS = 4
3301 static const uint8_t fixups[16] = {
3302 [0 ... 15] = -1,
3303 [TCG_COND_EQ] = 0,
3304 [TCG_COND_NE] = NEED_INV,
3305 [TCG_COND_GT] = 0,
3306 [TCG_COND_LT] = NEED_SWAP,
3307 [TCG_COND_LE] = NEED_INV,
3308 [TCG_COND_GE] = NEED_SWAP | NEED_INV,
3309 [TCG_COND_GTU] = NEED_BIAS,
3310 [TCG_COND_LTU] = NEED_BIAS | NEED_SWAP,
3311 [TCG_COND_LEU] = NEED_BIAS | NEED_INV,
3312 [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV,
3315 TCGCond cond;
3316 uint8_t fixup;
3318 a1 = va_arg(va, TCGArg);
3319 a2 = va_arg(va, TCGArg);
3320 cond = va_arg(va, TCGArg);
3321 fixup = fixups[cond & 15];
3322 tcg_debug_assert(fixup != 0xff);
3324 if (fixup & NEED_INV) {
3325 cond = tcg_invert_cond(cond);
3327 if (fixup & NEED_SWAP) {
3328 TCGArg t;
3329 t = a1, a1 = a2, a2 = t;
3330 cond = tcg_swap_cond(cond);
3333 t1 = t2 = NULL;
3334 if (fixup & NEED_BIAS) {
3335 t1 = tcg_temp_new_vec(type);
3336 t2 = tcg_temp_new_vec(type);
3337 tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
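        /* Subtracting the sign bit from both operands turns the unsigned
           comparison into the equivalent signed one that PCMPGT provides.  */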
3338 tcg_gen_sub_vec(vece, t1, temp_tcgv_vec(arg_temp(a1)), t2);
3339 tcg_gen_sub_vec(vece, t2, temp_tcgv_vec(arg_temp(a2)), t2);
3340 a1 = tcgv_vec_arg(t1);
3341 a2 = tcgv_vec_arg(t2);
3342 cond = tcg_signed_cond(cond);
3345 tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3346 vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond);
3348 if (fixup & NEED_BIAS) {
3349 tcg_temp_free_vec(t1);
3350 tcg_temp_free_vec(t2);
3352 if (fixup & NEED_INV) {
3353 tcg_gen_not_vec(vece, v0, v0);
3356 break;
3358 default:
3359 break;
3362 va_end(va);
3365 static const int tcg_target_callee_save_regs[] = {
3366 #if TCG_TARGET_REG_BITS == 64
3367 TCG_REG_RBP,
3368 TCG_REG_RBX,
3369 #if defined(_WIN64)
3370 TCG_REG_RDI,
3371 TCG_REG_RSI,
3372 #endif
3373 TCG_REG_R12,
3374 TCG_REG_R13,
3375 TCG_REG_R14, /* Currently used for the global env. */
3376 TCG_REG_R15,
3377 #else
3378 TCG_REG_EBP, /* Currently used for the global env. */
3379 TCG_REG_EBX,
3380 TCG_REG_ESI,
3381 TCG_REG_EDI,
3382 #endif
3385 /* Compute frame size via macros, to share between tcg_target_qemu_prologue
3386 and tcg_register_jit. */
3388 #define PUSH_SIZE \
3389 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
3390 * (TCG_TARGET_REG_BITS / 8))
3392 #define FRAME_SIZE \
3393 ((PUSH_SIZE \
3394 + TCG_STATIC_CALL_ARGS_SIZE \
3395 + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3396 + TCG_TARGET_STACK_ALIGN - 1) \
3397 & ~(TCG_TARGET_STACK_ALIGN - 1))
3399 /* Generate global QEMU prologue and epilogue code */
3400 static void tcg_target_qemu_prologue(TCGContext *s)
3402 int i, stack_addend;
3404 /* TB prologue */
3406 /* Reserve some stack space, also for TCG temps. */
3407 stack_addend = FRAME_SIZE - PUSH_SIZE;
3408 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3409 CPU_TEMP_BUF_NLONGS * sizeof(long));
3411 /* Save all callee saved registers. */
3412 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3413 tcg_out_push(s, tcg_target_callee_save_regs[i]);
3416 #if TCG_TARGET_REG_BITS == 32
3417 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3418 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3419 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3420 /* jmp *tb. */
3421 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3422 (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3423 + stack_addend);
3424 #else
3425 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3426 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3427 /* jmp *tb. */
3428 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
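    /* On 64-bit hosts env arrived in the first argument register and the TB
       code pointer in the second; the jump above enters the translated code
       directly.  */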
3429 #endif
3432 * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3433 * and fall through to the rest of the epilogue.
3435 s->code_gen_epilogue = s->code_ptr;
3436 tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3438 /* TB epilogue */
3439 tb_ret_addr = s->code_ptr;
3441 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
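    /* Clear the upper YMM state before returning into code that may be
       compiled for SSE only, avoiding the AVX/SSE transition penalty.  */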
3443 if (have_avx2) {
3444 tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3446 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3447 tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3449 tcg_out_opc(s, OPC_RET, 0, 0, 0);
3451 #if !defined(CONFIG_SOFTMMU)
3452 /* Try to set up a segment register to point to guest_base. */
3453 if (guest_base) {
3454 setup_guest_base_seg();
3456 #endif
3459 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3461 memset(p, 0x90, count);
3464 static void tcg_target_init(TCGContext *s)
3466 #ifdef CONFIG_CPUID_H
3467 unsigned a, b, c, d, b7 = 0;
3468 int max = __get_cpuid_max(0, 0);
3470 if (max >= 7) {
3471 /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */
3472 __cpuid_count(7, 0, a, b7, c, d);
3473 have_bmi1 = (b7 & bit_BMI) != 0;
3474 have_bmi2 = (b7 & bit_BMI2) != 0;
3477 if (max >= 1) {
3478 __cpuid(1, a, b, c, d);
3479 #ifndef have_cmov
3480 /* For 32-bit, 99% certainty that we're running on hardware that
3481 supports cmov, but we still need to check. In case cmov is not
3482 available, we'll use a small forward branch. */
3483 have_cmov = (d & bit_CMOV) != 0;
3484 #endif
3486 /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
3487 need to probe for it. */
3488 have_movbe = (c & bit_MOVBE) != 0;
3489 have_popcnt = (c & bit_POPCNT) != 0;
3491 /* There are a number of things we must check before we can be
3492 sure of not hitting invalid opcode. */
3493 if (c & bit_OSXSAVE) {
3494 unsigned xcrl, xcrh;
3495 asm ("xgetbv" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
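        /* The OS must have enabled saving of both XMM (bit 1) and YMM
           (bit 2) state in XCR0 before AVX instructions may be used.  */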
3496 if ((xcrl & 6) == 6) {
3497 have_avx1 = (c & bit_AVX) != 0;
3498 have_avx2 = (b7 & bit_AVX2) != 0;
3503 max = __get_cpuid_max(0x80000000, 0);
3504 if (max >= 1) {
3505 __cpuid(0x80000001, a, b, c, d);
3506 /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs. */
3507 have_lzcnt = (c & bit_LZCNT) != 0;
3509 #endif /* CONFIG_CPUID_H */
3511 tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3512 if (TCG_TARGET_REG_BITS == 64) {
3513 tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3515 if (have_avx1) {
3516 tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3517 tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3519 if (have_avx2) {
3520 tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3523 tcg_target_call_clobber_regs = 0;
3524 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3525 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3526 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3527 if (TCG_TARGET_REG_BITS == 64) {
3528 #if !defined(_WIN64)
3529 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3530 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3531 #endif
3532 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3533 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3534 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3535 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3538 s->reserved_regs = 0;
3539 tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
3542 typedef struct {
3543 DebugFrameHeader h;
3544 uint8_t fde_def_cfa[4];
3545 uint8_t fde_reg_ofs[14];
3546 } DebugFrame;
3548 /* We're expecting a 2-byte uleb128 encoded value. */
3549 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
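/* The DebugFrame below is a minimal DWARF unwind description of the
   prologue's stack layout, registered with the debugger via tcg_register_jit.  */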
3551 #if !defined(__ELF__)
3552 /* Host machine without ELF. */
3553 #elif TCG_TARGET_REG_BITS == 64
3554 #define ELF_HOST_MACHINE EM_X86_64
3555 static const DebugFrame debug_frame = {
3556 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3557 .h.cie.id = -1,
3558 .h.cie.version = 1,
3559 .h.cie.code_align = 1,
3560 .h.cie.data_align = 0x78, /* sleb128 -8 */
3561 .h.cie.return_column = 16,
3563 /* Total FDE size does not include the "len" member. */
3564 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3566 .fde_def_cfa = {
3567 12, 7, /* DW_CFA_def_cfa %rsp, ... */
3568 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
3569 (FRAME_SIZE >> 7)
3571 .fde_reg_ofs = {
3572 0x90, 1, /* DW_CFA_offset, %rip, -8 */
3573 /* The following ordering must match tcg_target_callee_save_regs. */
3574 0x86, 2, /* DW_CFA_offset, %rbp, -16 */
3575 0x83, 3, /* DW_CFA_offset, %rbx, -24 */
3576 0x8c, 4, /* DW_CFA_offset, %r12, -32 */
3577 0x8d, 5, /* DW_CFA_offset, %r13, -40 */
3578 0x8e, 6, /* DW_CFA_offset, %r14, -48 */
3579 0x8f, 7, /* DW_CFA_offset, %r15, -56 */
3582 #else
3583 #define ELF_HOST_MACHINE EM_386
3584 static const DebugFrame debug_frame = {
3585 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3586 .h.cie.id = -1,
3587 .h.cie.version = 1,
3588 .h.cie.code_align = 1,
3589 .h.cie.data_align = 0x7c, /* sleb128 -4 */
3590 .h.cie.return_column = 8,
3592 /* Total FDE size does not include the "len" member. */
3593 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3595 .fde_def_cfa = {
3596 12, 4, /* DW_CFA_def_cfa %esp, ... */
3597 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
3598 (FRAME_SIZE >> 7)
3600 .fde_reg_ofs = {
3601 0x88, 1, /* DW_CFA_offset, %eip, -4 */
3602 /* The following ordering must match tcg_target_callee_save_regs. */
3603 0x85, 2, /* DW_CFA_offset, %ebp, -8 */
3604 0x83, 3, /* DW_CFA_offset, %ebx, -12 */
3605 0x86, 4, /* DW_CFA_offset, %esi, -16 */
3606 0x87, 5, /* DW_CFA_offset, %edi, -20 */
3609 #endif
3611 #if defined(ELF_HOST_MACHINE)
3612 void tcg_register_jit(void *buf, size_t buf_size)
3614 tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
3616 #endif