tcg/i386/tcg-target.inc.c
1 /*
2 * Tiny Code Generator for QEMU
4 * Copyright (c) 2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
25 #include "tcg-be-ldst.h"
27 #ifdef CONFIG_DEBUG_TCG
28 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
29 #if TCG_TARGET_REG_BITS == 64
30 "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
31 "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
32 #else
33 "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
34 #endif
36 #endif
38 static const int tcg_target_reg_alloc_order[] = {
39 #if TCG_TARGET_REG_BITS == 64
40 TCG_REG_RBP,
41 TCG_REG_RBX,
42 TCG_REG_R12,
43 TCG_REG_R13,
44 TCG_REG_R14,
45 TCG_REG_R15,
46 TCG_REG_R10,
47 TCG_REG_R11,
48 TCG_REG_R9,
49 TCG_REG_R8,
50 TCG_REG_RCX,
51 TCG_REG_RDX,
52 TCG_REG_RSI,
53 TCG_REG_RDI,
54 TCG_REG_RAX,
55 #else
56 TCG_REG_EBX,
57 TCG_REG_ESI,
58 TCG_REG_EDI,
59 TCG_REG_EBP,
60 TCG_REG_ECX,
61 TCG_REG_EDX,
62 TCG_REG_EAX,
63 #endif
66 static const int tcg_target_call_iarg_regs[] = {
67 #if TCG_TARGET_REG_BITS == 64
68 #if defined(_WIN64)
69 TCG_REG_RCX,
70 TCG_REG_RDX,
71 #else
72 TCG_REG_RDI,
73 TCG_REG_RSI,
74 TCG_REG_RDX,
75 TCG_REG_RCX,
76 #endif
77 TCG_REG_R8,
78 TCG_REG_R9,
79 #else
80 /* 32 bit mode uses stack based calling convention (GCC default). */
81 #endif
84 static const int tcg_target_call_oarg_regs[] = {
85 TCG_REG_EAX,
86 #if TCG_TARGET_REG_BITS == 32
87 TCG_REG_EDX
88 #endif
91 /* Constants we accept. */
92 #define TCG_CT_CONST_S32 0x100
93 #define TCG_CT_CONST_U32 0x200
94 #define TCG_CT_CONST_I32 0x400
95 #define TCG_CT_CONST_WSZ 0x800
97 /* Registers used with L constraint, which are the first argument
98 registers on x86_64, and two random call clobbered registers on
99 i386. */
100 #if TCG_TARGET_REG_BITS == 64
101 # define TCG_REG_L0 tcg_target_call_iarg_regs[0]
102 # define TCG_REG_L1 tcg_target_call_iarg_regs[1]
103 #else
104 # define TCG_REG_L0 TCG_REG_EAX
105 # define TCG_REG_L1 TCG_REG_EDX
106 #endif
108 /* The host compiler should supply <cpuid.h> to enable runtime features
109 detection, as we're not going to go so far as our own inline assembly.
110 If not available, default values will be assumed. */
111 #if defined(CONFIG_CPUID_H)
112 #include <cpuid.h>
113 #endif
115 /* For 32-bit, we are going to attempt to determine at runtime whether cmov
116 is available. */
117 #if TCG_TARGET_REG_BITS == 64
118 # define have_cmov 1
119 #elif defined(CONFIG_CPUID_H) && defined(bit_CMOV)
120 static bool have_cmov;
121 #else
122 # define have_cmov 0
123 #endif
125 /* If bit_MOVBE is defined in cpuid.h (added in GCC version 4.6), we are
126 going to attempt to determine at runtime whether movbe is available. */
127 #if defined(CONFIG_CPUID_H) && defined(bit_MOVBE)
128 static bool have_movbe;
129 #else
130 # define have_movbe 0
131 #endif
133 /* We need these symbols in tcg-target.h, and we can't properly conditionalize
134 it there. Therefore we always define the variable. */
135 bool have_bmi1;
136 bool have_popcnt;
138 #if defined(CONFIG_CPUID_H) && defined(bit_BMI2)
139 static bool have_bmi2;
140 #else
141 # define have_bmi2 0
142 #endif
143 #if defined(CONFIG_CPUID_H) && defined(bit_LZCNT)
144 static bool have_lzcnt;
145 #else
146 # define have_lzcnt 0
147 #endif
149 static tcg_insn_unit *tb_ret_addr;
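/* Apply a relocation recorded against CODE_PTR: patch in the 8- or 32-bit
   pc-relative displacement required to reach VALUE + ADDEND, aborting if
   the displacement does not fit. */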
 151 static void patch_reloc(tcg_insn_unit *code_ptr, int type,
 152 intptr_t value, intptr_t addend)
 153 {
 154 value += addend;
 155 switch (type) {
 156 case R_386_PC32:
 157 value -= (uintptr_t)code_ptr;
 158 if (value != (int32_t)value) {
 159 tcg_abort();
 160 }
 161 tcg_patch32(code_ptr, value);
 162 break;
 163 case R_386_PC8:
 164 value -= (uintptr_t)code_ptr;
 165 if (value != (int8_t)value) {
 166 tcg_abort();
 167 }
 168 tcg_patch8(code_ptr, value);
 169 break;
 170 default:
 171 tcg_abort();
 172 }
 173 }
175 /* parse target specific constraints */
176 static const char *target_parse_constraint(TCGArgConstraint *ct,
177 const char *ct_str, TCGType type)
179 switch(*ct_str++) {
180 case 'a':
181 ct->ct |= TCG_CT_REG;
182 tcg_regset_set_reg(ct->u.regs, TCG_REG_EAX);
183 break;
184 case 'b':
185 ct->ct |= TCG_CT_REG;
186 tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
187 break;
188 case 'c':
189 ct->ct |= TCG_CT_REG;
190 tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
191 break;
192 case 'd':
193 ct->ct |= TCG_CT_REG;
194 tcg_regset_set_reg(ct->u.regs, TCG_REG_EDX);
195 break;
196 case 'S':
197 ct->ct |= TCG_CT_REG;
198 tcg_regset_set_reg(ct->u.regs, TCG_REG_ESI);
199 break;
200 case 'D':
201 ct->ct |= TCG_CT_REG;
202 tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
203 break;
204 case 'q':
205 ct->ct |= TCG_CT_REG;
206 if (TCG_TARGET_REG_BITS == 64) {
207 tcg_regset_set32(ct->u.regs, 0, 0xffff);
208 } else {
209 tcg_regset_set32(ct->u.regs, 0, 0xf);
211 break;
212 case 'Q':
213 ct->ct |= TCG_CT_REG;
214 tcg_regset_set32(ct->u.regs, 0, 0xf);
215 break;
216 case 'r':
217 ct->ct |= TCG_CT_REG;
218 if (TCG_TARGET_REG_BITS == 64) {
219 tcg_regset_set32(ct->u.regs, 0, 0xffff);
220 } else {
221 tcg_regset_set32(ct->u.regs, 0, 0xff);
223 break;
224 case 'W':
225 /* With TZCNT/LZCNT, we can have operand-size as an input. */
226 ct->ct |= TCG_CT_CONST_WSZ;
227 break;
229 /* qemu_ld/st address constraint */
230 case 'L':
231 ct->ct |= TCG_CT_REG;
232 if (TCG_TARGET_REG_BITS == 64) {
233 tcg_regset_set32(ct->u.regs, 0, 0xffff);
234 } else {
235 tcg_regset_set32(ct->u.regs, 0, 0xff);
237 tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
238 tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
239 break;
241 case 'e':
242 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
243 break;
244 case 'Z':
245 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
246 break;
247 case 'I':
248 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
249 break;
251 default:
252 return NULL;
254 return ct_str;
 257 /* test if a constant matches the constraint */
 258 static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 259 const TCGArgConstraint *arg_ct)
 260 {
 261 int ct = arg_ct->ct;
 262 if (ct & TCG_CT_CONST) {
 263 return 1;
 264 }
 265 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
 266 return 1;
 267 }
 268 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
 269 return 1;
 270 }
 271 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
 272 return 1;
 273 }
 274 if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
 275 return 1;
 276 }
 277 return 0;
 278 }
280 #if TCG_TARGET_REG_BITS == 64
281 # define LOWREGMASK(x) ((x) & 7)
282 #else
283 # define LOWREGMASK(x) (x)
284 #endif
286 #define P_EXT 0x100 /* 0x0f opcode prefix */
287 #define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */
288 #define P_DATA16 0x400 /* 0x66 opcode prefix */
289 #if TCG_TARGET_REG_BITS == 64
290 # define P_ADDR32 0x800 /* 0x67 opcode prefix */
291 # define P_REXW 0x1000 /* Set REX.W = 1 */
292 # define P_REXB_R 0x2000 /* REG field as byte register */
293 # define P_REXB_RM 0x4000 /* R/M field as byte register */
294 # define P_GS 0x8000 /* gs segment override */
295 #else
296 # define P_ADDR32 0
297 # define P_REXW 0
298 # define P_REXB_R 0
299 # define P_REXB_RM 0
300 # define P_GS 0
301 #endif
302 #define P_SIMDF3 0x10000 /* 0xf3 opcode prefix */
303 #define P_SIMDF2 0x20000 /* 0xf2 opcode prefix */
305 #define OPC_ARITH_EvIz (0x81)
306 #define OPC_ARITH_EvIb (0x83)
307 #define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */
308 #define OPC_ANDN (0xf2 | P_EXT38)
309 #define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3))
310 #define OPC_BSF (0xbc | P_EXT)
311 #define OPC_BSR (0xbd | P_EXT)
312 #define OPC_BSWAP (0xc8 | P_EXT)
313 #define OPC_CALL_Jz (0xe8)
314 #define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */
315 #define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3))
316 #define OPC_DEC_r32 (0x48)
317 #define OPC_IMUL_GvEv (0xaf | P_EXT)
318 #define OPC_IMUL_GvEvIb (0x6b)
319 #define OPC_IMUL_GvEvIz (0x69)
320 #define OPC_INC_r32 (0x40)
321 #define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */
322 #define OPC_JCC_short (0x70) /* ... plus condition code */
323 #define OPC_JMP_long (0xe9)
324 #define OPC_JMP_short (0xeb)
325 #define OPC_LEA (0x8d)
326 #define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3)
327 #define OPC_MOVB_EvGv (0x88) /* stores, more or less */
328 #define OPC_MOVL_EvGv (0x89) /* stores, more or less */
329 #define OPC_MOVL_GvEv (0x8b) /* loads, more or less */
330 #define OPC_MOVB_EvIz (0xc6)
331 #define OPC_MOVL_EvIz (0xc7)
332 #define OPC_MOVL_Iv (0xb8)
333 #define OPC_MOVBE_GyMy (0xf0 | P_EXT38)
334 #define OPC_MOVBE_MyGy (0xf1 | P_EXT38)
335 #define OPC_MOVSBL (0xbe | P_EXT)
336 #define OPC_MOVSWL (0xbf | P_EXT)
337 #define OPC_MOVSLQ (0x63 | P_REXW)
338 #define OPC_MOVZBL (0xb6 | P_EXT)
339 #define OPC_MOVZWL (0xb7 | P_EXT)
340 #define OPC_POP_r32 (0x58)
341 #define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3)
342 #define OPC_PUSH_r32 (0x50)
343 #define OPC_PUSH_Iv (0x68)
344 #define OPC_PUSH_Ib (0x6a)
345 #define OPC_RET (0xc3)
346 #define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
347 #define OPC_SHIFT_1 (0xd1)
348 #define OPC_SHIFT_Ib (0xc1)
349 #define OPC_SHIFT_cl (0xd3)
350 #define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3)
351 #define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16)
352 #define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2)
353 #define OPC_TESTL (0x85)
354 #define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3)
355 #define OPC_XCHG_ax_r32 (0x90)
357 #define OPC_GRP3_Ev (0xf7)
358 #define OPC_GRP5 (0xff)
360 /* Group 1 opcode extensions for 0x80-0x83.
361 These are also used as modifiers for OPC_ARITH. */
362 #define ARITH_ADD 0
363 #define ARITH_OR 1
364 #define ARITH_ADC 2
365 #define ARITH_SBB 3
366 #define ARITH_AND 4
367 #define ARITH_SUB 5
368 #define ARITH_XOR 6
369 #define ARITH_CMP 7
371 /* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */
372 #define SHIFT_ROL 0
373 #define SHIFT_ROR 1
374 #define SHIFT_SHL 4
375 #define SHIFT_SHR 5
376 #define SHIFT_SAR 7
378 /* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. */
379 #define EXT3_NOT 2
380 #define EXT3_NEG 3
381 #define EXT3_MUL 4
382 #define EXT3_IMUL 5
383 #define EXT3_DIV 6
384 #define EXT3_IDIV 7
386 /* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */
387 #define EXT5_INC_Ev 0
388 #define EXT5_DEC_Ev 1
389 #define EXT5_CALLN_Ev 2
390 #define EXT5_JMPN_Ev 4
392 /* Condition codes to be added to OPC_JCC_{long,short}. */
393 #define JCC_JMP (-1)
394 #define JCC_JO 0x0
395 #define JCC_JNO 0x1
396 #define JCC_JB 0x2
397 #define JCC_JAE 0x3
398 #define JCC_JE 0x4
399 #define JCC_JNE 0x5
400 #define JCC_JBE 0x6
401 #define JCC_JA 0x7
402 #define JCC_JS 0x8
403 #define JCC_JNS 0x9
404 #define JCC_JP 0xa
405 #define JCC_JNP 0xb
406 #define JCC_JL 0xc
407 #define JCC_JGE 0xd
408 #define JCC_JLE 0xe
409 #define JCC_JG 0xf
411 static const uint8_t tcg_cond_to_jcc[] = {
412 [TCG_COND_EQ] = JCC_JE,
413 [TCG_COND_NE] = JCC_JNE,
414 [TCG_COND_LT] = JCC_JL,
415 [TCG_COND_GE] = JCC_JGE,
416 [TCG_COND_LE] = JCC_JLE,
417 [TCG_COND_GT] = JCC_JG,
418 [TCG_COND_LTU] = JCC_JB,
419 [TCG_COND_GEU] = JCC_JAE,
420 [TCG_COND_LEU] = JCC_JBE,
421 [TCG_COND_GTU] = JCC_JA,
424 #if TCG_TARGET_REG_BITS == 64
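/* Emit the prefixes encoded in OPC -- segment, operand/address-size and
   SIMD 0xf2/0xf3 prefixes, a REX byte when required by P_REXW or by the
   register numbers, and the 0x0f / 0x0f 0x38 escapes -- followed by the
   opcode byte itself.  (64-bit variant; the 32-bit variant below omits
   the REX handling.) */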
425 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
427 int rex;
429 if (opc & P_GS) {
430 tcg_out8(s, 0x65);
432 if (opc & P_DATA16) {
433 /* We should never be asking for both 16 and 64-bit operation. */
434 tcg_debug_assert((opc & P_REXW) == 0);
435 tcg_out8(s, 0x66);
437 if (opc & P_ADDR32) {
438 tcg_out8(s, 0x67);
440 if (opc & P_SIMDF3) {
441 tcg_out8(s, 0xf3);
442 } else if (opc & P_SIMDF2) {
443 tcg_out8(s, 0xf2);
446 rex = 0;
447 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */
448 rex |= (r & 8) >> 1; /* REX.R */
449 rex |= (x & 8) >> 2; /* REX.X */
450 rex |= (rm & 8) >> 3; /* REX.B */
452 /* P_REXB_{R,RM} indicates that the given register is the low byte.
453 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
454 as otherwise the encoding indicates %[abcd]h. Note that the values
455 that are ORed in merely indicate that the REX byte must be present;
456 those bits get discarded in output. */
457 rex |= opc & (r >= 4 ? P_REXB_R : 0);
458 rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
460 if (rex) {
461 tcg_out8(s, (uint8_t)(rex | 0x40));
464 if (opc & (P_EXT | P_EXT38)) {
465 tcg_out8(s, 0x0f);
466 if (opc & P_EXT38) {
467 tcg_out8(s, 0x38);
471 tcg_out8(s, opc);
473 #else
474 static void tcg_out_opc(TCGContext *s, int opc)
476 if (opc & P_DATA16) {
477 tcg_out8(s, 0x66);
479 if (opc & P_SIMDF3) {
480 tcg_out8(s, 0xf3);
481 } else if (opc & P_SIMDF2) {
482 tcg_out8(s, 0xf2);
484 if (opc & (P_EXT | P_EXT38)) {
485 tcg_out8(s, 0x0f);
486 if (opc & P_EXT38) {
487 tcg_out8(s, 0x38);
490 tcg_out8(s, opc);
492 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
493 the 32-bit compilation paths. This method works with all versions of gcc,
494 whereas relying on optimization may not be able to exclude them. */
495 #define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc)
496 #endif
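/* Emit OPC with a register-direct ModRM byte (mod = 3): R goes in the
   reg field, RM in the r/m field. */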
498 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
500 tcg_out_opc(s, opc, r, rm, 0);
501 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
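/* As tcg_out_modrm, but prefixed with a two- or three-byte VEX prefix;
   V supplies the additional VEX.vvvv source operand. */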
504 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
506 int tmp;
508 if ((opc & (P_REXW | P_EXT | P_EXT38)) || (rm & 8)) {
509 /* Three byte VEX prefix. */
510 tcg_out8(s, 0xc4);
512 /* VEX.m-mmmm */
513 if (opc & P_EXT38) {
514 tmp = 2;
515 } else if (opc & P_EXT) {
516 tmp = 1;
517 } else {
518 tcg_abort();
520 tmp |= 0x40; /* VEX.X */
521 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */
522 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */
523 tcg_out8(s, tmp);
525 tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */
526 } else {
527 /* Two byte VEX prefix. */
528 tcg_out8(s, 0xc5);
530 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */
532 /* VEX.pp */
533 if (opc & P_DATA16) {
534 tmp |= 1; /* 0x66 */
535 } else if (opc & P_SIMDF3) {
536 tmp |= 2; /* 0xf3 */
537 } else if (opc & P_SIMDF2) {
538 tmp |= 3; /* 0xf2 */
540 tmp |= (~v & 15) << 3; /* VEX.vvvv */
541 tcg_out8(s, tmp);
542 tcg_out8(s, opc);
543 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
546 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
 547 We handle either RM or INDEX missing with a negative value. In 64-bit
548 mode for absolute addresses, ~RM is the size of the immediate operand
549 that will follow the instruction. */
551 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
552 int index, int shift, intptr_t offset)
554 int mod, len;
556 if (index < 0 && rm < 0) {
557 if (TCG_TARGET_REG_BITS == 64) {
558 /* Try for a rip-relative addressing mode. This has replaced
559 the 32-bit-mode absolute addressing encoding. */
560 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
561 intptr_t disp = offset - pc;
562 if (disp == (int32_t)disp) {
563 tcg_out_opc(s, opc, r, 0, 0);
564 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
565 tcg_out32(s, disp);
566 return;
569 /* Try for an absolute address encoding. This requires the
570 use of the MODRM+SIB encoding and is therefore larger than
571 rip-relative addressing. */
572 if (offset == (int32_t)offset) {
573 tcg_out_opc(s, opc, r, 0, 0);
574 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
575 tcg_out8(s, (4 << 3) | 5);
576 tcg_out32(s, offset);
577 return;
580 /* ??? The memory isn't directly addressable. */
581 tcg_abort();
582 } else {
583 /* Absolute address. */
584 tcg_out_opc(s, opc, r, 0, 0);
585 tcg_out8(s, (r << 3) | 5);
586 tcg_out32(s, offset);
587 return;
591 /* Find the length of the immediate addend. Note that the encoding
592 that would be used for (%ebp) indicates absolute addressing. */
593 if (rm < 0) {
594 mod = 0, len = 4, rm = 5;
595 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
596 mod = 0, len = 0;
597 } else if (offset == (int8_t)offset) {
598 mod = 0x40, len = 1;
599 } else {
600 mod = 0x80, len = 4;
603 /* Use a single byte MODRM format if possible. Note that the encoding
604 that would be used for %esp is the escape to the two byte form. */
605 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
606 /* Single byte MODRM format. */
607 tcg_out_opc(s, opc, r, rm, 0);
608 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
609 } else {
610 /* Two byte MODRM+SIB format. */
612 /* Note that the encoding that would place %esp into the index
613 field indicates no index register. In 64-bit mode, the REX.X
614 bit counts, so %r12 can be used as the index. */
615 if (index < 0) {
616 index = 4;
617 } else {
618 tcg_debug_assert(index != TCG_REG_ESP);
621 tcg_out_opc(s, opc, r, rm, index);
622 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
623 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
626 if (len == 1) {
627 tcg_out8(s, offset);
628 } else if (len == 4) {
629 tcg_out32(s, offset);
633 /* A simplification of the above with no index or shift. */
634 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
635 int rm, intptr_t offset)
637 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
640 /* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */
641 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
643 /* Propagate an opcode prefix, such as P_REXW. */
644 int ext = subop & ~0x7;
645 subop &= 0x7;
647 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
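/* Register-to-register move of the given type; a no-op when source and
   destination coincide. */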
650 static inline void tcg_out_mov(TCGContext *s, TCGType type,
651 TCGReg ret, TCGReg arg)
653 if (arg != ret) {
654 int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
655 tcg_out_modrm(s, opc, ret, arg);
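/* Load the constant ARG into RET using the shortest available encoding:
   XOR for zero, a 32-bit move for values that zero-extend, a sign-extended
   32-bit immediate, a pc-relative LEA when the displacement fits, and
   otherwise the full 10-byte movq. */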
659 static void tcg_out_movi(TCGContext *s, TCGType type,
660 TCGReg ret, tcg_target_long arg)
662 tcg_target_long diff;
664 if (arg == 0) {
665 tgen_arithr(s, ARITH_XOR, ret, ret);
666 return;
668 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
669 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
670 tcg_out32(s, arg);
671 return;
673 if (arg == (int32_t)arg) {
674 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
675 tcg_out32(s, arg);
676 return;
679 /* Try a 7 byte pc-relative lea before the 10 byte movq. */
680 diff = arg - ((uintptr_t)s->code_ptr + 7);
681 if (diff == (int32_t)diff) {
682 tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
683 tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
684 tcg_out32(s, diff);
685 return;
688 tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
689 tcg_out64(s, arg);
692 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
694 if (val == (int8_t)val) {
695 tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
696 tcg_out8(s, val);
697 } else if (val == (int32_t)val) {
698 tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
699 tcg_out32(s, val);
700 } else {
701 tcg_abort();
705 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
707 /* Given the strength of x86 memory ordering, we only need care for
708 store-load ordering. Experimentally, "lock orl $0,0(%esp)" is
709 faster than "mfence", so don't bother with the sse insn. */
710 if (a0 & TCG_MO_ST_LD) {
711 tcg_out8(s, 0xf0);
712 tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
713 tcg_out8(s, 0);
717 static inline void tcg_out_push(TCGContext *s, int reg)
719 tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
722 static inline void tcg_out_pop(TCGContext *s, int reg)
724 tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
727 static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
728 TCGReg arg1, intptr_t arg2)
730 int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
731 tcg_out_modrm_offset(s, opc, ret, arg1, arg2);
734 static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
735 TCGReg arg1, intptr_t arg2)
737 int opc = OPC_MOVL_EvGv + (type == TCG_TYPE_I64 ? P_REXW : 0);
738 tcg_out_modrm_offset(s, opc, arg, arg1, arg2);
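/* Store the immediate VAL to BASE + OFS.  Returns false when a 64-bit
   store is requested and VAL does not fit the sign-extended 32-bit
   immediate. */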
741 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
742 TCGReg base, intptr_t ofs)
744 int rexw = 0;
745 if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
746 if (val != (int32_t)val) {
747 return false;
749 rexw = P_REXW;
751 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
752 tcg_out32(s, val);
753 return true;
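/* Shift REG by the immediate COUNT using the SHIFT_* subopcode SUBOPC,
   selecting the shorter shift-by-one encoding when COUNT == 1. */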
756 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
758 /* Propagate an opcode prefix, such as P_DATA16. */
759 int ext = subopc & ~0x7;
760 subopc &= 0x7;
762 if (count == 1) {
763 tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
764 } else {
765 tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
766 tcg_out8(s, count);
770 static inline void tcg_out_bswap32(TCGContext *s, int reg)
772 tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
775 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
777 tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
780 static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
782 /* movzbl */
783 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
784 tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
787 static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
789 /* movsbl */
790 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
791 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
794 static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
796 /* movzwl */
797 tcg_out_modrm(s, OPC_MOVZWL, dest, src);
800 static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
802 /* movsw[lq] */
803 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
806 static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
808 /* 32-bit mov zero extends. */
809 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
812 static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
814 tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
817 static inline void tcg_out_bswap64(TCGContext *s, int reg)
819 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
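/* Emit "R0 op= VAL" for the ARITH_* subopcode C with an immediate operand.
   A nonzero CF means the caller relies on the carry flag, which rules out
   the INC/DEC short forms; AND against 0xff/0xffff/0xffffffff is turned
   into the corresponding zero-extension. */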
822 static void tgen_arithi(TCGContext *s, int c, int r0,
823 tcg_target_long val, int cf)
825 int rexw = 0;
827 if (TCG_TARGET_REG_BITS == 64) {
828 rexw = c & -8;
829 c &= 7;
832 /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
833 partial flags update stalls on Pentium4 and are not recommended
834 by current Intel optimization manuals. */
835 if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
836 int is_inc = (c == ARITH_ADD) ^ (val < 0);
837 if (TCG_TARGET_REG_BITS == 64) {
838 /* The single-byte increment encodings are re-tasked as the
839 REX prefixes. Use the MODRM encoding. */
840 tcg_out_modrm(s, OPC_GRP5 + rexw,
841 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
842 } else {
843 tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
845 return;
848 if (c == ARITH_AND) {
849 if (TCG_TARGET_REG_BITS == 64) {
850 if (val == 0xffffffffu) {
851 tcg_out_ext32u(s, r0, r0);
852 return;
854 if (val == (uint32_t)val) {
855 /* AND with no high bits set can use a 32-bit operation. */
856 rexw = 0;
859 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
860 tcg_out_ext8u(s, r0, r0);
861 return;
863 if (val == 0xffffu) {
864 tcg_out_ext16u(s, r0, r0);
865 return;
869 if (val == (int8_t)val) {
870 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
871 tcg_out8(s, val);
872 return;
874 if (rexw == 0 || val == (int32_t)val) {
875 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
876 tcg_out32(s, val);
877 return;
880 tcg_abort();
883 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
885 if (val != 0) {
886 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
890 /* Use SMALL != 0 to force a short forward branch. */
891 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
893 int32_t val, val1;
895 if (l->has_value) {
896 val = tcg_pcrel_diff(s, l->u.value_ptr);
897 val1 = val - 2;
898 if ((int8_t)val1 == val1) {
899 if (opc == -1) {
900 tcg_out8(s, OPC_JMP_short);
901 } else {
902 tcg_out8(s, OPC_JCC_short + opc);
904 tcg_out8(s, val1);
905 } else {
906 if (small) {
907 tcg_abort();
909 if (opc == -1) {
910 tcg_out8(s, OPC_JMP_long);
911 tcg_out32(s, val - 5);
912 } else {
913 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
914 tcg_out32(s, val - 6);
917 } else if (small) {
918 if (opc == -1) {
919 tcg_out8(s, OPC_JMP_short);
920 } else {
921 tcg_out8(s, OPC_JCC_short + opc);
923 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
924 s->code_ptr += 1;
925 } else {
926 if (opc == -1) {
927 tcg_out8(s, OPC_JMP_long);
928 } else {
929 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
931 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
932 s->code_ptr += 4;
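/* Set the flags for a comparison of ARG1 against ARG2, using TEST when
   comparing a register against the constant zero. */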
936 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
937 int const_arg2, int rexw)
939 if (const_arg2) {
940 if (arg2 == 0) {
941 /* test r, r */
942 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
943 } else {
944 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
946 } else {
947 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
951 static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
952 TCGArg arg1, TCGArg arg2, int const_arg2,
953 TCGLabel *label, int small)
955 tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
956 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
959 #if TCG_TARGET_REG_BITS == 64
960 static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
961 TCGArg arg1, TCGArg arg2, int const_arg2,
962 TCGLabel *label, int small)
964 tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
965 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
967 #else
968 /* XXX: we implement it at the target level to avoid having to
 969 handle temporaries that live across basic blocks. */
970 static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
971 const int *const_args, int small)
973 TCGLabel *label_next = gen_new_label();
974 TCGLabel *label_this = arg_label(args[5]);
976 switch(args[4]) {
977 case TCG_COND_EQ:
978 tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
979 label_next, 1);
980 tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
981 label_this, small);
982 break;
983 case TCG_COND_NE:
984 tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
985 label_this, small);
986 tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
987 label_this, small);
988 break;
989 case TCG_COND_LT:
990 tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
991 label_this, small);
992 tcg_out_jxx(s, JCC_JNE, label_next, 1);
993 tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
994 label_this, small);
995 break;
996 case TCG_COND_LE:
997 tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
998 label_this, small);
999 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1000 tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1001 label_this, small);
1002 break;
1003 case TCG_COND_GT:
1004 tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1005 label_this, small);
1006 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1007 tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1008 label_this, small);
1009 break;
1010 case TCG_COND_GE:
1011 tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1012 label_this, small);
1013 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1014 tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1015 label_this, small);
1016 break;
1017 case TCG_COND_LTU:
1018 tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1019 label_this, small);
1020 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1021 tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1022 label_this, small);
1023 break;
1024 case TCG_COND_LEU:
1025 tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1026 label_this, small);
1027 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1028 tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1029 label_this, small);
1030 break;
1031 case TCG_COND_GTU:
1032 tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1033 label_this, small);
1034 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1035 tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1036 label_this, small);
1037 break;
1038 case TCG_COND_GEU:
1039 tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1040 label_this, small);
1041 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1042 tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1043 label_this, small);
1044 break;
1045 default:
1046 tcg_abort();
1048 tcg_out_label(s, label_next, s->code_ptr);
1050 #endif
1052 static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1053 TCGArg arg1, TCGArg arg2, int const_arg2)
1055 tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1056 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1057 tcg_out_ext8u(s, dest, dest);
1060 #if TCG_TARGET_REG_BITS == 64
1061 static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1062 TCGArg arg1, TCGArg arg2, int const_arg2)
1064 tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1065 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1066 tcg_out_ext8u(s, dest, dest);
1068 #else
1069 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1070 const int *const_args)
1072 TCGArg new_args[6];
1073 TCGLabel *label_true, *label_over;
1075 memcpy(new_args, args+1, 5*sizeof(TCGArg));
1077 if (args[0] == args[1] || args[0] == args[2]
1078 || (!const_args[3] && args[0] == args[3])
1079 || (!const_args[4] && args[0] == args[4])) {
1080 /* When the destination overlaps with one of the argument
1081 registers, don't do anything tricky. */
1082 label_true = gen_new_label();
1083 label_over = gen_new_label();
1085 new_args[5] = label_arg(label_true);
1086 tcg_out_brcond2(s, new_args, const_args+1, 1);
1088 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1089 tcg_out_jxx(s, JCC_JMP, label_over, 1);
1090 tcg_out_label(s, label_true, s->code_ptr);
1092 tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1093 tcg_out_label(s, label_over, s->code_ptr);
1094 } else {
1095 /* When the destination does not overlap one of the arguments,
1096 clear the destination first, jump if cond false, and emit an
1097 increment in the true case. This results in smaller code. */
1099 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1101 label_over = gen_new_label();
1102 new_args[4] = tcg_invert_cond(new_args[4]);
1103 new_args[5] = label_arg(label_over);
1104 tcg_out_brcond2(s, new_args, const_args+1, 1);
1106 tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1107 tcg_out_label(s, label_over, s->code_ptr);
1110 #endif
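/* Conditionally move V1 into DEST: CMOVcc when available, otherwise a
   short branch around a plain MOV. */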
1112 static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1113 TCGReg dest, TCGReg v1)
1115 if (have_cmov) {
1116 tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1117 } else {
1118 TCGLabel *over = gen_new_label();
1119 tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1120 tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1121 tcg_out_label(s, over, s->code_ptr);
1125 static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1126 TCGReg c1, TCGArg c2, int const_c2,
1127 TCGReg v1)
1129 tcg_out_cmp(s, c1, c2, const_c2, 0);
1130 tcg_out_cmov(s, cond, 0, dest, v1);
1133 #if TCG_TARGET_REG_BITS == 64
1134 static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1135 TCGReg c1, TCGArg c2, int const_c2,
1136 TCGReg v1)
1138 tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1139 tcg_out_cmov(s, cond, P_REXW, dest, v1);
1141 #endif
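/* Count trailing zeros: TZCNT when BMI1 is available (with a CMOV of ARG2
   unless ARG2 is the constant operand size), otherwise BSF plus a CMOV to
   supply ARG2 for a zero input. */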
1143 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1144 TCGArg arg2, bool const_a2)
1146 if (have_bmi1) {
1147 tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1148 if (const_a2) {
1149 tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1150 } else {
1151 tcg_debug_assert(dest != arg2);
1152 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1154 } else {
1155 tcg_debug_assert(dest != arg2);
1156 tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1157 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1161 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1162 TCGArg arg2, bool const_a2)
1164 if (have_lzcnt) {
1165 tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1166 if (const_a2) {
1167 tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1168 } else {
1169 tcg_debug_assert(dest != arg2);
1170 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1172 } else {
1173 tcg_debug_assert(!const_a2);
1174 tcg_debug_assert(dest != arg1);
1175 tcg_debug_assert(dest != arg2);
1177 /* Recall that the output of BSR is the index not the count. */
1178 tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1179 tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1181 /* Since we have destroyed the flags from BSR, we have to re-test. */
1182 tcg_out_cmp(s, arg1, 0, 1, rexw);
1183 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
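/* Emit a direct call or jump to DEST, using the rel32 form when the
   displacement fits and otherwise an indirect transfer through R10. */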
1187 static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
1189 intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1191 if (disp == (int32_t)disp) {
1192 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1193 tcg_out32(s, disp);
1194 } else {
1195 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R10, (uintptr_t)dest);
1196 tcg_out_modrm(s, OPC_GRP5,
1197 call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev, TCG_REG_R10);
1201 static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *dest)
1203 tcg_out_branch(s, 1, dest);
1206 static void tcg_out_jmp(TCGContext *s, tcg_insn_unit *dest)
1208 tcg_out_branch(s, 0, dest);
1211 static void tcg_out_nopn(TCGContext *s, int n)
1213 int i;
1214 /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1215 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1216 * duplicate prefix, and all of the interesting recent cores can
1217 * decode and discard the duplicates in a single cycle.
1219 tcg_debug_assert(n >= 1);
1220 for (i = 1; i < n; ++i) {
1221 tcg_out8(s, 0x66);
1223 tcg_out8(s, 0x90);
1226 #if defined(CONFIG_SOFTMMU)
1227 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1228 * int mmu_idx, uintptr_t ra)
1230 static void * const qemu_ld_helpers[16] = {
1231 [MO_UB] = helper_ret_ldub_mmu,
1232 [MO_LEUW] = helper_le_lduw_mmu,
1233 [MO_LEUL] = helper_le_ldul_mmu,
1234 [MO_LEQ] = helper_le_ldq_mmu,
1235 [MO_BEUW] = helper_be_lduw_mmu,
1236 [MO_BEUL] = helper_be_ldul_mmu,
1237 [MO_BEQ] = helper_be_ldq_mmu,
1240 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1241 * uintxx_t val, int mmu_idx, uintptr_t ra)
1243 static void * const qemu_st_helpers[16] = {
1244 [MO_UB] = helper_ret_stb_mmu,
1245 [MO_LEUW] = helper_le_stw_mmu,
1246 [MO_LEUL] = helper_le_stl_mmu,
1247 [MO_LEQ] = helper_le_stq_mmu,
1248 [MO_BEUW] = helper_be_stw_mmu,
1249 [MO_BEUL] = helper_be_stl_mmu,
1250 [MO_BEQ] = helper_be_stq_mmu,
1253 /* Perform the TLB load and compare.
1255 Inputs:
1256 ADDRLO and ADDRHI contain the low and high part of the address.
1258 MEM_INDEX and S_BITS are the memory context and log2 size of the load.
1260 WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1261 This should be offsetof addr_read or addr_write.
1263 Outputs:
1264 LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1265 positions of the displacements of forward jumps to the TLB miss case.
1267 Second argument register is loaded with the low part of the address.
1268 In the TLB hit case, it has been adjusted as indicated by the TLB
1269 and so is a host address. In the TLB miss case, it continues to
1270 hold a guest address.
1272 First argument register is clobbered. */
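/* An illustrative sketch of the fast path emitted below for a 64-bit host
   (register names stand in for the L-constraint registers; the exact
   sequence depends on the alignment and size of the access):
       mov    addrlo, %r0
       lea    s_mask-a_mask(addrlo), %r1     # or mov, if a_bits >= s_bits
       shr    $TARGET_PAGE_BITS-CPU_TLB_ENTRY_BITS, %r0
       and    $tlb_mask, %r1
       and    $(CPU_TLB_SIZE-1)<<CPU_TLB_ENTRY_BITS, %r0
       lea    tlb_table[mem_index]+which(%env,%r0), %r0
       cmp    (%r0), %r1
       mov    addrlo, %r1
       jne    slow_path
       add    addend-which(%r0), %r1 */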
1274 static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1275 int mem_index, TCGMemOp opc,
1276 tcg_insn_unit **label_ptr, int which)
1278 const TCGReg r0 = TCG_REG_L0;
1279 const TCGReg r1 = TCG_REG_L1;
1280 TCGType ttype = TCG_TYPE_I32;
1281 TCGType tlbtype = TCG_TYPE_I32;
1282 int trexw = 0, hrexw = 0, tlbrexw = 0;
1283 unsigned a_bits = get_alignment_bits(opc);
1284 unsigned s_bits = opc & MO_SIZE;
1285 unsigned a_mask = (1 << a_bits) - 1;
1286 unsigned s_mask = (1 << s_bits) - 1;
1287 target_ulong tlb_mask;
1289 if (TCG_TARGET_REG_BITS == 64) {
1290 if (TARGET_LONG_BITS == 64) {
1291 ttype = TCG_TYPE_I64;
1292 trexw = P_REXW;
1294 if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1295 hrexw = P_REXW;
1296 if (TARGET_PAGE_BITS + CPU_TLB_BITS > 32) {
1297 tlbtype = TCG_TYPE_I64;
1298 tlbrexw = P_REXW;
1303 tcg_out_mov(s, tlbtype, r0, addrlo);
1304 /* If the required alignment is at least as large as the access, simply
1305 copy the address and mask. For lesser alignments, check that we don't
1306 cross pages for the complete access. */
1307 if (a_bits >= s_bits) {
1308 tcg_out_mov(s, ttype, r1, addrlo);
1309 } else {
1310 tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1312 tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1314 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1315 TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1317 tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1318 tgen_arithi(s, ARITH_AND + tlbrexw, r0,
1319 (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);
1321 tcg_out_modrm_sib_offset(s, OPC_LEA + hrexw, r0, TCG_AREG0, r0, 0,
1322 offsetof(CPUArchState, tlb_table[mem_index][0])
1323 + which);
1325 /* cmp 0(r0), r1 */
1326 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, 0);
1328 /* Prepare for both the fast path add of the tlb addend, and the slow
1329 path function argument setup. There are two cases worth note:
1330 For 32-bit guest and x86_64 host, MOVL zero-extends the guest address
1331 before the fastpath ADDQ below. For 64-bit guest and x32 host, MOVQ
1332 copies the entire guest address for the slow path, while truncation
1333 for the 32-bit host happens with the fastpath ADDL below. */
1334 tcg_out_mov(s, ttype, r1, addrlo);
1336 /* jne slow_path */
1337 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1338 label_ptr[0] = s->code_ptr;
1339 s->code_ptr += 4;
1341 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1342 /* cmp 4(r0), addrhi */
1343 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, 4);
1345 /* jne slow_path */
1346 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1347 label_ptr[1] = s->code_ptr;
1348 s->code_ptr += 4;
1351 /* TLB Hit. */
1353 /* add addend(r0), r1 */
1354 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1355 offsetof(CPUTLBEntry, addend) - which);
1359 * Record the context of a call to the out of line helper code for the slow path
1360 * for a load or store, so that we can later generate the correct helper code
1362 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
1363 TCGReg datalo, TCGReg datahi,
1364 TCGReg addrlo, TCGReg addrhi,
1365 tcg_insn_unit *raddr,
1366 tcg_insn_unit **label_ptr)
1368 TCGLabelQemuLdst *label = new_ldst_label(s);
1370 label->is_ld = is_ld;
1371 label->oi = oi;
1372 label->datalo_reg = datalo;
1373 label->datahi_reg = datahi;
1374 label->addrlo_reg = addrlo;
1375 label->addrhi_reg = addrhi;
1376 label->raddr = raddr;
1377 label->label_ptr[0] = label_ptr[0];
1378 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1379 label->label_ptr[1] = label_ptr[1];
1384 * Generate code for the slow path for a load at the end of block
1386 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1388 TCGMemOpIdx oi = l->oi;
1389 TCGMemOp opc = get_memop(oi);
1390 TCGReg data_reg;
1391 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1393 /* resolve label address */
1394 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1395 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1396 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1399 if (TCG_TARGET_REG_BITS == 32) {
1400 int ofs = 0;
1402 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1403 ofs += 4;
1405 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1406 ofs += 4;
1408 if (TARGET_LONG_BITS == 64) {
1409 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1410 ofs += 4;
1413 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1414 ofs += 4;
1416 tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1417 } else {
1418 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1419 /* The second argument is already loaded with addrlo. */
1420 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1421 tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1422 (uintptr_t)l->raddr);
1425 tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1427 data_reg = l->datalo_reg;
1428 switch (opc & MO_SSIZE) {
1429 case MO_SB:
1430 tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
1431 break;
1432 case MO_SW:
1433 tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
1434 break;
1435 #if TCG_TARGET_REG_BITS == 64
1436 case MO_SL:
1437 tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1438 break;
1439 #endif
1440 case MO_UB:
1441 case MO_UW:
1442 /* Note that the helpers have zero-extended to tcg_target_long. */
1443 case MO_UL:
1444 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1445 break;
1446 case MO_Q:
1447 if (TCG_TARGET_REG_BITS == 64) {
1448 tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1449 } else if (data_reg == TCG_REG_EDX) {
1450 /* xchg %edx, %eax */
1451 tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1452 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1453 } else {
1454 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1455 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1457 break;
1458 default:
1459 tcg_abort();
 1462 /* Jump back to the code following the qemu_ld. */
1463 tcg_out_jmp(s, l->raddr);
1467 * Generate code for the slow path for a store at the end of block
1469 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1471 TCGMemOpIdx oi = l->oi;
1472 TCGMemOp opc = get_memop(oi);
1473 TCGMemOp s_bits = opc & MO_SIZE;
1474 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1475 TCGReg retaddr;
1477 /* resolve label address */
1478 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1479 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1480 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1483 if (TCG_TARGET_REG_BITS == 32) {
1484 int ofs = 0;
1486 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1487 ofs += 4;
1489 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1490 ofs += 4;
1492 if (TARGET_LONG_BITS == 64) {
1493 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1494 ofs += 4;
1497 tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1498 ofs += 4;
1500 if (s_bits == MO_64) {
1501 tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1502 ofs += 4;
1505 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1506 ofs += 4;
1508 retaddr = TCG_REG_EAX;
1509 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1510 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1511 } else {
1512 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1513 /* The second argument is already loaded with addrlo. */
1514 tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1515 tcg_target_call_iarg_regs[2], l->datalo_reg);
1516 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1518 if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1519 retaddr = tcg_target_call_iarg_regs[4];
1520 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1521 } else {
1522 retaddr = TCG_REG_RAX;
1523 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1524 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1525 TCG_TARGET_CALL_STACK_OFFSET);
1529 /* "Tail call" to the helper, with the return address back inline. */
1530 tcg_out_push(s, retaddr);
1531 tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1533 #elif defined(__x86_64__) && defined(__linux__)
1534 # include <asm/prctl.h>
1535 # include <sys/prctl.h>
1537 int arch_prctl(int code, unsigned long addr);
1539 static int guest_base_flags;
1540 static inline void setup_guest_base_seg(void)
1542 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1543 guest_base_flags = P_GS;
1546 #else
1547 # define guest_base_flags 0
1548 static inline void setup_guest_base_seg(void) { }
1549 #endif /* SOFTMMU */
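/* Emit the actual guest load from BASE/INDEX/OFS into DATALO (and DATAHI
   for a 64-bit access on a 32-bit host), byte-swapping with MOVBE or
   BSWAP as MEMOP requires. */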
1551 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1552 TCGReg base, int index, intptr_t ofs,
1553 int seg, TCGMemOp memop)
1555 const TCGMemOp real_bswap = memop & MO_BSWAP;
1556 TCGMemOp bswap = real_bswap;
1557 int movop = OPC_MOVL_GvEv;
1559 if (have_movbe && real_bswap) {
1560 bswap = 0;
1561 movop = OPC_MOVBE_GyMy;
1564 switch (memop & MO_SSIZE) {
1565 case MO_UB:
1566 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
1567 base, index, 0, ofs);
1568 break;
1569 case MO_SB:
1570 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + P_REXW + seg, datalo,
1571 base, index, 0, ofs);
1572 break;
1573 case MO_UW:
1574 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1575 base, index, 0, ofs);
1576 if (real_bswap) {
1577 tcg_out_rolw_8(s, datalo);
1579 break;
1580 case MO_SW:
1581 if (real_bswap) {
1582 if (have_movbe) {
1583 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1584 datalo, base, index, 0, ofs);
1585 } else {
1586 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1587 base, index, 0, ofs);
1588 tcg_out_rolw_8(s, datalo);
1590 tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
1591 } else {
1592 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + P_REXW + seg,
1593 datalo, base, index, 0, ofs);
1595 break;
1596 case MO_UL:
1597 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
1598 if (bswap) {
1599 tcg_out_bswap32(s, datalo);
1601 break;
1602 #if TCG_TARGET_REG_BITS == 64
1603 case MO_SL:
1604 if (real_bswap) {
1605 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1606 base, index, 0, ofs);
1607 if (bswap) {
1608 tcg_out_bswap32(s, datalo);
1610 tcg_out_ext32s(s, datalo, datalo);
1611 } else {
1612 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
1613 base, index, 0, ofs);
1615 break;
1616 #endif
1617 case MO_Q:
1618 if (TCG_TARGET_REG_BITS == 64) {
1619 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
1620 base, index, 0, ofs);
1621 if (bswap) {
1622 tcg_out_bswap64(s, datalo);
1624 } else {
1625 if (real_bswap) {
1626 int t = datalo;
1627 datalo = datahi;
1628 datahi = t;
1630 if (base != datalo) {
1631 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1632 base, index, 0, ofs);
1633 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
1634 base, index, 0, ofs + 4);
1635 } else {
1636 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
1637 base, index, 0, ofs + 4);
1638 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1639 base, index, 0, ofs);
1641 if (bswap) {
1642 tcg_out_bswap32(s, datalo);
1643 tcg_out_bswap32(s, datahi);
1646 break;
1647 default:
1648 tcg_abort();
1652 /* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
 1653 EAX. It will be useful once fixed-register globals are less
1654 common. */
1655 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
1657 TCGReg datalo, datahi, addrlo;
1658 TCGReg addrhi __attribute__((unused));
1659 TCGMemOpIdx oi;
1660 TCGMemOp opc;
1661 #if defined(CONFIG_SOFTMMU)
1662 int mem_index;
1663 tcg_insn_unit *label_ptr[2];
1664 #endif
1666 datalo = *args++;
1667 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
1668 addrlo = *args++;
1669 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
1670 oi = *args++;
1671 opc = get_memop(oi);
1673 #if defined(CONFIG_SOFTMMU)
1674 mem_index = get_mmuidx(oi);
1676 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
1677 label_ptr, offsetof(CPUTLBEntry, addr_read));
1679 /* TLB Hit. */
1680 tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
1682 /* Record the current context of a load into ldst label */
1683 add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
1684 s->code_ptr, label_ptr);
1685 #else
1687 int32_t offset = guest_base;
1688 TCGReg base = addrlo;
1689 int index = -1;
1690 int seg = 0;
 1692 /* For a 32-bit guest, the high 32 bits of the address may contain
 1693 garbage; they can be ignored via the ADDR32 prefix if we're not using
 1694 a guest base, or when using segmentation. Otherwise we
 1695 need to zero-extend the address manually. */
1696 if (guest_base == 0 || guest_base_flags) {
1697 seg = guest_base_flags;
1698 offset = 0;
1699 if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
1700 seg |= P_ADDR32;
1702 } else if (TCG_TARGET_REG_BITS == 64) {
1703 if (TARGET_LONG_BITS == 32) {
1704 tcg_out_ext32u(s, TCG_REG_L0, base);
1705 base = TCG_REG_L0;
1707 if (offset != guest_base) {
1708 tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
1709 index = TCG_REG_L1;
1710 offset = 0;
1714 tcg_out_qemu_ld_direct(s, datalo, datahi,
1715 base, index, offset, seg, opc);
1717 #endif
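/* Emit the actual guest store of DATALO (and DATAHI on a 32-bit host) to
   BASE + OFS, byte-swapping through the scratch register or via MOVBE
   when available. */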
1720 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1721 TCGReg base, intptr_t ofs, int seg,
1722 TCGMemOp memop)
1724 /* ??? Ideally we wouldn't need a scratch register. For user-only,
1725 we could perform the bswap twice to restore the original value
1726 instead of moving to the scratch. But as it is, the L constraint
1727 means that TCG_REG_L0 is definitely free here. */
1728 const TCGReg scratch = TCG_REG_L0;
1729 const TCGMemOp real_bswap = memop & MO_BSWAP;
1730 TCGMemOp bswap = real_bswap;
1731 int movop = OPC_MOVL_EvGv;
1733 if (have_movbe && real_bswap) {
1734 bswap = 0;
1735 movop = OPC_MOVBE_MyGy;
1738 switch (memop & MO_SIZE) {
1739 case MO_8:
1740 /* In 32-bit mode, 8-bit stores can only happen from [abcd]x.
1741 Use the scratch register if necessary. */
1742 if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
1743 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
1744 datalo = scratch;
1746 tcg_out_modrm_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
1747 datalo, base, ofs);
1748 break;
1749 case MO_16:
1750 if (bswap) {
1751 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
1752 tcg_out_rolw_8(s, scratch);
1753 datalo = scratch;
1755 tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
1756 break;
1757 case MO_32:
1758 if (bswap) {
1759 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
1760 tcg_out_bswap32(s, scratch);
1761 datalo = scratch;
1763 tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
1764 break;
1765 case MO_64:
1766 if (TCG_TARGET_REG_BITS == 64) {
1767 if (bswap) {
1768 tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo);
1769 tcg_out_bswap64(s, scratch);
1770 datalo = scratch;
1772 tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
1773 } else if (bswap) {
1774 tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
1775 tcg_out_bswap32(s, scratch);
1776 tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs);
1777 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
1778 tcg_out_bswap32(s, scratch);
1779 tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4);
1780 } else {
1781 if (real_bswap) {
1782 int t = datalo;
1783 datalo = datahi;
1784 datahi = t;
1786 tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
1787 tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
1789 break;
1790 default:
1791 tcg_abort();
1795 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
1797 TCGReg datalo, datahi, addrlo;
1798 TCGReg addrhi __attribute__((unused));
1799 TCGMemOpIdx oi;
1800 TCGMemOp opc;
1801 #if defined(CONFIG_SOFTMMU)
1802 int mem_index;
1803 tcg_insn_unit *label_ptr[2];
1804 #endif
1806 datalo = *args++;
1807 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
1808 addrlo = *args++;
1809 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
1810 oi = *args++;
1811 opc = get_memop(oi);
1813 #if defined(CONFIG_SOFTMMU)
1814 mem_index = get_mmuidx(oi);
1816 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
1817 label_ptr, offsetof(CPUTLBEntry, addr_write));
1819 /* TLB Hit. */
1820 tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, 0, 0, opc);
1822 /* Record the current context of a store into ldst label */
1823 add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
1824 s->code_ptr, label_ptr);
1825 #else
1827 int32_t offset = guest_base;
1828 TCGReg base = addrlo;
1829 int seg = 0;
1831 /* See comment in tcg_out_qemu_ld re zero-extension of addrlo. */
1832 if (guest_base == 0 || guest_base_flags) {
1833 seg = guest_base_flags;
1834 offset = 0;
1835 if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
1836 seg |= P_ADDR32;
1838 } else if (TCG_TARGET_REG_BITS == 64) {
1839 /* ??? Note that we can't use the same SIB addressing scheme
1840 as for loads, since we require L0 free for bswap. */
1841 if (offset != guest_base) {
1842 if (TARGET_LONG_BITS == 32) {
1843 tcg_out_ext32u(s, TCG_REG_L0, base);
1844 base = TCG_REG_L0;
1846 tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
1847 tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
1848 base = TCG_REG_L1;
1849 offset = 0;
1850 } else if (TARGET_LONG_BITS == 32) {
1851 tcg_out_ext32u(s, TCG_REG_L1, base);
1852 base = TCG_REG_L1;
1856 tcg_out_qemu_st_direct(s, datalo, datahi, base, offset, seg, opc);
1858 #endif
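/* Central code generator: translate a single TCG opcode and its arguments
   into host instructions. */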
1861 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
1862 const TCGArg *args, const int *const_args)
1864 TCGArg a0, a1, a2;
1865 int c, const_a2, vexop, rexw = 0;
1867 #if TCG_TARGET_REG_BITS == 64
1868 # define OP_32_64(x) \
1869 case glue(glue(INDEX_op_, x), _i64): \
1870 rexw = P_REXW; /* FALLTHRU */ \
1871 case glue(glue(INDEX_op_, x), _i32)
1872 #else
1873 # define OP_32_64(x) \
1874 case glue(glue(INDEX_op_, x), _i32)
1875 #endif
1877 /* Hoist the loads of the most common arguments. */
1878 a0 = args[0];
1879 a1 = args[1];
1880 a2 = args[2];
1881 const_a2 = const_args[2];
1883 switch (opc) {
1884 case INDEX_op_exit_tb:
1885 /* Reuse the zeroing that exists for goto_ptr. */
1886 if (a0 == 0) {
1887 tcg_out_jmp(s, s->code_gen_epilogue);
1888 } else {
1889 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
1890 tcg_out_jmp(s, tb_ret_addr);
1892 break;
1893 case INDEX_op_goto_tb:
1894 if (s->tb_jmp_insn_offset) {
1895 /* direct jump method */
1896 int gap;
1897 /* jump displacement must be aligned for atomic patching;
1898 * see if we need to add extra nops before jump
1900 gap = tcg_pcrel_diff(s, QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4));
1901 if (gap != 1) {
1902 tcg_out_nopn(s, gap - 1);
1904 tcg_out8(s, OPC_JMP_long); /* jmp im */
1905 s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
1906 tcg_out32(s, 0);
1907 } else {
1908 /* indirect jump method */
1909 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
1910 (intptr_t)(s->tb_jmp_target_addr + a0));
1912 s->tb_jmp_reset_offset[a0] = tcg_current_code_size(s);
1913 break;
1914 case INDEX_op_goto_ptr:
1915 /* jmp to the given host address (could be epilogue) */
1916 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
1917 break;
1918 case INDEX_op_br:
1919 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
1920 break;
1921 OP_32_64(ld8u):
1922 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
1923 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
1924 break;
1925 OP_32_64(ld8s):
1926 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
1927 break;
1928 OP_32_64(ld16u):
1929 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
1930 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
1931 break;
1932 OP_32_64(ld16s):
1933 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
1934 break;
1935 #if TCG_TARGET_REG_BITS == 64
1936 case INDEX_op_ld32u_i64:
1937 #endif
1938 case INDEX_op_ld_i32:
1939 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
1940 break;
1942 OP_32_64(st8):
1943 if (const_args[0]) {
1944 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
1945 tcg_out8(s, a0);
1946 } else {
1947 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
1949 break;
1950 OP_32_64(st16):
1951 if (const_args[0]) {
1952 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
1953 tcg_out16(s, a0);
1954 } else {
1955 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
1956 }
1957 break;
1958 #if TCG_TARGET_REG_BITS == 64
1959 case INDEX_op_st32_i64:
1960 #endif
1961 case INDEX_op_st_i32:
1962 if (const_args[0]) {
1963 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
1964 tcg_out32(s, a0);
1965 } else {
1966 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
1967 }
1968 break;
1970 OP_32_64(add):
1971 /* For 3-operand addition, use LEA. */
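/* E.g. with three distinct registers the addition becomes
   lea (%a1,%a2), %a0, and with a constant a2 it becomes lea c(%a1), %a0:
   one instruction that leaves a1 intact and does not clobber the flags. */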
1972 if (a0 != a1) {
1973 TCGArg c3 = 0;
1974 if (const_a2) {
1975 c3 = a2, a2 = -1;
1976 } else if (a0 == a2) {
1977 /* Watch out for dest = src + dest, since we've removed
1978 the matching constraint on the add. */
1979 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
1980 break;
1981 }
1983 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
1984 break;
1985 }
1986 c = ARITH_ADD;
1987 goto gen_arith;
1988 OP_32_64(sub):
1989 c = ARITH_SUB;
1990 goto gen_arith;
1991 OP_32_64(and):
1992 c = ARITH_AND;
1993 goto gen_arith;
1994 OP_32_64(or):
1995 c = ARITH_OR;
1996 goto gen_arith;
1997 OP_32_64(xor):
1998 c = ARITH_XOR;
1999 goto gen_arith;
2000 gen_arith:
2001 if (const_a2) {
2002 tgen_arithi(s, c + rexw, a0, a2, 0);
2003 } else {
2004 tgen_arithr(s, c + rexw, a0, a2);
2005 }
2006 break;
2008 OP_32_64(andc):
2009 if (const_a2) {
2010 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2011 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2012 } else {
2013 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2014 }
2015 break;
2017 OP_32_64(mul):
2018 if (const_a2) {
2019 int32_t val;
2020 val = a2;
2021 if (val == (int8_t)val) {
2022 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2023 tcg_out8(s, val);
2024 } else {
2025 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2026 tcg_out32(s, val);
2027 }
2028 } else {
2029 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2030 }
2031 break;
2033 OP_32_64(div2):
2034 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2035 break;
2036 OP_32_64(divu2):
2037 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2038 break;
2040 OP_32_64(shl):
2041 /* For small constant 3-operand shift, use LEA. */
2042 if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2043 if (a2 - 1 == 0) {
2044 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2045 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2046 } else {
2047 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2048 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2049 }
2050 break;
2051 }
2052 c = SHIFT_SHL;
2053 vexop = OPC_SHLX;
2054 goto gen_shift_maybe_vex;
2055 OP_32_64(shr):
2056 c = SHIFT_SHR;
2057 vexop = OPC_SHRX;
2058 goto gen_shift_maybe_vex;
2059 OP_32_64(sar):
2060 c = SHIFT_SAR;
2061 vexop = OPC_SARX;
2062 goto gen_shift_maybe_vex;
2063 OP_32_64(rotl):
2064 c = SHIFT_ROL;
2065 goto gen_shift;
2066 OP_32_64(rotr):
2067 c = SHIFT_ROR;
2068 goto gen_shift;
2069 gen_shift_maybe_vex:
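/* The BMI2 SHLX/SHRX/SARX forms used below take the shift count from an
   arbitrary register and leave the flags untouched, so a variable count
   does not have to be forced into %cl first. */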
2070 if (have_bmi2) {
2071 if (!const_a2) {
2072 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2073 break;
2074 }
2075 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2076 }
2077 /* FALLTHRU */
2078 gen_shift:
2079 if (const_a2) {
2080 tcg_out_shifti(s, c + rexw, a0, a2);
2081 } else {
2082 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2083 }
2084 break;
2086 OP_32_64(ctz):
2087 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2088 break;
2089 OP_32_64(clz):
2090 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2091 break;
2092 OP_32_64(ctpop):
2093 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2094 break;
2096 case INDEX_op_brcond_i32:
2097 tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2098 break;
2099 case INDEX_op_setcond_i32:
2100 tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2101 break;
2102 case INDEX_op_movcond_i32:
2103 tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2104 break;
2106 OP_32_64(bswap16):
2107 tcg_out_rolw_8(s, a0);
2108 break;
2109 OP_32_64(bswap32):
2110 tcg_out_bswap32(s, a0);
2111 break;
2113 OP_32_64(neg):
2114 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2115 break;
2116 OP_32_64(not):
2117 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2118 break;
2120 OP_32_64(ext8s):
2121 tcg_out_ext8s(s, a0, a1, rexw);
2122 break;
2123 OP_32_64(ext16s):
2124 tcg_out_ext16s(s, a0, a1, rexw);
2125 break;
2126 OP_32_64(ext8u):
2127 tcg_out_ext8u(s, a0, a1);
2128 break;
2129 OP_32_64(ext16u):
2130 tcg_out_ext16u(s, a0, a1);
2131 break;
2133 case INDEX_op_qemu_ld_i32:
2134 tcg_out_qemu_ld(s, args, 0);
2135 break;
2136 case INDEX_op_qemu_ld_i64:
2137 tcg_out_qemu_ld(s, args, 1);
2138 break;
2139 case INDEX_op_qemu_st_i32:
2140 tcg_out_qemu_st(s, args, 0);
2141 break;
2142 case INDEX_op_qemu_st_i64:
2143 tcg_out_qemu_st(s, args, 1);
2144 break;
2146 OP_32_64(mulu2):
2147 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2148 break;
2149 OP_32_64(muls2):
2150 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2151 break;
2152 OP_32_64(add2):
2153 if (const_args[4]) {
2154 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2155 } else {
2156 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2157 }
2158 if (const_args[5]) {
2159 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2160 } else {
2161 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2162 }
2163 break;
2164 OP_32_64(sub2):
2165 if (const_args[4]) {
2166 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2167 } else {
2168 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2169 }
2170 if (const_args[5]) {
2171 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2172 } else {
2173 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2174 }
2175 break;
2177 #if TCG_TARGET_REG_BITS == 32
2178 case INDEX_op_brcond2_i32:
2179 tcg_out_brcond2(s, args, const_args, 0);
2180 break;
2181 case INDEX_op_setcond2_i32:
2182 tcg_out_setcond2(s, args, const_args);
2183 break;
2184 #else /* TCG_TARGET_REG_BITS == 64 */
2185 case INDEX_op_ld32s_i64:
2186 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2187 break;
2188 case INDEX_op_ld_i64:
2189 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2190 break;
2191 case INDEX_op_st_i64:
2192 if (const_args[0]) {
2193 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2194 tcg_out32(s, a0);
2195 } else {
2196 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2197 }
2198 break;
2200 case INDEX_op_brcond_i64:
2201 tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2202 break;
2203 case INDEX_op_setcond_i64:
2204 tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2205 break;
2206 case INDEX_op_movcond_i64:
2207 tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2208 break;
2210 case INDEX_op_bswap64_i64:
2211 tcg_out_bswap64(s, a0);
2212 break;
2213 case INDEX_op_extu_i32_i64:
2214 case INDEX_op_ext32u_i64:
2215 tcg_out_ext32u(s, a0, a1);
2216 break;
2217 case INDEX_op_ext_i32_i64:
2218 case INDEX_op_ext32s_i64:
2219 tcg_out_ext32s(s, a0, a1);
2220 break;
2221 #endif
2223 OP_32_64(deposit):
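/* The "Q" constraints guarantee byte-addressable registers here, so e.g.
   a deposit at ofs 8, len 8 is a single byte move into the high-byte
   register: encoding the destination as a0 + 4 selects %ah/%ch/%dh/%bh
   when no REX prefix is emitted. */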
2224 if (args[3] == 0 && args[4] == 8) {
2225 /* load bits 0..7 */
2226 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2227 } else if (args[3] == 8 && args[4] == 8) {
2228 /* load bits 8..15 */
2229 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2230 } else if (args[3] == 0 && args[4] == 16) {
2231 /* load bits 0..15 */
2232 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2233 } else {
2234 tcg_abort();
2235 }
2236 break;
2238 case INDEX_op_extract_i64:
2239 if (a2 + args[3] == 32) {
2240 /* This is a 32-bit zero-extending right shift. */
2241 tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2242 tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2243 break;
2244 }
2245 /* FALLTHRU */
2246 case INDEX_op_extract_i32:
2247 /* On the off-chance that we can use the high-byte registers.
2248 Otherwise we emit the same ext16 + shift pattern that we
2249 would have gotten from the normal tcg-op.c expansion. */
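/* E.g. extract a0,a1,8,8 with a1 in %eax..%ebx emits a single
   movzbl from the high-byte register: without a REX prefix, byte
   register code a1 + 4 names %ah/%ch/%dh/%bh, hence the a1 < 4 check. */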
2250 tcg_debug_assert(a2 == 8 && args[3] == 8);
2251 if (a1 < 4 && a0 < 8) {
2252 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2253 } else {
2254 tcg_out_ext16u(s, a0, a1);
2255 tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2256 }
2257 break;
2259 case INDEX_op_sextract_i32:
2260 /* We don't implement sextract_i64, as we cannot sign-extend to
2261 64-bits without using the REX prefix that explicitly excludes
2262 access to the high-byte registers. */
2263 tcg_debug_assert(a2 == 8 && args[3] == 8);
2264 if (a1 < 4 && a0 < 8) {
2265 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2266 } else {
2267 tcg_out_ext16s(s, a0, a1, 0);
2268 tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2269 }
2270 break;
2272 case INDEX_op_mb:
2273 tcg_out_mb(s, a0);
2274 break;
2275 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
2276 case INDEX_op_mov_i64:
2277 case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
2278 case INDEX_op_movi_i64:
2279 case INDEX_op_call: /* Always emitted via tcg_out_call. */
2280 default:
2281 tcg_abort();
2282 }
2284 #undef OP_32_64
2285 }
2287 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
2288 {
2289 static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
2290 static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
2291 static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
2292 static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
2293 static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
2294 static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
2295 static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
2296 static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
2297 static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
2298 static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
2299 static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
2300 static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
2301 static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
2302 static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
2303 static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
2304 static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
2305 static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
2306 static const TCGTargetOpDef r_r_L_L
2307 = { .args_ct_str = { "r", "r", "L", "L" } };
2308 static const TCGTargetOpDef L_L_L_L
2309 = { .args_ct_str = { "L", "L", "L", "L" } };
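/* Roughly, the constraint letters used above mean: "r" any register,
   "q" a byte-addressable register, "Q" a register with an addressable
   second byte (%ah and friends), "L" a register usable for qemu_ld/st
   operands (the softmmu scratch registers are excluded), "a"/"d" the
   fixed %eax/%edx pair, "0"/"1" an input tied to that output, "&" an
   output that may not overlap the inputs, and "e"/"Z"/"I"/"W"/"i" the
   immediate ranges checked by the constant matcher.  See the constraint
   parser earlier in this file for the exact definitions. */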
2311 switch (op) {
2312 case INDEX_op_goto_ptr:
2313 return &r;
2315 case INDEX_op_ld8u_i32:
2316 case INDEX_op_ld8u_i64:
2317 case INDEX_op_ld8s_i32:
2318 case INDEX_op_ld8s_i64:
2319 case INDEX_op_ld16u_i32:
2320 case INDEX_op_ld16u_i64:
2321 case INDEX_op_ld16s_i32:
2322 case INDEX_op_ld16s_i64:
2323 case INDEX_op_ld_i32:
2324 case INDEX_op_ld32u_i64:
2325 case INDEX_op_ld32s_i64:
2326 case INDEX_op_ld_i64:
2327 return &r_r;
2329 case INDEX_op_st8_i32:
2330 case INDEX_op_st8_i64:
2331 return &qi_r;
2332 case INDEX_op_st16_i32:
2333 case INDEX_op_st16_i64:
2334 case INDEX_op_st_i32:
2335 case INDEX_op_st32_i64:
2336 return &ri_r;
2337 case INDEX_op_st_i64:
2338 return &re_r;
2340 case INDEX_op_add_i32:
2341 case INDEX_op_add_i64:
2342 return &r_r_re;
2343 case INDEX_op_sub_i32:
2344 case INDEX_op_sub_i64:
2345 case INDEX_op_mul_i32:
2346 case INDEX_op_mul_i64:
2347 case INDEX_op_or_i32:
2348 case INDEX_op_or_i64:
2349 case INDEX_op_xor_i32:
2350 case INDEX_op_xor_i64:
2351 return &r_0_re;
2353 case INDEX_op_and_i32:
2354 case INDEX_op_and_i64:
2355 {
2356 static const TCGTargetOpDef and
2357 = { .args_ct_str = { "r", "0", "reZ" } };
2358 return &and;
2359 }
2360 break;
2361 case INDEX_op_andc_i32:
2362 case INDEX_op_andc_i64:
2363 {
2364 static const TCGTargetOpDef andc
2365 = { .args_ct_str = { "r", "r", "rI" } };
2366 return &andc;
2367 }
2368 break;
2370 case INDEX_op_shl_i32:
2371 case INDEX_op_shl_i64:
2372 case INDEX_op_shr_i32:
2373 case INDEX_op_shr_i64:
2374 case INDEX_op_sar_i32:
2375 case INDEX_op_sar_i64:
2376 return have_bmi2 ? &r_r_ri : &r_0_ci;
2377 case INDEX_op_rotl_i32:
2378 case INDEX_op_rotl_i64:
2379 case INDEX_op_rotr_i32:
2380 case INDEX_op_rotr_i64:
2381 return &r_0_ci;
2383 case INDEX_op_brcond_i32:
2384 case INDEX_op_brcond_i64:
2385 return &r_re;
2387 case INDEX_op_bswap16_i32:
2388 case INDEX_op_bswap16_i64:
2389 case INDEX_op_bswap32_i32:
2390 case INDEX_op_bswap32_i64:
2391 case INDEX_op_bswap64_i64:
2392 case INDEX_op_neg_i32:
2393 case INDEX_op_neg_i64:
2394 case INDEX_op_not_i32:
2395 case INDEX_op_not_i64:
2396 return &r_0;
2398 case INDEX_op_ext8s_i32:
2399 case INDEX_op_ext8s_i64:
2400 case INDEX_op_ext8u_i32:
2401 case INDEX_op_ext8u_i64:
2402 return &r_q;
2403 case INDEX_op_ext16s_i32:
2404 case INDEX_op_ext16s_i64:
2405 case INDEX_op_ext16u_i32:
2406 case INDEX_op_ext16u_i64:
2407 case INDEX_op_ext32s_i64:
2408 case INDEX_op_ext32u_i64:
2409 case INDEX_op_ext_i32_i64:
2410 case INDEX_op_extu_i32_i64:
2411 case INDEX_op_extract_i32:
2412 case INDEX_op_extract_i64:
2413 case INDEX_op_sextract_i32:
2414 case INDEX_op_ctpop_i32:
2415 case INDEX_op_ctpop_i64:
2416 return &r_r;
2418 case INDEX_op_deposit_i32:
2419 case INDEX_op_deposit_i64:
2420 {
2421 static const TCGTargetOpDef dep
2422 = { .args_ct_str = { "Q", "0", "Q" } };
2423 return &dep;
2424 }
2425 case INDEX_op_setcond_i32:
2426 case INDEX_op_setcond_i64:
2427 {
2428 static const TCGTargetOpDef setc
2429 = { .args_ct_str = { "q", "r", "re" } };
2430 return &setc;
2431 }
2432 case INDEX_op_movcond_i32:
2433 case INDEX_op_movcond_i64:
2434 {
2435 static const TCGTargetOpDef movc
2436 = { .args_ct_str = { "r", "r", "re", "r", "0" } };
2437 return &movc;
2438 }
2439 case INDEX_op_div2_i32:
2440 case INDEX_op_div2_i64:
2441 case INDEX_op_divu2_i32:
2442 case INDEX_op_divu2_i64:
2443 {
2444 static const TCGTargetOpDef div2
2445 = { .args_ct_str = { "a", "d", "0", "1", "r" } };
2446 return &div2;
2447 }
2448 case INDEX_op_mulu2_i32:
2449 case INDEX_op_mulu2_i64:
2450 case INDEX_op_muls2_i32:
2451 case INDEX_op_muls2_i64:
2452 {
2453 static const TCGTargetOpDef mul2
2454 = { .args_ct_str = { "a", "d", "a", "r" } };
2455 return &mul2;
2456 }
2457 case INDEX_op_add2_i32:
2458 case INDEX_op_add2_i64:
2459 case INDEX_op_sub2_i32:
2460 case INDEX_op_sub2_i64:
2461 {
2462 static const TCGTargetOpDef arith2
2463 = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
2464 return &arith2;
2465 }
2466 case INDEX_op_ctz_i32:
2467 case INDEX_op_ctz_i64:
2468 {
2469 static const TCGTargetOpDef ctz[2] = {
2470 { .args_ct_str = { "&r", "r", "r" } },
2471 { .args_ct_str = { "&r", "r", "rW" } },
2472 };
2473 return &ctz[have_bmi1];
2474 }
2475 case INDEX_op_clz_i32:
2476 case INDEX_op_clz_i64:
2477 {
2478 static const TCGTargetOpDef clz[2] = {
2479 { .args_ct_str = { "&r", "r", "r" } },
2480 { .args_ct_str = { "&r", "r", "rW" } },
2481 };
2482 return &clz[have_lzcnt];
2483 }
2485 case INDEX_op_qemu_ld_i32:
2486 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
2487 case INDEX_op_qemu_st_i32:
2488 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
2489 case INDEX_op_qemu_ld_i64:
2490 return (TCG_TARGET_REG_BITS == 64 ? &r_L
2491 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
2492 : &r_r_L_L);
2493 case INDEX_op_qemu_st_i64:
2494 return (TCG_TARGET_REG_BITS == 64 ? &L_L
2495 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
2496 : &L_L_L_L);
2498 case INDEX_op_brcond2_i32:
2499 {
2500 static const TCGTargetOpDef b2
2501 = { .args_ct_str = { "r", "r", "ri", "ri" } };
2502 return &b2;
2503 }
2504 case INDEX_op_setcond2_i32:
2505 {
2506 static const TCGTargetOpDef s2
2507 = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
2508 return &s2;
2509 }
2511 default:
2512 break;
2513 }
2514 return NULL;
2515 }
2517 static int tcg_target_callee_save_regs[] = {
2518 #if TCG_TARGET_REG_BITS == 64
2519 TCG_REG_RBP,
2520 TCG_REG_RBX,
2521 #if defined(_WIN64)
2522 TCG_REG_RDI,
2523 TCG_REG_RSI,
2524 #endif
2525 TCG_REG_R12,
2526 TCG_REG_R13,
2527 TCG_REG_R14, /* Currently used for the global env. */
2528 TCG_REG_R15,
2533 #else
2534 TCG_REG_EBP, /* Currently used for the global env. */
2535 TCG_REG_EBX,
2536 TCG_REG_ESI,
2537 TCG_REG_EDI,
2538 #endif
2539 };
2541 /* Compute frame size via macros, to share between tcg_target_qemu_prologue
2542 and tcg_register_jit. */
2544 #define PUSH_SIZE \
2545 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
2546 * (TCG_TARGET_REG_BITS / 8))
2548 #define FRAME_SIZE \
2549 ((PUSH_SIZE \
2550 + TCG_STATIC_CALL_ARGS_SIZE \
2551 + CPU_TEMP_BUF_NLONGS * sizeof(long) \
2552 + TCG_TARGET_STACK_ALIGN - 1) \
2553 & ~(TCG_TARGET_STACK_ALIGN - 1))
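/* Worked example, assuming the usual TCG_STATIC_CALL_ARGS_SIZE of 128 and
   CPU_TEMP_BUF_NLONGS of 128: a 64-bit non-Windows host saves 6 registers,
   so PUSH_SIZE = (1 + 6) * 8 = 56 (the "+ 1" accounts for the return
   address), and FRAME_SIZE rounds 56 + 128 + 128 * 8 = 1208 up to 1216,
   keeping the stack 16-byte aligned. */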
2555 /* Generate global QEMU prologue and epilogue code */
2556 static void tcg_target_qemu_prologue(TCGContext *s)
2557 {
2558 int i, stack_addend;
2560 /* TB prologue */
2562 /* Reserve some stack space, also for TCG temps. */
2563 stack_addend = FRAME_SIZE - PUSH_SIZE;
2564 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
2565 CPU_TEMP_BUF_NLONGS * sizeof(long));
2567 /* Save all callee saved registers. */
2568 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
2569 tcg_out_push(s, tcg_target_callee_save_regs[i]);
2570 }
2572 #if TCG_TARGET_REG_BITS == 32
2573 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
2574 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
2575 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
2576 /* jmp *tb. */
2577 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
2578 (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
2579 + stack_addend);
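/* On 32-bit both the env pointer and the tb pointer arrive on the stack:
   the "+ 1" slot skips the return address pushed by the call into this
   prologue, the "+ 2" slot skips the return address and the env argument,
   and stack_addend compensates for the frame reserved just above. */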
2580 #else
2581 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
2582 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
2583 /* jmp *tb. */
2584 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
2585 #endif
2587 /*
2588 * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
2589 * and fall through to the rest of the epilogue.
2590 */
2591 s->code_gen_epilogue = s->code_ptr;
2592 tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
2594 /* TB epilogue */
2595 tb_ret_addr = s->code_ptr;
2597 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
2599 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
2600 tcg_out_pop(s, tcg_target_callee_save_regs[i]);
2601 }
2602 tcg_out_opc(s, OPC_RET, 0, 0, 0);
2604 #if !defined(CONFIG_SOFTMMU)
2605 /* Try to set up a segment register to point to guest_base. */
2606 if (guest_base) {
2607 setup_guest_base_seg();
2608 }
2609 #endif
2610 }
2612 static void tcg_target_init(TCGContext *s)
2613 {
2614 #ifdef CONFIG_CPUID_H
2615 unsigned a, b, c, d;
2616 unsigned max = __get_cpuid_max(0, 0);
2618 if (max >= 1) {
2619 __cpuid(1, a, b, c, d);
2620 #ifndef have_cmov
2621 /* For 32-bit, 99% certainty that we're running on hardware that
2622 supports cmov, but we still need to check. In case cmov is not
2623 available, we'll use a small forward branch. */
2624 have_cmov = (d & bit_CMOV) != 0;
2625 #endif
2626 #ifndef have_movbe
2627 /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
2628 need to probe for it. */
2629 have_movbe = (c & bit_MOVBE) != 0;
2630 #endif
2631 #ifdef bit_POPCNT
2632 have_popcnt = (c & bit_POPCNT) != 0;
2633 #endif
2634 }
2636 if (max >= 7) {
2637 /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */
2638 __cpuid_count(7, 0, a, b, c, d);
2639 #ifdef bit_BMI
2640 have_bmi1 = (b & bit_BMI) != 0;
2641 #endif
2642 #ifndef have_bmi2
2643 have_bmi2 = (b & bit_BMI2) != 0;
2644 #endif
2645 }
2646 #endif
2648 #ifndef have_lzcnt
2649 max = __get_cpuid_max(0x80000000, 0);
2650 if (max >= 0x80000001) {
2651 __cpuid(0x80000001, a, b, c, d);
2652 /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs. */
2653 have_lzcnt = (c & bit_LZCNT) != 0;
2654 }
2655 #endif
2657 if (TCG_TARGET_REG_BITS == 64) {
2658 tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff);
2659 tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I64], 0, 0xffff);
2660 } else {
2661 tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xff);
2662 }
2664 tcg_regset_clear(tcg_target_call_clobber_regs);
2665 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
2666 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
2667 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
2668 if (TCG_TARGET_REG_BITS == 64) {
2669 #if !defined(_WIN64)
2670 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
2671 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
2672 #endif
2673 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
2674 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
2675 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
2676 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
2677 }
2679 tcg_regset_clear(s->reserved_regs);
2680 tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
2681 }
2683 typedef struct {
2684 DebugFrameHeader h;
2685 uint8_t fde_def_cfa[4];
2686 uint8_t fde_reg_ofs[14];
2687 } DebugFrame;
2689 /* We're expecting a 2 byte uleb128 encoded value. */
2690 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
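/* With FRAME_SIZE == 1216 (0x4c0), for example, the two uleb128 bytes
   produced by the initializers below are 0xc0 0x09: the low seven bits
   with the continuation bit set, followed by the remaining bits. */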
2692 #if !defined(__ELF__)
2693 /* Host machine without ELF. */
2694 #elif TCG_TARGET_REG_BITS == 64
2695 #define ELF_HOST_MACHINE EM_X86_64
2696 static const DebugFrame debug_frame = {
2697 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
2698 .h.cie.id = -1,
2699 .h.cie.version = 1,
2700 .h.cie.code_align = 1,
2701 .h.cie.data_align = 0x78, /* sleb128 -8 */
2702 .h.cie.return_column = 16,
2704 /* Total FDE size does not include the "len" member. */
2705 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
2707 .fde_def_cfa = {
2708 12, 7, /* DW_CFA_def_cfa %rsp, ... */
2709 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
2710 (FRAME_SIZE >> 7)
2711 },
2712 .fde_reg_ofs = {
2713 0x90, 1, /* DW_CFA_offset, %rip, -8 */
2714 /* The following ordering must match tcg_target_callee_save_regs. */
2715 0x86, 2, /* DW_CFA_offset, %rbp, -16 */
2716 0x83, 3, /* DW_CFA_offset, %rbx, -24 */
2717 0x8c, 4, /* DW_CFA_offset, %r12, -32 */
2718 0x8d, 5, /* DW_CFA_offset, %r13, -40 */
2719 0x8e, 6, /* DW_CFA_offset, %r14, -48 */
2720 0x8f, 7, /* DW_CFA_offset, %r15, -56 */
2721 }
2722 };
2723 #else
2724 #define ELF_HOST_MACHINE EM_386
2725 static const DebugFrame debug_frame = {
2726 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
2727 .h.cie.id = -1,
2728 .h.cie.version = 1,
2729 .h.cie.code_align = 1,
2730 .h.cie.data_align = 0x7c, /* sleb128 -4 */
2731 .h.cie.return_column = 8,
2733 /* Total FDE size does not include the "len" member. */
2734 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
2736 .fde_def_cfa = {
2737 12, 4, /* DW_CFA_def_cfa %esp, ... */
2738 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
2739 (FRAME_SIZE >> 7)
2740 },
2741 .fde_reg_ofs = {
2742 0x88, 1, /* DW_CFA_offset, %eip, -4 */
2743 /* The following ordering must match tcg_target_callee_save_regs. */
2744 0x85, 2, /* DW_CFA_offset, %ebp, -8 */
2745 0x83, 3, /* DW_CFA_offset, %ebx, -12 */
2746 0x86, 4, /* DW_CFA_offset, %esi, -16 */
2747 0x87, 5, /* DW_CFA_offset, %edi, -20 */
2748 }
2749 };
2750 #endif
2752 #if defined(ELF_HOST_MACHINE)
2753 void tcg_register_jit(void *buf, size_t buf_size)
2754 {
2755 tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
2756 }
2757 #endif