/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
25 #include "tcg-pool.inc.c"
27 #ifdef CONFIG_DEBUG_TCG
28 static const char * const tcg_target_reg_names
[TCG_TARGET_NB_REGS
] = {
29 #if TCG_TARGET_REG_BITS == 64
30 "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
32 "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
34 "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
35 "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
36 #if TCG_TARGET_REG_BITS == 64
37 "%xmm8", "%xmm9", "%xmm10", "%xmm11",
38 "%xmm12", "%xmm13", "%xmm14", "%xmm15",
static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64

    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */

#if TCG_TARGET_REG_BITS == 64

static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64

    /* 32 bit mode uses stack based calling convention (GCC default).  */

static const int tcg_target_call_oarg_regs[] = {
#if TCG_TARGET_REG_BITS == 32
/* Constants we accept.  */
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800

/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   x86_32.  */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

/* The host compiler should supply <cpuid.h> to enable runtime feature
   detection, as we're not going to go so far as our own inline assembly.
   If not available, default values will be assumed.  */
#if defined(CONFIG_CPUID_H)
#include "qemu/cpuid.h"
#endif

/* For 64-bit, we always know that CMOV is available.  */
#if TCG_TARGET_REG_BITS == 64
#elif defined(CONFIG_CPUID_H)
static bool have_cmov;
#endif

/* We need these symbols in tcg-target.h, and we can't properly conditionalize
   it there.  Therefore we always define the variable.  */

#ifdef CONFIG_CPUID_H
static bool have_movbe;
static bool have_bmi2;
static bool have_lzcnt;
#else
# define have_movbe 0
# define have_lzcnt 0
#endif

static tcg_insn_unit *tb_ret_addr;
static void patch_reloc(tcg_insn_unit *code_ptr, int type,
                        intptr_t value, intptr_t addend)
        value -= (uintptr_t)code_ptr;
        if (value != (int32_t)value) {
        tcg_patch32(code_ptr, value);
        value -= (uintptr_t)code_ptr;
        if (value != (int8_t)value) {
        tcg_patch8(code_ptr, value);
#if TCG_TARGET_REG_BITS == 64
#define ALL_GENERAL_REGS   0x0000ffffu
#define ALL_VECTOR_REGS    0xffff0000u
#else
#define ALL_GENERAL_REGS   0x000000ffu
#define ALL_VECTOR_REGS    0x00ff0000u
#endif
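/*
 * Added note (not in the original source): each bit in these masks selects
 * the TCG register with that index, so on a 64-bit host bits 0-15 cover
 * %rax..%r15 and bits 16-31 cover %xmm0..%xmm15, while a 32-bit host keeps
 * only %eax..%edi and %xmm0..%xmm7.
 */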
/* parse target specific constraints */
static const char *target_parse_constraint(TCGArgConstraint *ct,
                                           const char *ct_str, TCGType type)
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_EAX);
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_EDX);
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_ESI);
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
        /* A register that can be used as a byte operand.  */
        ct->ct |= TCG_CT_REG;
        ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
        /* A register with an addressable second byte (e.g. %ah).  */
        ct->ct |= TCG_CT_REG;
        /* A general register.  */
        ct->ct |= TCG_CT_REG;
        ct->u.regs |= ALL_GENERAL_REGS;
        /* With TZCNT/LZCNT, we can have operand-size as an input.  */
        ct->ct |= TCG_CT_CONST_WSZ;
        /* A vector register.  */
        ct->ct |= TCG_CT_REG;
        ct->u.regs |= ALL_VECTOR_REGS;
        /* qemu_ld/st address constraint */
        ct->ct |= TCG_CT_REG;
        ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
        tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
        tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
/* test if a constant matches the constraint */
static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
                                         const TCGArgConstraint *arg_ct)
    if (ct & TCG_CT_CONST) {
    if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
    if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
    if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
#if TCG_TARGET_REG_BITS == 64
# define LOWREGMASK(x)  ((x) & 7)
#else
# define LOWREGMASK(x)  (x)
#endif

#define P_EXT           0x100       /* 0x0f opcode prefix */
#define P_EXT38         0x200       /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400       /* 0x66 opcode prefix */
#if TCG_TARGET_REG_BITS == 64
# define P_ADDR32       0x800       /* 0x67 opcode prefix */
# define P_REXW         0x1000      /* Set REX.W = 1 */
# define P_REXB_R       0x2000      /* REG field as byte register */
# define P_REXB_RM      0x4000      /* R/M field as byte register */
# define P_GS           0x8000      /* gs segment override */
#define P_EXT3A         0x10000     /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000     /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000     /* 0xf2 opcode prefix */
#define P_VEXL          0x80000     /* Set VEX.L = 1 */
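/*
 * Worked example (added commentary, not in the original): these P_* flags
 * name prefix bytes that tcg_out_opc() emits ahead of the low opcode byte.
 * For the opcodes defined below, OPC_IMUL_GvEv = (0xaf | P_EXT) is emitted
 * as "0f af", and OPC_PADDW = (0xfd | P_EXT | P_DATA16) becomes "66 0f fd".
 */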
#define OPC_ARITH_EvIz    (0x81)
#define OPC_ARITH_EvIb    (0x83)
#define OPC_ARITH_GvEv    (0x03)        /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN          (0xf2 | P_EXT38)
#define OPC_ADD_GvEv      (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_BLENDPS       (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF           (0xbc | P_EXT)
#define OPC_BSR           (0xbd | P_EXT)
#define OPC_BSWAP         (0xc8 | P_EXT)
#define OPC_CALL_Jz       (0xe8)
#define OPC_CMOVCC        (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv      (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32       (0x48)
#define OPC_IMUL_GvEv     (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb   (0x6b)
#define OPC_IMUL_GvEvIz   (0x69)
#define OPC_INC_r32       (0x40)
#define OPC_JCC_long      (0x80 | P_EXT)  /* ... plus condition code */
#define OPC_JCC_short     (0x70)          /* ... plus condition code */
#define OPC_JMP_long      (0xe9)
#define OPC_JMP_short     (0xeb)
#define OPC_LEA           (0x8d)
#define OPC_LZCNT         (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv     (0x88)          /* stores, more or less */
#define OPC_MOVL_EvGv     (0x89)          /* stores, more or less */
#define OPC_MOVL_GvEv     (0x8b)          /* loads, more or less */
#define OPC_MOVB_EvIz     (0xc6)
#define OPC_MOVL_EvIz     (0xc7)
#define OPC_MOVL_Iv       (0xb8)
#define OPC_MOVBE_GyMy    (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy    (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy     (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy     (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP       (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx   (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx   (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx   (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx   (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq     (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq     (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL        (0xbe | P_EXT)
#define OPC_MOVSWL        (0xbf | P_EXT)
#define OPC_MOVSLQ        (0x63 | P_REXW)
#define OPC_MOVZBL        (0xb6 | P_EXT)
#define OPC_MOVZWL        (0xb7 | P_EXT)
#define OPC_PACKSSDW      (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB      (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW      (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB      (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB         (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW         (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD         (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ         (0xd4 | P_EXT | P_DATA16)
#define OPC_PAND          (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN         (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW       (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB       (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW       (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD       (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ       (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB       (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW       (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD       (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ       (0x37 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXBW      (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD      (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ      (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW      (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD      (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ      (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW        (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD        (0x40 | P_EXT38 | P_DATA16)
#define OPC_POR           (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB        (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD        (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW       (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW       (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib    (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib    (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTQ_Ib    (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSUBB         (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW         (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD         (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ         (0xfb | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW     (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD     (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ     (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ    (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW     (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD     (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ     (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ    (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR          (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32       (0x58)
#define OPC_POPCNT        (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32      (0x50)
#define OPC_PUSH_Iv       (0x68)
#define OPC_PUSH_Ib       (0x6a)
#define OPC_RET           (0xc3)
#define OPC_SETCC         (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1       (0xd1)
#define OPC_SHIFT_Ib      (0xc1)
#define OPC_SHIFT_cl      (0xd3)
#define OPC_SARX          (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS        (0xc6 | P_EXT)
#define OPC_SHLX          (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX          (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_TESTL         (0x85)
#define OPC_TZCNT         (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2           (0x0b | P_EXT)
#define OPC_VPBLENDD      (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB     (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPBROADCASTB  (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW  (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD  (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ  (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ        (0x00 | P_EXT3A | P_DATA16 | P_REXW)
#define OPC_VPERM2I128    (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VZEROUPPER    (0x77 | P_EXT)
#define OPC_XCHG_ax_r32   (0x90)

#define OPC_GRP3_Ev       (0xf7)
#define OPC_GRP5          (0xff)
#define OPC_GRP14         (0x73 | P_EXT | P_DATA16)
/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH.  */

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev     0
#define EXT5_DEC_Ev     1
#define EXT5_CALLN_Ev   2
#define EXT5_JMPN_Ev    4

/* Condition codes to be added to OPC_JCC_{long,short}.  */
static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};
#if TCG_TARGET_REG_BITS == 64
static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
    if (opc & P_DATA16) {
        /* We should never be asking for both 16 and 64-bit operation.  */
        tcg_debug_assert((opc & P_REXW) == 0);
    if (opc & P_ADDR32) {
    if (opc & P_SIMDF3) {
    } else if (opc & P_SIMDF2) {

        rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
        rex |= (r & 8) >> 1;                /* REX.R */
        rex |= (x & 8) >> 2;                /* REX.X */
        rex |= (rm & 8) >> 3;               /* REX.B */

        /* P_REXB_{R,RM} indicates that the given register is the low byte.
           For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
           as otherwise the encoding indicates %[abcd]h.  Note that the values
           that are ORed in merely indicate that the REX byte must be present;
           those bits get discarded in output.  */
        rex |= opc & (r >= 4 ? P_REXB_R : 0);
        rex |= opc & (rm >= 4 ? P_REXB_RM : 0);

        tcg_out8(s, (uint8_t)(rex | 0x40));

    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
    } else if (opc & P_EXT3A) {
#else
static void tcg_out_opc(TCGContext *s, int opc)
    if (opc & P_DATA16) {
    if (opc & P_SIMDF3) {
    } else if (opc & P_SIMDF2) {
    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
    } else if (opc & P_EXT3A) {

/* Discard the register arguments to tcg_out_opc early, so as not to penalize
   the 32-bit compilation paths.  This method works with all versions of gcc,
   whereas relying on optimization may not be able to exclude them.  */
#define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc)
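/*
 * Illustrative example (added commentary): for r = %r8 (index 8),
 * rm = %rbx (index 3) and P_REXW set, the REX byte assembled above is
 * 0x40 | 0x08 (W) | 0x04 (R) | 0x00 (B) = 0x4c.
 */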
static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
    tcg_out_opc(s, opc, r, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));

static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
                            int rm, int index)
    /* Use the two byte form if possible, which cannot encode
       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
        && ((rm | index) & 8) == 0) {
        /* Two byte VEX prefix.  */
        tmp = (r & 8 ? 0 : 0x80);          /* VEX.R */
        /* Three byte VEX prefix.  */
        } else if (opc & P_EXT38) {
        } else if (opc & P_EXT) {
            g_assert_not_reached();

        tmp |= (r & 8 ? 0 : 0x80);         /* VEX.R */
        tmp |= (index & 8 ? 0 : 0x40);     /* VEX.X */
        tmp |= (rm & 8 ? 0 : 0x20);        /* VEX.B */
        tmp = (opc & P_REXW ? 0x80 : 0);   /* VEX.W */
    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
    if (opc & P_DATA16) {
    } else if (opc & P_SIMDF3) {
    } else if (opc & P_SIMDF2) {
    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */

static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
    tcg_out_vex_opc(s, opc, r, v, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
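/*
 * Worked example (added commentary): with r, v and rm all below 8 and only
 * P_EXT plus P_DATA16 set, the two byte VEX form applies, so OPC_PADDD with
 * v = %xmm1 yields the prefix "c5 f1" followed by the opcode byte 0xfe,
 * i.e. a vpaddd taking %xmm1 as the second source operand.
 */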
/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
   We handle either RM and INDEX missing with a negative value.  In 64-bit
   mode for absolute addresses, ~RM is the size of the immediate operand
   that will follow the instruction.  */

static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
                               int shift, intptr_t offset)
    if (index < 0 && rm < 0) {
        if (TCG_TARGET_REG_BITS == 64) {
            /* Try for a rip-relative addressing mode.  This has replaced
               the 32-bit-mode absolute addressing encoding.  */
            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
            intptr_t disp = offset - pc;
            if (disp == (int32_t)disp) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);

            /* Try for an absolute address encoding.  This requires the
               use of the MODRM+SIB encoding and is therefore larger than
               rip-relative addressing.  */
            if (offset == (int32_t)offset) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
                tcg_out8(s, (4 << 3) | 5);
                tcg_out32(s, offset);

            /* ??? The memory isn't directly addressable.  */
            g_assert_not_reached();

        /* Absolute address.  */
        tcg_out8(s, (r << 3) | 5);
        tcg_out32(s, offset);

    /* Find the length of the immediate addend.  Note that the encoding
       that would be used for (%ebp) indicates absolute addressing.  */
        mod = 0, len = 4, rm = 5;
    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
    } else if (offset == (int8_t)offset) {

    /* Use a single byte MODRM format if possible.  Note that the encoding
       that would be used for %esp is the escape to the two byte form.  */
    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
        /* Single byte MODRM format.  */
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
        /* Two byte MODRM+SIB format.  */

        /* Note that the encoding that would place %esp into the index
           field indicates no index register.  In 64-bit mode, the REX.X
           bit counts, so %r12 can be used as the index.  */
        tcg_debug_assert(index != TCG_REG_ESP);

        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));

    } else if (len == 4) {
        tcg_out32(s, offset);
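/*
 * Illustrative example (added commentary): a simple load such as
 * "movl 8(%ecx), %eax" takes the disp8 path above with mod = 0x40,
 * reg = 0 and rm = 1, producing the bytes "8b 41 08".
 */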
static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
                                     int index, int shift, intptr_t offset)
    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);

static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
                                         int rm, int index, int shift,
                                         intptr_t offset)
    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);

/* A simplification of the above with no index or shift.  */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
                                        int rm, intptr_t offset)
    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);

static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
                                            int v, int rm, intptr_t offset)
    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
    tcg_out_opc(s, opc, r, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);

/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);

/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
    /* Propagate an opcode prefix, such as P_REXW.  */
    int ext = subop & ~0x7;
    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
        tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
        tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
        tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
        g_assert_not_reached();
static void tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                            TCGReg r, TCGReg a)
        static const int dup_insn[4] = {
            OPC_VPBROADCASTB, OPC_VPBROADCASTW,
            OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm(s, dup_insn[vece] + vex_l, r, 0, a);
            /* ??? With zero in a register, use PSHUFB.  */
            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
            /* imm8 operand: all output lanes selected from input lane 0.  */
            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
            g_assert_not_reached();
static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
    if (TCG_TARGET_REG_BITS == 64) {
        if (type == TCG_TYPE_V64) {
            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
        } else if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
    } else if (have_avx2) {
        tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
        tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy, ret);
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
        tcg_out_dup_vec(s, type, MO_32, ret, ret);
static void tcg_out_movi(TCGContext *s, TCGType type,
                         TCGReg ret, tcg_target_long arg)
    tcg_target_long diff;

#if TCG_TARGET_REG_BITS == 64
        tcg_debug_assert(ret >= 16);
        tcg_out_dupi_vec(s, type, ret, arg);
        g_assert_not_reached();

        tgen_arithr(s, ARITH_XOR, ret, ret);
    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
    if (arg == (int32_t)arg) {
        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);

    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
    diff = arg - ((uintptr_t)s->code_ptr + 7);
    if (diff == (int32_t)diff) {
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);

    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
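/*
 * Added note (illustrative): the cases above are ordered by code size --
 * xor for zero (2-3 bytes), a 5 byte movl for values that fit in 32 bits,
 * a 7 byte sign-extending movl or pc-relative lea, and only then the full
 * 10 byte movq with a 64-bit immediate.
 */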
static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
    /* Given the strength of x86 memory ordering, we only need care for
       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
       faster than "mfence", so don't bother with the sse insn.  */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
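/*
 * Added note (illustrative): the code emitted here implements the
 * "lock orl $0,0(%esp)" idiom described in the comment above, a locked
 * read-modify-write of a stack slot that serves as the store-load barrier.
 */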
static inline void tcg_out_push(TCGContext *s, int reg)
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);

static inline void tcg_out_pop(TCGContext *s, int reg)
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
        tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx, ret, 0, arg1, arg2);
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        g_assert_not_reached();

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
        tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx, arg, 0, arg1, arg2);
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                 arg, 0, arg1, arg2);
        g_assert_not_reached();

static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                        TCGReg base, intptr_t ofs)
    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
        if (val != (int32_t)val) {
    } else if (type != TCG_TYPE_I32) {
    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
    /* Propagate an opcode prefix, such as P_DATA16.  */
    int ext = subopc & ~0x7;
        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);

static inline void tcg_out_bswap32(TCGContext *s, int reg)
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);

static inline void tcg_out_rolw_8(TCGContext *s, int reg)
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);

static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);

static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);

static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
    tcg_out_modrm(s, OPC_MOVZWL, dest, src);

static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);

static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
    /* 32-bit mov zero extends.  */
    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);

static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);

static inline void tcg_out_bswap64(TCGContext *s, int reg)
    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
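/*
 * Illustrative example (added commentary): tcg_out_ext16u() emits
 * "0f b7 /r" (movzwl), and tcg_out_ext16s() with P_REXW becomes the
 * sign-extending "movswq" into the full 64-bit register.
 */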
static void tgen_arithi(TCGContext *s, int c, int r0,
                        tcg_target_long val, int cf)
    if (TCG_TARGET_REG_BITS == 64) {

    /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
       partial flags update stalls on Pentium4 and are not recommended
       by current Intel optimization manuals.  */
    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
        int is_inc = (c == ARITH_ADD) ^ (val < 0);
        if (TCG_TARGET_REG_BITS == 64) {
            /* The single-byte increment encodings are re-tasked as the
               REX prefixes.  Use the MODRM encoding.  */
            tcg_out_modrm(s, OPC_GRP5 + rexw,
                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);

    if (c == ARITH_AND) {
        if (TCG_TARGET_REG_BITS == 64) {
            if (val == 0xffffffffu) {
                tcg_out_ext32u(s, r0, r0);
            if (val == (uint32_t)val) {
                /* AND with no high bits set can use a 32-bit operation.  */
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_ext8u(s, r0, r0);
        if (val == 0xffffu) {
            tcg_out_ext16u(s, r0, r0);

    if (val == (int8_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
    if (rexw == 0 || val == (int32_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);

static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
    tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
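/*
 * Added example (illustrative): tgen_arithi(s, ARITH_SUB + P_REXW, r, 16, 0)
 * fits the int8_t path above and emits the 4 byte "subq $16, %reg"
 * (REX.W 83 /5 ib); only constants outside that range fall through to the
 * longer imm32 encoding.
 */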
/* Use SMALL != 0 to force a short forward branch.  */
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
        val = tcg_pcrel_diff(s, l->u.value_ptr);
        if ((int8_t)val1 == val1) {
                tcg_out8(s, OPC_JMP_short);
                tcg_out8(s, OPC_JCC_short + opc);
            tcg_out8(s, OPC_JMP_long);
            tcg_out32(s, val - 5);
            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
            tcg_out32(s, val - 6);
        tcg_out8(s, OPC_JMP_short);
        tcg_out8(s, OPC_JCC_short + opc);
        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
        tcg_out8(s, OPC_JMP_long);
        tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);

static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
                        int const_arg2, int rexw)
            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
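/*
 * Added note (illustrative): comparing a register against constant zero is
 * rewritten above as "test reg,reg", which produces the flags needed for
 * the conditions used here in one byte less than "cmp $0, reg".
 */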
1280 static void tcg_out_brcond32(TCGContext
*s
, TCGCond cond
,
1281 TCGArg arg1
, TCGArg arg2
, int const_arg2
,
1282 TCGLabel
*label
, int small
)
1284 tcg_out_cmp(s
, arg1
, arg2
, const_arg2
, 0);
1285 tcg_out_jxx(s
, tcg_cond_to_jcc
[cond
], label
, small
);
1288 #if TCG_TARGET_REG_BITS == 64
1289 static void tcg_out_brcond64(TCGContext
*s
, TCGCond cond
,
1290 TCGArg arg1
, TCGArg arg2
, int const_arg2
,
1291 TCGLabel
*label
, int small
)
1293 tcg_out_cmp(s
, arg1
, arg2
, const_arg2
, P_REXW
);
1294 tcg_out_jxx(s
, tcg_cond_to_jcc
[cond
], label
, small
);
1297 /* XXX: we implement it at the target level to avoid having to
1298 handle cross basic blocks temporaries */
1299 static void tcg_out_brcond2(TCGContext
*s
, const TCGArg
*args
,
1300 const int *const_args
, int small
)
1302 TCGLabel
*label_next
= gen_new_label();
1303 TCGLabel
*label_this
= arg_label(args
[5]);
1307 tcg_out_brcond32(s
, TCG_COND_NE
, args
[0], args
[2], const_args
[2],
1309 tcg_out_brcond32(s
, TCG_COND_EQ
, args
[1], args
[3], const_args
[3],
1313 tcg_out_brcond32(s
, TCG_COND_NE
, args
[0], args
[2], const_args
[2],
1315 tcg_out_brcond32(s
, TCG_COND_NE
, args
[1], args
[3], const_args
[3],
1319 tcg_out_brcond32(s
, TCG_COND_LT
, args
[1], args
[3], const_args
[3],
1321 tcg_out_jxx(s
, JCC_JNE
, label_next
, 1);
1322 tcg_out_brcond32(s
, TCG_COND_LTU
, args
[0], args
[2], const_args
[2],
1326 tcg_out_brcond32(s
, TCG_COND_LT
, args
[1], args
[3], const_args
[3],
1328 tcg_out_jxx(s
, JCC_JNE
, label_next
, 1);
1329 tcg_out_brcond32(s
, TCG_COND_LEU
, args
[0], args
[2], const_args
[2],
1333 tcg_out_brcond32(s
, TCG_COND_GT
, args
[1], args
[3], const_args
[3],
1335 tcg_out_jxx(s
, JCC_JNE
, label_next
, 1);
1336 tcg_out_brcond32(s
, TCG_COND_GTU
, args
[0], args
[2], const_args
[2],
1340 tcg_out_brcond32(s
, TCG_COND_GT
, args
[1], args
[3], const_args
[3],
1342 tcg_out_jxx(s
, JCC_JNE
, label_next
, 1);
1343 tcg_out_brcond32(s
, TCG_COND_GEU
, args
[0], args
[2], const_args
[2],
1347 tcg_out_brcond32(s
, TCG_COND_LTU
, args
[1], args
[3], const_args
[3],
1349 tcg_out_jxx(s
, JCC_JNE
, label_next
, 1);
1350 tcg_out_brcond32(s
, TCG_COND_LTU
, args
[0], args
[2], const_args
[2],
1354 tcg_out_brcond32(s
, TCG_COND_LTU
, args
[1], args
[3], const_args
[3],
1356 tcg_out_jxx(s
, JCC_JNE
, label_next
, 1);
1357 tcg_out_brcond32(s
, TCG_COND_LEU
, args
[0], args
[2], const_args
[2],
1361 tcg_out_brcond32(s
, TCG_COND_GTU
, args
[1], args
[3], const_args
[3],
1363 tcg_out_jxx(s
, JCC_JNE
, label_next
, 1);
1364 tcg_out_brcond32(s
, TCG_COND_GTU
, args
[0], args
[2], const_args
[2],
1368 tcg_out_brcond32(s
, TCG_COND_GTU
, args
[1], args
[3], const_args
[3],
1370 tcg_out_jxx(s
, JCC_JNE
, label_next
, 1);
1371 tcg_out_brcond32(s
, TCG_COND_GEU
, args
[0], args
[2], const_args
[2],
1377 tcg_out_label(s
, label_next
, s
->code_ptr
);
1381 static void tcg_out_setcond32(TCGContext
*s
, TCGCond cond
, TCGArg dest
,
1382 TCGArg arg1
, TCGArg arg2
, int const_arg2
)
1384 tcg_out_cmp(s
, arg1
, arg2
, const_arg2
, 0);
1385 tcg_out_modrm(s
, OPC_SETCC
| tcg_cond_to_jcc
[cond
], 0, dest
);
1386 tcg_out_ext8u(s
, dest
, dest
);
1389 #if TCG_TARGET_REG_BITS == 64
1390 static void tcg_out_setcond64(TCGContext
*s
, TCGCond cond
, TCGArg dest
,
1391 TCGArg arg1
, TCGArg arg2
, int const_arg2
)
1393 tcg_out_cmp(s
, arg1
, arg2
, const_arg2
, P_REXW
);
1394 tcg_out_modrm(s
, OPC_SETCC
| tcg_cond_to_jcc
[cond
], 0, dest
);
1395 tcg_out_ext8u(s
, dest
, dest
);
1398 static void tcg_out_setcond2(TCGContext
*s
, const TCGArg
*args
,
1399 const int *const_args
)
1402 TCGLabel
*label_true
, *label_over
;
1404 memcpy(new_args
, args
+1, 5*sizeof(TCGArg
));
1406 if (args
[0] == args
[1] || args
[0] == args
[2]
1407 || (!const_args
[3] && args
[0] == args
[3])
1408 || (!const_args
[4] && args
[0] == args
[4])) {
1409 /* When the destination overlaps with one of the argument
1410 registers, don't do anything tricky. */
1411 label_true
= gen_new_label();
1412 label_over
= gen_new_label();
1414 new_args
[5] = label_arg(label_true
);
1415 tcg_out_brcond2(s
, new_args
, const_args
+1, 1);
1417 tcg_out_movi(s
, TCG_TYPE_I32
, args
[0], 0);
1418 tcg_out_jxx(s
, JCC_JMP
, label_over
, 1);
1419 tcg_out_label(s
, label_true
, s
->code_ptr
);
1421 tcg_out_movi(s
, TCG_TYPE_I32
, args
[0], 1);
1422 tcg_out_label(s
, label_over
, s
->code_ptr
);
1424 /* When the destination does not overlap one of the arguments,
1425 clear the destination first, jump if cond false, and emit an
1426 increment in the true case. This results in smaller code. */
1428 tcg_out_movi(s
, TCG_TYPE_I32
, args
[0], 0);
1430 label_over
= gen_new_label();
1431 new_args
[4] = tcg_invert_cond(new_args
[4]);
1432 new_args
[5] = label_arg(label_over
);
1433 tcg_out_brcond2(s
, new_args
, const_args
+1, 1);
1435 tgen_arithi(s
, ARITH_ADD
, args
[0], 1, 0);
1436 tcg_out_label(s
, label_over
, s
->code_ptr
);
1441 static void tcg_out_cmov(TCGContext
*s
, TCGCond cond
, int rexw
,
1442 TCGReg dest
, TCGReg v1
)
1445 tcg_out_modrm(s
, OPC_CMOVCC
| tcg_cond_to_jcc
[cond
] | rexw
, dest
, v1
);
1447 TCGLabel
*over
= gen_new_label();
1448 tcg_out_jxx(s
, tcg_cond_to_jcc
[tcg_invert_cond(cond
)], over
, 1);
1449 tcg_out_mov(s
, TCG_TYPE_I32
, dest
, v1
);
1450 tcg_out_label(s
, over
, s
->code_ptr
);
1454 static void tcg_out_movcond32(TCGContext
*s
, TCGCond cond
, TCGReg dest
,
1455 TCGReg c1
, TCGArg c2
, int const_c2
,
1458 tcg_out_cmp(s
, c1
, c2
, const_c2
, 0);
1459 tcg_out_cmov(s
, cond
, 0, dest
, v1
);
1462 #if TCG_TARGET_REG_BITS == 64
1463 static void tcg_out_movcond64(TCGContext
*s
, TCGCond cond
, TCGReg dest
,
1464 TCGReg c1
, TCGArg c2
, int const_c2
,
1467 tcg_out_cmp(s
, c1
, c2
, const_c2
, P_REXW
);
1468 tcg_out_cmov(s
, cond
, P_REXW
, dest
, v1
);
1472 static void tcg_out_ctz(TCGContext
*s
, int rexw
, TCGReg dest
, TCGReg arg1
,
1473 TCGArg arg2
, bool const_a2
)
1476 tcg_out_modrm(s
, OPC_TZCNT
+ rexw
, dest
, arg1
);
1478 tcg_debug_assert(arg2
== (rexw
? 64 : 32));
1480 tcg_debug_assert(dest
!= arg2
);
1481 tcg_out_cmov(s
, TCG_COND_LTU
, rexw
, dest
, arg2
);
1484 tcg_debug_assert(dest
!= arg2
);
1485 tcg_out_modrm(s
, OPC_BSF
+ rexw
, dest
, arg1
);
1486 tcg_out_cmov(s
, TCG_COND_EQ
, rexw
, dest
, arg2
);
1490 static void tcg_out_clz(TCGContext
*s
, int rexw
, TCGReg dest
, TCGReg arg1
,
1491 TCGArg arg2
, bool const_a2
)
1494 tcg_out_modrm(s
, OPC_LZCNT
+ rexw
, dest
, arg1
);
1496 tcg_debug_assert(arg2
== (rexw
? 64 : 32));
1498 tcg_debug_assert(dest
!= arg2
);
1499 tcg_out_cmov(s
, TCG_COND_LTU
, rexw
, dest
, arg2
);
1502 tcg_debug_assert(!const_a2
);
1503 tcg_debug_assert(dest
!= arg1
);
1504 tcg_debug_assert(dest
!= arg2
);
1506 /* Recall that the output of BSR is the index not the count. */
1507 tcg_out_modrm(s
, OPC_BSR
+ rexw
, dest
, arg1
);
1508 tgen_arithi(s
, ARITH_XOR
+ rexw
, dest
, rexw
? 63 : 31, 0);
1510 /* Since we have destroyed the flags from BSR, we have to re-test. */
1511 tcg_out_cmp(s
, arg1
, 0, 1, rexw
);
1512 tcg_out_cmov(s
, TCG_COND_EQ
, rexw
, dest
, arg2
);
1516 static void tcg_out_branch(TCGContext
*s
, int call
, tcg_insn_unit
*dest
)
1518 intptr_t disp
= tcg_pcrel_diff(s
, dest
) - 5;
1520 if (disp
== (int32_t)disp
) {
1521 tcg_out_opc(s
, call
? OPC_CALL_Jz
: OPC_JMP_long
, 0, 0, 0);
        /* rip-relative addressing into the constant pool.
           This is 6 + 8 = 14 bytes, as compared to using an
           immediate load 10 + 6 = 16 bytes, plus we may
           be able to re-use the pool constant for more calls.  */
1528 tcg_out_opc(s
, OPC_GRP5
, 0, 0, 0);
1529 tcg_out8(s
, (call
? EXT5_CALLN_Ev
: EXT5_JMPN_Ev
) << 3 | 5);
1530 new_pool_label(s
, (uintptr_t)dest
, R_386_PC32
, s
->code_ptr
, -4);
1535 static inline void tcg_out_call(TCGContext
*s
, tcg_insn_unit
*dest
)
1537 tcg_out_branch(s
, 1, dest
);
1540 static void tcg_out_jmp(TCGContext
*s
, tcg_insn_unit
*dest
)
1542 tcg_out_branch(s
, 0, dest
);
1545 static void tcg_out_nopn(TCGContext
*s
, int n
)
    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
     * "xchg %eax,%eax", forming "xchg %ax,%ax".  All cores accept the
     * duplicate prefix, and all of the interesting recent cores can
     * decode and discard the duplicates in a single cycle.
     */
1553 tcg_debug_assert(n
>= 1);
1554 for (i
= 1; i
< n
; ++i
) {
1560 #if defined(CONFIG_SOFTMMU)
1561 #include "tcg-ldst.inc.c"
1563 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1564 * int mmu_idx, uintptr_t ra)
1566 static void * const qemu_ld_helpers
[16] = {
1567 [MO_UB
] = helper_ret_ldub_mmu
,
1568 [MO_LEUW
] = helper_le_lduw_mmu
,
1569 [MO_LEUL
] = helper_le_ldul_mmu
,
1570 [MO_LEQ
] = helper_le_ldq_mmu
,
1571 [MO_BEUW
] = helper_be_lduw_mmu
,
1572 [MO_BEUL
] = helper_be_ldul_mmu
,
1573 [MO_BEQ
] = helper_be_ldq_mmu
,
1576 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1577 * uintxx_t val, int mmu_idx, uintptr_t ra)
1579 static void * const qemu_st_helpers
[16] = {
1580 [MO_UB
] = helper_ret_stb_mmu
,
1581 [MO_LEUW
] = helper_le_stw_mmu
,
1582 [MO_LEUL
] = helper_le_stl_mmu
,
1583 [MO_LEQ
] = helper_le_stq_mmu
,
1584 [MO_BEUW
] = helper_be_stw_mmu
,
1585 [MO_BEUL
] = helper_be_stl_mmu
,
1586 [MO_BEQ
] = helper_be_stq_mmu
,
/* Perform the TLB load and compare.

   ADDRLO and ADDRHI contain the low and high part of the address.

   MEM_INDEX and S_BITS are the memory context and log2 size of the load.

   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
   This should be offsetof addr_read or addr_write.

   LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
   positions of the displacements of forward jumps to the TLB miss case.

   Second argument register is loaded with the low part of the address.
   In the TLB hit case, it has been adjusted as indicated by the TLB
   and so is a host address.  In the TLB miss case, it continues to
   hold a guest address.

   First argument register is clobbered.  */
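/*
 * Illustrative sketch of the fast path generated below (added commentary):
 * the guest address is shifted right by TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS
 * and masked to index the per-mmu-index TLB table, the page-aligned address
 * (plus any required alignment bits) is compared against the cached tag, and
 * on a hit the entry's addend turns the guest address into a host address
 * with a single add.
 */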
static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
                                    int mem_index, TCGMemOp opc,
                                    tcg_insn_unit **label_ptr, int which)
    const TCGReg r0 = TCG_REG_L0;
    const TCGReg r1 = TCG_REG_L1;
    TCGType ttype = TCG_TYPE_I32;
    TCGType tlbtype = TCG_TYPE_I32;
    int trexw = 0, hrexw = 0, tlbrexw = 0;
    unsigned a_bits = get_alignment_bits(opc);
    unsigned s_bits = opc & MO_SIZE;
    unsigned a_mask = (1 << a_bits) - 1;
    unsigned s_mask = (1 << s_bits) - 1;
    target_ulong tlb_mask;
1625 if (TCG_TARGET_REG_BITS
== 64) {
1626 if (TARGET_LONG_BITS
== 64) {
1627 ttype
= TCG_TYPE_I64
;
1630 if (TCG_TYPE_PTR
== TCG_TYPE_I64
) {
1632 if (TARGET_PAGE_BITS
+ CPU_TLB_BITS
> 32) {
1633 tlbtype
= TCG_TYPE_I64
;
1639 tcg_out_mov(s
, tlbtype
, r0
, addrlo
);
1640 /* If the required alignment is at least as large as the access, simply
1641 copy the address and mask. For lesser alignments, check that we don't
1642 cross pages for the complete access. */
1643 if (a_bits
>= s_bits
) {
1644 tcg_out_mov(s
, ttype
, r1
, addrlo
);
1646 tcg_out_modrm_offset(s
, OPC_LEA
+ trexw
, r1
, addrlo
, s_mask
- a_mask
);
1648 tlb_mask
= (target_ulong
)TARGET_PAGE_MASK
| a_mask
;
1650 tcg_out_shifti(s
, SHIFT_SHR
+ tlbrexw
, r0
,
1651 TARGET_PAGE_BITS
- CPU_TLB_ENTRY_BITS
);
1653 tgen_arithi(s
, ARITH_AND
+ trexw
, r1
, tlb_mask
, 0);
1654 tgen_arithi(s
, ARITH_AND
+ tlbrexw
, r0
,
1655 (CPU_TLB_SIZE
- 1) << CPU_TLB_ENTRY_BITS
, 0);
1657 tcg_out_modrm_sib_offset(s
, OPC_LEA
+ hrexw
, r0
, TCG_AREG0
, r0
, 0,
1658 offsetof(CPUArchState
, tlb_table
[mem_index
][0])
1662 tcg_out_modrm_offset(s
, OPC_CMP_GvEv
+ trexw
, r1
, r0
, 0);
1664 /* Prepare for both the fast path add of the tlb addend, and the slow
1665 path function argument setup. There are two cases worth note:
1666 For 32-bit guest and x86_64 host, MOVL zero-extends the guest address
1667 before the fastpath ADDQ below. For 64-bit guest and x32 host, MOVQ
1668 copies the entire guest address for the slow path, while truncation
1669 for the 32-bit host happens with the fastpath ADDL below. */
1670 tcg_out_mov(s
, ttype
, r1
, addrlo
);
1673 tcg_out_opc(s
, OPC_JCC_long
+ JCC_JNE
, 0, 0, 0);
1674 label_ptr
[0] = s
->code_ptr
;
1677 if (TARGET_LONG_BITS
> TCG_TARGET_REG_BITS
) {
1678 /* cmp 4(r0), addrhi */
1679 tcg_out_modrm_offset(s
, OPC_CMP_GvEv
, addrhi
, r0
, 4);
1682 tcg_out_opc(s
, OPC_JCC_long
+ JCC_JNE
, 0, 0, 0);
1683 label_ptr
[1] = s
->code_ptr
;
1689 /* add addend(r0), r1 */
1690 tcg_out_modrm_offset(s
, OPC_ADD_GvEv
+ hrexw
, r1
, r0
,
1691 offsetof(CPUTLBEntry
, addend
) - which
);
1695 * Record the context of a call to the out of line helper code for the slow path
1696 * for a load or store, so that we can later generate the correct helper code
1698 static void add_qemu_ldst_label(TCGContext
*s
, bool is_ld
, TCGMemOpIdx oi
,
1699 TCGReg datalo
, TCGReg datahi
,
1700 TCGReg addrlo
, TCGReg addrhi
,
1701 tcg_insn_unit
*raddr
,
1702 tcg_insn_unit
**label_ptr
)
1704 TCGLabelQemuLdst
*label
= new_ldst_label(s
);
1706 label
->is_ld
= is_ld
;
1708 label
->datalo_reg
= datalo
;
1709 label
->datahi_reg
= datahi
;
1710 label
->addrlo_reg
= addrlo
;
1711 label
->addrhi_reg
= addrhi
;
1712 label
->raddr
= raddr
;
1713 label
->label_ptr
[0] = label_ptr
[0];
1714 if (TARGET_LONG_BITS
> TCG_TARGET_REG_BITS
) {
1715 label
->label_ptr
[1] = label_ptr
[1];
1720 * Generate code for the slow path for a load at the end of block
1722 static void tcg_out_qemu_ld_slow_path(TCGContext
*s
, TCGLabelQemuLdst
*l
)
1724 TCGMemOpIdx oi
= l
->oi
;
1725 TCGMemOp opc
= get_memop(oi
);
1727 tcg_insn_unit
**label_ptr
= &l
->label_ptr
[0];
1729 /* resolve label address */
1730 tcg_patch32(label_ptr
[0], s
->code_ptr
- label_ptr
[0] - 4);
1731 if (TARGET_LONG_BITS
> TCG_TARGET_REG_BITS
) {
1732 tcg_patch32(label_ptr
[1], s
->code_ptr
- label_ptr
[1] - 4);
1735 if (TCG_TARGET_REG_BITS
== 32) {
1738 tcg_out_st(s
, TCG_TYPE_PTR
, TCG_AREG0
, TCG_REG_ESP
, ofs
);
1741 tcg_out_st(s
, TCG_TYPE_I32
, l
->addrlo_reg
, TCG_REG_ESP
, ofs
);
1744 if (TARGET_LONG_BITS
== 64) {
1745 tcg_out_st(s
, TCG_TYPE_I32
, l
->addrhi_reg
, TCG_REG_ESP
, ofs
);
1749 tcg_out_sti(s
, TCG_TYPE_I32
, oi
, TCG_REG_ESP
, ofs
);
1752 tcg_out_sti(s
, TCG_TYPE_PTR
, (uintptr_t)l
->raddr
, TCG_REG_ESP
, ofs
);
1754 tcg_out_mov(s
, TCG_TYPE_PTR
, tcg_target_call_iarg_regs
[0], TCG_AREG0
);
1755 /* The second argument is already loaded with addrlo. */
1756 tcg_out_movi(s
, TCG_TYPE_I32
, tcg_target_call_iarg_regs
[2], oi
);
1757 tcg_out_movi(s
, TCG_TYPE_PTR
, tcg_target_call_iarg_regs
[3],
1758 (uintptr_t)l
->raddr
);
1761 tcg_out_call(s
, qemu_ld_helpers
[opc
& (MO_BSWAP
| MO_SIZE
)]);
1763 data_reg
= l
->datalo_reg
;
1764 switch (opc
& MO_SSIZE
) {
1766 tcg_out_ext8s(s
, data_reg
, TCG_REG_EAX
, P_REXW
);
1769 tcg_out_ext16s(s
, data_reg
, TCG_REG_EAX
, P_REXW
);
1771 #if TCG_TARGET_REG_BITS == 64
1773 tcg_out_ext32s(s
, data_reg
, TCG_REG_EAX
);
1778 /* Note that the helpers have zero-extended to tcg_target_long. */
1780 tcg_out_mov(s
, TCG_TYPE_I32
, data_reg
, TCG_REG_EAX
);
1783 if (TCG_TARGET_REG_BITS
== 64) {
1784 tcg_out_mov(s
, TCG_TYPE_I64
, data_reg
, TCG_REG_RAX
);
1785 } else if (data_reg
== TCG_REG_EDX
) {
1786 /* xchg %edx, %eax */
1787 tcg_out_opc(s
, OPC_XCHG_ax_r32
+ TCG_REG_EDX
, 0, 0, 0);
1788 tcg_out_mov(s
, TCG_TYPE_I32
, l
->datahi_reg
, TCG_REG_EAX
);
1790 tcg_out_mov(s
, TCG_TYPE_I32
, data_reg
, TCG_REG_EAX
);
1791 tcg_out_mov(s
, TCG_TYPE_I32
, l
->datahi_reg
, TCG_REG_EDX
);
1798 /* Jump to the code corresponding to next IR of qemu_st */
1799 tcg_out_jmp(s
, l
->raddr
);
1803 * Generate code for the slow path for a store at the end of block
1805 static void tcg_out_qemu_st_slow_path(TCGContext
*s
, TCGLabelQemuLdst
*l
)
1807 TCGMemOpIdx oi
= l
->oi
;
1808 TCGMemOp opc
= get_memop(oi
);
1809 TCGMemOp s_bits
= opc
& MO_SIZE
;
1810 tcg_insn_unit
**label_ptr
= &l
->label_ptr
[0];
1813 /* resolve label address */
1814 tcg_patch32(label_ptr
[0], s
->code_ptr
- label_ptr
[0] - 4);
1815 if (TARGET_LONG_BITS
> TCG_TARGET_REG_BITS
) {
1816 tcg_patch32(label_ptr
[1], s
->code_ptr
- label_ptr
[1] - 4);
1819 if (TCG_TARGET_REG_BITS
== 32) {
1822 tcg_out_st(s
, TCG_TYPE_PTR
, TCG_AREG0
, TCG_REG_ESP
, ofs
);
1825 tcg_out_st(s
, TCG_TYPE_I32
, l
->addrlo_reg
, TCG_REG_ESP
, ofs
);
1828 if (TARGET_LONG_BITS
== 64) {
1829 tcg_out_st(s
, TCG_TYPE_I32
, l
->addrhi_reg
, TCG_REG_ESP
, ofs
);
1833 tcg_out_st(s
, TCG_TYPE_I32
, l
->datalo_reg
, TCG_REG_ESP
, ofs
);
1836 if (s_bits
== MO_64
) {
1837 tcg_out_st(s
, TCG_TYPE_I32
, l
->datahi_reg
, TCG_REG_ESP
, ofs
);
1841 tcg_out_sti(s
, TCG_TYPE_I32
, oi
, TCG_REG_ESP
, ofs
);
1844 retaddr
= TCG_REG_EAX
;
1845 tcg_out_movi(s
, TCG_TYPE_PTR
, retaddr
, (uintptr_t)l
->raddr
);
1846 tcg_out_st(s
, TCG_TYPE_PTR
, retaddr
, TCG_REG_ESP
, ofs
);
1848 tcg_out_mov(s
, TCG_TYPE_PTR
, tcg_target_call_iarg_regs
[0], TCG_AREG0
);
1849 /* The second argument is already loaded with addrlo. */
1850 tcg_out_mov(s
, (s_bits
== MO_64
? TCG_TYPE_I64
: TCG_TYPE_I32
),
1851 tcg_target_call_iarg_regs
[2], l
->datalo_reg
);
1852 tcg_out_movi(s
, TCG_TYPE_I32
, tcg_target_call_iarg_regs
[3], oi
);
1854 if (ARRAY_SIZE(tcg_target_call_iarg_regs
) > 4) {
1855 retaddr
= tcg_target_call_iarg_regs
[4];
1856 tcg_out_movi(s
, TCG_TYPE_PTR
, retaddr
, (uintptr_t)l
->raddr
);
1858 retaddr
= TCG_REG_RAX
;
1859 tcg_out_movi(s
, TCG_TYPE_PTR
, retaddr
, (uintptr_t)l
->raddr
);
1860 tcg_out_st(s
, TCG_TYPE_PTR
, retaddr
, TCG_REG_ESP
,
1861 TCG_TARGET_CALL_STACK_OFFSET
);
1865 /* "Tail call" to the helper, with the return address back inline. */
1866 tcg_out_push(s
, retaddr
);
1867 tcg_out_jmp(s
, qemu_st_helpers
[opc
& (MO_BSWAP
| MO_SIZE
)]);
1869 #elif defined(__x86_64__) && defined(__linux__)
1870 # include <asm/prctl.h>
1871 # include <sys/prctl.h>
1873 int arch_prctl(int code
, unsigned long addr
);
1875 static int guest_base_flags
;
1876 static inline void setup_guest_base_seg(void)
1878 if (arch_prctl(ARCH_SET_GS
, guest_base
) == 0) {
1879 guest_base_flags
= P_GS
;
1883 # define guest_base_flags 0
1884 static inline void setup_guest_base_seg(void) { }
1885 #endif /* SOFTMMU */
1887 static void tcg_out_qemu_ld_direct(TCGContext
*s
, TCGReg datalo
, TCGReg datahi
,
1888 TCGReg base
, int index
, intptr_t ofs
,
1889 int seg
, TCGMemOp memop
)
1891 const TCGMemOp real_bswap
= memop
& MO_BSWAP
;
1892 TCGMemOp bswap
= real_bswap
;
1893 int movop
= OPC_MOVL_GvEv
;
1895 if (have_movbe
&& real_bswap
) {
1897 movop
= OPC_MOVBE_GyMy
;
1900 switch (memop
& MO_SSIZE
) {
1902 tcg_out_modrm_sib_offset(s
, OPC_MOVZBL
+ seg
, datalo
,
1903 base
, index
, 0, ofs
);
1906 tcg_out_modrm_sib_offset(s
, OPC_MOVSBL
+ P_REXW
+ seg
, datalo
,
1907 base
, index
, 0, ofs
);
1910 tcg_out_modrm_sib_offset(s
, OPC_MOVZWL
+ seg
, datalo
,
1911 base
, index
, 0, ofs
);
1913 tcg_out_rolw_8(s
, datalo
);
1919 tcg_out_modrm_sib_offset(s
, OPC_MOVBE_GyMy
+ P_DATA16
+ seg
,
1920 datalo
, base
, index
, 0, ofs
);
1922 tcg_out_modrm_sib_offset(s
, OPC_MOVZWL
+ seg
, datalo
,
1923 base
, index
, 0, ofs
);
1924 tcg_out_rolw_8(s
, datalo
);
1926 tcg_out_modrm(s
, OPC_MOVSWL
+ P_REXW
, datalo
, datalo
);
1928 tcg_out_modrm_sib_offset(s
, OPC_MOVSWL
+ P_REXW
+ seg
,
1929 datalo
, base
, index
, 0, ofs
);
1933 tcg_out_modrm_sib_offset(s
, movop
+ seg
, datalo
, base
, index
, 0, ofs
);
1935 tcg_out_bswap32(s
, datalo
);
1938 #if TCG_TARGET_REG_BITS == 64
1941 tcg_out_modrm_sib_offset(s
, movop
+ seg
, datalo
,
1942 base
, index
, 0, ofs
);
1944 tcg_out_bswap32(s
, datalo
);
1946 tcg_out_ext32s(s
, datalo
, datalo
);
1948 tcg_out_modrm_sib_offset(s
, OPC_MOVSLQ
+ seg
, datalo
,
1949 base
, index
, 0, ofs
);
1954 if (TCG_TARGET_REG_BITS
== 64) {
1955 tcg_out_modrm_sib_offset(s
, movop
+ P_REXW
+ seg
, datalo
,
1956 base
, index
, 0, ofs
);
1958 tcg_out_bswap64(s
, datalo
);
1966 if (base
!= datalo
) {
1967 tcg_out_modrm_sib_offset(s
, movop
+ seg
, datalo
,
1968 base
, index
, 0, ofs
);
1969 tcg_out_modrm_sib_offset(s
, movop
+ seg
, datahi
,
1970 base
, index
, 0, ofs
+ 4);
1972 tcg_out_modrm_sib_offset(s
, movop
+ seg
, datahi
,
1973 base
, index
, 0, ofs
+ 4);
1974 tcg_out_modrm_sib_offset(s
, movop
+ seg
, datalo
,
1975 base
, index
, 0, ofs
);
1978 tcg_out_bswap32(s
, datalo
);
1979 tcg_out_bswap32(s
, datahi
);
/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
   EAX.  It will be useful once fixed registers globals are less
   common.  */
1991 static void tcg_out_qemu_ld(TCGContext
*s
, const TCGArg
*args
, bool is64
)
1993 TCGReg datalo
, datahi
, addrlo
;
1994 TCGReg addrhi
__attribute__((unused
));
1997 #if defined(CONFIG_SOFTMMU)
1999 tcg_insn_unit
*label_ptr
[2];
2003 datahi
= (TCG_TARGET_REG_BITS
== 32 && is64
? *args
++ : 0);
2005 addrhi
= (TARGET_LONG_BITS
> TCG_TARGET_REG_BITS
? *args
++ : 0);
2007 opc
= get_memop(oi
);
2009 #if defined(CONFIG_SOFTMMU)
2010 mem_index
= get_mmuidx(oi
);
2012 tcg_out_tlb_load(s
, addrlo
, addrhi
, mem_index
, opc
,
2013 label_ptr
, offsetof(CPUTLBEntry
, addr_read
));
2016 tcg_out_qemu_ld_direct(s
, datalo
, datahi
, TCG_REG_L1
, -1, 0, 0, opc
);
2018 /* Record the current context of a load into ldst label */
2019 add_qemu_ldst_label(s
, true, oi
, datalo
, datahi
, addrlo
, addrhi
,
2020 s
->code_ptr
, label_ptr
);
2023 int32_t offset
= guest_base
;
2024 TCGReg base
= addrlo
;
        /* For a 32-bit guest, the high 32 bits may contain garbage.
           We can ignore them with the ADDR32 prefix if we're not using
           a guest base, or when using segmentation.  Otherwise we
           need to zero-extend manually.  */
2032 if (guest_base
== 0 || guest_base_flags
) {
2033 seg
= guest_base_flags
;
2035 if (TCG_TARGET_REG_BITS
> TARGET_LONG_BITS
) {
2038 } else if (TCG_TARGET_REG_BITS
== 64) {
2039 if (TARGET_LONG_BITS
== 32) {
2040 tcg_out_ext32u(s
, TCG_REG_L0
, base
);
2043 if (offset
!= guest_base
) {
2044 tcg_out_movi(s
, TCG_TYPE_I64
, TCG_REG_L1
, guest_base
);
2050 tcg_out_qemu_ld_direct(s
, datalo
, datahi
,
2051 base
, index
, offset
, seg
, opc
);
2056 static void tcg_out_qemu_st_direct(TCGContext
*s
, TCGReg datalo
, TCGReg datahi
,
2057 TCGReg base
, intptr_t ofs
, int seg
,
    /* ??? Ideally we wouldn't need a scratch register.  For user-only,
       we could perform the bswap twice to restore the original value
       instead of moving to the scratch.  But as it is, the L constraint
       means that TCG_REG_L0 is definitely free here.  */
2064 const TCGReg scratch
= TCG_REG_L0
;
2065 const TCGMemOp real_bswap
= memop
& MO_BSWAP
;
2066 TCGMemOp bswap
= real_bswap
;
2067 int movop
= OPC_MOVL_EvGv
;
2069 if (have_movbe
&& real_bswap
) {
2071 movop
= OPC_MOVBE_MyGy
;
2074 switch (memop
& MO_SIZE
) {
2076 /* In 32-bit mode, 8-bit stores can only happen from [abcd]x.
2077 Use the scratch register if necessary. */
2078 if (TCG_TARGET_REG_BITS
== 32 && datalo
>= 4) {
2079 tcg_out_mov(s
, TCG_TYPE_I32
, scratch
, datalo
);
2082 tcg_out_modrm_offset(s
, OPC_MOVB_EvGv
+ P_REXB_R
+ seg
,
2087 tcg_out_mov(s
, TCG_TYPE_I32
, scratch
, datalo
);
2088 tcg_out_rolw_8(s
, scratch
);
2091 tcg_out_modrm_offset(s
, movop
+ P_DATA16
+ seg
, datalo
, base
, ofs
);
2095 tcg_out_mov(s
, TCG_TYPE_I32
, scratch
, datalo
);
2096 tcg_out_bswap32(s
, scratch
);
2099 tcg_out_modrm_offset(s
, movop
+ seg
, datalo
, base
, ofs
);
2102 if (TCG_TARGET_REG_BITS
== 64) {
2104 tcg_out_mov(s
, TCG_TYPE_I64
, scratch
, datalo
);
2105 tcg_out_bswap64(s
, scratch
);
2108 tcg_out_modrm_offset(s
, movop
+ P_REXW
+ seg
, datalo
, base
, ofs
);
2110 tcg_out_mov(s
, TCG_TYPE_I32
, scratch
, datahi
);
2111 tcg_out_bswap32(s
, scratch
);
2112 tcg_out_modrm_offset(s
, OPC_MOVL_EvGv
+ seg
, scratch
, base
, ofs
);
2113 tcg_out_mov(s
, TCG_TYPE_I32
, scratch
, datalo
);
2114 tcg_out_bswap32(s
, scratch
);
2115 tcg_out_modrm_offset(s
, OPC_MOVL_EvGv
+ seg
, scratch
, base
, ofs
+4);
2122 tcg_out_modrm_offset(s
, movop
+ seg
, datalo
, base
, ofs
);
2123 tcg_out_modrm_offset(s
, movop
+ seg
, datahi
, base
, ofs
+4);
2131 static void tcg_out_qemu_st(TCGContext
*s
, const TCGArg
*args
, bool is64
)
2133 TCGReg datalo
, datahi
, addrlo
;
2134 TCGReg addrhi
__attribute__((unused
));
2137 #if defined(CONFIG_SOFTMMU)
2139 tcg_insn_unit
*label_ptr
[2];
2143 datahi
= (TCG_TARGET_REG_BITS
== 32 && is64
? *args
++ : 0);
2145 addrhi
= (TARGET_LONG_BITS
> TCG_TARGET_REG_BITS
? *args
++ : 0);
2147 opc
= get_memop(oi
);
2149 #if defined(CONFIG_SOFTMMU)
2150 mem_index
= get_mmuidx(oi
);
2152 tcg_out_tlb_load(s
, addrlo
, addrhi
, mem_index
, opc
,
2153 label_ptr
, offsetof(CPUTLBEntry
, addr_write
));
2156 tcg_out_qemu_st_direct(s
, datalo
, datahi
, TCG_REG_L1
, 0, 0, opc
);
2158 /* Record the current context of a store into ldst label */
2159 add_qemu_ldst_label(s
, false, oi
, datalo
, datahi
, addrlo
, addrhi
,
2160 s
->code_ptr
, label_ptr
);
2163 int32_t offset
= guest_base
;
2164 TCGReg base
= addrlo
;
2167 /* See comment in tcg_out_qemu_ld re zero-extension of addrlo. */
2168 if (guest_base
== 0 || guest_base_flags
) {
2169 seg
= guest_base_flags
;
2171 if (TCG_TARGET_REG_BITS
> TARGET_LONG_BITS
) {
2174 } else if (TCG_TARGET_REG_BITS
== 64) {
2175 /* ??? Note that we can't use the same SIB addressing scheme
2176 as for loads, since we require L0 free for bswap. */
2177 if (offset
!= guest_base
) {
2178 if (TARGET_LONG_BITS
== 32) {
2179 tcg_out_ext32u(s
, TCG_REG_L0
, base
);
2182 tcg_out_movi(s
, TCG_TYPE_I64
, TCG_REG_L1
, guest_base
);
2183 tgen_arithr(s
, ARITH_ADD
+ P_REXW
, TCG_REG_L1
, base
);
2186 } else if (TARGET_LONG_BITS
== 32) {
2187 tcg_out_ext32u(s
, TCG_REG_L1
, base
);
2192 tcg_out_qemu_st_direct(s
, datalo
, datahi
, base
, offset
, seg
, opc
);
2197 static inline void tcg_out_op(TCGContext
*s
, TCGOpcode opc
,
2198 const TCGArg
*args
, const int *const_args
)
2201 int c
, const_a2
, vexop
, rexw
= 0;
2203 #if TCG_TARGET_REG_BITS == 64
2204 # define OP_32_64(x) \
2205 case glue(glue(INDEX_op_, x), _i64): \
2206 rexw = P_REXW; /* FALLTHRU */ \
2207 case glue(glue(INDEX_op_, x), _i32)
2209 # define OP_32_64(x) \
2210 case glue(glue(INDEX_op_, x), _i32)
2213 /* Hoist the loads of the most common arguments. */
2217 const_a2
= const_args
[2];
2220 case INDEX_op_exit_tb
:
2221 /* Reuse the zeroing that exists for goto_ptr. */
2223 tcg_out_jmp(s
, s
->code_gen_epilogue
);
2225 tcg_out_movi(s
, TCG_TYPE_PTR
, TCG_REG_EAX
, a0
);
2226 tcg_out_jmp(s
, tb_ret_addr
);
    case INDEX_op_goto_tb:
        if (s->tb_jmp_insn_offset) {
            /* direct jump method */
            int gap;
            /* jump displacement must be aligned for atomic patching;
             * see if we need to add extra nops before jump
             */
            gap = tcg_pcrel_diff(s, QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4));
            if (gap != 1) {
                tcg_out_nopn(s, gap - 1);
            }
            tcg_out8(s, OPC_JMP_long); /* jmp im */
            s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
            tcg_out32(s, 0);
        } else {
            /* indirect jump method */
            tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
                                 (intptr_t)(s->tb_jmp_target_addr + a0));
        }
        set_jmp_reset_offset(s, a0);
        break;
    case INDEX_op_goto_ptr:
        /* jmp to the given host address (could be epilogue) */
        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
        break;
    case INDEX_op_br:
        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
        break;
    OP_32_64(ld8u):
        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
        break;
    OP_32_64(ld8s):
        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
        break;
    OP_32_64(ld16u):
        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
        break;
    OP_32_64(ld16s):
        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
        break;
#if TCG_TARGET_REG_BITS == 64
    case INDEX_op_ld32u_i64:
#endif
    case INDEX_op_ld_i32:
        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
        break;

    OP_32_64(st8):
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
            tcg_out8(s, a0);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
        }
        break;
    OP_32_64(st16):
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
            tcg_out16(s, a0);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
        }
        break;
#if TCG_TARGET_REG_BITS == 64
    case INDEX_op_st32_i64:
#endif
    case INDEX_op_st_i32:
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
            tcg_out32(s, a0);
        } else {
            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
        }
        break;
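    /*
     * The add case below uses LEA whenever the destination differs from the
     * first source or the addend is constant, e.g. (illustrative)
     * add_i32 d, s, 16 becomes "lea 16(%s), %d" and add_i32 d, s1, s2
     * becomes "lea (%s1,%s2), %d", leaving both sources intact.
     */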
    OP_32_64(add):
        /* For 3-operand addition, use LEA.  */
        if (const_a2 || a0 != a1) {
            TCGArg c3 = 0;
            if (const_a2) {
                c3 = a2, a2 = -1;
            } else if (a0 == a2) {
                /* Watch out for dest = src + dest, since we've removed
                   the matching constraint on the add.  */
                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
                break;
            }

            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
            break;
        }
        c = ARITH_ADD;
        goto gen_arith;
    OP_32_64(sub):
        c = ARITH_SUB;
        goto gen_arith;
    OP_32_64(and):
        c = ARITH_AND;
        goto gen_arith;
    OP_32_64(or):
        c = ARITH_OR;
        goto gen_arith;
    OP_32_64(xor):
        c = ARITH_XOR;
        goto gen_arith;
    gen_arith:
        if (const_a2) {
            tgen_arithi(s, c + rexw, a0, a2, 0);
        } else {
            tgen_arithr(s, c + rexw, a0, a2);
        }
        break;

    OP_32_64(andc):
        if (const_a2) {
            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
        } else {
            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
        }
        break;

    OP_32_64(mul):
        if (const_a2) {
            int32_t val = a2;
            if (val == (int8_t)val) {
                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
                tcg_out8(s, val);
            } else {
                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
                tcg_out32(s, val);
            }
        } else {
            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
        }
        break;

    OP_32_64(div2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
        break;
    OP_32_64(divu2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
        break;
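    /*
     * Small constant shifts of 1..3 below are also turned into LEA, using
     * the SIB scale field: e.g. (illustrative) shl_i32 d, s, 3 becomes
     * "lea 0(,%s,8), %d", which again keeps the source register live.
     * Variable shifts prefer the BMI2 SHLX/SHRX/SARX forms when available,
     * since those take the count in any register rather than only CL.
     */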
    OP_32_64(shl):
        /* For small constant 3-operand shift, use LEA.  */
        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
            if (a2 - 1 == 0) {
                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
            } else {
                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
            }
            break;
        }
        c = SHIFT_SHL;
        vexop = OPC_SHLX;
        goto gen_shift_maybe_vex;
    OP_32_64(shr):
        c = SHIFT_SHR;
        vexop = OPC_SHRX;
        goto gen_shift_maybe_vex;
    OP_32_64(sar):
        c = SHIFT_SAR;
        vexop = OPC_SARX;
        goto gen_shift_maybe_vex;
    OP_32_64(rotl):
        c = SHIFT_ROL;
        goto gen_shift;
    OP_32_64(rotr):
        c = SHIFT_ROR;
        goto gen_shift;
    gen_shift_maybe_vex:
        if (have_bmi2) {
            if (!const_a2) {
                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
                break;
            }
            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
        }
        /* FALLTHRU */
    gen_shift:
        if (const_a2) {
            tcg_out_shifti(s, c + rexw, a0, a2);
        } else {
            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
        }
        break;
    OP_32_64(ctz):
        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
        break;
    OP_32_64(clz):
        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
        break;
    OP_32_64(ctpop):
        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
        break;

    case INDEX_op_brcond_i32:
        tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
        break;
    case INDEX_op_setcond_i32:
        tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
        break;
    case INDEX_op_movcond_i32:
        tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
        break;

    OP_32_64(bswap16):
        tcg_out_rolw_8(s, a0);
        break;
    OP_32_64(bswap32):
        tcg_out_bswap32(s, a0);
        break;

    OP_32_64(neg):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
        break;
    OP_32_64(not):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
        break;

    OP_32_64(ext8s):
        tcg_out_ext8s(s, a0, a1, rexw);
        break;
    OP_32_64(ext16s):
        tcg_out_ext16s(s, a0, a1, rexw);
        break;
    OP_32_64(ext8u):
        tcg_out_ext8u(s, a0, a1);
        break;
    OP_32_64(ext16u):
        tcg_out_ext16u(s, a0, a1);
        break;

    case INDEX_op_qemu_ld_i32:
        tcg_out_qemu_ld(s, args, 0);
        break;
    case INDEX_op_qemu_ld_i64:
        tcg_out_qemu_ld(s, args, 1);
        break;
    case INDEX_op_qemu_st_i32:
        tcg_out_qemu_st(s, args, 0);
        break;
    case INDEX_op_qemu_st_i64:
        tcg_out_qemu_st(s, args, 1);
        break;
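    /*
     * The mulu2/muls2 cases below rely on the one-operand MUL/IMUL forms,
     * which implicitly multiply by EAX/RAX and leave the double-width
     * product in EDX:EAX (RDX:RAX); this is what the "a"/"d" operand
     * constraints declared in tcg_target_op_def() further down express.
     */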
    OP_32_64(mulu2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
        break;
    OP_32_64(muls2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
        break;
    OP_32_64(add2):
        if (const_args[4]) {
            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
        } else {
            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
        }
        if (const_args[5]) {
            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
        } else {
            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
        }
        break;
    OP_32_64(sub2):
        if (const_args[4]) {
            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
        } else {
            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
        }
        if (const_args[5]) {
            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
        } else {
            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
        }
        break;

#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_brcond2_i32:
        tcg_out_brcond2(s, args, const_args, 0);
        break;
    case INDEX_op_setcond2_i32:
        tcg_out_setcond2(s, args, const_args);
        break;
#else /* TCG_TARGET_REG_BITS == 64 */
    case INDEX_op_ld32s_i64:
        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
        break;
    case INDEX_op_ld_i64:
        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
        break;
    case INDEX_op_st_i64:
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
            tcg_out32(s, a0);
        } else {
            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
        }
        break;

    case INDEX_op_brcond_i64:
        tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
        break;
    case INDEX_op_setcond_i64:
        tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
        break;
    case INDEX_op_movcond_i64:
        tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
        break;

    case INDEX_op_bswap64_i64:
        tcg_out_bswap64(s, a0);
        break;
    case INDEX_op_extu_i32_i64:
    case INDEX_op_ext32u_i64:
        tcg_out_ext32u(s, a0, a1);
        break;
    case INDEX_op_ext_i32_i64:
    case INDEX_op_ext32s_i64:
        tcg_out_ext32s(s, a0, a1);
        break;
#endif

    OP_32_64(deposit):
        if (args[3] == 0 && args[4] == 8) {
            /* load bits 0..7 */
            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
        } else if (args[3] == 8 && args[4] == 8) {
            /* load bits 8..15 */
            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
        } else if (args[3] == 0 && args[4] == 16) {
            /* load bits 0..15 */
            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
        } else {
            tcg_abort();
        }
        break;

    case INDEX_op_extract_i64:
        if (a2 + args[3] == 32) {
            /* This is a 32-bit zero-extending right shift.  */
            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
            break;
        }
        /* FALLTHRU */
    case INDEX_op_extract_i32:
        /* On the off-chance that we can use the high-byte registers.
           Otherwise we emit the same ext16 + shift pattern that we
           would have gotten from the normal tcg-op.c expansion.  */
        tcg_debug_assert(a2 == 8 && args[3] == 8);
        if (a1 < 4 && a0 < 8) {
            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
        } else {
            tcg_out_ext16u(s, a0, a1);
            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
        }
        break;

    case INDEX_op_sextract_i32:
        /* We don't implement sextract_i64, as we cannot sign-extend to
           64-bits without using the REX prefix that explicitly excludes
           access to the high-byte registers.  */
        tcg_debug_assert(a2 == 8 && args[3] == 8);
        if (a1 < 4 && a0 < 8) {
            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
        } else {
            tcg_out_ext16s(s, a0, a1, 0);
            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
        }
        break;

    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
    case INDEX_op_mov_i64:
    case INDEX_op_mov_vec:
    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
    case INDEX_op_movi_i64:
    case INDEX_op_dupi_vec:
    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
    default:
        tcg_abort();
    }

#undef OP_32_64
}
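/*
 * In tcg_out_vec_op below, each per-element-size opcode table is indexed by
 * vece (MO_8, MO_16, MO_32, MO_64); slots that have no SSE/AVX encoding for
 * that element size hold OPC_UD2 so that the tcg_debug_assert in the shared
 * gen_simd path catches any attempt to emit them.
 */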
static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
                           unsigned vecl, unsigned vece,
                           const TCGArg *args, const int *const_args)
{
    static int const add_insn[4] = {
        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
    };
    static int const sub_insn[4] = {
        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
    };
    static int const mul_insn[4] = {
        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
    };
    static int const shift_imm_insn[4] = {
        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
    };
    static int const cmpeq_insn[4] = {
        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
    };
    static int const cmpgt_insn[4] = {
        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
    };
    static int const punpckl_insn[4] = {
        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
    };
    static int const punpckh_insn[4] = {
        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
    };
    static int const packss_insn[4] = {
        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
    };
    static int const packus_insn[4] = {
        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
    };

    TCGType type = vecl + TCG_TYPE_V64;
    int insn, sub;
    TCGArg a0, a1, a2;

    a0 = args[0];
    a1 = args[1];
    a2 = args[2];

    switch (opc) {
    case INDEX_op_add_vec:
        insn = add_insn[vece];
        goto gen_simd;
    case INDEX_op_sub_vec:
        insn = sub_insn[vece];
        goto gen_simd;
    case INDEX_op_mul_vec:
        insn = mul_insn[vece];
        goto gen_simd;
    case INDEX_op_and_vec:
        insn = OPC_PAND;
        goto gen_simd;
    case INDEX_op_or_vec:
        insn = OPC_POR;
        goto gen_simd;
    case INDEX_op_xor_vec:
        insn = OPC_PXOR;
        goto gen_simd;
    case INDEX_op_x86_punpckl_vec:
        insn = punpckl_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_punpckh_vec:
        insn = punpckh_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_packss_vec:
        insn = packss_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_packus_vec:
        insn = packus_insn[vece];
        goto gen_simd;
#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_dup2_vec:
        /* Constraints have already placed both 32-bit inputs in xmm regs. */
        insn = OPC_PUNPCKLDQ;
        goto gen_simd;
#endif
    gen_simd:
        tcg_debug_assert(insn != OPC_UD2);
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, a0, a1, a2);
        break;

    case INDEX_op_cmp_vec:
        sub = args[3];
        if (sub == TCG_COND_EQ) {
            insn = cmpeq_insn[vece];
        } else if (sub == TCG_COND_GT) {
            insn = cmpgt_insn[vece];
        } else {
            g_assert_not_reached();
        }
        goto gen_simd;

    case INDEX_op_andc_vec:
        insn = OPC_PANDN;
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, a0, a2, a1);
        break;

    case INDEX_op_shli_vec:
        sub = 6;
        goto gen_shift;
    case INDEX_op_shri_vec:
        sub = 2;
        goto gen_shift;
    case INDEX_op_sari_vec:
        tcg_debug_assert(vece != MO_64);
        sub = 4;
    gen_shift:
        tcg_debug_assert(vece != MO_8);
        insn = shift_imm_insn[vece];
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, sub, a0, a1);
        tcg_out8(s, a2);
        break;

    case INDEX_op_ld_vec:
        tcg_out_ld(s, type, a0, a1, a2);
        break;
    case INDEX_op_st_vec:
        tcg_out_st(s, type, a0, a1, a2);
        break;
    case INDEX_op_dup_vec:
        tcg_out_dup_vec(s, type, vece, a0, a1);
        break;

    case INDEX_op_x86_shufps_vec:
        insn = OPC_SHUFPS;
        sub = args[3];
        goto gen_simd_imm8;
    case INDEX_op_x86_blend_vec:
        if (vece == MO_16) {
            insn = OPC_PBLENDW;
        } else if (vece == MO_32) {
            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
        } else {
            g_assert_not_reached();
        }
        sub = args[3];
        goto gen_simd_imm8;
    case INDEX_op_x86_vperm2i128_vec:
        insn = OPC_VPERM2I128;
        sub = args[3];
        goto gen_simd_imm8;
    gen_simd_imm8:
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, a0, a1, a2);
        tcg_out8(s, sub);
        break;

    case INDEX_op_x86_vpblendvb_vec:
        insn = OPC_VPBLENDVB;
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, a0, a1, a2);
        tcg_out8(s, args[3] << 4);
        break;

    case INDEX_op_x86_psrldq_vec:
        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
        tcg_out8(s, a2);
        break;

    default:
        g_assert_not_reached();
    }
}
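/*
 * tcg_target_op_def below returns, for each opcode, one constraint string
 * per operand (outputs first).  A digit such as "0" or "1" ties an operand
 * to the register of the like-numbered operand, "r" is a general register,
 * "x" a vector register, and strings with several letters ("re", "ri",
 * "qi", ...) accept any of the alternatives; the individual letters are
 * decoded by target_parse_constraint() earlier in this file.
 */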
static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
{
    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
    static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
    static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
    static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
    static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
    static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
    static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
    static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
    static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
    static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
    static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
    static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
    static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
    static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
    static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
    static const TCGTargetOpDef r_r_L_L
        = { .args_ct_str = { "r", "r", "L", "L" } };
    static const TCGTargetOpDef L_L_L_L
        = { .args_ct_str = { "L", "L", "L", "L" } };
    static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
    static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
    static const TCGTargetOpDef x_x_x_x
        = { .args_ct_str = { "x", "x", "x", "x" } };
    static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };

    switch (op) {
    case INDEX_op_goto_ptr:
        return &r;

    case INDEX_op_ld8u_i32:
    case INDEX_op_ld8u_i64:
    case INDEX_op_ld8s_i32:
    case INDEX_op_ld8s_i64:
    case INDEX_op_ld16u_i32:
    case INDEX_op_ld16u_i64:
    case INDEX_op_ld16s_i32:
    case INDEX_op_ld16s_i64:
    case INDEX_op_ld_i32:
    case INDEX_op_ld32u_i64:
    case INDEX_op_ld32s_i64:
    case INDEX_op_ld_i64:
        return &r_r;

    case INDEX_op_st8_i32:
    case INDEX_op_st8_i64:
        return &qi_r;
    case INDEX_op_st16_i32:
    case INDEX_op_st16_i64:
    case INDEX_op_st_i32:
    case INDEX_op_st32_i64:
        return &ri_r;
    case INDEX_op_st_i64:
        return &re_r;

    case INDEX_op_add_i32:
    case INDEX_op_add_i64:
        return &r_r_re;
    case INDEX_op_sub_i32:
    case INDEX_op_sub_i64:
    case INDEX_op_mul_i32:
    case INDEX_op_mul_i64:
    case INDEX_op_or_i32:
    case INDEX_op_or_i64:
    case INDEX_op_xor_i32:
    case INDEX_op_xor_i64:
        return &r_0_re;

    case INDEX_op_and_i32:
    case INDEX_op_and_i64:
        {
            static const TCGTargetOpDef and
                = { .args_ct_str = { "r", "0", "reZ" } };
            return &and;
        }
    case INDEX_op_andc_i32:
    case INDEX_op_andc_i64:
        {
            static const TCGTargetOpDef andc
                = { .args_ct_str = { "r", "r", "rI" } };
            return &andc;
        }

    case INDEX_op_shl_i32:
    case INDEX_op_shl_i64:
    case INDEX_op_shr_i32:
    case INDEX_op_shr_i64:
    case INDEX_op_sar_i32:
    case INDEX_op_sar_i64:
        return have_bmi2 ? &r_r_ri : &r_0_ci;
    case INDEX_op_rotl_i32:
    case INDEX_op_rotl_i64:
    case INDEX_op_rotr_i32:
    case INDEX_op_rotr_i64:
        return &r_0_ci;

    case INDEX_op_brcond_i32:
    case INDEX_op_brcond_i64:
        return &r_re;

    case INDEX_op_bswap16_i32:
    case INDEX_op_bswap16_i64:
    case INDEX_op_bswap32_i32:
    case INDEX_op_bswap32_i64:
    case INDEX_op_bswap64_i64:
    case INDEX_op_neg_i32:
    case INDEX_op_neg_i64:
    case INDEX_op_not_i32:
    case INDEX_op_not_i64:
        return &r_0;

    case INDEX_op_ext8s_i32:
    case INDEX_op_ext8s_i64:
    case INDEX_op_ext8u_i32:
    case INDEX_op_ext8u_i64:
        return &r_q;
    case INDEX_op_ext16s_i32:
    case INDEX_op_ext16s_i64:
    case INDEX_op_ext16u_i32:
    case INDEX_op_ext16u_i64:
    case INDEX_op_ext32s_i64:
    case INDEX_op_ext32u_i64:
    case INDEX_op_ext_i32_i64:
    case INDEX_op_extu_i32_i64:
    case INDEX_op_extract_i32:
    case INDEX_op_extract_i64:
    case INDEX_op_sextract_i32:
    case INDEX_op_ctpop_i32:
    case INDEX_op_ctpop_i64:
        return &r_r;

    case INDEX_op_deposit_i32:
    case INDEX_op_deposit_i64:
        {
            static const TCGTargetOpDef dep
                = { .args_ct_str = { "Q", "0", "Q" } };
            return &dep;
        }
    case INDEX_op_setcond_i32:
    case INDEX_op_setcond_i64:
        {
            static const TCGTargetOpDef setc
                = { .args_ct_str = { "q", "r", "re" } };
            return &setc;
        }
    case INDEX_op_movcond_i32:
    case INDEX_op_movcond_i64:
        {
            static const TCGTargetOpDef movc
                = { .args_ct_str = { "r", "r", "re", "r", "0" } };
            return &movc;
        }
    case INDEX_op_div2_i32:
    case INDEX_op_div2_i64:
    case INDEX_op_divu2_i32:
    case INDEX_op_divu2_i64:
        {
            static const TCGTargetOpDef div2
                = { .args_ct_str = { "a", "d", "0", "1", "r" } };
            return &div2;
        }
    case INDEX_op_mulu2_i32:
    case INDEX_op_mulu2_i64:
    case INDEX_op_muls2_i32:
    case INDEX_op_muls2_i64:
        {
            static const TCGTargetOpDef mul2
                = { .args_ct_str = { "a", "d", "a", "r" } };
            return &mul2;
        }
    case INDEX_op_add2_i32:
    case INDEX_op_add2_i64:
    case INDEX_op_sub2_i32:
    case INDEX_op_sub2_i64:
        {
            static const TCGTargetOpDef arith2
                = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
            return &arith2;
        }
    case INDEX_op_ctz_i32:
    case INDEX_op_ctz_i64:
        {
            static const TCGTargetOpDef ctz[2] = {
                { .args_ct_str = { "&r", "r", "r" } },
                { .args_ct_str = { "&r", "r", "rW" } },
            };
            return &ctz[have_bmi1];
        }
    case INDEX_op_clz_i32:
    case INDEX_op_clz_i64:
        {
            static const TCGTargetOpDef clz[2] = {
                { .args_ct_str = { "&r", "r", "r" } },
                { .args_ct_str = { "&r", "r", "rW" } },
            };
            return &clz[have_lzcnt];
        }

    case INDEX_op_qemu_ld_i32:
        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
    case INDEX_op_qemu_st_i32:
        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
    case INDEX_op_qemu_ld_i64:
        return (TCG_TARGET_REG_BITS == 64 ? &r_L
                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
                : &r_r_L_L);
    case INDEX_op_qemu_st_i64:
        return (TCG_TARGET_REG_BITS == 64 ? &L_L
                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
                : &L_L_L_L);

    case INDEX_op_brcond2_i32:
        {
            static const TCGTargetOpDef b2
                = { .args_ct_str = { "r", "r", "ri", "ri" } };
            return &b2;
        }
    case INDEX_op_setcond2_i32:
        {
            static const TCGTargetOpDef s2
                = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
            return &s2;
        }

    case INDEX_op_ld_vec:
    case INDEX_op_st_vec:
        return &x_r;

    case INDEX_op_add_vec:
    case INDEX_op_sub_vec:
    case INDEX_op_mul_vec:
    case INDEX_op_and_vec:
    case INDEX_op_or_vec:
    case INDEX_op_xor_vec:
    case INDEX_op_andc_vec:
    case INDEX_op_cmp_vec:
    case INDEX_op_x86_shufps_vec:
    case INDEX_op_x86_blend_vec:
    case INDEX_op_x86_packss_vec:
    case INDEX_op_x86_packus_vec:
    case INDEX_op_x86_vperm2i128_vec:
    case INDEX_op_x86_punpckl_vec:
    case INDEX_op_x86_punpckh_vec:
#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_dup2_vec:
#endif
        return &x_x_x;
    case INDEX_op_dup_vec:
    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
    case INDEX_op_sari_vec:
    case INDEX_op_x86_psrldq_vec:
        return &x_x;
    case INDEX_op_x86_vpblendvb_vec:
        return &x_x_x_x;

    default:
        break;
    }
    return NULL;
}
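/*
 * tcg_can_emit_vec_op reports, per opcode/type/element size, whether this
 * backend handles a vector op natively (1), cannot handle it at all (0),
 * or wants the generic code to call tcg_expand_vec_op() so it can be
 * rewritten in terms of other ops (-1), as is done for the MO_8 shifts and
 * multiplies below.
 */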
int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
{
    switch (opc) {
    case INDEX_op_add_vec:
    case INDEX_op_sub_vec:
    case INDEX_op_and_vec:
    case INDEX_op_or_vec:
    case INDEX_op_xor_vec:
    case INDEX_op_andc_vec:
        return 1;
    case INDEX_op_cmp_vec:
        return -1;

    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
        /* We must expand the operation for MO_8.  */
        return vece == MO_8 ? -1 : 1;

    case INDEX_op_sari_vec:
        /* We must expand the operation for MO_8.  */
        if (vece == MO_8) {
            return -1;
        }
        /* We can emulate this for MO_64, but it does not pay off
           unless we're producing at least 4 values.  */
        if (vece == MO_64) {
            return type >= TCG_TYPE_V256 ? -1 : 0;
        }
        return 1;

    case INDEX_op_mul_vec:
        if (vece == MO_8) {
            /* We can expand the operation for MO_8.  */
            return -1;
        }
        if (vece == MO_64) {
            return 0;
        }
        return 1;

    default:
        return 0;
    }
}
void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
                       TCGArg a0, ...)
{
    va_list va;
    TCGArg a1, a2;
    TCGv_vec v0, t1, t2, t3, t4;

    va_start(va, a0);
    v0 = temp_tcgv_vec(arg_temp(a0));

    switch (opc) {
    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
        tcg_debug_assert(vece == MO_8);
        a1 = va_arg(va, TCGArg);
        a2 = va_arg(va, TCGArg);
        /* Unpack to W, shift, and repack.  Tricky bits:
           (1) Use punpck*bw x,x to produce DDCCBBAA,
               i.e. duplicate in other half of the 16-bit lane.
           (2) For right-shift, add 8 so that the high half of
               the lane becomes zero.  For left-shift, we must
               shift up and down again.
           (3) Step 2 leaves high half zero such that PACKUSWB
               (pack with unsigned saturation) does not modify
               the quantity.  */
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), a1, a1);
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t2), a1, a1);
        if (opc == INDEX_op_shri_vec) {
            vec_gen_3(INDEX_op_shri_vec, type, MO_16,
                      tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
            vec_gen_3(INDEX_op_shri_vec, type, MO_16,
                      tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
        } else {
            vec_gen_3(INDEX_op_shli_vec, type, MO_16,
                      tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
            vec_gen_3(INDEX_op_shli_vec, type, MO_16,
                      tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
            vec_gen_3(INDEX_op_shri_vec, type, MO_16,
                      tcgv_vec_arg(t1), tcgv_vec_arg(t1), 8);
            vec_gen_3(INDEX_op_shri_vec, type, MO_16,
                      tcgv_vec_arg(t2), tcgv_vec_arg(t2), 8);
        }
        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
                  a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;
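    /*
     * Worked example for the right shift above, for one byte lane b = 0xd3
     * shifted by 3: punpcklbw b,b widens the lane to 0xd3d3; shifting the
     * 16-bit lane right by 3 + 8 = 11 gives 0x001a, which is exactly b >> 3
     * sitting in the low byte with the high byte zero, so PACKUSWB copies
     * it back out unchanged.
     */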
    case INDEX_op_sari_vec:
        a1 = va_arg(va, TCGArg);
        a2 = va_arg(va, TCGArg);
        if (vece == MO_8) {
            /* Unpack to W, shift, and repack, as above.  */
            t1 = tcg_temp_new_vec(type);
            t2 = tcg_temp_new_vec(type);
            vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                      tcgv_vec_arg(t1), a1, a1);
            vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                      tcgv_vec_arg(t2), a1, a1);
            vec_gen_3(INDEX_op_sari_vec, type, MO_16,
                      tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
            vec_gen_3(INDEX_op_sari_vec, type, MO_16,
                      tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
            vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
                      a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
            tcg_temp_free_vec(t1);
            tcg_temp_free_vec(t2);
            break;
        }

        tcg_debug_assert(vece == MO_64);
        /* MO_64: If the shift is <= 32, we can emulate the sign extend by
           performing an arithmetic 32-bit shift and overwriting the high
           half of the result (note that the ISA says shift of 32 is valid). */
        if (a2 <= 32) {
            t1 = tcg_temp_new_vec(type);
            vec_gen_3(INDEX_op_sari_vec, type, MO_32, tcgv_vec_arg(t1), a1, a2);
            vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
                      a0, a0, tcgv_vec_arg(t1), 0xaa);
            tcg_temp_free_vec(t1);
            break;
        }

        /* Otherwise we will need to use a compare vs 0 to produce the
           sign-extend, shift and merge.  */
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_const_zeros_vec(type);
        vec_gen_4(INDEX_op_cmp_vec, type, MO_64,
                  tcgv_vec_arg(t1), tcgv_vec_arg(t2), a1, TCG_COND_GT);
        tcg_temp_free_vec(t2);
        vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
        vec_gen_3(INDEX_op_shli_vec, type, MO_64,
                  tcgv_vec_arg(t1), tcgv_vec_arg(t1), 64 - a2);
        vec_gen_3(INDEX_op_or_vec, type, MO_64, a0, a0, tcgv_vec_arg(t1));
        tcg_temp_free_vec(t1);
        break;
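    /*
     * The large-shift path above uses the identity
     *     sar64(x, c) = shr64(x, c) | (sign_mask(x) << (64 - c))
     * where sign_mask(x) is all-ones when x is negative (computed here as
     * 0 > x with a signed 64-bit compare) and zero otherwise, so the OR
     * fills the top c bits with copies of the sign bit.
     */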
    case INDEX_op_mul_vec:
        tcg_debug_assert(vece == MO_8);
        a1 = va_arg(va, TCGArg);
        a2 = va_arg(va, TCGArg);
        switch (type) {
        case TCG_TYPE_V64:
            t1 = tcg_temp_new_vec(TCG_TYPE_V128);
            t2 = tcg_temp_new_vec(TCG_TYPE_V128);
            tcg_gen_dup16i_vec(t2, 0);
            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                      tcgv_vec_arg(t1), a1, tcgv_vec_arg(t2));
            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                      tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2);
            tcg_gen_mul_vec(MO_16, t1, t1, t2);
            tcg_gen_shri_vec(MO_16, t1, t1, 8);
            vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
                      a0, tcgv_vec_arg(t1), tcgv_vec_arg(t1));
            tcg_temp_free_vec(t1);
            tcg_temp_free_vec(t2);
            break;

        case TCG_TYPE_V128:
            t1 = tcg_temp_new_vec(TCG_TYPE_V128);
            t2 = tcg_temp_new_vec(TCG_TYPE_V128);
            t3 = tcg_temp_new_vec(TCG_TYPE_V128);
            t4 = tcg_temp_new_vec(TCG_TYPE_V128);
            tcg_gen_dup16i_vec(t4, 0);
            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                      tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                      tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
            vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
                      tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
            vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
                      tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
            tcg_gen_mul_vec(MO_16, t1, t1, t2);
            tcg_gen_mul_vec(MO_16, t3, t3, t4);
            tcg_gen_shri_vec(MO_16, t1, t1, 8);
            tcg_gen_shri_vec(MO_16, t3, t3, 8);
            vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
                      a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
            tcg_temp_free_vec(t1);
            tcg_temp_free_vec(t2);
            tcg_temp_free_vec(t3);
            tcg_temp_free_vec(t4);
            break;

        case TCG_TYPE_V256:
            t1 = tcg_temp_new_vec(TCG_TYPE_V256);
            t2 = tcg_temp_new_vec(TCG_TYPE_V256);
            t3 = tcg_temp_new_vec(TCG_TYPE_V256);
            t4 = tcg_temp_new_vec(TCG_TYPE_V256);
            tcg_gen_dup16i_vec(t4, 0);
            /* a1: A[0-7] ... D[0-7]; a2: W[0-7] ... Z[0-7]
               t1: extends of B[0-7], D[0-7]
               t2: extends of X[0-7], Z[0-7]
               t3: extends of A[0-7], C[0-7]
               t4: extends of W[0-7], Y[0-7].  */
            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
                      tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
                      tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
            vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
                      tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
            vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
                      tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
            /* t1: BX DZ; t2: AW CY.  */
            tcg_gen_mul_vec(MO_16, t1, t1, t2);
            tcg_gen_mul_vec(MO_16, t3, t3, t4);
            tcg_gen_shri_vec(MO_16, t1, t1, 8);
            tcg_gen_shri_vec(MO_16, t3, t3, 8);
            /* a0: AW BX CY DZ.  */
            vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V256, MO_8,
                      a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
            tcg_temp_free_vec(t1);
            tcg_temp_free_vec(t2);
            tcg_temp_free_vec(t3);
            tcg_temp_free_vec(t4);
            break;

        default:
            g_assert_not_reached();
        }
        break;
    case INDEX_op_cmp_vec:
        {
            enum {
                NEED_SWAP = 1,
                NEED_INV  = 2,
                NEED_BIAS = 4
            };
            static const uint8_t fixups[16] = {
                [0 ... 15] = -1,
                [TCG_COND_EQ] = 0,
                [TCG_COND_NE] = NEED_INV,
                [TCG_COND_GT] = 0,
                [TCG_COND_LT] = NEED_SWAP,
                [TCG_COND_LE] = NEED_INV,
                [TCG_COND_GE] = NEED_SWAP | NEED_INV,
                [TCG_COND_GTU] = NEED_BIAS,
                [TCG_COND_LTU] = NEED_BIAS | NEED_SWAP,
                [TCG_COND_LEU] = NEED_BIAS | NEED_INV,
                [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV,
            };
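            /*
             * The NEED_BIAS fixup turns an unsigned comparison into a
             * signed one by subtracting the sign bit, 1 << (element_bits -
             * 1), from both operands: e.g. for MO_8, 0xff (255) becomes
             * 0x7f and 0x01 becomes 0x81, so "255 >u 1" maps to
             * "127 >s -127", which the signed greater-than compare can
             * evaluate directly.
             */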
            TCGCond cond;
            uint8_t fixup;

            a1 = va_arg(va, TCGArg);
            a2 = va_arg(va, TCGArg);
            cond = va_arg(va, TCGArg);
            fixup = fixups[cond & 15];
            tcg_debug_assert(fixup != 0xff);

            if (fixup & NEED_INV) {
                cond = tcg_invert_cond(cond);
            }
            if (fixup & NEED_SWAP) {
                TCGArg t;
                t = a1, a1 = a2, a2 = t;
                cond = tcg_swap_cond(cond);
            }

            if (fixup & NEED_BIAS) {
                t1 = tcg_temp_new_vec(type);
                t2 = tcg_temp_new_vec(type);
                tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
                tcg_gen_sub_vec(vece, t1, temp_tcgv_vec(arg_temp(a1)), t2);
                tcg_gen_sub_vec(vece, t2, temp_tcgv_vec(arg_temp(a2)), t2);
                a1 = tcgv_vec_arg(t1);
                a2 = tcgv_vec_arg(t2);
                cond = tcg_signed_cond(cond);
            }

            tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
            vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond);

            if (fixup & NEED_BIAS) {
                tcg_temp_free_vec(t1);
                tcg_temp_free_vec(t2);
            }
            if (fixup & NEED_INV) {
                tcg_gen_not_vec(vece, v0, v0);
            }
        }
        break;

    default:
        break;
    }

    va_end(va);
}
static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
#if defined(_WIN64)
    TCG_REG_RDI,
    TCG_REG_RSI,
#endif
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14, /* Currently used for the global env. */
    TCG_REG_R15,
#else
    TCG_REG_EBP, /* Currently used for the global env. */
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
#endif
};

/* Compute frame size via macros, to share between tcg_target_qemu_prologue
   and tcg_register_jit.  */

#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
     * (TCG_TARGET_REG_BITS / 8))

#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
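/*
 * Worked example, assuming the usual values TCG_STATIC_CALL_ARGS_SIZE = 128,
 * CPU_TEMP_BUF_NLONGS = 128 and TCG_TARGET_STACK_ALIGN = 16 on an x86-64
 * Linux host (six callee-saved registers plus the return address):
 * PUSH_SIZE = 7 * 8 = 56, and FRAME_SIZE rounds 56 + 128 + 1024 = 1208 up
 * to 1216, so the prologue below subtracts 1216 - 56 = 1160 from the stack
 * pointer after the pushes.
 */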
/* Generate global QEMU prologue and epilogue code */
static void tcg_target_qemu_prologue(TCGContext *s)
{
    int i, stack_addend;

    /* Reserve some stack space, also for TCG temps.  */
    stack_addend = FRAME_SIZE - PUSH_SIZE;
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  CPU_TEMP_BUF_NLONGS * sizeof(long));

    /* Save all callee saved registers.  */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
    }

#if TCG_TARGET_REG_BITS == 32
    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb.  */
    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
                         + stack_addend);
#else
    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb.  */
    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
#endif

    /*
     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
     * and fall through to the rest of the epilogue.
     */
    s->code_gen_epilogue = s->code_ptr;
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);

    /* TB epilogue */
    tb_ret_addr = s->code_ptr;

    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);

    if (have_avx2) {
        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
    }
    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    }
    tcg_out_opc(s, OPC_RET, 0, 0, 0);

#if !defined(CONFIG_SOFTMMU)
    /* Try to set up a segment register to point to guest_base.  */
    if (guest_base) {
        setup_guest_base_seg();
    }
#endif
}
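/*
 * Resulting stack layout while a TB runs (64-bit host, growing downward):
 * the return address, then the callee-saved registers pushed above, then
 * stack_addend bytes split between TCG_STATIC_CALL_ARGS_SIZE for outgoing
 * helper-call arguments at the lowest addresses and the CPU_TEMP_BUF_NLONGS
 * temp buffer registered with tcg_set_frame() above it.
 */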
static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
    memset(p, 0x90, count);
}
static void tcg_target_init(TCGContext *s)
{
#ifdef CONFIG_CPUID_H
    unsigned a, b, c, d, b7 = 0;
    int max = __get_cpuid_max(0, 0);

    if (max >= 7) {
        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
        __cpuid_count(7, 0, a, b7, c, d);
        have_bmi1 = (b7 & bit_BMI) != 0;
        have_bmi2 = (b7 & bit_BMI2) != 0;
    }

    if (max >= 1) {
        __cpuid(1, a, b, c, d);
#ifndef have_cmov
        /* For 32-bit, 99% certainty that we're running on hardware that
           supports cmov, but we still need to check.  In case cmov is not
           available, we'll use a small forward branch.  */
        have_cmov = (d & bit_CMOV) != 0;
#endif

        /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
           need to probe for it.  */
        have_movbe = (c & bit_MOVBE) != 0;
        have_popcnt = (c & bit_POPCNT) != 0;
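        /*
         * AVX detection below additionally requires OSXSAVE and an XGETBV
         * check: bits 1 and 2 of XCR0 (the 0x6 mask) confirm that the OS
         * saves and restores the SSE and AVX register state, without which
         * executing VEX-encoded instructions would fault even though CPUID
         * advertises them.
         */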
        /* There are a number of things we must check before we can be
           sure of not hitting invalid opcode.  */
        if (c & bit_OSXSAVE) {
            unsigned xcrl, xcrh;
            /* The xgetbv instruction is not available to older versions of
             * the assembler, so we encode the instruction manually.
             */
            asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
            if ((xcrl & 6) == 6) {
                have_avx1 = (c & bit_AVX) != 0;
                have_avx2 = (b7 & bit_AVX2) != 0;
            }
        }
    }

    max = __get_cpuid_max(0x8000000, 0);
    if (max >= 1) {
        __cpuid(0x80000001, a, b, c, d);
        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
        have_lzcnt = (c & bit_LZCNT) != 0;
    }
#endif /* CONFIG_CPUID_H */

    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
    if (TCG_TARGET_REG_BITS == 64) {
        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
    }
    if (have_avx1) {
        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
    }
    if (have_avx2) {
        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
    }

    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
    if (TCG_TARGET_REG_BITS == 64) {
#if !defined(_WIN64)
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
#endif
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
    }

    s->reserved_regs = 0;
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
}
typedef struct {
    DebugFrameHeader h;
    uint8_t fde_def_cfa[4];
    uint8_t fde_reg_ofs[14];
} DebugFrame;

/* We're expecting a 2 byte uleb128 encoded value.  */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
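/*
 * In the .fde_reg_ofs byte pairs below, the first byte is the DWARF
 * DW_CFA_offset opcode (0x80) combined with the DWARF register number, and
 * the second is an unsigned LEB128 factor multiplied by data_align to give
 * the slot's offset from the CFA: e.g. 0x86, 2 on x86-64 means register 6
 * (%rbp) is saved at 2 * -8 = -16.
 */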
#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs. */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs. */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif
#if defined(ELF_HOST_MACHINE)
void tcg_register_jit(void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));