tcg/i386: Implement vector saturating arithmetic
[qemu/ar7.git] / tcg / i386 / tcg-target.inc.c
blob a5791dfaa57285fe9efb4d862e947c0943c7b3dc
1 /*
2 * Tiny Code Generator for QEMU
4 * Copyright (c) 2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
25 #include "tcg-pool.inc.c"
27 #ifdef CONFIG_DEBUG_TCG
28 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
29 #if TCG_TARGET_REG_BITS == 64
30 "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
31 #else
32 "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
33 #endif
34 "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
35 "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
36 #if TCG_TARGET_REG_BITS == 64
37 "%xmm8", "%xmm9", "%xmm10", "%xmm11",
38 "%xmm12", "%xmm13", "%xmm14", "%xmm15",
39 #endif
41 #endif
43 static const int tcg_target_reg_alloc_order[] = {
44 #if TCG_TARGET_REG_BITS == 64
45 TCG_REG_RBP,
46 TCG_REG_RBX,
47 TCG_REG_R12,
48 TCG_REG_R13,
49 TCG_REG_R14,
50 TCG_REG_R15,
51 TCG_REG_R10,
52 TCG_REG_R11,
53 TCG_REG_R9,
54 TCG_REG_R8,
55 TCG_REG_RCX,
56 TCG_REG_RDX,
57 TCG_REG_RSI,
58 TCG_REG_RDI,
59 TCG_REG_RAX,
60 #else
61 TCG_REG_EBX,
62 TCG_REG_ESI,
63 TCG_REG_EDI,
64 TCG_REG_EBP,
65 TCG_REG_ECX,
66 TCG_REG_EDX,
67 TCG_REG_EAX,
68 #endif
69 TCG_REG_XMM0,
70 TCG_REG_XMM1,
71 TCG_REG_XMM2,
72 TCG_REG_XMM3,
73 TCG_REG_XMM4,
74 TCG_REG_XMM5,
75 #ifndef _WIN64
76 /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
77 any of them. Therefore only allow xmm0-xmm5 to be allocated. */
78 TCG_REG_XMM6,
79 TCG_REG_XMM7,
80 #if TCG_TARGET_REG_BITS == 64
81 TCG_REG_XMM8,
82 TCG_REG_XMM9,
83 TCG_REG_XMM10,
84 TCG_REG_XMM11,
85 TCG_REG_XMM12,
86 TCG_REG_XMM13,
87 TCG_REG_XMM14,
88 TCG_REG_XMM15,
89 #endif
90 #endif
93 static const int tcg_target_call_iarg_regs[] = {
94 #if TCG_TARGET_REG_BITS == 64
95 #if defined(_WIN64)
96 TCG_REG_RCX,
97 TCG_REG_RDX,
98 #else
99 TCG_REG_RDI,
100 TCG_REG_RSI,
101 TCG_REG_RDX,
102 TCG_REG_RCX,
103 #endif
104 TCG_REG_R8,
105 TCG_REG_R9,
106 #else
107 /* 32-bit mode uses a stack-based calling convention (GCC default). */
108 #endif
111 static const int tcg_target_call_oarg_regs[] = {
112 TCG_REG_EAX,
113 #if TCG_TARGET_REG_BITS == 32
114 TCG_REG_EDX
115 #endif
118 /* Constants we accept. */
119 #define TCG_CT_CONST_S32 0x100
120 #define TCG_CT_CONST_U32 0x200
121 #define TCG_CT_CONST_I32 0x400
122 #define TCG_CT_CONST_WSZ 0x800
124 /* Registers used with L constraint, which are the first argument
125 registers on x86_64, and two random call-clobbered registers on
126 i386. */
127 #if TCG_TARGET_REG_BITS == 64
128 # define TCG_REG_L0 tcg_target_call_iarg_regs[0]
129 # define TCG_REG_L1 tcg_target_call_iarg_regs[1]
130 #else
131 # define TCG_REG_L0 TCG_REG_EAX
132 # define TCG_REG_L1 TCG_REG_EDX
133 #endif
135 /* The host compiler should supply <cpuid.h> to enable runtime feature
136 detection, as we're not going to go so far as our own inline assembly.
137 If not available, default values will be assumed. */
138 #if defined(CONFIG_CPUID_H)
139 #include "qemu/cpuid.h"
140 #endif
142 /* For 64-bit, we always know that CMOV is available. */
143 #if TCG_TARGET_REG_BITS == 64
144 # define have_cmov 1
145 #elif defined(CONFIG_CPUID_H)
146 static bool have_cmov;
147 #else
148 # define have_cmov 0
149 #endif
151 /* We need these symbols in tcg-target.h, and we can't properly conditionalize
152 it there. Therefore we always define the variable. */
153 bool have_bmi1;
154 bool have_popcnt;
155 bool have_avx1;
156 bool have_avx2;
158 #ifdef CONFIG_CPUID_H
159 static bool have_movbe;
160 static bool have_bmi2;
161 static bool have_lzcnt;
162 #else
163 # define have_movbe 0
164 # define have_bmi2 0
165 # define have_lzcnt 0
166 #endif
168 static tcg_insn_unit *tb_ret_addr;
170 static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
171 intptr_t value, intptr_t addend)
173 value += addend;
174 switch(type) {
175 case R_386_PC32:
176 value -= (uintptr_t)code_ptr;
177 if (value != (int32_t)value) {
178 return false;
180 /* FALLTHRU */
181 case R_386_32:
182 tcg_patch32(code_ptr, value);
183 break;
184 case R_386_PC8:
185 value -= (uintptr_t)code_ptr;
186 if (value != (int8_t)value) {
187 return false;
189 tcg_patch8(code_ptr, value);
190 break;
191 default:
192 tcg_abort();
194 return true;
197 #if TCG_TARGET_REG_BITS == 64
198 #define ALL_GENERAL_REGS 0x0000ffffu
199 #define ALL_VECTOR_REGS 0xffff0000u
200 #else
201 #define ALL_GENERAL_REGS 0x000000ffu
202 #define ALL_VECTOR_REGS 0x00ff0000u
203 #endif
205 /* parse target specific constraints */
206 static const char *target_parse_constraint(TCGArgConstraint *ct,
207 const char *ct_str, TCGType type)
209 switch(*ct_str++) {
210 case 'a':
211 ct->ct |= TCG_CT_REG;
212 tcg_regset_set_reg(ct->u.regs, TCG_REG_EAX);
213 break;
214 case 'b':
215 ct->ct |= TCG_CT_REG;
216 tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
217 break;
218 case 'c':
219 ct->ct |= TCG_CT_REG;
220 tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
221 break;
222 case 'd':
223 ct->ct |= TCG_CT_REG;
224 tcg_regset_set_reg(ct->u.regs, TCG_REG_EDX);
225 break;
226 case 'S':
227 ct->ct |= TCG_CT_REG;
228 tcg_regset_set_reg(ct->u.regs, TCG_REG_ESI);
229 break;
230 case 'D':
231 ct->ct |= TCG_CT_REG;
232 tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
233 break;
234 case 'q':
235 /* A register that can be used as a byte operand. */
236 ct->ct |= TCG_CT_REG;
237 ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
238 break;
239 case 'Q':
240 /* A register with an addressable second byte (e.g. %ah). */
241 ct->ct |= TCG_CT_REG;
242 ct->u.regs = 0xf;
243 break;
244 case 'r':
245 /* A general register. */
246 ct->ct |= TCG_CT_REG;
247 ct->u.regs |= ALL_GENERAL_REGS;
248 break;
249 case 'W':
250 /* With TZCNT/LZCNT, we can have operand-size as an input. */
251 ct->ct |= TCG_CT_CONST_WSZ;
252 break;
253 case 'x':
254 /* A vector register. */
255 ct->ct |= TCG_CT_REG;
256 ct->u.regs |= ALL_VECTOR_REGS;
257 break;
259 /* qemu_ld/st address constraint */
260 case 'L':
261 ct->ct |= TCG_CT_REG;
262 ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
263 tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
264 tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
265 break;
267 case 'e':
268 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
269 break;
270 case 'Z':
271 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
272 break;
273 case 'I':
274 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
275 break;
277 default:
278 return NULL;
280 return ct_str;
283 /* test if a constant matches the constraint */
284 static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
285 const TCGArgConstraint *arg_ct)
287 int ct = arg_ct->ct;
288 if (ct & TCG_CT_CONST) {
289 return 1;
291 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
292 return 1;
294 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
295 return 1;
297 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
298 return 1;
300 if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
301 return 1;
303 return 0;
306 # define LOWREGMASK(x) ((x) & 7)
308 #define P_EXT 0x100 /* 0x0f opcode prefix */
309 #define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */
310 #define P_DATA16 0x400 /* 0x66 opcode prefix */
311 #if TCG_TARGET_REG_BITS == 64
312 # define P_REXW 0x1000 /* Set REX.W = 1 */
313 # define P_REXB_R 0x2000 /* REG field as byte register */
314 # define P_REXB_RM 0x4000 /* R/M field as byte register */
315 # define P_GS 0x8000 /* gs segment override */
316 #else
317 # define P_REXW 0
318 # define P_REXB_R 0
319 # define P_REXB_RM 0
320 # define P_GS 0
321 #endif
322 #define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */
323 #define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */
324 #define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */
325 #define P_VEXL 0x80000 /* Set VEX.L = 1 */
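/* For illustration (hypothetical call, not emitted here): an opcode value of
   (0xec | P_EXT | P_DATA16), defined below as OPC_PADDSB, is written out by
   tcg_out_opc as 66 0f ec -- the 0x66 prefix from P_DATA16, the 0x0f escape
   from P_EXT, then the opcode byte itself -- which is the SSE2 PADDSB
   instruction used for the signed saturating byte add. */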
327 #define OPC_ARITH_EvIz (0x81)
328 #define OPC_ARITH_EvIb (0x83)
329 #define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */
330 #define OPC_ANDN (0xf2 | P_EXT38)
331 #define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3))
332 #define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16)
333 #define OPC_BSF (0xbc | P_EXT)
334 #define OPC_BSR (0xbd | P_EXT)
335 #define OPC_BSWAP (0xc8 | P_EXT)
336 #define OPC_CALL_Jz (0xe8)
337 #define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */
338 #define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3))
339 #define OPC_DEC_r32 (0x48)
340 #define OPC_IMUL_GvEv (0xaf | P_EXT)
341 #define OPC_IMUL_GvEvIb (0x6b)
342 #define OPC_IMUL_GvEvIz (0x69)
343 #define OPC_INC_r32 (0x40)
344 #define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */
345 #define OPC_JCC_short (0x70) /* ... plus condition code */
346 #define OPC_JMP_long (0xe9)
347 #define OPC_JMP_short (0xeb)
348 #define OPC_LEA (0x8d)
349 #define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3)
350 #define OPC_MOVB_EvGv (0x88) /* stores, more or less */
351 #define OPC_MOVL_EvGv (0x89) /* stores, more or less */
352 #define OPC_MOVL_GvEv (0x8b) /* loads, more or less */
353 #define OPC_MOVB_EvIz (0xc6)
354 #define OPC_MOVL_EvIz (0xc7)
355 #define OPC_MOVL_Iv (0xb8)
356 #define OPC_MOVBE_GyMy (0xf0 | P_EXT38)
357 #define OPC_MOVBE_MyGy (0xf1 | P_EXT38)
358 #define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16)
359 #define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16)
360 #define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2)
361 #define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
362 #define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
363 #define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
364 #define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
365 #define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3)
366 #define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16)
367 #define OPC_MOVSBL (0xbe | P_EXT)
368 #define OPC_MOVSWL (0xbf | P_EXT)
369 #define OPC_MOVSLQ (0x63 | P_REXW)
370 #define OPC_MOVZBL (0xb6 | P_EXT)
371 #define OPC_MOVZWL (0xb7 | P_EXT)
372 #define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16)
373 #define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16)
374 #define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16)
375 #define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16)
376 #define OPC_PADDB (0xfc | P_EXT | P_DATA16)
377 #define OPC_PADDW (0xfd | P_EXT | P_DATA16)
378 #define OPC_PADDD (0xfe | P_EXT | P_DATA16)
379 #define OPC_PADDQ (0xd4 | P_EXT | P_DATA16)
380 #define OPC_PADDSB (0xec | P_EXT | P_DATA16)
381 #define OPC_PADDSW (0xed | P_EXT | P_DATA16)
382 #define OPC_PADDUB (0xdc | P_EXT | P_DATA16)
383 #define OPC_PADDUW (0xdd | P_EXT | P_DATA16)
384 #define OPC_PAND (0xdb | P_EXT | P_DATA16)
385 #define OPC_PANDN (0xdf | P_EXT | P_DATA16)
386 #define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16)
387 #define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16)
388 #define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16)
389 #define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16)
390 #define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16)
391 #define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16)
392 #define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16)
393 #define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16)
394 #define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16)
395 #define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16)
396 #define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16)
397 #define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16)
398 #define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16)
399 #define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16)
400 #define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16)
401 #define OPC_PMULLW (0xd5 | P_EXT | P_DATA16)
402 #define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16)
403 #define OPC_POR (0xeb | P_EXT | P_DATA16)
404 #define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16)
405 #define OPC_PSHUFD (0x70 | P_EXT | P_DATA16)
406 #define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2)
407 #define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3)
408 #define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
409 #define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
410 #define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
411 #define OPC_PSUBB (0xf8 | P_EXT | P_DATA16)
412 #define OPC_PSUBW (0xf9 | P_EXT | P_DATA16)
413 #define OPC_PSUBD (0xfa | P_EXT | P_DATA16)
414 #define OPC_PSUBQ (0xfb | P_EXT | P_DATA16)
415 #define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16)
416 #define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16)
417 #define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16)
418 #define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16)
419 #define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16)
420 #define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16)
421 #define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16)
422 #define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16)
423 #define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16)
424 #define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16)
425 #define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16)
426 #define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16)
427 #define OPC_PXOR (0xef | P_EXT | P_DATA16)
428 #define OPC_POP_r32 (0x58)
429 #define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3)
430 #define OPC_PUSH_r32 (0x50)
431 #define OPC_PUSH_Iv (0x68)
432 #define OPC_PUSH_Ib (0x6a)
433 #define OPC_RET (0xc3)
434 #define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
435 #define OPC_SHIFT_1 (0xd1)
436 #define OPC_SHIFT_Ib (0xc1)
437 #define OPC_SHIFT_cl (0xd3)
438 #define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3)
439 #define OPC_SHUFPS (0xc6 | P_EXT)
440 #define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16)
441 #define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2)
442 #define OPC_TESTL (0x85)
443 #define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3)
444 #define OPC_UD2 (0x0b | P_EXT)
445 #define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16)
446 #define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16)
447 #define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
448 #define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
449 #define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
450 #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
451 #define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_REXW)
452 #define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
453 #define OPC_VZEROUPPER (0x77 | P_EXT)
454 #define OPC_XCHG_ax_r32 (0x90)
456 #define OPC_GRP3_Ev (0xf7)
457 #define OPC_GRP5 (0xff)
458 #define OPC_GRP14 (0x73 | P_EXT | P_DATA16)
460 /* Group 1 opcode extensions for 0x80-0x83.
461 These are also used as modifiers for OPC_ARITH. */
462 #define ARITH_ADD 0
463 #define ARITH_OR 1
464 #define ARITH_ADC 2
465 #define ARITH_SBB 3
466 #define ARITH_AND 4
467 #define ARITH_SUB 5
468 #define ARITH_XOR 6
469 #define ARITH_CMP 7
471 /* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */
472 #define SHIFT_ROL 0
473 #define SHIFT_ROR 1
474 #define SHIFT_SHL 4
475 #define SHIFT_SHR 5
476 #define SHIFT_SAR 7
478 /* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. */
479 #define EXT3_NOT 2
480 #define EXT3_NEG 3
481 #define EXT3_MUL 4
482 #define EXT3_IMUL 5
483 #define EXT3_DIV 6
484 #define EXT3_IDIV 7
486 /* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */
487 #define EXT5_INC_Ev 0
488 #define EXT5_DEC_Ev 1
489 #define EXT5_CALLN_Ev 2
490 #define EXT5_JMPN_Ev 4
492 /* Condition codes to be added to OPC_JCC_{long,short}. */
493 #define JCC_JMP (-1)
494 #define JCC_JO 0x0
495 #define JCC_JNO 0x1
496 #define JCC_JB 0x2
497 #define JCC_JAE 0x3
498 #define JCC_JE 0x4
499 #define JCC_JNE 0x5
500 #define JCC_JBE 0x6
501 #define JCC_JA 0x7
502 #define JCC_JS 0x8
503 #define JCC_JNS 0x9
504 #define JCC_JP 0xa
505 #define JCC_JNP 0xb
506 #define JCC_JL 0xc
507 #define JCC_JGE 0xd
508 #define JCC_JLE 0xe
509 #define JCC_JG 0xf
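/* Example (illustrative only): tcg_out_jxx(s, JCC_JB, l, 0) with a distant
   label goes through OPC_JCC_long and emits 0f 82 <rel32>, i.e. a
   32-bit-displacement "jb", which is how the TCG_COND_LTU branches in the
   table below are realized. */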
511 static const uint8_t tcg_cond_to_jcc[] = {
512 [TCG_COND_EQ] = JCC_JE,
513 [TCG_COND_NE] = JCC_JNE,
514 [TCG_COND_LT] = JCC_JL,
515 [TCG_COND_GE] = JCC_JGE,
516 [TCG_COND_LE] = JCC_JLE,
517 [TCG_COND_GT] = JCC_JG,
518 [TCG_COND_LTU] = JCC_JB,
519 [TCG_COND_GEU] = JCC_JAE,
520 [TCG_COND_LEU] = JCC_JBE,
521 [TCG_COND_GTU] = JCC_JA,
524 #if TCG_TARGET_REG_BITS == 64
525 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
527 int rex;
529 if (opc & P_GS) {
530 tcg_out8(s, 0x65);
532 if (opc & P_DATA16) {
533 /* We should never be asking for both 16 and 64-bit operation. */
534 tcg_debug_assert((opc & P_REXW) == 0);
535 tcg_out8(s, 0x66);
537 if (opc & P_SIMDF3) {
538 tcg_out8(s, 0xf3);
539 } else if (opc & P_SIMDF2) {
540 tcg_out8(s, 0xf2);
543 rex = 0;
544 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */
545 rex |= (r & 8) >> 1; /* REX.R */
546 rex |= (x & 8) >> 2; /* REX.X */
547 rex |= (rm & 8) >> 3; /* REX.B */
549 /* P_REXB_{R,RM} indicates that the given register is the low byte.
550 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
551 as otherwise the encoding indicates %[abcd]h. Note that the values
552 that are ORed in merely indicate that the REX byte must be present;
553 those bits get discarded in output. */
554 rex |= opc & (r >= 4 ? P_REXB_R : 0);
555 rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
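/* Worked example (hypothetical operands): tcg_out_ext8u(s, TCG_REG_EAX,
   TCG_REG_ESI) arrives here with rm = 6 and P_REXB_RM set, so an
   otherwise-empty REX byte (0x40) is emitted and the result is
   40 0f b6 c6, "movzbl %sil, %eax", rather than an encoding of %dh. */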
557 if (rex) {
558 tcg_out8(s, (uint8_t)(rex | 0x40));
561 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
562 tcg_out8(s, 0x0f);
563 if (opc & P_EXT38) {
564 tcg_out8(s, 0x38);
565 } else if (opc & P_EXT3A) {
566 tcg_out8(s, 0x3a);
570 tcg_out8(s, opc);
572 #else
573 static void tcg_out_opc(TCGContext *s, int opc)
575 if (opc & P_DATA16) {
576 tcg_out8(s, 0x66);
578 if (opc & P_SIMDF3) {
579 tcg_out8(s, 0xf3);
580 } else if (opc & P_SIMDF2) {
581 tcg_out8(s, 0xf2);
583 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
584 tcg_out8(s, 0x0f);
585 if (opc & P_EXT38) {
586 tcg_out8(s, 0x38);
587 } else if (opc & P_EXT3A) {
588 tcg_out8(s, 0x3a);
591 tcg_out8(s, opc);
593 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
594 the 32-bit compilation paths. This method works with all versions of gcc,
595 whereas relying on optimization may not be able to exclude them. */
596 #define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc)
597 #endif
599 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
601 tcg_out_opc(s, opc, r, rm, 0);
602 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
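/* Example of the combined output (made-up register choice): on x86-64,
   tcg_out_modrm(s, OPC_MOVL_GvEv + P_REXW, TCG_REG_R8, TCG_REG_RAX)
   produces 4c 8b c0, i.e. "movq %rax, %r8" -- the REX.WR prefix from
   tcg_out_opc, the 0x8b opcode byte, and the register-form ModRM byte
   built above. */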
605 static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
606 int rm, int index)
608 int tmp;
610 /* Use the two byte form if possible, which cannot encode
611 VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */
612 if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
613 && ((rm | index) & 8) == 0) {
614 /* Two byte VEX prefix. */
615 tcg_out8(s, 0xc5);
617 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */
618 } else {
619 /* Three byte VEX prefix. */
620 tcg_out8(s, 0xc4);
622 /* VEX.m-mmmm */
623 if (opc & P_EXT3A) {
624 tmp = 3;
625 } else if (opc & P_EXT38) {
626 tmp = 2;
627 } else if (opc & P_EXT) {
628 tmp = 1;
629 } else {
630 g_assert_not_reached();
632 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */
633 tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */
634 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */
635 tcg_out8(s, tmp);
637 tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */
640 tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */
641 /* VEX.pp */
642 if (opc & P_DATA16) {
643 tmp |= 1; /* 0x66 */
644 } else if (opc & P_SIMDF3) {
645 tmp |= 2; /* 0xf3 */
646 } else if (opc & P_SIMDF2) {
647 tmp |= 3; /* 0xf2 */
649 tmp |= (~v & 15) << 3; /* VEX.vvvv */
650 tcg_out8(s, tmp);
651 tcg_out8(s, opc);
654 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
656 tcg_out_vex_opc(s, opc, r, v, rm, 0);
657 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
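/* Example (arbitrary register choice): tcg_out_vex_modrm(s, OPC_PADDSW,
   TCG_REG_XMM0, TCG_REG_XMM1, TCG_REG_XMM2) takes the two-byte VEX path
   and emits c5 f1 ed c2, i.e. "vpaddsw %xmm2, %xmm1, %xmm0" -- the
   instruction backing the signed saturating 16-bit vector add added by
   this patch. */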
660 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
661 A missing RM or INDEX is indicated by a negative value. In 64-bit
662 mode for absolute addresses, ~RM is the size of the immediate operand
663 that will follow the instruction. */
665 static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
666 int shift, intptr_t offset)
668 int mod, len;
670 if (index < 0 && rm < 0) {
671 if (TCG_TARGET_REG_BITS == 64) {
672 /* Try for a rip-relative addressing mode. This has replaced
673 the 32-bit-mode absolute addressing encoding. */
674 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
675 intptr_t disp = offset - pc;
676 if (disp == (int32_t)disp) {
677 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
678 tcg_out32(s, disp);
679 return;
682 /* Try for an absolute address encoding. This requires the
683 use of the MODRM+SIB encoding and is therefore larger than
684 rip-relative addressing. */
685 if (offset == (int32_t)offset) {
686 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
687 tcg_out8(s, (4 << 3) | 5);
688 tcg_out32(s, offset);
689 return;
692 /* ??? The memory isn't directly addressable. */
693 g_assert_not_reached();
694 } else {
695 /* Absolute address. */
696 tcg_out8(s, (r << 3) | 5);
697 tcg_out32(s, offset);
698 return;
702 /* Find the length of the immediate addend. Note that the encoding
703 that would be used for (%ebp) indicates absolute addressing. */
704 if (rm < 0) {
705 mod = 0, len = 4, rm = 5;
706 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
707 mod = 0, len = 0;
708 } else if (offset == (int8_t)offset) {
709 mod = 0x40, len = 1;
710 } else {
711 mod = 0x80, len = 4;
714 /* Use a single byte MODRM format if possible. Note that the encoding
715 that would be used for %esp is the escape to the two byte form. */
716 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
717 /* Single byte MODRM format. */
718 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
719 } else {
720 /* Two byte MODRM+SIB format. */
722 /* Note that the encoding that would place %esp into the index
723 field indicates no index register. In 64-bit mode, the REX.X
724 bit counts, so %r12 can be used as the index. */
725 if (index < 0) {
726 index = 4;
727 } else {
728 tcg_debug_assert(index != TCG_REG_ESP);
731 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
732 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
735 if (len == 1) {
736 tcg_out8(s, offset);
737 } else if (len == 4) {
738 tcg_out32(s, offset);
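/* Small example (illustrative): a register base with an 8-bit displacement,
   say tcg_out_modrm_offset(s, OPC_MOVL_GvEv, TCG_REG_EAX, TCG_REG_EBX, 8),
   takes the single-byte ModRM path above and yields 8b 43 08, i.e.
   "movl 8(%ebx), %eax". */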
742 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
743 int index, int shift, intptr_t offset)
745 tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
746 tcg_out_sib_offset(s, r, rm, index, shift, offset);
749 static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
750 int rm, int index, int shift,
751 intptr_t offset)
753 tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
754 tcg_out_sib_offset(s, r, rm, index, shift, offset);
757 /* A simplification of the above with no index or shift. */
758 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
759 int rm, intptr_t offset)
761 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
764 static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
765 int v, int rm, intptr_t offset)
767 tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
770 /* Output an opcode with an expected reference to the constant pool. */
771 static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
773 tcg_out_opc(s, opc, r, 0, 0);
774 /* Absolute for 32-bit, pc-relative for 64-bit. */
775 tcg_out8(s, LOWREGMASK(r) << 3 | 5);
776 tcg_out32(s, 0);
779 /* Output an opcode with an expected reference to the constant pool. */
780 static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
782 tcg_out_vex_opc(s, opc, r, 0, 0, 0);
783 /* Absolute for 32-bit, pc-relative for 64-bit. */
784 tcg_out8(s, LOWREGMASK(r) << 3 | 5);
785 tcg_out32(s, 0);
788 /* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */
789 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
791 /* Propagate an opcode prefix, such as P_REXW. */
792 int ext = subop & ~0x7;
793 subop &= 0x7;
795 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
798 static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
800 int rexw = 0;
802 if (arg == ret) {
803 return;
805 switch (type) {
806 case TCG_TYPE_I64:
807 rexw = P_REXW;
808 /* fallthru */
809 case TCG_TYPE_I32:
810 if (ret < 16) {
811 if (arg < 16) {
812 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
813 } else {
814 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
816 } else {
817 if (arg < 16) {
818 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
819 } else {
820 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
823 break;
825 case TCG_TYPE_V64:
826 tcg_debug_assert(ret >= 16 && arg >= 16);
827 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
828 break;
829 case TCG_TYPE_V128:
830 tcg_debug_assert(ret >= 16 && arg >= 16);
831 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
832 break;
833 case TCG_TYPE_V256:
834 tcg_debug_assert(ret >= 16 && arg >= 16);
835 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
836 break;
838 default:
839 g_assert_not_reached();
843 static void tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
844 TCGReg r, TCGReg a)
846 if (have_avx2) {
847 static const int dup_insn[4] = {
848 OPC_VPBROADCASTB, OPC_VPBROADCASTW,
849 OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
851 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
852 tcg_out_vex_modrm(s, dup_insn[vece] + vex_l, r, 0, a);
853 } else {
854 switch (vece) {
855 case MO_8:
856 /* ??? With zero in a register, use PSHUFB. */
857 tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
858 a = r;
859 /* FALLTHRU */
860 case MO_16:
861 tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
862 a = r;
863 /* FALLTHRU */
864 case MO_32:
865 tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
866 /* imm8 operand: all output lanes selected from input lane 0. */
867 tcg_out8(s, 0);
868 break;
869 case MO_64:
870 tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
871 break;
872 default:
873 g_assert_not_reached();
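/* Note on the non-AVX2 path above: for MO_8 the cases deliberately fall
   through, so the byte is first doubled into a 16-bit lane (punpcklbw),
   then into a 32-bit lane (punpcklwd), and finally pshufd with imm8 0
   broadcasts that 32-bit lane across the whole vector. */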
878 static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
879 TCGReg ret, tcg_target_long arg)
881 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
883 if (arg == 0) {
884 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
885 return;
887 if (arg == -1) {
888 tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
889 return;
892 if (TCG_TARGET_REG_BITS == 64) {
893 if (type == TCG_TYPE_V64) {
894 tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
895 } else if (have_avx2) {
896 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
897 } else {
898 tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
900 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
901 } else if (have_avx2) {
902 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
903 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
904 } else {
905 tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy, ret);
906 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
907 tcg_out_dup_vec(s, type, MO_32, ret, ret);
911 static void tcg_out_movi(TCGContext *s, TCGType type,
912 TCGReg ret, tcg_target_long arg)
914 tcg_target_long diff;
916 switch (type) {
917 case TCG_TYPE_I32:
918 #if TCG_TARGET_REG_BITS == 64
919 case TCG_TYPE_I64:
920 #endif
921 if (ret < 16) {
922 break;
924 /* fallthru */
925 case TCG_TYPE_V64:
926 case TCG_TYPE_V128:
927 case TCG_TYPE_V256:
928 tcg_debug_assert(ret >= 16);
929 tcg_out_dupi_vec(s, type, ret, arg);
930 return;
931 default:
932 g_assert_not_reached();
935 if (arg == 0) {
936 tgen_arithr(s, ARITH_XOR, ret, ret);
937 return;
939 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
940 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
941 tcg_out32(s, arg);
942 return;
944 if (arg == (int32_t)arg) {
945 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
946 tcg_out32(s, arg);
947 return;
950 /* Try a 7 byte pc-relative lea before the 10 byte movq. */
951 diff = arg - ((uintptr_t)s->code_ptr + 7);
952 if (diff == (int32_t)diff) {
953 tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
954 tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
955 tcg_out32(s, diff);
956 return;
959 tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
960 tcg_out64(s, arg);
963 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
965 if (val == (int8_t)val) {
966 tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
967 tcg_out8(s, val);
968 } else if (val == (int32_t)val) {
969 tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
970 tcg_out32(s, val);
971 } else {
972 tcg_abort();
976 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
978 /* Given the strength of x86 memory ordering, we only need to care about
979 store-load ordering. Experimentally, "lock orl $0,0(%esp)" is
980 faster than "mfence", so don't bother with the sse insn. */
981 if (a0 & TCG_MO_ST_LD) {
982 tcg_out8(s, 0xf0);
983 tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
984 tcg_out8(s, 0);
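/* The sequence generated above works out to f0 83 0c 24 00,
   "lock orl $0, (%esp)" -- a LOCK prefix, the 0x83 group-1 opcode with /1
   (OR), an ESP-based ModRM+SIB pair, and a zero imm8. */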
988 static inline void tcg_out_push(TCGContext *s, int reg)
990 tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
993 static inline void tcg_out_pop(TCGContext *s, int reg)
995 tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
998 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
999 TCGReg arg1, intptr_t arg2)
1001 switch (type) {
1002 case TCG_TYPE_I32:
1003 if (ret < 16) {
1004 tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1005 } else {
1006 tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1008 break;
1009 case TCG_TYPE_I64:
1010 if (ret < 16) {
1011 tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1012 break;
1014 /* FALLTHRU */
1015 case TCG_TYPE_V64:
1016 tcg_debug_assert(ret >= 16);
1017 tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1018 break;
1019 case TCG_TYPE_V128:
1020 tcg_debug_assert(ret >= 16);
1021 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx, ret, 0, arg1, arg2);
1022 break;
1023 case TCG_TYPE_V256:
1024 tcg_debug_assert(ret >= 16);
1025 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1026 ret, 0, arg1, arg2);
1027 break;
1028 default:
1029 g_assert_not_reached();
1033 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1034 TCGReg arg1, intptr_t arg2)
1036 switch (type) {
1037 case TCG_TYPE_I32:
1038 if (arg < 16) {
1039 tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1040 } else {
1041 tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1043 break;
1044 case TCG_TYPE_I64:
1045 if (arg < 16) {
1046 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1047 break;
1049 /* FALLTHRU */
1050 case TCG_TYPE_V64:
1051 tcg_debug_assert(arg >= 16);
1052 tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1053 break;
1054 case TCG_TYPE_V128:
1055 tcg_debug_assert(arg >= 16);
1056 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx, arg, 0, arg1, arg2);
1057 break;
1058 case TCG_TYPE_V256:
1059 tcg_debug_assert(arg >= 16);
1060 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1061 arg, 0, arg1, arg2);
1062 break;
1063 default:
1064 g_assert_not_reached();
1068 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1069 TCGReg base, intptr_t ofs)
1071 int rexw = 0;
1072 if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1073 if (val != (int32_t)val) {
1074 return false;
1076 rexw = P_REXW;
1077 } else if (type != TCG_TYPE_I32) {
1078 return false;
1080 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1081 tcg_out32(s, val);
1082 return true;
1085 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1087 /* Propagate an opcode prefix, such as P_DATA16. */
1088 int ext = subopc & ~0x7;
1089 subopc &= 0x7;
1091 if (count == 1) {
1092 tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1093 } else {
1094 tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1095 tcg_out8(s, count);
1099 static inline void tcg_out_bswap32(TCGContext *s, int reg)
1101 tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1104 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1106 tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1109 static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
1111 /* movzbl */
1112 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1113 tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1116 static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
1118 /* movsbl */
1119 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1120 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1123 static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
1125 /* movzwl */
1126 tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1129 static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
1131 /* movsw[lq] */
1132 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1135 static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
1137 /* 32-bit mov zero extends. */
1138 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1141 static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
1143 tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1146 static inline void tcg_out_bswap64(TCGContext *s, int reg)
1148 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1151 static void tgen_arithi(TCGContext *s, int c, int r0,
1152 tcg_target_long val, int cf)
1154 int rexw = 0;
1156 if (TCG_TARGET_REG_BITS == 64) {
1157 rexw = c & -8;
1158 c &= 7;
1161 /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1162 partial flags update stalls on Pentium4 and are not recommended
1163 by current Intel optimization manuals. */
1164 if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1165 int is_inc = (c == ARITH_ADD) ^ (val < 0);
1166 if (TCG_TARGET_REG_BITS == 64) {
1167 /* The single-byte increment encodings are re-tasked as the
1168 REX prefixes. Use the MODRM encoding. */
1169 tcg_out_modrm(s, OPC_GRP5 + rexw,
1170 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1171 } else {
1172 tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1174 return;
1177 if (c == ARITH_AND) {
1178 if (TCG_TARGET_REG_BITS == 64) {
1179 if (val == 0xffffffffu) {
1180 tcg_out_ext32u(s, r0, r0);
1181 return;
1183 if (val == (uint32_t)val) {
1184 /* AND with no high bits set can use a 32-bit operation. */
1185 rexw = 0;
1188 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1189 tcg_out_ext8u(s, r0, r0);
1190 return;
1192 if (val == 0xffffu) {
1193 tcg_out_ext16u(s, r0, r0);
1194 return;
1198 if (val == (int8_t)val) {
1199 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1200 tcg_out8(s, val);
1201 return;
1203 if (rexw == 0 || val == (int32_t)val) {
1204 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1205 tcg_out32(s, val);
1206 return;
1209 tcg_abort();
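/* Worked example (arbitrary values): tgen_arithi(s, ARITH_CMP, TCG_REG_EAX,
   10, 0) falls through to the imm8 case and emits 83 f8 0a, i.e.
   "cmpl $10, %eax". */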
1212 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1214 if (val != 0) {
1215 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1219 /* Use SMALL != 0 to force a short forward branch. */
1220 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
1222 int32_t val, val1;
1224 if (l->has_value) {
1225 val = tcg_pcrel_diff(s, l->u.value_ptr);
1226 val1 = val - 2;
1227 if ((int8_t)val1 == val1) {
1228 if (opc == -1) {
1229 tcg_out8(s, OPC_JMP_short);
1230 } else {
1231 tcg_out8(s, OPC_JCC_short + opc);
1233 tcg_out8(s, val1);
1234 } else {
1235 if (small) {
1236 tcg_abort();
1238 if (opc == -1) {
1239 tcg_out8(s, OPC_JMP_long);
1240 tcg_out32(s, val - 5);
1241 } else {
1242 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1243 tcg_out32(s, val - 6);
1246 } else if (small) {
1247 if (opc == -1) {
1248 tcg_out8(s, OPC_JMP_short);
1249 } else {
1250 tcg_out8(s, OPC_JCC_short + opc);
1252 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1253 s->code_ptr += 1;
1254 } else {
1255 if (opc == -1) {
1256 tcg_out8(s, OPC_JMP_long);
1257 } else {
1258 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1260 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1261 s->code_ptr += 4;
1265 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1266 int const_arg2, int rexw)
1268 if (const_arg2) {
1269 if (arg2 == 0) {
1270 /* test r, r */
1271 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1272 } else {
1273 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1275 } else {
1276 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1280 static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1281 TCGArg arg1, TCGArg arg2, int const_arg2,
1282 TCGLabel *label, int small)
1284 tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1285 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1288 #if TCG_TARGET_REG_BITS == 64
1289 static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1290 TCGArg arg1, TCGArg arg2, int const_arg2,
1291 TCGLabel *label, int small)
1293 tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1294 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1296 #else
1297 /* XXX: we implement it at the target level to avoid having to
1298 handle temporaries that cross basic blocks. */
1299 static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1300 const int *const_args, int small)
1302 TCGLabel *label_next = gen_new_label();
1303 TCGLabel *label_this = arg_label(args[5]);
1305 switch(args[4]) {
1306 case TCG_COND_EQ:
1307 tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1308 label_next, 1);
1309 tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1310 label_this, small);
1311 break;
1312 case TCG_COND_NE:
1313 tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1314 label_this, small);
1315 tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1316 label_this, small);
1317 break;
1318 case TCG_COND_LT:
1319 tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1320 label_this, small);
1321 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1322 tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1323 label_this, small);
1324 break;
1325 case TCG_COND_LE:
1326 tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1327 label_this, small);
1328 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1329 tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1330 label_this, small);
1331 break;
1332 case TCG_COND_GT:
1333 tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1334 label_this, small);
1335 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1336 tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1337 label_this, small);
1338 break;
1339 case TCG_COND_GE:
1340 tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1341 label_this, small);
1342 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1343 tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1344 label_this, small);
1345 break;
1346 case TCG_COND_LTU:
1347 tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1348 label_this, small);
1349 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1350 tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1351 label_this, small);
1352 break;
1353 case TCG_COND_LEU:
1354 tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1355 label_this, small);
1356 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1357 tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1358 label_this, small);
1359 break;
1360 case TCG_COND_GTU:
1361 tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1362 label_this, small);
1363 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1364 tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1365 label_this, small);
1366 break;
1367 case TCG_COND_GEU:
1368 tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1369 label_this, small);
1370 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1371 tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1372 label_this, small);
1373 break;
1374 default:
1375 tcg_abort();
1377 tcg_out_label(s, label_next, s->code_ptr);
1379 #endif
1381 static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1382 TCGArg arg1, TCGArg arg2, int const_arg2)
1384 tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1385 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1386 tcg_out_ext8u(s, dest, dest);
1389 #if TCG_TARGET_REG_BITS == 64
1390 static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1391 TCGArg arg1, TCGArg arg2, int const_arg2)
1393 tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1394 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1395 tcg_out_ext8u(s, dest, dest);
1397 #else
1398 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1399 const int *const_args)
1401 TCGArg new_args[6];
1402 TCGLabel *label_true, *label_over;
1404 memcpy(new_args, args+1, 5*sizeof(TCGArg));
1406 if (args[0] == args[1] || args[0] == args[2]
1407 || (!const_args[3] && args[0] == args[3])
1408 || (!const_args[4] && args[0] == args[4])) {
1409 /* When the destination overlaps with one of the argument
1410 registers, don't do anything tricky. */
1411 label_true = gen_new_label();
1412 label_over = gen_new_label();
1414 new_args[5] = label_arg(label_true);
1415 tcg_out_brcond2(s, new_args, const_args+1, 1);
1417 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1418 tcg_out_jxx(s, JCC_JMP, label_over, 1);
1419 tcg_out_label(s, label_true, s->code_ptr);
1421 tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1422 tcg_out_label(s, label_over, s->code_ptr);
1423 } else {
1424 /* When the destination does not overlap one of the arguments,
1425 clear the destination first, jump if cond false, and emit an
1426 increment in the true case. This results in smaller code. */
1428 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1430 label_over = gen_new_label();
1431 new_args[4] = tcg_invert_cond(new_args[4]);
1432 new_args[5] = label_arg(label_over);
1433 tcg_out_brcond2(s, new_args, const_args+1, 1);
1435 tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1436 tcg_out_label(s, label_over, s->code_ptr);
1439 #endif
1441 static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1442 TCGReg dest, TCGReg v1)
1444 if (have_cmov) {
1445 tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1446 } else {
1447 TCGLabel *over = gen_new_label();
1448 tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1449 tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1450 tcg_out_label(s, over, s->code_ptr);
1454 static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1455 TCGReg c1, TCGArg c2, int const_c2,
1456 TCGReg v1)
1458 tcg_out_cmp(s, c1, c2, const_c2, 0);
1459 tcg_out_cmov(s, cond, 0, dest, v1);
1462 #if TCG_TARGET_REG_BITS == 64
1463 static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1464 TCGReg c1, TCGArg c2, int const_c2,
1465 TCGReg v1)
1467 tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1468 tcg_out_cmov(s, cond, P_REXW, dest, v1);
1470 #endif
1472 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1473 TCGArg arg2, bool const_a2)
1475 if (have_bmi1) {
1476 tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1477 if (const_a2) {
1478 tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1479 } else {
1480 tcg_debug_assert(dest != arg2);
1481 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1483 } else {
1484 tcg_debug_assert(dest != arg2);
1485 tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1486 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1490 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1491 TCGArg arg2, bool const_a2)
1493 if (have_lzcnt) {
1494 tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1495 if (const_a2) {
1496 tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1497 } else {
1498 tcg_debug_assert(dest != arg2);
1499 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1501 } else {
1502 tcg_debug_assert(!const_a2);
1503 tcg_debug_assert(dest != arg1);
1504 tcg_debug_assert(dest != arg2);
1506 /* Recall that the output of BSR is the index not the count. */
1507 tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1508 tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1510 /* Since we have destroyed the flags from BSR, we have to re-test. */
1511 tcg_out_cmp(s, arg1, 0, 1, rexw);
1512 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1516 static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
1518 intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1520 if (disp == (int32_t)disp) {
1521 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1522 tcg_out32(s, disp);
1523 } else {
1524 /* rip-relative addressing into the constant pool.
1525 This is 6 + 8 = 14 bytes, as compared to using an
1526 immediate load, 10 + 6 = 16 bytes, plus we may
1527 be able to re-use the pool constant for more calls. */
1528 tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1529 tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1530 new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1531 tcg_out32(s, 0);
1535 static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *dest)
1537 tcg_out_branch(s, 1, dest);
1540 static void tcg_out_jmp(TCGContext *s, tcg_insn_unit *dest)
1542 tcg_out_branch(s, 0, dest);
1545 static void tcg_out_nopn(TCGContext *s, int n)
1547 int i;
1548 /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1549 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1550 * duplicate prefix, and all of the interesting recent cores can
1551 * decode and discard the duplicates in a single cycle.
1553 tcg_debug_assert(n >= 1);
1554 for (i = 1; i < n; ++i) {
1555 tcg_out8(s, 0x66);
1557 tcg_out8(s, 0x90);
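/* E.g. tcg_out_nopn(s, 3) emits 66 66 90, a three-byte "xchg %ax,%ax". */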
1560 #if defined(CONFIG_SOFTMMU)
1561 #include "tcg-ldst.inc.c"
1563 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1564 * int mmu_idx, uintptr_t ra)
1566 static void * const qemu_ld_helpers[16] = {
1567 [MO_UB] = helper_ret_ldub_mmu,
1568 [MO_LEUW] = helper_le_lduw_mmu,
1569 [MO_LEUL] = helper_le_ldul_mmu,
1570 [MO_LEQ] = helper_le_ldq_mmu,
1571 [MO_BEUW] = helper_be_lduw_mmu,
1572 [MO_BEUL] = helper_be_ldul_mmu,
1573 [MO_BEQ] = helper_be_ldq_mmu,
1576 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1577 * uintxx_t val, int mmu_idx, uintptr_t ra)
1579 static void * const qemu_st_helpers[16] = {
1580 [MO_UB] = helper_ret_stb_mmu,
1581 [MO_LEUW] = helper_le_stw_mmu,
1582 [MO_LEUL] = helper_le_stl_mmu,
1583 [MO_LEQ] = helper_le_stq_mmu,
1584 [MO_BEUW] = helper_be_stw_mmu,
1585 [MO_BEUL] = helper_be_stl_mmu,
1586 [MO_BEQ] = helper_be_stq_mmu,
1589 /* Perform the TLB load and compare.
1591 Inputs:
1592 ADDRLO and ADDRHI contain the low and high part of the address.
1594 MEM_INDEX and S_BITS are the memory context and log2 size of the load.
1596 WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1597 This should be offsetof addr_read or addr_write.
1599 Outputs:
1600 LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1601 positions of the displacements of forward jumps to the TLB miss case.
1603 Second argument register is loaded with the low part of the address.
1604 In the TLB hit case, it has been adjusted as indicated by the TLB
1605 and so is a host address. In the TLB miss case, it continues to
1606 hold a guest address.
1608 First argument register is clobbered. */
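/* In outline (a sketch of the fast path below, no extra output): the address
   is shifted right by TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS and masked with
   (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, which yields the byte offset of
   the matching CPUTLBEntry in one step; an LEA off env then forms the
   entry's address, and the comparison against the page-masked address
   decides hit vs. miss. */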
1610 static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1611 int mem_index, TCGMemOp opc,
1612 tcg_insn_unit **label_ptr, int which)
1614 const TCGReg r0 = TCG_REG_L0;
1615 const TCGReg r1 = TCG_REG_L1;
1616 TCGType ttype = TCG_TYPE_I32;
1617 TCGType tlbtype = TCG_TYPE_I32;
1618 int trexw = 0, hrexw = 0, tlbrexw = 0;
1619 unsigned a_bits = get_alignment_bits(opc);
1620 unsigned s_bits = opc & MO_SIZE;
1621 unsigned a_mask = (1 << a_bits) - 1;
1622 unsigned s_mask = (1 << s_bits) - 1;
1623 target_ulong tlb_mask;
1625 if (TCG_TARGET_REG_BITS == 64) {
1626 if (TARGET_LONG_BITS == 64) {
1627 ttype = TCG_TYPE_I64;
1628 trexw = P_REXW;
1630 if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1631 hrexw = P_REXW;
1632 if (TARGET_PAGE_BITS + CPU_TLB_BITS > 32) {
1633 tlbtype = TCG_TYPE_I64;
1634 tlbrexw = P_REXW;
1639 tcg_out_mov(s, tlbtype, r0, addrlo);
1640 /* If the required alignment is at least as large as the access, simply
1641 copy the address and mask. For lesser alignments, check that we don't
1642 cross pages for the complete access. */
1643 if (a_bits >= s_bits) {
1644 tcg_out_mov(s, ttype, r1, addrlo);
1645 } else {
1646 tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1648 tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1650 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1651 TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1653 tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1654 tgen_arithi(s, ARITH_AND + tlbrexw, r0,
1655 (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);
1657 tcg_out_modrm_sib_offset(s, OPC_LEA + hrexw, r0, TCG_AREG0, r0, 0,
1658 offsetof(CPUArchState, tlb_table[mem_index][0])
1659 + which);
1661 /* cmp 0(r0), r1 */
1662 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, 0);
1664 /* Prepare for both the fast path add of the tlb addend, and the slow
1665 path function argument setup. */
1666 tcg_out_mov(s, ttype, r1, addrlo);
1668 /* jne slow_path */
1669 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1670 label_ptr[0] = s->code_ptr;
1671 s->code_ptr += 4;
1673 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1674 /* cmp 4(r0), addrhi */
1675 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, 4);
1677 /* jne slow_path */
1678 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1679 label_ptr[1] = s->code_ptr;
1680 s->code_ptr += 4;
1683 /* TLB Hit. */
1685 /* add addend(r0), r1 */
1686 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1687 offsetof(CPUTLBEntry, addend) - which);
1691 * Record the context of a call to the out of line helper code for the slow path
1692 * for a load or store, so that we can later generate the correct helper code
1694 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
1695 TCGMemOpIdx oi,
1696 TCGReg datalo, TCGReg datahi,
1697 TCGReg addrlo, TCGReg addrhi,
1698 tcg_insn_unit *raddr,
1699 tcg_insn_unit **label_ptr)
1701 TCGLabelQemuLdst *label = new_ldst_label(s);
1703 label->is_ld = is_ld;
1704 label->oi = oi;
1705 label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
1706 label->datalo_reg = datalo;
1707 label->datahi_reg = datahi;
1708 label->addrlo_reg = addrlo;
1709 label->addrhi_reg = addrhi;
1710 label->raddr = raddr;
1711 label->label_ptr[0] = label_ptr[0];
1712 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1713 label->label_ptr[1] = label_ptr[1];
1718 * Generate code for the slow path for a load at the end of block
1720 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1722 TCGMemOpIdx oi = l->oi;
1723 TCGMemOp opc = get_memop(oi);
1724 TCGReg data_reg;
1725 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1726 int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
1728 /* resolve label address */
1729 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1730 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1731 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1734 if (TCG_TARGET_REG_BITS == 32) {
1735 int ofs = 0;
1737 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1738 ofs += 4;
1740 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1741 ofs += 4;
1743 if (TARGET_LONG_BITS == 64) {
1744 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1745 ofs += 4;
1748 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1749 ofs += 4;
1751 tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1752 } else {
1753 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1754 /* The second argument is already loaded with addrlo. */
1755 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1756 tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1757 (uintptr_t)l->raddr);
1760 tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1762 data_reg = l->datalo_reg;
1763 switch (opc & MO_SSIZE) {
1764 case MO_SB:
1765 tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
1766 break;
1767 case MO_SW:
1768 tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
1769 break;
1770 #if TCG_TARGET_REG_BITS == 64
1771 case MO_SL:
1772 tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1773 break;
1774 #endif
1775 case MO_UB:
1776 case MO_UW:
1777 /* Note that the helpers have zero-extended to tcg_target_long. */
1778 case MO_UL:
1779 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1780 break;
1781 case MO_Q:
1782 if (TCG_TARGET_REG_BITS == 64) {
1783 tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1784 } else if (data_reg == TCG_REG_EDX) {
1785 /* xchg %edx, %eax */
1786 tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1787 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1788 } else {
1789 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1790 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1792 break;
1793 default:
1794 tcg_abort();
1797 /* Jump to the code corresponding to the next IR of qemu_ld */
1798 tcg_out_jmp(s, l->raddr);
1802 * Generate code for the slow path for a store at the end of block
1804 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1806 TCGMemOpIdx oi = l->oi;
1807 TCGMemOp opc = get_memop(oi);
1808 TCGMemOp s_bits = opc & MO_SIZE;
1809 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1810 TCGReg retaddr;
1812 /* resolve label address */
1813 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1814 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1815 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1818 if (TCG_TARGET_REG_BITS == 32) {
1819 int ofs = 0;
1821 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1822 ofs += 4;
1824 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1825 ofs += 4;
1827 if (TARGET_LONG_BITS == 64) {
1828 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1829 ofs += 4;
1832 tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1833 ofs += 4;
1835 if (s_bits == MO_64) {
1836 tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1837 ofs += 4;
1840 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1841 ofs += 4;
1843 retaddr = TCG_REG_EAX;
1844 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1845 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1846 } else {
1847 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1848 /* The second argument is already loaded with addrlo. */
1849 tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1850 tcg_target_call_iarg_regs[2], l->datalo_reg);
1851 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1853 if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1854 retaddr = tcg_target_call_iarg_regs[4];
1855 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1856 } else {
1857 retaddr = TCG_REG_RAX;
1858 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1859 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1860 TCG_TARGET_CALL_STACK_OFFSET);
1864 /* "Tail call" to the helper, with the return address back inline. */
1865 tcg_out_push(s, retaddr);
1866 tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1868 #elif TCG_TARGET_REG_BITS == 32
1869 # define x86_guest_base_seg 0
1870 # define x86_guest_base_index -1
1871 # define x86_guest_base_offset guest_base
1872 #else
1873 static int x86_guest_base_seg;
1874 static int x86_guest_base_index = -1;
1875 static int32_t x86_guest_base_offset;
1876 # if defined(__x86_64__) && defined(__linux__)
1877 # include <asm/prctl.h>
1878 # include <sys/prctl.h>
1879 int arch_prctl(int code, unsigned long addr);
1880 static inline int setup_guest_base_seg(void)
1882 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1883 return P_GS;
1885 return 0;
1887 # elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
1888 # include <machine/sysarch.h>
1889 static inline int setup_guest_base_seg(void)
1891 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1892 return P_GS;
1894 return 0;
1896 # else
1897 static inline int setup_guest_base_seg(void)
1899 return 0;
1901 # endif
1902 #endif /* SOFTMMU */
1904 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1905 TCGReg base, int index, intptr_t ofs,
1906 int seg, bool is64, TCGMemOp memop)
1908 const TCGMemOp real_bswap = memop & MO_BSWAP;
1909 TCGMemOp bswap = real_bswap;
1910 int rexw = is64 * P_REXW;
1911 int movop = OPC_MOVL_GvEv;
1913 if (have_movbe && real_bswap) {
1914 bswap = 0;
1915 movop = OPC_MOVBE_GyMy;
1918 switch (memop & MO_SSIZE) {
1919 case MO_UB:
1920 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
1921 base, index, 0, ofs);
1922 break;
1923 case MO_SB:
1924 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
1925 base, index, 0, ofs);
1926 break;
1927 case MO_UW:
1928 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1929 base, index, 0, ofs);
1930 if (real_bswap) {
1931 tcg_out_rolw_8(s, datalo);
1933 break;
1934 case MO_SW:
1935 if (real_bswap) {
1936 if (have_movbe) {
1937 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1938 datalo, base, index, 0, ofs);
1939 } else {
1940 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1941 base, index, 0, ofs);
1942 tcg_out_rolw_8(s, datalo);
1944 tcg_out_modrm(s, OPC_MOVSWL + rexw, datalo, datalo);
1945 } else {
1946 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
1947 datalo, base, index, 0, ofs);
1949 break;
1950 case MO_UL:
1951 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
1952 if (bswap) {
1953 tcg_out_bswap32(s, datalo);
1955 break;
1956 #if TCG_TARGET_REG_BITS == 64
1957 case MO_SL:
1958 if (real_bswap) {
1959 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1960 base, index, 0, ofs);
1961 if (bswap) {
1962 tcg_out_bswap32(s, datalo);
1964 tcg_out_ext32s(s, datalo, datalo);
1965 } else {
1966 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
1967 base, index, 0, ofs);
1969 break;
1970 #endif
1971 case MO_Q:
1972 if (TCG_TARGET_REG_BITS == 64) {
1973 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
1974 base, index, 0, ofs);
1975 if (bswap) {
1976 tcg_out_bswap64(s, datalo);
1978 } else {
1979 if (real_bswap) {
1980 int t = datalo;
1981 datalo = datahi;
1982 datahi = t;
1984 if (base != datalo) {
1985 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1986 base, index, 0, ofs);
1987 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
1988 base, index, 0, ofs + 4);
1989 } else {
1990 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
1991 base, index, 0, ofs + 4);
1992 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1993 base, index, 0, ofs);
1995 if (bswap) {
1996 tcg_out_bswap32(s, datalo);
1997 tcg_out_bswap32(s, datahi);
2000 break;
2001 default:
2002 tcg_abort();
2006 /* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
2007 EAX. It will be useful once fixed-register globals are less
2008 common. */
2009 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
2011 TCGReg datalo, datahi, addrlo;
2012 TCGReg addrhi __attribute__((unused));
2013 TCGMemOpIdx oi;
2014 TCGMemOp opc;
2015 #if defined(CONFIG_SOFTMMU)
2016 int mem_index;
2017 tcg_insn_unit *label_ptr[2];
2018 #endif
2020 datalo = *args++;
2021 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2022 addrlo = *args++;
2023 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2024 oi = *args++;
2025 opc = get_memop(oi);
2027 #if defined(CONFIG_SOFTMMU)
2028 mem_index = get_mmuidx(oi);
2030 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2031 label_ptr, offsetof(CPUTLBEntry, addr_read));
2033 /* TLB Hit. */
2034 tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
2036 /* Record the current context of a load into ldst label */
2037 add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2038 s->code_ptr, label_ptr);
2039 #else
2040 tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2041 x86_guest_base_offset, x86_guest_base_seg,
2042 is64, opc);
2043 #endif
2046 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2047 TCGReg base, int index, intptr_t ofs,
2048 int seg, TCGMemOp memop)
2050 /* ??? Ideally we wouldn't need a scratch register. For user-only,
2051 we could perform the bswap twice to restore the original value
2052 instead of moving to the scratch. But as it is, the L constraint
2053 means that TCG_REG_L0 is definitely free here. */
2054 const TCGReg scratch = TCG_REG_L0;
2055 const TCGMemOp real_bswap = memop & MO_BSWAP;
2056 TCGMemOp bswap = real_bswap;
2057 int movop = OPC_MOVL_EvGv;
2059 if (have_movbe && real_bswap) {
2060 bswap = 0;
2061 movop = OPC_MOVBE_MyGy;
2064 switch (memop & MO_SIZE) {
2065 case MO_8:
2066 /* In 32-bit mode, 8-bit stores can only happen from [abcd]x.
2067 Use the scratch register if necessary. */
2068 if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
2069 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2070 datalo = scratch;
2072 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2073 datalo, base, index, 0, ofs);
2074 break;
2075 case MO_16:
2076 if (bswap) {
2077 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2078 tcg_out_rolw_8(s, scratch);
2079 datalo = scratch;
2081 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
2082 base, index, 0, ofs);
2083 break;
2084 case MO_32:
2085 if (bswap) {
2086 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2087 tcg_out_bswap32(s, scratch);
2088 datalo = scratch;
2090 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2091 break;
2092 case MO_64:
2093 if (TCG_TARGET_REG_BITS == 64) {
2094 if (bswap) {
2095 tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo);
2096 tcg_out_bswap64(s, scratch);
2097 datalo = scratch;
2099 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2100 base, index, 0, ofs);
2101 } else if (bswap) {
2102 tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
2103 tcg_out_bswap32(s, scratch);
2104 tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
2105 base, index, 0, ofs);
2106 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2107 tcg_out_bswap32(s, scratch);
2108 tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
2109 base, index, 0, ofs + 4);
2110 } else {
2111 if (real_bswap) {
2112 int t = datalo;
2113 datalo = datahi;
2114 datahi = t;
2116 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2117 base, index, 0, ofs);
2118 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2119 base, index, 0, ofs + 4);
2121 break;
2122 default:
2123 tcg_abort();
2127 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2129 TCGReg datalo, datahi, addrlo;
2130 TCGReg addrhi __attribute__((unused));
2131 TCGMemOpIdx oi;
2132 TCGMemOp opc;
2133 #if defined(CONFIG_SOFTMMU)
2134 int mem_index;
2135 tcg_insn_unit *label_ptr[2];
2136 #endif
2138 datalo = *args++;
2139 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2140 addrlo = *args++;
2141 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2142 oi = *args++;
2143 opc = get_memop(oi);
2145 #if defined(CONFIG_SOFTMMU)
2146 mem_index = get_mmuidx(oi);
2148 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2149 label_ptr, offsetof(CPUTLBEntry, addr_write));
2151 /* TLB Hit. */
2152 tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2154 /* Record the current context of a store into ldst label */
2155 add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
2156 s->code_ptr, label_ptr);
2157 #else
2158 tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2159 x86_guest_base_offset, x86_guest_base_seg, opc);
2160 #endif
2163 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2164 const TCGArg *args, const int *const_args)
2166 TCGArg a0, a1, a2;
2167 int c, const_a2, vexop, rexw = 0;
2169 #if TCG_TARGET_REG_BITS == 64
2170 # define OP_32_64(x) \
2171 case glue(glue(INDEX_op_, x), _i64): \
2172 rexw = P_REXW; /* FALLTHRU */ \
2173 case glue(glue(INDEX_op_, x), _i32)
2174 #else
2175 # define OP_32_64(x) \
2176 case glue(glue(INDEX_op_, x), _i32)
2177 #endif
2179 /* Hoist the loads of the most common arguments. */
2180 a0 = args[0];
2181 a1 = args[1];
2182 a2 = args[2];
2183 const_a2 = const_args[2];
2185 switch (opc) {
2186 case INDEX_op_exit_tb:
2187 /* Reuse the zeroing that exists for goto_ptr. */
2188 if (a0 == 0) {
2189 tcg_out_jmp(s, s->code_gen_epilogue);
2190 } else {
2191 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2192 tcg_out_jmp(s, tb_ret_addr);
2194 break;
2195 case INDEX_op_goto_tb:
2196 if (s->tb_jmp_insn_offset) {
2197 /* direct jump method */
2198 int gap;
2199 /* jump displacement must be aligned for atomic patching;
2200 * see if we need to add extra nops before jump
2201 */
2202 gap = tcg_pcrel_diff(s, QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4));
2203 if (gap != 1) {
2204 tcg_out_nopn(s, gap - 1);
2206 tcg_out8(s, OPC_JMP_long); /* jmp im */
2207 s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2208 tcg_out32(s, 0);
2209 } else {
2210 /* indirect jump method */
2211 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2212 (intptr_t)(s->tb_jmp_target_addr + a0));
2214 set_jmp_reset_offset(s, a0);
2215 break;
2216 case INDEX_op_goto_ptr:
2217 /* jmp to the given host address (could be epilogue) */
2218 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2219 break;
2220 case INDEX_op_br:
2221 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2222 break;
2223 OP_32_64(ld8u):
2224 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
2225 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2226 break;
2227 OP_32_64(ld8s):
2228 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2229 break;
2230 OP_32_64(ld16u):
2231 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
2232 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2233 break;
2234 OP_32_64(ld16s):
2235 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2236 break;
2237 #if TCG_TARGET_REG_BITS == 64
2238 case INDEX_op_ld32u_i64:
2239 #endif
2240 case INDEX_op_ld_i32:
2241 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2242 break;
2244 OP_32_64(st8):
2245 if (const_args[0]) {
2246 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2247 tcg_out8(s, a0);
2248 } else {
2249 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2251 break;
2252 OP_32_64(st16):
2253 if (const_args[0]) {
2254 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2255 tcg_out16(s, a0);
2256 } else {
2257 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2259 break;
2260 #if TCG_TARGET_REG_BITS == 64
2261 case INDEX_op_st32_i64:
2262 #endif
2263 case INDEX_op_st_i32:
2264 if (const_args[0]) {
2265 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2266 tcg_out32(s, a0);
2267 } else {
2268 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2270 break;
2272 OP_32_64(add):
2273 /* For 3-operand addition, use LEA. */
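       E.g. "add d, a, b" becomes "lea (a,b), d"; with a constant c the
       index is dropped below (a2 set to -1) and we emit "lea c(a), d". */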
2274 if (a0 != a1) {
2275 TCGArg c3 = 0;
2276 if (const_a2) {
2277 c3 = a2, a2 = -1;
2278 } else if (a0 == a2) {
2279 /* Watch out for dest = src + dest, since we've removed
2280 the matching constraint on the add. */
2281 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2282 break;
2285 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2286 break;
2288 c = ARITH_ADD;
2289 goto gen_arith;
2290 OP_32_64(sub):
2291 c = ARITH_SUB;
2292 goto gen_arith;
2293 OP_32_64(and):
2294 c = ARITH_AND;
2295 goto gen_arith;
2296 OP_32_64(or):
2297 c = ARITH_OR;
2298 goto gen_arith;
2299 OP_32_64(xor):
2300 c = ARITH_XOR;
2301 goto gen_arith;
2302 gen_arith:
2303 if (const_a2) {
2304 tgen_arithi(s, c + rexw, a0, a2, 0);
2305 } else {
2306 tgen_arithr(s, c + rexw, a0, a2);
2308 break;
2310 OP_32_64(andc):
2311 if (const_a2) {
2312 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2313 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2314 } else {
2315 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2317 break;
2319 OP_32_64(mul):
2320 if (const_a2) {
2321 int32_t val;
2322 val = a2;
2323 if (val == (int8_t)val) {
2324 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2325 tcg_out8(s, val);
2326 } else {
2327 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2328 tcg_out32(s, val);
2330 } else {
2331 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2333 break;
2335 OP_32_64(div2):
2336 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2337 break;
2338 OP_32_64(divu2):
2339 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2340 break;
2342 OP_32_64(shl):
2343 /* For small constant 3-operand shift, use LEA. */
2344 if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2345 if (a2 - 1 == 0) {
2346 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2347 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2348 } else {
2349 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2350 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2352 break;
2354 c = SHIFT_SHL;
2355 vexop = OPC_SHLX;
2356 goto gen_shift_maybe_vex;
2357 OP_32_64(shr):
2358 c = SHIFT_SHR;
2359 vexop = OPC_SHRX;
2360 goto gen_shift_maybe_vex;
2361 OP_32_64(sar):
2362 c = SHIFT_SAR;
2363 vexop = OPC_SARX;
2364 goto gen_shift_maybe_vex;
2365 OP_32_64(rotl):
2366 c = SHIFT_ROL;
2367 goto gen_shift;
2368 OP_32_64(rotr):
2369 c = SHIFT_ROR;
2370 goto gen_shift;
2371 gen_shift_maybe_vex:
2372 if (have_bmi2) {
2373 if (!const_a2) {
2374 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2375 break;
2377 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2379 /* FALLTHRU */
2380 gen_shift:
2381 if (const_a2) {
2382 tcg_out_shifti(s, c + rexw, a0, a2);
2383 } else {
2384 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2386 break;
2388 OP_32_64(ctz):
2389 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2390 break;
2391 OP_32_64(clz):
2392 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2393 break;
2394 OP_32_64(ctpop):
2395 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2396 break;
2398 case INDEX_op_brcond_i32:
2399 tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2400 break;
2401 case INDEX_op_setcond_i32:
2402 tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2403 break;
2404 case INDEX_op_movcond_i32:
2405 tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2406 break;
2408 OP_32_64(bswap16):
2409 tcg_out_rolw_8(s, a0);
2410 break;
2411 OP_32_64(bswap32):
2412 tcg_out_bswap32(s, a0);
2413 break;
2415 OP_32_64(neg):
2416 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2417 break;
2418 OP_32_64(not):
2419 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2420 break;
2422 OP_32_64(ext8s):
2423 tcg_out_ext8s(s, a0, a1, rexw);
2424 break;
2425 OP_32_64(ext16s):
2426 tcg_out_ext16s(s, a0, a1, rexw);
2427 break;
2428 OP_32_64(ext8u):
2429 tcg_out_ext8u(s, a0, a1);
2430 break;
2431 OP_32_64(ext16u):
2432 tcg_out_ext16u(s, a0, a1);
2433 break;
2435 case INDEX_op_qemu_ld_i32:
2436 tcg_out_qemu_ld(s, args, 0);
2437 break;
2438 case INDEX_op_qemu_ld_i64:
2439 tcg_out_qemu_ld(s, args, 1);
2440 break;
2441 case INDEX_op_qemu_st_i32:
2442 tcg_out_qemu_st(s, args, 0);
2443 break;
2444 case INDEX_op_qemu_st_i64:
2445 tcg_out_qemu_st(s, args, 1);
2446 break;
2448 OP_32_64(mulu2):
2449 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2450 break;
2451 OP_32_64(muls2):
2452 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2453 break;
2454 OP_32_64(add2):
2455 if (const_args[4]) {
2456 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2457 } else {
2458 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2460 if (const_args[5]) {
2461 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2462 } else {
2463 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2465 break;
2466 OP_32_64(sub2):
2467 if (const_args[4]) {
2468 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2469 } else {
2470 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2472 if (const_args[5]) {
2473 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2474 } else {
2475 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2477 break;
2479 #if TCG_TARGET_REG_BITS == 32
2480 case INDEX_op_brcond2_i32:
2481 tcg_out_brcond2(s, args, const_args, 0);
2482 break;
2483 case INDEX_op_setcond2_i32:
2484 tcg_out_setcond2(s, args, const_args);
2485 break;
2486 #else /* TCG_TARGET_REG_BITS == 64 */
2487 case INDEX_op_ld32s_i64:
2488 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2489 break;
2490 case INDEX_op_ld_i64:
2491 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2492 break;
2493 case INDEX_op_st_i64:
2494 if (const_args[0]) {
2495 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2496 tcg_out32(s, a0);
2497 } else {
2498 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2500 break;
2502 case INDEX_op_brcond_i64:
2503 tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2504 break;
2505 case INDEX_op_setcond_i64:
2506 tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2507 break;
2508 case INDEX_op_movcond_i64:
2509 tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2510 break;
2512 case INDEX_op_bswap64_i64:
2513 tcg_out_bswap64(s, a0);
2514 break;
2515 case INDEX_op_extu_i32_i64:
2516 case INDEX_op_ext32u_i64:
2517 case INDEX_op_extrl_i64_i32:
2518 tcg_out_ext32u(s, a0, a1);
2519 break;
2520 case INDEX_op_ext_i32_i64:
2521 case INDEX_op_ext32s_i64:
2522 tcg_out_ext32s(s, a0, a1);
2523 break;
2524 case INDEX_op_extrh_i64_i32:
2525 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2526 break;
2527 #endif
2529 OP_32_64(deposit):
2530 if (args[3] == 0 && args[4] == 8) {
2531 /* load bits 0..7 */
2532 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2533 } else if (args[3] == 8 && args[4] == 8) {
2534 /* load bits 8..15 */
2535 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2536 } else if (args[3] == 0 && args[4] == 16) {
2537 /* load bits 0..15 */
2538 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2539 } else {
2540 tcg_abort();
2542 break;
2544 case INDEX_op_extract_i64:
2545 if (a2 + args[3] == 32) {
2546 /* This is a 32-bit zero-extending right shift. */
2547 tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2548 tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2549 break;
2551 /* FALLTHRU */
2552 case INDEX_op_extract_i32:
2553 /* On the off-chance that we can use the high-byte registers.
2554 Otherwise we emit the same ext16 + shift pattern that we
2555 would have gotten from the normal tcg-op.c expansion. */
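    /* Without a REX prefix, ModRM register codes 4-7 name the high-byte
       registers, so "a1 + 4" selects %ah/%ch/%dh/%bh when a1 is one of
       %eax..%ebx and a single MOVZBL from it yields bits 8..15. */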
2556 tcg_debug_assert(a2 == 8 && args[3] == 8);
2557 if (a1 < 4 && a0 < 8) {
2558 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2559 } else {
2560 tcg_out_ext16u(s, a0, a1);
2561 tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2563 break;
2565 case INDEX_op_sextract_i32:
2566 /* We don't implement sextract_i64, as we cannot sign-extend to
2567 64-bits without using the REX prefix that explicitly excludes
2568 access to the high-byte registers. */
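    /* (A REX prefix re-purposes register codes 4-7 as %spl/%bpl/%sil/%dil,
       which is why the high-byte registers become unreachable there.) */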
2569 tcg_debug_assert(a2 == 8 && args[3] == 8);
2570 if (a1 < 4 && a0 < 8) {
2571 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2572 } else {
2573 tcg_out_ext16s(s, a0, a1, 0);
2574 tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2576 break;
2578 case INDEX_op_mb:
2579 tcg_out_mb(s, a0);
2580 break;
2581 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
2582 case INDEX_op_mov_i64:
2583 case INDEX_op_mov_vec:
2584 case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
2585 case INDEX_op_movi_i64:
2586 case INDEX_op_dupi_vec:
2587 case INDEX_op_call: /* Always emitted via tcg_out_call. */
2588 default:
2589 tcg_abort();
2592 #undef OP_32_64
2595 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2596 unsigned vecl, unsigned vece,
2597 const TCGArg *args, const int *const_args)
2599 static int const add_insn[4] = {
2600 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2602 static int const ssadd_insn[4] = {
2603 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2605 static int const usadd_insn[4] = {
2606 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2608 static int const sub_insn[4] = {
2609 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2611 static int const sssub_insn[4] = {
2612 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2614 static int const ussub_insn[4] = {
2615 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2617 static int const mul_insn[4] = {
2618 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
2620 static int const shift_imm_insn[4] = {
2621 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2623 static int const cmpeq_insn[4] = {
2624 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2626 static int const cmpgt_insn[4] = {
2627 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2629 static int const punpckl_insn[4] = {
2630 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2632 static int const punpckh_insn[4] = {
2633 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2635 static int const packss_insn[4] = {
2636 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2638 static int const packus_insn[4] = {
2639 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2642 TCGType type = vecl + TCG_TYPE_V64;
2643 int insn, sub;
2644 TCGArg a0, a1, a2;
2646 a0 = args[0];
2647 a1 = args[1];
2648 a2 = args[2];
2650 switch (opc) {
2651 case INDEX_op_add_vec:
2652 insn = add_insn[vece];
2653 goto gen_simd;
2654 case INDEX_op_ssadd_vec:
2655 insn = ssadd_insn[vece];
2656 goto gen_simd;
2657 case INDEX_op_usadd_vec:
2658 insn = usadd_insn[vece];
2659 goto gen_simd;
2660 case INDEX_op_sub_vec:
2661 insn = sub_insn[vece];
2662 goto gen_simd;
2663 case INDEX_op_sssub_vec:
2664 insn = sssub_insn[vece];
2665 goto gen_simd;
2666 case INDEX_op_ussub_vec:
2667 insn = ussub_insn[vece];
2668 goto gen_simd;
2669 case INDEX_op_mul_vec:
2670 insn = mul_insn[vece];
2671 goto gen_simd;
2672 case INDEX_op_and_vec:
2673 insn = OPC_PAND;
2674 goto gen_simd;
2675 case INDEX_op_or_vec:
2676 insn = OPC_POR;
2677 goto gen_simd;
2678 case INDEX_op_xor_vec:
2679 insn = OPC_PXOR;
2680 goto gen_simd;
2681 case INDEX_op_x86_punpckl_vec:
2682 insn = punpckl_insn[vece];
2683 goto gen_simd;
2684 case INDEX_op_x86_punpckh_vec:
2685 insn = punpckh_insn[vece];
2686 goto gen_simd;
2687 case INDEX_op_x86_packss_vec:
2688 insn = packss_insn[vece];
2689 goto gen_simd;
2690 case INDEX_op_x86_packus_vec:
2691 insn = packus_insn[vece];
2692 goto gen_simd;
2693 #if TCG_TARGET_REG_BITS == 32
2694 case INDEX_op_dup2_vec:
2695 /* Constraints have already placed both 32-bit inputs in xmm regs. */
2696 insn = OPC_PUNPCKLDQ;
2697 goto gen_simd;
2698 #endif
2699 gen_simd:
2700 tcg_debug_assert(insn != OPC_UD2);
2701 if (type == TCG_TYPE_V256) {
2702 insn |= P_VEXL;
2704 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2705 break;
2707 case INDEX_op_cmp_vec:
2708 sub = args[3];
2709 if (sub == TCG_COND_EQ) {
2710 insn = cmpeq_insn[vece];
2711 } else if (sub == TCG_COND_GT) {
2712 insn = cmpgt_insn[vece];
2713 } else {
2714 g_assert_not_reached();
2716 goto gen_simd;
2718 case INDEX_op_andc_vec:
2719 insn = OPC_PANDN;
2720 if (type == TCG_TYPE_V256) {
2721 insn |= P_VEXL;
2723 tcg_out_vex_modrm(s, insn, a0, a2, a1);
2724 break;
2726 case INDEX_op_shli_vec:
2727 sub = 6;
2728 goto gen_shift;
2729 case INDEX_op_shri_vec:
2730 sub = 2;
2731 goto gen_shift;
2732 case INDEX_op_sari_vec:
2733 tcg_debug_assert(vece != MO_64);
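        /* There is no immediate arithmetic right shift for 64-bit elements
           (PSRAQ only exists with AVX-512); MO_64 sari is handled by
           expand_vec_sari instead. */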
2734 sub = 4;
2735 gen_shift:
2736 tcg_debug_assert(vece != MO_8);
2737 insn = shift_imm_insn[vece];
2738 if (type == TCG_TYPE_V256) {
2739 insn |= P_VEXL;
2741 tcg_out_vex_modrm(s, insn, sub, a0, a1);
2742 tcg_out8(s, a2);
2743 break;
2745 case INDEX_op_ld_vec:
2746 tcg_out_ld(s, type, a0, a1, a2);
2747 break;
2748 case INDEX_op_st_vec:
2749 tcg_out_st(s, type, a0, a1, a2);
2750 break;
2751 case INDEX_op_dup_vec:
2752 tcg_out_dup_vec(s, type, vece, a0, a1);
2753 break;
2755 case INDEX_op_x86_shufps_vec:
2756 insn = OPC_SHUFPS;
2757 sub = args[3];
2758 goto gen_simd_imm8;
2759 case INDEX_op_x86_blend_vec:
2760 if (vece == MO_16) {
2761 insn = OPC_PBLENDW;
2762 } else if (vece == MO_32) {
2763 insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2764 } else {
2765 g_assert_not_reached();
2767 sub = args[3];
2768 goto gen_simd_imm8;
2769 case INDEX_op_x86_vperm2i128_vec:
2770 insn = OPC_VPERM2I128;
2771 sub = args[3];
2772 goto gen_simd_imm8;
2773 gen_simd_imm8:
2774 if (type == TCG_TYPE_V256) {
2775 insn |= P_VEXL;
2777 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2778 tcg_out8(s, sub);
2779 break;
2781 case INDEX_op_x86_vpblendvb_vec:
2782 insn = OPC_VPBLENDVB;
2783 if (type == TCG_TYPE_V256) {
2784 insn |= P_VEXL;
2786 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2787 tcg_out8(s, args[3] << 4);
2788 break;
2790 case INDEX_op_x86_psrldq_vec:
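        /* 66 0F 73 /3 is PSRLDQ, a byte-granular right shift of the
           entire vector register. */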
2791 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
2792 tcg_out8(s, a2);
2793 break;
2795 default:
2796 g_assert_not_reached();
2800 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
2802 static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
2803 static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
2804 static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
2805 static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
2806 static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
2807 static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
2808 static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
2809 static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
2810 static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
2811 static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
2812 static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
2813 static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
2814 static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
2815 static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
2816 static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
2817 static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
2818 static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
2819 static const TCGTargetOpDef r_r_L_L
2820 = { .args_ct_str = { "r", "r", "L", "L" } };
2821 static const TCGTargetOpDef L_L_L_L
2822 = { .args_ct_str = { "L", "L", "L", "L" } };
2823 static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
2824 static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
2825 static const TCGTargetOpDef x_x_x_x
2826 = { .args_ct_str = { "x", "x", "x", "x" } };
2827 static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };
2829 switch (op) {
2830 case INDEX_op_goto_ptr:
2831 return &r;
2833 case INDEX_op_ld8u_i32:
2834 case INDEX_op_ld8u_i64:
2835 case INDEX_op_ld8s_i32:
2836 case INDEX_op_ld8s_i64:
2837 case INDEX_op_ld16u_i32:
2838 case INDEX_op_ld16u_i64:
2839 case INDEX_op_ld16s_i32:
2840 case INDEX_op_ld16s_i64:
2841 case INDEX_op_ld_i32:
2842 case INDEX_op_ld32u_i64:
2843 case INDEX_op_ld32s_i64:
2844 case INDEX_op_ld_i64:
2845 return &r_r;
2847 case INDEX_op_st8_i32:
2848 case INDEX_op_st8_i64:
2849 return &qi_r;
2850 case INDEX_op_st16_i32:
2851 case INDEX_op_st16_i64:
2852 case INDEX_op_st_i32:
2853 case INDEX_op_st32_i64:
2854 return &ri_r;
2855 case INDEX_op_st_i64:
2856 return &re_r;
2858 case INDEX_op_add_i32:
2859 case INDEX_op_add_i64:
2860 return &r_r_re;
2861 case INDEX_op_sub_i32:
2862 case INDEX_op_sub_i64:
2863 case INDEX_op_mul_i32:
2864 case INDEX_op_mul_i64:
2865 case INDEX_op_or_i32:
2866 case INDEX_op_or_i64:
2867 case INDEX_op_xor_i32:
2868 case INDEX_op_xor_i64:
2869 return &r_0_re;
2871 case INDEX_op_and_i32:
2872 case INDEX_op_and_i64:
2874 static const TCGTargetOpDef and
2875 = { .args_ct_str = { "r", "0", "reZ" } };
2876 return &and;
2878 break;
2879 case INDEX_op_andc_i32:
2880 case INDEX_op_andc_i64:
2882 static const TCGTargetOpDef andc
2883 = { .args_ct_str = { "r", "r", "rI" } };
2884 return &andc;
2886 break;
2888 case INDEX_op_shl_i32:
2889 case INDEX_op_shl_i64:
2890 case INDEX_op_shr_i32:
2891 case INDEX_op_shr_i64:
2892 case INDEX_op_sar_i32:
2893 case INDEX_op_sar_i64:
2894 return have_bmi2 ? &r_r_ri : &r_0_ci;
2895 case INDEX_op_rotl_i32:
2896 case INDEX_op_rotl_i64:
2897 case INDEX_op_rotr_i32:
2898 case INDEX_op_rotr_i64:
2899 return &r_0_ci;
2901 case INDEX_op_brcond_i32:
2902 case INDEX_op_brcond_i64:
2903 return &r_re;
2905 case INDEX_op_bswap16_i32:
2906 case INDEX_op_bswap16_i64:
2907 case INDEX_op_bswap32_i32:
2908 case INDEX_op_bswap32_i64:
2909 case INDEX_op_bswap64_i64:
2910 case INDEX_op_neg_i32:
2911 case INDEX_op_neg_i64:
2912 case INDEX_op_not_i32:
2913 case INDEX_op_not_i64:
2914 case INDEX_op_extrh_i64_i32:
2915 return &r_0;
2917 case INDEX_op_ext8s_i32:
2918 case INDEX_op_ext8s_i64:
2919 case INDEX_op_ext8u_i32:
2920 case INDEX_op_ext8u_i64:
2921 return &r_q;
2922 case INDEX_op_ext16s_i32:
2923 case INDEX_op_ext16s_i64:
2924 case INDEX_op_ext16u_i32:
2925 case INDEX_op_ext16u_i64:
2926 case INDEX_op_ext32s_i64:
2927 case INDEX_op_ext32u_i64:
2928 case INDEX_op_ext_i32_i64:
2929 case INDEX_op_extu_i32_i64:
2930 case INDEX_op_extrl_i64_i32:
2931 case INDEX_op_extract_i32:
2932 case INDEX_op_extract_i64:
2933 case INDEX_op_sextract_i32:
2934 case INDEX_op_ctpop_i32:
2935 case INDEX_op_ctpop_i64:
2936 return &r_r;
2938 case INDEX_op_deposit_i32:
2939 case INDEX_op_deposit_i64:
2941 static const TCGTargetOpDef dep
2942 = { .args_ct_str = { "Q", "0", "Q" } };
2943 return &dep;
2945 case INDEX_op_setcond_i32:
2946 case INDEX_op_setcond_i64:
2948 static const TCGTargetOpDef setc
2949 = { .args_ct_str = { "q", "r", "re" } };
2950 return &setc;
2952 case INDEX_op_movcond_i32:
2953 case INDEX_op_movcond_i64:
2955 static const TCGTargetOpDef movc
2956 = { .args_ct_str = { "r", "r", "re", "r", "0" } };
2957 return &movc;
2959 case INDEX_op_div2_i32:
2960 case INDEX_op_div2_i64:
2961 case INDEX_op_divu2_i32:
2962 case INDEX_op_divu2_i64:
2964 static const TCGTargetOpDef div2
2965 = { .args_ct_str = { "a", "d", "0", "1", "r" } };
2966 return &div2;
2968 case INDEX_op_mulu2_i32:
2969 case INDEX_op_mulu2_i64:
2970 case INDEX_op_muls2_i32:
2971 case INDEX_op_muls2_i64:
2973 static const TCGTargetOpDef mul2
2974 = { .args_ct_str = { "a", "d", "a", "r" } };
2975 return &mul2;
2977 case INDEX_op_add2_i32:
2978 case INDEX_op_add2_i64:
2979 case INDEX_op_sub2_i32:
2980 case INDEX_op_sub2_i64:
2982 static const TCGTargetOpDef arith2
2983 = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
2984 return &arith2;
2986 case INDEX_op_ctz_i32:
2987 case INDEX_op_ctz_i64:
2989 static const TCGTargetOpDef ctz[2] = {
2990 { .args_ct_str = { "&r", "r", "r" } },
2991 { .args_ct_str = { "&r", "r", "rW" } },
2993 return &ctz[have_bmi1];
2995 case INDEX_op_clz_i32:
2996 case INDEX_op_clz_i64:
2998 static const TCGTargetOpDef clz[2] = {
2999 { .args_ct_str = { "&r", "r", "r" } },
3000 { .args_ct_str = { "&r", "r", "rW" } },
3002 return &clz[have_lzcnt];
3005 case INDEX_op_qemu_ld_i32:
3006 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
3007 case INDEX_op_qemu_st_i32:
3008 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
3009 case INDEX_op_qemu_ld_i64:
3010 return (TCG_TARGET_REG_BITS == 64 ? &r_L
3011 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
3012 : &r_r_L_L);
3013 case INDEX_op_qemu_st_i64:
3014 return (TCG_TARGET_REG_BITS == 64 ? &L_L
3015 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
3016 : &L_L_L_L);
3018 case INDEX_op_brcond2_i32:
3020 static const TCGTargetOpDef b2
3021 = { .args_ct_str = { "r", "r", "ri", "ri" } };
3022 return &b2;
3024 case INDEX_op_setcond2_i32:
3026 static const TCGTargetOpDef s2
3027 = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
3028 return &s2;
3031 case INDEX_op_ld_vec:
3032 case INDEX_op_st_vec:
3033 return &x_r;
3035 case INDEX_op_add_vec:
3036 case INDEX_op_sub_vec:
3037 case INDEX_op_mul_vec:
3038 case INDEX_op_and_vec:
3039 case INDEX_op_or_vec:
3040 case INDEX_op_xor_vec:
3041 case INDEX_op_andc_vec:
3042 case INDEX_op_ssadd_vec:
3043 case INDEX_op_usadd_vec:
3044 case INDEX_op_sssub_vec:
3045 case INDEX_op_ussub_vec:
3046 case INDEX_op_cmp_vec:
3047 case INDEX_op_x86_shufps_vec:
3048 case INDEX_op_x86_blend_vec:
3049 case INDEX_op_x86_packss_vec:
3050 case INDEX_op_x86_packus_vec:
3051 case INDEX_op_x86_vperm2i128_vec:
3052 case INDEX_op_x86_punpckl_vec:
3053 case INDEX_op_x86_punpckh_vec:
3054 #if TCG_TARGET_REG_BITS == 32
3055 case INDEX_op_dup2_vec:
3056 #endif
3057 return &x_x_x;
3058 case INDEX_op_dup_vec:
3059 case INDEX_op_shli_vec:
3060 case INDEX_op_shri_vec:
3061 case INDEX_op_sari_vec:
3062 case INDEX_op_x86_psrldq_vec:
3063 return &x_x;
3064 case INDEX_op_x86_vpblendvb_vec:
3065 return &x_x_x_x;
3067 default:
3068 break;
3070 return NULL;
3073 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3075 switch (opc) {
3076 case INDEX_op_add_vec:
3077 case INDEX_op_sub_vec:
3078 case INDEX_op_and_vec:
3079 case INDEX_op_or_vec:
3080 case INDEX_op_xor_vec:
3081 case INDEX_op_andc_vec:
3082 return 1;
3083 case INDEX_op_cmp_vec:
3084 return -1;
3086 case INDEX_op_shli_vec:
3087 case INDEX_op_shri_vec:
3088 /* We must expand the operation for MO_8. */
3089 return vece == MO_8 ? -1 : 1;
3091 case INDEX_op_sari_vec:
3092 /* We must expand the operation for MO_8. */
3093 if (vece == MO_8) {
3094 return -1;
3096 /* We can emulate this for MO_64, but it does not pay off
3097 unless we're producing at least 4 values. */
3098 if (vece == MO_64) {
3099 return type >= TCG_TYPE_V256 ? -1 : 0;
3101 return 1;
3103 case INDEX_op_mul_vec:
3104 if (vece == MO_8) {
3105 /* We can expand the operation for MO_8. */
3106 return -1;
3108 if (vece == MO_64) {
3109 return 0;
3111 return 1;
3113 case INDEX_op_ssadd_vec:
3114 case INDEX_op_usadd_vec:
3115 case INDEX_op_sssub_vec:
3116 case INDEX_op_ussub_vec:
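        /* The SSE saturating add/sub instructions (PADDS*, PADDUS*,
           PSUBS*, PSUBUS*) only exist for 8- and 16-bit elements. */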
3117 return vece <= MO_16;
3119 default:
3120 return 0;
3124 static void expand_vec_shi(TCGType type, unsigned vece, bool shr,
3125 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3127 TCGv_vec t1, t2;
3129 tcg_debug_assert(vece == MO_8);
3131 t1 = tcg_temp_new_vec(type);
3132 t2 = tcg_temp_new_vec(type);
3134 /* Unpack to W, shift, and repack. Tricky bits:
3135 (1) Use punpck*bw x,x to produce DDCCBBAA,
3136 i.e. duplicate in other half of the 16-bit lane.
3137 (2) For right-shift, add 8 so that the high half of
3138 the lane becomes zero. For left-shift, we must
3139 shift up and down again.
3140 (3) Step 2 leaves high half zero such that PACKUSWB
3141 (pack with unsigned saturation) does not modify
3142 the quantity. */
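    /* For example, a right shift of the byte 0x81 by 1: punpcklbw
       duplicates it into the word 0x8181, the 16-bit shift by 1 + 8
       yields 0x0040, and PACKUSWB returns the byte 0x40 == 0x81 >> 1. */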
3143 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3144 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3145 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3146 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3148 if (shr) {
3149 tcg_gen_shri_vec(MO_16, t1, t1, imm + 8);
3150 tcg_gen_shri_vec(MO_16, t2, t2, imm + 8);
3151 } else {
3152 tcg_gen_shli_vec(MO_16, t1, t1, imm + 8);
3153 tcg_gen_shli_vec(MO_16, t2, t2, imm + 8);
3154 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3155 tcg_gen_shri_vec(MO_16, t2, t2, 8);
3158 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3159 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3160 tcg_temp_free_vec(t1);
3161 tcg_temp_free_vec(t2);
3164 static void expand_vec_sari(TCGType type, unsigned vece,
3165 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3167 TCGv_vec t1, t2;
3169 switch (vece) {
3170 case MO_8:
3171 /* Unpack to W, shift, and repack, as in expand_vec_shi. */
3172 t1 = tcg_temp_new_vec(type);
3173 t2 = tcg_temp_new_vec(type);
3174 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3175 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3176 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3177 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3178 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3179 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3180 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3181 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3182 tcg_temp_free_vec(t1);
3183 tcg_temp_free_vec(t2);
3184 break;
3186 case MO_64:
3187 if (imm <= 32) {
3188 /* We can emulate a small sign extend by performing an arithmetic
3189 * 32-bit shift and overwriting the high half of a 64-bit logical
3190 * shift (note that the ISA says shift of 32 is valid).
3191 */
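            /* The blend immediate 0xaa (0b10101010) takes the odd 32-bit
               elements, i.e. the high half of each 64-bit lane, from the
               arithmetic-shift result while the low halves keep the
               logical-shift result. */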
3192 t1 = tcg_temp_new_vec(type);
3193 tcg_gen_sari_vec(MO_32, t1, v1, imm);
3194 tcg_gen_shri_vec(MO_64, v0, v1, imm);
3195 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3196 tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3197 tcgv_vec_arg(t1), 0xaa);
3198 tcg_temp_free_vec(t1);
3199 } else {
3200 /* Otherwise we will need to use a compare vs 0 to produce
3201 * the sign-extend, shift and merge.
3202 */
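            /* The GT comparison against zero yields an all-ones mask for
               each negative element; shifted left by 64 - imm it supplies
               exactly the sign bits that the logical right shift cleared. */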
3203 t1 = tcg_const_zeros_vec(type);
3204 tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
3205 tcg_gen_shri_vec(MO_64, v0, v1, imm);
3206 tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3207 tcg_gen_or_vec(MO_64, v0, v0, t1);
3208 tcg_temp_free_vec(t1);
3210 break;
3212 default:
3213 g_assert_not_reached();
3217 static void expand_vec_mul(TCGType type, unsigned vece,
3218 TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3220 TCGv_vec t1, t2, t3, t4;
3222 tcg_debug_assert(vece == MO_8);
3224 /*
3225 * Unpack v1 bytes to words, 0 | x.
3226 * Unpack v2 bytes to words, y | 0.
3227 * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3228 * Shift logical right by 8 bits to clear the high 8 bits before
3229 * using an unsigned saturated pack.
3231 * The difference between the V64, V128 and V256 cases is merely how
3232 * we distribute the expansion between temporaries.
3233 */
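    /* For example, x = 3, y = 5: the unpacked words are 0x0003 and
       0x0500, the 16-bit product is 0x0f00, and the shift right by 8
       leaves 0x000f with a zero high byte, so the pack cannot saturate. */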
3234 switch (type) {
3235 case TCG_TYPE_V64:
3236 t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3237 t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3238 tcg_gen_dup16i_vec(t2, 0);
3239 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3240 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t2));
3241 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3242 tcgv_vec_arg(t2), tcgv_vec_arg(t2), tcgv_vec_arg(v2));
3243 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3244 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3245 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3246 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3247 tcg_temp_free_vec(t1);
3248 tcg_temp_free_vec(t2);
3249 break;
3251 case TCG_TYPE_V128:
3252 case TCG_TYPE_V256:
3253 t1 = tcg_temp_new_vec(type);
3254 t2 = tcg_temp_new_vec(type);
3255 t3 = tcg_temp_new_vec(type);
3256 t4 = tcg_temp_new_vec(type);
3257 tcg_gen_dup16i_vec(t4, 0);
3258 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3259 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
3260 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3261 tcgv_vec_arg(t2), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
3262 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3263 tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
3264 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3265 tcgv_vec_arg(t4), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
3266 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3267 tcg_gen_mul_vec(MO_16, t3, t3, t4);
3268 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3269 tcg_gen_shri_vec(MO_16, t3, t3, 8);
3270 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3271 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3272 tcg_temp_free_vec(t1);
3273 tcg_temp_free_vec(t2);
3274 tcg_temp_free_vec(t3);
3275 tcg_temp_free_vec(t4);
3276 break;
3278 default:
3279 g_assert_not_reached();
3283 static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3284 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3286 enum {
3287 NEED_SWAP = 1,
3288 NEED_INV = 2,
3289 NEED_BIAS = 4
3291 static const uint8_t fixups[16] = {
3292 [0 ... 15] = -1,
3293 [TCG_COND_EQ] = 0,
3294 [TCG_COND_NE] = NEED_INV,
3295 [TCG_COND_GT] = 0,
3296 [TCG_COND_LT] = NEED_SWAP,
3297 [TCG_COND_LE] = NEED_INV,
3298 [TCG_COND_GE] = NEED_SWAP | NEED_INV,
3299 [TCG_COND_GTU] = NEED_BIAS,
3300 [TCG_COND_LTU] = NEED_BIAS | NEED_SWAP,
3301 [TCG_COND_LEU] = NEED_BIAS | NEED_INV,
3302 [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV,
3304 TCGv_vec t1, t2;
3305 uint8_t fixup;
3307 fixup = fixups[cond & 15];
3308 tcg_debug_assert(fixup != 0xff);
3310 if (fixup & NEED_INV) {
3311 cond = tcg_invert_cond(cond);
3313 if (fixup & NEED_SWAP) {
3314 t1 = v1, v1 = v2, v2 = t1;
3315 cond = tcg_swap_cond(cond);
3318 t1 = t2 = NULL;
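    /* x86 only provides signed PCMPGT, so unsigned comparisons are biased:
       subtracting 0x80..0 from both operands flips the sign bit and maps
       unsigned ordering onto signed ordering. */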
3319 if (fixup & NEED_BIAS) {
3320 t1 = tcg_temp_new_vec(type);
3321 t2 = tcg_temp_new_vec(type);
3322 tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
3323 tcg_gen_sub_vec(vece, t1, v1, t2);
3324 tcg_gen_sub_vec(vece, t2, v2, t2);
3325 v1 = t1;
3326 v2 = t2;
3327 cond = tcg_signed_cond(cond);
3330 tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3331 /* Expand directly; do not recurse. */
3332 vec_gen_4(INDEX_op_cmp_vec, type, vece,
3333 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3335 if (t1) {
3336 tcg_temp_free_vec(t1);
3337 if (t2) {
3338 tcg_temp_free_vec(t2);
3341 if (fixup & NEED_INV) {
3342 tcg_gen_not_vec(vece, v0, v0);
3346 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3347 TCGArg a0, ...)
3349 va_list va;
3350 TCGArg a2;
3351 TCGv_vec v0, v1, v2;
3353 va_start(va, a0);
3354 v0 = temp_tcgv_vec(arg_temp(a0));
3355 v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3356 a2 = va_arg(va, TCGArg);
3358 switch (opc) {
3359 case INDEX_op_shli_vec:
3360 case INDEX_op_shri_vec:
3361 expand_vec_shi(type, vece, opc == INDEX_op_shri_vec, v0, v1, a2);
3362 break;
3364 case INDEX_op_sari_vec:
3365 expand_vec_sari(type, vece, v0, v1, a2);
3366 break;
3368 case INDEX_op_mul_vec:
3369 v2 = temp_tcgv_vec(arg_temp(a2));
3370 expand_vec_mul(type, vece, v0, v1, v2);
3371 break;
3373 case INDEX_op_cmp_vec:
3374 v2 = temp_tcgv_vec(arg_temp(a2));
3375 expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3376 break;
3378 default:
3379 break;
3382 va_end(va);
3385 static const int tcg_target_callee_save_regs[] = {
3386 #if TCG_TARGET_REG_BITS == 64
3387 TCG_REG_RBP,
3388 TCG_REG_RBX,
3389 #if defined(_WIN64)
3390 TCG_REG_RDI,
3391 TCG_REG_RSI,
3392 #endif
3393 TCG_REG_R12,
3394 TCG_REG_R13,
3395 TCG_REG_R14, /* Currently used for the global env. */
3396 TCG_REG_R15,
3397 #else
3398 TCG_REG_EBP, /* Currently used for the global env. */
3399 TCG_REG_EBX,
3400 TCG_REG_ESI,
3401 TCG_REG_EDI,
3402 #endif
3405 /* Compute frame size via macros, to share between tcg_target_qemu_prologue
3406 and tcg_register_jit. */
3408 #define PUSH_SIZE \
3409 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
3410 * (TCG_TARGET_REG_BITS / 8))
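/* The "1 +" accounts for the return address pushed by the call into the
   generated prologue, which sits above the callee-saved registers. */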
3412 #define FRAME_SIZE \
3413 ((PUSH_SIZE \
3414 + TCG_STATIC_CALL_ARGS_SIZE \
3415 + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3416 + TCG_TARGET_STACK_ALIGN - 1) \
3417 & ~(TCG_TARGET_STACK_ALIGN - 1))
3419 /* Generate global QEMU prologue and epilogue code */
3420 static void tcg_target_qemu_prologue(TCGContext *s)
3422 int i, stack_addend;
3424 /* TB prologue */
3426 /* Reserve some stack space, also for TCG temps. */
3427 stack_addend = FRAME_SIZE - PUSH_SIZE;
3428 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3429 CPU_TEMP_BUF_NLONGS * sizeof(long));
3431 /* Save all callee saved registers. */
3432 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3433 tcg_out_push(s, tcg_target_callee_save_regs[i]);
3436 #if TCG_TARGET_REG_BITS == 32
3437 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3438 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3439 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3440 /* jmp *tb. */
3441 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3442 (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3443 + stack_addend);
3444 #else
3445 # if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
3446 if (guest_base) {
3447 int seg = setup_guest_base_seg();
3448 if (seg != 0) {
3449 x86_guest_base_seg = seg;
3450 } else if (guest_base == (int32_t)guest_base) {
3451 x86_guest_base_offset = guest_base;
3452 } else {
3453 /* Choose R12 because, as a base, it requires a SIB byte. */
3454 x86_guest_base_index = TCG_REG_R12;
3455 tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
3456 tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
3459 # endif
3460 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3461 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3462 /* jmp *tb. */
3463 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3464 #endif
3466 /*
3467 * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3468 * and fall through to the rest of the epilogue.
3469 */
3470 s->code_gen_epilogue = s->code_ptr;
3471 tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3473 /* TB epilogue */
3474 tb_ret_addr = s->code_ptr;
3476 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3478 if (have_avx2) {
3479 tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3481 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3482 tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3484 tcg_out_opc(s, OPC_RET, 0, 0, 0);
3487 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3489 memset(p, 0x90, count);
3492 static void tcg_target_init(TCGContext *s)
3494 #ifdef CONFIG_CPUID_H
3495 unsigned a, b, c, d, b7 = 0;
3496 int max = __get_cpuid_max(0, 0);
3498 if (max >= 7) {
3499 /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */
3500 __cpuid_count(7, 0, a, b7, c, d);
3501 have_bmi1 = (b7 & bit_BMI) != 0;
3502 have_bmi2 = (b7 & bit_BMI2) != 0;
3505 if (max >= 1) {
3506 __cpuid(1, a, b, c, d);
3507 #ifndef have_cmov
3508 /* For 32-bit, 99% certainty that we're running on hardware that
3509 supports cmov, but we still need to check. In case cmov is not
3510 available, we'll use a small forward branch. */
3511 have_cmov = (d & bit_CMOV) != 0;
3512 #endif
3514 /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
3515 need to probe for it. */
3516 have_movbe = (c & bit_MOVBE) != 0;
3517 have_popcnt = (c & bit_POPCNT) != 0;
3519 /* There are a number of things we must check before we can be
3520 sure of not hitting invalid opcode. */
3521 if (c & bit_OSXSAVE) {
3522 unsigned xcrl, xcrh;
3523 /* The xgetbv instruction is not available to older versions of
3524 * the assembler, so we encode the instruction manually.
3525 */
3526 asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
3527 if ((xcrl & 6) == 6) {
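    /* XCR0 bits 1 (SSE state) and 2 (AVX state) must both be
       enabled by the OS before AVX instructions may be used. */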
3528 have_avx1 = (c & bit_AVX) != 0;
3529 have_avx2 = (b7 & bit_AVX2) != 0;
3534 max = __get_cpuid_max(0x8000000, 0);
3535 if (max >= 1) {
3536 __cpuid(0x80000001, a, b, c, d);
3537 /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs. */
3538 have_lzcnt = (c & bit_LZCNT) != 0;
3540 #endif /* CONFIG_CPUID_H */
3542 tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3543 if (TCG_TARGET_REG_BITS == 64) {
3544 tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3546 if (have_avx1) {
3547 tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3548 tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3550 if (have_avx2) {
3551 tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3554 tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
3555 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3556 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3557 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3558 if (TCG_TARGET_REG_BITS == 64) {
3559 #if !defined(_WIN64)
3560 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3561 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3562 #endif
3563 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3564 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3565 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3566 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3569 s->reserved_regs = 0;
3570 tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
3573 typedef struct {
3574 DebugFrameHeader h;
3575 uint8_t fde_def_cfa[4];
3576 uint8_t fde_reg_ofs[14];
3577 } DebugFrame;
3579 /* We're expecting a 2 byte uleb128 encoded value. */
3580 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
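/* Each uleb128 byte holds 7 bits, so two bytes cover values below 1 << 14;
   this matches the two-byte FRAME_SIZE encoding used in fde_def_cfa below. */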
3582 #if !defined(__ELF__)
3583 /* Host machine without ELF. */
3584 #elif TCG_TARGET_REG_BITS == 64
3585 #define ELF_HOST_MACHINE EM_X86_64
3586 static const DebugFrame debug_frame = {
3587 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3588 .h.cie.id = -1,
3589 .h.cie.version = 1,
3590 .h.cie.code_align = 1,
3591 .h.cie.data_align = 0x78, /* sleb128 -8 */
3592 .h.cie.return_column = 16,
3594 /* Total FDE size does not include the "len" member. */
3595 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3597 .fde_def_cfa = {
3598 12, 7, /* DW_CFA_def_cfa %rsp, ... */
3599 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
3600 (FRAME_SIZE >> 7)
3602 .fde_reg_ofs = {
3603 0x90, 1, /* DW_CFA_offset, %rip, -8 */
3604 /* The following ordering must match tcg_target_callee_save_regs. */
3605 0x86, 2, /* DW_CFA_offset, %rbp, -16 */
3606 0x83, 3, /* DW_CFA_offset, %rbx, -24 */
3607 0x8c, 4, /* DW_CFA_offset, %r12, -32 */
3608 0x8d, 5, /* DW_CFA_offset, %r13, -40 */
3609 0x8e, 6, /* DW_CFA_offset, %r14, -48 */
3610 0x8f, 7, /* DW_CFA_offset, %r15, -56 */
3613 #else
3614 #define ELF_HOST_MACHINE EM_386
3615 static const DebugFrame debug_frame = {
3616 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3617 .h.cie.id = -1,
3618 .h.cie.version = 1,
3619 .h.cie.code_align = 1,
3620 .h.cie.data_align = 0x7c, /* sleb128 -4 */
3621 .h.cie.return_column = 8,
3623 /* Total FDE size does not include the "len" member. */
3624 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3626 .fde_def_cfa = {
3627 12, 4, /* DW_CFA_def_cfa %esp, ... */
3628 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
3629 (FRAME_SIZE >> 7)
3631 .fde_reg_ofs = {
3632 0x88, 1, /* DW_CFA_offset, %eip, -4 */
3633 /* The following ordering must match tcg_target_callee_save_regs. */
3634 0x85, 2, /* DW_CFA_offset, %ebp, -8 */
3635 0x83, 3, /* DW_CFA_offset, %ebx, -12 */
3636 0x86, 4, /* DW_CFA_offset, %esi, -16 */
3637 0x87, 5, /* DW_CFA_offset, %edi, -20 */
3640 #endif
3642 #if defined(ELF_HOST_MACHINE)
3643 void tcg_register_jit(void *buf, size_t buf_size)
3645 tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
3647 #endif