tcg/i386/tcg-target.inc.c (qemu/ar7.git)
1 /*
2 * Tiny Code Generator for QEMU
4 * Copyright (c) 2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
25 #include "tcg-pool.inc.c"
27 #ifdef CONFIG_DEBUG_TCG
28 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
29 #if TCG_TARGET_REG_BITS == 64
30 "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
31 #else
32 "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
33 #endif
34 "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
35 "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
36 #if TCG_TARGET_REG_BITS == 64
37 "%xmm8", "%xmm9", "%xmm10", "%xmm11",
38 "%xmm12", "%xmm13", "%xmm14", "%xmm15",
39 #endif
41 #endif
43 static const int tcg_target_reg_alloc_order[] = {
44 #if TCG_TARGET_REG_BITS == 64
45 TCG_REG_RBP,
46 TCG_REG_RBX,
47 TCG_REG_R12,
48 TCG_REG_R13,
49 TCG_REG_R14,
50 TCG_REG_R15,
51 TCG_REG_R10,
52 TCG_REG_R11,
53 TCG_REG_R9,
54 TCG_REG_R8,
55 TCG_REG_RCX,
56 TCG_REG_RDX,
57 TCG_REG_RSI,
58 TCG_REG_RDI,
59 TCG_REG_RAX,
60 #else
61 TCG_REG_EBX,
62 TCG_REG_ESI,
63 TCG_REG_EDI,
64 TCG_REG_EBP,
65 TCG_REG_ECX,
66 TCG_REG_EDX,
67 TCG_REG_EAX,
68 #endif
69 TCG_REG_XMM0,
70 TCG_REG_XMM1,
71 TCG_REG_XMM2,
72 TCG_REG_XMM3,
73 TCG_REG_XMM4,
74 TCG_REG_XMM5,
75 #ifndef _WIN64
76 /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
77 any of them. Therefore only allow xmm0-xmm5 to be allocated. */
78 TCG_REG_XMM6,
79 TCG_REG_XMM7,
80 #if TCG_TARGET_REG_BITS == 64
81 TCG_REG_XMM8,
82 TCG_REG_XMM9,
83 TCG_REG_XMM10,
84 TCG_REG_XMM11,
85 TCG_REG_XMM12,
86 TCG_REG_XMM13,
87 TCG_REG_XMM14,
88 TCG_REG_XMM15,
89 #endif
90 #endif
93 static const int tcg_target_call_iarg_regs[] = {
94 #if TCG_TARGET_REG_BITS == 64
95 #if defined(_WIN64)
96 TCG_REG_RCX,
97 TCG_REG_RDX,
98 #else
99 TCG_REG_RDI,
100 TCG_REG_RSI,
101 TCG_REG_RDX,
102 TCG_REG_RCX,
103 #endif
104 TCG_REG_R8,
105 TCG_REG_R9,
106 #else
107 /* 32-bit mode uses a stack-based calling convention (GCC default). */
108 #endif
111 static const int tcg_target_call_oarg_regs[] = {
112 TCG_REG_EAX,
113 #if TCG_TARGET_REG_BITS == 32
114 TCG_REG_EDX
115 #endif
118 /* Constants we accept. */
119 #define TCG_CT_CONST_S32 0x100
120 #define TCG_CT_CONST_U32 0x200
121 #define TCG_CT_CONST_I32 0x400
122 #define TCG_CT_CONST_WSZ 0x800
124 /* Registers used with the L constraint, which are the first argument
125 registers on x86_64, and two random call-clobbered registers on
126 i386. */
127 #if TCG_TARGET_REG_BITS == 64
128 # define TCG_REG_L0 tcg_target_call_iarg_regs[0]
129 # define TCG_REG_L1 tcg_target_call_iarg_regs[1]
130 #else
131 # define TCG_REG_L0 TCG_REG_EAX
132 # define TCG_REG_L1 TCG_REG_EDX
133 #endif
135 /* The host compiler should supply <cpuid.h> to enable runtime feature
136 detection, as we're not going to go so far as our own inline assembly.
137 If not available, default values will be assumed. */
138 #if defined(CONFIG_CPUID_H)
139 #include "qemu/cpuid.h"
140 #endif
142 /* For 64-bit, we always know that CMOV is available. */
143 #if TCG_TARGET_REG_BITS == 64
144 # define have_cmov 1
145 #elif defined(CONFIG_CPUID_H)
146 static bool have_cmov;
147 #else
148 # define have_cmov 0
149 #endif
151 /* We need these symbols in tcg-target.h, and we can't properly conditionalize
152 them there. Therefore we always define the variables. */
153 bool have_bmi1;
154 bool have_popcnt;
155 bool have_avx1;
156 bool have_avx2;
158 #ifdef CONFIG_CPUID_H
159 static bool have_movbe;
160 static bool have_bmi2;
161 static bool have_lzcnt;
162 #else
163 # define have_movbe 0
164 # define have_bmi2 0
165 # define have_lzcnt 0
166 #endif
168 static tcg_insn_unit *tb_ret_addr;
170 static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
171 intptr_t value, intptr_t addend)
173 value += addend;
174 switch(type) {
175 case R_386_PC32:
176 value -= (uintptr_t)code_ptr;
177 if (value != (int32_t)value) {
178 return false;
180 /* FALLTHRU */
181 case R_386_32:
182 tcg_patch32(code_ptr, value);
183 break;
184 case R_386_PC8:
185 value -= (uintptr_t)code_ptr;
186 if (value != (int8_t)value) {
187 return false;
189 tcg_patch8(code_ptr, value);
190 break;
191 default:
192 tcg_abort();
194 return true;
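/* Note: for R_386_PC32 the 32-bit field at code_ptr receives
   (value + addend - code_ptr); patch_reloc() reports failure rather than
   truncating when the displacement does not fit the relocation type. */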
197 #if TCG_TARGET_REG_BITS == 64
198 #define ALL_GENERAL_REGS 0x0000ffffu
199 #define ALL_VECTOR_REGS 0xffff0000u
200 #else
201 #define ALL_GENERAL_REGS 0x000000ffu
202 #define ALL_VECTOR_REGS 0x00ff0000u
203 #endif
205 /* parse target specific constraints */
206 static const char *target_parse_constraint(TCGArgConstraint *ct,
207 const char *ct_str, TCGType type)
209 switch(*ct_str++) {
210 case 'a':
211 ct->ct |= TCG_CT_REG;
212 tcg_regset_set_reg(ct->u.regs, TCG_REG_EAX);
213 break;
214 case 'b':
215 ct->ct |= TCG_CT_REG;
216 tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
217 break;
218 case 'c':
219 ct->ct |= TCG_CT_REG;
220 tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
221 break;
222 case 'd':
223 ct->ct |= TCG_CT_REG;
224 tcg_regset_set_reg(ct->u.regs, TCG_REG_EDX);
225 break;
226 case 'S':
227 ct->ct |= TCG_CT_REG;
228 tcg_regset_set_reg(ct->u.regs, TCG_REG_ESI);
229 break;
230 case 'D':
231 ct->ct |= TCG_CT_REG;
232 tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
233 break;
234 case 'q':
235 /* A register that can be used as a byte operand. */
236 ct->ct |= TCG_CT_REG;
237 ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
238 break;
239 case 'Q':
240 /* A register with an addressable second byte (e.g. %ah). */
241 ct->ct |= TCG_CT_REG;
242 ct->u.regs = 0xf;
243 break;
244 case 'r':
245 /* A general register. */
246 ct->ct |= TCG_CT_REG;
247 ct->u.regs |= ALL_GENERAL_REGS;
248 break;
249 case 'W':
250 /* With TZCNT/LZCNT, we can have operand-size as an input. */
251 ct->ct |= TCG_CT_CONST_WSZ;
252 break;
253 case 'x':
254 /* A vector register. */
255 ct->ct |= TCG_CT_REG;
256 ct->u.regs |= ALL_VECTOR_REGS;
257 break;
259 /* qemu_ld/st address constraint */
260 case 'L':
261 ct->ct |= TCG_CT_REG;
262 ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
263 tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
264 tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
265 break;
267 case 'e':
268 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
269 break;
270 case 'Z':
271 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
272 break;
273 case 'I':
274 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
275 break;
277 default:
278 return NULL;
280 return ct_str;
283 /* test if a constant matches the constraint */
284 static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
285 const TCGArgConstraint *arg_ct)
287 int ct = arg_ct->ct;
288 if (ct & TCG_CT_CONST) {
289 return 1;
291 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
292 return 1;
294 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
295 return 1;
297 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
298 return 1;
300 if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
301 return 1;
303 return 0;
306 # define LOWREGMASK(x) ((x) & 7)
308 #define P_EXT 0x100 /* 0x0f opcode prefix */
309 #define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */
310 #define P_DATA16 0x400 /* 0x66 opcode prefix */
311 #if TCG_TARGET_REG_BITS == 64
312 # define P_REXW 0x1000 /* Set REX.W = 1 */
313 # define P_REXB_R 0x2000 /* REG field as byte register */
314 # define P_REXB_RM 0x4000 /* R/M field as byte register */
315 # define P_GS 0x8000 /* gs segment override */
316 #else
317 # define P_REXW 0
318 # define P_REXB_R 0
319 # define P_REXB_RM 0
320 # define P_GS 0
321 #endif
322 #define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */
323 #define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */
324 #define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */
325 #define P_VEXL 0x80000 /* Set VEX.L = 1 */
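/* Illustration: each OPC_* constant below ORs the primary opcode byte with
   the P_* prefix flags above; e.g. OPC_MOVZWL == (0xb7 | P_EXT) and is
   emitted by tcg_out_opc() as the byte sequence 0f b7. */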
327 #define OPC_ARITH_EvIz (0x81)
328 #define OPC_ARITH_EvIb (0x83)
329 #define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */
330 #define OPC_ANDN (0xf2 | P_EXT38)
331 #define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3))
332 #define OPC_AND_GvEv (OPC_ARITH_GvEv | (ARITH_AND << 3))
333 #define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16)
334 #define OPC_BSF (0xbc | P_EXT)
335 #define OPC_BSR (0xbd | P_EXT)
336 #define OPC_BSWAP (0xc8 | P_EXT)
337 #define OPC_CALL_Jz (0xe8)
338 #define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */
339 #define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3))
340 #define OPC_DEC_r32 (0x48)
341 #define OPC_IMUL_GvEv (0xaf | P_EXT)
342 #define OPC_IMUL_GvEvIb (0x6b)
343 #define OPC_IMUL_GvEvIz (0x69)
344 #define OPC_INC_r32 (0x40)
345 #define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */
346 #define OPC_JCC_short (0x70) /* ... plus condition code */
347 #define OPC_JMP_long (0xe9)
348 #define OPC_JMP_short (0xeb)
349 #define OPC_LEA (0x8d)
350 #define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3)
351 #define OPC_MOVB_EvGv (0x88) /* stores, more or less */
352 #define OPC_MOVL_EvGv (0x89) /* stores, more or less */
353 #define OPC_MOVL_GvEv (0x8b) /* loads, more or less */
354 #define OPC_MOVB_EvIz (0xc6)
355 #define OPC_MOVL_EvIz (0xc7)
356 #define OPC_MOVL_Iv (0xb8)
357 #define OPC_MOVBE_GyMy (0xf0 | P_EXT38)
358 #define OPC_MOVBE_MyGy (0xf1 | P_EXT38)
359 #define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16)
360 #define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16)
361 #define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2)
362 #define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
363 #define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
364 #define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
365 #define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
366 #define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3)
367 #define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16)
368 #define OPC_MOVSBL (0xbe | P_EXT)
369 #define OPC_MOVSWL (0xbf | P_EXT)
370 #define OPC_MOVSLQ (0x63 | P_REXW)
371 #define OPC_MOVZBL (0xb6 | P_EXT)
372 #define OPC_MOVZWL (0xb7 | P_EXT)
373 #define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16)
374 #define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16)
375 #define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16)
376 #define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16)
377 #define OPC_PADDB (0xfc | P_EXT | P_DATA16)
378 #define OPC_PADDW (0xfd | P_EXT | P_DATA16)
379 #define OPC_PADDD (0xfe | P_EXT | P_DATA16)
380 #define OPC_PADDQ (0xd4 | P_EXT | P_DATA16)
381 #define OPC_PADDSB (0xec | P_EXT | P_DATA16)
382 #define OPC_PADDSW (0xed | P_EXT | P_DATA16)
383 #define OPC_PADDUB (0xdc | P_EXT | P_DATA16)
384 #define OPC_PADDUW (0xdd | P_EXT | P_DATA16)
385 #define OPC_PAND (0xdb | P_EXT | P_DATA16)
386 #define OPC_PANDN (0xdf | P_EXT | P_DATA16)
387 #define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16)
388 #define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16)
389 #define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16)
390 #define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16)
391 #define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16)
392 #define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16)
393 #define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16)
394 #define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16)
395 #define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16)
396 #define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16)
397 #define OPC_PMAXSW (0xee | P_EXT | P_DATA16)
398 #define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16)
399 #define OPC_PMAXUB (0xde | P_EXT | P_DATA16)
400 #define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16)
401 #define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16)
402 #define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16)
403 #define OPC_PMINSW (0xea | P_EXT | P_DATA16)
404 #define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16)
405 #define OPC_PMINUB (0xda | P_EXT | P_DATA16)
406 #define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16)
407 #define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16)
408 #define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16)
409 #define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16)
410 #define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16)
411 #define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16)
412 #define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16)
413 #define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16)
414 #define OPC_PMULLW (0xd5 | P_EXT | P_DATA16)
415 #define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16)
416 #define OPC_POR (0xeb | P_EXT | P_DATA16)
417 #define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16)
418 #define OPC_PSHUFD (0x70 | P_EXT | P_DATA16)
419 #define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2)
420 #define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3)
421 #define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
422 #define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
423 #define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
424 #define OPC_PSUBB (0xf8 | P_EXT | P_DATA16)
425 #define OPC_PSUBW (0xf9 | P_EXT | P_DATA16)
426 #define OPC_PSUBD (0xfa | P_EXT | P_DATA16)
427 #define OPC_PSUBQ (0xfb | P_EXT | P_DATA16)
428 #define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16)
429 #define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16)
430 #define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16)
431 #define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16)
432 #define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16)
433 #define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16)
434 #define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16)
435 #define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16)
436 #define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16)
437 #define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16)
438 #define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16)
439 #define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16)
440 #define OPC_PXOR (0xef | P_EXT | P_DATA16)
441 #define OPC_POP_r32 (0x58)
442 #define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3)
443 #define OPC_PUSH_r32 (0x50)
444 #define OPC_PUSH_Iv (0x68)
445 #define OPC_PUSH_Ib (0x6a)
446 #define OPC_RET (0xc3)
447 #define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
448 #define OPC_SHIFT_1 (0xd1)
449 #define OPC_SHIFT_Ib (0xc1)
450 #define OPC_SHIFT_cl (0xd3)
451 #define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3)
452 #define OPC_SHUFPS (0xc6 | P_EXT)
453 #define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16)
454 #define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2)
455 #define OPC_TESTL (0x85)
456 #define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3)
457 #define OPC_UD2 (0x0b | P_EXT)
458 #define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16)
459 #define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16)
460 #define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
461 #define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
462 #define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
463 #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
464 #define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_REXW)
465 #define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
466 #define OPC_VZEROUPPER (0x77 | P_EXT)
467 #define OPC_XCHG_ax_r32 (0x90)
469 #define OPC_GRP3_Ev (0xf7)
470 #define OPC_GRP5 (0xff)
471 #define OPC_GRP14 (0x73 | P_EXT | P_DATA16)
473 /* Group 1 opcode extensions for 0x80-0x83.
474 These are also used as modifiers for OPC_ARITH. */
475 #define ARITH_ADD 0
476 #define ARITH_OR 1
477 #define ARITH_ADC 2
478 #define ARITH_SBB 3
479 #define ARITH_AND 4
480 #define ARITH_SUB 5
481 #define ARITH_XOR 6
482 #define ARITH_CMP 7
484 /* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */
485 #define SHIFT_ROL 0
486 #define SHIFT_ROR 1
487 #define SHIFT_SHL 4
488 #define SHIFT_SHR 5
489 #define SHIFT_SAR 7
491 /* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. */
492 #define EXT3_NOT 2
493 #define EXT3_NEG 3
494 #define EXT3_MUL 4
495 #define EXT3_IMUL 5
496 #define EXT3_DIV 6
497 #define EXT3_IDIV 7
499 /* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */
500 #define EXT5_INC_Ev 0
501 #define EXT5_DEC_Ev 1
502 #define EXT5_CALLN_Ev 2
503 #define EXT5_JMPN_Ev 4
505 /* Condition codes to be added to OPC_JCC_{long,short}. */
506 #define JCC_JMP (-1)
507 #define JCC_JO 0x0
508 #define JCC_JNO 0x1
509 #define JCC_JB 0x2
510 #define JCC_JAE 0x3
511 #define JCC_JE 0x4
512 #define JCC_JNE 0x5
513 #define JCC_JBE 0x6
514 #define JCC_JA 0x7
515 #define JCC_JS 0x8
516 #define JCC_JNS 0x9
517 #define JCC_JP 0xa
518 #define JCC_JNP 0xb
519 #define JCC_JL 0xc
520 #define JCC_JGE 0xd
521 #define JCC_JLE 0xe
522 #define JCC_JG 0xf
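/* Illustration: a long conditional branch is OPC_JCC_long + cc, so e.g.
   OPC_JCC_long + JCC_JNE yields the two-byte opcode 0f 85 (jne rel32). */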
524 static const uint8_t tcg_cond_to_jcc[] = {
525 [TCG_COND_EQ] = JCC_JE,
526 [TCG_COND_NE] = JCC_JNE,
527 [TCG_COND_LT] = JCC_JL,
528 [TCG_COND_GE] = JCC_JGE,
529 [TCG_COND_LE] = JCC_JLE,
530 [TCG_COND_GT] = JCC_JG,
531 [TCG_COND_LTU] = JCC_JB,
532 [TCG_COND_GEU] = JCC_JAE,
533 [TCG_COND_LEU] = JCC_JBE,
534 [TCG_COND_GTU] = JCC_JA,
537 #if TCG_TARGET_REG_BITS == 64
538 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
540 int rex;
542 if (opc & P_GS) {
543 tcg_out8(s, 0x65);
545 if (opc & P_DATA16) {
546 /* We should never be asking for both 16 and 64-bit operation. */
547 tcg_debug_assert((opc & P_REXW) == 0);
548 tcg_out8(s, 0x66);
550 if (opc & P_SIMDF3) {
551 tcg_out8(s, 0xf3);
552 } else if (opc & P_SIMDF2) {
553 tcg_out8(s, 0xf2);
556 rex = 0;
557 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */
558 rex |= (r & 8) >> 1; /* REX.R */
559 rex |= (x & 8) >> 2; /* REX.X */
560 rex |= (rm & 8) >> 3; /* REX.B */
562 /* P_REXB_{R,RM} indicates that the given register is the low byte.
563 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
564 as otherwise the encoding indicates %[abcd]h. Note that the values
565 that are ORed in merely indicate that the REX byte must be present;
566 those bits get discarded in output. */
567 rex |= opc & (r >= 4 ? P_REXB_R : 0);
568 rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
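/* The assembled REX byte is 0100WRXB: the W/R/X/B bits collected in 'rex'
   above are ORed with 0x40 below, so e.g. rex == 0x8 (P_REXW alone) is
   emitted as 0x48. */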
570 if (rex) {
571 tcg_out8(s, (uint8_t)(rex | 0x40));
574 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
575 tcg_out8(s, 0x0f);
576 if (opc & P_EXT38) {
577 tcg_out8(s, 0x38);
578 } else if (opc & P_EXT3A) {
579 tcg_out8(s, 0x3a);
583 tcg_out8(s, opc);
585 #else
586 static void tcg_out_opc(TCGContext *s, int opc)
588 if (opc & P_DATA16) {
589 tcg_out8(s, 0x66);
591 if (opc & P_SIMDF3) {
592 tcg_out8(s, 0xf3);
593 } else if (opc & P_SIMDF2) {
594 tcg_out8(s, 0xf2);
596 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
597 tcg_out8(s, 0x0f);
598 if (opc & P_EXT38) {
599 tcg_out8(s, 0x38);
600 } else if (opc & P_EXT3A) {
601 tcg_out8(s, 0x3a);
604 tcg_out8(s, opc);
606 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
607 the 32-bit compilation paths. This method works with all versions of gcc,
608 whereas relying on compiler optimization to eliminate them may not work. */
609 #define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc)
610 #endif
612 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
614 tcg_out_opc(s, opc, r, rm, 0);
615 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
618 static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
619 int rm, int index)
621 int tmp;
623 /* Use the two byte form if possible, which cannot encode
624 VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */
625 if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
626 && ((rm | index) & 8) == 0) {
627 /* Two byte VEX prefix. */
628 tcg_out8(s, 0xc5);
630 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */
631 } else {
632 /* Three byte VEX prefix. */
633 tcg_out8(s, 0xc4);
635 /* VEX.m-mmmm */
636 if (opc & P_EXT3A) {
637 tmp = 3;
638 } else if (opc & P_EXT38) {
639 tmp = 2;
640 } else if (opc & P_EXT) {
641 tmp = 1;
642 } else {
643 g_assert_not_reached();
645 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */
646 tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */
647 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */
648 tcg_out8(s, tmp);
650 tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */
653 tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */
654 /* VEX.pp */
655 if (opc & P_DATA16) {
656 tmp |= 1; /* 0x66 */
657 } else if (opc & P_SIMDF3) {
658 tmp |= 2; /* 0xf3 */
659 } else if (opc & P_SIMDF2) {
660 tmp |= 3; /* 0xf2 */
662 tmp |= (~v & 15) << 3; /* VEX.vvvv */
663 tcg_out8(s, tmp);
664 tcg_out8(s, opc);
667 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
669 tcg_out_vex_opc(s, opc, r, v, rm, 0);
670 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
673 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
674 We handle either RM or INDEX missing with a negative value. In 64-bit
675 mode for absolute addresses, ~RM is the size of the immediate operand
676 that will follow the instruction. */
678 static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
679 int shift, intptr_t offset)
681 int mod, len;
683 if (index < 0 && rm < 0) {
684 if (TCG_TARGET_REG_BITS == 64) {
685 /* Try for a rip-relative addressing mode. This has replaced
686 the 32-bit-mode absolute addressing encoding. */
687 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
688 intptr_t disp = offset - pc;
689 if (disp == (int32_t)disp) {
690 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
691 tcg_out32(s, disp);
692 return;
695 /* Try for an absolute address encoding. This requires the
696 use of the MODRM+SIB encoding and is therefore larger than
697 rip-relative addressing. */
698 if (offset == (int32_t)offset) {
699 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
700 tcg_out8(s, (4 << 3) | 5);
701 tcg_out32(s, offset);
702 return;
705 /* ??? The memory isn't directly addressable. */
706 g_assert_not_reached();
707 } else {
708 /* Absolute address. */
709 tcg_out8(s, (r << 3) | 5);
710 tcg_out32(s, offset);
711 return;
715 /* Find the length of the immediate addend. Note that the encoding
716 that would be used for (%ebp) indicates absolute addressing. */
717 if (rm < 0) {
718 mod = 0, len = 4, rm = 5;
719 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
720 mod = 0, len = 0;
721 } else if (offset == (int8_t)offset) {
722 mod = 0x40, len = 1;
723 } else {
724 mod = 0x80, len = 4;
727 /* Use a single byte MODRM format if possible. Note that the encoding
728 that would be used for %esp is the escape to the two byte form. */
729 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
730 /* Single byte MODRM format. */
731 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
732 } else {
733 /* Two byte MODRM+SIB format. */
735 /* Note that the encoding that would place %esp into the index
736 field indicates no index register. In 64-bit mode, the REX.X
737 bit counts, so %r12 can be used as the index. */
738 if (index < 0) {
739 index = 4;
740 } else {
741 tcg_debug_assert(index != TCG_REG_ESP);
744 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
745 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
748 if (len == 1) {
749 tcg_out8(s, offset);
750 } else if (len == 4) {
751 tcg_out32(s, offset);
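/* Worked example for tcg_out_sib_offset above: r = %eax, rm = %ebx,
   index = %esi, shift = 2, offset = 0x10 emits ModRM 0x44, SIB 0xb3 and a
   disp8 of 0x10, i.e. the operand 0x10(%ebx,%esi,4). */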
755 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
756 int index, int shift, intptr_t offset)
758 tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
759 tcg_out_sib_offset(s, r, rm, index, shift, offset);
762 static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
763 int rm, int index, int shift,
764 intptr_t offset)
766 tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
767 tcg_out_sib_offset(s, r, rm, index, shift, offset);
770 /* A simplification of the above with no index or shift. */
771 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
772 int rm, intptr_t offset)
774 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
777 static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
778 int v, int rm, intptr_t offset)
780 tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
783 /* Output an opcode with an expected reference to the constant pool. */
784 static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
786 tcg_out_opc(s, opc, r, 0, 0);
787 /* Absolute for 32-bit, pc-relative for 64-bit. */
788 tcg_out8(s, LOWREGMASK(r) << 3 | 5);
789 tcg_out32(s, 0);
792 /* Output an opcode with an expected reference to the constant pool. */
793 static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
795 tcg_out_vex_opc(s, opc, r, 0, 0, 0);
796 /* Absolute for 32-bit, pc-relative for 64-bit. */
797 tcg_out8(s, LOWREGMASK(r) << 3 | 5);
798 tcg_out32(s, 0);
801 /* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */
802 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
804 /* Propagate an opcode prefix, such as P_REXW. */
805 int ext = subop & ~0x7;
806 subop &= 0x7;
808 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
811 static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
813 int rexw = 0;
815 if (arg == ret) {
816 return;
818 switch (type) {
819 case TCG_TYPE_I64:
820 rexw = P_REXW;
821 /* fallthru */
822 case TCG_TYPE_I32:
823 if (ret < 16) {
824 if (arg < 16) {
825 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
826 } else {
827 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
829 } else {
830 if (arg < 16) {
831 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
832 } else {
833 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
836 break;
838 case TCG_TYPE_V64:
839 tcg_debug_assert(ret >= 16 && arg >= 16);
840 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
841 break;
842 case TCG_TYPE_V128:
843 tcg_debug_assert(ret >= 16 && arg >= 16);
844 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
845 break;
846 case TCG_TYPE_V256:
847 tcg_debug_assert(ret >= 16 && arg >= 16);
848 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
849 break;
851 default:
852 g_assert_not_reached();
856 static void tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
857 TCGReg r, TCGReg a)
859 if (have_avx2) {
860 static const int dup_insn[4] = {
861 OPC_VPBROADCASTB, OPC_VPBROADCASTW,
862 OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
864 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
865 tcg_out_vex_modrm(s, dup_insn[vece] + vex_l, r, 0, a);
866 } else {
867 switch (vece) {
868 case MO_8:
869 /* ??? With zero in a register, use PSHUFB. */
870 tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
871 a = r;
872 /* FALLTHRU */
873 case MO_16:
874 tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
875 a = r;
876 /* FALLTHRU */
877 case MO_32:
878 tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
879 /* imm8 operand: all output lanes selected from input lane 0. */
880 tcg_out8(s, 0);
881 break;
882 case MO_64:
883 tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
884 break;
885 default:
886 g_assert_not_reached();
891 static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
892 TCGReg ret, tcg_target_long arg)
894 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
896 if (arg == 0) {
897 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
898 return;
900 if (arg == -1) {
901 tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
902 return;
905 if (TCG_TARGET_REG_BITS == 64) {
906 if (type == TCG_TYPE_V64) {
907 tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
908 } else if (have_avx2) {
909 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
910 } else {
911 tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
913 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
914 } else if (have_avx2) {
915 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
916 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
917 } else {
918 tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy, ret);
919 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
920 tcg_out_dup_vec(s, type, MO_32, ret, ret);
924 static void tcg_out_movi(TCGContext *s, TCGType type,
925 TCGReg ret, tcg_target_long arg)
927 tcg_target_long diff;
929 switch (type) {
930 case TCG_TYPE_I32:
931 #if TCG_TARGET_REG_BITS == 64
932 case TCG_TYPE_I64:
933 #endif
934 if (ret < 16) {
935 break;
937 /* fallthru */
938 case TCG_TYPE_V64:
939 case TCG_TYPE_V128:
940 case TCG_TYPE_V256:
941 tcg_debug_assert(ret >= 16);
942 tcg_out_dupi_vec(s, type, ret, arg);
943 return;
944 default:
945 g_assert_not_reached();
948 if (arg == 0) {
949 tgen_arithr(s, ARITH_XOR, ret, ret);
950 return;
952 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
953 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
954 tcg_out32(s, arg);
955 return;
957 if (arg == (int32_t)arg) {
958 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
959 tcg_out32(s, arg);
960 return;
963 /* Try a 7 byte pc-relative lea before the 10 byte movq. */
964 diff = arg - ((uintptr_t)s->code_ptr + 7);
965 if (diff == (int32_t)diff) {
966 tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
967 tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
968 tcg_out32(s, diff);
969 return;
972 tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
973 tcg_out64(s, arg);
976 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
978 if (val == (int8_t)val) {
979 tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
980 tcg_out8(s, val);
981 } else if (val == (int32_t)val) {
982 tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
983 tcg_out32(s, val);
984 } else {
985 tcg_abort();
989 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
991 /* Given the strength of x86 memory ordering, we only need to care about
992 store-load ordering. Experimentally, "lock orl $0,0(%esp)" is
993 faster than "mfence", so don't bother with the sse insn. */
994 if (a0 & TCG_MO_ST_LD) {
995 tcg_out8(s, 0xf0);
996 tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
997 tcg_out8(s, 0);
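/* For reference, the barrier sequence above assembles to the five bytes
   f0 83 0c 24 00, i.e. "lock orl $0,(%esp)" ((%rsp) on 64-bit hosts). */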
1001 static inline void tcg_out_push(TCGContext *s, int reg)
1003 tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1006 static inline void tcg_out_pop(TCGContext *s, int reg)
1008 tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1011 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1012 TCGReg arg1, intptr_t arg2)
1014 switch (type) {
1015 case TCG_TYPE_I32:
1016 if (ret < 16) {
1017 tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1018 } else {
1019 tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1021 break;
1022 case TCG_TYPE_I64:
1023 if (ret < 16) {
1024 tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1025 break;
1027 /* FALLTHRU */
1028 case TCG_TYPE_V64:
1029 tcg_debug_assert(ret >= 16);
1030 tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1031 break;
1032 case TCG_TYPE_V128:
1033 tcg_debug_assert(ret >= 16);
1034 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx, ret, 0, arg1, arg2);
1035 break;
1036 case TCG_TYPE_V256:
1037 tcg_debug_assert(ret >= 16);
1038 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1039 ret, 0, arg1, arg2);
1040 break;
1041 default:
1042 g_assert_not_reached();
1046 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1047 TCGReg arg1, intptr_t arg2)
1049 switch (type) {
1050 case TCG_TYPE_I32:
1051 if (arg < 16) {
1052 tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1053 } else {
1054 tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1056 break;
1057 case TCG_TYPE_I64:
1058 if (arg < 16) {
1059 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1060 break;
1062 /* FALLTHRU */
1063 case TCG_TYPE_V64:
1064 tcg_debug_assert(arg >= 16);
1065 tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1066 break;
1067 case TCG_TYPE_V128:
1068 tcg_debug_assert(arg >= 16);
1069 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx, arg, 0, arg1, arg2);
1070 break;
1071 case TCG_TYPE_V256:
1072 tcg_debug_assert(arg >= 16);
1073 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1074 arg, 0, arg1, arg2);
1075 break;
1076 default:
1077 g_assert_not_reached();
1081 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1082 TCGReg base, intptr_t ofs)
1084 int rexw = 0;
1085 if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1086 if (val != (int32_t)val) {
1087 return false;
1089 rexw = P_REXW;
1090 } else if (type != TCG_TYPE_I32) {
1091 return false;
1093 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1094 tcg_out32(s, val);
1095 return true;
1098 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1100 /* Propagate an opcode prefix, such as P_DATA16. */
1101 int ext = subopc & ~0x7;
1102 subopc &= 0x7;
1104 if (count == 1) {
1105 tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1106 } else {
1107 tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1108 tcg_out8(s, count);
1112 static inline void tcg_out_bswap32(TCGContext *s, int reg)
1114 tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1117 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1119 tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1122 static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
1124 /* movzbl */
1125 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1126 tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1129 static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
1131 /* movsbl */
1132 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1133 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1136 static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
1138 /* movzwl */
1139 tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1142 static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
1144 /* movsw[lq] */
1145 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1148 static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
1150 /* 32-bit mov zero extends. */
1151 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1154 static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
1156 tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1159 static inline void tcg_out_bswap64(TCGContext *s, int reg)
1161 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1164 static void tgen_arithi(TCGContext *s, int c, int r0,
1165 tcg_target_long val, int cf)
1167 int rexw = 0;
1169 if (TCG_TARGET_REG_BITS == 64) {
1170 rexw = c & -8;
1171 c &= 7;
1174 /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1175 partial flags update stalls on Pentium4 and are not recommended
1176 by current Intel optimization manuals. */
1177 if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1178 int is_inc = (c == ARITH_ADD) ^ (val < 0);
1179 if (TCG_TARGET_REG_BITS == 64) {
1180 /* The single-byte increment encodings are re-tasked as the
1181 REX prefixes. Use the MODRM encoding. */
1182 tcg_out_modrm(s, OPC_GRP5 + rexw,
1183 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1184 } else {
1185 tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1187 return;
1190 if (c == ARITH_AND) {
1191 if (TCG_TARGET_REG_BITS == 64) {
1192 if (val == 0xffffffffu) {
1193 tcg_out_ext32u(s, r0, r0);
1194 return;
1196 if (val == (uint32_t)val) {
1197 /* AND with no high bits set can use a 32-bit operation. */
1198 rexw = 0;
1201 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1202 tcg_out_ext8u(s, r0, r0);
1203 return;
1205 if (val == 0xffffu) {
1206 tcg_out_ext16u(s, r0, r0);
1207 return;
1211 if (val == (int8_t)val) {
1212 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1213 tcg_out8(s, val);
1214 return;
1216 if (rexw == 0 || val == (int32_t)val) {
1217 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1218 tcg_out32(s, val);
1219 return;
1222 tcg_abort();
1225 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1227 if (val != 0) {
1228 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1232 /* Use SMALL != 0 to force a short forward branch. */
1233 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
1235 int32_t val, val1;
1237 if (l->has_value) {
1238 val = tcg_pcrel_diff(s, l->u.value_ptr);
1239 val1 = val - 2;
1240 if ((int8_t)val1 == val1) {
1241 if (opc == -1) {
1242 tcg_out8(s, OPC_JMP_short);
1243 } else {
1244 tcg_out8(s, OPC_JCC_short + opc);
1246 tcg_out8(s, val1);
1247 } else {
1248 if (small) {
1249 tcg_abort();
1251 if (opc == -1) {
1252 tcg_out8(s, OPC_JMP_long);
1253 tcg_out32(s, val - 5);
1254 } else {
1255 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1256 tcg_out32(s, val - 6);
1259 } else if (small) {
1260 if (opc == -1) {
1261 tcg_out8(s, OPC_JMP_short);
1262 } else {
1263 tcg_out8(s, OPC_JCC_short + opc);
1265 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1266 s->code_ptr += 1;
1267 } else {
1268 if (opc == -1) {
1269 tcg_out8(s, OPC_JMP_long);
1270 } else {
1271 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1273 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1274 s->code_ptr += 4;
1278 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1279 int const_arg2, int rexw)
1281 if (const_arg2) {
1282 if (arg2 == 0) {
1283 /* test r, r */
1284 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1285 } else {
1286 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1288 } else {
1289 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1293 static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1294 TCGArg arg1, TCGArg arg2, int const_arg2,
1295 TCGLabel *label, int small)
1297 tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1298 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1301 #if TCG_TARGET_REG_BITS == 64
1302 static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1303 TCGArg arg1, TCGArg arg2, int const_arg2,
1304 TCGLabel *label, int small)
1306 tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1307 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1309 #else
1310 /* XXX: we implement it at the target level to avoid having to
1311 handle cross-basic-block temporaries */
1312 static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1313 const int *const_args, int small)
1315 TCGLabel *label_next = gen_new_label();
1316 TCGLabel *label_this = arg_label(args[5]);
1318 switch(args[4]) {
1319 case TCG_COND_EQ:
1320 tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1321 label_next, 1);
1322 tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1323 label_this, small);
1324 break;
1325 case TCG_COND_NE:
1326 tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1327 label_this, small);
1328 tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1329 label_this, small);
1330 break;
1331 case TCG_COND_LT:
1332 tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1333 label_this, small);
1334 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1335 tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1336 label_this, small);
1337 break;
1338 case TCG_COND_LE:
1339 tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1340 label_this, small);
1341 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1342 tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1343 label_this, small);
1344 break;
1345 case TCG_COND_GT:
1346 tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1347 label_this, small);
1348 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1349 tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1350 label_this, small);
1351 break;
1352 case TCG_COND_GE:
1353 tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1354 label_this, small);
1355 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1356 tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1357 label_this, small);
1358 break;
1359 case TCG_COND_LTU:
1360 tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1361 label_this, small);
1362 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1363 tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1364 label_this, small);
1365 break;
1366 case TCG_COND_LEU:
1367 tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1368 label_this, small);
1369 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1370 tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1371 label_this, small);
1372 break;
1373 case TCG_COND_GTU:
1374 tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1375 label_this, small);
1376 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1377 tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1378 label_this, small);
1379 break;
1380 case TCG_COND_GEU:
1381 tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1382 label_this, small);
1383 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1384 tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1385 label_this, small);
1386 break;
1387 default:
1388 tcg_abort();
1390 tcg_out_label(s, label_next, s->code_ptr);
1392 #endif
1394 static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1395 TCGArg arg1, TCGArg arg2, int const_arg2)
1397 tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1398 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1399 tcg_out_ext8u(s, dest, dest);
1402 #if TCG_TARGET_REG_BITS == 64
1403 static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1404 TCGArg arg1, TCGArg arg2, int const_arg2)
1406 tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1407 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1408 tcg_out_ext8u(s, dest, dest);
1410 #else
1411 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1412 const int *const_args)
1414 TCGArg new_args[6];
1415 TCGLabel *label_true, *label_over;
1417 memcpy(new_args, args+1, 5*sizeof(TCGArg));
1419 if (args[0] == args[1] || args[0] == args[2]
1420 || (!const_args[3] && args[0] == args[3])
1421 || (!const_args[4] && args[0] == args[4])) {
1422 /* When the destination overlaps with one of the argument
1423 registers, don't do anything tricky. */
1424 label_true = gen_new_label();
1425 label_over = gen_new_label();
1427 new_args[5] = label_arg(label_true);
1428 tcg_out_brcond2(s, new_args, const_args+1, 1);
1430 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1431 tcg_out_jxx(s, JCC_JMP, label_over, 1);
1432 tcg_out_label(s, label_true, s->code_ptr);
1434 tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1435 tcg_out_label(s, label_over, s->code_ptr);
1436 } else {
1437 /* When the destination does not overlap one of the arguments,
1438 clear the destination first, jump if cond false, and emit an
1439 increment in the true case. This results in smaller code. */
1441 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1443 label_over = gen_new_label();
1444 new_args[4] = tcg_invert_cond(new_args[4]);
1445 new_args[5] = label_arg(label_over);
1446 tcg_out_brcond2(s, new_args, const_args+1, 1);
1448 tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1449 tcg_out_label(s, label_over, s->code_ptr);
1452 #endif
1454 static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1455 TCGReg dest, TCGReg v1)
1457 if (have_cmov) {
1458 tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1459 } else {
1460 TCGLabel *over = gen_new_label();
1461 tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1462 tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1463 tcg_out_label(s, over, s->code_ptr);
1467 static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1468 TCGReg c1, TCGArg c2, int const_c2,
1469 TCGReg v1)
1471 tcg_out_cmp(s, c1, c2, const_c2, 0);
1472 tcg_out_cmov(s, cond, 0, dest, v1);
1475 #if TCG_TARGET_REG_BITS == 64
1476 static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1477 TCGReg c1, TCGArg c2, int const_c2,
1478 TCGReg v1)
1480 tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1481 tcg_out_cmov(s, cond, P_REXW, dest, v1);
1483 #endif
1485 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1486 TCGArg arg2, bool const_a2)
1488 if (have_bmi1) {
1489 tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1490 if (const_a2) {
1491 tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1492 } else {
1493 tcg_debug_assert(dest != arg2);
1494 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1496 } else {
1497 tcg_debug_assert(dest != arg2);
1498 tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1499 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1503 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1504 TCGArg arg2, bool const_a2)
1506 if (have_lzcnt) {
1507 tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1508 if (const_a2) {
1509 tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1510 } else {
1511 tcg_debug_assert(dest != arg2);
1512 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1514 } else {
1515 tcg_debug_assert(!const_a2);
1516 tcg_debug_assert(dest != arg1);
1517 tcg_debug_assert(dest != arg2);
1519 /* Recall that the output of BSR is the index not the count. */
1520 tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1521 tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1523 /* Since we have destroyed the flags from BSR, we have to re-test. */
1524 tcg_out_cmp(s, arg1, 0, 1, rexw);
1525 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1529 static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
1531 intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1533 if (disp == (int32_t)disp) {
1534 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1535 tcg_out32(s, disp);
1536 } else {
1537 /* rip-relative addressing into the constant pool.
1538 This is 6 + 8 = 14 bytes, as compared to using an
1539 immediate load (10 + 6 = 16 bytes), plus we may
1540 be able to re-use the pool constant for more calls. */
1541 tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1542 tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1543 new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1544 tcg_out32(s, 0);
1548 static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *dest)
1550 tcg_out_branch(s, 1, dest);
1553 static void tcg_out_jmp(TCGContext *s, tcg_insn_unit *dest)
1555 tcg_out_branch(s, 0, dest);
1558 static void tcg_out_nopn(TCGContext *s, int n)
1560 int i;
1561 /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1562 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1563 * duplicate prefix, and all of the interesting recent cores can
1564 * decode and discard the duplicates in a single cycle.
1566 tcg_debug_assert(n >= 1);
1567 for (i = 1; i < n; ++i) {
1568 tcg_out8(s, 0x66);
1570 tcg_out8(s, 0x90);
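/* e.g. tcg_out_nopn(s, 3) emits 66 66 90, a single "xchg %ax,%ax"
   instruction carrying one redundant operand-size prefix. */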
1573 #if defined(CONFIG_SOFTMMU)
1574 #include "tcg-ldst.inc.c"
1576 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1577 * int mmu_idx, uintptr_t ra)
1579 static void * const qemu_ld_helpers[16] = {
1580 [MO_UB] = helper_ret_ldub_mmu,
1581 [MO_LEUW] = helper_le_lduw_mmu,
1582 [MO_LEUL] = helper_le_ldul_mmu,
1583 [MO_LEQ] = helper_le_ldq_mmu,
1584 [MO_BEUW] = helper_be_lduw_mmu,
1585 [MO_BEUL] = helper_be_ldul_mmu,
1586 [MO_BEQ] = helper_be_ldq_mmu,
1589 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1590 * uintxx_t val, int mmu_idx, uintptr_t ra)
1592 static void * const qemu_st_helpers[16] = {
1593 [MO_UB] = helper_ret_stb_mmu,
1594 [MO_LEUW] = helper_le_stw_mmu,
1595 [MO_LEUL] = helper_le_stl_mmu,
1596 [MO_LEQ] = helper_le_stq_mmu,
1597 [MO_BEUW] = helper_be_stw_mmu,
1598 [MO_BEUL] = helper_be_stl_mmu,
1599 [MO_BEQ] = helper_be_stq_mmu,
1602 /* Perform the TLB load and compare.
1604 Inputs:
1605 ADDRLO and ADDRHI contain the low and high part of the address.
1607 MEM_INDEX and S_BITS are the memory context and log2 size of the load.
1609 WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1610 This should be offsetof addr_read or addr_write.
1612 Outputs:
1613 LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1614 positions of the displacements of forward jumps to the TLB miss case.
1616 Second argument register is loaded with the low part of the address.
1617 In the TLB hit case, it has been adjusted as indicated by the TLB
1618 and so is a host address. In the TLB miss case, it continues to
1619 hold a guest address.
1621 First argument register is clobbered. */
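/* Sketch of the lookup performed below (assuming the tlb_mask/tlb_table
   fields of CPUArchState used by this code):
     entry = tlb_table[mem_index]
             + ((addr >> (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS))
                & tlb_mask[mem_index]);
   followed by comparing the page/alignment-masked address against
   entry->addr_read or entry->addr_write (the WHICH offset). */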
1623 static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1624 int mem_index, TCGMemOp opc,
1625 tcg_insn_unit **label_ptr, int which)
1627 const TCGReg r0 = TCG_REG_L0;
1628 const TCGReg r1 = TCG_REG_L1;
1629 TCGType ttype = TCG_TYPE_I32;
1630 TCGType tlbtype = TCG_TYPE_I32;
1631 int trexw = 0, hrexw = 0, tlbrexw = 0;
1632 unsigned a_bits = get_alignment_bits(opc);
1633 unsigned s_bits = opc & MO_SIZE;
1634 unsigned a_mask = (1 << a_bits) - 1;
1635 unsigned s_mask = (1 << s_bits) - 1;
1636 target_ulong tlb_mask;
1638 if (TCG_TARGET_REG_BITS == 64) {
1639 if (TARGET_LONG_BITS == 64) {
1640 ttype = TCG_TYPE_I64;
1641 trexw = P_REXW;
1643 if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1644 hrexw = P_REXW;
1645 if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
1646 tlbtype = TCG_TYPE_I64;
1647 tlbrexw = P_REXW;
1652 tcg_out_mov(s, tlbtype, r0, addrlo);
1653 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1654 TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1656 tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
1657 offsetof(CPUArchState, tlb_mask[mem_index]));
1659 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
1660 offsetof(CPUArchState, tlb_table[mem_index]));
1662 /* If the required alignment is at least as large as the access, simply
1663 copy the address and mask. For lesser alignments, check that we don't
1664 cross pages for the complete access. */
1665 if (a_bits >= s_bits) {
1666 tcg_out_mov(s, ttype, r1, addrlo);
1667 } else {
1668 tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1670 tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1671 tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1673 /* cmp 0(r0), r1 */
1674 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);
1676 /* Prepare for both the fast path add of the tlb addend, and the slow
1677 path function argument setup. */
1678 tcg_out_mov(s, ttype, r1, addrlo);
1680 /* jne slow_path */
1681 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1682 label_ptr[0] = s->code_ptr;
1683 s->code_ptr += 4;
1685 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1686 /* cmp 4(r0), addrhi */
1687 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);
1689 /* jne slow_path */
1690 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1691 label_ptr[1] = s->code_ptr;
1692 s->code_ptr += 4;
1695 /* TLB Hit. */
1697 /* add addend(r0), r1 */
1698 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1699 offsetof(CPUTLBEntry, addend));
1703 * Record the context of a call to the out of line helper code for the slow path
1704 * for a load or store, so that we can later generate the correct helper code
1706 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
1707 TCGMemOpIdx oi,
1708 TCGReg datalo, TCGReg datahi,
1709 TCGReg addrlo, TCGReg addrhi,
1710 tcg_insn_unit *raddr,
1711 tcg_insn_unit **label_ptr)
1713 TCGLabelQemuLdst *label = new_ldst_label(s);
1715 label->is_ld = is_ld;
1716 label->oi = oi;
1717 label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
1718 label->datalo_reg = datalo;
1719 label->datahi_reg = datahi;
1720 label->addrlo_reg = addrlo;
1721 label->addrhi_reg = addrhi;
1722 label->raddr = raddr;
1723 label->label_ptr[0] = label_ptr[0];
1724 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1725 label->label_ptr[1] = label_ptr[1];
1730 * Generate code for the slow path for a load at the end of block
1732 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1734 TCGMemOpIdx oi = l->oi;
1735 TCGMemOp opc = get_memop(oi);
1736 TCGReg data_reg;
1737 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1738 int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
1740 /* resolve label address */
1741 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1742 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1743 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1746 if (TCG_TARGET_REG_BITS == 32) {
1747 int ofs = 0;
1749 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1750 ofs += 4;
1752 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1753 ofs += 4;
1755 if (TARGET_LONG_BITS == 64) {
1756 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1757 ofs += 4;
1760 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1761 ofs += 4;
1763 tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1764 } else {
1765 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1766 /* The second argument is already loaded with addrlo. */
1767 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1768 tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1769 (uintptr_t)l->raddr);
1772 tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1774 data_reg = l->datalo_reg;
1775 switch (opc & MO_SSIZE) {
1776 case MO_SB:
1777 tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
1778 break;
1779 case MO_SW:
1780 tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
1781 break;
1782 #if TCG_TARGET_REG_BITS == 64
1783 case MO_SL:
1784 tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1785 break;
1786 #endif
1787 case MO_UB:
1788 case MO_UW:
1789 /* Note that the helpers have zero-extended to tcg_target_long. */
1790 case MO_UL:
1791 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1792 break;
1793 case MO_Q:
1794 if (TCG_TARGET_REG_BITS == 64) {
1795 tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1796 } else if (data_reg == TCG_REG_EDX) {
1797 /* xchg %edx, %eax */
1798 tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1799 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1800 } else {
1801 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1802 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1804 break;
1805 default:
1806 tcg_abort();
1809 /* Jump to the code corresponding to the IR op following the qemu_ld */
1810 tcg_out_jmp(s, l->raddr);
1814 * Generate code for the slow path for a store at the end of block
1816 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1818 TCGMemOpIdx oi = l->oi;
1819 TCGMemOp opc = get_memop(oi);
1820 TCGMemOp s_bits = opc & MO_SIZE;
1821 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1822 TCGReg retaddr;
1824 /* resolve label address */
1825 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1826 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1827 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1830 if (TCG_TARGET_REG_BITS == 32) {
1831 int ofs = 0;
1833 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1834 ofs += 4;
1836 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1837 ofs += 4;
1839 if (TARGET_LONG_BITS == 64) {
1840 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1841 ofs += 4;
1844 tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1845 ofs += 4;
1847 if (s_bits == MO_64) {
1848 tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1849 ofs += 4;
1852 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1853 ofs += 4;
1855 retaddr = TCG_REG_EAX;
1856 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1857 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1858 } else {
1859 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1860 /* The second argument is already loaded with addrlo. */
1861 tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1862 tcg_target_call_iarg_regs[2], l->datalo_reg);
1863 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1865 if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1866 retaddr = tcg_target_call_iarg_regs[4];
1867 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1868 } else {
1869 retaddr = TCG_REG_RAX;
1870 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1871 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1872 TCG_TARGET_CALL_STACK_OFFSET);
1876 /* "Tail call" to the helper, with the return address back inline. */
1877 tcg_out_push(s, retaddr);
1878 tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1880 #elif TCG_TARGET_REG_BITS == 32
1881 # define x86_guest_base_seg 0
1882 # define x86_guest_base_index -1
1883 # define x86_guest_base_offset guest_base
1884 #else
1885 static int x86_guest_base_seg;
1886 static int x86_guest_base_index = -1;
1887 static int32_t x86_guest_base_offset;
1888 # if defined(__x86_64__) && defined(__linux__)
1889 # include <asm/prctl.h>
1890 # include <sys/prctl.h>
1891 int arch_prctl(int code, unsigned long addr);
1892 static inline int setup_guest_base_seg(void)
1893 {
1894 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1895 return P_GS;
1896 }
1897 return 0;
1898 }
1899 # elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
1900 # include <machine/sysarch.h>
1901 static inline int setup_guest_base_seg(void)
1902 {
1903 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1904 return P_GS;
1905 }
1906 return 0;
1907 }
1908 # else
1909 static inline int setup_guest_base_seg(void)
1910 {
1911 return 0;
1912 }
1913 # endif
1914 #endif /* SOFTMMU */
1916 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1917 TCGReg base, int index, intptr_t ofs,
1918 int seg, bool is64, TCGMemOp memop)
1920 const TCGMemOp real_bswap = memop & MO_BSWAP;
1921 TCGMemOp bswap = real_bswap;
1922 int rexw = is64 * P_REXW;
1923 int movop = OPC_MOVL_GvEv;
1925 if (have_movbe && real_bswap) {
1926 bswap = 0;
1927 movop = OPC_MOVBE_GyMy;
1930 switch (memop & MO_SSIZE) {
1931 case MO_UB:
1932 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
1933 base, index, 0, ofs);
1934 break;
1935 case MO_SB:
1936 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
1937 base, index, 0, ofs);
1938 break;
1939 case MO_UW:
1940 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1941 base, index, 0, ofs);
1942 if (real_bswap) {
1943 tcg_out_rolw_8(s, datalo);
1945 break;
1946 case MO_SW:
1947 if (real_bswap) {
1948 if (have_movbe) {
1949 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1950 datalo, base, index, 0, ofs);
1951 } else {
1952 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1953 base, index, 0, ofs);
1954 tcg_out_rolw_8(s, datalo);
1956 tcg_out_modrm(s, OPC_MOVSWL + rexw, datalo, datalo);
1957 } else {
1958 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
1959 datalo, base, index, 0, ofs);
1961 break;
1962 case MO_UL:
1963 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
1964 if (bswap) {
1965 tcg_out_bswap32(s, datalo);
1967 break;
1968 #if TCG_TARGET_REG_BITS == 64
1969 case MO_SL:
1970 if (real_bswap) {
1971 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1972 base, index, 0, ofs);
1973 if (bswap) {
1974 tcg_out_bswap32(s, datalo);
1976 tcg_out_ext32s(s, datalo, datalo);
1977 } else {
1978 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
1979 base, index, 0, ofs);
1981 break;
1982 #endif
1983 case MO_Q:
1984 if (TCG_TARGET_REG_BITS == 64) {
1985 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
1986 base, index, 0, ofs);
1987 if (bswap) {
1988 tcg_out_bswap64(s, datalo);
1990 } else {
1991 if (real_bswap) {
1992 int t = datalo;
1993 datalo = datahi;
1994 datahi = t;
1996 if (base != datalo) {
1997 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1998 base, index, 0, ofs);
1999 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2000 base, index, 0, ofs + 4);
2001 } else {
2002 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2003 base, index, 0, ofs + 4);
2004 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2005 base, index, 0, ofs);
2007 if (bswap) {
2008 tcg_out_bswap32(s, datalo);
2009 tcg_out_bswap32(s, datahi);
2012 break;
2013 default:
2014 tcg_abort();
2018 /* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
2019 EAX. That will become useful once fixed-register globals are less
2020 common. */
2021 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
2023 TCGReg datalo, datahi, addrlo;
2024 TCGReg addrhi __attribute__((unused));
2025 TCGMemOpIdx oi;
2026 TCGMemOp opc;
2027 #if defined(CONFIG_SOFTMMU)
2028 int mem_index;
2029 tcg_insn_unit *label_ptr[2];
2030 #endif
2032 datalo = *args++;
2033 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2034 addrlo = *args++;
2035 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2036 oi = *args++;
2037 opc = get_memop(oi);
2039 #if defined(CONFIG_SOFTMMU)
2040 mem_index = get_mmuidx(oi);
2042 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2043 label_ptr, offsetof(CPUTLBEntry, addr_read));
2045 /* TLB Hit. */
2046 tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
2048 /* Record the current context of a load into ldst label */
2049 add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2050 s->code_ptr, label_ptr);
2051 #else
2052 tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2053 x86_guest_base_offset, x86_guest_base_seg,
2054 is64, opc);
2055 #endif
2058 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2059 TCGReg base, int index, intptr_t ofs,
2060 int seg, TCGMemOp memop)
2062 /* ??? Ideally we wouldn't need a scratch register. For user-only,
2063 we could perform the bswap twice to restore the original value
2064 instead of moving to the scratch. But as it is, the L constraint
2065 means that TCG_REG_L0 is definitely free here. */
2066 const TCGReg scratch = TCG_REG_L0;
2067 const TCGMemOp real_bswap = memop & MO_BSWAP;
2068 TCGMemOp bswap = real_bswap;
2069 int movop = OPC_MOVL_EvGv;
2071 if (have_movbe && real_bswap) {
2072 bswap = 0;
2073 movop = OPC_MOVBE_MyGy;
2076 switch (memop & MO_SIZE) {
2077 case MO_8:
2078 /* In 32-bit mode, 8-bit stores can only happen from [abcd]x.
2079 Use the scratch register if necessary. */
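/* Without a REX prefix (unavailable in 32-bit mode), byte-register
   encodings 4..7 select AH/CH/DH/BH rather than the low byte of
   %esp/%ebp/%esi/%edi, hence the restriction to %eax, %ecx, %edx, %ebx. */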
2080 if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
2081 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2082 datalo = scratch;
2084 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2085 datalo, base, index, 0, ofs);
2086 break;
2087 case MO_16:
2088 if (bswap) {
2089 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2090 tcg_out_rolw_8(s, scratch);
2091 datalo = scratch;
2093 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
2094 base, index, 0, ofs);
2095 break;
2096 case MO_32:
2097 if (bswap) {
2098 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2099 tcg_out_bswap32(s, scratch);
2100 datalo = scratch;
2102 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2103 break;
2104 case MO_64:
2105 if (TCG_TARGET_REG_BITS == 64) {
2106 if (bswap) {
2107 tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo);
2108 tcg_out_bswap64(s, scratch);
2109 datalo = scratch;
2111 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2112 base, index, 0, ofs);
2113 } else if (bswap) {
2114 tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
2115 tcg_out_bswap32(s, scratch);
2116 tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
2117 base, index, 0, ofs);
2118 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2119 tcg_out_bswap32(s, scratch);
2120 tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
2121 base, index, 0, ofs + 4);
2122 } else {
2123 if (real_bswap) {
2124 int t = datalo;
2125 datalo = datahi;
2126 datahi = t;
2128 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2129 base, index, 0, ofs);
2130 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2131 base, index, 0, ofs + 4);
2133 break;
2134 default:
2135 tcg_abort();
2139 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2141 TCGReg datalo, datahi, addrlo;
2142 TCGReg addrhi __attribute__((unused));
2143 TCGMemOpIdx oi;
2144 TCGMemOp opc;
2145 #if defined(CONFIG_SOFTMMU)
2146 int mem_index;
2147 tcg_insn_unit *label_ptr[2];
2148 #endif
2150 datalo = *args++;
2151 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2152 addrlo = *args++;
2153 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2154 oi = *args++;
2155 opc = get_memop(oi);
2157 #if defined(CONFIG_SOFTMMU)
2158 mem_index = get_mmuidx(oi);
2160 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2161 label_ptr, offsetof(CPUTLBEntry, addr_write));
2163 /* TLB Hit. */
2164 tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2166 /* Record the current context of a store into ldst label */
2167 add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
2168 s->code_ptr, label_ptr);
2169 #else
2170 tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2171 x86_guest_base_offset, x86_guest_base_seg, opc);
2172 #endif
2175 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2176 const TCGArg *args, const int *const_args)
2178 TCGArg a0, a1, a2;
2179 int c, const_a2, vexop, rexw = 0;
2181 #if TCG_TARGET_REG_BITS == 64
2182 # define OP_32_64(x) \
2183 case glue(glue(INDEX_op_, x), _i64): \
2184 rexw = P_REXW; /* FALLTHRU */ \
2185 case glue(glue(INDEX_op_, x), _i32)
2186 #else
2187 # define OP_32_64(x) \
2188 case glue(glue(INDEX_op_, x), _i32)
2189 #endif
2191 /* Hoist the loads of the most common arguments. */
2192 a0 = args[0];
2193 a1 = args[1];
2194 a2 = args[2];
2195 const_a2 = const_args[2];
2197 switch (opc) {
2198 case INDEX_op_exit_tb:
2199 /* Reuse the zeroing that exists for goto_ptr. */
2200 if (a0 == 0) {
2201 tcg_out_jmp(s, s->code_gen_epilogue);
2202 } else {
2203 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2204 tcg_out_jmp(s, tb_ret_addr);
2206 break;
2207 case INDEX_op_goto_tb:
2208 if (s->tb_jmp_insn_offset) {
2209 /* direct jump method */
2210 int gap;
2211 /* jump displacement must be aligned for atomic patching;
2212 * see if we need to add extra nops before jump
2213 */
2214 gap = tcg_pcrel_diff(s, QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4));
2215 if (gap != 1) {
2216 tcg_out_nopn(s, gap - 1);
2217 }
2218 tcg_out8(s, OPC_JMP_long); /* jmp im */
2219 s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2220 tcg_out32(s, 0);
2221 } else {
2222 /* indirect jump method */
2223 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2224 (intptr_t)(s->tb_jmp_target_addr + a0));
2226 set_jmp_reset_offset(s, a0);
2227 break;
2228 case INDEX_op_goto_ptr:
2229 /* jmp to the given host address (could be epilogue) */
2230 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2231 break;
2232 case INDEX_op_br:
2233 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2234 break;
2235 OP_32_64(ld8u):
2236 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
2237 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2238 break;
2239 OP_32_64(ld8s):
2240 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2241 break;
2242 OP_32_64(ld16u):
2243 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
2244 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2245 break;
2246 OP_32_64(ld16s):
2247 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2248 break;
2249 #if TCG_TARGET_REG_BITS == 64
2250 case INDEX_op_ld32u_i64:
2251 #endif
2252 case INDEX_op_ld_i32:
2253 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2254 break;
2256 OP_32_64(st8):
2257 if (const_args[0]) {
2258 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2259 tcg_out8(s, a0);
2260 } else {
2261 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2263 break;
2264 OP_32_64(st16):
2265 if (const_args[0]) {
2266 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2267 tcg_out16(s, a0);
2268 } else {
2269 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2271 break;
2272 #if TCG_TARGET_REG_BITS == 64
2273 case INDEX_op_st32_i64:
2274 #endif
2275 case INDEX_op_st_i32:
2276 if (const_args[0]) {
2277 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2278 tcg_out32(s, a0);
2279 } else {
2280 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2282 break;
2284 OP_32_64(add):
2285 /* For 3-operand addition, use LEA. */
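/* LEA computes a1 + a2 (plus an optional displacement) into a0 without
   modifying either source and without touching the flags. */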
2286 if (a0 != a1) {
2287 TCGArg c3 = 0;
2288 if (const_a2) {
2289 c3 = a2, a2 = -1;
2290 } else if (a0 == a2) {
2291 /* Watch out for dest = src + dest, since we've removed
2292 the matching constraint on the add. */
2293 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2294 break;
2297 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2298 break;
2300 c = ARITH_ADD;
2301 goto gen_arith;
2302 OP_32_64(sub):
2303 c = ARITH_SUB;
2304 goto gen_arith;
2305 OP_32_64(and):
2306 c = ARITH_AND;
2307 goto gen_arith;
2308 OP_32_64(or):
2309 c = ARITH_OR;
2310 goto gen_arith;
2311 OP_32_64(xor):
2312 c = ARITH_XOR;
2313 goto gen_arith;
2314 gen_arith:
2315 if (const_a2) {
2316 tgen_arithi(s, c + rexw, a0, a2, 0);
2317 } else {
2318 tgen_arithr(s, c + rexw, a0, a2);
2320 break;
2322 OP_32_64(andc):
2323 if (const_a2) {
2324 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2325 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2326 } else {
2327 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2329 break;
2331 OP_32_64(mul):
2332 if (const_a2) {
2333 int32_t val;
2334 val = a2;
2335 if (val == (int8_t)val) {
2336 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2337 tcg_out8(s, val);
2338 } else {
2339 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2340 tcg_out32(s, val);
2342 } else {
2343 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2345 break;
2347 OP_32_64(div2):
2348 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2349 break;
2350 OP_32_64(divu2):
2351 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2352 break;
2354 OP_32_64(shl):
2355 /* For small constant 3-operand shift, use LEA. */
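/* Only counts 1..3 qualify: LEA's scale field encodes multiplication by
   2, 4 or 8, and the count-1 case is expressed as a1 + a1 to avoid the
   disp32 that a scaled index with no base register would require. */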
2356 if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2357 if (a2 - 1 == 0) {
2358 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2359 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2360 } else {
2361 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2362 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2364 break;
2366 c = SHIFT_SHL;
2367 vexop = OPC_SHLX;
2368 goto gen_shift_maybe_vex;
2369 OP_32_64(shr):
2370 c = SHIFT_SHR;
2371 vexop = OPC_SHRX;
2372 goto gen_shift_maybe_vex;
2373 OP_32_64(sar):
2374 c = SHIFT_SAR;
2375 vexop = OPC_SARX;
2376 goto gen_shift_maybe_vex;
2377 OP_32_64(rotl):
2378 c = SHIFT_ROL;
2379 goto gen_shift;
2380 OP_32_64(rotr):
2381 c = SHIFT_ROR;
2382 goto gen_shift;
2383 gen_shift_maybe_vex:
2384 if (have_bmi2) {
2385 if (!const_a2) {
2386 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2387 break;
2389 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2391 /* FALLTHRU */
2392 gen_shift:
2393 if (const_a2) {
2394 tcg_out_shifti(s, c + rexw, a0, a2);
2395 } else {
2396 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2398 break;
2400 OP_32_64(ctz):
2401 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2402 break;
2403 OP_32_64(clz):
2404 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2405 break;
2406 OP_32_64(ctpop):
2407 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2408 break;
2410 case INDEX_op_brcond_i32:
2411 tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2412 break;
2413 case INDEX_op_setcond_i32:
2414 tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2415 break;
2416 case INDEX_op_movcond_i32:
2417 tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2418 break;
2420 OP_32_64(bswap16):
2421 tcg_out_rolw_8(s, a0);
2422 break;
2423 OP_32_64(bswap32):
2424 tcg_out_bswap32(s, a0);
2425 break;
2427 OP_32_64(neg):
2428 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2429 break;
2430 OP_32_64(not):
2431 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2432 break;
2434 OP_32_64(ext8s):
2435 tcg_out_ext8s(s, a0, a1, rexw);
2436 break;
2437 OP_32_64(ext16s):
2438 tcg_out_ext16s(s, a0, a1, rexw);
2439 break;
2440 OP_32_64(ext8u):
2441 tcg_out_ext8u(s, a0, a1);
2442 break;
2443 OP_32_64(ext16u):
2444 tcg_out_ext16u(s, a0, a1);
2445 break;
2447 case INDEX_op_qemu_ld_i32:
2448 tcg_out_qemu_ld(s, args, 0);
2449 break;
2450 case INDEX_op_qemu_ld_i64:
2451 tcg_out_qemu_ld(s, args, 1);
2452 break;
2453 case INDEX_op_qemu_st_i32:
2454 tcg_out_qemu_st(s, args, 0);
2455 break;
2456 case INDEX_op_qemu_st_i64:
2457 tcg_out_qemu_st(s, args, 1);
2458 break;
2460 OP_32_64(mulu2):
2461 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2462 break;
2463 OP_32_64(muls2):
2464 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2465 break;
2466 OP_32_64(add2):
2467 if (const_args[4]) {
2468 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2469 } else {
2470 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2472 if (const_args[5]) {
2473 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2474 } else {
2475 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2477 break;
2478 OP_32_64(sub2):
2479 if (const_args[4]) {
2480 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2481 } else {
2482 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2484 if (const_args[5]) {
2485 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2486 } else {
2487 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2489 break;
2491 #if TCG_TARGET_REG_BITS == 32
2492 case INDEX_op_brcond2_i32:
2493 tcg_out_brcond2(s, args, const_args, 0);
2494 break;
2495 case INDEX_op_setcond2_i32:
2496 tcg_out_setcond2(s, args, const_args);
2497 break;
2498 #else /* TCG_TARGET_REG_BITS == 64 */
2499 case INDEX_op_ld32s_i64:
2500 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2501 break;
2502 case INDEX_op_ld_i64:
2503 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2504 break;
2505 case INDEX_op_st_i64:
2506 if (const_args[0]) {
2507 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2508 tcg_out32(s, a0);
2509 } else {
2510 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2512 break;
2514 case INDEX_op_brcond_i64:
2515 tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2516 break;
2517 case INDEX_op_setcond_i64:
2518 tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2519 break;
2520 case INDEX_op_movcond_i64:
2521 tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2522 break;
2524 case INDEX_op_bswap64_i64:
2525 tcg_out_bswap64(s, a0);
2526 break;
2527 case INDEX_op_extu_i32_i64:
2528 case INDEX_op_ext32u_i64:
2529 case INDEX_op_extrl_i64_i32:
2530 tcg_out_ext32u(s, a0, a1);
2531 break;
2532 case INDEX_op_ext_i32_i64:
2533 case INDEX_op_ext32s_i64:
2534 tcg_out_ext32s(s, a0, a1);
2535 break;
2536 case INDEX_op_extrh_i64_i32:
2537 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2538 break;
2539 #endif
2541 OP_32_64(deposit):
2542 if (args[3] == 0 && args[4] == 8) {
2543 /* load bits 0..7 */
2544 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2545 } else if (args[3] == 8 && args[4] == 8) {
2546 /* load bits 8..15 */
2547 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2548 } else if (args[3] == 0 && args[4] == 16) {
2549 /* load bits 0..15 */
2550 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2551 } else {
2552 tcg_abort();
2554 break;
2556 case INDEX_op_extract_i64:
2557 if (a2 + args[3] == 32) {
2558 /* This is a 32-bit zero-extending right shift. */
2559 tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2560 tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2561 break;
2563 /* FALLTHRU */
2564 case INDEX_op_extract_i32:
2565 /* On the off-chance that we can use the high-byte registers.
2566 Otherwise we emit the same ext16 + shift pattern that we
2567 would have gotten from the normal tcg-op.c expansion. */
2568 tcg_debug_assert(a2 == 8 && args[3] == 8);
2569 if (a1 < 4 && a0 < 8) {
2570 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2571 } else {
2572 tcg_out_ext16u(s, a0, a1);
2573 tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2575 break;
2577 case INDEX_op_sextract_i32:
2578 /* We don't implement sextract_i64, as we cannot sign-extend to
2579 64-bits without using the REX prefix that explicitly excludes
2580 access to the high-byte registers. */
2581 tcg_debug_assert(a2 == 8 && args[3] == 8);
2582 if (a1 < 4 && a0 < 8) {
2583 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2584 } else {
2585 tcg_out_ext16s(s, a0, a1, 0);
2586 tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2588 break;
2590 case INDEX_op_mb:
2591 tcg_out_mb(s, a0);
2592 break;
2593 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
2594 case INDEX_op_mov_i64:
2595 case INDEX_op_mov_vec:
2596 case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
2597 case INDEX_op_movi_i64:
2598 case INDEX_op_dupi_vec:
2599 case INDEX_op_call: /* Always emitted via tcg_out_call. */
2600 default:
2601 tcg_abort();
2604 #undef OP_32_64
2607 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2608 unsigned vecl, unsigned vece,
2609 const TCGArg *args, const int *const_args)
2611 static int const add_insn[4] = {
2612 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2614 static int const ssadd_insn[4] = {
2615 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2617 static int const usadd_insn[4] = {
2618 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2620 static int const sub_insn[4] = {
2621 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2623 static int const sssub_insn[4] = {
2624 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2626 static int const ussub_insn[4] = {
2627 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2629 static int const mul_insn[4] = {
2630 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
2632 static int const shift_imm_insn[4] = {
2633 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2635 static int const cmpeq_insn[4] = {
2636 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2638 static int const cmpgt_insn[4] = {
2639 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2641 static int const punpckl_insn[4] = {
2642 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2644 static int const punpckh_insn[4] = {
2645 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2647 static int const packss_insn[4] = {
2648 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2650 static int const packus_insn[4] = {
2651 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2653 static int const smin_insn[4] = {
2654 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
2656 static int const smax_insn[4] = {
2657 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
2659 static int const umin_insn[4] = {
2660 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
2662 static int const umax_insn[4] = {
2663 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
2666 TCGType type = vecl + TCG_TYPE_V64;
2667 int insn, sub;
2668 TCGArg a0, a1, a2;
2670 a0 = args[0];
2671 a1 = args[1];
2672 a2 = args[2];
2674 switch (opc) {
2675 case INDEX_op_add_vec:
2676 insn = add_insn[vece];
2677 goto gen_simd;
2678 case INDEX_op_ssadd_vec:
2679 insn = ssadd_insn[vece];
2680 goto gen_simd;
2681 case INDEX_op_usadd_vec:
2682 insn = usadd_insn[vece];
2683 goto gen_simd;
2684 case INDEX_op_sub_vec:
2685 insn = sub_insn[vece];
2686 goto gen_simd;
2687 case INDEX_op_sssub_vec:
2688 insn = sssub_insn[vece];
2689 goto gen_simd;
2690 case INDEX_op_ussub_vec:
2691 insn = ussub_insn[vece];
2692 goto gen_simd;
2693 case INDEX_op_mul_vec:
2694 insn = mul_insn[vece];
2695 goto gen_simd;
2696 case INDEX_op_and_vec:
2697 insn = OPC_PAND;
2698 goto gen_simd;
2699 case INDEX_op_or_vec:
2700 insn = OPC_POR;
2701 goto gen_simd;
2702 case INDEX_op_xor_vec:
2703 insn = OPC_PXOR;
2704 goto gen_simd;
2705 case INDEX_op_smin_vec:
2706 insn = smin_insn[vece];
2707 goto gen_simd;
2708 case INDEX_op_umin_vec:
2709 insn = umin_insn[vece];
2710 goto gen_simd;
2711 case INDEX_op_smax_vec:
2712 insn = smax_insn[vece];
2713 goto gen_simd;
2714 case INDEX_op_umax_vec:
2715 insn = umax_insn[vece];
2716 goto gen_simd;
2717 case INDEX_op_x86_punpckl_vec:
2718 insn = punpckl_insn[vece];
2719 goto gen_simd;
2720 case INDEX_op_x86_punpckh_vec:
2721 insn = punpckh_insn[vece];
2722 goto gen_simd;
2723 case INDEX_op_x86_packss_vec:
2724 insn = packss_insn[vece];
2725 goto gen_simd;
2726 case INDEX_op_x86_packus_vec:
2727 insn = packus_insn[vece];
2728 goto gen_simd;
2729 #if TCG_TARGET_REG_BITS == 32
2730 case INDEX_op_dup2_vec:
2731 /* Constraints have already placed both 32-bit inputs in xmm regs. */
2732 insn = OPC_PUNPCKLDQ;
2733 goto gen_simd;
2734 #endif
2735 gen_simd:
2736 tcg_debug_assert(insn != OPC_UD2);
2737 if (type == TCG_TYPE_V256) {
2738 insn |= P_VEXL;
2740 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2741 break;
2743 case INDEX_op_cmp_vec:
2744 sub = args[3];
2745 if (sub == TCG_COND_EQ) {
2746 insn = cmpeq_insn[vece];
2747 } else if (sub == TCG_COND_GT) {
2748 insn = cmpgt_insn[vece];
2749 } else {
2750 g_assert_not_reached();
2752 goto gen_simd;
2754 case INDEX_op_andc_vec:
2755 insn = OPC_PANDN;
2756 if (type == TCG_TYPE_V256) {
2757 insn |= P_VEXL;
2759 tcg_out_vex_modrm(s, insn, a0, a2, a1);
2760 break;
2762 case INDEX_op_shli_vec:
2763 sub = 6;
2764 goto gen_shift;
2765 case INDEX_op_shri_vec:
2766 sub = 2;
2767 goto gen_shift;
2768 case INDEX_op_sari_vec:
2769 tcg_debug_assert(vece != MO_64);
2770 sub = 4;
2771 gen_shift:
2772 tcg_debug_assert(vece != MO_8);
2773 insn = shift_imm_insn[vece];
2774 if (type == TCG_TYPE_V256) {
2775 insn |= P_VEXL;
2777 tcg_out_vex_modrm(s, insn, sub, a0, a1);
2778 tcg_out8(s, a2);
2779 break;
2781 case INDEX_op_ld_vec:
2782 tcg_out_ld(s, type, a0, a1, a2);
2783 break;
2784 case INDEX_op_st_vec:
2785 tcg_out_st(s, type, a0, a1, a2);
2786 break;
2787 case INDEX_op_dup_vec:
2788 tcg_out_dup_vec(s, type, vece, a0, a1);
2789 break;
2791 case INDEX_op_x86_shufps_vec:
2792 insn = OPC_SHUFPS;
2793 sub = args[3];
2794 goto gen_simd_imm8;
2795 case INDEX_op_x86_blend_vec:
2796 if (vece == MO_16) {
2797 insn = OPC_PBLENDW;
2798 } else if (vece == MO_32) {
2799 insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2800 } else {
2801 g_assert_not_reached();
2803 sub = args[3];
2804 goto gen_simd_imm8;
2805 case INDEX_op_x86_vperm2i128_vec:
2806 insn = OPC_VPERM2I128;
2807 sub = args[3];
2808 goto gen_simd_imm8;
2809 gen_simd_imm8:
2810 if (type == TCG_TYPE_V256) {
2811 insn |= P_VEXL;
2813 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2814 tcg_out8(s, sub);
2815 break;
2817 case INDEX_op_x86_vpblendvb_vec:
2818 insn = OPC_VPBLENDVB;
2819 if (type == TCG_TYPE_V256) {
2820 insn |= P_VEXL;
2822 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2823 tcg_out8(s, args[3] << 4);
2824 break;
2826 case INDEX_op_x86_psrldq_vec:
2827 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
2828 tcg_out8(s, a2);
2829 break;
2831 default:
2832 g_assert_not_reached();
2836 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
2838 static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
2839 static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
2840 static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
2841 static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
2842 static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
2843 static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
2844 static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
2845 static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
2846 static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
2847 static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
2848 static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
2849 static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
2850 static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
2851 static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
2852 static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
2853 static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
2854 static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
2855 static const TCGTargetOpDef r_r_L_L
2856 = { .args_ct_str = { "r", "r", "L", "L" } };
2857 static const TCGTargetOpDef L_L_L_L
2858 = { .args_ct_str = { "L", "L", "L", "L" } };
2859 static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
2860 static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
2861 static const TCGTargetOpDef x_x_x_x
2862 = { .args_ct_str = { "x", "x", "x", "x" } };
2863 static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };
2865 switch (op) {
2866 case INDEX_op_goto_ptr:
2867 return &r;
2869 case INDEX_op_ld8u_i32:
2870 case INDEX_op_ld8u_i64:
2871 case INDEX_op_ld8s_i32:
2872 case INDEX_op_ld8s_i64:
2873 case INDEX_op_ld16u_i32:
2874 case INDEX_op_ld16u_i64:
2875 case INDEX_op_ld16s_i32:
2876 case INDEX_op_ld16s_i64:
2877 case INDEX_op_ld_i32:
2878 case INDEX_op_ld32u_i64:
2879 case INDEX_op_ld32s_i64:
2880 case INDEX_op_ld_i64:
2881 return &r_r;
2883 case INDEX_op_st8_i32:
2884 case INDEX_op_st8_i64:
2885 return &qi_r;
2886 case INDEX_op_st16_i32:
2887 case INDEX_op_st16_i64:
2888 case INDEX_op_st_i32:
2889 case INDEX_op_st32_i64:
2890 return &ri_r;
2891 case INDEX_op_st_i64:
2892 return &re_r;
2894 case INDEX_op_add_i32:
2895 case INDEX_op_add_i64:
2896 return &r_r_re;
2897 case INDEX_op_sub_i32:
2898 case INDEX_op_sub_i64:
2899 case INDEX_op_mul_i32:
2900 case INDEX_op_mul_i64:
2901 case INDEX_op_or_i32:
2902 case INDEX_op_or_i64:
2903 case INDEX_op_xor_i32:
2904 case INDEX_op_xor_i64:
2905 return &r_0_re;
2907 case INDEX_op_and_i32:
2908 case INDEX_op_and_i64:
2910 static const TCGTargetOpDef and
2911 = { .args_ct_str = { "r", "0", "reZ" } };
2912 return &and;
2914 break;
2915 case INDEX_op_andc_i32:
2916 case INDEX_op_andc_i64:
2918 static const TCGTargetOpDef andc
2919 = { .args_ct_str = { "r", "r", "rI" } };
2920 return &andc;
2922 break;
2924 case INDEX_op_shl_i32:
2925 case INDEX_op_shl_i64:
2926 case INDEX_op_shr_i32:
2927 case INDEX_op_shr_i64:
2928 case INDEX_op_sar_i32:
2929 case INDEX_op_sar_i64:
2930 return have_bmi2 ? &r_r_ri : &r_0_ci;
2931 case INDEX_op_rotl_i32:
2932 case INDEX_op_rotl_i64:
2933 case INDEX_op_rotr_i32:
2934 case INDEX_op_rotr_i64:
2935 return &r_0_ci;
2937 case INDEX_op_brcond_i32:
2938 case INDEX_op_brcond_i64:
2939 return &r_re;
2941 case INDEX_op_bswap16_i32:
2942 case INDEX_op_bswap16_i64:
2943 case INDEX_op_bswap32_i32:
2944 case INDEX_op_bswap32_i64:
2945 case INDEX_op_bswap64_i64:
2946 case INDEX_op_neg_i32:
2947 case INDEX_op_neg_i64:
2948 case INDEX_op_not_i32:
2949 case INDEX_op_not_i64:
2950 case INDEX_op_extrh_i64_i32:
2951 return &r_0;
2953 case INDEX_op_ext8s_i32:
2954 case INDEX_op_ext8s_i64:
2955 case INDEX_op_ext8u_i32:
2956 case INDEX_op_ext8u_i64:
2957 return &r_q;
2958 case INDEX_op_ext16s_i32:
2959 case INDEX_op_ext16s_i64:
2960 case INDEX_op_ext16u_i32:
2961 case INDEX_op_ext16u_i64:
2962 case INDEX_op_ext32s_i64:
2963 case INDEX_op_ext32u_i64:
2964 case INDEX_op_ext_i32_i64:
2965 case INDEX_op_extu_i32_i64:
2966 case INDEX_op_extrl_i64_i32:
2967 case INDEX_op_extract_i32:
2968 case INDEX_op_extract_i64:
2969 case INDEX_op_sextract_i32:
2970 case INDEX_op_ctpop_i32:
2971 case INDEX_op_ctpop_i64:
2972 return &r_r;
2974 case INDEX_op_deposit_i32:
2975 case INDEX_op_deposit_i64:
2977 static const TCGTargetOpDef dep
2978 = { .args_ct_str = { "Q", "0", "Q" } };
2979 return &dep;
2981 case INDEX_op_setcond_i32:
2982 case INDEX_op_setcond_i64:
2984 static const TCGTargetOpDef setc
2985 = { .args_ct_str = { "q", "r", "re" } };
2986 return &setc;
2988 case INDEX_op_movcond_i32:
2989 case INDEX_op_movcond_i64:
2991 static const TCGTargetOpDef movc
2992 = { .args_ct_str = { "r", "r", "re", "r", "0" } };
2993 return &movc;
2995 case INDEX_op_div2_i32:
2996 case INDEX_op_div2_i64:
2997 case INDEX_op_divu2_i32:
2998 case INDEX_op_divu2_i64:
3000 static const TCGTargetOpDef div2
3001 = { .args_ct_str = { "a", "d", "0", "1", "r" } };
3002 return &div2;
3004 case INDEX_op_mulu2_i32:
3005 case INDEX_op_mulu2_i64:
3006 case INDEX_op_muls2_i32:
3007 case INDEX_op_muls2_i64:
3009 static const TCGTargetOpDef mul2
3010 = { .args_ct_str = { "a", "d", "a", "r" } };
3011 return &mul2;
3013 case INDEX_op_add2_i32:
3014 case INDEX_op_add2_i64:
3015 case INDEX_op_sub2_i32:
3016 case INDEX_op_sub2_i64:
3018 static const TCGTargetOpDef arith2
3019 = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
3020 return &arith2;
3022 case INDEX_op_ctz_i32:
3023 case INDEX_op_ctz_i64:
3025 static const TCGTargetOpDef ctz[2] = {
3026 { .args_ct_str = { "&r", "r", "r" } },
3027 { .args_ct_str = { "&r", "r", "rW" } },
3029 return &ctz[have_bmi1];
3031 case INDEX_op_clz_i32:
3032 case INDEX_op_clz_i64:
3034 static const TCGTargetOpDef clz[2] = {
3035 { .args_ct_str = { "&r", "r", "r" } },
3036 { .args_ct_str = { "&r", "r", "rW" } },
3038 return &clz[have_lzcnt];
3041 case INDEX_op_qemu_ld_i32:
3042 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
3043 case INDEX_op_qemu_st_i32:
3044 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
3045 case INDEX_op_qemu_ld_i64:
3046 return (TCG_TARGET_REG_BITS == 64 ? &r_L
3047 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
3048 : &r_r_L_L);
3049 case INDEX_op_qemu_st_i64:
3050 return (TCG_TARGET_REG_BITS == 64 ? &L_L
3051 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
3052 : &L_L_L_L);
3054 case INDEX_op_brcond2_i32:
3056 static const TCGTargetOpDef b2
3057 = { .args_ct_str = { "r", "r", "ri", "ri" } };
3058 return &b2;
3060 case INDEX_op_setcond2_i32:
3062 static const TCGTargetOpDef s2
3063 = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
3064 return &s2;
3067 case INDEX_op_ld_vec:
3068 case INDEX_op_st_vec:
3069 return &x_r;
3071 case INDEX_op_add_vec:
3072 case INDEX_op_sub_vec:
3073 case INDEX_op_mul_vec:
3074 case INDEX_op_and_vec:
3075 case INDEX_op_or_vec:
3076 case INDEX_op_xor_vec:
3077 case INDEX_op_andc_vec:
3078 case INDEX_op_ssadd_vec:
3079 case INDEX_op_usadd_vec:
3080 case INDEX_op_sssub_vec:
3081 case INDEX_op_ussub_vec:
3082 case INDEX_op_smin_vec:
3083 case INDEX_op_umin_vec:
3084 case INDEX_op_smax_vec:
3085 case INDEX_op_umax_vec:
3086 case INDEX_op_cmp_vec:
3087 case INDEX_op_x86_shufps_vec:
3088 case INDEX_op_x86_blend_vec:
3089 case INDEX_op_x86_packss_vec:
3090 case INDEX_op_x86_packus_vec:
3091 case INDEX_op_x86_vperm2i128_vec:
3092 case INDEX_op_x86_punpckl_vec:
3093 case INDEX_op_x86_punpckh_vec:
3094 #if TCG_TARGET_REG_BITS == 32
3095 case INDEX_op_dup2_vec:
3096 #endif
3097 return &x_x_x;
3098 case INDEX_op_dup_vec:
3099 case INDEX_op_shli_vec:
3100 case INDEX_op_shri_vec:
3101 case INDEX_op_sari_vec:
3102 case INDEX_op_x86_psrldq_vec:
3103 return &x_x;
3104 case INDEX_op_x86_vpblendvb_vec:
3105 return &x_x_x_x;
3107 default:
3108 break;
3110 return NULL;
3113 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3115 switch (opc) {
3116 case INDEX_op_add_vec:
3117 case INDEX_op_sub_vec:
3118 case INDEX_op_and_vec:
3119 case INDEX_op_or_vec:
3120 case INDEX_op_xor_vec:
3121 case INDEX_op_andc_vec:
3122 return 1;
3123 case INDEX_op_cmp_vec:
3124 return -1;
3126 case INDEX_op_shli_vec:
3127 case INDEX_op_shri_vec:
3128 /* We must expand the operation for MO_8. */
3129 return vece == MO_8 ? -1 : 1;
3131 case INDEX_op_sari_vec:
3132 /* We must expand the operation for MO_8. */
3133 if (vece == MO_8) {
3134 return -1;
3136 /* We can emulate this for MO_64, but it does not pay off
3137 unless we're producing at least 4 values. */
3138 if (vece == MO_64) {
3139 return type >= TCG_TYPE_V256 ? -1 : 0;
3141 return 1;
3143 case INDEX_op_mul_vec:
3144 if (vece == MO_8) {
3145 /* We can expand the operation for MO_8. */
3146 return -1;
3148 if (vece == MO_64) {
3149 return 0;
3151 return 1;
3153 case INDEX_op_ssadd_vec:
3154 case INDEX_op_usadd_vec:
3155 case INDEX_op_sssub_vec:
3156 case INDEX_op_ussub_vec:
3157 return vece <= MO_16;
3158 case INDEX_op_smin_vec:
3159 case INDEX_op_smax_vec:
3160 case INDEX_op_umin_vec:
3161 case INDEX_op_umax_vec:
3162 return vece <= MO_32 ? 1 : -1;
3164 default:
3165 return 0;
3169 static void expand_vec_shi(TCGType type, unsigned vece, bool shr,
3170 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3172 TCGv_vec t1, t2;
3174 tcg_debug_assert(vece == MO_8);
3176 t1 = tcg_temp_new_vec(type);
3177 t2 = tcg_temp_new_vec(type);
3179 /* Unpack to W, shift, and repack. Tricky bits:
3180 (1) Use punpck*bw x,x to produce DDCCBBAA,
3181 i.e. duplicate in other half of the 16-bit lane.
3182 (2) For right-shift, add 8 so that the high half of
3183 the lane becomes zero. For left-shift, we must
3184 shift up and down again.
3185 (3) Step 2 leaves high half zero such that PACKUSWB
3186 (pack with unsigned saturation) does not modify
3187 the quantity. */
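/* For example, a logical right shift of byte lanes by 3: the unpacks turn
   each byte x into the 16-bit value (x << 8) | x, the word shift by 3 + 8
   leaves x >> 3 zero-extended in each word, and PACKUSWB reassembles the
   two halves into byte lanes unchanged. */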
3188 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3189 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3190 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3191 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3193 if (shr) {
3194 tcg_gen_shri_vec(MO_16, t1, t1, imm + 8);
3195 tcg_gen_shri_vec(MO_16, t2, t2, imm + 8);
3196 } else {
3197 tcg_gen_shli_vec(MO_16, t1, t1, imm + 8);
3198 tcg_gen_shli_vec(MO_16, t2, t2, imm + 8);
3199 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3200 tcg_gen_shri_vec(MO_16, t2, t2, 8);
3203 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3204 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3205 tcg_temp_free_vec(t1);
3206 tcg_temp_free_vec(t2);
3209 static void expand_vec_sari(TCGType type, unsigned vece,
3210 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3212 TCGv_vec t1, t2;
3214 switch (vece) {
3215 case MO_8:
3216 /* Unpack to W, shift, and repack, as in expand_vec_shi. */
3217 t1 = tcg_temp_new_vec(type);
3218 t2 = tcg_temp_new_vec(type);
3219 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3220 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3221 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3222 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3223 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3224 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3225 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3226 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3227 tcg_temp_free_vec(t1);
3228 tcg_temp_free_vec(t2);
3229 break;
3231 case MO_64:
3232 if (imm <= 32) {
3233 /* We can emulate a small sign extend by performing an arithmetic
3234 * 32-bit shift and overwriting the high half of a 64-bit logical
3235 shift (note that the ISA says shift of 32 is valid).
3236 */
3237 t1 = tcg_temp_new_vec(type);
3238 tcg_gen_sari_vec(MO_32, t1, v1, imm);
3239 tcg_gen_shri_vec(MO_64, v0, v1, imm);
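/* The blend below (imm 0xaa = binary 10101010) takes the odd dwords,
   i.e. the sign-filled high halves, from t1, and the even dwords from the
   logically shifted value already in v0. */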
3240 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3241 tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3242 tcgv_vec_arg(t1), 0xaa);
3243 tcg_temp_free_vec(t1);
3244 } else {
3245 /* Otherwise we will need to use a compare vs 0 to produce
3246 * the sign-extend, shift and merge.
3247 */
3248 t1 = tcg_const_zeros_vec(type);
3249 tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
3250 tcg_gen_shri_vec(MO_64, v0, v1, imm);
3251 tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3252 tcg_gen_or_vec(MO_64, v0, v0, t1);
3253 tcg_temp_free_vec(t1);
3255 break;
3257 default:
3258 g_assert_not_reached();
3262 static void expand_vec_mul(TCGType type, unsigned vece,
3263 TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3265 TCGv_vec t1, t2, t3, t4;
3267 tcg_debug_assert(vece == MO_8);
3269 /*
3270 * Unpack v1 bytes to words, 0 | x.
3271 * Unpack v2 bytes to words, y | 0.
3272 * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3273 * Shift logical right by 8 bits to clear the high 8 bits before
3274 * using an unsigned saturated pack.
3275 *
3276 * The difference between the V64, V128 and V256 cases is merely how
3277 * we distribute the expansion between temporaries.
3278 */
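/* For example, for V128: t1 holds the low eight x values as words, t2
   holds the matching y values shifted up by 8; PMULLW leaves (x * y) << 8
   in each word, the shift right by 8 recovers the byte product, and
   PACKUSWB rebuilds the 16 byte lanes. */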
3279 switch (type) {
3280 case TCG_TYPE_V64:
3281 t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3282 t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3283 tcg_gen_dup16i_vec(t2, 0);
3284 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3285 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t2));
3286 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3287 tcgv_vec_arg(t2), tcgv_vec_arg(t2), tcgv_vec_arg(v2));
3288 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3289 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3290 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3291 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3292 tcg_temp_free_vec(t1);
3293 tcg_temp_free_vec(t2);
3294 break;
3296 case TCG_TYPE_V128:
3297 case TCG_TYPE_V256:
3298 t1 = tcg_temp_new_vec(type);
3299 t2 = tcg_temp_new_vec(type);
3300 t3 = tcg_temp_new_vec(type);
3301 t4 = tcg_temp_new_vec(type);
3302 tcg_gen_dup16i_vec(t4, 0);
3303 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3304 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
3305 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3306 tcgv_vec_arg(t2), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
3307 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3308 tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
3309 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3310 tcgv_vec_arg(t4), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
3311 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3312 tcg_gen_mul_vec(MO_16, t3, t3, t4);
3313 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3314 tcg_gen_shri_vec(MO_16, t3, t3, 8);
3315 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3316 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3317 tcg_temp_free_vec(t1);
3318 tcg_temp_free_vec(t2);
3319 tcg_temp_free_vec(t3);
3320 tcg_temp_free_vec(t4);
3321 break;
3323 default:
3324 g_assert_not_reached();
3328 static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3329 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3331 enum {
3332 NEED_SWAP = 1,
3333 NEED_INV = 2,
3334 NEED_BIAS = 4
3336 static const uint8_t fixups[16] = {
3337 [0 ... 15] = -1,
3338 [TCG_COND_EQ] = 0,
3339 [TCG_COND_NE] = NEED_INV,
3340 [TCG_COND_GT] = 0,
3341 [TCG_COND_LT] = NEED_SWAP,
3342 [TCG_COND_LE] = NEED_INV,
3343 [TCG_COND_GE] = NEED_SWAP | NEED_INV,
3344 [TCG_COND_GTU] = NEED_BIAS,
3345 [TCG_COND_LTU] = NEED_BIAS | NEED_SWAP,
3346 [TCG_COND_LEU] = NEED_BIAS | NEED_INV,
3347 [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV,
3348 };
3349 TCGv_vec t1, t2;
3350 uint8_t fixup;
3352 fixup = fixups[cond & 15];
3353 tcg_debug_assert(fixup != 0xff);
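/* SSE/AVX only provide equality (PCMPEQ*) and signed greater-than
   (PCMPGT*), so every other condition is reduced to those: swap the
   operands for LT/GE, invert the result for NE/LE/GE, and for the unsigned
   orderings first bias both operands by the sign bit so that a signed
   compare yields the unsigned answer. */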
3355 if (fixup & NEED_INV) {
3356 cond = tcg_invert_cond(cond);
3358 if (fixup & NEED_SWAP) {
3359 t1 = v1, v1 = v2, v2 = t1;
3360 cond = tcg_swap_cond(cond);
3363 t1 = t2 = NULL;
3364 if (fixup & NEED_BIAS) {
3365 t1 = tcg_temp_new_vec(type);
3366 t2 = tcg_temp_new_vec(type);
3367 tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
3368 tcg_gen_sub_vec(vece, t1, v1, t2);
3369 tcg_gen_sub_vec(vece, t2, v2, t2);
3370 v1 = t1;
3371 v2 = t2;
3372 cond = tcg_signed_cond(cond);
3375 tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3376 /* Expand directly; do not recurse. */
3377 vec_gen_4(INDEX_op_cmp_vec, type, vece,
3378 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3380 if (t1) {
3381 tcg_temp_free_vec(t1);
3382 if (t2) {
3383 tcg_temp_free_vec(t2);
3386 if (fixup & NEED_INV) {
3387 tcg_gen_not_vec(vece, v0, v0);
3391 static void expand_vec_minmax(TCGType type, unsigned vece,
3392 TCGCond cond, bool min,
3393 TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3395 TCGv_vec t1 = tcg_temp_new_vec(type);
3397 tcg_debug_assert(vece == MO_64);
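/* 64-bit element PMIN/PMAX does not exist until AVX-512, so the result is
   built from a compare plus VPBLENDVB: the all-ones/all-zeros compare mask
   selects, per lane, whichever operand should win. */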
3399 tcg_gen_cmp_vec(cond, vece, t1, v1, v2);
3400 if (min) {
3401 TCGv_vec t2;
3402 t2 = v1, v1 = v2, v2 = t2;
3404 vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3405 tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3406 tcgv_vec_arg(v2), tcgv_vec_arg(t1));
3407 tcg_temp_free_vec(t1);
3410 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3411 TCGArg a0, ...)
3413 va_list va;
3414 TCGArg a2;
3415 TCGv_vec v0, v1, v2;
3417 va_start(va, a0);
3418 v0 = temp_tcgv_vec(arg_temp(a0));
3419 v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3420 a2 = va_arg(va, TCGArg);
3422 switch (opc) {
3423 case INDEX_op_shli_vec:
3424 case INDEX_op_shri_vec:
3425 expand_vec_shi(type, vece, opc == INDEX_op_shri_vec, v0, v1, a2);
3426 break;
3428 case INDEX_op_sari_vec:
3429 expand_vec_sari(type, vece, v0, v1, a2);
3430 break;
3432 case INDEX_op_mul_vec:
3433 v2 = temp_tcgv_vec(arg_temp(a2));
3434 expand_vec_mul(type, vece, v0, v1, v2);
3435 break;
3437 case INDEX_op_cmp_vec:
3438 v2 = temp_tcgv_vec(arg_temp(a2));
3439 expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3440 break;
3442 case INDEX_op_smin_vec:
3443 v2 = temp_tcgv_vec(arg_temp(a2));
3444 expand_vec_minmax(type, vece, TCG_COND_GT, true, v0, v1, v2);
3445 break;
3446 case INDEX_op_smax_vec:
3447 v2 = temp_tcgv_vec(arg_temp(a2));
3448 expand_vec_minmax(type, vece, TCG_COND_GT, false, v0, v1, v2);
3449 break;
3450 case INDEX_op_umin_vec:
3451 v2 = temp_tcgv_vec(arg_temp(a2));
3452 expand_vec_minmax(type, vece, TCG_COND_GTU, true, v0, v1, v2);
3453 break;
3454 case INDEX_op_umax_vec:
3455 v2 = temp_tcgv_vec(arg_temp(a2));
3456 expand_vec_minmax(type, vece, TCG_COND_GTU, false, v0, v1, v2);
3457 break;
3459 default:
3460 break;
3463 va_end(va);
3466 static const int tcg_target_callee_save_regs[] = {
3467 #if TCG_TARGET_REG_BITS == 64
3468 TCG_REG_RBP,
3469 TCG_REG_RBX,
3470 #if defined(_WIN64)
3471 TCG_REG_RDI,
3472 TCG_REG_RSI,
3473 #endif
3474 TCG_REG_R12,
3475 TCG_REG_R13,
3476 TCG_REG_R14, /* Currently used for the global env. */
3477 TCG_REG_R15,
3478 #else
3479 TCG_REG_EBP, /* Currently used for the global env. */
3480 TCG_REG_EBX,
3481 TCG_REG_ESI,
3482 TCG_REG_EDI,
3483 #endif
3486 /* Compute frame size via macros, to share between tcg_target_qemu_prologue
3487 and tcg_register_jit. */
3489 #define PUSH_SIZE \
3490 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
3491 * (TCG_TARGET_REG_BITS / 8))
3493 #define FRAME_SIZE \
3494 ((PUSH_SIZE \
3495 + TCG_STATIC_CALL_ARGS_SIZE \
3496 + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3497 + TCG_TARGET_STACK_ALIGN - 1) \
3498 & ~(TCG_TARGET_STACK_ALIGN - 1))
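/* The frame below the pushed registers provides TCG_STATIC_CALL_ARGS_SIZE
   bytes for helper-call arguments plus CPU_TEMP_BUF_NLONGS longs of
   temporary spill space, with the total rounded up to
   TCG_TARGET_STACK_ALIGN. */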
3500 /* Generate global QEMU prologue and epilogue code */
3501 static void tcg_target_qemu_prologue(TCGContext *s)
3503 int i, stack_addend;
3505 /* TB prologue */
3507 /* Reserve some stack space, also for TCG temps. */
3508 stack_addend = FRAME_SIZE - PUSH_SIZE;
3509 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3510 CPU_TEMP_BUF_NLONGS * sizeof(long));
3512 /* Save all callee saved registers. */
3513 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3514 tcg_out_push(s, tcg_target_callee_save_regs[i]);
3517 #if TCG_TARGET_REG_BITS == 32
3518 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3519 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3520 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3521 /* jmp *tb. */
3522 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3523 (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3524 + stack_addend);
3525 #else
3526 # if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
3527 if (guest_base) {
3528 int seg = setup_guest_base_seg();
3529 if (seg != 0) {
3530 x86_guest_base_seg = seg;
3531 } else if (guest_base == (int32_t)guest_base) {
3532 x86_guest_base_offset = guest_base;
3533 } else {
3534 /* Choose R12 because, as a base, it requires a SIB byte. */
3535 x86_guest_base_index = TCG_REG_R12;
3536 tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
3537 tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
3540 # endif
3541 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3542 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3543 /* jmp *tb. */
3544 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3545 #endif
3548 * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3549 * and fall through to the rest of the epilogue.
3551 s->code_gen_epilogue = s->code_ptr;
3552 tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3554 /* TB epilogue */
3555 tb_ret_addr = s->code_ptr;
3557 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3559 if (have_avx2) {
3560 tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3562 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3563 tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3565 tcg_out_opc(s, OPC_RET, 0, 0, 0);
3568 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3570 memset(p, 0x90, count);
3573 static void tcg_target_init(TCGContext *s)
3575 #ifdef CONFIG_CPUID_H
3576 unsigned a, b, c, d, b7 = 0;
3577 int max = __get_cpuid_max(0, 0);
3579 if (max >= 7) {
3580 /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */
3581 __cpuid_count(7, 0, a, b7, c, d);
3582 have_bmi1 = (b7 & bit_BMI) != 0;
3583 have_bmi2 = (b7 & bit_BMI2) != 0;
3586 if (max >= 1) {
3587 __cpuid(1, a, b, c, d);
3588 #ifndef have_cmov
3589 /* For 32-bit, 99% certainty that we're running on hardware that
3590 supports cmov, but we still need to check. In case cmov is not
3591 available, we'll use a small forward branch. */
3592 have_cmov = (d & bit_CMOV) != 0;
3593 #endif
3595 /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
3596 need to probe for it. */
3597 have_movbe = (c & bit_MOVBE) != 0;
3598 have_popcnt = (c & bit_POPCNT) != 0;
3600 /* There are a number of things we must check before we can be
3601 sure of not hitting invalid opcode. */
3602 if (c & bit_OSXSAVE) {
3603 unsigned xcrl, xcrh;
3604 /* The xgetbv instruction is not available to older versions of
3605 * the assembler, so we encode the instruction manually.
3607 asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
3608 if ((xcrl & 6) == 6) {
3609 have_avx1 = (c & bit_AVX) != 0;
3610 have_avx2 = (b7 & bit_AVX2) != 0;
3615 max = __get_cpuid_max(0x80000000, 0);
3616 if (max >= 1) {
3617 __cpuid(0x80000001, a, b, c, d);
3618 /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs. */
3619 have_lzcnt = (c & bit_LZCNT) != 0;
3621 #endif /* CONFIG_CPUID_H */
3623 tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3624 if (TCG_TARGET_REG_BITS == 64) {
3625 tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3627 if (have_avx1) {
3628 tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3629 tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3631 if (have_avx2) {
3632 tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3635 tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
3636 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3637 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3638 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3639 if (TCG_TARGET_REG_BITS == 64) {
3640 #if !defined(_WIN64)
3641 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3642 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3643 #endif
3644 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3645 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3646 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3647 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3650 s->reserved_regs = 0;
3651 tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
3654 typedef struct {
3655 DebugFrameHeader h;
3656 uint8_t fde_def_cfa[4];
3657 uint8_t fde_reg_ofs[14];
3658 } DebugFrame;
3660 /* We're expecting a 2 byte uleb128 encoded value. */
3661 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
3663 #if !defined(__ELF__)
3664 /* Host machine without ELF. */
3665 #elif TCG_TARGET_REG_BITS == 64
3666 #define ELF_HOST_MACHINE EM_X86_64
3667 static const DebugFrame debug_frame = {
3668 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3669 .h.cie.id = -1,
3670 .h.cie.version = 1,
3671 .h.cie.code_align = 1,
3672 .h.cie.data_align = 0x78, /* sleb128 -8 */
3673 .h.cie.return_column = 16,
3675 /* Total FDE size does not include the "len" member. */
3676 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3678 .fde_def_cfa = {
3679 12, 7, /* DW_CFA_def_cfa %rsp, ... */
3680 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
3681 (FRAME_SIZE >> 7)
3683 .fde_reg_ofs = {
3684 0x90, 1, /* DW_CFA_offset, %rip, -8 */
3685 /* The following ordering must match tcg_target_callee_save_regs. */
3686 0x86, 2, /* DW_CFA_offset, %rbp, -16 */
3687 0x83, 3, /* DW_CFA_offset, %rbx, -24 */
3688 0x8c, 4, /* DW_CFA_offset, %r12, -32 */
3689 0x8d, 5, /* DW_CFA_offset, %r13, -40 */
3690 0x8e, 6, /* DW_CFA_offset, %r14, -48 */
3691 0x8f, 7, /* DW_CFA_offset, %r15, -56 */
3694 #else
3695 #define ELF_HOST_MACHINE EM_386
3696 static const DebugFrame debug_frame = {
3697 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3698 .h.cie.id = -1,
3699 .h.cie.version = 1,
3700 .h.cie.code_align = 1,
3701 .h.cie.data_align = 0x7c, /* sleb128 -4 */
3702 .h.cie.return_column = 8,
3704 /* Total FDE size does not include the "len" member. */
3705 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3707 .fde_def_cfa = {
3708 12, 4, /* DW_CFA_def_cfa %esp, ... */
3709 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
3710 (FRAME_SIZE >> 7)
3712 .fde_reg_ofs = {
3713 0x88, 1, /* DW_CFA_offset, %eip, -4 */
3714 /* The following ordering must match tcg_target_callee_save_regs. */
3715 0x85, 2, /* DW_CFA_offset, %ebp, -8 */
3716 0x83, 3, /* DW_CFA_offset, %ebx, -12 */
3717 0x86, 4, /* DW_CFA_offset, %esi, -16 */
3718 0x87, 5, /* DW_CFA_offset, %edi, -20 */
3721 #endif
3723 #if defined(ELF_HOST_MACHINE)
3724 void tcg_register_jit(void *buf, size_t buf_size)
3726 tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
3728 #endif