/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "../tcg-pool.c.inc"

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    /* The Win64 ABI has xmm6-xmm15 as caller-saves, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
#if TCG_TARGET_REG_BITS == 64

static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    /* 32-bit mode uses a stack-based calling convention (GCC default). */

static const int tcg_target_call_oarg_regs[] = {
    TCG_REG_EAX,
#if TCG_TARGET_REG_BITS == 32
    TCG_REG_EDX
#endif
};

/* Constants we accept.  */
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800

/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call-clobbered registers on
   i386.  */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

#define ALL_BYTEH_REGS         0x0000000fu
#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS      0x0000ffffu
# define ALL_VECTOR_REGS       0xffff0000u
# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS      0x000000ffu
# define ALL_VECTOR_REGS       0x00ff0000u
# define ALL_BYTEL_REGS        ALL_BYTEH_REGS
#endif
#ifdef CONFIG_SOFTMMU
# define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
#else
# define SOFTMMU_RESERVE_REGS  0
#endif

/* The host compiler should supply <cpuid.h> to enable runtime feature
   detection, as we're not going to go so far as our own inline assembly.
   If not available, default values will be assumed.  */
#if defined(CONFIG_CPUID_H)
#include "qemu/cpuid.h"
#endif

/* For 64-bit, we always know that CMOV is available.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov 1
#elif defined(CONFIG_CPUID_H)
static bool have_cmov;
#else
# define have_cmov 0
#endif

/* We need these symbols in tcg-target.h, and we can't properly conditionalize
   it there.  Therefore we always define the variable.  */

#ifdef CONFIG_CPUID_H
static bool have_bmi2;
static bool have_lzcnt;
#else
# define have_bmi2 0
# define have_lzcnt 0
#endif

static const tcg_insn_unit *tb_ret_addr;

static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
                        intptr_t value, intptr_t addend)
{
    value += addend;
    switch (type) {
    case R_386_PC32:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int32_t)value) {
            return false;
        }
        /* FALLTHRU */
    case R_386_32:
        tcg_patch32(code_ptr, value);
        break;
    case R_386_PC8:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int8_t)value) {
            return false;
        }
        tcg_patch8(code_ptr, value);
        break;
    default:
        g_assert_not_reached();
    }
    return true;
}

/* test if a constant matches the constraint */
static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
{
    if (ct & TCG_CT_CONST) {
        return 1;
    }
    if (type == TCG_TYPE_I32) {
        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
            return 1;
        }
    } else {
        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
            return 1;
        }
    }
    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
        return 1;
    }
    return 0;
}
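/*
 * Worked example (editor's illustration, not from the original source):
 * for a 64-bit operand, val = 0x80000000 satisfies TCG_CT_CONST_U32
 * (it zero-extends from 32 bits) but not TCG_CT_CONST_S32, since
 * (int32_t)0x80000000 sign-extends to 0xffffffff80000000.  Conversely,
 * val = 0xffffffffffffff00 satisfies TCG_CT_CONST_I32 because its
 * complement, 0xff, fits in a signed 32-bit immediate.
 */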
# define LOWREGMASK(x)  ((x) & 7)

#define P_EXT           0x100   /* 0x0f opcode prefix */
#define P_EXT38         0x200   /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400   /* 0x66 opcode prefix */
#define P_VEXW          0x1000  /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         P_VEXW  /* Set REX.W = 1; match VEXW */
# define P_REXB_R       0x2000  /* REG field as byte register */
# define P_REXB_RM      0x4000  /* R/M field as byte register */
# define P_GS           0x8000  /* gs segment override */
#else
# define P_REXW         0
# define P_REXB_R       0
# define P_REXB_RM      0
# define P_GS           0
#endif
#define P_EXT3A         0x10000 /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000 /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000 /* 0xf2 opcode prefix */
#define P_VEXL          0x80000 /* Set VEX.L = 1 */

#define OPC_ARITH_EvIz  (0x81)
#define OPC_ARITH_EvIb  (0x83)
#define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP       (0xc8 | P_EXT)
#define OPC_CALL_Jz     (0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32     (0x48)
#define OPC_IMUL_GvEv   (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb (0x6b)
#define OPC_IMUL_GvEvIz (0x69)
#define OPC_INC_r32     (0x40)
#define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
#define OPC_JCC_short   (0x70)          /* ... plus condition code */
#define OPC_JMP_long    (0xe9)
#define OPC_JMP_short   (0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
#define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
#define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz   (0xc7)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL      (0xbe | P_EXT)
#define OPC_MOVSWL      (0xbf | P_EXT)
#define OPC_MOVSLQ      (0x63 | P_REXW)
#define OPC_MOVZBL      (0xb6 | P_EXT)
#define OPC_MOVZWL      (0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32     (0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32    (0x50)
#define OPC_PUSH_Iv     (0x68)
#define OPC_PUSH_Ib     (0x6a)
#define OPC_RET         (0xc3)
#define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1     (0xd1)
#define OPC_SHIFT_Ib    (0xc1)
#define OPC_SHIFT_cl    (0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTL       (0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)

#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
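/*
 * Worked example (editor's illustration): OPC_PMULLD is
 * (0x40 | P_EXT38 | P_DATA16), so the emitter below produces the 0x66
 * data-size prefix, the 0x0f 0x38 escape bytes, and then opcode byte
 * 0x40 -- i.e. the SSE4.1 encoding "66 0f 38 40 /r" for pmulld.
 */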
/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH.  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev    0
#define EXT5_DEC_Ev    1
#define EXT5_CALLN_Ev  2
#define EXT5_JMPN_Ev   4

/* Condition codes to be added to OPC_JCC_{long,short}.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf

static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
{
    int rex;

    if (opc & P_GS) {
        tcg_out8(s, 0x65);
    }
    /* The 0x66 prefix must come before the REX prefix.  */
    if (opc & P_DATA16) {
        tcg_out8(s, 0x66);
        /* We should never be asking for both 16 and 64-bit operation.  */
        tcg_debug_assert((opc & P_REXW) == 0);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }

    rex = 0;
    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
    rex |= (r & 8) >> 1;                /* REX.R */
    rex |= (x & 8) >> 2;                /* REX.X */
    rex |= (rm & 8) >> 3;               /* REX.B */

    /* P_REXB_{R,RM} indicates that the given register is the low byte.
       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
       as otherwise the encoding indicates %[abcd]h.  Note that the values
       that are ORed in merely indicate that the REX byte must be present;
       those bits get discarded in output.  */
    rex |= opc & (r >= 4 ? P_REXB_R : 0);
    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);

    if (rex) {
        tcg_out8(s, (uint8_t)(rex | 0x40));
    }

    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }

    tcg_out8(s, opc);
}
#else
static void tcg_out_opc(TCGContext *s, int opc)
{
    if (opc & P_DATA16) {
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }
    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }
    tcg_out8(s, opc);
}
/* Discard the register arguments to tcg_out_opc early, so as not to penalize
   the 32-bit compilation paths.  This method works with all versions of gcc,
   whereas relying on optimization may not be able to exclude them.  */
#define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc)
#endif

static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
{
    tcg_out_opc(s, opc, r, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
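/*
 * Worked example (editor's illustration): tcg_out_modrm(s,
 * OPC_ADD_GvEv | P_REXW, TCG_REG_RAX, TCG_REG_R12) emits REX byte 0x49
 * (0x40 | REX.W | REX.B), opcode 0x03, and ModRM 0xc4 (mod=11, reg=0,
 * rm=4), i.e. "49 03 c4" = addq %r12, %rax.
 */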
static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
                            int rm, int index)
{
    int tmp;

    /* Use the two byte form if possible, which cannot encode
       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
        && ((rm | index) & 8) == 0) {
        /* Two byte VEX prefix.  */
        tcg_out8(s, 0xc5);

        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
    } else {
        /* Three byte VEX prefix.  */
        tcg_out8(s, 0xc4);

        /* VEX.m-mmmm */
        if (opc & P_EXT3A) {
            tmp = 3;
        } else if (opc & P_EXT38) {
            tmp = 2;
        } else if (opc & P_EXT) {
            tmp = 1;
        } else {
            g_assert_not_reached();
        }
        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
        tcg_out8(s, tmp);

        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
    }

    tmp |= (opc & P_VEXL ? 0x04 : 0);          /* VEX.L */
    /* VEX.pp */
    if (opc & P_DATA16) {
        tmp |= 1;                              /* 0x66 */
    } else if (opc & P_SIMDF3) {
        tmp |= 2;                              /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        tmp |= 3;                              /* 0xf2 */
    }
    tmp |= (~v & 15) << 3;                     /* VEX.vvvv */
    tcg_out8(s, tmp);
    tcg_out8(s, opc);
}

static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
{
    tcg_out_vex_opc(s, opc, r, v, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
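/*
 * Worked example (editor's illustration): tcg_out_vex_modrm(s, OPC_PXOR,
 * 0, 0, 0) qualifies for the two-byte form: 0xc5, then 0xf9 (VEX.R set,
 * inverted vvvv = 1111 selecting xmm0, L = 0, pp = 01 for the implied
 * 0x66 prefix), opcode 0xef, ModRM 0xc0 -- "c5 f9 ef c0" =
 * vpxor %xmm0, %xmm0, %xmm0.
 */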
/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
   We handle either RM and INDEX missing with a negative value.  In 64-bit
   mode for absolute addresses, ~RM is the size of the immediate operand
   that will follow the instruction.  */

static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
                               int shift, intptr_t offset)
{
    int mod, len;

    if (index < 0 && rm < 0) {
        if (TCG_TARGET_REG_BITS == 64) {
            /* Try for a rip-relative addressing mode.  This has replaced
               the 32-bit-mode absolute addressing encoding.  */
            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
            intptr_t disp = offset - pc;
            if (disp == (int32_t)disp) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
                tcg_out32(s, disp);
                return;
            }

            /* Try for an absolute address encoding.  This requires the
               use of the MODRM+SIB encoding and is therefore larger than
               rip-relative addressing.  */
            if (offset == (int32_t)offset) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
                tcg_out8(s, (4 << 3) | 5);
                tcg_out32(s, offset);
                return;
            }

            /* ??? The memory isn't directly addressable.  */
            g_assert_not_reached();
        } else {
            /* Absolute address.  */
            tcg_out8(s, (r << 3) | 5);
            tcg_out32(s, offset);
            return;
        }
    }

    /* Find the length of the immediate addend.  Note that the encoding
       that would be used for (%ebp) indicates absolute addressing.  */
    if (rm < 0) {
        mod = 0, len = 4, rm = 5;
    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
        mod = 0, len = 0;
    } else if (offset == (int8_t)offset) {
        mod = 0x40, len = 1;
    } else {
        mod = 0x80, len = 4;
    }

    /* Use a single byte MODRM format if possible.  Note that the encoding
       that would be used for %esp is the escape to the two byte form.  */
    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
        /* Single byte MODRM format.  */
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
    } else {
        /* Two byte MODRM+SIB format.  */

        /* Note that the encoding that would place %esp into the index
           field indicates no index register.  In 64-bit mode, the REX.X
           bit counts, so %r12 can be used as the index.  */
        if (index < 0) {
            index = 4;
        } else {
            tcg_debug_assert(index != TCG_REG_ESP);
        }

        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
    }

    if (len == 1) {
        tcg_out8(s, offset);
    } else if (len == 4) {
        tcg_out32(s, offset);
    }
}

static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
                                     int index, int shift, intptr_t offset)
{
    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
                                         int rm, int index, int shift,
                                         intptr_t offset)
{
    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}
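/*
 * Worked examples (editor's illustration): a load from (%ebp) cannot use
 * mod=0 (that encoding means disp32 absolute), so tcg_out_modrm_offset(s,
 * OPC_MOVL_GvEv, TCG_REG_EAX, TCG_REG_EBP, 0) emits "8b 45 00" with a
 * forced zero disp8.  Likewise (%esp) requires the SIB escape:
 * movl (%esp), %eax is "8b 04 24".
 */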
/* A simplification of the above with no index or shift.  */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
                                        int rm, intptr_t offset)
{
    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
}

static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
                                            int v, int rm, intptr_t offset)
{
    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
}

/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_opc(s, opc, r, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
}

/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
}

/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
{
    /* Propagate an opcode prefix, such as P_REXW.  */
    int ext = subop & ~0x7;
    subop &= 0x7;

    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
}

static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
    int rexw = 0;

    if (arg == ret) {
        return true;
    }
    switch (type) {
    case TCG_TYPE_I64:
        rexw = P_REXW;
        /* fallthru */
    case TCG_TYPE_I32:
        if (ret < 16) {
            if (arg < 16) {
                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
            }
        } else {
            if (arg < 16) {
                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
            }
        }
        break;

    case TCG_TYPE_V64:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
        break;
    case TCG_TYPE_V128:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
        break;
    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
        break;

    default:
        g_assert_not_reached();
    }
    return true;
}

static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};

static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                            TCGReg r, TCGReg a)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
    } else {
        switch (vece) {
        case MO_8:
            /* ??? With zero in a register, use PSHUFB.  */
            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_16:
            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_32:
            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
            /* imm8 operand: all output lanes selected from input lane 0.  */
            tcg_out8(s, 0);
            break;
        case MO_64:
            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg r, TCGReg base, intptr_t offset)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
                                 r, 0, base, offset);
    } else {
        switch (vece) {
        case MO_64:
            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
            break;
        case MO_32:
            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
            break;
        case MO_16:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        case MO_8:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg ret, int64_t arg)
{
    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);

    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
        return;
    }

    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
        if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
        }
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    } else {
        if (type == TCG_TYPE_V64) {
            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
        } else if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
        }
        if (TCG_TARGET_REG_BITS == 64) {
            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
        } else {
            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
        }
    }
}

static void tcg_out_movi_vec(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
        return;
    }

    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
    if (TCG_TARGET_REG_BITS == 64) {
        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
    } else {
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    }
}

static void tcg_out_movi_int(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    tcg_target_long diff;

    if (arg == 0) {
        tgen_arithr(s, ARITH_XOR, ret, ret);
        return;
    }
    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
        tcg_out32(s, arg);
        return;
    }
    if (arg == (int32_t)arg) {
        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
        tcg_out32(s, arg);
        return;
    }

    /* Try a 7-byte pc-relative lea before the 10-byte movq.  */
    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
    if (diff == (int32_t)diff) {
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
        tcg_out32(s, diff);
        return;
    }

    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
    tcg_out64(s, arg);
}
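/*
 * Size comparison (editor's illustration): the ladder above prefers the
 * 2-byte "33 c0" xorl for zero, the 5-byte "b8 imm32" movl for values
 * that zero-extend from 32 bits, the 7-byte "48 c7 c0 imm32" movq for
 * values that sign-extend, a 7-byte rip-relative lea when the constant
 * lies within +/-2GB of the code, and only then the full 10-byte movabsq.
 */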
static void tcg_out_movi(TCGContext *s, TCGType type,
                         TCGReg ret, tcg_target_long arg)
{
    switch (type) {
    case TCG_TYPE_I32:
#if TCG_TARGET_REG_BITS == 64
    case TCG_TYPE_I64:
#endif
        if (ret < 16) {
            tcg_out_movi_int(s, type, ret, arg);
        } else {
            tcg_out_movi_vec(s, type, ret, arg);
        }
        break;
    default:
        g_assert_not_reached();
    }
}

static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        tcg_abort();
    }
}

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* Given the strength of x86 memory ordering, we only need care for
       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
       faster than "mfence", so don't bother with the SSE insn.  */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);   /* lock prefix */
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}
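/*
 * Encoding note (editor's illustration): the sequence above is
 * "f0 83 0c 24 00" -- lock orl $0, (%esp) -- a locked RMW on the stack
 * that drains the store buffer just like the 3-byte "0f ae f0" mfence,
 * but, as the comment says, measured faster on common cores.
 */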
static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned store.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                 arg, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                        TCGReg base, intptr_t ofs)
{
    int rexw = 0;

    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
        if (val != (int32_t)val) {
            return false;
        }
        rexw = P_REXW;
    } else if (type != TCG_TYPE_I32) {
        return false;
    }
    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
    tcg_out32(s, val);
    return true;
}

static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
{
    /* Propagate an opcode prefix, such as P_DATA16.  */
    int ext = subopc & ~0x7;
    subopc &= 0x7;

    if (count == 1) {
        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
    } else {
        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
        tcg_out8(s, count);
    }
}

static inline void tcg_out_bswap32(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_rolw_8(TCGContext *s, int reg)
{
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
}

static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
{
    /* movzbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
}

static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
{
    /* movsbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
}

static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
{
    /* movzwl */
    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
}

static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
{
    /* movsw[lq] */
    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
}

static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
{
    /* 32-bit mov zero extends.  */
    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
}

static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
{
    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
}

static inline void tcg_out_bswap64(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
}

static void tgen_arithi(TCGContext *s, int c, int r0,
                        tcg_target_long val, int cf)
{
    int rexw = 0;

    if (TCG_TARGET_REG_BITS == 64) {
        rexw = c & -8;
        c &= 7;
    }

    /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
       partial flags update stalls on Pentium4 and are not recommended
       by current Intel optimization manuals.  */
    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
        int is_inc = (c == ARITH_ADD) ^ (val < 0);
        if (TCG_TARGET_REG_BITS == 64) {
            /* The single-byte increment encodings are re-tasked as the
               REX prefixes.  Use the MODRM encoding.  */
            tcg_out_modrm(s, OPC_GRP5 + rexw,
                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
        } else {
            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
        }
        return;
    }

    if (c == ARITH_AND) {
        if (TCG_TARGET_REG_BITS == 64) {
            if (val == 0xffffffffu) {
                tcg_out_ext32u(s, r0, r0);
                return;
            }
            if (val == (uint32_t)val) {
                /* AND with no high bits set can use a 32-bit operation.  */
                rexw = 0;
            }
        }
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_ext8u(s, r0, r0);
            return;
        }
        if (val == 0xffffu) {
            tcg_out_ext16u(s, r0, r0);
            return;
        }
    }

    if (val == (int8_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
        tcg_out8(s, val);
        return;
    }
    if (rexw == 0 || val == (int32_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
        tcg_out32(s, val);
        return;
    }

    tcg_abort();
}
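/*
 * Worked example (editor's illustration): on a 32-bit host,
 * tgen_arithi(s, ARITH_ADD, TCG_REG_EAX, 1, 0) emits the single byte
 * 0x40 (incl %eax); on x86-64 that byte is a REX prefix, so the same
 * request emits "ff c0" via the group-5 /0 encoding instead.
 */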
static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
{
    if (val != 0) {
        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
    }
}

/* Use SMALL != 0 to force a short forward branch.  */
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
{
    int32_t val, val1;

    if (l->has_value) {
        val = tcg_pcrel_diff(s, l->u.value_ptr);
        val1 = val - 2;
        if ((int8_t)val1 == val1) {
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_short);
            } else {
                tcg_out8(s, OPC_JCC_short + opc);
            }
            tcg_out8(s, val1);
        } else {
            if (small) {
                tcg_abort();
            }
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_long);
                tcg_out32(s, val - 5);
            } else {
                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
                tcg_out32(s, val - 6);
            }
        }
    } else if (small) {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_short);
        } else {
            tcg_out8(s, OPC_JCC_short + opc);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
        s->code_ptr += 1;
    } else {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_long);
        } else {
            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
        s->code_ptr += 4;
    }
}
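/*
 * Encoding note (editor's illustration): the "- 2", "- 5" and "- 6"
 * adjustments above subtract the instruction length from the label
 * displacement: jcc/jmp short are 2 bytes (70+cc / eb rel8), jmp long
 * is 5 bytes (e9 rel32), and jcc long is 6 bytes (0f 80+cc rel32).
 */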
static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
                        int const_arg2, int rexw)
{
    if (const_arg2) {
        if (arg2 == 0) {
            /* test r, r */
            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
        } else {
            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
        }
    } else {
        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
    }
}

static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}
#else
/* XXX: we implement it at the target level to avoid having to
   handle cross-basic-block temporaries */
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, int small)
{
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);

    switch (args[4]) {
    case TCG_COND_EQ:
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_next, 1);
        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_NE:
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_this, small);
        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_LT:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LE:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GT:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GE:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LTU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LEU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GTU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GEU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    default:
        tcg_abort();
    }
    tcg_out_label(s, label_next);
}
#endif
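/*
 * Strategy note (editor's illustration): each inequality above is
 * decomposed as "branch if the high words compare true; skip everything
 * if they are unequal (the JNE to label_next); otherwise decide on an
 * unsigned comparison of the low words".  E.g. a 64-bit signed LT on a
 * 32-bit host becomes: cmp-high + jl taken, jne next, cmp-low + jb taken.
 */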
static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    tcg_out_ext8u(s, dest, dest);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    tcg_out_ext8u(s, dest, dest);
}
#else
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg new_args[6];
    TCGLabel *label_true, *label_over;

    memcpy(new_args, args+1, 5*sizeof(TCGArg));

    if (args[0] == args[1] || args[0] == args[2]
        || (!const_args[3] && args[0] == args[3])
        || (!const_args[4] && args[0] == args[4])) {
        /* When the destination overlaps with one of the argument
           registers, don't do anything tricky.  */
        label_true = gen_new_label();
        label_over = gen_new_label();

        new_args[5] = label_arg(label_true);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
        tcg_out_jxx(s, JCC_JMP, label_over, 1);
        tcg_out_label(s, label_true);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
        tcg_out_label(s, label_over);
    } else {
        /* When the destination does not overlap one of the arguments,
           clear the destination first, jump if cond false, and emit an
           increment in the true case.  This results in smaller code.  */
        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);

        label_over = gen_new_label();
        new_args[4] = tcg_invert_cond(new_args[4]);
        new_args[5] = label_arg(label_over);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
        tcg_out_label(s, label_over);
    }
}
#endif

static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
                         TCGReg dest, TCGReg v1)
{
    if (have_cmov) {
        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
    } else {
        TCGLabel *over = gen_new_label();
        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
        tcg_out_label(s, over);
    }
}

static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, 0);
    tcg_out_cmov(s, cond, 0, dest, v1);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
    tcg_out_cmov(s, cond, P_REXW, dest, v1);
}
#endif

static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_bmi1) {
        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(dest != arg2);
        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}

static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_lzcnt) {
        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(!const_a2);
        tcg_debug_assert(dest != arg1);
        tcg_debug_assert(dest != arg2);

        /* Recall that the output of BSR is the index not the count.  */
        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);

        /* Since we have destroyed the flags from BSR, we have to re-test.  */
        tcg_out_cmp(s, arg1, 0, 1, rexw);
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}
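/*
 * Worked example (editor's illustration): for arg1 = 0x00f00000 the
 * highest set bit is bit 23, so BSR returns 23 and the XOR with 31
 * yields 23 ^ 31 = 8, which is exactly clz32(0x00f00000).  The XOR
 * trick works because for any index i in [0, 31], 31 - i == 31 ^ i.
 */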
static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
{
    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;

    if (disp == (int32_t)disp) {
        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
        tcg_out32(s, disp);
    } else {
        /* rip-relative addressing into the constant pool.
           This is 6 + 8 = 14 bytes, as compared to using an
           immediate load, 10 + 6 = 16 bytes, plus we may
           be able to re-use the pool constant for more calls.  */
        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
        tcg_out32(s, 0);
    }
}

static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest)
{
    tcg_out_branch(s, 1, dest);
}

static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
{
    tcg_out_branch(s, 0, dest);
}

static void tcg_out_nopn(TCGContext *s, int n)
{
    int i;
    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
     * "xchg %eax,%eax", forming "xchg %ax,%ax".  All cores accept the
     * duplicate prefix, and all of the interesting recent cores can
     * decode and discard the duplicates in a single cycle.
     */
    tcg_debug_assert(n >= 1);
    for (i = 1; i < n; ++i) {
        tcg_out8(s, 0x66);
    }
    tcg_out8(s, 0x90);
}

#if defined(CONFIG_SOFTMMU)
#include "../tcg-ldst.c.inc"

/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
 *                                     int mmu_idx, uintptr_t ra)
 */
static void * const qemu_ld_helpers[16] = {
    [MO_UB]   = helper_ret_ldub_mmu,
    [MO_LEUW] = helper_le_lduw_mmu,
    [MO_LEUL] = helper_le_ldul_mmu,
    [MO_LEQ]  = helper_le_ldq_mmu,
    [MO_BEUW] = helper_be_lduw_mmu,
    [MO_BEUL] = helper_be_ldul_mmu,
    [MO_BEQ]  = helper_be_ldq_mmu,
};

/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
 *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
 */
static void * const qemu_st_helpers[16] = {
    [MO_UB]   = helper_ret_stb_mmu,
    [MO_LEUW] = helper_le_stw_mmu,
    [MO_LEUL] = helper_le_stl_mmu,
    [MO_LEQ]  = helper_le_stq_mmu,
    [MO_BEUW] = helper_be_stw_mmu,
    [MO_BEUL] = helper_be_stl_mmu,
    [MO_BEQ]  = helper_be_stq_mmu,
};

/* Perform the TLB load and compare.

   Inputs:
   ADDRLO and ADDRHI contain the low and high part of the address.

   MEM_INDEX and S_BITS are the memory context and log2 size of the load.

   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
   This should be offsetof addr_read or addr_write.

   Outputs:
   LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
   positions of the displacements of forward jumps to the TLB miss case.

   Second argument register is loaded with the low part of the address.
   In the TLB hit case, it has been adjusted as indicated by the TLB
   and so is a host address.  In the TLB miss case, it continues to
   hold a guest address.

   First argument register is clobbered.  */

static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
                                    int mem_index, MemOp opc,
                                    tcg_insn_unit **label_ptr, int which)
{
    const TCGReg r0 = TCG_REG_L0;
    const TCGReg r1 = TCG_REG_L1;
    TCGType ttype = TCG_TYPE_I32;
    TCGType tlbtype = TCG_TYPE_I32;
    int trexw = 0, hrexw = 0, tlbrexw = 0;
    unsigned a_bits = get_alignment_bits(opc);
    unsigned s_bits = opc & MO_SIZE;
    unsigned a_mask = (1 << a_bits) - 1;
    unsigned s_mask = (1 << s_bits) - 1;
    target_ulong tlb_mask;

    if (TCG_TARGET_REG_BITS == 64) {
        if (TARGET_LONG_BITS == 64) {
            ttype = TCG_TYPE_I64;
            trexw = P_REXW;
        }
        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
            hrexw = P_REXW;
            if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
                tlbtype = TCG_TYPE_I64;
                tlbrexw = P_REXW;
            }
        }
    }

    tcg_out_mov(s, tlbtype, r0, addrlo);
    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);

    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
                         TLB_MASK_TABLE_OFS(mem_index) +
                         offsetof(CPUTLBDescFast, mask));

    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
                         TLB_MASK_TABLE_OFS(mem_index) +
                         offsetof(CPUTLBDescFast, table));

    /* If the required alignment is at least as large as the access, simply
       copy the address and mask.  For lesser alignments, check that we don't
       cross pages for the complete access.  */
    if (a_bits >= s_bits) {
        tcg_out_mov(s, ttype, r1, addrlo);
    } else {
        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
    }
    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
    tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);

    /* cmp 0(r0), r1 */
    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);

    /* Prepare for both the fast path add of the tlb addend, and the slow
       path function argument setup.  */
    tcg_out_mov(s, ttype, r1, addrlo);

    /* jne slow_path */
    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
    label_ptr[0] = s->code_ptr;
    s->code_ptr += 4;

    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
        /* cmp 4(r0), addrhi */
        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);

        /* jne slow_path */
        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
        label_ptr[1] = s->code_ptr;
        s->code_ptr += 4;
    }

    /* TLB Hit.  */

    /* add addend(r0), r1 */
    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
                         offsetof(CPUTLBEntry, addend));
}
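/*
 * Shape of the emitted fast path (editor's sketch, for a 64-bit guest on
 * a 64-bit host; labels are informal):
 *
 *   movq  addrlo, %r0
 *   shrq  $(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS), %r0
 *   andq  tlb_mask(env), %r0          # mask down to a valid TLB index
 *   addq  tlb_table(env), %r0         # r0 = &tlb_entry[index]
 *   leaq  (s_mask - a_mask)(addrlo), %r1
 *   andq  $(TARGET_PAGE_MASK | a_mask), %r1
 *   cmpq  which(%r0), %r1             # page and alignment check
 *   movq  addrlo, %r1
 *   jne   slow_path
 *   addq  addend(%r0), %r1            # r1 = host address
 */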
1740 * Record the context of a call to the out of line helper code for the slow path
1741 * for a load or store, so that we can later generate the correct helper code
1743 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
1745 TCGReg datalo, TCGReg datahi,
1746 TCGReg addrlo, TCGReg addrhi,
1747 tcg_insn_unit *raddr,
1748 tcg_insn_unit **label_ptr)
1750 TCGLabelQemuLdst *label = new_ldst_label(s);
1752 label->is_ld = is_ld;
1754 label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
1755 label->datalo_reg = datalo;
1756 label->datahi_reg = datahi;
1757 label->addrlo_reg = addrlo;
1758 label->addrhi_reg = addrhi;
1759 label->raddr = tcg_splitwx_to_rx(raddr);
1760 label->label_ptr[0] = label_ptr[0];
1761 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1762 label->label_ptr[1] = label_ptr[1];
1767 * Generate code for the slow path for a load at the end of block
1769 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1771 TCGMemOpIdx oi = l->oi;
1772 MemOp opc = get_memop(oi);
1774 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1775 int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
1777 /* resolve label address */
1778 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1779 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1780 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1783 if (TCG_TARGET_REG_BITS == 32) {
1786 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1789 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1792 if (TARGET_LONG_BITS == 64) {
1793 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1797 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1800 tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1802 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1803 /* The second argument is already loaded with addrlo. */
1804 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1805 tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1806 (uintptr_t)l->raddr);
1809 tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1811 data_reg = l->datalo_reg;
1812 switch (opc & MO_SSIZE) {
1814 tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
1817 tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
1819 #if TCG_TARGET_REG_BITS == 64
1821 tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1826 /* Note that the helpers have zero-extended to tcg_target_long. */
1828 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1831 if (TCG_TARGET_REG_BITS == 64) {
1832 tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1833 } else if (data_reg == TCG_REG_EDX) {
1834 /* xchg %edx, %eax */
1835 tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1836 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1838 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1839 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1846 /* Jump to the code corresponding to next IR of qemu_st */
1847 tcg_out_jmp(s, l->raddr);
1852 * Generate code for the slow path for a store at the end of block
1854 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1856 TCGMemOpIdx oi = l->oi;
1857 MemOp opc = get_memop(oi);
1858 MemOp s_bits = opc & MO_SIZE;
1859 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1862 /* resolve label address */
1863 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1864 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1865 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1868 if (TCG_TARGET_REG_BITS == 32) {
1871 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1874 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1877 if (TARGET_LONG_BITS == 64) {
1878 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1882 tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1885 if (s_bits == MO_64) {
1886 tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1890 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1893 retaddr = TCG_REG_EAX;
1894 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1895 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1897 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1898 /* The second argument is already loaded with addrlo. */
1899 tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1900 tcg_target_call_iarg_regs[2], l->datalo_reg);
1901 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1903 if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1904 retaddr = tcg_target_call_iarg_regs[4];
1905 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1907 retaddr = TCG_REG_RAX;
1908 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1909 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1910 TCG_TARGET_CALL_STACK_OFFSET);
1914 /* "Tail call" to the helper, with the return address back inline. */
1915 tcg_out_push(s, retaddr);
1916 tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1919 #elif TCG_TARGET_REG_BITS == 32
1920 # define x86_guest_base_seg 0
1921 # define x86_guest_base_index -1
1922 # define x86_guest_base_offset guest_base
1924 static int x86_guest_base_seg;
1925 static int x86_guest_base_index = -1;
1926 static int32_t x86_guest_base_offset;
1927 # if defined(__x86_64__) && defined(__linux__)
1928 # include <asm/prctl.h>
1929 # include <sys/prctl.h>
1930 int arch_prctl(int code, unsigned long addr);
1931 static inline int setup_guest_base_seg(void)
1933 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1938 # elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
1939 # include <machine/sysarch.h>
1940 static inline int setup_guest_base_seg(void)
1942 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1948 static inline int setup_guest_base_seg(void)
1953 #endif /* SOFTMMU */
1955 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1956 TCGReg base, int index, intptr_t ofs,
1957 int seg, bool is64, MemOp memop)
1959 bool use_movbe = false;
1960 int rexw = is64 * P_REXW;
1961 int movop = OPC_MOVL_GvEv;
1963 /* Do big-endian loads with movbe. */
1964 if (memop & MO_BSWAP) {
1965 tcg_debug_assert(have_movbe);
1967 movop = OPC_MOVBE_GyMy;
1970 switch (memop & MO_SSIZE) {
1972 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
1973 base, index, 0, ofs);
1976 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
1977 base, index, 0, ofs);
1981 /* There is no extending movbe; only low 16-bits are modified. */
1982 if (datalo != base && datalo != index) {
1983 /* XOR breaks dependency chains. */
1984 tgen_arithr(s, ARITH_XOR, datalo, datalo);
1985 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1986 datalo, base, index, 0, ofs);
1988 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1989 datalo, base, index, 0, ofs);
1990 tcg_out_ext16u(s, datalo, datalo);
1993 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1994 base, index, 0, ofs);
1999 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2000 datalo, base, index, 0, ofs);
2001 tcg_out_ext16s(s, datalo, datalo, rexw);
2003 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
2004 datalo, base, index, 0, ofs);
2008 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2010 #if TCG_TARGET_REG_BITS == 64
2013 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + seg, datalo,
2014 base, index, 0, ofs);
2015 tcg_out_ext32s(s, datalo, datalo);
2017 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
2018 base, index, 0, ofs);
2023 if (TCG_TARGET_REG_BITS == 64) {
2024 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2025 base, index, 0, ofs);
2032 if (base != datalo) {
2033 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2034 base, index, 0, ofs);
2035 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2036 base, index, 0, ofs + 4);
2038 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2039 base, index, 0, ofs + 4);
2040 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2041 base, index, 0, ofs);
2046 g_assert_not_reached();
2050 /* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
2051 EAX. It will be useful once fixed registers globals are less
2053 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
2055 TCGReg datalo, datahi, addrlo;
2056 TCGReg addrhi __attribute__((unused));
2059 #if defined(CONFIG_SOFTMMU)
2061 tcg_insn_unit *label_ptr[2];
2065 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2067 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2069 opc = get_memop(oi);
2071 #if defined(CONFIG_SOFTMMU)
2072 mem_index = get_mmuidx(oi);
2074 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2075 label_ptr, offsetof(CPUTLBEntry, addr_read));
2078 tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
2080 /* Record the current context of a load into ldst label */
2081 add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2082 s->code_ptr, label_ptr);
2084 tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2085 x86_guest_base_offset, x86_guest_base_seg,
2090 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2091 TCGReg base, int index, intptr_t ofs,
2092 int seg, MemOp memop)
2094 bool use_movbe = false;
2095 int movop = OPC_MOVL_EvGv;
2098 * Do big-endian stores with movbe or softmmu.
2099 * User-only without movbe will have its swapping done generically.
2101 if (memop & MO_BSWAP) {
2102 tcg_debug_assert(have_movbe);
2104 movop = OPC_MOVBE_MyGy;
2107 switch (memop & MO_SIZE) {
2109 /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2110 tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2111 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2112 datalo, base, index, 0, ofs);
2115 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
2116 base, index, 0, ofs);
2119 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2122 if (TCG_TARGET_REG_BITS == 64) {
2123 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2124 base, index, 0, ofs);
2131 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2132 base, index, 0, ofs);
2133 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2134 base, index, 0, ofs + 4);
2138 g_assert_not_reached();
2142 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2144 TCGReg datalo, datahi, addrlo;
2145 TCGReg addrhi __attribute__((unused));
2148 #if defined(CONFIG_SOFTMMU)
2150 tcg_insn_unit *label_ptr[2];
2154 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2156 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2158 opc = get_memop(oi);
2160 #if defined(CONFIG_SOFTMMU)
2161 mem_index = get_mmuidx(oi);
2163 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2164 label_ptr, offsetof(CPUTLBEntry, addr_write));
2167 tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2169 /* Record the current context of a store into ldst label */
2170 add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
2171 s->code_ptr, label_ptr);
2173 tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2174 x86_guest_base_offset, x86_guest_base_seg, opc);
2178 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2179 const TCGArg args[TCG_MAX_OP_ARGS],
2180 const int const_args[TCG_MAX_OP_ARGS])
2183 int c, const_a2, vexop, rexw = 0;
2185 #if TCG_TARGET_REG_BITS == 64
2186 # define OP_32_64(x) \
2187 case glue(glue(INDEX_op_, x), _i64): \
2188 rexw = P_REXW; /* FALLTHRU */ \
2189 case glue(glue(INDEX_op_, x), _i32)
2191 # define OP_32_64(x) \
2192 case glue(glue(INDEX_op_, x), _i32)
2195 /* Hoist the loads of the most common arguments. */
2199 const_a2 = const_args[2];
2202 case INDEX_op_exit_tb:
2203 /* Reuse the zeroing that exists for goto_ptr. */
2205 tcg_out_jmp(s, tcg_code_gen_epilogue);
2207 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2208 tcg_out_jmp(s, tb_ret_addr);
2211 case INDEX_op_goto_tb:
2212 if (s->tb_jmp_insn_offset) {
2213 /* direct jump method */
2215 /* jump displacement must be aligned for atomic patching;
2216 * see if we need to add extra nops before jump
2218 gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2220 tcg_out_nopn(s, gap - 1);
2222 tcg_out8(s, OPC_JMP_long); /* jmp im */
2223 s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2226 /* indirect jump method */
2227 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2228 (intptr_t)(s->tb_jmp_target_addr + a0));
2230 set_jmp_reset_offset(s, a0);
2232 case INDEX_op_goto_ptr:
2233 /* jmp to the given host address (could be epilogue) */
2234 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2237 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2240 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
2241 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2244 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2247 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
2248 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2251 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2253 #if TCG_TARGET_REG_BITS == 64
2254 case INDEX_op_ld32u_i64:
2256 case INDEX_op_ld_i32:
2257 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2261 if (const_args[0]) {
2262 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2265 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2269 if (const_args[0]) {
2270 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2273 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2276 #if TCG_TARGET_REG_BITS == 64
2277 case INDEX_op_st32_i64:
2279 case INDEX_op_st_i32:
2280 if (const_args[0]) {
2281 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2284 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2289 /* For 3-operand addition, use LEA. */
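/* e.g. add a0,a1,a2 -> lea (a1,a2),a0; add a0,a1,$c -> lea $c(a1),a0 */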
2294 } else if (a0 == a2) {
2295 /* Watch out for dest = src + dest, since we've removed
2296 the matching constraint on the add. */
2297 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2301 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2320 tgen_arithi(s, c + rexw, a0, a2, 0);
2322 tgen_arithr(s, c + rexw, a0, a2);
2328 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2329 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2331 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2339 if (val == (int8_t)val) {
2340 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2343 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2347 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2352 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2355 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2359 /* For small constant 3-operand shift, use LEA. */
2360 if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2362 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2363 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2365 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2366 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2372 goto gen_shift_maybe_vex;
2376 goto gen_shift_maybe_vex;
2380 goto gen_shift_maybe_vex;
2387 gen_shift_maybe_vex:
2390 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2393 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2398 tcg_out_shifti(s, c + rexw, a0, a2);
2400 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2405 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2408 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2411 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2414 case INDEX_op_brcond_i32:
2415 tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2417 case INDEX_op_setcond_i32:
2418 tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2420 case INDEX_op_movcond_i32:
2421 tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
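/*
 * For bswap16, a2 holds the TCG_BSWAP_* flags: OS requires a
 * sign-extended result, OZ a zero-extended one, and IZ promises a
 * zero-extended input.  Swapping all 32 (or 64) bits and shifting
 * the value back down arithmetically or logically satisfies OS and
 * OZ; when the input is already zero-extended, or no extension is
 * required at all, the single rolw $8 rotate suffices.
 */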
2425 if (a2 & TCG_BSWAP_OS) {
2426 /* Output must be sign-extended. */
2428 tcg_out_bswap64(s, a0);
2429 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2431 tcg_out_bswap32(s, a0);
2432 tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2434 } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2435 /* Output must be zero-extended, but input isn't. */
2436 tcg_out_bswap32(s, a0);
2437 tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2439 tcg_out_rolw_8(s, a0);
2443 tcg_out_bswap32(s, a0);
2444 if (rexw && (a2 & TCG_BSWAP_OS)) {
2445 tcg_out_ext32s(s, a0, a0);
2450 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2453 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2457 tcg_out_ext8s(s, a0, a1, rexw);
2460 tcg_out_ext16s(s, a0, a1, rexw);
2463 tcg_out_ext8u(s, a0, a1);
2466 tcg_out_ext16u(s, a0, a1);
2469 case INDEX_op_qemu_ld_i32:
2470 tcg_out_qemu_ld(s, args, 0);
2472 case INDEX_op_qemu_ld_i64:
2473 tcg_out_qemu_ld(s, args, 1);
2475 case INDEX_op_qemu_st_i32:
2476 case INDEX_op_qemu_st8_i32:
2477 tcg_out_qemu_st(s, args, 0);
2479 case INDEX_op_qemu_st_i64:
2480 tcg_out_qemu_st(s, args, 1);
2484 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2487 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2490 if (const_args[4]) {
2491 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2493 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2495 if (const_args[5]) {
2496 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2498 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2502 if (const_args[4]) {
2503 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2505 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2507 if (const_args[5]) {
2508 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2510 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2514 #if TCG_TARGET_REG_BITS == 32
2515 case INDEX_op_brcond2_i32:
2516 tcg_out_brcond2(s, args, const_args, 0);
2518 case INDEX_op_setcond2_i32:
2519 tcg_out_setcond2(s, args, const_args);
2521 #else /* TCG_TARGET_REG_BITS == 64 */
2522 case INDEX_op_ld32s_i64:
2523 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2525 case INDEX_op_ld_i64:
2526 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2528 case INDEX_op_st_i64:
2529 if (const_args[0]) {
2530 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2533 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2537 case INDEX_op_brcond_i64:
2538 tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2540 case INDEX_op_setcond_i64:
2541 tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2543 case INDEX_op_movcond_i64:
2544 tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2547 case INDEX_op_bswap64_i64:
2548 tcg_out_bswap64(s, a0);
2550 case INDEX_op_extu_i32_i64:
2551 case INDEX_op_ext32u_i64:
2552 case INDEX_op_extrl_i64_i32:
2553 tcg_out_ext32u(s, a0, a1);
2555 case INDEX_op_ext_i32_i64:
2556 case INDEX_op_ext32s_i64:
2557 tcg_out_ext32s(s, a0, a1);
2559 case INDEX_op_extrh_i64_i32:
2560 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2565 if (args[3] == 0 && args[4] == 8) {
2566 /* load bits 0..7 */
2567 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2568 } else if (args[3] == 8 && args[4] == 8) {
2569 /* load bits 8..15 */
2570 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2571 } else if (args[3] == 0 && args[4] == 16) {
2572 /* load bits 0..15 */
2573 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2579 case INDEX_op_extract_i64:
2580 if (a2 + args[3] == 32) {
2581 /* This is a 32-bit zero-extending right shift. */
2582 tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2583 tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2587 case INDEX_op_extract_i32:
/* On the off-chance that we can use the high-byte registers, do so.
   Otherwise we emit the same ext16 + shift pattern that we would
   have gotten from the normal tcg-op.c expansion. */
2591 tcg_debug_assert(a2 == 8 && args[3] == 8);
2592 if (a1 < 4 && a0 < 8) {
2593 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2595 tcg_out_ext16u(s, a0, a1);
2596 tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2600 case INDEX_op_sextract_i32:
/* We don't implement sextract_i64, as we cannot sign-extend to
   64 bits without using the REX prefix, which explicitly excludes
   access to the high-byte registers. */
2604 tcg_debug_assert(a2 == 8 && args[3] == 8);
2605 if (a1 < 4 && a0 < 8) {
2606 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2608 tcg_out_ext16s(s, a0, a1, 0);
2609 tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2614 /* Note that SHRD outputs to the r/m operand. */
2615 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2616 tcg_out8(s, args[3]);
2622 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
2623 case INDEX_op_mov_i64:
2624 case INDEX_op_call: /* Always emitted via tcg_out_call. */
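/*
 * Emit host code for a single vector opcode.  In the lookup tables
 * below, indexed by element size (vece), OPC_UD2 marks element
 * sizes for which no instruction exists; tcg_can_emit_vec_op()
 * keeps those combinations from reaching this function, and the
 * tcg_debug_assert below double-checks it.
 */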
2632 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2633 unsigned vecl, unsigned vece,
2634 const TCGArg args[TCG_MAX_OP_ARGS],
2635 const int const_args[TCG_MAX_OP_ARGS])
2637 static int const add_insn[4] = {
2638 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2640 static int const ssadd_insn[4] = {
2641 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2643 static int const usadd_insn[4] = {
2644 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2646 static int const sub_insn[4] = {
2647 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2649 static int const sssub_insn[4] = {
2650 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2652 static int const ussub_insn[4] = {
2653 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2655 static int const mul_insn[4] = {
2656 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
2658 static int const shift_imm_insn[4] = {
2659 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2661 static int const cmpeq_insn[4] = {
2662 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2664 static int const cmpgt_insn[4] = {
2665 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2667 static int const punpckl_insn[4] = {
2668 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2670 static int const punpckh_insn[4] = {
2671 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2673 static int const packss_insn[4] = {
2674 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2676 static int const packus_insn[4] = {
2677 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2679 static int const smin_insn[4] = {
2680 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
2682 static int const smax_insn[4] = {
2683 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
2685 static int const umin_insn[4] = {
2686 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
2688 static int const umax_insn[4] = {
2689 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
2691 static int const shlv_insn[4] = {
2692 /* TODO: AVX512 adds support for MO_16. */
2693 OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
2695 static int const shrv_insn[4] = {
2696 /* TODO: AVX512 adds support for MO_16. */
2697 OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
2699 static int const sarv_insn[4] = {
2700 /* TODO: AVX512 adds support for MO_16, MO_64. */
2701 OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
2703 static int const shls_insn[4] = {
2704 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2706 static int const shrs_insn[4] = {
2707 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2709 static int const sars_insn[4] = {
2710 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2
2712 static int const abs_insn[4] = {
2713 /* TODO: AVX512 adds support for MO_64. */
2714 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
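/* vecl 0/1/2 selects a 64/128/256-bit vector type, in order. */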
2717 TCGType type = vecl + TCG_TYPE_V64;
2726 case INDEX_op_add_vec:
2727 insn = add_insn[vece];
2729 case INDEX_op_ssadd_vec:
2730 insn = ssadd_insn[vece];
2732 case INDEX_op_usadd_vec:
2733 insn = usadd_insn[vece];
2735 case INDEX_op_sub_vec:
2736 insn = sub_insn[vece];
2738 case INDEX_op_sssub_vec:
2739 insn = sssub_insn[vece];
2741 case INDEX_op_ussub_vec:
2742 insn = ussub_insn[vece];
2744 case INDEX_op_mul_vec:
2745 insn = mul_insn[vece];
2747 case INDEX_op_and_vec:
2750 case INDEX_op_or_vec:
2753 case INDEX_op_xor_vec:
2756 case INDEX_op_smin_vec:
2757 insn = smin_insn[vece];
2759 case INDEX_op_umin_vec:
2760 insn = umin_insn[vece];
2762 case INDEX_op_smax_vec:
2763 insn = smax_insn[vece];
2765 case INDEX_op_umax_vec:
2766 insn = umax_insn[vece];
2768 case INDEX_op_shlv_vec:
2769 insn = shlv_insn[vece];
2771 case INDEX_op_shrv_vec:
2772 insn = shrv_insn[vece];
2774 case INDEX_op_sarv_vec:
2775 insn = sarv_insn[vece];
2777 case INDEX_op_shls_vec:
2778 insn = shls_insn[vece];
2780 case INDEX_op_shrs_vec:
2781 insn = shrs_insn[vece];
2783 case INDEX_op_sars_vec:
2784 insn = sars_insn[vece];
2786 case INDEX_op_x86_punpckl_vec:
2787 insn = punpckl_insn[vece];
2789 case INDEX_op_x86_punpckh_vec:
2790 insn = punpckh_insn[vece];
2792 case INDEX_op_x86_packss_vec:
2793 insn = packss_insn[vece];
2795 case INDEX_op_x86_packus_vec:
2796 insn = packus_insn[vece];
2798 #if TCG_TARGET_REG_BITS == 32
2799 case INDEX_op_dup2_vec:
2800 /* First merge the two 32-bit inputs to a single 64-bit element. */
2801 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
2802 /* Then replicate the 64-bit elements across the rest of the vector. */
2803 if (type != TCG_TYPE_V64) {
2804 tcg_out_dup_vec(s, type, MO_64, a0, a0);
2808 case INDEX_op_abs_vec:
2809 insn = abs_insn[vece];
2814 tcg_debug_assert(insn != OPC_UD2);
2815 if (type == TCG_TYPE_V256) {
2818 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2821 case INDEX_op_cmp_vec:
2823 if (sub == TCG_COND_EQ) {
2824 insn = cmpeq_insn[vece];
2825 } else if (sub == TCG_COND_GT) {
2826 insn = cmpgt_insn[vece];
2828 g_assert_not_reached();
2832 case INDEX_op_andc_vec:
2834 if (type == TCG_TYPE_V256) {
2837 tcg_out_vex_modrm(s, insn, a0, a2, a1);
2840 case INDEX_op_shli_vec:
2843 case INDEX_op_shri_vec:
2846 case INDEX_op_sari_vec:
2847 tcg_debug_assert(vece != MO_64);
2850 tcg_debug_assert(vece != MO_8);
2851 insn = shift_imm_insn[vece];
2852 if (type == TCG_TYPE_V256) {
2855 tcg_out_vex_modrm(s, insn, sub, a0, a1);
2859 case INDEX_op_ld_vec:
2860 tcg_out_ld(s, type, a0, a1, a2);
2862 case INDEX_op_st_vec:
2863 tcg_out_st(s, type, a0, a1, a2);
2865 case INDEX_op_dupm_vec:
2866 tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
2869 case INDEX_op_x86_shufps_vec:
2873 case INDEX_op_x86_blend_vec:
2874 if (vece == MO_16) {
2876 } else if (vece == MO_32) {
2877 insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2879 g_assert_not_reached();
2883 case INDEX_op_x86_vperm2i128_vec:
2884 insn = OPC_VPERM2I128;
2888 if (type == TCG_TYPE_V256) {
2891 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2895 case INDEX_op_x86_vpblendvb_vec:
2896 insn = OPC_VPBLENDVB;
2897 if (type == TCG_TYPE_V256) {
2900 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2901 tcg_out8(s, args[3] << 4);
2904 case INDEX_op_x86_psrldq_vec:
2905 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
2909 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */
2910 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */
2912 g_assert_not_reached();
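/*
 * Return the register-constraint set for each opcode.  The
 * C_Ox_Iy() forms name x output and y input constraints: r is any
 * general register, q one with a byte-addressable low part, x a
 * vector register, L a general register excluding those reserved
 * for the softmmu slow path, and a digit ties an input to the
 * same register as that output.
 */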
2916 static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
2919 case INDEX_op_goto_ptr:
2922 case INDEX_op_ld8u_i32:
2923 case INDEX_op_ld8u_i64:
2924 case INDEX_op_ld8s_i32:
2925 case INDEX_op_ld8s_i64:
2926 case INDEX_op_ld16u_i32:
2927 case INDEX_op_ld16u_i64:
2928 case INDEX_op_ld16s_i32:
2929 case INDEX_op_ld16s_i64:
2930 case INDEX_op_ld_i32:
2931 case INDEX_op_ld32u_i64:
2932 case INDEX_op_ld32s_i64:
2933 case INDEX_op_ld_i64:
2934 return C_O1_I1(r, r);
2936 case INDEX_op_st8_i32:
2937 case INDEX_op_st8_i64:
2938 return C_O0_I2(qi, r);
2940 case INDEX_op_st16_i32:
2941 case INDEX_op_st16_i64:
2942 case INDEX_op_st_i32:
2943 case INDEX_op_st32_i64:
2944 return C_O0_I2(ri, r);
2946 case INDEX_op_st_i64:
2947 return C_O0_I2(re, r);
2949 case INDEX_op_add_i32:
2950 case INDEX_op_add_i64:
2951 return C_O1_I2(r, r, re);
2953 case INDEX_op_sub_i32:
2954 case INDEX_op_sub_i64:
2955 case INDEX_op_mul_i32:
2956 case INDEX_op_mul_i64:
2957 case INDEX_op_or_i32:
2958 case INDEX_op_or_i64:
2959 case INDEX_op_xor_i32:
2960 case INDEX_op_xor_i64:
2961 return C_O1_I2(r, 0, re);
2963 case INDEX_op_and_i32:
2964 case INDEX_op_and_i64:
2965 return C_O1_I2(r, 0, reZ);
2967 case INDEX_op_andc_i32:
2968 case INDEX_op_andc_i64:
2969 return C_O1_I2(r, r, rI);
2971 case INDEX_op_shl_i32:
2972 case INDEX_op_shl_i64:
2973 case INDEX_op_shr_i32:
2974 case INDEX_op_shr_i64:
2975 case INDEX_op_sar_i32:
2976 case INDEX_op_sar_i64:
2977 return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
2979 case INDEX_op_rotl_i32:
2980 case INDEX_op_rotl_i64:
2981 case INDEX_op_rotr_i32:
2982 case INDEX_op_rotr_i64:
2983 return C_O1_I2(r, 0, ci);
2985 case INDEX_op_brcond_i32:
2986 case INDEX_op_brcond_i64:
2987 return C_O0_I2(r, re);
2989 case INDEX_op_bswap16_i32:
2990 case INDEX_op_bswap16_i64:
2991 case INDEX_op_bswap32_i32:
2992 case INDEX_op_bswap32_i64:
2993 case INDEX_op_bswap64_i64:
2994 case INDEX_op_neg_i32:
2995 case INDEX_op_neg_i64:
2996 case INDEX_op_not_i32:
2997 case INDEX_op_not_i64:
2998 case INDEX_op_extrh_i64_i32:
2999 return C_O1_I1(r, 0);
3001 case INDEX_op_ext8s_i32:
3002 case INDEX_op_ext8s_i64:
3003 case INDEX_op_ext8u_i32:
3004 case INDEX_op_ext8u_i64:
3005 return C_O1_I1(r, q);
3007 case INDEX_op_ext16s_i32:
3008 case INDEX_op_ext16s_i64:
3009 case INDEX_op_ext16u_i32:
3010 case INDEX_op_ext16u_i64:
3011 case INDEX_op_ext32s_i64:
3012 case INDEX_op_ext32u_i64:
3013 case INDEX_op_ext_i32_i64:
3014 case INDEX_op_extu_i32_i64:
3015 case INDEX_op_extrl_i64_i32:
3016 case INDEX_op_extract_i32:
3017 case INDEX_op_extract_i64:
3018 case INDEX_op_sextract_i32:
3019 case INDEX_op_ctpop_i32:
3020 case INDEX_op_ctpop_i64:
3021 return C_O1_I1(r, r);
3023 case INDEX_op_extract2_i32:
3024 case INDEX_op_extract2_i64:
3025 return C_O1_I2(r, 0, r);
3027 case INDEX_op_deposit_i32:
3028 case INDEX_op_deposit_i64:
3029 return C_O1_I2(Q, 0, Q);
3031 case INDEX_op_setcond_i32:
3032 case INDEX_op_setcond_i64:
3033 return C_O1_I2(q, r, re);
3035 case INDEX_op_movcond_i32:
3036 case INDEX_op_movcond_i64:
3037 return C_O1_I4(r, r, re, r, 0);
3039 case INDEX_op_div2_i32:
3040 case INDEX_op_div2_i64:
3041 case INDEX_op_divu2_i32:
3042 case INDEX_op_divu2_i64:
3043 return C_O2_I3(a, d, 0, 1, r);
3045 case INDEX_op_mulu2_i32:
3046 case INDEX_op_mulu2_i64:
3047 case INDEX_op_muls2_i32:
3048 case INDEX_op_muls2_i64:
3049 return C_O2_I2(a, d, a, r);
3051 case INDEX_op_add2_i32:
3052 case INDEX_op_add2_i64:
3053 case INDEX_op_sub2_i32:
3054 case INDEX_op_sub2_i64:
3055 return C_O2_I4(r, r, 0, 1, re, re);
3057 case INDEX_op_ctz_i32:
3058 case INDEX_op_ctz_i64:
3059 return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3061 case INDEX_op_clz_i32:
3062 case INDEX_op_clz_i64:
3063 return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3065 case INDEX_op_qemu_ld_i32:
3066 return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3067 ? C_O1_I1(r, L) : C_O1_I2(r, L, L));
3069 case INDEX_op_qemu_st_i32:
3070 return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3071 ? C_O0_I2(L, L) : C_O0_I3(L, L, L));
3072 case INDEX_op_qemu_st8_i32:
3073 return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3074 ? C_O0_I2(s, L) : C_O0_I3(s, L, L));
3076 case INDEX_op_qemu_ld_i64:
3077 return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
3078 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L)
3079 : C_O2_I2(r, r, L, L));
3081 case INDEX_op_qemu_st_i64:
3082 return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L)
3083 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(L, L, L)
3084 : C_O0_I4(L, L, L, L));
3086 case INDEX_op_brcond2_i32:
3087 return C_O0_I4(r, r, ri, ri);
3089 case INDEX_op_setcond2_i32:
3090 return C_O1_I4(r, r, r, ri, ri);
3092 case INDEX_op_ld_vec:
3093 case INDEX_op_dupm_vec:
3094 return C_O1_I1(x, r);
3096 case INDEX_op_st_vec:
3097 return C_O0_I2(x, r);
3099 case INDEX_op_add_vec:
3100 case INDEX_op_sub_vec:
3101 case INDEX_op_mul_vec:
3102 case INDEX_op_and_vec:
3103 case INDEX_op_or_vec:
3104 case INDEX_op_xor_vec:
3105 case INDEX_op_andc_vec:
3106 case INDEX_op_ssadd_vec:
3107 case INDEX_op_usadd_vec:
3108 case INDEX_op_sssub_vec:
3109 case INDEX_op_ussub_vec:
3110 case INDEX_op_smin_vec:
3111 case INDEX_op_umin_vec:
3112 case INDEX_op_smax_vec:
3113 case INDEX_op_umax_vec:
3114 case INDEX_op_shlv_vec:
3115 case INDEX_op_shrv_vec:
3116 case INDEX_op_sarv_vec:
3117 case INDEX_op_shls_vec:
3118 case INDEX_op_shrs_vec:
3119 case INDEX_op_sars_vec:
3120 case INDEX_op_rotls_vec:
3121 case INDEX_op_cmp_vec:
3122 case INDEX_op_x86_shufps_vec:
3123 case INDEX_op_x86_blend_vec:
3124 case INDEX_op_x86_packss_vec:
3125 case INDEX_op_x86_packus_vec:
3126 case INDEX_op_x86_vperm2i128_vec:
3127 case INDEX_op_x86_punpckl_vec:
3128 case INDEX_op_x86_punpckh_vec:
3129 #if TCG_TARGET_REG_BITS == 32
3130 case INDEX_op_dup2_vec:
3132 return C_O1_I2(x, x, x);
3134 case INDEX_op_abs_vec:
3135 case INDEX_op_dup_vec:
3136 case INDEX_op_shli_vec:
3137 case INDEX_op_shri_vec:
3138 case INDEX_op_sari_vec:
3139 case INDEX_op_x86_psrldq_vec:
3140 return C_O1_I1(x, x);
3142 case INDEX_op_x86_vpblendvb_vec:
3143 return C_O1_I3(x, x, x, x);
3146 g_assert_not_reached();
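/*
 * Report what the backend can do with a given vector opcode:
 * 1 if it is supported directly, 0 if not at all, and -1 if it
 * is supported only via expansion through tcg_expand_vec_op().
 */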
3150 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3153 case INDEX_op_add_vec:
3154 case INDEX_op_sub_vec:
3155 case INDEX_op_and_vec:
3156 case INDEX_op_or_vec:
3157 case INDEX_op_xor_vec:
3158 case INDEX_op_andc_vec:
3160 case INDEX_op_rotli_vec:
3161 case INDEX_op_cmp_vec:
3162 case INDEX_op_cmpsel_vec:
3165 case INDEX_op_shli_vec:
3166 case INDEX_op_shri_vec:
3167 /* We must expand the operation for MO_8. */
3168 return vece == MO_8 ? -1 : 1;
3170 case INDEX_op_sari_vec:
3171 /* We must expand the operation for MO_8. */
3175 /* We can emulate this for MO_64, but it does not pay off
3176 unless we're producing at least 4 values. */
3177 if (vece == MO_64) {
3178 return type >= TCG_TYPE_V256 ? -1 : 0;
3182 case INDEX_op_shls_vec:
3183 case INDEX_op_shrs_vec:
3184 return vece >= MO_16;
3185 case INDEX_op_sars_vec:
3186 return vece >= MO_16 && vece <= MO_32;
3187 case INDEX_op_rotls_vec:
3188 return vece >= MO_16 ? -1 : 0;
3190 case INDEX_op_shlv_vec:
3191 case INDEX_op_shrv_vec:
3192 return have_avx2 && vece >= MO_32;
3193 case INDEX_op_sarv_vec:
3194 return have_avx2 && vece == MO_32;
3195 case INDEX_op_rotlv_vec:
3196 case INDEX_op_rotrv_vec:
3197 return have_avx2 && vece >= MO_32 ? -1 : 0;
3199 case INDEX_op_mul_vec:
3201 /* We can expand the operation for MO_8. */
3204 if (vece == MO_64) {
3209 case INDEX_op_ssadd_vec:
3210 case INDEX_op_usadd_vec:
3211 case INDEX_op_sssub_vec:
3212 case INDEX_op_ussub_vec:
3213 return vece <= MO_16;
3214 case INDEX_op_smin_vec:
3215 case INDEX_op_smax_vec:
3216 case INDEX_op_umin_vec:
3217 case INDEX_op_umax_vec:
3218 case INDEX_op_abs_vec:
3219 return vece <= MO_32;
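/*
 * Expand a shift-by-immediate of byte elements, which has no
 * direct SSE/AVX encoding, in terms of 16-bit element shifts.
 */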
3226 static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3227 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3231 tcg_debug_assert(vece == MO_8);
3233 t1 = tcg_temp_new_vec(type);
3234 t2 = tcg_temp_new_vec(type);
3237 * Unpack to W, shift, and repack. Tricky bits:
3238 * (1) Use punpck*bw x,x to produce DDCCBBAA,
3239 * i.e. duplicate in other half of the 16-bit lane.
3240 * (2) For right-shift, add 8 so that the high half of the lane
3241 * becomes zero. For left-shift, and left-rotate, we must
3242 * shift up and down again.
3243 * (3) Step 2 leaves high half zero such that PACKUSWB
 *     (pack with unsigned saturation) does not modify the quantity.
 */
3247 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3248 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3249 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3250 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3252 if (opc != INDEX_op_rotli_vec) {
3255 if (opc == INDEX_op_shri_vec) {
3256 tcg_gen_shri_vec(MO_16, t1, t1, imm);
3257 tcg_gen_shri_vec(MO_16, t2, t2, imm);
3259 tcg_gen_shli_vec(MO_16, t1, t1, imm);
3260 tcg_gen_shli_vec(MO_16, t2, t2, imm);
3261 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3262 tcg_gen_shri_vec(MO_16, t2, t2, 8);
3265 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3266 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3267 tcg_temp_free_vec(t1);
3268 tcg_temp_free_vec(t2);
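/*
 * Expand arithmetic right shift by immediate for the element sizes
 * with no direct encoding: widen MO_8 to 16-bit lanes, and
 * synthesize MO_64 (PSRAQ is AVX512-only) from the available 32-bit
 * arithmetic and 64-bit logical shifts, or from a compare against
 * zero.
 */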
3271 static void expand_vec_sari(TCGType type, unsigned vece,
3272 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3278 /* Unpack to W, shift, and repack, as in expand_vec_shi. */
3279 t1 = tcg_temp_new_vec(type);
3280 t2 = tcg_temp_new_vec(type);
3281 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3282 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3283 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3284 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3285 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3286 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3287 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3288 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3289 tcg_temp_free_vec(t1);
3290 tcg_temp_free_vec(t2);
3296 * We can emulate a small sign extend by performing an arithmetic
3297 * 32-bit shift and overwriting the high half of a 64-bit logical
3298 * shift. Note that the ISA says shift of 32 is valid, but TCG
3299 * does not, so we have to bound the smaller shift -- we get the
3300 * same result in the high half either way.
3302 t1 = tcg_temp_new_vec(type);
3303 tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3304 tcg_gen_shri_vec(MO_64, v0, v1, imm);
3305 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3306 tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3307 tcgv_vec_arg(t1), 0xaa);
3308 tcg_temp_free_vec(t1);
3310 /* Otherwise we will need to use a compare vs 0 to produce
3311 * the sign-extend, shift and merge.
3313 t1 = tcg_const_zeros_vec(type);
3314 tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
3315 tcg_gen_shri_vec(MO_64, v0, v1, imm);
3316 tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3317 tcg_gen_or_vec(MO_64, v0, v0, t1);
3318 tcg_temp_free_vec(t1);
3323 g_assert_not_reached();
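/*
 * Expand rotate-left by immediate.  MO_8 reuses the byte shift
 * expansion above; larger elements use the identity
 * rotl(x, i) = (x << i) | (x >> (width - i)).
 */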
3327 static void expand_vec_rotli(TCGType type, unsigned vece,
3328 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3333 expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3337 t = tcg_temp_new_vec(type);
3338 tcg_gen_shli_vec(vece, t, v1, imm);
3339 tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3340 tcg_gen_or_vec(vece, v0, v0, t);
3341 tcg_temp_free_vec(t);
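/*
 * Expand rotate-left by a scalar count: combine a left shift by
 * lsh with a right shift by (-lsh) mod width.  The mask keeps the
 * right-shift count within the range TCG requires.
 */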
3344 static void expand_vec_rotls(TCGType type, unsigned vece,
3345 TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3350 tcg_debug_assert(vece != MO_8);
3352 t = tcg_temp_new_vec(type);
3353 rsh = tcg_temp_new_i32();
3355 tcg_gen_neg_i32(rsh, lsh);
3356 tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3357 tcg_gen_shls_vec(vece, t, v1, lsh);
3358 tcg_gen_shrs_vec(vece, v0, v1, rsh);
3359 tcg_gen_or_vec(vece, v0, v0, t);
3360 tcg_temp_free_vec(t);
3361 tcg_temp_free_i32(rsh);
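/*
 * Expand a rotate by per-element counts:
 * rotl: v0 = (v1 << sh) | (v1 >> (width - sh)), and the mirror
 * image for rotr.  Relies on the AVX2 variable shifts, per
 * tcg_can_emit_vec_op() above.
 */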
3364 static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3365 TCGv_vec v1, TCGv_vec sh, bool right)
3367 TCGv_vec t = tcg_temp_new_vec(type);
3369 tcg_gen_dupi_vec(vece, t, 8 << vece);
3370 tcg_gen_sub_vec(vece, t, t, sh);
3372 tcg_gen_shlv_vec(vece, t, v1, t);
3373 tcg_gen_shrv_vec(vece, v0, v1, sh);
3375 tcg_gen_shrv_vec(vece, t, v1, t);
3376 tcg_gen_shlv_vec(vece, v0, v1, sh);
3378 tcg_gen_or_vec(vece, v0, v0, t);
3379 tcg_temp_free_vec(t);
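/* Expand a multiply of byte elements by widening to 16-bit lanes. */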
3382 static void expand_vec_mul(TCGType type, unsigned vece,
3383 TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3385 TCGv_vec t1, t2, t3, t4, zero;
3387 tcg_debug_assert(vece == MO_8);
3390 * Unpack v1 bytes to words, 0 | x.
3391 * Unpack v2 bytes to words, y | 0.
3392 * This leaves the 8-bit result, x * y, with 8 bits of right padding.
 * Shift logical right by 8 bits to clear the high 8 bits before
3394 * using an unsigned saturated pack.
3396 * The difference between the V64, V128 and V256 cases is merely how
3397 * we distribute the expansion between temporaries.
3401 t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3402 t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3403 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3404 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3405 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3406 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3407 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3408 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3409 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3410 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3411 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3412 tcg_temp_free_vec(t1);
3413 tcg_temp_free_vec(t2);
3418 t1 = tcg_temp_new_vec(type);
3419 t2 = tcg_temp_new_vec(type);
3420 t3 = tcg_temp_new_vec(type);
3421 t4 = tcg_temp_new_vec(type);
3422 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3423 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3424 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3425 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3426 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3427 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3428 tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3429 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3430 tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3431 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3432 tcg_gen_mul_vec(MO_16, t3, t3, t4);
3433 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3434 tcg_gen_shri_vec(MO_16, t3, t3, 8);
3435 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3436 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3437 tcg_temp_free_vec(t1);
3438 tcg_temp_free_vec(t2);
3439 tcg_temp_free_vec(t3);
3440 tcg_temp_free_vec(t4);
3444 g_assert_not_reached();
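/*
 * Lower a vector comparison to the two predicates the ISA provides,
 * PCMPEQ and signed PCMPGT.  The NEED_* fixup flags rewrite the
 * remaining conditions: swap the operands, invert the result, bias
 * both operands by the sign bit so that an unsigned compare becomes
 * a signed one, or compute umin/umax so that an equality test
 * expresses LEU/GEU.  Returns true if the caller must still invert
 * the computed result.
 */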
3448 static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3449 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3458 TCGv_vec t1, t2, t3;
3474 fixup = NEED_SWAP | NEED_INV;
3477 if (vece <= MO_32) {
3480 fixup = NEED_BIAS | NEED_INV;
3484 if (vece <= MO_32) {
3485 fixup = NEED_UMIN | NEED_INV;
3491 if (vece <= MO_32) {
3494 fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3498 if (vece <= MO_32) {
3499 fixup = NEED_UMAX | NEED_INV;
3501 fixup = NEED_BIAS | NEED_SWAP;
3505 g_assert_not_reached();
3508 if (fixup & NEED_INV) {
3509 cond = tcg_invert_cond(cond);
3511 if (fixup & NEED_SWAP) {
3512 t1 = v1, v1 = v2, v2 = t1;
3513 cond = tcg_swap_cond(cond);
3517 if (fixup & (NEED_UMIN | NEED_UMAX)) {
3518 t1 = tcg_temp_new_vec(type);
3519 if (fixup & NEED_UMIN) {
3520 tcg_gen_umin_vec(vece, t1, v1, v2);
3522 tcg_gen_umax_vec(vece, t1, v1, v2);
3526 } else if (fixup & NEED_BIAS) {
3527 t1 = tcg_temp_new_vec(type);
3528 t2 = tcg_temp_new_vec(type);
3529 t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
3530 tcg_gen_sub_vec(vece, t1, v1, t3);
3531 tcg_gen_sub_vec(vece, t2, v2, t3);
3534 cond = tcg_signed_cond(cond);
3537 tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3538 /* Expand directly; do not recurse. */
3539 vec_gen_4(INDEX_op_cmp_vec, type, vece,
3540 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3543 tcg_temp_free_vec(t1);
3545 tcg_temp_free_vec(t2);
3548 return fixup & NEED_INV;
3551 static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3552 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3554 if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3555 tcg_gen_not_vec(vece, v0, v0);
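/*
 * Expand cmpsel as a comparison feeding VPBLENDVB: the compare
 * yields all-ones or all-zero elements, which the blend then uses
 * as a byte mask to select between v3 and v4.
 */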
3559 static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3560 TCGv_vec c1, TCGv_vec c2,
3561 TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3563 TCGv_vec t = tcg_temp_new_vec(type);
3565 if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
/* The compare result must be inverted; compensate by swapping the
   data arguments of the select. */
3568 x = v3, v3 = v4, v4 = x;
3570 vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3571 tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3572 tcgv_vec_arg(v3), tcgv_vec_arg(t));
3573 tcg_temp_free_vec(t);
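/*
 * Entry point from the generic expander for every opcode that
 * tcg_can_emit_vec_op() reported as -1: unpack the operands from
 * the va_list and dispatch to the expanders above.
 */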
3576 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3581 TCGv_vec v0, v1, v2, v3, v4;
3584 v0 = temp_tcgv_vec(arg_temp(a0));
3585 v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3586 a2 = va_arg(va, TCGArg);
3589 case INDEX_op_shli_vec:
3590 case INDEX_op_shri_vec:
3591 expand_vec_shi(type, vece, opc, v0, v1, a2);
3594 case INDEX_op_sari_vec:
3595 expand_vec_sari(type, vece, v0, v1, a2);
3598 case INDEX_op_rotli_vec:
3599 expand_vec_rotli(type, vece, v0, v1, a2);
3602 case INDEX_op_rotls_vec:
3603 expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3606 case INDEX_op_rotlv_vec:
3607 v2 = temp_tcgv_vec(arg_temp(a2));
3608 expand_vec_rotv(type, vece, v0, v1, v2, false);
3610 case INDEX_op_rotrv_vec:
3611 v2 = temp_tcgv_vec(arg_temp(a2));
3612 expand_vec_rotv(type, vece, v0, v1, v2, true);
3615 case INDEX_op_mul_vec:
3616 v2 = temp_tcgv_vec(arg_temp(a2));
3617 expand_vec_mul(type, vece, v0, v1, v2);
3620 case INDEX_op_cmp_vec:
3621 v2 = temp_tcgv_vec(arg_temp(a2));
3622 expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3625 case INDEX_op_cmpsel_vec:
3626 v2 = temp_tcgv_vec(arg_temp(a2));
3627 v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3628 v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3629 expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3639 static const int tcg_target_callee_save_regs[] = {
3640 #if TCG_TARGET_REG_BITS == 64
3649 TCG_REG_R14, /* Currently used for the global env. */
3652 TCG_REG_EBP, /* Currently used for the global env. */
3659 /* Compute frame size via macros, to share between tcg_target_qemu_prologue
3660 and tcg_register_jit. */
3663 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
3664 * (TCG_TARGET_REG_BITS / 8))
3666 #define FRAME_SIZE \
3668 + TCG_STATIC_CALL_ARGS_SIZE \
3669 + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3670 + TCG_TARGET_STACK_ALIGN - 1) \
3671 & ~(TCG_TARGET_STACK_ALIGN - 1))
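/*
 * From high to low addresses, the resulting frame is: the return
 * address and pushed callee-saved registers (PUSH_SIZE bytes),
 * alignment padding, CPU_TEMP_BUF_NLONGS longs of TCG temporary
 * storage, and TCG_STATIC_CALL_ARGS_SIZE bytes for helper-call
 * arguments at the stack pointer.
 */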
3673 /* Generate global QEMU prologue and epilogue code */
3674 static void tcg_target_qemu_prologue(TCGContext *s)
3676 int i, stack_addend;
3680 /* Reserve some stack space, also for TCG temps. */
3681 stack_addend = FRAME_SIZE - PUSH_SIZE;
3682 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3683 CPU_TEMP_BUF_NLONGS * sizeof(long));
3685 /* Save all callee saved registers. */
3686 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3687 tcg_out_push(s, tcg_target_callee_save_regs[i]);
3690 #if TCG_TARGET_REG_BITS == 32
3691 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3692 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3693 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3695 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3696 (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3699 # if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
3701 int seg = setup_guest_base_seg();
3703 x86_guest_base_seg = seg;
3704 } else if (guest_base == (int32_t)guest_base) {
3705 x86_guest_base_offset = guest_base;
3707 /* Choose R12 because, as a base, it requires a SIB byte. */
3708 x86_guest_base_index = TCG_REG_R12;
3709 tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
3710 tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
3714 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3715 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3717 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3721 * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3722 * and fall through to the rest of the epilogue.
3724 tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
3725 tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3728 tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
3730 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3733 tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3735 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3736 tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3738 tcg_out_opc(s, OPC_RET, 0, 0, 0);
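/* Pad with the one-byte NOP, opcode 0x90. */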
3741 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3743 memset(p, 0x90, count);
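/*
 * Probe host CPU features and initialize the register sets.  Each
 * have_* flag probed here gates the use of the corresponding
 * instruction-set extension during code generation.
 */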
3746 static void tcg_target_init(TCGContext *s)
3748 #ifdef CONFIG_CPUID_H
3749 unsigned a, b, c, d, b7 = 0;
3750 int max = __get_cpuid_max(0, 0);
3753 /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */
3754 __cpuid_count(7, 0, a, b7, c, d);
3755 have_bmi1 = (b7 & bit_BMI) != 0;
3756 have_bmi2 = (b7 & bit_BMI2) != 0;
3760 __cpuid(1, a, b, c, d);
/* For 32-bit, it is 99% certain that we're running on hardware
   that supports cmov, but we still need to check.  If cmov is not
   available, we use a small forward branch instead. */
3765 have_cmov = (d & bit_CMOV) != 0;
3768 /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
3769 need to probe for it. */
3770 have_movbe = (c & bit_MOVBE) != 0;
3771 have_popcnt = (c & bit_POPCNT) != 0;
/* There are a number of things we must check before we can be
   sure of not hitting an invalid opcode. */
3775 if (c & bit_OSXSAVE) {
3776 unsigned xcrl, xcrh;
3777 /* The xgetbv instruction is not available to older versions of
3778 * the assembler, so we encode the instruction manually.
3780 asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
3781 if ((xcrl & 6) == 6) {
3782 have_avx1 = (c & bit_AVX) != 0;
3783 have_avx2 = (b7 & bit_AVX2) != 0;
max = __get_cpuid_max(0x80000000, 0);
3790 __cpuid(0x80000001, a, b, c, d);
3791 /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs. */
3792 have_lzcnt = (c & bit_LZCNT) != 0;
3794 #endif /* CONFIG_CPUID_H */
3796 tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3797 if (TCG_TARGET_REG_BITS == 64) {
3798 tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3801 tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3802 tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3805 tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3808 tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
3809 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3810 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3811 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3812 if (TCG_TARGET_REG_BITS == 64) {
3813 #if !defined(_WIN64)
3814 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3815 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3817 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3818 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3819 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3820 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3823 s->reserved_regs = 0;
3824 tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
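/*
 * DWARF CFI description of the frame built by the prologue, so that
 * debuggers can unwind from JIT-generated code back into QEMU;
 * passed to tcg_register_jit() below.
 */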
3829 uint8_t fde_def_cfa[4];
3830 uint8_t fde_reg_ofs[14];
/* We're expecting a 2-byte uleb128 encoded value. */
3834 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
3836 #if !defined(__ELF__)
3837 /* Host machine without ELF. */
3838 #elif TCG_TARGET_REG_BITS == 64
3839 #define ELF_HOST_MACHINE EM_X86_64
3840 static const DebugFrame debug_frame = {
3841 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3844 .h.cie.code_align = 1,
3845 .h.cie.data_align = 0x78, /* sleb128 -8 */
3846 .h.cie.return_column = 16,
3848 /* Total FDE size does not include the "len" member. */
3849 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3852 12, 7, /* DW_CFA_def_cfa %rsp, ... */
3853 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
3857 0x90, 1, /* DW_CFA_offset, %rip, -8 */
3858 /* The following ordering must match tcg_target_callee_save_regs. */
3859 0x86, 2, /* DW_CFA_offset, %rbp, -16 */
3860 0x83, 3, /* DW_CFA_offset, %rbx, -24 */
3861 0x8c, 4, /* DW_CFA_offset, %r12, -32 */
3862 0x8d, 5, /* DW_CFA_offset, %r13, -40 */
3863 0x8e, 6, /* DW_CFA_offset, %r14, -48 */
3864 0x8f, 7, /* DW_CFA_offset, %r15, -56 */
3868 #define ELF_HOST_MACHINE EM_386
3869 static const DebugFrame debug_frame = {
3870 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3873 .h.cie.code_align = 1,
3874 .h.cie.data_align = 0x7c, /* sleb128 -4 */
3875 .h.cie.return_column = 8,
3877 /* Total FDE size does not include the "len" member. */
3878 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3881 12, 4, /* DW_CFA_def_cfa %esp, ... */
3882 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
3886 0x88, 1, /* DW_CFA_offset, %eip, -4 */
3887 /* The following ordering must match tcg_target_callee_save_regs. */
3888 0x85, 2, /* DW_CFA_offset, %ebp, -8 */
3889 0x83, 3, /* DW_CFA_offset, %ebx, -12 */
3890 0x86, 4, /* DW_CFA_offset, %esi, -16 */
3891 0x87, 5, /* DW_CFA_offset, %edi, -20 */
3896 #if defined(ELF_HOST_MACHINE)
3897 void tcg_register_jit(const void *buf, size_t buf_size)
3899 tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));