2 ** IR assembler (SSA IR -> machine code).
3 ** Copyright (C) 2005-2011 Mike Pall. See Copyright Notice in luajit.h
28 #include "lj_dispatch.h"
30 #include "lj_target.h"
32 /* -- Assembler state and common macros ----------------------------------- */
34 /* Assembler state. */
35 typedef struct ASMState
{
36 RegCost cost
[RID_MAX
]; /* Reference and blended allocation cost for regs. */
38 MCode
*mcp
; /* Current MCode pointer (grows down). */
39 MCode
*mclim
; /* Lower limit for MCode memory + red zone. */
41 IRIns
*ir
; /* Copy of pointer to IR instructions/constants. */
42 jit_State
*J
; /* JIT compiler state. */
44 x86ModRM mrm
; /* Fused x86 address operand. */
46 RegSet freeset
; /* Set of free registers. */
47 RegSet modset
; /* Set of registers modified inside the loop. */
48 RegSet weakset
; /* Set of weakly referenced registers. */
49 RegSet phiset
; /* Set of PHI registers. */
51 uint32_t flags
; /* Copy of JIT compiler flags. */
52 int loopinv
; /* Loop branch inversion (0:no, 1:yes, 2:yes+CC_P). */
54 int32_t evenspill
; /* Next even spill slot. */
55 int32_t oddspill
; /* Next odd spill slot (or 0). */
57 IRRef curins
; /* Reference of current instruction. */
58 IRRef stopins
; /* Stop assembly before hitting this instruction. */
59 IRRef orignins
; /* Original T->nins. */
61 IRRef snapref
; /* Current snapshot is active after this reference. */
62 IRRef snaprename
; /* Rename highwater mark for snapshot check. */
63 SnapNo snapno
; /* Current snapshot number. */
64 SnapNo loopsnapno
; /* Loop snapshot number. */
66 IRRef fuseref
; /* Fusion limit (loopref, 0 or FUSE_DISABLED). */
67 IRRef sectref
; /* Section base reference (loopref or 0). */
68 IRRef loopref
; /* Reference of LOOP instruction (or 0). */
70 BCReg topslot
; /* Number of slots for stack check (unless 0). */
71 MSize gcsteps
; /* Accumulated number of GC steps (per section). */
73 GCtrace
*T
; /* Trace to assemble. */
74 GCtrace
*parent
; /* Parent trace (or NULL). */
76 MCode
*mcbot
; /* Bottom of reserved MCode. */
77 MCode
*mctop
; /* Top of generated MCode. */
78 MCode
*mcloop
; /* Pointer to loop MCode (or NULL). */
79 MCode
*invmcp
; /* Points to invertible loop branch (or NULL). */
80 MCode
*testmcp
; /* Pending opportunity to remove test r,r. */
81 MCode
*realign
; /* Realign loop if not NULL. */
83 IRRef1 phireg
[RID_MAX
]; /* PHI register references. */
84 uint16_t parentmap
[LJ_MAX_JSLOTS
]; /* Parent slot to RegSP map. */
87 #define IR(ref) (&as->ir[(ref)])
89 #define ASMREF_TMP1 REF_TRUE /* Temp. register. */
90 #define ASMREF_TMP2 REF_FALSE /* Temp. register. */
91 #define ASMREF_L REF_NIL /* Stores register for L. */
93 /* Check for variant to invariant references. */
94 #define iscrossref(as, ref) ((ref) < as->sectref)
96 /* Inhibit memory op fusion from variant to invariant references. */
97 #define FUSE_DISABLED (~(IRRef)0)
98 #define mayfuse(as, ref) ((ref) > as->fuseref)
99 #define neverfuse(as) (as->fuseref == FUSE_DISABLED)
100 #define opisfusableload(o) \
101 ((o) == IR_ALOAD || (o) == IR_HLOAD || (o) == IR_ULOAD || \
102 (o) == IR_FLOAD || (o) == IR_XLOAD || (o) == IR_SLOAD || (o) == IR_VLOAD)
104 /* Instruction selection for XMM moves. */
105 #define XMM_MOVRR(as) ((as->flags & JIT_F_SPLIT_XMM) ? XO_MOVSD : XO_MOVAPS)
106 #define XMM_MOVRM(as) ((as->flags & JIT_F_SPLIT_XMM) ? XO_MOVLPD : XO_MOVSD)
108 /* Sparse limit checks using a red zone before the actual limit. */
109 #define MCLIM_REDZONE 64
110 #define checkmclim(as) \
111 if (LJ_UNLIKELY(as->mcp < as->mclim)) asm_mclimit(as)
113 static LJ_NORET LJ_NOINLINE
void asm_mclimit(ASMState
*as
)
115 lj_mcode_limiterr(as
->J
, (size_t)(as
->mctop
- as
->mcp
+ 4*MCLIM_REDZONE
));
118 /* -- Emit x86 instructions ----------------------------------------------- */
120 #define MODRM(mode, r1, r2) ((MCode)((mode)+(((r1)&7)<<3)+((r2)&7)))
123 #define REXRB(p, rr, rb) \
124 { MCode rex = 0x40 + (((rr)>>1)&4) + (((rb)>>3)&1); \
125 if (rex != 0x40) *--(p) = rex; }
126 #define FORCE_REX 0x200
127 #define REX_64 (FORCE_REX|0x080000)
129 #define REXRB(p, rr, rb) ((void)0)
134 #define emit_i8(as, i) (*--as->mcp = (MCode)(i))
135 #define emit_i32(as, i) (*(int32_t *)(as->mcp-4) = (i), as->mcp -= 4)
136 #define emit_u32(as, u) (*(uint32_t *)(as->mcp-4) = (u), as->mcp -= 4)
138 #define emit_x87op(as, xo) \
139 (*(uint16_t *)(as->mcp-2) = (uint16_t)(xo), as->mcp -= 2)
142 static LJ_AINLINE MCode
*emit_op(x86Op xo
, Reg rr
, Reg rb
, Reg rx
,
146 #if defined(__GNUC__)
147 if (__builtin_constant_p(xo
) && n
== -2)
148 p
[delta
-2] = (MCode
)(xo
>> 24);
149 else if (__builtin_constant_p(xo
) && n
== -3)
150 *(uint16_t *)(p
+delta
-3) = (uint16_t)(xo
>> 16);
153 *(uint32_t *)(p
+delta
-5) = (uint32_t)xo
;
157 uint32_t rex
= 0x40 + ((rr
>>1)&(4+(FORCE_REX
>>1)))+((rx
>>2)&2)+((rb
>>3)&1);
160 if (n
== -4) { *p
= (MCode
)rex
; rex
= (MCode
)(xo
>> 8); }
161 else if ((xo
& 0xffffff) == 0x6600fd) { *p
= (MCode
)rex
; rex
= 0x66; }
166 UNUSED(rr
); UNUSED(rb
); UNUSED(rx
);
172 #define emit_opm(xo, mode, rr, rb, p, delta) \
173 (p[(delta)-1] = MODRM((mode), (rr), (rb)), \
174 emit_op((xo), (rr), (rb), 0, (p), (delta)))
176 /* op + modrm + sib */
177 #define emit_opmx(xo, mode, scale, rr, rb, rx, p) \
178 (p[-1] = MODRM((scale), (rx), (rb)), \
179 p[-2] = MODRM((mode), (rr), RID_ESP), \
180 emit_op((xo), (rr), (rb), (rx), (p), -1))
183 static void emit_rr(ASMState
*as
, x86Op xo
, Reg r1
, Reg r2
)
186 as
->mcp
= emit_opm(xo
, XM_REG
, r1
, r2
, p
, 0);
189 #if LJ_64 && defined(LUA_USE_ASSERT)
190 /* [addr] is sign-extended in x64 and must be in lower 2G (not 4G). */
191 static int32_t ptr2addr(const void *p
)
193 lua_assert((uintptr_t)p
< (uintptr_t)0x80000000);
197 #define ptr2addr(p) (i32ptr((p)))
201 static void emit_rma(ASMState
*as
, x86Op xo
, Reg rr
, const void *addr
)
204 *(int32_t *)(p
-4) = ptr2addr(addr
);
206 p
[-5] = MODRM(XM_SCALE1
, RID_ESP
, RID_EBP
);
207 as
->mcp
= emit_opm(xo
, XM_OFS0
, rr
, RID_ESP
, p
, -5);
209 as
->mcp
= emit_opm(xo
, XM_OFS0
, rr
, RID_EBP
, p
, -4);
213 /* op r, [base+ofs] */
214 static void emit_rmro(ASMState
*as
, x86Op xo
, Reg rr
, Reg rb
, int32_t ofs
)
219 if (ofs
== 0 && (rb
&7) != RID_EBP
) {
221 } else if (checki8(ofs
)) {
229 if ((rb
&7) == RID_ESP
)
230 *--p
= MODRM(XM_SCALE1
, RID_ESP
, RID_ESP
);
232 *(int32_t *)(p
-4) = ofs
;
234 p
[-5] = MODRM(XM_SCALE1
, RID_ESP
, RID_EBP
);
243 as
->mcp
= emit_opm(xo
, mode
, rr
, rb
, p
, 0);
246 /* op r, [base+idx*scale+ofs] */
247 static void emit_rmrxo(ASMState
*as
, x86Op xo
, Reg rr
, Reg rb
, Reg rx
,
248 x86Mode scale
, int32_t ofs
)
252 if (ofs
== 0 && (rb
&7) != RID_EBP
) {
254 } else if (checki8(ofs
)) {
262 as
->mcp
= emit_opmx(xo
, mode
, scale
, rr
, rb
, rx
, p
);
266 static void emit_gri(ASMState
*as
, x86Group xg
, Reg rb
, int32_t i
)
278 as
->mcp
= emit_opm(xo
, XM_REG
, (Reg
)(xg
& 7) | (rb
& REX_64
), rb
, p
, 0);
281 /* op [base+ofs], i */
282 static void emit_gmroi(ASMState
*as
, x86Group xg
, Reg rb
, int32_t ofs
,
293 emit_rmro(as
, xo
, (Reg
)(xg
& 7), rb
, ofs
);
296 #define emit_shifti(as, xg, r, i) \
297 (emit_i8(as, (i)), emit_rr(as, XO_SHIFTi, (Reg)(xg), (r)))
300 static void emit_mrm(ASMState
*as
, x86Op xo
, Reg rr
, Reg rb
)
303 x86Mode mode
= XM_REG
;
306 if (rb
== RID_NONE
) {
310 *(int32_t *)p
= as
->mrm
.ofs
;
311 if (as
->mrm
.idx
!= RID_NONE
)
314 *--p
= MODRM(XM_SCALE1
, RID_ESP
, RID_EBP
);
318 if (as
->mrm
.ofs
== 0 && (rb
&7) != RID_EBP
) {
320 } else if (checki8(as
->mrm
.ofs
)) {
321 *--p
= (MCode
)as
->mrm
.ofs
;
325 *(int32_t *)p
= as
->mrm
.ofs
;
328 if (as
->mrm
.idx
!= RID_NONE
) {
330 as
->mcp
= emit_opmx(xo
, mode
, as
->mrm
.scale
, rr
, rb
, as
->mrm
.idx
, p
);
333 if ((rb
&7) == RID_ESP
)
334 *--p
= MODRM(XM_SCALE1
, RID_ESP
, RID_ESP
);
337 as
->mcp
= emit_opm(xo
, mode
, rr
, rb
, p
, 0);
340 static void emit_addptr(ASMState
*as
, Reg r
, int32_t ofs
)
343 if ((as
->flags
& JIT_F_LEA_AGU
))
344 emit_rmro(as
, XO_LEA
, r
, r
, ofs
);
346 emit_gri(as
, XG_ARITHi(XOg_ADD
), r
, ofs
);
351 static void emit_gmrmi(ASMState
*as
, x86Group xg
, Reg rb
, int32_t i
)
361 emit_mrm(as
, xo
, (Reg
)(xg
& 7) | (rb
& REX_64
), (rb
& ~REX_64
));
364 /* -- Emit moves ---------------------------------------------------------- */
366 /* mov [base+ofs], i */
367 static void emit_movmroi(ASMState
*as
, Reg base
, int32_t ofs
, int32_t i
)
370 emit_rmro(as
, XO_MOVmi
, 0, base
, ofs
);
373 /* mov [base+ofs], r */
374 #define emit_movtomro(as, r, base, ofs) \
375 emit_rmro(as, XO_MOVto, (r), (base), (ofs))
377 /* Get/set global_State fields. */
378 #define emit_opgl(as, xo, r, field) \
379 emit_rma(as, (xo), (r), (void *)&J2G(as->J)->field)
380 #define emit_getgl(as, r, field) emit_opgl(as, XO_MOV, (r), field)
381 #define emit_setgl(as, r, field) emit_opgl(as, XO_MOVto, (r), field)
382 #define emit_setgli(as, field, i) \
383 (emit_i32(as, i), emit_opgl(as, XO_MOVmi, 0, field))
385 /* mov r, i / xor r, r */
386 static void emit_loadi(ASMState
*as
, Reg r
, int32_t i
)
388 /* XOR r,r is shorter, but modifies the flags. This is bad for HIOP. */
389 if (i
== 0 && !(LJ_32
&& (IR(as
->curins
)->o
== IR_HIOP
||
390 (as
->curins
+1 < as
->T
->nins
&&
391 IR(as
->curins
+1)->o
== IR_HIOP
)))) {
392 emit_rr(as
, XO_ARITH(XOg_XOR
), r
, r
);
395 *(int32_t *)(p
-4) = i
;
396 p
[-5] = (MCode
)(XI_MOVri
+(r
&7));
404 #define emit_loada(as, r, addr) \
405 emit_loadi(as, (r), ptr2addr((addr)))
408 /* mov r, imm64 or shorter 32 bit extended load. */
409 static void emit_loadu64(ASMState
*as
, Reg r
, uint64_t u64
)
411 if (checku32(u64
)) { /* 32 bit load clears upper 32 bits. */
412 emit_loadi(as
, r
, (int32_t)u64
);
413 } else if (checki32((int64_t)u64
)) { /* Sign-extended 32 bit load. */
415 *(int32_t *)(p
-4) = (int32_t)u64
;
416 as
->mcp
= emit_opm(XO_MOVmi
, XM_REG
, REX_64
, r
, p
, -4);
417 } else { /* Full-size 64 bit load. */
419 *(uint64_t *)(p
-8) = u64
;
420 p
[-9] = (MCode
)(XI_MOVri
+(r
&7));
421 p
[-10] = 0x48 + ((r
>>3)&1);
428 /* movsd r, [&tv->n] / xorps r, r */
429 static void emit_loadn(ASMState
*as
, Reg r
, cTValue
*tv
)
431 if (tvispzero(tv
)) /* Use xor only for +0. */
432 emit_rr(as
, XO_XORPS
, r
, r
);
434 emit_rma(as
, XMM_MOVRM(as
), r
, &tv
->n
);
437 /* -- Emit branches ------------------------------------------------------- */
439 /* Label for short jumps. */
440 typedef MCode
*MCLabel
;
442 #if LJ_32 && LJ_HASFFI
443 /* jmp short target */
444 static void emit_sjmp(ASMState
*as
, MCLabel target
)
447 ptrdiff_t delta
= target
- p
;
448 lua_assert(delta
== (int8_t)delta
);
449 p
[-1] = (MCode
)(int8_t)delta
;
455 /* jcc short target */
456 static void emit_sjcc(ASMState
*as
, int cc
, MCLabel target
)
459 ptrdiff_t delta
= target
- p
;
460 lua_assert(delta
== (int8_t)delta
);
461 p
[-1] = (MCode
)(int8_t)delta
;
462 p
[-2] = (MCode
)(XI_JCCs
+(cc
&15));
466 /* jcc short (pending target) */
467 static MCLabel
emit_sjcc_label(ASMState
*as
, int cc
)
471 p
[-2] = (MCode
)(XI_JCCs
+(cc
&15));
476 /* Fixup jcc short target. */
477 static void emit_sfixup(ASMState
*as
, MCLabel source
)
479 source
[-1] = (MCode
)(as
->mcp
-source
);
482 /* Return label pointing to current PC. */
483 #define emit_label(as) ((as)->mcp)
485 /* Compute relative 32 bit offset for jump and call instructions. */
486 static LJ_AINLINE
int32_t jmprel(MCode
*p
, MCode
*target
)
488 ptrdiff_t delta
= target
- p
;
489 lua_assert(delta
== (int32_t)delta
);
490 return (int32_t)delta
;
494 static void emit_jcc(ASMState
*as
, int cc
, MCode
*target
)
497 *(int32_t *)(p
-4) = jmprel(p
, target
);
498 p
[-5] = (MCode
)(XI_JCCn
+(cc
&15));
504 static void emit_call_(ASMState
*as
, MCode
*target
)
508 if (target
-p
!= (int32_t)(target
-p
)) {
509 /* Assumes RID_RET is never an argument to calls and always clobbered. */
510 emit_rr(as
, XO_GROUP5
, XOg_CALL
, RID_RET
);
511 emit_loadu64(as
, RID_RET
, (uint64_t)target
);
515 *(int32_t *)(p
-4) = jmprel(p
, target
);
520 #define emit_call(as, f) emit_call_(as, (MCode *)(void *)(f))
522 /* -- Register allocator debugging ---------------------------------------- */
524 /* #define LUAJIT_DEBUG_RA */
526 #ifdef LUAJIT_DEBUG_RA
531 #define RIDNAME(name) #name,
532 static const char *const ra_regname
[] = {
540 static char ra_dbg_buf
[65536];
541 static char *ra_dbg_p
;
542 static char *ra_dbg_merge
;
543 static MCode
*ra_dbg_mcp
;
545 static void ra_dstart(void)
547 ra_dbg_p
= ra_dbg_buf
;
552 static void ra_dflush(void)
554 fwrite(ra_dbg_buf
, 1, (size_t)(ra_dbg_p
-ra_dbg_buf
), stdout
);
558 static void ra_dprintf(ASMState
*as
, const char *fmt
, ...)
563 p
= ra_dbg_mcp
== as
->mcp
? ra_dbg_merge
: ra_dbg_p
;
565 p
+= sprintf(p
, "%08x \e[36m%04d ", (uintptr_t)as
->mcp
, as
->curins
-REF_BIAS
);
567 const char *e
= strchr(fmt
, '$');
568 if (e
== NULL
) break;
569 memcpy(p
, fmt
, (size_t)(e
-fmt
));
572 Reg r
= va_arg(argp
, Reg
) & RID_MASK
;
575 for (q
= ra_regname
[r
]; *q
; q
++)
576 *p
++ = *q
>= 'A' && *q
<= 'Z' ? *q
+ 0x20 : *q
;
581 } else if (e
[1] == 'f' || e
[1] == 'i') {
584 ref
= va_arg(argp
, IRRef
);
586 ref
= va_arg(argp
, IRIns
*) - as
->ir
;
588 p
+= sprintf(p
, "%04d", ref
- REF_BIAS
);
590 p
+= sprintf(p
, "K%03d", REF_BIAS
- ref
);
591 } else if (e
[1] == 's') {
592 uint32_t slot
= va_arg(argp
, uint32_t);
593 p
+= sprintf(p
, "[esp+0x%x]", sps_scale(slot
));
602 *p
++ = '\e'; *p
++ = '['; *p
++ = 'm'; *p
++ = '\n';
603 if (p
> ra_dbg_buf
+sizeof(ra_dbg_buf
)-256) {
604 fwrite(ra_dbg_buf
, 1, (size_t)(p
-ra_dbg_buf
), stdout
);
610 #define RA_DBG_START() ra_dstart()
611 #define RA_DBG_FLUSH() ra_dflush()
612 #define RA_DBG_REF() \
613 do { char *_p = ra_dbg_p; ra_dprintf(as, ""); \
614 ra_dbg_merge = _p; ra_dbg_mcp = as->mcp; } while (0)
615 #define RA_DBGX(x) ra_dprintf x
618 #define RA_DBG_START() ((void)0)
619 #define RA_DBG_FLUSH() ((void)0)
620 #define RA_DBG_REF() ((void)0)
621 #define RA_DBGX(x) ((void)0)
624 /* -- Register allocator -------------------------------------------------- */
626 #define ra_free(as, r) rset_set(as->freeset, (r))
627 #define ra_modified(as, r) rset_set(as->modset, (r))
628 #define ra_weak(as, r) rset_set(as->weakset, (r))
629 #define ra_noweak(as, r) rset_clear(as->weakset, (r))
631 #define ra_used(ir) (ra_hasreg((ir)->r) || ra_hasspill((ir)->s))
633 /* Setup register allocator. */
634 static void ra_setup(ASMState
*as
)
636 /* Initially all regs (except the stack pointer) are free for use. */
637 as
->freeset
= RSET_ALL
;
638 as
->modset
= RSET_EMPTY
;
639 as
->weakset
= RSET_EMPTY
;
640 as
->phiset
= RSET_EMPTY
;
641 memset(as
->phireg
, 0, sizeof(as
->phireg
));
642 memset(as
->cost
, 0, sizeof(as
->cost
));
643 as
->cost
[RID_ESP
] = REGCOST(~0u, 0u);
646 /* Rematerialize constants. */
647 static Reg
ra_rematk(ASMState
*as
, IRIns
*ir
)
650 lua_assert(ra_hasreg(r
) && !ra_hasspill(ir
->s
));
653 ir
->r
= RID_INIT
; /* Do not keep any hint. */
654 RA_DBGX((as
, "remat $i $r", ir
, r
));
655 if (ir
->o
== IR_KNUM
) {
656 emit_loadn(as
, r
, ir_knum(ir
));
657 } else if (ir
->o
== IR_BASE
) {
658 ra_sethint(ir
->r
, RID_BASE
); /* Restore BASE register hint. */
659 emit_getgl(as
, r
, jit_base
);
660 } else if (ir
->o
== IR_KPRI
) { /* REF_NIL stores ASMREF_L register. */
661 lua_assert(irt_isnil(ir
->t
));
662 emit_getgl(as
, r
, jit_L
);
664 } else if (ir
->o
== IR_KINT64
) {
665 emit_loadu64(as
, r
, ir_kint64(ir
)->u64
);
668 lua_assert(ir
->o
== IR_KINT
|| ir
->o
== IR_KGC
||
669 ir
->o
== IR_KPTR
|| ir
->o
== IR_KKPTR
|| ir
->o
== IR_KNULL
);
670 emit_loadi(as
, r
, ir
->i
);
675 /* Force a spill. Allocate a new spill slot if needed. */
676 static int32_t ra_spill(ASMState
*as
, IRIns
*ir
)
678 int32_t slot
= ir
->s
;
679 if (!ra_hasspill(slot
)) {
680 if (irt_is64(ir
->t
)) {
681 slot
= as
->evenspill
;
683 } else if (as
->oddspill
) {
687 slot
= as
->evenspill
;
688 as
->oddspill
= slot
+1;
691 if (as
->evenspill
> 256)
692 lj_trace_err(as
->J
, LJ_TRERR_SPILLOV
);
693 ir
->s
= (uint8_t)slot
;
695 return sps_scale(slot
);
698 /* Release the temporarily allocated register in ASMREF_TMP1/ASMREF_TMP2. */
699 static Reg
ra_releasetmp(ASMState
*as
, IRRef ref
)
703 lua_assert(ra_hasreg(r
) && !ra_hasspill(ir
->s
));
710 /* Use 64 bit operations to handle 64 bit IR types. */
712 #define REX_64IR(ir, r) ((r) + (irt_is64((ir)->t) ? REX_64 : 0))
714 #define REX_64IR(ir, r) (r)
717 /* Generic move between two regs. */
718 static void ra_movrr(ASMState
*as
, IRIns
*ir
, Reg r1
, Reg r2
)
721 if (r1
< RID_MAX_GPR
)
722 emit_rr(as
, XO_MOV
, REX_64IR(ir
, r1
), r2
);
724 emit_rr(as
, XMM_MOVRR(as
), r1
, r2
);
727 /* Restore a register (marked as free). Rematerialize or force a spill. */
728 static Reg
ra_restore(ASMState
*as
, IRRef ref
)
731 if (irref_isk(ref
) || ref
== REF_BASE
) {
732 return ra_rematk(as
, ir
);
734 int32_t ofs
= ra_spill(as
, ir
); /* Force a spill slot. */
736 lua_assert(ra_hasreg(r
));
737 ra_sethint(ir
->r
, r
); /* Keep hint. */
739 if (!rset_test(as
->weakset
, r
)) { /* Only restore non-weak references. */
741 RA_DBGX((as
, "restore $i $r", ir
, r
));
743 emit_rmro(as
, XO_MOV
, REX_64IR(ir
, r
), RID_ESP
, ofs
);
745 emit_rmro(as
, irt_isnum(ir
->t
) ? XMM_MOVRM(as
) : XO_MOVSS
,
752 /* Save a register to a spill slot. */
753 static void ra_save(ASMState
*as
, IRIns
*ir
, Reg r
)
755 RA_DBGX((as
, "save $i $r", ir
, r
));
757 emit_rmro(as
, XO_MOVto
, REX_64IR(ir
, r
), RID_ESP
, sps_scale(ir
->s
));
759 emit_rmro(as
, irt_isnum(ir
->t
) ? XO_MOVSDto
: XO_MOVSSto
,
760 r
, RID_ESP
, sps_scale(ir
->s
));
764 if (LJ_LIKELY(allow&RID2RSET(r)) && as->cost[r] < cost) \
767 /* Evict the register with the lowest cost, forcing a restore. */
768 static Reg
ra_evict(ASMState
*as
, RegSet allow
)
771 RegCost cost
= ~(RegCost
)0;
772 lua_assert(allow
!= RSET_EMPTY
);
773 if (allow
< RID2RSET(RID_MAX_GPR
)) {
774 MINCOST(RID_EAX
);MINCOST(RID_ECX
);MINCOST(RID_EDX
);MINCOST(RID_EBX
);
775 MINCOST(RID_EBP
);MINCOST(RID_ESI
);MINCOST(RID_EDI
);
777 MINCOST(RID_R8D
);MINCOST(RID_R9D
);MINCOST(RID_R10D
);MINCOST(RID_R11D
);
778 MINCOST(RID_R12D
);MINCOST(RID_R13D
);MINCOST(RID_R14D
);MINCOST(RID_R15D
);
781 MINCOST(RID_XMM0
);MINCOST(RID_XMM1
);MINCOST(RID_XMM2
);MINCOST(RID_XMM3
);
782 MINCOST(RID_XMM4
);MINCOST(RID_XMM5
);MINCOST(RID_XMM6
);MINCOST(RID_XMM7
);
784 MINCOST(RID_XMM8
);MINCOST(RID_XMM9
);MINCOST(RID_XMM10
);MINCOST(RID_XMM11
);
785 MINCOST(RID_XMM12
);MINCOST(RID_XMM13
);MINCOST(RID_XMM14
);MINCOST(RID_XMM15
);
788 ref
= regcost_ref(cost
);
789 lua_assert(ref
>= as
->T
->nk
&& ref
< as
->T
->nins
);
790 /* Preferably pick any weak ref instead of a non-weak, non-const ref. */
791 if (!irref_isk(ref
) && (as
->weakset
& allow
)) {
793 if (!rset_test(as
->weakset
, ir
->r
))
794 ref
= regcost_ref(as
->cost
[rset_pickbot((as
->weakset
& allow
))]);
796 return ra_restore(as
, ref
);
799 /* Pick any register (marked as free). Evict on-demand. */
800 static Reg
ra_pick(ASMState
*as
, RegSet allow
)
802 RegSet pick
= as
->freeset
& allow
;
804 return ra_evict(as
, allow
);
806 return rset_picktop(pick
);
809 /* Get a scratch register (marked as free). */
810 static Reg
ra_scratch(ASMState
*as
, RegSet allow
)
812 Reg r
= ra_pick(as
, allow
);
814 RA_DBGX((as
, "scratch $r", r
));
818 /* Evict all registers from a set (if not free). */
819 static void ra_evictset(ASMState
*as
, RegSet drop
)
822 drop
&= ~as
->freeset
;
824 Reg r
= rset_picktop(drop
);
825 ra_restore(as
, regcost_ref(as
->cost
[r
]));
831 /* Evict (rematerialize) all registers allocated to constants. */
832 static void ra_evictk(ASMState
*as
)
834 RegSet work
= ~as
->freeset
& RSET_ALL
;
836 Reg r
= rset_pickbot(work
);
837 IRRef ref
= regcost_ref(as
->cost
[r
]);
838 if (irref_isk(ref
)) {
839 ra_rematk(as
, IR(ref
));
846 /* Allocate a register for ref from the allowed set of registers.
847 ** Note: this function assumes the ref does NOT have a register yet!
848 ** Picks an optimal register, sets the cost and marks the register as non-free.
850 static Reg
ra_allocref(ASMState
*as
, IRRef ref
, RegSet allow
)
853 RegSet pick
= as
->freeset
& allow
;
855 lua_assert(ra_noreg(ir
->r
));
857 /* First check register hint from propagation or PHI. */
858 if (ra_hashint(ir
->r
)) {
859 r
= ra_gethint(ir
->r
);
860 if (rset_test(pick
, r
)) /* Use hint register if possible. */
862 /* Rematerialization is cheaper than missing a hint. */
863 if (rset_test(allow
, r
) && irref_isk(regcost_ref(as
->cost
[r
]))) {
864 ra_rematk(as
, IR(regcost_ref(as
->cost
[r
])));
867 RA_DBGX((as
, "hintmiss $f $r", ref
, r
));
869 /* Invariants should preferably get unmodified registers. */
870 if (ref
< as
->loopref
&& !irt_isphi(ir
->t
)) {
871 if ((pick
& ~as
->modset
))
873 r
= rset_pickbot(pick
); /* Reduce conflicts with inverse allocation. */
876 /* We've got plenty of regs, so get callee-save regs if possible. */
877 if ((pick
& ~RSET_SCRATCH
))
878 pick
&= ~RSET_SCRATCH
;
880 r
= rset_picktop(pick
);
883 r
= ra_evict(as
, allow
);
886 RA_DBGX((as
, "alloc $f $r", ref
, r
));
888 rset_clear(as
->freeset
, r
);
890 as
->cost
[r
] = REGCOST_REF_T(ref
, irt_t(ir
->t
));
894 /* Allocate a register on-demand. */
895 static Reg
ra_alloc1(ASMState
*as
, IRRef ref
, RegSet allow
)
898 /* Note: allow is ignored if the register is already allocated. */
899 if (ra_noreg(r
)) r
= ra_allocref(as
, ref
, allow
);
904 /* Rename register allocation and emit move. */
905 static void ra_rename(ASMState
*as
, Reg down
, Reg up
)
907 IRRef ren
, ref
= regcost_ref(as
->cost
[up
] = as
->cost
[down
]);
911 lua_assert((down
< RID_MAX_GPR
) == (up
< RID_MAX_GPR
));
912 lua_assert(!rset_test(as
->freeset
, down
) && rset_test(as
->freeset
, up
));
913 ra_free(as
, down
); /* 'down' is free ... */
914 ra_modified(as
, down
);
915 rset_clear(as
->freeset
, up
); /* ... and 'up' is now allocated. */
917 RA_DBGX((as
, "rename $f $r $r", regcost_ref(as
->cost
[up
]), down
, up
));
918 ra_movrr(as
, ir
, down
, up
); /* Backwards codegen needs inverse move. */
919 if (!ra_hasspill(IR(ref
)->s
)) { /* Add the rename to the IR. */
920 lj_ir_set(as
->J
, IRT(IR_RENAME
, IRT_NIL
), ref
, as
->snapno
);
921 ren
= tref_ref(lj_ir_emit(as
->J
));
922 as
->ir
= as
->T
->ir
; /* The IR may have been reallocated. */
923 IR(ren
)->r
= (uint8_t)down
;
924 IR(ren
)->s
= SPS_NONE
;
928 /* Pick a destination register (marked as free).
929 ** Caveat: allow is ignored if there's already a destination register.
930 ** Use ra_destreg() to get a specific register.
932 static Reg
ra_dest(ASMState
*as
, IRIns
*ir
, RegSet allow
)
935 if (ra_hasreg(dest
)) {
937 ra_modified(as
, dest
);
939 dest
= ra_scratch(as
, allow
);
941 if (LJ_UNLIKELY(ra_hasspill(ir
->s
))) ra_save(as
, ir
, dest
);
945 /* Force a specific destination register (marked as free). */
946 static void ra_destreg(ASMState
*as
, IRIns
*ir
, Reg r
)
948 Reg dest
= ra_dest(as
, ir
, RID2RSET(r
));
950 ra_scratch(as
, RID2RSET(r
));
951 ra_movrr(as
, ir
, dest
, r
);
955 /* Propagate dest register to left reference. Emit moves as needed.
956 ** This is a required fixup step for all 2-operand machine instructions.
958 static void ra_left(ASMState
*as
, Reg dest
, IRRef lref
)
960 IRIns
*ir
= IR(lref
);
962 if (ra_noreg(left
)) {
963 if (irref_isk(lref
)) {
964 if (ir
->o
== IR_KNUM
) {
965 cTValue
*tv
= ir_knum(ir
);
966 /* FP remat needs a load except for +0. Still better than eviction. */
967 if (tvispzero(tv
) || !(as
->freeset
& RSET_FPR
)) {
968 emit_loadn(as
, dest
, tv
);
972 } else if (ir
->o
== IR_KINT64
) {
973 emit_loadu64(as
, dest
, ir_kint64(ir
)->u64
);
977 lua_assert(ir
->o
== IR_KINT
|| ir
->o
== IR_KGC
||
978 ir
->o
== IR_KPTR
|| ir
->o
== IR_KKPTR
|| ir
->o
== IR_KNULL
);
979 emit_loadi(as
, dest
, ir
->i
);
983 if (!ra_hashint(left
) && !iscrossref(as
, lref
))
984 ra_sethint(ir
->r
, dest
); /* Propagate register hint. */
985 left
= ra_allocref(as
, lref
, dest
< RID_MAX_GPR
? RSET_GPR
: RSET_FPR
);
988 /* Move needed for true 3-operand instruction: y=a+b ==> y=a; y+=b. */
990 /* Use register renaming if dest is the PHI reg. */
991 if (irt_isphi(ir
->t
) && as
->phireg
[dest
] == lref
) {
992 ra_modified(as
, left
);
993 ra_rename(as
, left
, dest
);
995 ra_movrr(as
, ir
, dest
, left
);
1000 /* -- Exit stubs ---------------------------------------------------------- */
1002 /* Generate an exit stub group at the bottom of the reserved MCode memory. */
1003 static MCode
*asm_exitstub_gen(ASMState
*as
, ExitNo group
)
1005 ExitNo i
, groupofs
= (group
*EXITSTUBS_PER_GROUP
) & 0xff;
1006 MCode
*mxp
= as
->mcbot
;
1007 MCode
*mxpstart
= mxp
;
1008 if (mxp
+ (2+2)*EXITSTUBS_PER_GROUP
+8+5 >= as
->mctop
)
1010 /* Push low byte of exitno for each exit stub. */
1011 *mxp
++ = XI_PUSHi8
; *mxp
++ = (MCode
)groupofs
;
1012 for (i
= 1; i
< EXITSTUBS_PER_GROUP
; i
++) {
1013 *mxp
++ = XI_JMPs
; *mxp
++ = (MCode
)((2+2)*(EXITSTUBS_PER_GROUP
- i
) - 2);
1014 *mxp
++ = XI_PUSHi8
; *mxp
++ = (MCode
)(groupofs
+ i
);
1016 /* Push the high byte of the exitno for each exit stub group. */
1017 *mxp
++ = XI_PUSHi8
; *mxp
++ = (MCode
)((group
*EXITSTUBS_PER_GROUP
)>>8);
1018 /* Store DISPATCH at original stack slot 0. Account for the two push ops. */
1020 *mxp
++ = MODRM(XM_OFS8
, 0, RID_ESP
);
1021 *mxp
++ = MODRM(XM_SCALE1
, RID_ESP
, RID_ESP
);
1022 *mxp
++ = 2*sizeof(void *);
1023 *(int32_t *)mxp
= ptr2addr(J2GG(as
->J
)->dispatch
); mxp
+= 4;
1024 /* Jump to exit handler which fills in the ExitState. */
1025 *mxp
++ = XI_JMP
; mxp
+= 4;
1026 *((int32_t *)(mxp
-4)) = jmprel(mxp
, (MCode
*)(void *)lj_vm_exit_handler
);
1027 /* Commit the code for this group (even if assembly fails later on). */
1028 lj_mcode_commitbot(as
->J
, mxp
);
1030 as
->mclim
= as
->mcbot
+ MCLIM_REDZONE
;
1034 /* Setup all needed exit stubs. */
1035 static void asm_exitstub_setup(ASMState
*as
, ExitNo nexits
)
1038 if (nexits
>= EXITSTUBS_PER_GROUP
*LJ_MAX_EXITSTUBGR
)
1039 lj_trace_err(as
->J
, LJ_TRERR_SNAPOV
);
1040 for (i
= 0; i
< (nexits
+EXITSTUBS_PER_GROUP
-1)/EXITSTUBS_PER_GROUP
; i
++)
1041 if (as
->J
->exitstubgroup
[i
] == NULL
)
1042 as
->J
->exitstubgroup
[i
] = asm_exitstub_gen(as
, i
);
1045 /* -- Snapshot and guard handling ----------------------------------------- */
1047 /* Can we rematerialize a KNUM instead of forcing a spill? */
1048 static int asm_snap_canremat(ASMState
*as
)
1051 for (r
= RID_MIN_FPR
; r
< RID_MAX_FPR
; r
++)
1052 if (irref_isk(regcost_ref(as
->cost
[r
])))
1057 /* Allocate registers or spill slots for refs escaping to a snapshot. */
1058 static void asm_snap_alloc(ASMState
*as
)
1060 SnapShot
*snap
= &as
->T
->snap
[as
->snapno
];
1061 SnapEntry
*map
= &as
->T
->snapmap
[snap
->mapofs
];
1062 MSize n
, nent
= snap
->nent
;
1063 for (n
= 0; n
< nent
; n
++) {
1064 IRRef ref
= snap_ref(map
[n
]);
1065 if (!irref_isk(ref
)) {
1066 IRIns
*ir
= IR(ref
);
1068 RegSet allow
= irt_isnum(ir
->t
) ? RSET_FPR
: RSET_GPR
;
1069 /* Get a weak register if we have a free one or can rematerialize. */
1070 if ((as
->freeset
& allow
) ||
1071 (allow
== RSET_FPR
&& asm_snap_canremat(as
))) {
1072 Reg r
= ra_allocref(as
, ref
, allow
); /* Allocate a register. */
1073 if (!irt_isphi(ir
->t
))
1074 ra_weak(as
, r
); /* But mark it as weakly referenced. */
1076 RA_DBGX((as
, "snapreg $f $r", ref
, ir
->r
));
1078 ra_spill(as
, ir
); /* Otherwise force a spill slot. */
1079 RA_DBGX((as
, "snapspill $f $s", ref
, ir
->s
));
1086 /* All guards for a snapshot use the same exitno. This is currently the
1087 ** same as the snapshot number. Since the exact origin of the exit cannot
1088 ** be determined, all guards for the same snapshot must exit with the same
1090 ** A renamed ref which has been used in a prior guard for the same snapshot
1091 ** would cause an inconsistency. The easy way out is to force a spill slot.
1093 static int asm_snap_checkrename(ASMState
*as
, IRRef ren
)
1095 SnapShot
*snap
= &as
->T
->snap
[as
->snapno
];
1096 SnapEntry
*map
= &as
->T
->snapmap
[snap
->mapofs
];
1097 MSize n
, nent
= snap
->nent
;
1098 for (n
= 0; n
< nent
; n
++) {
1099 IRRef ref
= snap_ref(map
[n
]);
1101 IRIns
*ir
= IR(ref
);
1102 ra_spill(as
, ir
); /* Register renamed, so force a spill slot. */
1103 RA_DBGX((as
, "snaprensp $f $s", ref
, ir
->s
));
1104 return 1; /* Found. */
1107 return 0; /* Not found. */
1110 /* Prepare snapshot for next guard instruction. */
1111 static void asm_snap_prep(ASMState
*as
)
1113 if (as
->curins
< as
->snapref
) {
1115 lua_assert(as
->snapno
!= 0);
1117 as
->snapref
= as
->T
->snap
[as
->snapno
].ref
;
1118 } while (as
->curins
< as
->snapref
);
1120 as
->snaprename
= as
->T
->nins
;
1122 /* Process any renames above the highwater mark. */
1123 for (; as
->snaprename
< as
->T
->nins
; as
->snaprename
++) {
1124 IRIns
*ir
= IR(as
->snaprename
);
1125 if (asm_snap_checkrename(as
, ir
->op1
))
1126 ir
->op2
= REF_BIAS
-1; /* Kill rename. */
1131 /* Emit conditional branch to exit for guard.
1132 ** It's important to emit this *after* all registers have been allocated,
1133 ** because rematerializations may invalidate the flags.
1135 static void asm_guardcc(ASMState
*as
, int cc
)
1137 MCode
*target
= exitstub_addr(as
->J
, as
->snapno
);
1139 if (LJ_UNLIKELY(p
== as
->invmcp
)) {
1141 *(int32_t *)(p
+1) = jmprel(p
+5, target
);
1145 emit_sjcc(as
, cc
, target
);
1149 emit_jcc(as
, cc
, target
);
1152 /* -- Memory operand fusion ----------------------------------------------- */
1154 /* Arch-specific field offsets. */
1155 static const uint8_t field_ofs
[IRFL__MAX
+1] = {
1156 #define FLOFS(name, ofs) (uint8_t)(ofs),
1162 /* Limit linear search to this distance. Avoids O(n^2) behavior. */
1163 #define CONFLICT_SEARCH_LIM 31
1165 /* Check if a reference is a signed 32 bit constant. */
1166 static int asm_isk32(ASMState
*as
, IRRef ref
, int32_t *k
)
1168 if (irref_isk(ref
)) {
1169 IRIns
*ir
= IR(ref
);
1170 if (ir
->o
!= IR_KINT64
) {
1173 } else if (checki32((int64_t)ir_kint64(ir
)->u64
)) {
1174 *k
= (int32_t)ir_kint64(ir
)->u64
;
1181 /* Check if there's no conflicting instruction between curins and ref.
1182 ** Also avoid fusing loads if there are multiple references.
1184 static int noconflict(ASMState
*as
, IRRef ref
, IROp conflict
, int noload
)
1187 IRRef i
= as
->curins
;
1188 if (i
> ref
+ CONFLICT_SEARCH_LIM
)
1189 return 0; /* Give up, ref is too far away. */
1191 if (ir
[i
].o
== conflict
)
1192 return 0; /* Conflict found. */
1193 else if (!noload
&& (ir
[i
].op1
== ref
|| ir
[i
].op2
== ref
))
1196 return 1; /* Ok, no conflict. */
1199 /* Fuse array base into memory operand. */
1200 static IRRef
asm_fuseabase(ASMState
*as
, IRRef ref
)
1202 IRIns
*irb
= IR(ref
);
1204 if (irb
->o
== IR_FLOAD
) {
1205 IRIns
*ira
= IR(irb
->op1
);
1206 lua_assert(irb
->op2
== IRFL_TAB_ARRAY
);
1207 /* We can avoid the FLOAD of t->array for colocated arrays. */
1208 if (ira
->o
== IR_TNEW
&& ira
->op1
<= LJ_MAX_COLOSIZE
&&
1209 noconflict(as
, irb
->op1
, IR_NEWREF
, 1)) {
1210 as
->mrm
.ofs
= (int32_t)sizeof(GCtab
); /* Ofs to colocated array. */
1211 return irb
->op1
; /* Table obj. */
1213 } else if (irb
->o
== IR_ADD
&& irref_isk(irb
->op2
)) {
1214 /* Fuse base offset (vararg load). */
1215 as
->mrm
.ofs
= IR(irb
->op2
)->i
;
1218 return ref
; /* Otherwise use the given array base. */
1221 /* Fuse array reference into memory operand. */
1222 static void asm_fusearef(ASMState
*as
, IRIns
*ir
, RegSet allow
)
1225 lua_assert(ir
->o
== IR_AREF
);
1226 as
->mrm
.base
= (uint8_t)ra_alloc1(as
, asm_fuseabase(as
, ir
->op1
), allow
);
1228 if (irref_isk(ir
->op2
)) {
1229 as
->mrm
.ofs
+= 8*irx
->i
;
1230 as
->mrm
.idx
= RID_NONE
;
1232 rset_clear(allow
, as
->mrm
.base
);
1233 as
->mrm
.scale
= XM_SCALE8
;
1234 /* Fuse a constant ADD (e.g. t[i+1]) into the offset.
1235 ** Doesn't help much without ABCelim, but reduces register pressure.
1237 if (!LJ_64
&& /* Has bad effects with negative index on x64. */
1238 mayfuse(as
, ir
->op2
) && ra_noreg(irx
->r
) &&
1239 irx
->o
== IR_ADD
&& irref_isk(irx
->op2
)) {
1240 as
->mrm
.ofs
+= 8*IR(irx
->op2
)->i
;
1241 as
->mrm
.idx
= (uint8_t)ra_alloc1(as
, irx
->op1
, allow
);
1243 as
->mrm
.idx
= (uint8_t)ra_alloc1(as
, ir
->op2
, allow
);
1248 /* Fuse array/hash/upvalue reference into memory operand.
1249 ** Caveat: this may allocate GPRs for the base/idx registers. Be sure to
1250 ** pass the final allow mask, excluding any GPRs used for other inputs.
1251 ** In particular: 2-operand GPR instructions need to call ra_dest() first!
1253 static void asm_fuseahuref(ASMState
*as
, IRRef ref
, RegSet allow
)
1255 IRIns
*ir
= IR(ref
);
1256 if (ra_noreg(ir
->r
)) {
1257 switch ((IROp
)ir
->o
) {
1259 if (mayfuse(as
, ref
)) {
1260 asm_fusearef(as
, ir
, allow
);
1265 if (mayfuse(as
, ref
)) {
1266 as
->mrm
.base
= (uint8_t)ra_alloc1(as
, ir
->op1
, allow
);
1267 as
->mrm
.ofs
= (int32_t)(IR(ir
->op2
)->op2
* sizeof(Node
));
1268 as
->mrm
.idx
= RID_NONE
;
1273 if (irref_isk(ir
->op1
)) {
1274 GCfunc
*fn
= ir_kfunc(IR(ir
->op1
));
1275 GCupval
*uv
= &gcref(fn
->l
.uvptr
[(ir
->op2
>> 8)])->uv
;
1276 as
->mrm
.ofs
= ptr2addr(&uv
->tv
);
1277 as
->mrm
.base
= as
->mrm
.idx
= RID_NONE
;
1282 lua_assert(ir
->o
== IR_HREF
|| ir
->o
== IR_NEWREF
|| ir
->o
== IR_UREFO
);
1286 as
->mrm
.base
= (uint8_t)ra_alloc1(as
, ref
, allow
);
1288 as
->mrm
.idx
= RID_NONE
;
1291 /* Fuse FLOAD/FREF reference into memory operand. */
1292 static void asm_fusefref(ASMState
*as
, IRIns
*ir
, RegSet allow
)
1294 lua_assert(ir
->o
== IR_FLOAD
|| ir
->o
== IR_FREF
);
1295 as
->mrm
.ofs
= field_ofs
[ir
->op2
];
1296 as
->mrm
.idx
= RID_NONE
;
1297 if (irref_isk(ir
->op1
)) {
1298 as
->mrm
.ofs
+= IR(ir
->op1
)->i
;
1299 as
->mrm
.base
= RID_NONE
;
1301 as
->mrm
.base
= (uint8_t)ra_alloc1(as
, ir
->op1
, allow
);
1305 /* Fuse string reference into memory operand. */
1306 static void asm_fusestrref(ASMState
*as
, IRIns
*ir
, RegSet allow
)
1309 lua_assert(ir
->o
== IR_STRREF
);
1310 as
->mrm
.base
= as
->mrm
.idx
= RID_NONE
;
1311 as
->mrm
.scale
= XM_SCALE1
;
1312 as
->mrm
.ofs
= sizeof(GCstr
);
1313 if (irref_isk(ir
->op1
)) {
1314 as
->mrm
.ofs
+= IR(ir
->op1
)->i
;
1316 Reg r
= ra_alloc1(as
, ir
->op1
, allow
);
1317 rset_clear(allow
, r
);
1318 as
->mrm
.base
= (uint8_t)r
;
1321 if (irref_isk(ir
->op2
)) {
1322 as
->mrm
.ofs
+= irr
->i
;
1325 /* Fuse a constant add into the offset, e.g. string.sub(s, i+10). */
1326 if (!LJ_64
&& /* Has bad effects with negative index on x64. */
1327 mayfuse(as
, ir
->op2
) && irr
->o
== IR_ADD
&& irref_isk(irr
->op2
)) {
1328 as
->mrm
.ofs
+= IR(irr
->op2
)->i
;
1329 r
= ra_alloc1(as
, irr
->op1
, allow
);
1331 r
= ra_alloc1(as
, ir
->op2
, allow
);
1333 if (as
->mrm
.base
== RID_NONE
)
1334 as
->mrm
.base
= (uint8_t)r
;
1336 as
->mrm
.idx
= (uint8_t)r
;
1340 static void asm_fusexref(ASMState
*as
, IRRef ref
, RegSet allow
)
1342 IRIns
*ir
= IR(ref
);
1343 as
->mrm
.idx
= RID_NONE
;
1344 if (ir
->o
== IR_KPTR
|| ir
->o
== IR_KKPTR
) {
1345 as
->mrm
.ofs
= ir
->i
;
1346 as
->mrm
.base
= RID_NONE
;
1347 } else if (ir
->o
== IR_STRREF
) {
1348 asm_fusestrref(as
, ir
, allow
);
1351 if (mayfuse(as
, ref
) && ir
->o
== IR_ADD
&& ra_noreg(ir
->r
)) {
1352 /* Gather (base+idx*sz)+ofs as emitted by cdata ptr/array indexing. */
1356 if (asm_isk32(as
, ir
->op2
, &as
->mrm
.ofs
)) { /* Recognize x+ofs. */
1359 if (!(ir
->o
== IR_ADD
&& mayfuse(as
, ref
) && ra_noreg(ir
->r
)))
1362 as
->mrm
.scale
= XM_SCALE1
;
1366 if (!(irx
->o
== IR_BSHL
|| irx
->o
== IR_ADD
)) { /* Try other operand. */
1371 if (mayfuse(as
, idx
) && ra_noreg(irx
->r
)) {
1372 if (irx
->o
== IR_BSHL
&& irref_isk(irx
->op2
) && IR(irx
->op2
)->i
<= 3) {
1373 /* Recognize idx<<b with b = 0-3, corresponding to sz = (1),2,4,8. */
1375 as
->mrm
.scale
= (uint8_t)(IR(irx
->op2
)->i
<< 6);
1376 } else if (irx
->o
== IR_ADD
&& irx
->op1
== irx
->op2
) {
1377 /* FOLD does idx*2 ==> idx<<1 ==> idx+idx. */
1379 as
->mrm
.scale
= XM_SCALE2
;
1382 r
= ra_alloc1(as
, idx
, allow
);
1383 rset_clear(allow
, r
);
1384 as
->mrm
.idx
= (uint8_t)r
;
1387 as
->mrm
.base
= (uint8_t)ra_alloc1(as
, ref
, allow
);
1391 /* Fuse load into memory operand. */
1392 static Reg
asm_fuseload(ASMState
*as
, IRRef ref
, RegSet allow
)
1394 IRIns
*ir
= IR(ref
);
1395 if (ra_hasreg(ir
->r
)) {
1396 if (allow
!= RSET_EMPTY
) { /* Fast path. */
1397 ra_noweak(as
, ir
->r
);
1401 /* Force a spill if only memory operands are allowed (asm_x87load). */
1402 as
->mrm
.base
= RID_ESP
;
1403 as
->mrm
.ofs
= ra_spill(as
, ir
);
1404 as
->mrm
.idx
= RID_NONE
;
1407 if (ir
->o
== IR_KNUM
) {
1408 RegSet avail
= as
->freeset
& ~as
->modset
& RSET_FPR
;
1409 lua_assert(allow
!= RSET_EMPTY
);
1410 if (!(avail
& (avail
-1))) { /* Fuse if less than two regs available. */
1411 as
->mrm
.ofs
= ptr2addr(ir_knum(ir
));
1412 as
->mrm
.base
= as
->mrm
.idx
= RID_NONE
;
1415 } else if (mayfuse(as
, ref
)) {
1416 RegSet xallow
= (allow
& RSET_GPR
) ? allow
: RSET_GPR
;
1417 if (ir
->o
== IR_SLOAD
) {
1418 if (!(ir
->op2
& (IRSLOAD_PARENT
|IRSLOAD_CONVERT
)) &&
1419 noconflict(as
, ref
, IR_RETF
, 0)) {
1420 as
->mrm
.base
= (uint8_t)ra_alloc1(as
, REF_BASE
, xallow
);
1421 as
->mrm
.ofs
= 8*((int32_t)ir
->op1
-1) + ((ir
->op2
&IRSLOAD_FRAME
)?4:0);
1422 as
->mrm
.idx
= RID_NONE
;
1425 } else if (ir
->o
== IR_FLOAD
) {
1426 /* Generic fusion is only ok for 32 bit operand (but see asm_comp). */
1427 if ((irt_isint(ir
->t
) || irt_isaddr(ir
->t
)) &&
1428 noconflict(as
, ref
, IR_FSTORE
, 0)) {
1429 asm_fusefref(as
, ir
, xallow
);
1432 } else if (ir
->o
== IR_ALOAD
|| ir
->o
== IR_HLOAD
|| ir
->o
== IR_ULOAD
) {
1433 if (noconflict(as
, ref
, ir
->o
+ IRDELTA_L2S
, 0)) {
1434 asm_fuseahuref(as
, ir
->op1
, xallow
);
1437 } else if (ir
->o
== IR_XLOAD
) {
1438 /* Generic fusion is not ok for 8/16 bit operands (but see asm_comp).
1439 ** Fusing unaligned memory operands is ok on x86 (except for SIMD types).
1441 if ((!irt_typerange(ir
->t
, IRT_I8
, IRT_U16
)) &&
1442 noconflict(as
, ref
, IR_XSTORE
, 0)) {
1443 asm_fusexref(as
, ir
->op1
, xallow
);
1446 } else if (ir
->o
== IR_VLOAD
) {
1447 asm_fuseahuref(as
, ir
->op1
, xallow
);
1451 if (!(as
->freeset
& allow
) &&
1452 (allow
== RSET_EMPTY
|| ra_hasspill(ir
->s
) || iscrossref(as
, ref
)))
1454 return ra_allocref(as
, ref
, allow
);
1457 /* -- Calls --------------------------------------------------------------- */
1459 /* Generate a call to a C function. */
1460 static void asm_gencall(ASMState
*as
, const CCallInfo
*ci
, IRRef
*args
)
1462 uint32_t n
, nargs
= CCI_NARGS(ci
);
1463 int32_t ofs
= STACKARG_OFS
;
1464 uint32_t gprs
= REGARG_GPRS
;
1466 Reg fpr
= REGARG_FIRSTFPR
;
1468 lua_assert(!(nargs
> 2 && (ci
->flags
&CCI_FASTCALL
))); /* Avoid stack adj. */
1469 if ((void *)ci
->func
)
1470 emit_call(as
, ci
->func
);
1471 for (n
= 0; n
< nargs
; n
++) { /* Setup args. */
1472 IRRef ref
= args
[n
];
1473 IRIns
*ir
= IR(ref
);
1475 #if LJ_64 && LJ_ABI_WIN
1476 /* Windows/x64 argument registers are strictly positional. */
1477 r
= irt_isfp(ir
->t
) ? (fpr
<= REGARG_LASTFPR
? fpr
: 0) : (gprs
& 31);
1480 /* POSIX/x64 argument registers are used in order of appearance. */
1481 if (irt_isfp(ir
->t
)) {
1482 r
= fpr
<= REGARG_LASTFPR
? fpr
: 0; fpr
++;
1484 r
= gprs
& 31; gprs
>>= 5;
1487 if (irt_isfp(ir
->t
) || !(ci
->flags
& CCI_FASTCALL
)) {
1490 r
= gprs
& 31; gprs
>>= 5;
1493 if (r
) { /* Argument is in a register. */
1494 if (r
< RID_MAX_GPR
&& ref
< ASMREF_TMP1
) {
1496 if (ir
->o
== IR_KINT64
)
1497 emit_loadu64(as
, r
, ir_kint64(ir
)->u64
);
1500 emit_loadi(as
, r
, ir
->i
);
1502 lua_assert(rset_test(as
->freeset
, r
)); /* Must have been evicted. */
1503 if (ra_hasreg(ir
->r
)) {
1504 ra_noweak(as
, ir
->r
);
1505 ra_movrr(as
, ir
, r
, ir
->r
);
1507 ra_allocref(as
, ref
, RID2RSET(r
));
1510 } else if (irt_isfp(ir
->t
)) { /* FP argument is on stack. */
1511 lua_assert(!(irt_isfloat(ir
->t
) && irref_isk(ref
))); /* No float k. */
1512 if (LJ_32
&& (ofs
& 4) && irref_isk(ref
)) {
1513 /* Split stores for unaligned FP consts. */
1514 emit_movmroi(as
, RID_ESP
, ofs
, (int32_t)ir_knum(ir
)->u32
.lo
);
1515 emit_movmroi(as
, RID_ESP
, ofs
+4, (int32_t)ir_knum(ir
)->u32
.hi
);
1517 r
= ra_alloc1(as
, ref
, RSET_FPR
);
1518 emit_rmro(as
, irt_isnum(ir
->t
) ? XO_MOVSDto
: XO_MOVSSto
,
1521 ofs
+= (LJ_32
&& irt_isfloat(ir
->t
)) ? 4 : 8;
1522 } else { /* Non-FP argument is on stack. */
1523 if (LJ_32
&& ref
< ASMREF_TMP1
) {
1524 emit_movmroi(as
, RID_ESP
, ofs
, ir
->i
);
1526 r
= ra_alloc1(as
, ref
, RSET_GPR
);
1527 emit_movtomro(as
, REX_64IR(ir
, r
), RID_ESP
, ofs
);
1529 ofs
+= sizeof(intptr_t);
1534 /* Setup result reg/sp for call. Evict scratch regs. */
1535 static void asm_setupresult(ASMState
*as
, IRIns
*ir
, const CCallInfo
*ci
)
1537 RegSet drop
= RSET_SCRATCH
;
1538 if ((ci
->flags
& CCI_NOFPRCLOBBER
))
1540 if (ra_hasreg(ir
->r
))
1541 rset_clear(drop
, ir
->r
); /* Dest reg handled below. */
1542 ra_evictset(as
, drop
); /* Evictions must be performed first. */
1544 if (irt_isfp(ir
->t
)) {
1545 int32_t ofs
= sps_scale(ir
->s
); /* Use spill slot or temp slots. */
1547 if ((ci
->flags
& CCI_CASTU64
)) {
1549 if (ra_hasreg(dest
)) {
1551 ra_modified(as
, dest
);
1552 emit_rr(as
, XO_MOVD
, dest
|REX_64
, RID_RET
); /* Really MOVQ. */
1554 emit_movtomro(as
, RID_RET
|REX_64
, RID_ESP
, ofs
);
1557 ra_destreg(as
, ir
, RID_FPRET
);
1560 /* Number result is in x87 st0 for x86 calling convention. */
1562 if (ra_hasreg(dest
)) {
1564 ra_modified(as
, dest
);
1565 emit_rmro(as
, irt_isnum(ir
->t
) ? XMM_MOVRM(as
) : XO_MOVSS
,
1566 dest
, RID_ESP
, ofs
);
1568 if ((ci
->flags
& CCI_CASTU64
)) {
1569 emit_movtomro(as
, RID_RET
, RID_ESP
, ofs
);
1570 emit_movtomro(as
, RID_RETHI
, RID_ESP
, ofs
+4);
1572 emit_rmro(as
, irt_isnum(ir
->t
) ? XO_FSTPq
: XO_FSTPd
,
1573 irt_isnum(ir
->t
) ? XOg_FSTPq
: XOg_FSTPd
, RID_ESP
, ofs
);
1577 lua_assert(!irt_ispri(ir
->t
));
1578 ra_destreg(as
, ir
, RID_RET
);
1583 /* Collect arguments from CALL* and CARG instructions. */
1584 static void asm_collectargs(ASMState
*as
, IRIns
*ir
,
1585 const CCallInfo
*ci
, IRRef
*args
)
1587 uint32_t n
= CCI_NARGS(ci
);
1588 lua_assert(n
<= CCI_NARGS_MAX
);
1589 if ((ci
->flags
& CCI_L
)) { *args
++ = ASMREF_L
; n
--; }
1592 lua_assert(ir
->o
== IR_CARG
);
1596 lua_assert(IR(ir
->op1
)->o
!= IR_CARG
);
1599 static void asm_call(ASMState
*as
, IRIns
*ir
)
1601 IRRef args
[CCI_NARGS_MAX
];
1602 const CCallInfo
*ci
= &lj_ir_callinfo
[ir
->op2
];
1603 asm_collectargs(as
, ir
, ci
, args
);
1604 asm_setupresult(as
, ir
, ci
);
1605 asm_gencall(as
, ci
, args
);
1608 /* Reconstruct CCallInfo flags for CALLX*. */
1609 static uint32_t asm_callx_flags(ASMState
*as
, IRIns
*ir
)
1612 if (ir
->op1
!= REF_NIL
) { /* Count number of arguments first. */
1613 IRIns
*ira
= IR(ir
->op1
);
1615 while (ira
->o
== IR_CARG
) { nargs
++; ira
= IR(ira
->op1
); }
1617 /* NYI: fastcall etc. */
1618 return (nargs
| (ir
->t
.irt
<< CCI_OTSHIFT
));
1621 static void asm_callx(ASMState
*as
, IRIns
*ir
)
1623 IRRef args
[CCI_NARGS_MAX
];
1626 ci
.flags
= asm_callx_flags(as
, ir
);
1627 asm_collectargs(as
, ir
, &ci
, args
);
1628 asm_setupresult(as
, ir
, &ci
);
1630 if (LJ_32
&& irref_isk(ir
->op2
)) { /* Call to constant address on x86. */
1631 ci
.func
= (ASMFunction
)(void *)(uintptr_t)(uint32_t)irf
->i
;
1633 /* Prefer a non-argument register or RID_RET for indirect calls. */
1634 RegSet allow
= (RSET_GPR
& ~RSET_SCRATCH
)|RID2RSET(RID_RET
);
1635 Reg r
= ra_alloc1(as
, ir
->op2
, allow
);
1636 emit_rr(as
, XO_GROUP5
, XOg_CALL
, r
);
1637 ci
.func
= (ASMFunction
)(void *)0;
1639 asm_gencall(as
, &ci
, args
);
1642 /* -- Returns ------------------------------------------------------------- */
1644 /* Return to lower frame. Guard that it goes to the right spot. */
1645 static void asm_retf(ASMState
*as
, IRIns
*ir
)
1647 Reg base
= ra_alloc1(as
, REF_BASE
, RSET_GPR
);
1648 void *pc
= ir_kptr(IR(ir
->op2
));
1649 int32_t delta
= 1+bc_a(*((const BCIns
*)pc
- 1));
1650 as
->topslot
-= (BCReg
)delta
;
1651 if ((int32_t)as
->topslot
< 0) as
->topslot
= 0;
1652 emit_setgl(as
, base
, jit_base
);
1653 emit_addptr(as
, base
, -8*delta
);
1654 asm_guardcc(as
, CC_NE
);
1655 emit_gmroi(as
, XG_ARITHi(XOg_CMP
), base
, -4, ptr2addr(pc
));
1658 /* -- Type conversions ---------------------------------------------------- */
1660 static void asm_tointg(ASMState
*as
, IRIns
*ir
, Reg left
)
1662 Reg tmp
= ra_scratch(as
, rset_exclude(RSET_FPR
, left
));
1663 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
1664 asm_guardcc(as
, CC_P
);
1665 asm_guardcc(as
, CC_NE
);
1666 emit_rr(as
, XO_UCOMISD
, left
, tmp
);
1667 emit_rr(as
, XO_CVTSI2SD
, tmp
, dest
);
1668 if (!(as
->flags
& JIT_F_SPLIT_XMM
))
1669 emit_rr(as
, XO_XORPS
, tmp
, tmp
); /* Avoid partial register stall. */
1670 emit_rr(as
, XO_CVTTSD2SI
, dest
, left
);
1671 /* Can't fuse since left is needed twice. */
1674 static void asm_tobit(ASMState
*as
, IRIns
*ir
)
1676 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
1677 Reg tmp
= ra_noreg(IR(ir
->op1
)->r
) ?
1678 ra_alloc1(as
, ir
->op1
, RSET_FPR
) :
1679 ra_scratch(as
, RSET_FPR
);
1680 Reg right
= asm_fuseload(as
, ir
->op2
, rset_exclude(RSET_FPR
, tmp
));
1681 emit_rr(as
, XO_MOVDto
, tmp
, dest
);
1682 emit_mrm(as
, XO_ADDSD
, tmp
, right
);
1683 ra_left(as
, tmp
, ir
->op1
);
1686 static void asm_conv(ASMState
*as
, IRIns
*ir
)
1688 IRType st
= (IRType
)(ir
->op2
& IRCONV_SRCMASK
);
1689 int st64
= (st
== IRT_I64
|| st
== IRT_U64
|| (LJ_64
&& st
== IRT_P64
));
1690 int stfp
= (st
== IRT_NUM
|| st
== IRT_FLOAT
);
1691 IRRef lref
= ir
->op1
;
1692 lua_assert(irt_type(ir
->t
) != st
);
1693 lua_assert(!(LJ_32
&& (irt_isint64(ir
->t
) || st64
))); /* Handled by SPLIT. */
1694 if (irt_isfp(ir
->t
)) {
1695 Reg dest
= ra_dest(as
, ir
, RSET_FPR
);
1696 if (stfp
) { /* FP to FP conversion. */
1697 Reg left
= asm_fuseload(as
, lref
, RSET_FPR
);
1698 emit_mrm(as
, st
== IRT_NUM
? XO_CVTSD2SS
: XO_CVTSS2SD
, dest
, left
);
1699 if (left
== dest
) return; /* Avoid the XO_XORPS. */
1700 } else if (LJ_32
&& st
== IRT_U32
) { /* U32 to FP conversion on x86. */
1701 /* number = (2^52+2^51 .. u32) - (2^52+2^51) */
1702 cTValue
*k
= lj_ir_k64_find(as
->J
, U64x(43380000,00000000));
1703 Reg bias
= ra_scratch(as
, rset_exclude(RSET_FPR
, dest
));
1704 if (irt_isfloat(ir
->t
))
1705 emit_rr(as
, XO_CVTSD2SS
, dest
, dest
);
1706 emit_rr(as
, XO_SUBSD
, dest
, bias
); /* Subtract 2^52+2^51 bias. */
1707 emit_rr(as
, XO_XORPS
, dest
, bias
); /* Merge bias and integer. */
1708 emit_loadn(as
, bias
, k
);
1709 emit_mrm(as
, XO_MOVD
, dest
, asm_fuseload(as
, lref
, RSET_GPR
));
1711 } else { /* Integer to FP conversion. */
1712 Reg left
= (LJ_64
&& (st
== IRT_U32
|| st
== IRT_U64
)) ?
1713 ra_alloc1(as
, lref
, RSET_GPR
) :
1714 asm_fuseload(as
, lref
, RSET_GPR
);
1715 if (LJ_64
&& st
== IRT_U64
) {
1716 MCLabel l_end
= emit_label(as
);
1717 const void *k
= lj_ir_k64_find(as
->J
, U64x(43f00000
,00000000));
1718 emit_rma(as
, XO_ADDSD
, dest
, k
); /* Add 2^64 to compensate. */
1719 emit_sjcc(as
, CC_NS
, l_end
);
1720 emit_rr(as
, XO_TEST
, left
|REX_64
, left
); /* Check if u64 >= 2^63. */
1722 emit_mrm(as
, irt_isnum(ir
->t
) ? XO_CVTSI2SD
: XO_CVTSI2SS
,
1723 dest
|((LJ_64
&& (st64
|| st
== IRT_U32
)) ? REX_64
: 0), left
);
1725 if (!(as
->flags
& JIT_F_SPLIT_XMM
))
1726 emit_rr(as
, XO_XORPS
, dest
, dest
); /* Avoid partial register stall. */
1727 } else if (stfp
) { /* FP to integer conversion. */
1728 if (irt_isguard(ir
->t
)) {
1729 /* Checked conversions are only supported from number to int. */
1730 lua_assert(irt_isint(ir
->t
) && st
== IRT_NUM
);
1731 asm_tointg(as
, ir
, ra_alloc1(as
, lref
, RSET_FPR
));
1733 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
1734 x86Op op
= st
== IRT_NUM
?
1735 ((ir
->op2
& IRCONV_TRUNC
) ? XO_CVTTSD2SI
: XO_CVTSD2SI
) :
1736 ((ir
->op2
& IRCONV_TRUNC
) ? XO_CVTTSS2SI
: XO_CVTSS2SI
);
1737 if (LJ_32
&& irt_isu32(ir
->t
)) { /* FP to U32 conversion on x86. */
1738 /* u32 = (int32_t)(number - 2^31) + 2^31 */
1739 Reg tmp
= ra_noreg(IR(lref
)->r
) ? ra_alloc1(as
, lref
, RSET_FPR
) :
1740 ra_scratch(as
, RSET_FPR
);
1741 emit_gri(as
, XG_ARITHi(XOg_ADD
), dest
, (int32_t)0x80000000);
1742 emit_rr(as
, op
, dest
, tmp
);
1744 emit_rma(as
, XO_ADDSD
, tmp
,
1745 lj_ir_k64_find(as
->J
, U64x(c1e00000
,00000000)));
1747 emit_rma(as
, XO_ADDSS
, tmp
,
1748 lj_ir_k64_find(as
->J
, U64x(00000000,cf000000
)));
1749 ra_left(as
, tmp
, lref
);
1750 } else if (LJ_64
&& irt_isu64(ir
->t
)) {
1751 /* For inputs in [2^63,2^64-1] add -2^64 and convert again. */
1752 Reg tmp
= ra_noreg(IR(lref
)->r
) ? ra_alloc1(as
, lref
, RSET_FPR
) :
1753 ra_scratch(as
, RSET_FPR
);
1754 MCLabel l_end
= emit_label(as
);
1755 emit_rr(as
, op
, dest
|REX_64
, tmp
);
1757 emit_rma(as
, XO_ADDSD
, tmp
,
1758 lj_ir_k64_find(as
->J
, U64x(c3f00000
,00000000)));
1760 emit_rma(as
, XO_ADDSS
, tmp
,
1761 lj_ir_k64_find(as
->J
, U64x(00000000,df800000
)));
1762 emit_sjcc(as
, CC_NS
, l_end
);
1763 emit_rr(as
, XO_TEST
, dest
|REX_64
, dest
); /* Check if dest < 2^63. */
1764 emit_rr(as
, op
, dest
|REX_64
, tmp
);
1765 ra_left(as
, tmp
, lref
);
1767 Reg left
= asm_fuseload(as
, lref
, RSET_FPR
);
1768 if (LJ_64
&& irt_isu32(ir
->t
))
1769 emit_rr(as
, XO_MOV
, dest
, dest
); /* Zero hiword. */
1772 (irt_is64(ir
->t
) || irt_isu32(ir
->t
))) ? REX_64
: 0),
1776 } else if (st
>= IRT_I8
&& st
<= IRT_U16
) { /* Extend to 32 bit integer. */
1777 Reg left
, dest
= ra_dest(as
, ir
, RSET_GPR
);
1778 RegSet allow
= RSET_GPR
;
1780 lua_assert(irt_isint(ir
->t
) || irt_isu32(ir
->t
));
1782 op
= XO_MOVSXb
; allow
= RSET_GPR8
; dest
|= FORCE_REX
;
1783 } else if (st
== IRT_U8
) {
1784 op
= XO_MOVZXb
; allow
= RSET_GPR8
; dest
|= FORCE_REX
;
1785 } else if (st
== IRT_I16
) {
1790 left
= asm_fuseload(as
, lref
, allow
);
1791 /* Add extra MOV if source is already in wrong register. */
1792 if (!LJ_64
&& left
!= RID_MRM
&& !rset_test(allow
, left
)) {
1793 Reg tmp
= ra_scratch(as
, allow
);
1794 emit_rr(as
, op
, dest
, tmp
);
1795 emit_rr(as
, XO_MOV
, tmp
, left
);
1797 emit_mrm(as
, op
, dest
, left
);
1799 } else { /* 32/64 bit integer conversions. */
1800 if (LJ_32
) { /* Only need to handle 32/32 bit no-op (cast) on x86. */
1801 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
1802 ra_left(as
, dest
, lref
); /* Do nothing, but may need to move regs. */
1803 } else if (irt_is64(ir
->t
)) {
1804 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
1805 if (st64
|| !(ir
->op2
& IRCONV_SEXT
)) {
1806 /* 64/64 bit no-op (cast) or 32 to 64 bit zero extension. */
1807 ra_left(as
, dest
, lref
); /* Do nothing, but may need to move regs. */
1808 } else { /* 32 to 64 bit sign extension. */
1809 Reg left
= asm_fuseload(as
, lref
, RSET_GPR
);
1810 emit_mrm(as
, XO_MOVSXd
, dest
|REX_64
, left
);
1813 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
1815 Reg left
= asm_fuseload(as
, lref
, RSET_GPR
);
1816 /* This is either a 32 bit reg/reg mov which zeroes the hiword
1817 ** or a load of the loword from a 64 bit address.
1819 emit_mrm(as
, XO_MOV
, dest
, left
);
1820 } else { /* 32/32 bit no-op (cast). */
1821 ra_left(as
, dest
, lref
); /* Do nothing, but may need to move regs. */
1827 #if LJ_32 && LJ_HASFFI
1828 /* No SSE conversions to/from 64 bit on x86, so resort to ugly x87 code. */
1830 /* 64 bit integer to FP conversion in 32 bit mode. */
1831 static void asm_conv_fp_int64(ASMState
*as
, IRIns
*ir
)
1833 Reg hi
= ra_alloc1(as
, ir
->op1
, RSET_GPR
);
1834 Reg lo
= ra_alloc1(as
, (ir
-1)->op1
, rset_exclude(RSET_GPR
, hi
));
1835 int32_t ofs
= sps_scale(ir
->s
); /* Use spill slot or temp slots. */
1837 if (ra_hasreg(dest
)) {
1839 ra_modified(as
, dest
);
1840 emit_rmro(as
, irt_isnum(ir
->t
) ? XMM_MOVRM(as
) : XO_MOVSS
,
1841 dest
, RID_ESP
, ofs
);
1843 emit_rmro(as
, irt_isnum(ir
->t
) ? XO_FSTPq
: XO_FSTPd
,
1844 irt_isnum(ir
->t
) ? XOg_FSTPq
: XOg_FSTPd
, RID_ESP
, ofs
);
1845 if (((ir
-1)->op2
& IRCONV_SRCMASK
) == IRT_U64
) {
1846 /* For inputs in [2^63,2^64-1] add 2^64 to compensate. */
1847 MCLabel l_end
= emit_label(as
);
1848 emit_rma(as
, XO_FADDq
, XOg_FADDq
,
1849 lj_ir_k64_find(as
->J
, U64x(43f00000
,00000000)));
1850 emit_sjcc(as
, CC_NS
, l_end
);
1851 emit_rr(as
, XO_TEST
, hi
, hi
); /* Check if u64 >= 2^63. */
1853 lua_assert(((ir
-1)->op2
& IRCONV_SRCMASK
) == IRT_I64
);
1855 emit_rmro(as
, XO_FILDq
, XOg_FILDq
, RID_ESP
, 0);
1856 /* NYI: Avoid narrow-to-wide store-to-load forwarding stall. */
1857 emit_rmro(as
, XO_MOVto
, hi
, RID_ESP
, 4);
1858 emit_rmro(as
, XO_MOVto
, lo
, RID_ESP
, 0);
1861 /* FP to 64 bit integer conversion in 32 bit mode. */
1862 static void asm_conv_int64_fp(ASMState
*as
, IRIns
*ir
)
1864 IRType st
= (IRType
)((ir
-1)->op2
& IRCONV_SRCMASK
);
1865 IRType dt
= (((ir
-1)->op2
& IRCONV_DSTMASK
) >> IRCONV_DSH
);
1867 lua_assert(st
== IRT_NUM
|| st
== IRT_FLOAT
);
1868 lua_assert(dt
== IRT_I64
|| dt
== IRT_U64
);
1869 lua_assert(((ir
-1)->op2
& IRCONV_TRUNC
));
1870 hi
= ra_dest(as
, ir
, RSET_GPR
);
1871 lo
= ra_dest(as
, ir
-1, rset_exclude(RSET_GPR
, hi
));
1872 if (ra_used(ir
-1)) emit_rmro(as
, XO_MOV
, lo
, RID_ESP
, 0);
1873 /* NYI: Avoid wide-to-narrow store-to-load forwarding stall. */
1874 if (!(as
->flags
& JIT_F_SSE3
)) { /* Set FPU rounding mode to default. */
1875 emit_rmro(as
, XO_FLDCW
, XOg_FLDCW
, RID_ESP
, 4);
1876 emit_rmro(as
, XO_MOVto
, lo
, RID_ESP
, 4);
1877 emit_gri(as
, XG_ARITHi(XOg_AND
), lo
, 0xf3ff);
1879 if (dt
== IRT_U64
) {
1880 /* For inputs in [2^63,2^64-1] add -2^64 and convert again. */
1881 MCLabel l_pop
, l_end
= emit_label(as
);
1882 emit_x87op(as
, XI_FPOP
);
1883 l_pop
= emit_label(as
);
1884 emit_sjmp(as
, l_end
);
1885 emit_rmro(as
, XO_MOV
, hi
, RID_ESP
, 4);
1886 if ((as
->flags
& JIT_F_SSE3
))
1887 emit_rmro(as
, XO_FISTTPq
, XOg_FISTTPq
, RID_ESP
, 0);
1889 emit_rmro(as
, XO_FISTPq
, XOg_FISTPq
, RID_ESP
, 0);
1890 emit_rma(as
, XO_FADDq
, XOg_FADDq
,
1891 lj_ir_k64_find(as
->J
, U64x(c3f00000
,00000000)));
1892 emit_sjcc(as
, CC_NS
, l_pop
);
1893 emit_rr(as
, XO_TEST
, hi
, hi
); /* Check if out-of-range (2^63). */
1895 emit_rmro(as
, XO_MOV
, hi
, RID_ESP
, 4);
1896 if ((as
->flags
& JIT_F_SSE3
)) { /* Truncation is easy with SSE3. */
1897 emit_rmro(as
, XO_FISTTPq
, XOg_FISTTPq
, RID_ESP
, 0);
1898 } else { /* Otherwise set FPU rounding mode to truncate before the store. */
1899 emit_rmro(as
, XO_FISTPq
, XOg_FISTPq
, RID_ESP
, 0);
1900 emit_rmro(as
, XO_FLDCW
, XOg_FLDCW
, RID_ESP
, 0);
1901 emit_rmro(as
, XO_MOVtow
, lo
, RID_ESP
, 0);
1902 emit_rmro(as
, XO_ARITHw(XOg_OR
), lo
, RID_ESP
, 0);
1903 emit_loadi(as
, lo
, 0xc00);
1904 emit_rmro(as
, XO_FNSTCW
, XOg_FNSTCW
, RID_ESP
, 0);
1907 emit_x87op(as
, XI_FDUP
);
1908 emit_mrm(as
, st
== IRT_NUM
? XO_FLDq
: XO_FLDd
,
1909 st
== IRT_NUM
? XOg_FLDq
: XOg_FLDd
,
1910 asm_fuseload(as
, ir
->op1
, RSET_EMPTY
));
1914 static void asm_strto(ASMState
*as
, IRIns
*ir
)
1916 /* Force a spill slot for the destination register (if any). */
1917 const CCallInfo
*ci
= &lj_ir_callinfo
[IRCALL_lj_str_tonum
];
1919 RegSet drop
= RSET_SCRATCH
;
1920 if ((drop
& RSET_FPR
) != RSET_FPR
&& ra_hasreg(ir
->r
))
1921 rset_set(drop
, ir
->r
); /* WIN64 doesn't spill all FPRs. */
1922 ra_evictset(as
, drop
);
1923 asm_guardcc(as
, CC_E
);
1924 emit_rr(as
, XO_TEST
, RID_RET
, RID_RET
); /* Test return status. */
1925 args
[0] = ir
->op1
; /* GCstr *str */
1926 args
[1] = ASMREF_TMP1
; /* TValue *n */
1927 asm_gencall(as
, ci
, args
);
1928 /* Store the result to the spill slot or temp slots. */
1929 emit_rmro(as
, XO_LEA
, ra_releasetmp(as
, ASMREF_TMP1
)|REX_64
,
1930 RID_ESP
, sps_scale(ir
->s
));
1933 static void asm_tostr(ASMState
*as
, IRIns
*ir
)
1935 IRIns
*irl
= IR(ir
->op1
);
1939 if (irt_isnum(irl
->t
)) {
1940 const CCallInfo
*ci
= &lj_ir_callinfo
[IRCALL_lj_str_fromnum
];
1941 args
[1] = ASMREF_TMP1
; /* const lua_Number * */
1942 asm_setupresult(as
, ir
, ci
); /* GCstr * */
1943 asm_gencall(as
, ci
, args
);
1944 emit_rmro(as
, XO_LEA
, ra_releasetmp(as
, ASMREF_TMP1
)|REX_64
,
1945 RID_ESP
, ra_spill(as
, irl
));
1947 const CCallInfo
*ci
= &lj_ir_callinfo
[IRCALL_lj_str_fromint
];
1948 args
[1] = ir
->op1
; /* int32_t k */
1949 asm_setupresult(as
, ir
, ci
); /* GCstr * */
1950 asm_gencall(as
, ci
, args
);
1954 /* -- Memory references --------------------------------------------------- */
1956 static void asm_aref(ASMState
*as
, IRIns
*ir
)
1958 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
1959 asm_fusearef(as
, ir
, RSET_GPR
);
1960 if (!(as
->mrm
.idx
== RID_NONE
&& as
->mrm
.ofs
== 0))
1961 emit_mrm(as
, XO_LEA
, dest
, RID_MRM
);
1962 else if (as
->mrm
.base
!= dest
)
1963 emit_rr(as
, XO_MOV
, dest
, as
->mrm
.base
);
1966 /* Must match with hash*() in lj_tab.c. */
1967 static uint32_t ir_khash(IRIns
*ir
)
1970 if (irt_isstr(ir
->t
)) {
1971 return ir_kstr(ir
)->hash
;
1972 } else if (irt_isnum(ir
->t
)) {
1973 lo
= ir_knum(ir
)->u32
.lo
;
1974 hi
= ir_knum(ir
)->u32
.hi
<< 1;
1975 } else if (irt_ispri(ir
->t
)) {
1976 lua_assert(!irt_isnil(ir
->t
));
1977 return irt_type(ir
->t
)-IRT_FALSE
;
1979 lua_assert(irt_isgcv(ir
->t
));
1980 lo
= u32ptr(ir_kgc(ir
));
1981 hi
= lo
+ HASH_BIAS
;
1983 return hashrot(lo
, hi
);
1986 /* Merge NE(HREF, niltv) check. */
1987 static MCode
*merge_href_niltv(ASMState
*as
, IRIns
*ir
)
1989 /* Assumes nothing else generates NE of HREF. */
1990 if ((ir
[1].o
== IR_NE
|| ir
[1].o
== IR_EQ
) && ir
[1].op1
== as
->curins
&&
1993 p
+= (LJ_64
&& *p
!= XI_ARITHi
) ? 7+6 : 6+6;
1994 /* Ensure no loop branch inversion happened. */
1995 if (p
[-6] == 0x0f && p
[-5] == XI_JCCn
+(CC_NE
^(ir
[1].o
& 1))) {
1996 as
->mcp
= p
; /* Kill cmp reg, imm32 + jz exit. */
1997 return p
+ *(int32_t *)(p
-4); /* Return exit address. */
2003 /* Inlined hash lookup. Specialized for key type and for const keys.
2004 ** The equivalent C code is:
2005 ** Node *n = hashkey(t, key);
2007 ** if (lj_obj_equal(&n->key, key)) return &n->val;
2008 ** } while ((n = nextnode(n)));
2011 static void asm_href(ASMState
*as
, IRIns
*ir
)
2013 MCode
*nilexit
= merge_href_niltv(as
, ir
); /* Do this before any restores. */
2014 RegSet allow
= RSET_GPR
;
2015 Reg dest
= ra_dest(as
, ir
, allow
);
2016 Reg tab
= ra_alloc1(as
, ir
->op1
, rset_clear(allow
, dest
));
2017 Reg key
= RID_NONE
, tmp
= RID_NONE
;
2018 IRIns
*irkey
= IR(ir
->op2
);
2019 int isk
= irref_isk(ir
->op2
);
2020 IRType1 kt
= irkey
->t
;
2022 MCLabel l_end
, l_loop
, l_next
;
2025 rset_clear(allow
, tab
);
2026 key
= ra_alloc1(as
, ir
->op2
, irt_isnum(kt
) ? RSET_FPR
: allow
);
2028 tmp
= ra_scratch(as
, rset_exclude(allow
, key
));
2031 /* Key not found in chain: jump to exit (if merged with NE) or load niltv. */
2032 l_end
= emit_label(as
);
2033 if (nilexit
&& ir
[1].o
== IR_NE
) {
2034 emit_jcc(as
, CC_E
, nilexit
); /* XI_JMP is not found by lj_asm_patchexit. */
2037 emit_loada(as
, dest
, niltvg(J2G(as
->J
)));
2040 /* Follow hash chain until the end. */
2041 l_loop
= emit_sjcc_label(as
, CC_NZ
);
2042 emit_rr(as
, XO_TEST
, dest
, dest
);
2043 emit_rmro(as
, XO_MOV
, dest
, dest
, offsetof(Node
, next
));
2044 l_next
= emit_label(as
);
2046 /* Type and value comparison. */
2048 emit_jcc(as
, CC_E
, nilexit
);
2050 emit_sjcc(as
, CC_E
, l_end
);
2051 if (irt_isnum(kt
)) {
2053 /* Assumes -0.0 is already canonicalized to +0.0. */
2054 emit_gmroi(as
, XG_ARITHi(XOg_CMP
), dest
, offsetof(Node
, key
.u32
.lo
),
2055 (int32_t)ir_knum(irkey
)->u32
.lo
);
2056 emit_sjcc(as
, CC_NE
, l_next
);
2057 emit_gmroi(as
, XG_ARITHi(XOg_CMP
), dest
, offsetof(Node
, key
.u32
.hi
),
2058 (int32_t)ir_knum(irkey
)->u32
.hi
);
2060 emit_sjcc(as
, CC_P
, l_next
);
2061 emit_rmro(as
, XO_UCOMISD
, key
, dest
, offsetof(Node
, key
.n
));
2062 emit_sjcc(as
, CC_A
, l_next
);
2063 /* The type check avoids NaN penalties and complaints from Valgrind. */
2065 emit_u32(as
, LJ_TISNUM
);
2066 emit_rmro(as
, XO_ARITHi
, XOg_CMP
, dest
, offsetof(Node
, key
.it
));
2068 emit_i8(as
, LJ_TISNUM
);
2069 emit_rmro(as
, XO_ARITHi8
, XOg_CMP
, dest
, offsetof(Node
, key
.it
));
2073 } else if (irt_islightud(kt
)) {
2074 emit_rmro(as
, XO_CMP
, key
|REX_64
, dest
, offsetof(Node
, key
.u64
));
2077 if (!irt_ispri(kt
)) {
2078 lua_assert(irt_isaddr(kt
));
2080 emit_gmroi(as
, XG_ARITHi(XOg_CMP
), dest
, offsetof(Node
, key
.gcr
),
2081 ptr2addr(ir_kgc(irkey
)));
2083 emit_rmro(as
, XO_CMP
, key
, dest
, offsetof(Node
, key
.gcr
));
2084 emit_sjcc(as
, CC_NE
, l_next
);
2086 lua_assert(!irt_isnil(kt
));
2087 emit_i8(as
, irt_toitype(kt
));
2088 emit_rmro(as
, XO_ARITHi8
, XOg_CMP
, dest
, offsetof(Node
, key
.it
));
2090 emit_sfixup(as
, l_loop
);
2093 /* Load main position relative to tab->node into dest. */
2094 khash
= isk
? ir_khash(irkey
) : 1;
2096 emit_rmro(as
, XO_MOV
, dest
, tab
, offsetof(GCtab
, node
));
2098 emit_rmro(as
, XO_ARITH(XOg_ADD
), dest
, tab
, offsetof(GCtab
, node
));
2099 if ((as
->flags
& JIT_F_PREFER_IMUL
)) {
2100 emit_i8(as
, sizeof(Node
));
2101 emit_rr(as
, XO_IMULi8
, dest
, dest
);
2103 emit_shifti(as
, XOg_SHL
, dest
, 3);
2104 emit_rmrxo(as
, XO_LEA
, dest
, dest
, dest
, XM_SCALE2
, 0);
2107 emit_gri(as
, XG_ARITHi(XOg_AND
), dest
, (int32_t)khash
);
2108 emit_rmro(as
, XO_MOV
, dest
, tab
, offsetof(GCtab
, hmask
));
2109 } else if (irt_isstr(kt
)) {
2110 emit_rmro(as
, XO_ARITH(XOg_AND
), dest
, key
, offsetof(GCstr
, hash
));
2111 emit_rmro(as
, XO_MOV
, dest
, tab
, offsetof(GCtab
, hmask
));
2112 } else { /* Must match with hashrot() in lj_tab.c. */
2113 emit_rmro(as
, XO_ARITH(XOg_AND
), dest
, tab
, offsetof(GCtab
, hmask
));
2114 emit_rr(as
, XO_ARITH(XOg_SUB
), dest
, tmp
);
2115 emit_shifti(as
, XOg_ROL
, tmp
, HASH_ROT3
);
2116 emit_rr(as
, XO_ARITH(XOg_XOR
), dest
, tmp
);
2117 emit_shifti(as
, XOg_ROL
, dest
, HASH_ROT2
);
2118 emit_rr(as
, XO_ARITH(XOg_SUB
), tmp
, dest
);
2119 emit_shifti(as
, XOg_ROL
, dest
, HASH_ROT1
);
2120 emit_rr(as
, XO_ARITH(XOg_XOR
), tmp
, dest
);
2121 if (irt_isnum(kt
)) {
2122 emit_rr(as
, XO_ARITH(XOg_ADD
), dest
, dest
);
2124 emit_shifti(as
, XOg_SHR
|REX_64
, dest
, 32);
2125 emit_rr(as
, XO_MOV
, tmp
, dest
);
2126 emit_rr(as
, XO_MOVDto
, key
|REX_64
, dest
);
2128 emit_rmro(as
, XO_MOV
, dest
, RID_ESP
, ra_spill(as
, irkey
)+4);
2129 emit_rr(as
, XO_MOVDto
, key
, tmp
);
2132 emit_rr(as
, XO_MOV
, tmp
, key
);
2133 emit_rmro(as
, XO_LEA
, dest
, key
, HASH_BIAS
);
2139 static void asm_hrefk(ASMState
*as
, IRIns
*ir
)
2141 IRIns
*kslot
= IR(ir
->op2
);
2142 IRIns
*irkey
= IR(kslot
->op1
);
2143 int32_t ofs
= (int32_t)(kslot
->op2
* sizeof(Node
));
2144 Reg dest
= ra_used(ir
) ? ra_dest(as
, ir
, RSET_GPR
) : RID_NONE
;
2145 Reg node
= ra_alloc1(as
, ir
->op1
, RSET_GPR
);
2149 lua_assert(ofs
% sizeof(Node
) == 0);
2150 if (ra_hasreg(dest
)) {
2152 if (dest
== node
&& !(as
->flags
& JIT_F_LEA_AGU
))
2153 emit_gri(as
, XG_ARITHi(XOg_ADD
), dest
, ofs
);
2155 emit_rmro(as
, XO_LEA
, dest
, node
, ofs
);
2156 } else if (dest
!= node
) {
2157 emit_rr(as
, XO_MOV
, dest
, node
);
2160 asm_guardcc(as
, CC_NE
);
2162 if (!irt_ispri(irkey
->t
)) {
2163 Reg key
= ra_scratch(as
, rset_exclude(RSET_GPR
, node
));
2164 emit_rmro(as
, XO_CMP
, key
|REX_64
, node
,
2165 ofs
+ (int32_t)offsetof(Node
, key
.u64
));
2166 lua_assert(irt_isnum(irkey
->t
) || irt_isgcv(irkey
->t
));
2167 /* Assumes -0.0 is already canonicalized to +0.0. */
2168 emit_loadu64(as
, key
, irt_isnum(irkey
->t
) ? ir_knum(irkey
)->u64
:
2169 ((uint64_t)irt_toitype(irkey
->t
) << 32) |
2170 (uint64_t)(uint32_t)ptr2addr(ir_kgc(irkey
)));
2172 lua_assert(!irt_isnil(irkey
->t
));
2173 emit_i8(as
, irt_toitype(irkey
->t
));
2174 emit_rmro(as
, XO_ARITHi8
, XOg_CMP
, node
,
2175 ofs
+ (int32_t)offsetof(Node
, key
.it
));
2178 l_exit
= emit_label(as
);
2179 if (irt_isnum(irkey
->t
)) {
2180 /* Assumes -0.0 is already canonicalized to +0.0. */
2181 emit_gmroi(as
, XG_ARITHi(XOg_CMP
), node
,
2182 ofs
+ (int32_t)offsetof(Node
, key
.u32
.lo
),
2183 (int32_t)ir_knum(irkey
)->u32
.lo
);
2184 emit_sjcc(as
, CC_NE
, l_exit
);
2185 emit_gmroi(as
, XG_ARITHi(XOg_CMP
), node
,
2186 ofs
+ (int32_t)offsetof(Node
, key
.u32
.hi
),
2187 (int32_t)ir_knum(irkey
)->u32
.hi
);
2189 if (!irt_ispri(irkey
->t
)) {
2190 lua_assert(irt_isgcv(irkey
->t
));
2191 emit_gmroi(as
, XG_ARITHi(XOg_CMP
), node
,
2192 ofs
+ (int32_t)offsetof(Node
, key
.gcr
),
2193 ptr2addr(ir_kgc(irkey
)));
2194 emit_sjcc(as
, CC_NE
, l_exit
);
2196 lua_assert(!irt_isnil(irkey
->t
));
2197 emit_i8(as
, irt_toitype(irkey
->t
));
2198 emit_rmro(as
, XO_ARITHi8
, XOg_CMP
, node
,
2199 ofs
+ (int32_t)offsetof(Node
, key
.it
));
2204 static void asm_newref(ASMState
*as
, IRIns
*ir
)
2206 const CCallInfo
*ci
= &lj_ir_callinfo
[IRCALL_lj_tab_newkey
];
2210 args
[0] = ASMREF_L
; /* lua_State *L */
2211 args
[1] = ir
->op1
; /* GCtab *t */
2212 args
[2] = ASMREF_TMP1
; /* cTValue *key */
2213 asm_setupresult(as
, ir
, ci
); /* TValue * */
2214 asm_gencall(as
, ci
, args
);
2215 tmp
= ra_releasetmp(as
, ASMREF_TMP1
);
2216 irkey
= IR(ir
->op2
);
2217 if (irt_isnum(irkey
->t
)) {
2218 /* For numbers use the constant itself or a spill slot as a TValue. */
2219 if (irref_isk(ir
->op2
))
2220 emit_loada(as
, tmp
, ir_knum(irkey
));
2222 emit_rmro(as
, XO_LEA
, tmp
|REX_64
, RID_ESP
, ra_spill(as
, irkey
));
2224 /* Otherwise use g->tmptv to hold the TValue. */
2225 if (!irref_isk(ir
->op2
)) {
2226 Reg src
= ra_alloc1(as
, ir
->op2
, rset_exclude(RSET_GPR
, tmp
));
2227 emit_movtomro(as
, REX_64IR(irkey
, src
), tmp
, 0);
2228 } else if (!irt_ispri(irkey
->t
)) {
2229 emit_movmroi(as
, tmp
, 0, irkey
->i
);
2231 if (!(LJ_64
&& irt_islightud(irkey
->t
)))
2232 emit_movmroi(as
, tmp
, 4, irt_toitype(irkey
->t
));
2233 emit_loada(as
, tmp
, &J2G(as
->J
)->tmptv
);
2237 static void asm_uref(ASMState
*as
, IRIns
*ir
)
2239 /* NYI: Check that UREFO is still open and not aliasing a slot. */
2240 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
2241 if (irref_isk(ir
->op1
)) {
2242 GCfunc
*fn
= ir_kfunc(IR(ir
->op1
));
2243 MRef
*v
= &gcref(fn
->l
.uvptr
[(ir
->op2
>> 8)])->uv
.v
;
2244 emit_rma(as
, XO_MOV
, dest
, v
);
2246 Reg uv
= ra_scratch(as
, RSET_GPR
);
2247 Reg func
= ra_alloc1(as
, ir
->op1
, RSET_GPR
);
2248 if (ir
->o
== IR_UREFC
) {
2249 emit_rmro(as
, XO_LEA
, dest
, uv
, offsetof(GCupval
, tv
));
2250 asm_guardcc(as
, CC_NE
);
2252 emit_rmro(as
, XO_ARITHib
, XOg_CMP
, uv
, offsetof(GCupval
, closed
));
2254 emit_rmro(as
, XO_MOV
, dest
, uv
, offsetof(GCupval
, v
));
2256 emit_rmro(as
, XO_MOV
, uv
, func
,
2257 (int32_t)offsetof(GCfuncL
, uvptr
) + 4*(int32_t)(ir
->op2
>> 8));
2261 static void asm_fref(ASMState
*as
, IRIns
*ir
)
2263 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
2264 asm_fusefref(as
, ir
, RSET_GPR
);
2265 emit_mrm(as
, XO_LEA
, dest
, RID_MRM
);
2268 static void asm_strref(ASMState
*as
, IRIns
*ir
)
2270 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
2271 asm_fusestrref(as
, ir
, RSET_GPR
);
2272 if (as
->mrm
.base
== RID_NONE
)
2273 emit_loadi(as
, dest
, as
->mrm
.ofs
);
2274 else if (as
->mrm
.base
== dest
&& as
->mrm
.idx
== RID_NONE
)
2275 emit_gri(as
, XG_ARITHi(XOg_ADD
), dest
, as
->mrm
.ofs
);
2277 emit_mrm(as
, XO_LEA
, dest
, RID_MRM
);
2280 /* -- Loads and stores ---------------------------------------------------- */
2282 static void asm_fxload(ASMState
*as
, IRIns
*ir
)
2284 Reg dest
= ra_dest(as
, ir
, irt_isnum(ir
->t
) ? RSET_FPR
: RSET_GPR
);
2286 if (ir
->o
== IR_FLOAD
)
2287 asm_fusefref(as
, ir
, RSET_GPR
);
2289 asm_fusexref(as
, ir
->op1
, RSET_GPR
);
2290 /* ir->op2 is ignored -- unaligned loads are ok on x86. */
2291 switch (irt_type(ir
->t
)) {
2292 case IRT_I8
: xo
= XO_MOVSXb
; break;
2293 case IRT_U8
: xo
= XO_MOVZXb
; break;
2294 case IRT_I16
: xo
= XO_MOVSXw
; break;
2295 case IRT_U16
: xo
= XO_MOVZXw
; break;
2296 case IRT_NUM
: xo
= XMM_MOVRM(as
); break;
2297 case IRT_FLOAT
: xo
= XO_MOVSS
; break;
2299 if (LJ_64
&& irt_is64(ir
->t
))
2302 lua_assert(irt_isint(ir
->t
) || irt_isu32(ir
->t
) || irt_isaddr(ir
->t
));
2306 emit_mrm(as
, xo
, dest
, RID_MRM
);
2309 static void asm_fxstore(ASMState
*as
, IRIns
*ir
)
2311 RegSet allow
= RSET_GPR
;
2312 Reg src
= RID_NONE
, osrc
= RID_NONE
;
2314 /* The IRT_I16/IRT_U16 stores should never be simplified for constant
2315 ** values since mov word [mem], imm16 has a length-changing prefix.
2317 if (irt_isi16(ir
->t
) || irt_isu16(ir
->t
) || irt_isfp(ir
->t
) ||
2318 !asm_isk32(as
, ir
->op2
, &k
)) {
2319 RegSet allow8
= irt_isfp(ir
->t
) ? RSET_FPR
:
2320 (irt_isi8(ir
->t
) || irt_isu8(ir
->t
)) ? RSET_GPR8
: RSET_GPR
;
2321 src
= osrc
= ra_alloc1(as
, ir
->op2
, allow8
);
2322 if (!LJ_64
&& !rset_test(allow8
, src
)) { /* Already in wrong register. */
2323 rset_clear(allow
, osrc
);
2324 src
= ra_scratch(as
, allow8
);
2326 rset_clear(allow
, src
);
2328 if (ir
->o
== IR_FSTORE
)
2329 asm_fusefref(as
, IR(ir
->op1
), allow
);
2331 asm_fusexref(as
, ir
->op1
, allow
);
2332 /* ir->op2 is ignored -- unaligned stores are ok on x86. */
2333 if (ra_hasreg(src
)) {
2335 switch (irt_type(ir
->t
)) {
2336 case IRT_I8
: case IRT_U8
: xo
= XO_MOVtob
; src
|= FORCE_REX
; break;
2337 case IRT_I16
: case IRT_U16
: xo
= XO_MOVtow
; break;
2338 case IRT_NUM
: xo
= XO_MOVSDto
; break;
2339 case IRT_FLOAT
: xo
= XO_MOVSSto
; break;
2341 case IRT_LIGHTUD
: lua_assert(0); /* NYI: mask 64 bit lightuserdata. */
2344 if (LJ_64
&& irt_is64(ir
->t
))
2347 lua_assert(irt_isint(ir
->t
) || irt_isu32(ir
->t
) || irt_isaddr(ir
->t
));
2351 emit_mrm(as
, xo
, src
, RID_MRM
);
2352 if (!LJ_64
&& src
!= osrc
) {
2353 ra_noweak(as
, osrc
);
2354 emit_rr(as
, XO_MOV
, src
, osrc
);
2357 if (irt_isi8(ir
->t
) || irt_isu8(ir
->t
)) {
2359 emit_mrm(as
, XO_MOVmib
, 0, RID_MRM
);
2361 lua_assert(irt_is64(ir
->t
) || irt_isint(ir
->t
) || irt_isu32(ir
->t
) ||
2364 emit_mrm(as
, XO_MOVmi
, REX_64IR(ir
, 0), RID_MRM
);
2370 static Reg
asm_load_lightud64(ASMState
*as
, IRIns
*ir
, int typecheck
)
2372 if (ra_used(ir
) || typecheck
) {
2373 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
2375 Reg tmp
= ra_scratch(as
, rset_exclude(RSET_GPR
, dest
));
2376 asm_guardcc(as
, CC_NE
);
2378 emit_rr(as
, XO_ARITHi8
, XOg_CMP
, tmp
);
2379 emit_shifti(as
, XOg_SAR
|REX_64
, tmp
, 47);
2380 emit_rr(as
, XO_MOV
, tmp
|REX_64
, dest
);
2389 static void asm_ahuvload(ASMState
*as
, IRIns
*ir
)
2391 lua_assert(irt_isnum(ir
->t
) || irt_ispri(ir
->t
) || irt_isaddr(ir
->t
));
2393 if (irt_islightud(ir
->t
)) {
2394 Reg dest
= asm_load_lightud64(as
, ir
, 1);
2395 if (ra_hasreg(dest
)) {
2396 asm_fuseahuref(as
, ir
->op1
, RSET_GPR
);
2397 emit_mrm(as
, XO_MOV
, dest
|REX_64
, RID_MRM
);
2403 RegSet allow
= irt_isnum(ir
->t
) ? RSET_FPR
: RSET_GPR
;
2404 Reg dest
= ra_dest(as
, ir
, allow
);
2405 asm_fuseahuref(as
, ir
->op1
, RSET_GPR
);
2406 emit_mrm(as
, dest
< RID_MAX_GPR
? XO_MOV
: XMM_MOVRM(as
), dest
, RID_MRM
);
2408 asm_fuseahuref(as
, ir
->op1
, RSET_GPR
);
2410 /* Always do the type check, even if the load result is unused. */
2412 asm_guardcc(as
, irt_isnum(ir
->t
) ? CC_A
: CC_NE
);
2413 if (LJ_64
&& irt_isnum(ir
->t
)) {
2414 emit_u32(as
, LJ_TISNUM
);
2415 emit_mrm(as
, XO_ARITHi
, XOg_CMP
, RID_MRM
);
2417 emit_i8(as
, irt_toitype(ir
->t
));
2418 emit_mrm(as
, XO_ARITHi8
, XOg_CMP
, RID_MRM
);
2422 static void asm_ahustore(ASMState
*as
, IRIns
*ir
)
2424 if (irt_isnum(ir
->t
)) {
2425 Reg src
= ra_alloc1(as
, ir
->op2
, RSET_FPR
);
2426 asm_fuseahuref(as
, ir
->op1
, RSET_GPR
);
2427 emit_mrm(as
, XO_MOVSDto
, src
, RID_MRM
);
2429 } else if (irt_islightud(ir
->t
)) {
2430 Reg src
= ra_alloc1(as
, ir
->op2
, RSET_GPR
);
2431 asm_fuseahuref(as
, ir
->op1
, rset_exclude(RSET_GPR
, src
));
2432 emit_mrm(as
, XO_MOVto
, src
|REX_64
, RID_MRM
);
2435 IRIns
*irr
= IR(ir
->op2
);
2436 RegSet allow
= RSET_GPR
;
2438 if (!irref_isk(ir
->op2
)) {
2439 src
= ra_alloc1(as
, ir
->op2
, allow
);
2440 rset_clear(allow
, src
);
2442 asm_fuseahuref(as
, ir
->op1
, allow
);
2443 if (ra_hasreg(src
)) {
2444 emit_mrm(as
, XO_MOVto
, src
, RID_MRM
);
2445 } else if (!irt_ispri(irr
->t
)) {
2446 lua_assert(irt_isaddr(ir
->t
));
2447 emit_i32(as
, irr
->i
);
2448 emit_mrm(as
, XO_MOVmi
, 0, RID_MRM
);
2451 emit_i32(as
, (int32_t)irt_toitype(ir
->t
));
2452 emit_mrm(as
, XO_MOVmi
, 0, RID_MRM
);
2456 static void asm_sload(ASMState
*as
, IRIns
*ir
)
2458 int32_t ofs
= 8*((int32_t)ir
->op1
-1) + ((ir
->op2
& IRSLOAD_FRAME
) ? 4 : 0);
2461 lua_assert(!(ir
->op2
& IRSLOAD_PARENT
)); /* Handled by asm_head_side(). */
2462 lua_assert(irt_isguard(t
) || !(ir
->op2
& IRSLOAD_TYPECHECK
));
2463 lua_assert(!irt_isint(t
) || (ir
->op2
& (IRSLOAD_CONVERT
|IRSLOAD_FRAME
)));
2464 if ((ir
->op2
& IRSLOAD_CONVERT
) && irt_isguard(t
)) {
2465 Reg left
= ra_scratch(as
, RSET_FPR
);
2466 asm_tointg(as
, ir
, left
); /* Frees dest reg. Do this before base alloc. */
2467 base
= ra_alloc1(as
, REF_BASE
, RSET_GPR
);
2468 emit_rmro(as
, XMM_MOVRM(as
), left
, base
, ofs
);
2469 t
.irt
= IRT_NUM
; /* Continue with a regular number type check. */
2471 } else if (irt_islightud(t
)) {
2472 Reg dest
= asm_load_lightud64(as
, ir
, (ir
->op2
& IRSLOAD_TYPECHECK
));
2473 if (ra_hasreg(dest
)) {
2474 base
= ra_alloc1(as
, REF_BASE
, RSET_GPR
);
2475 emit_rmro(as
, XO_MOV
, dest
|REX_64
, base
, ofs
);
2479 } else if (ra_used(ir
)) {
2480 RegSet allow
= irt_isnum(t
) ? RSET_FPR
: RSET_GPR
;
2481 Reg dest
= ra_dest(as
, ir
, allow
);
2482 base
= ra_alloc1(as
, REF_BASE
, RSET_GPR
);
2483 lua_assert(irt_isnum(t
) || irt_isint(t
) || irt_isaddr(t
));
2484 if ((ir
->op2
& IRSLOAD_CONVERT
))
2485 emit_rmro(as
, XO_CVTSD2SI
, dest
, base
, ofs
);
2486 else if (irt_isnum(t
))
2487 emit_rmro(as
, XMM_MOVRM(as
), dest
, base
, ofs
);
2489 emit_rmro(as
, XO_MOV
, dest
, base
, ofs
);
2491 if (!(ir
->op2
& IRSLOAD_TYPECHECK
))
2492 return; /* No type check: avoid base alloc. */
2493 base
= ra_alloc1(as
, REF_BASE
, RSET_GPR
);
2495 if ((ir
->op2
& IRSLOAD_TYPECHECK
)) {
2496 /* Need type check, even if the load result is unused. */
2497 asm_guardcc(as
, irt_isnum(t
) ? CC_A
: CC_NE
);
2498 if (LJ_64
&& irt_isnum(t
)) {
2499 emit_u32(as
, LJ_TISNUM
);
2500 emit_rmro(as
, XO_ARITHi
, XOg_CMP
, base
, ofs
+4);
2502 emit_i8(as
, irt_toitype(t
));
2503 emit_rmro(as
, XO_ARITHi8
, XOg_CMP
, base
, ofs
+4);
2508 /* -- Allocations --------------------------------------------------------- */
2510 static void asm_snew(ASMState
*as
, IRIns
*ir
)
2512 const CCallInfo
*ci
= &lj_ir_callinfo
[IRCALL_lj_str_new
];
2514 args
[0] = ASMREF_L
; /* lua_State *L */
2515 args
[1] = ir
->op1
; /* const char *str */
2516 args
[2] = ir
->op2
; /* size_t len */
2518 asm_setupresult(as
, ir
, ci
); /* GCstr * */
2519 asm_gencall(as
, ci
, args
);
2522 static void asm_tnew(ASMState
*as
, IRIns
*ir
)
2524 const CCallInfo
*ci
= &lj_ir_callinfo
[IRCALL_lj_tab_new1
];
2526 args
[0] = ASMREF_L
; /* lua_State *L */
2527 args
[1] = ASMREF_TMP1
; /* uint32_t ahsize */
2529 asm_setupresult(as
, ir
, ci
); /* GCtab * */
2530 asm_gencall(as
, ci
, args
);
2531 emit_loadi(as
, ra_releasetmp(as
, ASMREF_TMP1
), ir
->op1
| (ir
->op2
<< 24));
2534 static void asm_tdup(ASMState
*as
, IRIns
*ir
)
2536 const CCallInfo
*ci
= &lj_ir_callinfo
[IRCALL_lj_tab_dup
];
2538 args
[0] = ASMREF_L
; /* lua_State *L */
2539 args
[1] = ir
->op1
; /* const GCtab *kt */
2541 asm_setupresult(as
, ir
, ci
); /* GCtab * */
2542 asm_gencall(as
, ci
, args
);
2546 static void asm_cnew(ASMState
*as
, IRIns
*ir
)
2548 CTState
*cts
= ctype_ctsG(J2G(as
->J
));
2549 CTypeID
typeid = (CTypeID
)IR(ir
->op1
)->i
;
2550 CTSize sz
= (ir
->o
== IR_CNEWI
|| ir
->op2
== REF_NIL
) ?
2551 lj_ctype_size(cts
, typeid) : (CTSize
)IR(ir
->op2
)->i
;
2552 const CCallInfo
*ci
= &lj_ir_callinfo
[IRCALL_lj_mem_newgco
];
2554 lua_assert(sz
!= CTSIZE_INVALID
);
2556 args
[0] = ASMREF_L
; /* lua_State *L */
2557 args
[1] = ASMREF_TMP1
; /* MSize size */
2559 asm_setupresult(as
, ir
, ci
); /* GCcdata * */
2561 /* Initialize immutable cdata object. */
2562 if (ir
->o
== IR_CNEWI
) {
2563 RegSet allow
= (RSET_GPR
& ~RSET_SCRATCH
);
2565 Reg r64
= sz
== 8 ? REX_64
: 0;
2566 if (irref_isk(ir
->op2
)) {
2567 IRIns
*irk
= IR(ir
->op2
);
2568 uint64_t k
= irk
->o
== IR_KINT64
? ir_k64(irk
)->u64
:
2569 (uint64_t)(uint32_t)irk
->i
;
2570 if (sz
== 4 || checki32((int64_t)k
)) {
2571 emit_i32(as
, (int32_t)k
);
2572 emit_rmro(as
, XO_MOVmi
, r64
, RID_RET
, sizeof(GCcdata
));
2574 emit_movtomro(as
, RID_ECX
+ r64
, RID_RET
, sizeof(GCcdata
));
2575 emit_loadu64(as
, RID_ECX
, k
);
2578 Reg r
= ra_alloc1(as
, ir
->op2
, allow
);
2579 emit_movtomro(as
, r
+ r64
, RID_RET
, sizeof(GCcdata
));
2582 int32_t ofs
= sizeof(GCcdata
);
2583 if (LJ_HASFFI
&& sz
== 8) {
2585 lua_assert(ir
->o
== IR_HIOP
);
2588 if (irref_isk(ir
->op2
)) {
2589 emit_movmroi(as
, RID_RET
, ofs
, IR(ir
->op2
)->i
);
2591 Reg r
= ra_alloc1(as
, ir
->op2
, allow
);
2592 emit_movtomro(as
, r
, RID_RET
, ofs
);
2593 rset_clear(allow
, r
);
2595 if (!LJ_HASFFI
|| ofs
== sizeof(GCcdata
)) break;
2599 lua_assert(sz
== 4 || (sz
== 8 && (LJ_64
|| LJ_HASFFI
)));
2602 /* Combine initialization of marked, gct and typeid. */
2603 emit_movtomro(as
, RID_ECX
, RID_RET
, offsetof(GCcdata
, marked
));
2604 emit_gri(as
, XG_ARITHi(XOg_OR
), RID_ECX
,
2605 (int32_t)((~LJ_TCDATA
<<8)+(typeid<<16)));
2606 emit_gri(as
, XG_ARITHi(XOg_AND
), RID_ECX
, LJ_GC_WHITES
);
2607 emit_opgl(as
, XO_MOVZXb
, RID_ECX
, gc
.currentwhite
);
2609 asm_gencall(as
, ci
, args
);
2610 emit_loadi(as
, ra_releasetmp(as
, ASMREF_TMP1
), (int32_t)(sz
+sizeof(GCcdata
)));
2613 #define asm_cnew(as, ir) ((void)0)
2616 /* -- Write barriers ------------------------------------------------------ */
2618 static void asm_tbar(ASMState
*as
, IRIns
*ir
)
2620 Reg tab
= ra_alloc1(as
, ir
->op1
, RSET_GPR
);
2621 Reg tmp
= ra_scratch(as
, rset_exclude(RSET_GPR
, tab
));
2622 MCLabel l_end
= emit_label(as
);
2623 emit_movtomro(as
, tmp
, tab
, offsetof(GCtab
, gclist
));
2624 emit_setgl(as
, tab
, gc
.grayagain
);
2625 emit_getgl(as
, tmp
, gc
.grayagain
);
2626 emit_i8(as
, ~LJ_GC_BLACK
);
2627 emit_rmro(as
, XO_ARITHib
, XOg_AND
, tab
, offsetof(GCtab
, marked
));
2628 emit_sjcc(as
, CC_Z
, l_end
);
2629 emit_i8(as
, LJ_GC_BLACK
);
2630 emit_rmro(as
, XO_GROUP3b
, XOg_TEST
, tab
, offsetof(GCtab
, marked
));
2633 static void asm_obar(ASMState
*as
, IRIns
*ir
)
2635 const CCallInfo
*ci
= &lj_ir_callinfo
[IRCALL_lj_gc_barrieruv
];
2639 /* No need for other object barriers (yet). */
2640 lua_assert(IR(ir
->op1
)->o
== IR_UREFC
);
2641 ra_evictset(as
, RSET_SCRATCH
);
2642 l_end
= emit_label(as
);
2643 args
[0] = ASMREF_TMP1
; /* global_State *g */
2644 args
[1] = ir
->op1
; /* TValue *tv */
2645 asm_gencall(as
, ci
, args
);
2646 emit_loada(as
, ra_releasetmp(as
, ASMREF_TMP1
), J2G(as
->J
));
2647 obj
= IR(ir
->op1
)->r
;
2648 emit_sjcc(as
, CC_Z
, l_end
);
2649 emit_i8(as
, LJ_GC_WHITES
);
2650 if (irref_isk(ir
->op2
)) {
2651 GCobj
*vp
= ir_kgc(IR(ir
->op2
));
2652 emit_rma(as
, XO_GROUP3b
, XOg_TEST
, &vp
->gch
.marked
);
2654 Reg val
= ra_alloc1(as
, ir
->op2
, rset_exclude(RSET_SCRATCH
&RSET_GPR
, obj
));
2655 emit_rmro(as
, XO_GROUP3b
, XOg_TEST
, val
, (int32_t)offsetof(GChead
, marked
));
2657 emit_sjcc(as
, CC_Z
, l_end
);
2658 emit_i8(as
, LJ_GC_BLACK
);
2659 emit_rmro(as
, XO_GROUP3b
, XOg_TEST
, obj
,
2660 (int32_t)offsetof(GCupval
, marked
)-(int32_t)offsetof(GCupval
, tv
));
2663 /* -- FP/int arithmetic and logic operations ------------------------------ */
2665 /* Load reference onto x87 stack. Force a spill to memory if needed. */
2666 static void asm_x87load(ASMState
*as
, IRRef ref
)
2668 IRIns
*ir
= IR(ref
);
2669 if (ir
->o
== IR_KNUM
) {
2670 cTValue
*tv
= ir_knum(ir
);
2671 if (tvispzero(tv
)) /* Use fldz only for +0. */
2672 emit_x87op(as
, XI_FLDZ
);
2673 else if (tvispone(tv
))
2674 emit_x87op(as
, XI_FLD1
);
2676 emit_rma(as
, XO_FLDq
, XOg_FLDq
, tv
);
2677 } else if (ir
->o
== IR_CONV
&& ir
->op2
== IRCONV_NUM_INT
&& !ra_used(ir
) &&
2678 !irref_isk(ir
->op1
) && mayfuse(as
, ir
->op1
)) {
2679 IRIns
*iri
= IR(ir
->op1
);
2680 emit_rmro(as
, XO_FILDd
, XOg_FILDd
, RID_ESP
, ra_spill(as
, iri
));
2682 emit_mrm(as
, XO_FLDq
, XOg_FLDq
, asm_fuseload(as
, ref
, RSET_EMPTY
));
2686 /* Try to rejoin pow from EXP2, MUL and LOG2 (if still unsplit). */
2687 static int fpmjoin_pow(ASMState
*as
, IRIns
*ir
)
2689 IRIns
*irp
= IR(ir
->op1
);
2690 if (irp
== ir
-1 && irp
->o
== IR_MUL
&& !ra_used(irp
)) {
2691 IRIns
*irpp
= IR(irp
->op1
);
2692 if (irpp
== ir
-2 && irpp
->o
== IR_FPMATH
&&
2693 irpp
->op2
== IRFPM_LOG2
&& !ra_used(irpp
)) {
2694 /* The modified regs must match with the *.dasc implementation. */
2695 RegSet drop
= RSET_RANGE(RID_XMM0
, RID_XMM2
+1)|RID2RSET(RID_EAX
);
2697 if (ra_hasreg(ir
->r
))
2698 rset_clear(drop
, ir
->r
); /* Dest reg handled below. */
2699 ra_evictset(as
, drop
);
2700 ra_destreg(as
, ir
, RID_XMM0
);
2701 emit_call(as
, lj_vm_pow_sse
);
2702 irx
= IR(irpp
->op1
);
2703 if (ra_noreg(irx
->r
) && ra_gethint(irx
->r
) == RID_XMM1
)
2704 irx
->r
= RID_INIT
; /* Avoid allocating xmm1 for x. */
2705 ra_left(as
, RID_XMM0
, irpp
->op1
);
2706 ra_left(as
, RID_XMM1
, irp
->op2
);
2713 static void asm_fpmath(ASMState
*as
, IRIns
*ir
)
2715 IRFPMathOp fpm
= ir
->o
== IR_FPMATH
? (IRFPMathOp
)ir
->op2
: IRFPM_OTHER
;
2716 if (fpm
== IRFPM_SQRT
) {
2717 Reg dest
= ra_dest(as
, ir
, RSET_FPR
);
2718 Reg left
= asm_fuseload(as
, ir
->op1
, RSET_FPR
);
2719 emit_mrm(as
, XO_SQRTSD
, dest
, left
);
2720 } else if (fpm
<= IRFPM_TRUNC
) {
2721 if (as
->flags
& JIT_F_SSE4_1
) { /* SSE4.1 has a rounding instruction. */
2722 Reg dest
= ra_dest(as
, ir
, RSET_FPR
);
2723 Reg left
= asm_fuseload(as
, ir
->op1
, RSET_FPR
);
2724 /* ROUNDSD has a 4-byte opcode which doesn't fit in x86Op.
2725 ** Let's pretend it's a 3-byte opcode, and compensate afterwards.
2726 ** This is atrocious, but the alternatives are much worse.
2728 /* Round down/up/trunc == 1001/1010/1011. */
2729 emit_i8(as
, 0x09 + fpm
);
2730 emit_mrm(as
, XO_ROUNDSD
, dest
, left
);
2731 if (LJ_64
&& as
->mcp
[1] != (MCode
)(XO_ROUNDSD
>> 16)) {
2732 as
->mcp
[0] = as
->mcp
[1]; as
->mcp
[1] = 0x0f; /* Swap 0F and REX. */
2734 *--as
->mcp
= 0x66; /* 1st byte of ROUNDSD opcode. */
2735 } else { /* Call helper functions for SSE2 variant. */
2736 /* The modified regs must match with the *.dasc implementation. */
2737 RegSet drop
= RSET_RANGE(RID_XMM0
, RID_XMM3
+1)|RID2RSET(RID_EAX
);
2738 if (ra_hasreg(ir
->r
))
2739 rset_clear(drop
, ir
->r
); /* Dest reg handled below. */
2740 ra_evictset(as
, drop
);
2741 ra_destreg(as
, ir
, RID_XMM0
);
2742 emit_call(as
, fpm
== IRFPM_FLOOR
? lj_vm_floor_sse
:
2743 fpm
== IRFPM_CEIL
? lj_vm_ceil_sse
: lj_vm_trunc_sse
);
2744 ra_left(as
, RID_XMM0
, ir
->op1
);
2746 } else if (fpm
== IRFPM_EXP2
&& fpmjoin_pow(as
, ir
)) {
2747 /* Rejoined to pow(). */
2748 } else { /* Handle x87 ops. */
2749 int32_t ofs
= sps_scale(ir
->s
); /* Use spill slot or temp slots. */
2751 if (ra_hasreg(dest
)) {
2753 ra_modified(as
, dest
);
2754 emit_rmro(as
, XMM_MOVRM(as
), dest
, RID_ESP
, ofs
);
2756 emit_rmro(as
, XO_FSTPq
, XOg_FSTPq
, RID_ESP
, ofs
);
2757 switch (fpm
) { /* st0 = lj_vm_*(st0) */
2758 case IRFPM_EXP
: emit_call(as
, lj_vm_exp
); break;
2759 case IRFPM_EXP2
: emit_call(as
, lj_vm_exp2
); break;
2760 case IRFPM_SIN
: emit_x87op(as
, XI_FSIN
); break;
2761 case IRFPM_COS
: emit_x87op(as
, XI_FCOS
); break;
2762 case IRFPM_TAN
: emit_x87op(as
, XI_FPOP
); emit_x87op(as
, XI_FPTAN
); break;
2763 case IRFPM_LOG
: case IRFPM_LOG2
: case IRFPM_LOG10
:
2764 /* Note: the use of fyl2xp1 would be pointless here. When computing
2765 ** log(1.0+eps) the precision is already lost after 1.0 is added.
2766 ** Subtracting 1.0 won't recover it. OTOH math.log1p would make sense.
2768 emit_x87op(as
, XI_FYL2X
); break;
2772 emit_x87op(as
, XI_FPATAN
); asm_x87load(as
, ir
->op2
); break;
2774 emit_x87op(as
, XI_FPOP1
); emit_x87op(as
, XI_FSCALE
); break;
2775 default: lua_assert(0); break;
2778 default: lua_assert(0); break;
2780 asm_x87load(as
, ir
->op1
);
2782 case IRFPM_LOG
: emit_x87op(as
, XI_FLDLN2
); break;
2783 case IRFPM_LOG2
: emit_x87op(as
, XI_FLD1
); break;
2784 case IRFPM_LOG10
: emit_x87op(as
, XI_FLDLG2
); break;
2786 if (ir
->o
== IR_LDEXP
) asm_x87load(as
, ir
->op2
);
2793 static void asm_fppowi(ASMState
*as
, IRIns
*ir
)
2795 /* The modified regs must match with the *.dasc implementation. */
2796 RegSet drop
= RSET_RANGE(RID_XMM0
, RID_XMM1
+1)|RID2RSET(RID_EAX
);
2797 if (ra_hasreg(ir
->r
))
2798 rset_clear(drop
, ir
->r
); /* Dest reg handled below. */
2799 ra_evictset(as
, drop
);
2800 ra_destreg(as
, ir
, RID_XMM0
);
2801 emit_call(as
, lj_vm_powi_sse
);
2802 ra_left(as
, RID_XMM0
, ir
->op1
);
2803 ra_left(as
, RID_EAX
, ir
->op2
);
2806 #if LJ_64 && LJ_HASFFI
2807 static void asm_arith64(ASMState
*as
, IRIns
*ir
, IRCallID id
)
2809 const CCallInfo
*ci
= &lj_ir_callinfo
[id
];
2813 asm_setupresult(as
, ir
, ci
);
2814 asm_gencall(as
, ci
, args
);
2818 /* Find out whether swapping operands might be beneficial. */
2819 static int swapops(ASMState
*as
, IRIns
*ir
)
2821 IRIns
*irl
= IR(ir
->op1
);
2822 IRIns
*irr
= IR(ir
->op2
);
2823 lua_assert(ra_noreg(irr
->r
));
2824 if (!irm_iscomm(lj_ir_mode
[ir
->o
]))
2825 return 0; /* Can't swap non-commutative operations. */
2826 if (irref_isk(ir
->op2
))
2827 return 0; /* Don't swap constants to the left. */
2828 if (ra_hasreg(irl
->r
))
2829 return 1; /* Swap if left already has a register. */
2830 if (ra_samehint(ir
->r
, irr
->r
))
2831 return 1; /* Swap if dest and right have matching hints. */
2832 if (as
->curins
> as
->loopref
) { /* In variant part? */
2833 if (ir
->op2
< as
->loopref
&& !irt_isphi(irr
->t
))
2834 return 0; /* Keep invariants on the right. */
2835 if (ir
->op1
< as
->loopref
&& !irt_isphi(irl
->t
))
2836 return 1; /* Swap invariants to the right. */
2838 if (opisfusableload(irl
->o
))
2839 return 1; /* Swap fusable loads to the right. */
2840 return 0; /* Otherwise don't swap. */
2843 static void asm_fparith(ASMState
*as
, IRIns
*ir
, x86Op xo
)
2845 IRRef lref
= ir
->op1
;
2846 IRRef rref
= ir
->op2
;
2847 RegSet allow
= RSET_FPR
;
2849 Reg right
= IR(rref
)->r
;
2850 if (ra_hasreg(right
)) {
2851 rset_clear(allow
, right
);
2852 ra_noweak(as
, right
);
2854 dest
= ra_dest(as
, ir
, allow
);
2857 } else if (ra_noreg(right
)) {
2858 if (swapops(as
, ir
)) {
2859 IRRef tmp
= lref
; lref
= rref
; rref
= tmp
;
2861 right
= asm_fuseload(as
, rref
, rset_clear(allow
, dest
));
2863 emit_mrm(as
, xo
, dest
, right
);
2864 ra_left(as
, dest
, lref
);
2867 static void asm_intarith(ASMState
*as
, IRIns
*ir
, x86Arith xa
)
2869 IRRef lref
= ir
->op1
;
2870 IRRef rref
= ir
->op2
;
2871 RegSet allow
= RSET_GPR
;
2874 if (as
->testmcp
== as
->mcp
) { /* Drop test r,r instruction. */
2876 as
->mcp
+= (LJ_64
&& *as
->mcp
!= XI_TEST
) ? 3 : 2;
2878 right
= IR(rref
)->r
;
2879 if (ra_hasreg(right
)) {
2880 rset_clear(allow
, right
);
2881 ra_noweak(as
, right
);
2883 dest
= ra_dest(as
, ir
, allow
);
2886 } else if (ra_noreg(right
) && !asm_isk32(as
, rref
, &k
)) {
2887 if (swapops(as
, ir
)) {
2888 IRRef tmp
= lref
; lref
= rref
; rref
= tmp
;
2890 right
= asm_fuseload(as
, rref
, rset_clear(allow
, dest
));
2892 if (irt_isguard(ir
->t
)) /* For IR_ADDOV etc. */
2893 asm_guardcc(as
, CC_O
);
2894 if (xa
!= XOg_X_IMUL
) {
2895 if (ra_hasreg(right
))
2896 emit_mrm(as
, XO_ARITH(xa
), REX_64IR(ir
, dest
), right
);
2898 emit_gri(as
, XG_ARITHi(xa
), REX_64IR(ir
, dest
), k
);
2899 } else if (ra_hasreg(right
)) { /* IMUL r, mrm. */
2900 emit_mrm(as
, XO_IMUL
, REX_64IR(ir
, dest
), right
);
2901 } else { /* IMUL r, r, k. */
2902 /* NYI: use lea/shl/add/sub (FOLD only does 2^k) depending on CPU. */
2903 Reg left
= asm_fuseload(as
, lref
, RSET_GPR
);
2905 if (checki8(k
)) { emit_i8(as
, k
); xo
= XO_IMULi8
;
2906 } else { emit_i32(as
, k
); xo
= XO_IMULi
; }
2907 emit_rr(as
, xo
, REX_64IR(ir
, dest
), left
);
2910 ra_left(as
, dest
, lref
);
2913 /* LEA is really a 4-operand ADD with an independent destination register,
2914 ** up to two source registers and an immediate. One register can be scaled
2915 ** by 1, 2, 4 or 8. This can be used to avoid moves or to fuse several
2918 ** Currently only a few common cases are supported:
2919 ** - 3-operand ADD: y = a+b; y = a+k with a and b already allocated
2920 ** - Left ADD fusion: y = (a+b)+k; y = (a+k)+b
2921 ** - Right ADD fusion: y = a+(b+k)
2922 ** The ommited variants have already been reduced by FOLD.
2924 ** There are more fusion opportunities, like gathering shifts or joining
2925 ** common references. But these are probably not worth the trouble, since
2926 ** array indexing is not decomposed and already makes use of all fields
2927 ** of the ModRM operand.
2929 static int asm_lea(ASMState
*as
, IRIns
*ir
)
2931 IRIns
*irl
= IR(ir
->op1
);
2932 IRIns
*irr
= IR(ir
->op2
);
2933 RegSet allow
= RSET_GPR
;
2935 as
->mrm
.base
= as
->mrm
.idx
= RID_NONE
;
2936 as
->mrm
.scale
= XM_SCALE1
;
2938 if (ra_hasreg(irl
->r
)) {
2939 rset_clear(allow
, irl
->r
);
2940 ra_noweak(as
, irl
->r
);
2941 as
->mrm
.base
= irl
->r
;
2942 if (irref_isk(ir
->op2
) || ra_hasreg(irr
->r
)) {
2943 /* The PHI renaming logic does a better job in some cases. */
2944 if (ra_hasreg(ir
->r
) &&
2945 ((irt_isphi(irl
->t
) && as
->phireg
[ir
->r
] == ir
->op1
) ||
2946 (irt_isphi(irr
->t
) && as
->phireg
[ir
->r
] == ir
->op2
)))
2948 if (irref_isk(ir
->op2
)) {
2949 as
->mrm
.ofs
= irr
->i
;
2951 rset_clear(allow
, irr
->r
);
2952 ra_noweak(as
, irr
->r
);
2953 as
->mrm
.idx
= irr
->r
;
2955 } else if (irr
->o
== IR_ADD
&& mayfuse(as
, ir
->op2
) &&
2956 irref_isk(irr
->op2
)) {
2957 Reg idx
= ra_alloc1(as
, irr
->op1
, allow
);
2958 rset_clear(allow
, idx
);
2959 as
->mrm
.idx
= (uint8_t)idx
;
2960 as
->mrm
.ofs
= IR(irr
->op2
)->i
;
2964 } else if (ir
->op1
!= ir
->op2
&& irl
->o
== IR_ADD
&& mayfuse(as
, ir
->op1
) &&
2965 (irref_isk(ir
->op2
) || irref_isk(irl
->op2
))) {
2966 Reg idx
, base
= ra_alloc1(as
, irl
->op1
, allow
);
2967 rset_clear(allow
, base
);
2968 as
->mrm
.base
= (uint8_t)base
;
2969 if (irref_isk(ir
->op2
)) {
2970 as
->mrm
.ofs
= irr
->i
;
2971 idx
= ra_alloc1(as
, irl
->op2
, allow
);
2973 as
->mrm
.ofs
= IR(irl
->op2
)->i
;
2974 idx
= ra_alloc1(as
, ir
->op2
, allow
);
2976 rset_clear(allow
, idx
);
2977 as
->mrm
.idx
= (uint8_t)idx
;
2981 dest
= ra_dest(as
, ir
, allow
);
2982 emit_mrm(as
, XO_LEA
, dest
, RID_MRM
);
2983 return 1; /* Success. */
2986 static void asm_add(ASMState
*as
, IRIns
*ir
)
2988 if (irt_isnum(ir
->t
))
2989 asm_fparith(as
, ir
, XO_ADDSD
);
2990 else if ((as
->flags
& JIT_F_LEA_AGU
) || as
->testmcp
== as
->mcp
||
2991 irt_is64(ir
->t
) || !asm_lea(as
, ir
))
2992 asm_intarith(as
, ir
, XOg_ADD
);
2995 static void asm_neg_not(ASMState
*as
, IRIns
*ir
, x86Group3 xg
)
2997 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
2998 emit_rr(as
, XO_GROUP3
, REX_64IR(ir
, xg
), dest
);
2999 ra_left(as
, dest
, ir
->op1
);
3002 static void asm_bitswap(ASMState
*as
, IRIns
*ir
)
3004 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
3005 as
->mcp
= emit_op(XO_BSWAP
+ ((dest
&7) << 24),
3006 REX_64IR(ir
, dest
), 0, 0, as
->mcp
, 1);
3007 ra_left(as
, dest
, ir
->op1
);
3010 static void asm_bitshift(ASMState
*as
, IRIns
*ir
, x86Shift xs
)
3012 IRRef rref
= ir
->op2
;
3013 IRIns
*irr
= IR(rref
);
3015 if (irref_isk(rref
)) { /* Constant shifts. */
3017 dest
= ra_dest(as
, ir
, RSET_GPR
);
3018 shift
= irr
->i
& (irt_is64(ir
->t
) ? 63 : 31);
3021 case 1: emit_rr(as
, XO_SHIFT1
, REX_64IR(ir
, xs
), dest
); break;
3022 default: emit_shifti(as
, REX_64IR(ir
, xs
), dest
, shift
); break;
3024 } else { /* Variable shifts implicitly use register cl (i.e. ecx). */
3025 RegSet allow
= rset_exclude(RSET_GPR
, RID_ECX
);
3027 if (ra_noreg(right
)) {
3028 right
= ra_allocref(as
, rref
, RID2RSET(RID_ECX
));
3029 } else if (right
!= RID_ECX
) {
3030 rset_clear(allow
, right
);
3031 ra_scratch(as
, RID2RSET(RID_ECX
));
3033 dest
= ra_dest(as
, ir
, allow
);
3034 emit_rr(as
, XO_SHIFTcl
, REX_64IR(ir
, xs
), dest
);
3035 if (right
!= RID_ECX
) {
3036 ra_noweak(as
, right
);
3037 emit_rr(as
, XO_MOV
, RID_ECX
, right
);
3040 ra_left(as
, dest
, ir
->op1
);
3042 ** Note: avoid using the flags resulting from a shift or rotate!
3043 ** All of them cause a partial flag stall, except for r,1 shifts
3044 ** (but not rotates). And a shift count of 0 leaves the flags unmodified.
3048 /* -- Comparisons --------------------------------------------------------- */
3050 /* Virtual flags for unordered FP comparisons. */
3051 #define VCC_U 0x1000 /* Unordered. */
3052 #define VCC_P 0x2000 /* Needs extra CC_P branch. */
3053 #define VCC_S 0x4000 /* Swap avoids CC_P branch. */
3054 #define VCC_PS (VCC_P|VCC_S)
3056 /* Map of comparisons to flags. ORDER IR. */
3057 #define COMPFLAGS(ci, cin, cu, cf) ((ci)+((cu)<<4)+((cin)<<8)+(cf))
3058 static const uint16_t asm_compmap
[IR_ABC
+1] = {
3059 /* signed non-eq unsigned flags */
3060 /* LT */ COMPFLAGS(CC_GE
, CC_G
, CC_AE
, VCC_PS
),
3061 /* GE */ COMPFLAGS(CC_L
, CC_L
, CC_B
, 0),
3062 /* LE */ COMPFLAGS(CC_G
, CC_G
, CC_A
, VCC_PS
),
3063 /* GT */ COMPFLAGS(CC_LE
, CC_L
, CC_BE
, 0),
3064 /* ULT */ COMPFLAGS(CC_AE
, CC_A
, CC_AE
, VCC_U
),
3065 /* UGE */ COMPFLAGS(CC_B
, CC_B
, CC_B
, VCC_U
|VCC_PS
),
3066 /* ULE */ COMPFLAGS(CC_A
, CC_A
, CC_A
, VCC_U
),
3067 /* UGT */ COMPFLAGS(CC_BE
, CC_B
, CC_BE
, VCC_U
|VCC_PS
),
3068 /* EQ */ COMPFLAGS(CC_NE
, CC_NE
, CC_NE
, VCC_P
),
3069 /* NE */ COMPFLAGS(CC_E
, CC_E
, CC_E
, VCC_U
|VCC_P
),
3070 /* ABC */ COMPFLAGS(CC_BE
, CC_B
, CC_BE
, VCC_U
|VCC_PS
) /* Same as UGT. */
3073 /* FP and integer comparisons. */
3074 static void asm_comp(ASMState
*as
, IRIns
*ir
, uint32_t cc
)
3076 if (irt_isnum(ir
->t
)) {
3077 IRRef lref
= ir
->op1
;
3078 IRRef rref
= ir
->op2
;
3082 ** An extra CC_P branch is required to preserve ordered/unordered
3083 ** semantics for FP comparisons. This can be avoided by swapping
3084 ** the operands and inverting the condition (except for EQ and UNE).
3085 ** So always try to swap if possible.
3087 ** Another option would be to swap operands to achieve better memory
3088 ** operand fusion. But it's unlikely that this outweighs the cost
3089 ** of the extra branches.
3091 if (cc
& VCC_S
) { /* Swap? */
3092 IRRef tmp
= lref
; lref
= rref
; rref
= tmp
;
3093 cc
^= (VCC_PS
|(5<<4)); /* A <-> B, AE <-> BE, PS <-> none */
3095 left
= ra_alloc1(as
, lref
, RSET_FPR
);
3096 right
= asm_fuseload(as
, rref
, rset_exclude(RSET_FPR
, left
));
3097 l_around
= emit_label(as
);
3098 asm_guardcc(as
, cc
>> 4);
3099 if (cc
& VCC_P
) { /* Extra CC_P branch required? */
3100 if (!(cc
& VCC_U
)) {
3101 asm_guardcc(as
, CC_P
); /* Branch to exit for ordered comparisons. */
3102 } else if (l_around
!= as
->invmcp
) {
3103 emit_sjcc(as
, CC_P
, l_around
); /* Branch around for unordered. */
3105 /* Patched to mcloop by asm_loop_fixup. */
3108 emit_sjcc(as
, CC_P
, as
->mcp
);
3110 emit_jcc(as
, CC_P
, as
->mcp
);
3113 emit_mrm(as
, XO_UCOMISD
, left
, right
);
3115 IRRef lref
= ir
->op1
, rref
= ir
->op2
;
3116 IROp leftop
= (IROp
)(IR(lref
)->o
);
3117 Reg r64
= REX_64IR(ir
, 0);
3119 lua_assert(irt_is64(ir
->t
) || irt_isint(ir
->t
) || irt_isaddr(ir
->t
));
3120 /* Swap constants (only for ABC) and fusable loads to the right. */
3121 if (irref_isk(lref
) || (!irref_isk(rref
) && opisfusableload(leftop
))) {
3122 if ((cc
& 0xc) == 0xc) cc
^= 3; /* L <-> G, LE <-> GE */
3123 else if ((cc
& 0xa) == 0x2) cc
^= 5; /* A <-> B, AE <-> BE */
3124 lref
= ir
->op2
; rref
= ir
->op1
;
3126 if (asm_isk32(as
, rref
, &imm
)) {
3127 IRIns
*irl
= IR(lref
);
3128 /* Check wether we can use test ins. Not for unsigned, since CF=0. */
3129 int usetest
= (imm
== 0 && (cc
& 0xa) != 0x2);
3130 if (usetest
&& irl
->o
== IR_BAND
&& irl
+1 == ir
&& !ra_used(irl
)) {
3131 /* Combine comp(BAND(ref, r/imm), 0) into test mrm, r/imm. */
3132 Reg right
, left
= RID_NONE
;
3133 RegSet allow
= RSET_GPR
;
3134 if (!asm_isk32(as
, irl
->op2
, &imm
)) {
3135 left
= ra_alloc1(as
, irl
->op2
, allow
);
3136 rset_clear(allow
, left
);
3137 } else { /* Try to Fuse IRT_I8/IRT_U8 loads, too. See below. */
3138 IRIns
*irll
= IR(irl
->op1
);
3139 if (opisfusableload((IROp
)irll
->o
) &&
3140 (irt_isi8(irll
->t
) || irt_isu8(irll
->t
))) {
3141 IRType1 origt
= irll
->t
; /* Temporarily flip types. */
3142 irll
->t
.irt
= (irll
->t
.irt
& ~IRT_TYPE
) | IRT_INT
;
3143 as
->curins
--; /* Skip to BAND to avoid failing in noconflict(). */
3144 right
= asm_fuseload(as
, irl
->op1
, RSET_GPR
);
3147 if (right
!= RID_MRM
) goto test_nofuse
;
3148 /* Fusion succeeded, emit test byte mrm, imm8. */
3149 asm_guardcc(as
, cc
);
3150 emit_i8(as
, (imm
& 0xff));
3151 emit_mrm(as
, XO_GROUP3b
, XOg_TEST
, RID_MRM
);
3155 as
->curins
--; /* Skip to BAND to avoid failing in noconflict(). */
3156 right
= asm_fuseload(as
, irl
->op1
, allow
);
3157 as
->curins
++; /* Undo the above. */
3159 asm_guardcc(as
, cc
);
3160 if (ra_noreg(left
)) {
3162 emit_mrm(as
, XO_GROUP3
, r64
+ XOg_TEST
, right
);
3164 emit_mrm(as
, XO_TEST
, r64
+ left
, right
);
3168 if (opisfusableload((IROp
)irl
->o
) &&
3169 ((irt_isu8(irl
->t
) && checku8(imm
)) ||
3170 ((irt_isi8(irl
->t
) || irt_isi16(irl
->t
)) && checki8(imm
)) ||
3171 (irt_isu16(irl
->t
) && checku16(imm
) && checki8((int16_t)imm
)))) {
3172 /* Only the IRT_INT case is fused by asm_fuseload.
3173 ** The IRT_I8/IRT_U8 loads and some IRT_I16/IRT_U16 loads
3174 ** are handled here.
3175 ** Note that cmp word [mem], imm16 should not be generated,
3176 ** since it has a length-changing prefix. Compares of a word
3177 ** against a sign-extended imm8 are ok, however.
3179 IRType1 origt
= irl
->t
; /* Temporarily flip types. */
3180 irl
->t
.irt
= (irl
->t
.irt
& ~IRT_TYPE
) | IRT_INT
;
3181 left
= asm_fuseload(as
, lref
, RSET_GPR
);
3183 if (left
== RID_MRM
) { /* Fusion succeeded? */
3184 asm_guardcc(as
, cc
);
3186 emit_mrm(as
, (irt_isi8(origt
) || irt_isu8(origt
)) ?
3187 XO_ARITHib
: XO_ARITHiw8
, r64
+ XOg_CMP
, RID_MRM
);
3189 } /* Otherwise handle register case as usual. */
3191 left
= asm_fuseload(as
, lref
, RSET_GPR
);
3193 asm_guardcc(as
, cc
);
3194 if (usetest
&& left
!= RID_MRM
) {
3195 /* Use test r,r instead of cmp r,0. */
3196 emit_rr(as
, XO_TEST
, r64
+ left
, left
);
3197 if (irl
+1 == ir
) /* Referencing previous ins? */
3198 as
->testmcp
= as
->mcp
; /* Set flag to drop test r,r if possible. */
3200 emit_gmrmi(as
, XG_ARITHi(XOg_CMP
), r64
+ left
, imm
);
3204 Reg left
= ra_alloc1(as
, lref
, RSET_GPR
);
3205 Reg right
= asm_fuseload(as
, rref
, rset_exclude(RSET_GPR
, left
));
3206 asm_guardcc(as
, cc
);
3207 emit_mrm(as
, XO_CMP
, r64
+ left
, right
);
3212 #if LJ_32 && LJ_HASFFI
3213 /* 64 bit integer comparisons in 32 bit mode. */
3214 static void asm_comp_int64(ASMState
*as
, IRIns
*ir
)
3216 uint32_t cc
= asm_compmap
[(ir
-1)->o
];
3217 RegSet allow
= RSET_GPR
;
3218 Reg lefthi
= RID_NONE
, leftlo
= RID_NONE
;
3219 Reg righthi
= RID_NONE
, rightlo
= RID_NONE
;
3223 as
->curins
--; /* Skip loword ins. Avoids failing in noconflict(), too. */
3225 /* Allocate/fuse hiword operands. */
3226 if (irref_isk(ir
->op2
)) {
3227 lefthi
= asm_fuseload(as
, ir
->op1
, allow
);
3229 lefthi
= ra_alloc1(as
, ir
->op1
, allow
);
3230 righthi
= asm_fuseload(as
, ir
->op2
, allow
);
3231 if (righthi
== RID_MRM
) {
3232 if (as
->mrm
.base
!= RID_NONE
) rset_clear(allow
, as
->mrm
.base
);
3233 if (as
->mrm
.idx
!= RID_NONE
) rset_clear(allow
, as
->mrm
.idx
);
3235 rset_clear(allow
, righthi
);
3238 mrm
= as
->mrm
; /* Save state for hiword instruction. */
3240 /* Allocate/fuse loword operands. */
3241 if (irref_isk((ir
-1)->op2
)) {
3242 leftlo
= asm_fuseload(as
, (ir
-1)->op1
, allow
);
3244 leftlo
= ra_alloc1(as
, (ir
-1)->op1
, allow
);
3245 rightlo
= asm_fuseload(as
, (ir
-1)->op2
, allow
);
3246 if (rightlo
== RID_MRM
) {
3247 if (as
->mrm
.base
!= RID_NONE
) rset_clear(allow
, as
->mrm
.base
);
3248 if (as
->mrm
.idx
!= RID_NONE
) rset_clear(allow
, as
->mrm
.idx
);
3250 rset_clear(allow
, rightlo
);
3254 /* All register allocations must be performed _before_ this point. */
3255 l_around
= emit_label(as
);
3256 as
->invmcp
= as
->testmcp
= NULL
; /* Cannot use these optimizations. */
3258 /* Loword comparison and branch. */
3259 asm_guardcc(as
, cc
>> 4); /* Always use unsigned compare for loword. */
3260 if (ra_noreg(rightlo
)) {
3261 int32_t imm
= IR((ir
-1)->op2
)->i
;
3262 if (imm
== 0 && ((cc
>> 4) & 0xa) != 0x2 && leftlo
!= RID_MRM
)
3263 emit_rr(as
, XO_TEST
, leftlo
, leftlo
);
3265 emit_gmrmi(as
, XG_ARITHi(XOg_CMP
), leftlo
, imm
);
3267 emit_mrm(as
, XO_CMP
, leftlo
, rightlo
);
3270 /* Hiword comparison and branches. */
3271 if ((cc
& 15) != CC_NE
)
3272 emit_sjcc(as
, CC_NE
, l_around
); /* Hiword unequal: skip loword compare. */
3273 if ((cc
& 15) != CC_E
)
3274 asm_guardcc(as
, cc
>> 8); /* Hiword compare without equality check. */
3275 as
->mrm
= mrm
; /* Restore state. */
3276 if (ra_noreg(righthi
)) {
3277 int32_t imm
= IR(ir
->op2
)->i
;
3278 if (imm
== 0 && (cc
& 0xa) != 0x2 && lefthi
!= RID_MRM
)
3279 emit_rr(as
, XO_TEST
, lefthi
, lefthi
);
3281 emit_gmrmi(as
, XG_ARITHi(XOg_CMP
), lefthi
, imm
);
3283 emit_mrm(as
, XO_CMP
, lefthi
, righthi
);
3288 /* -- Support for 64 bit ops in 32 bit mode ------------------------------- */
3290 /* Hiword op of a split 64 bit op. Previous op must be the loword op. */
3291 static void asm_hiop(ASMState
*as
, IRIns
*ir
)
3293 #if LJ_32 && LJ_HASFFI
3294 /* HIOP is marked as a store because it needs its own DCE logic. */
3295 int uselo
= ra_used(ir
-1), usehi
= ra_used(ir
); /* Loword/hiword used? */
3296 if (LJ_UNLIKELY(!(as
->flags
& JIT_F_OPT_DCE
))) uselo
= usehi
= 1;
3297 if ((ir
-1)->o
== IR_CONV
) { /* Conversions to/from 64 bit. */
3298 if (usehi
|| uselo
) {
3299 if (irt_isfp(ir
->t
))
3300 asm_conv_fp_int64(as
, ir
);
3302 asm_conv_int64_fp(as
, ir
);
3304 as
->curins
--; /* Always skip the CONV. */
3306 } else if ((ir
-1)->o
<= IR_NE
) { /* 64 bit integer comparisons. ORDER IR. */
3307 asm_comp_int64(as
, ir
);
3310 if (!usehi
) return; /* Skip unused hiword op for all remaining ops. */
3311 switch ((ir
-1)->o
) {
3313 asm_intarith(as
, ir
, uselo
? XOg_ADC
: XOg_ADD
);
3316 asm_intarith(as
, ir
, uselo
? XOg_SBB
: XOg_SUB
);
3319 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
3320 emit_rr(as
, XO_GROUP3
, XOg_NEG
, dest
);
3323 emit_rr(as
, XO_ARITHi8
, XOg_ADC
, dest
);
3325 ra_left(as
, dest
, ir
->op1
);
3329 ra_destreg(as
, ir
, RID_RETHI
);
3331 ra_allocref(as
, ir
->op1
, RID2RSET(RID_RET
)); /* Mark call as used. */
3334 /* Nothing to do here. Handled by CNEWI itself. */
3336 default: lua_assert(0); break;
3339 UNUSED(as
); UNUSED(ir
); lua_assert(0); /* Unused on x64 or without FFI. */
3343 /* -- Stack handling ------------------------------------------------------ */
3345 /* Get extent of the stack for a snapshot. */
3346 static BCReg
asm_stack_extent(ASMState
*as
, SnapShot
*snap
, BCReg
*ptopslot
)
3348 SnapEntry
*map
= &as
->T
->snapmap
[snap
->mapofs
];
3349 MSize n
, nent
= snap
->nent
;
3350 BCReg baseslot
= 0, topslot
= 0;
3351 /* Must check all frames to find topslot (outer can be larger than inner). */
3352 for (n
= 0; n
< nent
; n
++) {
3353 SnapEntry sn
= map
[n
];
3354 if ((sn
& SNAP_FRAME
)) {
3355 IRIns
*ir
= IR(snap_ref(sn
));
3356 GCfunc
*fn
= ir_kfunc(ir
);
3357 if (isluafunc(fn
)) {
3358 BCReg s
= snap_slot(sn
);
3359 BCReg fs
= s
+ funcproto(fn
)->framesize
;
3360 if (fs
> topslot
) topslot
= fs
;
3365 *ptopslot
= topslot
;
3369 /* Check Lua stack size for overflow. Use exit handler as fallback. */
3370 static void asm_stack_check(ASMState
*as
, BCReg topslot
,
3371 Reg pbase
, RegSet allow
, ExitNo exitno
)
3373 /* Try to get an unused temp. register, otherwise spill/restore eax. */
3374 Reg r
= allow
? rset_pickbot(allow
) : RID_EAX
;
3375 emit_jcc(as
, CC_B
, exitstub_addr(as
->J
, exitno
));
3376 if (allow
== RSET_EMPTY
) /* Restore temp. register. */
3377 emit_rmro(as
, XO_MOV
, r
|REX_64
, RID_ESP
, 0);
3380 emit_gri(as
, XG_ARITHi(XOg_CMP
), r
, (int32_t)(8*topslot
));
3381 if (ra_hasreg(pbase
) && pbase
!= r
)
3382 emit_rr(as
, XO_ARITH(XOg_SUB
), r
, pbase
);
3384 emit_rmro(as
, XO_ARITH(XOg_SUB
), r
, RID_NONE
,
3385 ptr2addr(&J2G(as
->J
)->jit_base
));
3386 emit_rmro(as
, XO_MOV
, r
, r
, offsetof(lua_State
, maxstack
));
3387 emit_getgl(as
, r
, jit_L
);
3388 if (allow
== RSET_EMPTY
) /* Spill temp. register. */
3389 emit_rmro(as
, XO_MOVto
, r
|REX_64
, RID_ESP
, 0);
3392 /* Restore Lua stack from on-trace state. */
3393 static void asm_stack_restore(ASMState
*as
, SnapShot
*snap
)
3395 SnapEntry
*map
= &as
->T
->snapmap
[snap
->mapofs
];
3396 MSize n
, nent
= snap
->nent
;
3397 SnapEntry
*flinks
= map
+ nent
+ snap
->depth
;
3398 /* Store the value of all modified slots to the Lua stack. */
3399 for (n
= 0; n
< nent
; n
++) {
3400 SnapEntry sn
= map
[n
];
3401 BCReg s
= snap_slot(sn
);
3402 int32_t ofs
= 8*((int32_t)s
-1);
3403 IRRef ref
= snap_ref(sn
);
3404 IRIns
*ir
= IR(ref
);
3405 if ((sn
& SNAP_NORESTORE
))
3407 if (irt_isnum(ir
->t
)) {
3408 Reg src
= ra_alloc1(as
, ref
, RSET_FPR
);
3409 emit_rmro(as
, XO_MOVSDto
, src
, RID_BASE
, ofs
);
3411 lua_assert(irt_ispri(ir
->t
) || irt_isaddr(ir
->t
));
3412 if (!irref_isk(ref
)) {
3413 Reg src
= ra_alloc1(as
, ref
, rset_exclude(RSET_GPR
, RID_BASE
));
3414 emit_movtomro(as
, REX_64IR(ir
, src
), RID_BASE
, ofs
);
3415 } else if (!irt_ispri(ir
->t
)) {
3416 emit_movmroi(as
, RID_BASE
, ofs
, ir
->i
);
3418 if ((sn
& (SNAP_CONT
|SNAP_FRAME
))) {
3419 if (s
!= 0) /* Do not overwrite link to previous frame. */
3420 emit_movmroi(as
, RID_BASE
, ofs
+4, (int32_t)(*flinks
--));
3422 if (!(LJ_64
&& irt_islightud(ir
->t
)))
3423 emit_movmroi(as
, RID_BASE
, ofs
+4, irt_toitype(ir
->t
));
3428 lua_assert(map
+ nent
== flinks
);
3431 /* -- GC handling --------------------------------------------------------- */
3433 /* Check GC threshold and do one or more GC steps. */
3434 static void asm_gc_check(ASMState
*as
)
3436 const CCallInfo
*ci
= &lj_ir_callinfo
[IRCALL_lj_gc_step_jit
];
3440 ra_evictset(as
, RSET_SCRATCH
);
3441 l_end
= emit_label(as
);
3442 /* Exit trace if in GCSatomic or GCSfinalize. Avoids syncing GC objects. */
3443 asm_guardcc(as
, CC_NE
); /* Assumes asm_snap_prep() already done. */
3444 emit_rr(as
, XO_TEST
, RID_RET
, RID_RET
);
3445 args
[0] = ASMREF_TMP1
; /* global_State *g */
3446 args
[1] = ASMREF_TMP2
; /* MSize steps */
3447 asm_gencall(as
, ci
, args
);
3448 tmp
= ra_releasetmp(as
, ASMREF_TMP1
);
3449 emit_loada(as
, tmp
, J2G(as
->J
));
3450 emit_loadi(as
, ra_releasetmp(as
, ASMREF_TMP2
), (int32_t)as
->gcsteps
);
3451 /* Jump around GC step if GC total < GC threshold. */
3452 emit_sjcc(as
, CC_B
, l_end
);
3453 emit_opgl(as
, XO_ARITH(XOg_CMP
), tmp
, gc
.threshold
);
3454 emit_getgl(as
, tmp
, gc
.total
);
3459 /* -- PHI and loop handling ----------------------------------------------- */
3461 /* Break a PHI cycle by renaming to a free register (evict if needed). */
3462 static void asm_phi_break(ASMState
*as
, RegSet blocked
, RegSet blockedby
,
3465 RegSet candidates
= blocked
& allow
;
3466 if (candidates
) { /* If this register file has candidates. */
3467 /* Note: the set for ra_pick cannot be empty, since each register file
3468 ** has some registers never allocated to PHIs.
3470 Reg down
, up
= ra_pick(as
, ~blocked
& allow
); /* Get a free register. */
3471 if (candidates
& ~blockedby
) /* Optimize shifts, else it's a cycle. */
3472 candidates
= candidates
& ~blockedby
;
3473 down
= rset_picktop(candidates
); /* Pick candidate PHI register. */
3474 ra_rename(as
, down
, up
); /* And rename it to the free register. */
3478 /* PHI register shuffling.
3480 ** The allocator tries hard to preserve PHI register assignments across
3481 ** the loop body. Most of the time this loop does nothing, since there
3482 ** are no register mismatches.
3484 ** If a register mismatch is detected and ...
3485 ** - the register is currently free: rename it.
3486 ** - the register is blocked by an invariant: restore/remat and rename it.
3487 ** - Otherwise the register is used by another PHI, so mark it as blocked.
3489 ** The renames are order-sensitive, so just retry the loop if a register
3490 ** is marked as blocked, but has been freed in the meantime. A cycle is
3491 ** detected if all of the blocked registers are allocated. To break the
3492 ** cycle rename one of them to a free register and retry.
3494 ** Note that PHI spill slots are kept in sync and don't need to be shuffled.
3496 static void asm_phi_shuffle(ASMState
*as
)
3500 /* Find and resolve PHI register mismatches. */
3502 RegSet blocked
= RSET_EMPTY
;
3503 RegSet blockedby
= RSET_EMPTY
;
3504 RegSet phiset
= as
->phiset
;
3505 while (phiset
) { /* Check all left PHI operand registers. */
3506 Reg r
= rset_picktop(phiset
);
3507 IRIns
*irl
= IR(as
->phireg
[r
]);
3509 if (r
!= left
) { /* Mismatch? */
3510 if (!rset_test(as
->freeset
, r
)) { /* PHI register blocked? */
3511 IRRef ref
= regcost_ref(as
->cost
[r
]);
3512 if (irt_ismarked(IR(ref
)->t
)) { /* Blocked by other PHI (w/reg)? */
3513 rset_set(blocked
, r
);
3514 if (ra_hasreg(left
))
3515 rset_set(blockedby
, left
);
3517 } else { /* Otherwise grab register from invariant. */
3518 ra_restore(as
, ref
);
3522 if (ra_hasreg(left
)) {
3523 ra_rename(as
, left
, r
);
3527 rset_clear(phiset
, r
);
3529 if (!blocked
) break; /* Finished. */
3530 if (!(as
->freeset
& blocked
)) { /* Break cycles if none are free. */
3531 asm_phi_break(as
, blocked
, blockedby
, RSET_GPR
);
3532 asm_phi_break(as
, blocked
, blockedby
, RSET_FPR
);
3534 } /* Else retry some more renames. */
3537 /* Restore/remat invariants whose registers are modified inside the loop. */
3538 work
= as
->modset
& ~(as
->freeset
| as
->phiset
);
3540 Reg r
= rset_picktop(work
);
3541 ra_restore(as
, regcost_ref(as
->cost
[r
]));
3542 rset_clear(work
, r
);
3546 /* Allocate and save all unsaved PHI regs and clear marks. */
3549 Reg r
= rset_picktop(work
);
3550 IRRef lref
= as
->phireg
[r
];
3551 IRIns
*ir
= IR(lref
);
3552 if (ra_hasspill(ir
->s
)) { /* Left PHI gained a spill slot? */
3553 irt_clearmark(ir
->t
); /* Handled here, so clear marker now. */
3554 ra_alloc1(as
, lref
, RID2RSET(r
));
3555 ra_save(as
, ir
, r
); /* Save to spill slot inside the loop. */
3558 rset_clear(work
, r
);
3562 /* Emit renames for left PHIs which are only spilled outside the loop. */
3563 static void asm_phi_fixup(ASMState
*as
)
3565 RegSet work
= as
->phiset
;
3567 Reg r
= rset_picktop(work
);
3568 IRRef lref
= as
->phireg
[r
];
3569 IRIns
*ir
= IR(lref
);
3570 /* Left PHI gained a spill slot before the loop? */
3571 if (irt_ismarked(ir
->t
) && ra_hasspill(ir
->s
)) {
3573 lj_ir_set(as
->J
, IRT(IR_RENAME
, IRT_NIL
), lref
, as
->loopsnapno
);
3574 ren
= tref_ref(lj_ir_emit(as
->J
));
3575 as
->ir
= as
->T
->ir
; /* The IR may have been reallocated. */
3576 IR(ren
)->r
= (uint8_t)r
;
3577 IR(ren
)->s
= SPS_NONE
;
3579 irt_clearmark(ir
->t
); /* Always clear marker. */
3580 rset_clear(work
, r
);
3584 /* Setup right PHI reference. */
3585 static void asm_phi(ASMState
*as
, IRIns
*ir
)
3587 RegSet allow
= (irt_isnum(ir
->t
) ? RSET_FPR
: RSET_GPR
) & ~as
->phiset
;
3588 RegSet afree
= (as
->freeset
& allow
);
3589 IRIns
*irl
= IR(ir
->op1
);
3590 IRIns
*irr
= IR(ir
->op2
);
3591 /* Spill slot shuffling is not implemented yet (but rarely needed). */
3592 if (ra_hasspill(irl
->s
) || ra_hasspill(irr
->s
))
3593 lj_trace_err(as
->J
, LJ_TRERR_NYIPHI
);
3594 /* Leave at least one register free for non-PHIs (and PHI cycle breaking). */
3595 if ((afree
& (afree
-1))) { /* Two or more free registers? */
3597 if (ra_noreg(irr
->r
)) { /* Get a register for the right PHI. */
3598 r
= ra_allocref(as
, ir
->op2
, allow
);
3599 } else { /* Duplicate right PHI, need a copy (rare). */
3600 r
= ra_scratch(as
, allow
);
3601 ra_movrr(as
, irr
, r
, irr
->r
);
3604 rset_set(as
->phiset
, r
);
3605 as
->phireg
[r
] = (IRRef1
)ir
->op1
;
3606 irt_setmark(irl
->t
); /* Marks left PHIs _with_ register. */
3607 if (ra_noreg(irl
->r
))
3608 ra_sethint(irl
->r
, r
); /* Set register hint for left PHI. */
3609 } else { /* Otherwise allocate a spill slot. */
3610 /* This is overly restrictive, but it triggers only on synthetic code. */
3611 if (ra_hasreg(irl
->r
) || ra_hasreg(irr
->r
))
3612 lj_trace_err(as
->J
, LJ_TRERR_NYIPHI
);
3614 irl
->s
= irr
->s
= ir
->s
; /* Sync left/right PHI spill slots. */
3618 /* Fixup the loop branch. */
3619 static void asm_loop_fixup(ASMState
*as
)
3621 MCode
*p
= as
->mctop
;
3622 MCode
*target
= as
->mcp
;
3623 if (as
->realign
) { /* Realigned loops use short jumps. */
3624 as
->realign
= NULL
; /* Stop another retry. */
3625 lua_assert(((intptr_t)target
& 15) == 0);
3626 if (as
->loopinv
) { /* Inverted loop branch? */
3629 lua_assert(target
- p
>= -128);
3630 p
[-1] = (MCode
)(target
- p
); /* Patch sjcc. */
3631 if (as
->loopinv
== 2)
3632 p
[-3] = (MCode
)(target
- p
+ 2); /* Patch opt. short jp. */
3634 lua_assert(target
- p
>= -128);
3635 p
[-1] = (MCode
)(int8_t)(target
- p
); /* Patch short jmp. */
3641 if (as
->loopinv
) { /* Inverted loop branch? */
3642 /* asm_guardcc already inverted the jcc and patched the jmp. */
3645 *(int32_t *)(p
-4) = (int32_t)(target
- p
); /* Patch jcc. */
3646 if (as
->loopinv
== 2) {
3647 *(int32_t *)(p
-10) = (int32_t)(target
- p
+ 6); /* Patch opt. jp. */
3650 } else { /* Otherwise just patch jmp. */
3651 *(int32_t *)(p
-4) = (int32_t)(target
- p
);
3654 /* Realign small loops and shorten the loop branch. */
3655 if (newloop
>= p
- 128) {
3656 as
->realign
= newloop
; /* Force a retry and remember alignment. */
3657 as
->curins
= as
->stopins
; /* Abort asm_trace now. */
3658 as
->T
->nins
= as
->orignins
; /* Remove any added renames. */
/* Middle part of a loop. */
static void asm_loop(ASMState *as)
{
  /* LOOP is a guard, so the snapno is up to date. */
  as->loopsnapno = as->snapno;
  /* LOOP marks the transition from the variant to the invariant part. */
  as->testmcp = as->invmcp = NULL;
  as->sectref = 0;
  if (!neverfuse(as)) as->fuseref = 0;
  asm_phi_shuffle(as);
  asm_loop_fixup(as);
  as->mcloop = as->mcp;
  RA_DBGX((as, "===== LOOP ====="));
  if (!as->realign) RA_DBG_FLUSH();
}

/* -- Head of trace ------------------------------------------------------- */

/* Calculate stack adjustment. */
static int32_t asm_stack_adjust(ASMState *as)
{
  if (as->evenspill <= SPS_FIXED)
    return 0;
  return sps_scale((as->evenspill - SPS_FIXED + 3) & ~3);
}

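/* Worked example (assuming the usual 4-byte spill slots behind sps_scale):
** with evenspill = SPS_FIXED+5 the expression rounds 5 up to 8 slots, i.e.
** the frame grows by sps_scale(8) = 32 bytes. Rounding to a multiple of
** four slots keeps the stack adjustment 16-byte aligned.
*/
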
/* Coalesce BASE register for a root trace. */
static void asm_head_root_base(ASMState *as)
{
  IRIns *ir = IR(REF_BASE);
  Reg r = ir->r;
  if (ra_hasreg(r)) {
    ra_free(as, r);
    if (rset_test(as->modset, r))
      ir->r = RID_INIT;  /* No inheritance for modified BASE register. */
    if (r != RID_BASE)
      emit_rr(as, XO_MOV, r, RID_BASE);
  }
}

/* Head of a root trace. */
static void asm_head_root(ASMState *as)
{
  int32_t spadj;
  asm_head_root_base(as);
  emit_setgli(as, vmstate, (int32_t)as->T->traceno);
  spadj = asm_stack_adjust(as);
  as->T->spadjust = (uint16_t)spadj;
  emit_addptr(as, RID_ESP|REX_64, -spadj);
  /* Root traces assume a checked stack for the starting proto. */
  as->T->topslot = gcref(as->T->startpt)->pt.framesize;
}

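/* Note: machine code is emitted backwards, so the calls above produce the
** opposite runtime order: the stack adjustment executes first, then the
** vmstate update, then the (optional) BASE register move.
*/
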
/* Coalesce or reload BASE register for a side trace. */
static RegSet asm_head_side_base(ASMState *as, Reg pbase, RegSet allow)
{
  IRIns *ir = IR(REF_BASE);
  Reg r = ir->r;
  if (ra_hasreg(r)) {
    ra_free(as, r);
    if (rset_test(as->modset, r))
      ir->r = RID_INIT;  /* No inheritance for modified BASE register. */
    if (pbase == r) {
      rset_clear(allow, r);  /* Mark same BASE register as coalesced. */
    } else if (ra_hasreg(pbase) && rset_test(as->freeset, pbase)) {
      rset_clear(allow, pbase);
      emit_rr(as, XO_MOV, r, pbase);  /* Move from coalesced parent register. */
    } else {
      emit_getgl(as, r, jit_base);  /* Otherwise reload BASE. */
    }
  }
  return allow;
}

/* Head of a side trace.
**
** The current simplistic algorithm requires that all slots inherited
** from the parent are live in a register between pass 2 and pass 3. This
** avoids the complexity of stack slot shuffling. But of course this may
** overflow the register set in some cases and cause the dreaded error:
** "NYI: register coalescing too complex". A refined algorithm is needed.
*/
static void asm_head_side(ASMState *as)
{
  IRRef1 sloadins[RID_MAX];
  RegSet allow = RSET_ALL;  /* Inverse of all coalesced registers. */
  RegSet live = RSET_EMPTY;  /* Live parent registers. */
  Reg pbase = as->parent->ir[REF_BASE].r;  /* Parent base register (if any). */
  int32_t spadj, spdelta;
  int pass2 = 0;
  int pass3 = 0;
  IRRef i;

  allow = asm_head_side_base(as, pbase, allow);

  /* Scan all parent SLOADs and collect register dependencies. */
  for (i = as->stopins; i > REF_BASE; i--) {
    IRIns *ir = IR(i);
    RegSP rs;
    lua_assert(ir->o == IR_SLOAD && (ir->op2 & IRSLOAD_PARENT));
    rs = as->parentmap[ir->op1];
    if (ra_hasreg(ir->r)) {
      rset_clear(allow, ir->r);
      if (ra_hasspill(ir->s))
        ra_save(as, ir, ir->r);
    } else if (ra_hasspill(ir->s)) {
      irt_setmark(ir->t);
      pass2 = 1;
    }
    if (ir->r == rs) {  /* Coalesce matching registers right now. */
      ra_free(as, ir->r);
    } else if (ra_hasspill(regsp_spill(rs))) {
      if (ra_hasreg(ir->r))
        pass3 = 1;
    } else if (ra_used(ir)) {
      sloadins[rs] = (IRRef1)i;
      rset_set(live, rs);  /* Block live parent register. */
    }
  }

  /* Calculate stack frame adjustment. */
  spadj = asm_stack_adjust(as);
  spdelta = spadj - (int32_t)as->parent->spadjust;
  if (spdelta < 0) {  /* Don't shrink the stack frame. */
    spadj = (int32_t)as->parent->spadjust;
    spdelta = 0;
  }
  as->T->spadjust = (uint16_t)spadj;

  /* Reload spilled target registers. */
  if (pass2) {
    for (i = as->stopins; i > REF_BASE; i--) {
      IRIns *ir = IR(i);
      if (irt_ismarked(ir->t)) {
        RegSet mask;
        Reg r;
        RegSP rs;
        irt_clearmark(ir->t);
        rs = as->parentmap[ir->op1];
        if (!ra_hasspill(regsp_spill(rs)))
          ra_sethint(ir->r, rs);  /* Hint may be gone, set it again. */
        else if (sps_scale(regsp_spill(rs))+spdelta == sps_scale(ir->s))
          continue;  /* Same spill slot, do nothing. */
        mask = (irt_isnum(ir->t) ? RSET_FPR : RSET_GPR) & allow;
        if (mask == RSET_EMPTY)
          lj_trace_err(as->J, LJ_TRERR_NYICOAL);
        r = ra_allocref(as, i, mask);
        ra_save(as, ir, r);
        rset_clear(allow, r);
        if (r == rs) {  /* Coalesce matching registers right now. */
          ra_free(as, r);
          rset_clear(live, r);
        } else if (ra_hasspill(regsp_spill(rs))) {
          pass3 = 1;
        }
        checkmclim(as);
      }
    }
  }

  /* Store trace number and adjust stack frame relative to the parent. */
  emit_setgli(as, vmstate, (int32_t)as->T->traceno);
  emit_addptr(as, RID_ESP|REX_64, -spdelta);

  /* Restore target registers from parent spill slots. */
  if (pass3) {
    RegSet work = ~as->freeset & RSET_ALL;
    while (work) {
      Reg r = rset_pickbot(work);
      IRIns *ir = IR(regcost_ref(as->cost[r]));
      RegSP rs = as->parentmap[ir->op1];
      rset_clear(work, r);
      if (ra_hasspill(regsp_spill(rs))) {
        int32_t ofs = sps_scale(regsp_spill(rs));
        ra_free(as, r);
        if (r < RID_MAX_GPR)
          emit_rmro(as, XO_MOV, REX_64IR(ir, r), RID_ESP, ofs);
        else
          emit_rmro(as, XMM_MOVRM(as), r, RID_ESP, ofs);
        checkmclim(as);
      }
    }
  }

  /* Shuffle registers to match up target regs with parent regs. */
  for (;;) {
    RegSet work;

    /* Repeatedly coalesce free live registers by moving to their target. */
    while ((work = as->freeset & live) != RSET_EMPTY) {
      Reg rp = rset_pickbot(work);
      IRIns *ir = IR(sloadins[rp]);
      rset_clear(live, rp);
      rset_clear(allow, rp);
      ra_free(as, ir->r);
      ra_movrr(as, ir, ir->r, rp);
      checkmclim(as);
    }

    /* We're done if no live registers remain. */
    if (live == RSET_EMPTY)
      break;

    /* Break cycles by renaming one target to a temp. register. */
    if (live & RSET_GPR) {
      RegSet tmpset = as->freeset & ~live & allow & RSET_GPR;
      if (tmpset == RSET_EMPTY)
        lj_trace_err(as->J, LJ_TRERR_NYICOAL);
      ra_rename(as, rset_pickbot(live & RSET_GPR), rset_pickbot(tmpset));
    }
    if (live & RSET_FPR) {
      RegSet tmpset = as->freeset & ~live & allow & RSET_FPR;
      if (tmpset == RSET_EMPTY)
        lj_trace_err(as->J, LJ_TRERR_NYICOAL);
      ra_rename(as, rset_pickbot(live & RSET_FPR), rset_pickbot(tmpset));
    }
    checkmclim(as);
    /* Continue with coalescing to fix up the broken cycle(s). */
  }

  /* Inherit top stack slot already checked by parent trace. */
  as->T->topslot = as->parent->topslot;
  if (as->topslot > as->T->topslot) {  /* Need to check for higher slot? */
    as->T->topslot = (uint8_t)as->topslot;  /* Remember for child traces. */
    /* Reuse the parent exit in the context of the parent trace. */
    asm_stack_check(as, as->topslot, pbase, allow & RSET_GPR, as->J->exitno);
  }
}

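/* Note on cycle breaking above: if the parent keeps value A in r1 and value
** B in r2, while this trace wants A in r2 and B in r1, neither move can be
** coalesced first (both targets are still live parent registers). Renaming
** one target to a free temporary register breaks the cycle, after which the
** regular coalescing loop finishes the remaining moves.
*/
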
/* -- Tail of trace ------------------------------------------------------- */

/* Link to another trace. */
static void asm_tail_link(ASMState *as)
{
  SnapNo snapno = as->T->nsnap-1;  /* Last snapshot. */
  SnapShot *snap = &as->T->snap[snapno];
  BCReg baseslot = asm_stack_extent(as, snap, &as->topslot);

  ra_allocref(as, REF_BASE, RID2RSET(RID_BASE));

  if (as->T->link == TRACE_INTERP) {
    /* Setup fixed registers for exit to interpreter. */
    const BCIns *pc = snap_pc(as->T->snapmap[snap->mapofs + snap->nent]);
    int32_t mres;
    if (bc_op(*pc) == BC_JLOOP) {  /* NYI: find a better way to do this. */
      BCIns *retpc = &traceref(as->J, bc_d(*pc))->startins;
      if (bc_isret(bc_op(*retpc)))
        pc = retpc;
    }
    emit_loada(as, RID_DISPATCH, J2GG(as->J)->dispatch);
    emit_loada(as, RID_PC, pc);
    mres = (int32_t)(snap->nslots - baseslot);
    switch (bc_op(*pc)) {
    case BC_CALLM: case BC_CALLMT:
      mres -= (int32_t)(1 + bc_a(*pc) + bc_c(*pc)); break;
    case BC_RETM: mres -= (int32_t)(bc_a(*pc) + bc_d(*pc)); break;
    case BC_TSETM: mres -= (int32_t)bc_a(*pc); break;
    default: if (bc_op(*pc) < BC_FUNCF) mres = 0; break;
    }
    emit_loadi(as, RID_RET, mres);  /* Return MULTRES or 0. */
  } else if (baseslot) {
    /* Save modified BASE for linking to trace with higher start frame. */
    emit_setgl(as, RID_BASE, jit_base);
  }
  emit_addptr(as, RID_BASE, 8*(int32_t)baseslot);

  /* Sync the interpreter state with the on-trace state. */
  asm_stack_restore(as, snap);

  /* Root traces that grow the stack need to check the stack at the end. */
  if (!as->parent && as->topslot)
    asm_stack_check(as, as->topslot, RID_BASE, as->freeset & RSET_GPR, snapno);
}

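/* Note: when exiting to the interpreter, RID_RET has to carry MULTRES for
** bytecodes that consume an open result/argument list (CALLM/CALLMT, RETM,
** TSETM). mres starts as the number of snapshot slots above the frame base
** and subtracts the fixed operands of the pending bytecode; for anything
** below BC_FUNCF no MULTRES is needed and 0 is passed instead.
*/
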
/* Fixup the tail code. */
static void asm_tail_fixup(ASMState *as, TraceNo lnk)
{
  /* Note: don't use as->mcp swap + emit_*: emit_op overwrites more bytes. */
  MCode *p = as->mctop;
  MCode *target, *q;
  int32_t spadj = as->T->spadjust;
  if (spadj == 0) {
    p -= ((as->flags & JIT_F_LEA_AGU) ? 7 : 6) + (LJ_64 ? 1 : 0);
  } else {
    MCode *p1;
    /* Patch stack adjustment. */
    if (checki8(spadj)) {
      p -= 3;
      p1 = p-6;
      *p1 = (MCode)spadj;
    } else {
      p1 = p-9;
      *(int32_t *)p1 = spadj;
    }
    if ((as->flags & JIT_F_LEA_AGU)) {
#if LJ_64
      p1[-4] = 0x48;
#endif
      p1[-3] = (MCode)XI_LEA;
      p1[-2] = MODRM(checki8(spadj) ? XM_OFS8 : XM_OFS32, RID_ESP, RID_ESP);
      p1[-1] = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
    } else {
#if LJ_64
      p1[-3] = 0x48;
#endif
      p1[-2] = (MCode)(checki8(spadj) ? XI_ARITHi8 : XI_ARITHi);
      p1[-1] = MODRM(XM_REG, XOg_ADD, RID_ESP);
    }
  }
  /* Patch exit branch. */
  target = lnk == TRACE_INTERP ? (MCode *)lj_vm_exit_interp :
                                 traceref(as->J, lnk)->mcode;
  *(int32_t *)(p-4) = jmprel(p, target);
  p[-5] = XI_JMP;
  /* Drop unused mcode tail. Fill with NOPs to make the prefetcher happy. */
  for (q = as->mctop-1; q >= p; q--)
    *q = XI_NOP;
  as->mctop = p;
}

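/* Tail layout patched above (largest form, x86 without LEA_AGU):
**   81 C4 imm32   add esp, imm32   (83 C4 imm8 if spadj fits in 8 bits)
**   E9 rel32      jmp <link target>
** The LEA_AGU variant uses lea esp, [esp+imm] instead, which is one byte
** longer; on x64 a REX.W prefix (0x48) precedes the adjustment instruction.
*/
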
/* -- Instruction dispatch ------------------------------------------------ */

/* Assemble a single instruction. */
static void asm_ir(ASMState *as, IRIns *ir)
{
  switch ((IROp)ir->o) {
  /* Miscellaneous ops. */
  case IR_LOOP: asm_loop(as); break;
  case IR_NOP: case IR_XBAR: lua_assert(!ra_used(ir)); break;
  case IR_PHI: asm_phi(as, ir); break;
  case IR_HIOP: asm_hiop(as, ir); break;

  /* Guarded assertions. */
  case IR_LT: case IR_GE: case IR_LE: case IR_GT:
  case IR_ULT: case IR_UGE: case IR_ULE: case IR_UGT:
  case IR_EQ: case IR_NE: case IR_ABC:
    asm_comp(as, ir, asm_compmap[ir->o]);
    break;

  case IR_RETF: asm_retf(as, ir); break;

  /* Bit ops. */
  case IR_BNOT: asm_neg_not(as, ir, XOg_NOT); break;
  case IR_BSWAP: asm_bitswap(as, ir); break;

  case IR_BAND: asm_intarith(as, ir, XOg_AND); break;
  case IR_BOR: asm_intarith(as, ir, XOg_OR); break;
  case IR_BXOR: asm_intarith(as, ir, XOg_XOR); break;

  case IR_BSHL: asm_bitshift(as, ir, XOg_SHL); break;
  case IR_BSHR: asm_bitshift(as, ir, XOg_SHR); break;
  case IR_BSAR: asm_bitshift(as, ir, XOg_SAR); break;
  case IR_BROL: asm_bitshift(as, ir, XOg_ROL); break;
  case IR_BROR: asm_bitshift(as, ir, XOg_ROR); break;

  /* Arithmetic ops. */
  case IR_ADD: asm_add(as, ir); break;
  case IR_SUB:
    if (irt_isnum(ir->t))
      asm_fparith(as, ir, XO_SUBSD);
    else  /* Note: no need for LEA trick here. i-k is encoded as i+(-k). */
      asm_intarith(as, ir, XOg_SUB);
    break;
  case IR_MUL:
    if (irt_isnum(ir->t))
      asm_fparith(as, ir, XO_MULSD);
    else
      asm_intarith(as, ir, XOg_X_IMUL);
    break;
  case IR_DIV:
#if LJ_64 && LJ_HASFFI
    if (!irt_isnum(ir->t))
      asm_arith64(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_divi64 :
                                             IRCALL_lj_carith_divu64);
    else
#endif
      asm_fparith(as, ir, XO_DIVSD);
    break;
  case IR_MOD:
#if LJ_64 && LJ_HASFFI
    asm_arith64(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_modi64 :
                                           IRCALL_lj_carith_modu64);
#else
    lua_assert(0);
#endif
    break;

  case IR_NEG:
    if (irt_isnum(ir->t))
      asm_fparith(as, ir, XO_XORPS);
    else
      asm_neg_not(as, ir, XOg_NEG);
    break;
  case IR_ABS: asm_fparith(as, ir, XO_ANDPS); break;

  case IR_MIN: asm_fparith(as, ir, XO_MINSD); break;
  case IR_MAX: asm_fparith(as, ir, XO_MAXSD); break;

  case IR_FPMATH: case IR_ATAN2: case IR_LDEXP:
    asm_fpmath(as, ir);
    break;
  case IR_POW:
#if LJ_64 && LJ_HASFFI
    if (!irt_isnum(ir->t))
      asm_arith64(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 :
                                             IRCALL_lj_carith_powu64);
    else
#endif
      asm_fppowi(as, ir);
    break;

  /* Overflow-checking arithmetic ops. Note: don't use LEA here! */
  case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break;
  case IR_SUBOV: asm_intarith(as, ir, XOg_SUB); break;

  /* Memory references. */
  case IR_AREF: asm_aref(as, ir); break;
  case IR_HREF: asm_href(as, ir); break;
  case IR_HREFK: asm_hrefk(as, ir); break;
  case IR_NEWREF: asm_newref(as, ir); break;
  case IR_UREFO: case IR_UREFC: asm_uref(as, ir); break;
  case IR_FREF: asm_fref(as, ir); break;
  case IR_STRREF: asm_strref(as, ir); break;

  /* Loads and stores. */
  case IR_ALOAD: case IR_HLOAD: case IR_ULOAD: case IR_VLOAD:
    asm_ahuvload(as, ir);
    break;
  case IR_FLOAD: case IR_XLOAD: asm_fxload(as, ir); break;
  case IR_SLOAD: asm_sload(as, ir); break;

  case IR_ASTORE: case IR_HSTORE: case IR_USTORE: asm_ahustore(as, ir); break;
  case IR_FSTORE: case IR_XSTORE: asm_fxstore(as, ir); break;

  /* Allocations. */
  case IR_SNEW: asm_snew(as, ir); break;
  case IR_TNEW: asm_tnew(as, ir); break;
  case IR_TDUP: asm_tdup(as, ir); break;
  case IR_CNEW: case IR_CNEWI: asm_cnew(as, ir); break;

  /* Write barriers. */
  case IR_TBAR: asm_tbar(as, ir); break;
  case IR_OBAR: asm_obar(as, ir); break;

  /* Type conversions. */
  case IR_TOBIT: asm_tobit(as, ir); break;
  case IR_CONV: asm_conv(as, ir); break;
  case IR_TOSTR: asm_tostr(as, ir); break;
  case IR_STRTO: asm_strto(as, ir); break;

  /* Calls. */
  case IR_CALLN: case IR_CALLL: case IR_CALLS: asm_call(as, ir); break;
  case IR_CALLXS: asm_callx(as, ir); break;
  case IR_CARG: break;

  default:
    setintV(&as->J->errinfo, ir->o);
    lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
    break;
  }
}

/* Assemble a trace in linear backwards order. */
static void asm_trace(ASMState *as)
{
  for (as->curins--; as->curins > as->stopins; as->curins--) {
    IRIns *ir = IR(as->curins);
    lua_assert(!(LJ_32 && irt_isint64(ir->t)));  /* Handled by SPLIT. */
    if (!ra_used(ir) && !ir_sideeff(ir) && (as->flags & JIT_F_OPT_DCE))
      continue;  /* Dead-code elimination can be soooo easy. */
    if (irt_isguard(ir->t))
      asm_snap_prep(as);
    RA_DBG_REF();
    checkmclim(as);
    asm_ir(as, ir);
  }
}

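/* Note: assembling backwards means all uses of an instruction are seen
** before its definition, so liveness is known exactly at this point: an
** instruction that ended up unused and has no side effects can simply be
** skipped here, which is all the dead-code elimination the assembler needs.
*/
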
/* -- Trace setup --------------------------------------------------------- */

/* Ensure there are enough stack slots for call arguments. */
static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci)
{
  IRRef args[CCI_NARGS_MAX];
  uint32_t nargs = (int)CCI_NARGS(ci);
  int nslots = 0;
  asm_collectargs(as, ir, ci, args);
#if LJ_64
  if (LJ_ABI_WIN) {
    nslots = (int)(nargs*2);  /* Only matters for more than four args. */
  } else {
    uint32_t i;
    int ngpr = 6, nfpr = 8;
    for (i = 0; i < nargs; i++)
      if (irt_isfp(IR(args[i])->t)) {
        if (nfpr > 0) nfpr--; else nslots += 2;
      } else {
        if (ngpr > 0) ngpr--; else nslots += 2;
      }
  }
  if (nslots > as->evenspill)  /* Leave room for args in stack slots. */
    as->evenspill = nslots;
  return irt_isfp(ir->t) ? REGSP_HINT(RID_FPRET) : REGSP_HINT(RID_RET);
#else
  if ((ci->flags & CCI_FASTCALL)) {
    lua_assert(nargs <= 2);
  } else {
    uint32_t i;
    for (i = 0; i < nargs; i++)
      nslots += irt_isnum(IR(args[i])->t) ? 2 : 1;
    if (nslots > as->evenspill)  /* Leave room for args. */
      as->evenspill = nslots;
  }
  return irt_isfp(ir->t) ? REGSP_INIT : REGSP_HINT(RID_RET);
#endif
}

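/* Note on the x64 slot counts above: under the Windows ABI every argument
** owns 8 bytes of stack (shadow space for register args, real slots for the
** rest), hence nargs*2 four-byte slots, while the SysV ABI passes the first
** 6 integer and 8 FP arguments in registers, so only the overflow arguments
** need two slots (8 bytes) each on the stack.
*/
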
/* Clear reg/sp for all instructions and add register hints. */
static void asm_setup_regsp(ASMState *as, GCtrace *T)
{
  IRRef i, nins = T->nins;
  int inloop = 0;

  /* Clear reg/sp for constants. */
  for (i = T->nk; i < REF_BIAS; i++)
    IR(i)->prev = REGSP_INIT;

  /* REF_BASE is used for implicit references to the BASE register. */
  IR(REF_BASE)->prev = REGSP_HINT(RID_BASE);

  if (IR(nins-1)->o == IR_RENAME) {
    do { nins--; } while (IR(nins-1)->o == IR_RENAME);
    T->nins = nins;  /* Remove any renames left over from ASM restart. */
  }
  as->snaprename = nins;
  as->snapref = nins;
  as->snapno = T->nsnap;

  as->stopins = REF_BASE;
  as->orignins = nins;
  as->curins = nins;

  as->evenspill = SPS_FIRST;
  for (i = REF_FIRST; i < nins; i++) {
    IRIns *ir = IR(i);
    switch (ir->o) {
    case IR_LOOP:
      inloop = 1;
      break;
    /* Set hints for slot loads from a parent trace. */
    case IR_SLOAD:
      if ((ir->op2 & IRSLOAD_PARENT)) {
        RegSP rs = as->parentmap[ir->op1];
        lua_assert(regsp_used(rs));
        as->stopins = i;
        if (!ra_hasspill(regsp_spill(rs)) && ra_hasreg(regsp_reg(rs))) {
          ir->prev = (uint16_t)REGSP_HINT(regsp_reg(rs));
          continue;
        }
      }
      break;
    case IR_CALLXS: {
      CCallInfo ci;
      ci.flags = asm_callx_flags(as, ir);
      ir->prev = asm_setup_call_slots(as, ir, &ci);
      if (inloop)
        as->modset |= RSET_SCRATCH;
      continue;
      }
    case IR_CALLN: case IR_CALLL: case IR_CALLS: {
      const CCallInfo *ci = &lj_ir_callinfo[ir->op2];
      ir->prev = asm_setup_call_slots(as, ir, ci);
      if (inloop)
        as->modset |= (ci->flags & CCI_NOFPRCLOBBER) ?
                      (RSET_SCRATCH & ~RSET_FPR) : RSET_SCRATCH;
      continue;
      }
#if LJ_32 && LJ_HASFFI
    case IR_HIOP:
      if ((ir-1)->o == IR_CALLN) {
        ir->prev = REGSP_HINT(RID_RETHI);
        continue;
      }
      break;
#endif
    /* C calls evict all scratch regs and return results in RID_RET. */
    case IR_SNEW: case IR_NEWREF:
#if !LJ_64
      if (as->evenspill < 3)  /* lj_str_new and lj_tab_newkey need 3 args. */
        as->evenspill = 3;
#endif
    case IR_TNEW: case IR_TDUP: case IR_CNEW: case IR_CNEWI: case IR_TOSTR:
      ir->prev = REGSP_HINT(RID_RET);
      if (inloop)
        as->modset = RSET_SCRATCH;
      continue;
    case IR_STRTO: case IR_OBAR:
      if (inloop)
        as->modset = RSET_SCRATCH;
      break;
    case IR_POW:
      if (irt_isnum(ir->t)) {
        ir->prev = REGSP_HINT(RID_XMM0);
        if (inloop)
          as->modset |= RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);
        continue;
      }
      /* fallthrough */
    case IR_DIV: case IR_MOD:
#if LJ_64 && LJ_HASFFI
      if (!irt_isnum(ir->t)) {
        ir->prev = REGSP_HINT(RID_RET);
        if (inloop)
          as->modset |= (RSET_SCRATCH & RSET_GPR);
        continue;
      }
#endif
      break;
    case IR_FPMATH:
      if (ir->op2 == IRFPM_EXP2) {  /* May be joined to lj_vm_pow_sse. */
        ir->prev = REGSP_HINT(RID_XMM0);
#if !LJ_64
        if (as->evenspill < 4)  /* Leave room for 16 byte scratch area. */
          as->evenspill = 4;
#endif
        if (inloop)
          as->modset |= RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX);
        continue;
      } else if (ir->op2 <= IRFPM_TRUNC && !(as->flags & JIT_F_SSE4_1)) {
        ir->prev = REGSP_HINT(RID_XMM0);
        if (inloop)
          as->modset |= RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX);
        continue;
      }
      break;
    /* Non-constant shift counts need to be in RID_ECX. */
    case IR_BSHL: case IR_BSHR: case IR_BSAR: case IR_BROL: case IR_BROR:
      if (!irref_isk(ir->op2) && !ra_hashint(IR(ir->op2)->r)) {
        IR(ir->op2)->r = REGSP_HINT(RID_ECX);
        if (inloop)
          rset_set(as->modset, RID_ECX);
      }
      break;
    /* Do not propagate hints across type conversions. */
    case IR_CONV: case IR_TOBIT:
      break;
    default:
      /* Propagate hints across likely 'op reg, imm' or 'op reg'. */
      if (irref_isk(ir->op2) && !irref_isk(ir->op1)) {
        ir->prev = IR(ir->op1)->prev;
        continue;
      }
      break;
    }
    ir->prev = REGSP_INIT;
  }
  if ((as->evenspill & 1))
    as->oddspill = as->evenspill++;
  else
    as->oddspill = 0;
}

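/* Note: before allocation, ir->prev doubles as the RegSP storage for each
** instruction (it overlaps the r/s fields in IRIns). REGSP_HINT(reg) seeds
** the register allocator's preference and REGSP_INIT means "no hint"; the
** real register/spill assignment replaces it once the instruction is
** actually allocated.
*/
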
/* -- Assembler core ------------------------------------------------------ */

/* Define this if you want to run LuaJIT with Valgrind. */
#ifdef LUAJIT_USE_VALGRIND
#include <valgrind/valgrind.h>
#define VG_INVALIDATE(p, sz)	VALGRIND_DISCARD_TRANSLATIONS(p, sz)
#else
#define VG_INVALIDATE(p, sz)	((void)0)
#endif

/* Assemble a trace. */
void lj_asm_trace(jit_State *J, GCtrace *T)
{
  ASMState as_;
  ASMState *as = &as_;

  /* Setup initial state. Copy some fields to reduce indirections. */
  as->J = J;
  as->T = T;
  as->ir = T->ir;
  as->flags = J->flags;
  as->loopref = J->loopref;
  as->realign = NULL;
  as->loopinv = 0;
  if (J->parent) {
    as->parent = traceref(J, J->parent);
    lj_snap_regspmap(as->parentmap, as->parent, J->exitno);
  } else {
    as->parent = NULL;
  }
  as->mctop = lj_mcode_reserve(J, &as->mcbot);  /* Reserve MCode memory. */
  as->mcp = as->mctop;
  as->mclim = as->mcbot + MCLIM_REDZONE;
  asm_exitstub_setup(as, T->nsnap);

  do {
    as->mcp = as->mctop;
    as->curins = T->nins;
    RA_DBG_START();
    RA_DBGX((as, "===== STOP ====="));
    /* Realign and leave room for backwards loop branch or exit branch. */
    if (as->realign) {
      int i = ((int)(intptr_t)as->realign) & 15;
      MCode *p = as->mctop;
      /* Fill unused mcode tail with NOPs to make the prefetcher happy. */
      while (i--)
        *--p = XI_NOP;
      as->mctop = p;
      as->mcp = p - (as->loopinv ? 5 : 2);  /* Space for short/near jmp. */
    } else {
      as->mcp = as->mctop - 5;  /* Space for exit branch (near jmp). */
    }
    as->invmcp = as->mcp;
    as->mcloop = NULL;
    as->testmcp = NULL;
    as->topslot = 0;
    as->gcsteps = 0;
    as->sectref = as->loopref;
    as->fuseref = (as->flags & JIT_F_OPT_FUSE) ? as->loopref : FUSE_DISABLED;

    /* Setup register allocation. */
    asm_setup_regsp(as, T);

    if (!as->loopref) {
      /* Leave room for ESP adjustment: add esp, imm or lea esp, [esp+imm] */
      as->mcp -= (as->flags & JIT_F_LEA_AGU) ? 7 : 6 + (LJ_64 ? 1 : 0);
      as->invmcp = NULL;
      asm_tail_link(as);
    }
    asm_trace(as);
  } while (as->realign);  /* Retry in case the MCode needs to be realigned. */

  /* Emit head of trace. */
  if (as->gcsteps) {
    as->curins = as->T->snap[0].ref;
    asm_snap_prep(as);  /* The GC check is a guard. */
    asm_gc_check(as);
  }
  if (as->parent)
    asm_head_side(as);
  else
    asm_head_root(as);
  asm_phi_fixup(as);

  RA_DBGX((as, "===== START ===="));
  RA_DBG_FLUSH();
  if (as->freeset != RSET_ALL)
    lj_trace_err(as->J, LJ_TRERR_BADRA);  /* Ouch! Should never happen. */

  /* Set trace entry point before fixing up tail to allow link to self. */
  T->mcode = as->mcp;
  T->mcloop = as->mcloop ? (MSize)(as->mcloop - as->mcp) : 0;
  if (!as->loopref)
    asm_tail_fixup(as, T->link);  /* Note: this may change as->mctop! */
  T->szmcode = (MSize)(as->mctop - as->mcp);
  VG_INVALIDATE(T->mcode, T->szmcode);
}

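/* Note on the do/while above: asm_loop_fixup may discover that the loop is
** small enough for a short backwards branch. It then stores the desired
** loop entry in as->realign and aborts the pass, and the whole trace is
** reassembled with the loop start aligned to 16 bytes (see the NOP fill).
*/
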
/* Patch exit jumps of existing machine code to a new target. */
void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target)
{
  MCode *p = T->mcode;
  MCode *mcarea = lj_mcode_patch(J, p, 0);
  MSize len = T->szmcode;
  MCode *px = exitstub_addr(J, exitno) - 6;
  MCode *pe = p+len-6;
  uint32_t stateaddr = u32ptr(&J2G(J)->vmstate);
  if (len > 5 && p[len-5] == XI_JMP && p+len-6 + *(int32_t *)(p+len-4) == px)
    *(int32_t *)(p+len-4) = jmprel(p+len, target);
  /* Do not patch parent exit for a stack check. Skip beyond vmstate update. */
  for (; p < pe; p++) {
    if (*(uint32_t *)(p+(LJ_64 ? 3 : 2)) == stateaddr && p[0] == XI_MOVmi) {
      p += LJ_64 ? 11 : 10;
      break;
    }
  }
  for (; p < pe; p++) {
    if ((*(uint16_t *)p & 0xf0ff) == 0x800f && p + *(int32_t *)(p+2) == px) {
      *(int32_t *)(p+2) = jmprel(p+6, target);
      p += 5;
    }
  }
  lj_mcode_patch(J, mcarea, 1);
  VG_INVALIDATE(T->mcode, T->szmcode);
}
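
/* Note: conditional near jumps are encoded as 0F 8x rel32, so reading two
** bytes little-endian and masking with 0xf0ff yields 0x800f for every jcc.
** The rel32 at p+2 is verified to land on this trace's exit stub before it
** is redirected, so unrelated jumps with a similar byte pattern are left
** untouched.
*/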