/*
** IR assembler (SSA IR -> machine code).
** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
*/
#include "lj_dispatch.h"
#include "lj_target.h"
/* -- Assembler state and common macros ----------------------------------- */

/* Assembler state. */
typedef struct ASMState {
  RegCost cost[RID_MAX];  /* Reference and blended allocation cost for regs. */

  MCode *mcp;		/* Current MCode pointer (grows down). */
  MCode *mclim;		/* Lower limit for MCode memory + red zone. */

  IRIns *ir;		/* Copy of pointer to IR instructions/constants. */
  jit_State *J;		/* JIT compiler state. */

  x86ModRM mrm;		/* Fused x86 address operand. */

  RegSet freeset;	/* Set of free registers. */
  RegSet modset;	/* Set of registers modified inside the loop. */
  RegSet phiset;	/* Set of PHI registers. */

  uint32_t flags;	/* Copy of JIT compiler flags. */
  int loopinv;		/* Loop branch inversion (0:no, 1:yes, 2:yes+CC_P). */

  int32_t evenspill;	/* Next even spill slot. */
  int32_t oddspill;	/* Next odd spill slot (or 0). */

  IRRef curins;		/* Reference of current instruction. */
  IRRef stopins;	/* Stop assembly before hitting this instruction. */
  IRRef orignins;	/* Original T->nins. */

  IRRef snapref;	/* Current snapshot is active after this reference. */
  IRRef snaprename;	/* Rename highwater mark for snapshot check. */
  SnapNo snapno;	/* Current snapshot number. */
  SnapNo loopsnapno;	/* Loop snapshot number. */

  Trace *T;		/* Trace to assemble. */
  Trace *parent;	/* Parent trace (or NULL). */

  IRRef fuseref;	/* Fusion limit (loopref, 0 or FUSE_DISABLED). */
  IRRef sectref;	/* Section base reference (loopref or 0). */
  IRRef loopref;	/* Reference of LOOP instruction (or 0). */

  BCReg topslot;	/* Number of slots for stack check (unless 0). */
  MSize gcsteps;	/* Accumulated number of GC steps (per section). */

  MCode *mcbot;		/* Bottom of reserved MCode. */
  MCode *mctop;		/* Top of generated MCode. */
  MCode *mcloop;	/* Pointer to loop MCode (or NULL). */
  MCode *invmcp;	/* Points to invertible loop branch (or NULL). */
  MCode *testmcp;	/* Pending opportunity to remove test r,r. */
  MCode *realign;	/* Realign loop if not NULL. */

  IRRef1 phireg[RID_MAX];  /* PHI register references. */
  uint16_t parentmap[LJ_MAX_JSLOTS];  /* Parent slot to RegSP map. */
} ASMState;
#define IR(ref)			(&as->ir[(ref)])

#define ASMREF_TMP1		REF_TRUE	/* Temp. register. */
#define ASMREF_TMP2		REF_FALSE	/* Temp. register. */
#define ASMREF_L		REF_NIL		/* Stores register for L. */

/* Check for variant to invariant references. */
#define iscrossref(as, ref)	((ref) < as->sectref)

/* Inhibit memory op fusion from variant to invariant references. */
#define FUSE_DISABLED		(~(IRRef)0)
#define mayfuse(as, ref)	((ref) > as->fuseref)
#define neverfuse(as)		(as->fuseref == FUSE_DISABLED)
#define opisfusableload(o) \
  ((o) == IR_ALOAD || (o) == IR_HLOAD || (o) == IR_ULOAD || \
   (o) == IR_FLOAD || (o) == IR_SLOAD || (o) == IR_XLOAD)

/* Instruction selection for XMM moves. */
#define XMM_MOVRR(as)	((as->flags & JIT_F_SPLIT_XMM) ? XO_MOVSD : XO_MOVAPS)
#define XMM_MOVRM(as)	((as->flags & JIT_F_SPLIT_XMM) ? XO_MOVLPD : XO_MOVSD)

/* Sparse limit checks using a red zone before the actual limit. */
#define MCLIM_REDZONE	64
#define checkmclim(as) \
  if (LJ_UNLIKELY(as->mcp < as->mclim)) asm_mclimit(as)
static LJ_NORET LJ_NOINLINE void asm_mclimit(ASMState *as)
{
  lj_mcode_limiterr(as->J, (size_t)(as->mctop - as->mcp + 4*MCLIM_REDZONE));
}
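/* Illustrative sketch (assumption, not part of the assembler): the red zone
** makes per-byte bounds checks unnecessary. A driver loop only needs a
** sparse checkmclim() between instructions, along these lines:
**
**   while (as->curins > as->stopins) {
**     asm_ir(as, IR(as->curins--));  -- may emit a few dozen bytes
**     checkmclim(as);                -- one sparse check per instruction
**   }
**
** asm_ir() and the loop shape are hypothetical here; only checkmclim()
** and MCLIM_REDZONE are real.
*/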
/* -- Emit x86 instructions ----------------------------------------------- */

#define MODRM(mode, r1, r2)	((MCode)((mode)+(((r1)&7)<<3)+((r2)&7)))

#if LJ_64
#define REXRB(p, rr, rb) \
    { MCode rex = 0x40 + (((rr)>>1)&4) + (((rb)>>3)&1); \
      if (rex != 0x40) *--(p) = rex; }
#define FORCE_REX		0x200
#define REX_64			(FORCE_REX|0x080000)
#else
#define REXRB(p, rr, rb)	((void)0)
#define FORCE_REX		0
#define REX_64			0
#endif
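/* Illustrative example (not compiled): MODRM() packs the standard x86 ModRM
** byte with the mode in bits 7-6, the register field in bits 5-3 and the
** base/rm field in bits 2-0. E.g. for mov eax, ecx in register-direct mode:
**
**   MCode modrm = MODRM(XM_REG, RID_EAX, RID_ECX);  -- 0xc0 + (0<<3) + 1
**
** The concrete byte value 0xc1 assumes XM_REG == 0xc0, as defined by the
** x86 target header.
*/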
#define emit_i8(as, i)		(*--as->mcp = (MCode)(i))
#define emit_i32(as, i)		(*(int32_t *)(as->mcp-4) = (i), as->mcp -= 4)

#define emit_x87op(as, xo) \
  (*(uint16_t *)(as->mcp-2) = (uint16_t)(xo), as->mcp -= 2)
/* op */
static LJ_AINLINE MCode *emit_op(x86Op xo, Reg rr, Reg rb, Reg rx,
				 MCode *p, int delta)
{
  int n = (int8_t)xo;
#if defined(__GNUC__)
  if (__builtin_constant_p(xo) && n == -2)
    p[delta-2] = (MCode)(xo >> 24);
  else if (__builtin_constant_p(xo) && n == -3)
    *(uint16_t *)(p+delta-3) = (uint16_t)(xo >> 16);
  else
#endif
    *(uint32_t *)(p+delta-5) = (uint32_t)xo;
  p += n + delta;
#if LJ_64
  {
    uint32_t rex = 0x40 + ((rr>>1)&(4+(FORCE_REX>>1)))+((rx>>2)&2)+((rb>>3)&1);
    if (rex != 0x40) {
      rex |= (rr >> 16);
      if (n == -4) { *p = (MCode)rex; rex = (MCode)(xo >> 8); }
      *--p = (MCode)rex;
    }
  }
#else
  UNUSED(rr); UNUSED(rb); UNUSED(rx);
#endif
  return p;
}
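/* Illustrative note (assumption based on the encoding used above): an x86Op
** packs up to three opcode bytes into the upper bytes of a uint32_t and a
** negative byte count into the low byte, so n = (int8_t)xo tells emit_op()
** how many bytes to keep. A one-byte opcode is packed roughly as:
**
**   x86Op xo_mov = (x86Op)(0x0000fe + (0x8b << 24));  -- n == -2
**
** which is why emit_gmroi() below can synthesize group opcodes with
** (((xg >> 16) << 24) + 0xfe).
*/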
/* op + modrm */
#define emit_opm(xo, mode, rr, rb, p, delta) \
  (p[(delta)-1] = MODRM((mode), (rr), (rb)), \
   emit_op((xo), (rr), (rb), 0, (p), (delta)))

/* op + modrm + sib */
#define emit_opmx(xo, mode, scale, rr, rb, rx, p) \
  (p[-1] = MODRM((scale), (rx), (rb)), \
   p[-2] = MODRM((mode), (rr), RID_ESP), \
   emit_op((xo), (rr), (rb), (rx), (p), -1))
/* op r1, r2 */
static void emit_rr(ASMState *as, x86Op xo, Reg r1, Reg r2)
{
  MCode *p = as->mcp;
  as->mcp = emit_opm(xo, XM_REG, r1, r2, p, 0);
}
#if LJ_64 && defined(LUA_USE_ASSERT)
/* [addr] is sign-extended in x64 and must be in lower 2G (not 4G). */
static int32_t ptr2addr(void *p)
{
  lua_assert((uintptr_t)p < (uintptr_t)0x80000000);
  return i32ptr(p);
}
#else
#define ptr2addr(p)	(i32ptr((p)))
#endif
/* op r, [addr] */
static void emit_rma(ASMState *as, x86Op xo, Reg rr, const void *addr)
{
  MCode *p = as->mcp;
  *(int32_t *)(p-4) = ptr2addr(addr);
#if LJ_64
  p[-5] = MODRM(XM_SCALE1, RID_ESP, RID_EBP);
  as->mcp = emit_opm(xo, XM_OFS0, rr, RID_ESP, p, -5);
#else
  as->mcp = emit_opm(xo, XM_OFS0, rr, RID_EBP, p, -4);
#endif
}
/* op r, [base+ofs] */
static void emit_rmro(ASMState *as, x86Op xo, Reg rr, Reg rb, int32_t ofs)
{
  MCode *p = as->mcp;
  x86Mode mode;
  if (ra_hasreg(rb)) {
    if (ofs == 0 && (rb&7) != RID_EBP) {
      mode = XM_OFS0;
    } else if (checki8(ofs)) {
      *--p = (MCode)ofs;
      mode = XM_OFS8;
    } else {
      p -= 4;
      *(int32_t *)p = ofs;
      mode = XM_OFS32;
    }
    if ((rb&7) == RID_ESP)
      *--p = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
  } else {
    *(int32_t *)(p-4) = ofs;
#if LJ_64
    p[-5] = MODRM(XM_SCALE1, RID_ESP, RID_EBP);
    p -= 5;
    rb = RID_ESP;
#else
    p -= 4;
    rb = RID_EBP;
#endif
    mode = XM_OFS0;
  }
  as->mcp = emit_opm(xo, mode, rr, rb, p, 0);
}
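/* Illustrative example (not compiled): the mode selection above mirrors the
** three x86 displacement encodings. For a load via emit_rmro():
**
**   emit_rmro(as, XO_MOV, RID_EAX, RID_ECX, 0);      -- mov eax, [ecx]       (XM_OFS0)
**   emit_rmro(as, XO_MOV, RID_EAX, RID_ECX, 8);      -- mov eax, [ecx+8]     (XM_OFS8)
**   emit_rmro(as, XO_MOV, RID_EAX, RID_ECX, 0x100);  -- mov eax, [ecx+0x100] (XM_OFS32)
**
** [ebp] has no XM_OFS0 form and [esp] always needs a SIB byte, which is
** what the two special cases handle.
*/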
/* op r, [base+idx*scale+ofs] */
static void emit_rmrxo(ASMState *as, x86Op xo, Reg rr, Reg rb, Reg rx,
		       x86Mode scale, int32_t ofs)
{
  MCode *p = as->mcp;
  x86Mode mode;
  if (ofs == 0 && (rb&7) != RID_EBP) {
    mode = XM_OFS0;
  } else if (checki8(ofs)) {
    *--p = (MCode)ofs;
    mode = XM_OFS8;
  } else {
    p -= 4;
    *(int32_t *)p = ofs;
    mode = XM_OFS32;
  }
  as->mcp = emit_opmx(xo, mode, scale, rr, rb, rx, p);
}
/* op rm/r, i */
static void emit_gri(ASMState *as, x86Group xg, Reg rb, int32_t i)
{
  MCode *p = as->mcp;
  if (checki8(i)) {
    p -= 3;
    p[2] = (MCode)i;
    p[0] = (MCode)(xg >> 16);
  } else {
    p -= 6;
    *(int32_t *)(p+2) = i;
    p[0] = (MCode)(xg >> 8);
  }
  p[1] = MODRM(XM_REG, xg, rb);
  REXRB(p, 0, rb);
  as->mcp = p;
}
/* op [base+ofs], i */
static void emit_gmroi(ASMState *as, x86Group xg, Reg rb, int32_t ofs,
		       int32_t i)
{
  x86Op xo;
  if (checki8(i)) {
    emit_i8(as, i);
    xo = (x86Op)(((xg >> 16) << 24)+0xfe);
  } else {
    emit_i32(as, i);
    xo = (x86Op)(((xg >> 8) << 24)+0xfe);
  }
  emit_rmro(as, xo, (Reg)xg, rb, ofs);
}

/* shift rm/r, i */
#define emit_shifti(as, xg, r, i) \
  (emit_i8(as, (i)), emit_rr(as, XO_SHIFTi, (Reg)(xg), (r)))
/* op r, rm/mrm */
static void emit_mrm(ASMState *as, x86Op xo, Reg rr, Reg rb)
{
  MCode *p = as->mcp;
  x86Mode mode = XM_REG;
  if (rb == RID_MRM) {
    rb = as->mrm.base;
    if (rb == RID_NONE) {
      rb = RID_EBP;
      mode = XM_OFS0;
      p -= 4;
      *(int32_t *)p = as->mrm.ofs;
      if (as->mrm.idx != RID_NONE)
	goto mrmidx;
#if LJ_64
      *--p = MODRM(XM_SCALE1, RID_ESP, RID_EBP);
      rb = RID_ESP;
#endif
    } else {
      if (as->mrm.ofs == 0 && (rb&7) != RID_EBP) {
	mode = XM_OFS0;
      } else if (checki8(as->mrm.ofs)) {
	*--p = (MCode)as->mrm.ofs;
	mode = XM_OFS8;
      } else {
	p -= 4;
	*(int32_t *)p = as->mrm.ofs;
	mode = XM_OFS32;
      }
      if (as->mrm.idx != RID_NONE) {
      mrmidx:
	as->mcp = emit_opmx(xo, mode, as->mrm.scale, rr, rb, as->mrm.idx, p);
	return;
      }
      if ((rb&7) == RID_ESP)
	*--p = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
    }
  }
  as->mcp = emit_opm(xo, mode, rr, rb, p, 0);
}
/* add r, ofs (via lea or add) */
static void emit_addptr(ASMState *as, Reg r, int32_t ofs)
{
  if (ofs) {
    if ((as->flags & JIT_F_LEA_AGU))
      emit_rmro(as, XO_LEA, r, r, ofs);
    else
      emit_gri(as, XG_ARITHi(XOg_ADD), r, ofs);
  }
}
/* -- Emit moves ---------------------------------------------------------- */

/* Generic move between two regs. */
static void emit_movrr(ASMState *as, Reg r1, Reg r2)
{
  emit_rr(as, r1 < RID_MAX_GPR ? XO_MOV : XMM_MOVRR(as), r1, r2);
}

/* Generic move from [base+ofs]. */
static void emit_movrmro(ASMState *as, Reg rr, Reg rb, int32_t ofs)
{
  emit_rmro(as, rr < RID_MAX_GPR ? XO_MOV : XMM_MOVRM(as), rr, rb, ofs);
}
/* mov [base+ofs], i */
static void emit_movmroi(ASMState *as, Reg base, int32_t ofs, int32_t i)
{
  emit_i32(as, i);
  emit_rmro(as, XO_MOVmi, 0, base, ofs);
}

/* mov [base+ofs], r */
#define emit_movtomro(as, r, base, ofs) \
  emit_rmro(as, XO_MOVto, (r), (base), (ofs))

/* Get/set global_State fields. */
#define emit_opgl(as, xo, r, field) \
  emit_rma(as, (xo), (r), (void *)&J2G(as->J)->field)
#define emit_getgl(as, r, field)	emit_opgl(as, XO_MOV, (r), field)
#define emit_setgl(as, r, field)	emit_opgl(as, XO_MOVto, (r), field)
#define emit_setgli(as, field, i) \
  (emit_i32(as, i), emit_opgl(as, XO_MOVmi, 0, field))
/* mov r, i / xor r, r */
static void emit_loadi(ASMState *as, Reg r, int32_t i)
{
  if (i == 0) {
    emit_rr(as, XO_ARITH(XOg_XOR), r, r);
  } else {
    MCode *p = as->mcp;
    *(int32_t *)(p-4) = i;
    p[-5] = (MCode)(XI_MOVri+(r&7));
    p -= 5;
    REXRB(p, 0, r);
    as->mcp = p;
  }
}

/* mov r, addr */
#define emit_loada(as, r, addr) \
  emit_loadi(as, (r), ptr2addr((addr)))
/* movsd r, [&tv->n] / xorps r, r */
static void emit_loadn(ASMState *as, Reg r, cTValue *tv)
{
  if (tvispzero(tv))  /* Use xor only for +0. */
    emit_rr(as, XO_XORPS, r, r);
  else
    emit_rma(as, XMM_MOVRM(as), r, &tv->n);
}
/* -- Emit branches ------------------------------------------------------- */

/* Label for short jumps. */
typedef MCode *MCLabel;

/* jcc short target */
static void emit_sjcc(ASMState *as, int cc, MCLabel target)
{
  MCode *p = as->mcp;
  p[-1] = (MCode)(int8_t)(target-p);
  p[-2] = (MCode)(XI_JCCs+(cc&15));
  as->mcp = p - 2;
}
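/* Note (illustrative, follows from the emitters above): machine code is
** generated backwards, with as->mcp growing down. A short branch to an
** already-emitted (i.e. higher) address is therefore a *forward* jump in
** the final code, so its displacement target-p is known at emit time:
**
**   MCLabel l = emit_label(as);  -- some already-emitted point
**   -- ... emit more code in front of it ...
**   emit_sjcc(as, CC_E, l);      -- je l, rel8 computed immediately
**
** Only not-yet-emitted targets need the emit_sjcc_label()/emit_sfixup()
** pair below.
*/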
/* jcc short (pending target) */
static MCLabel emit_sjcc_label(ASMState *as, int cc)
{
  MCode *p = as->mcp;
  p[-1] = 0;
  p[-2] = (MCode)(XI_JCCs+(cc&15));
  as->mcp = p - 2;
  return p;
}

/* Fixup jcc short target. */
static void emit_sfixup(ASMState *as, MCLabel source)
{
  source[-1] = (MCode)(as->mcp-source);
}

/* Return label pointing to current PC. */
#define emit_label(as)		((as)->mcp)
/* jcc target */
static void emit_jcc(ASMState *as, int cc, MCode *target)
{
  MCode *p = as->mcp;
  int32_t addr = (int32_t)(target - p);
  *(int32_t *)(p-4) = addr;
  p[-5] = (MCode)(XI_JCCn+(cc&15));
  p[-6] = 0x0f;
  as->mcp = p - 6;
}

/* call target */
static void emit_call_(ASMState *as, MCode *target)
{
  MCode *p = as->mcp;
  *(int32_t *)(p-4) = (int32_t)(target - p);
  p[-5] = XI_CALL;
  as->mcp = p - 5;
}

#define emit_call(as, f)	emit_call_(as, (MCode *)(void *)(f))
/* -- Register allocator debugging ---------------------------------------- */

/* #define LUAJIT_DEBUG_RA */

#ifdef LUAJIT_DEBUG_RA

#include <stdio.h>
#include <stdarg.h>

#define RIDNAME(name)	#name,
static const char *const ra_regname[] = {
  GPRDEF(RIDNAME)
  FPRDEF(RIDNAME)
  NULL
};
#undef RIDNAME

static char ra_dbg_buf[65536];
static char *ra_dbg_p;
static char *ra_dbg_merge;
static MCode *ra_dbg_mcp;

static void ra_dstart(void)
{
  ra_dbg_p = ra_dbg_buf;
  ra_dbg_merge = NULL;
  ra_dbg_mcp = NULL;
}

static void ra_dflush(void)
{
  fwrite(ra_dbg_buf, 1, (size_t)(ra_dbg_p-ra_dbg_buf), stdout);
  ra_dstart();
}

static void ra_dprintf(ASMState *as, const char *fmt, ...)
{
  char *p;
  va_list argp;
  va_start(argp, fmt);
  p = ra_dbg_mcp == as->mcp ? ra_dbg_merge : ra_dbg_p;
  ra_dbg_mcp = NULL;
  p += sprintf(p, "%08x \e[36m%04d ", (uintptr_t)as->mcp, as->curins-REF_BIAS);
  for (;;) {
    const char *e = strchr(fmt, '$');
    if (e == NULL) break;
    memcpy(p, fmt, (size_t)(e-fmt));
    p += e-fmt;
    if (e[1] == 'r') {
      Reg r = va_arg(argp, Reg) & RID_MASK;
      if (r <= RID_MAX) {
	const char *q;
	for (q = ra_regname[r]; *q; q++)
	  *p++ = *q >= 'A' && *q <= 'Z' ? *q + 0x20 : *q;
      } else {
	*p++ = '?';
	lua_assert(0);
      }
    } else if (e[1] == 'f' || e[1] == 'i') {
      IRRef ref;
      if (e[1] == 'f')
	ref = va_arg(argp, IRRef);
      else
	ref = va_arg(argp, IRIns *) - as->ir;
      if (ref >= REF_BIAS)
	p += sprintf(p, "%04d", ref - REF_BIAS);
      else
	p += sprintf(p, "K%03d", REF_BIAS - ref);
    } else if (e[1] == 's') {
      uint32_t slot = va_arg(argp, uint32_t);
      p += sprintf(p, "[esp+0x%x]", sps_scale(slot));
    } else {
      lua_assert(0);
    }
    fmt = e+2;
  }
  va_end(argp);
  while (*fmt)
    *p++ = *fmt++;
  *p++ = '\e'; *p++ = '['; *p++ = 'm'; *p++ = '\n';
  if (p > ra_dbg_buf+sizeof(ra_dbg_buf)-256) {
    fwrite(ra_dbg_buf, 1, (size_t)(p-ra_dbg_buf), stdout);
    p = ra_dbg_buf;
  }
  ra_dbg_p = p;
}

#define RA_DBG_START()	ra_dstart()
#define RA_DBG_FLUSH()	ra_dflush()
#define RA_DBG_REF() \
  do { char *_p = ra_dbg_p; ra_dprintf(as, ""); \
       ra_dbg_merge = _p; ra_dbg_mcp = as->mcp; } while (0)
#define RA_DBGX(x)	ra_dprintf x

#else
#define RA_DBG_START()	((void)0)
#define RA_DBG_FLUSH()	((void)0)
#define RA_DBG_REF()	((void)0)
#define RA_DBGX(x)	((void)0)
#endif
/* -- Register allocator -------------------------------------------------- */

#define ra_free(as, r)		rset_set(as->freeset, (r))
#define ra_modified(as, r)	rset_set(as->modset, (r))

#define ra_used(ir)		(ra_hasreg((ir)->r) || ra_hasspill((ir)->s))
/* Setup register allocator. */
static void ra_setup(ASMState *as)
{
  /* Initially all regs (except the stack pointer) are free for use. */
  as->freeset = RSET_ALL;
  as->modset = RSET_EMPTY;
  as->phiset = RSET_EMPTY;
  memset(as->phireg, 0, sizeof(as->phireg));
  memset(as->cost, 0, sizeof(as->cost));
  as->cost[RID_ESP] = REGCOST(~0u, 0u);
}
/* Rematerialize constants. */
static Reg ra_rematk(ASMState *as, IRIns *ir)
{
  Reg r = ir->r;
  lua_assert(ra_hasreg(r) && !ra_hasspill(ir->s));
  ra_free(as, r);
  ra_modified(as, r);
  ir->r = RID_INIT;  /* Do not keep any hint. */
  RA_DBGX((as, "remat $i $r", ir, r));
  if (ir->o == IR_KNUM) {
    emit_loadn(as, r, ir_knum(ir));
  } else if (ir->o == IR_BASE) {
    ra_sethint(ir->r, RID_BASE);  /* Restore BASE register hint. */
    emit_getgl(as, r, jit_base);
  } else if (ir->o == IR_KPRI) {  /* REF_NIL stores ASMREF_L register. */
    lua_assert(irt_isnil(ir->t));
    emit_getgl(as, r, jit_L);
  } else {
    lua_assert(ir->o == IR_KINT || ir->o == IR_KGC ||
	       ir->o == IR_KPTR || ir->o == IR_KNULL);
    emit_loadi(as, r, ir->i);
  }
  return r;
}
609 static int32_t ra_spill(ASMState
*as
, IRIns
*ir
)
611 int32_t slot
= ir
->s
;
612 if (!ra_hasspill(slot
)) {
613 if (irt_isnum(ir
->t
)) {
614 slot
= as
->evenspill
;
616 } else if (as
->oddspill
) {
620 slot
= as
->evenspill
;
621 as
->oddspill
= slot
+1;
624 if (as
->evenspill
> 256)
625 lj_trace_err(as
->J
, LJ_TRERR_SPILLOV
);
626 ir
->s
= (uint8_t)slot
;
628 return sps_scale(slot
);
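/* Illustrative walk-through (the starting value of evenspill is an
** assumption, the policy is the code above): slots are 4 bytes apart after
** sps_scale() and doubles must stay 8-byte aligned, which is what the
** even/odd bookkeeping buys. From evenspill=2, oddspill=0:
**
**   num -> slot 2 (evenspill 2->4)             -- 8-byte aligned slot pair
**   int -> slot 4, oddspill=5 (evenspill 4->6) -- even half of a pair
**   int -> slot 5 (takes and clears oddspill)  -- leftover odd half
*/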
/* Release the temporarily allocated register in ASMREF_TMP1/ASMREF_TMP2. */
static Reg ra_releasetmp(ASMState *as, IRRef ref)
{
  IRIns *ir = IR(ref);
  Reg r = ir->r;
  lua_assert(ra_hasreg(r) && !ra_hasspill(ir->s));
  ra_free(as, r);
  ra_modified(as, r);
  ir->r = RID_INIT;
  return r;
}
/* Restore a register (marked as free). Rematerialize or force a spill. */
static Reg ra_restore(ASMState *as, IRRef ref)
{
  IRIns *ir = IR(ref);
  if (irref_isk(ref) || ref == REF_BASE) {
    return ra_rematk(as, ir);
  } else {
    Reg r = ir->r;
    lua_assert(ra_hasreg(r));
    ra_free(as, r);
    ra_modified(as, r);
    ra_sethint(ir->r, r);  /* Keep hint. */
    RA_DBGX((as, "restore $i $r", ir, r));
    emit_movrmro(as, r, RID_ESP, ra_spill(as, ir));  /* Force a spill. */
    return r;
  }
}
/* Save a register to a spill slot. */
static LJ_AINLINE void ra_save(ASMState *as, IRIns *ir, Reg r)
{
  RA_DBGX((as, "save $i $r", ir, r));
  emit_rmro(as, r < RID_MAX_GPR ? XO_MOVto : XO_MOVSDto,
	    r, RID_ESP, sps_scale(ir->s));
}
#define MINCOST(r) \
  if (LJ_LIKELY(allow&RID2RSET(r)) && as->cost[r] < cost) \
    cost = as->cost[r]

/* Evict the register with the lowest cost, forcing a restore. */
static Reg ra_evict(ASMState *as, RegSet allow)
{
  RegCost cost = ~(RegCost)0;
  if (allow < RID2RSET(RID_MAX_GPR)) {
    MINCOST(RID_EAX);MINCOST(RID_ECX);MINCOST(RID_EDX);MINCOST(RID_EBX);
    MINCOST(RID_EBP);MINCOST(RID_ESI);MINCOST(RID_EDI);
#if LJ_64
    MINCOST(RID_R8D);MINCOST(RID_R9D);MINCOST(RID_R10D);MINCOST(RID_R11D);
    MINCOST(RID_R12D);MINCOST(RID_R13D);MINCOST(RID_R14D);MINCOST(RID_R15D);
#endif
  } else {
    MINCOST(RID_XMM0);MINCOST(RID_XMM1);MINCOST(RID_XMM2);MINCOST(RID_XMM3);
    MINCOST(RID_XMM4);MINCOST(RID_XMM5);MINCOST(RID_XMM6);MINCOST(RID_XMM7);
#if LJ_64
    MINCOST(RID_XMM8);MINCOST(RID_XMM9);MINCOST(RID_XMM10);MINCOST(RID_XMM11);
    MINCOST(RID_XMM12);MINCOST(RID_XMM13);MINCOST(RID_XMM14);MINCOST(RID_XMM15);
#endif
  }
  lua_assert(allow != RSET_EMPTY);
  lua_assert(regcost_ref(cost) >= as->T->nk && regcost_ref(cost) < as->T->nins);
  return ra_restore(as, regcost_ref(cost));
}
/* Pick any register (marked as free). Evict on-demand. */
static LJ_AINLINE Reg ra_pick(ASMState *as, RegSet allow)
{
  RegSet pick = as->freeset & allow;
  if (!pick)
    return ra_evict(as, allow);
  else
    return rset_picktop(pick);
}

/* Get a scratch register (marked as free). */
static LJ_AINLINE Reg ra_scratch(ASMState *as, RegSet allow)
{
  Reg r = ra_pick(as, allow);
  ra_modified(as, r);
  RA_DBGX((as, "scratch $r", r));
  return r;
}
/* Evict all registers from a set (if not free). */
static void ra_evictset(ASMState *as, RegSet drop)
{
  as->modset |= drop;
  drop &= ~as->freeset;
  while (drop) {
    Reg r = rset_picktop(drop);
    ra_restore(as, regcost_ref(as->cost[r]));
    rset_clear(drop, r);
    checkmclim(as);
  }
}
/* Allocate a register for ref from the allowed set of registers.
** Note: this function assumes the ref does NOT have a register yet!
** Picks an optimal register, sets the cost and marks the register as non-free.
*/
static Reg ra_allocref(ASMState *as, IRRef ref, RegSet allow)
{
  IRIns *ir = IR(ref);
  RegSet pick = as->freeset & allow;
  Reg r;
  lua_assert(ra_noreg(ir->r));
  if (pick) {
    /* First check register hint from propagation or PHI. */
    if (ra_hashint(ir->r)) {
      r = ra_gethint(ir->r);
      if (rset_test(pick, r))  /* Use hint register if possible. */
	goto found;
      /* Rematerialization is cheaper than missing a hint. */
      if (rset_test(allow, r) && irref_isk(regcost_ref(as->cost[r]))) {
	ra_rematk(as, IR(regcost_ref(as->cost[r])));
	goto found;
      }
      RA_DBGX((as, "hintmiss $f $r", ref, r));
    }
    /* Invariants should preferably get unmodified registers. */
    if (ref < as->loopref && !irt_isphi(ir->t)) {
      if ((pick & ~as->modset))
	pick &= ~as->modset;
      r = rset_pickbot(pick);  /* Reduce conflicts with inverse allocation. */
    } else {
      r = rset_picktop(pick);
    }
  } else {
    r = ra_evict(as, allow);
  }
found:
  RA_DBGX((as, "alloc $f $r", ref, r));
  ir->r = (uint8_t)r;
  rset_clear(as->freeset, r);
  as->cost[r] = REGCOST_REF_T(ref, irt_t(ir->t));
  return r;
}
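/* Illustrative walk-through (the register values are assumptions, the
** policy is the code above): with loopref set and modset = {eax}, an
** invariant non-PHI ref with pick = {eax,ecx,edx} first drops the modified
** regs (pick -> {ecx,edx}) and then takes rset_pickbot(), i.e. ecx, so
** loop-variant values can keep using the top-of-set registers chosen by
** the rset_picktop() branch.
*/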
/* Allocate a register on-demand. */
static LJ_INLINE Reg ra_alloc1(ASMState *as, IRRef ref, RegSet allow)
{
  Reg r = IR(ref)->r;
  /* Note: allow is ignored if the register is already allocated. */
  if (ra_noreg(r)) r = ra_allocref(as, ref, allow);
  return r;
}
/* Rename register allocation and emit move. */
static void ra_rename(ASMState *as, Reg down, Reg up)
{
  IRRef ren, ref = regcost_ref(as->cost[up] = as->cost[down]);
  IR(ref)->r = (uint8_t)up;
  as->cost[down] = 0;
  lua_assert((down < RID_MAX_GPR) == (up < RID_MAX_GPR));
  lua_assert(!rset_test(as->freeset, down) && rset_test(as->freeset, up));
  rset_set(as->freeset, down);  /* 'down' is free ... */
  rset_clear(as->freeset, up);  /* ... and 'up' is now allocated. */
  RA_DBGX((as, "rename $f $r $r", regcost_ref(as->cost[up]), down, up));
  emit_movrr(as, down, up);  /* Backwards code generation needs inverse move. */
  if (!ra_hasspill(IR(ref)->s)) {  /* Add the rename to the IR. */
    lj_ir_set(as->J, IRT(IR_RENAME, IRT_NIL), ref, as->snapno);
    ren = tref_ref(lj_ir_emit(as->J));
    as->ir = as->T->ir;  /* The IR may have been reallocated. */
    IR(ren)->r = (uint8_t)down;
    IR(ren)->s = SPS_NONE;
  }
}
/* Pick a destination register (marked as free).
** Caveat: allow is ignored if there's already a destination register.
** Use ra_destreg() to get a specific register.
*/
static Reg ra_dest(ASMState *as, IRIns *ir, RegSet allow)
{
  Reg dest = ir->r;
  if (ra_hasreg(dest)) {
    ra_free(as, dest);
    ra_modified(as, dest);
  } else {
    dest = ra_scratch(as, allow);
    ir->r = dest;
  }
  if (LJ_UNLIKELY(ra_hasspill(ir->s))) ra_save(as, ir, dest);
  return dest;
}

/* Force a specific destination register (marked as free). */
static void ra_destreg(ASMState *as, IRIns *ir, Reg r)
{
  Reg dest = ra_dest(as, ir, RID2RSET(r));
  if (dest != r) {
    ra_scratch(as, RID2RSET(r));
    emit_movrr(as, dest, r);
  }
}
/* Propagate dest register to left reference. Emit moves as needed.
** This is a required fixup step for all 2-operand machine instructions.
*/
static void ra_left(ASMState *as, Reg dest, IRRef lref)
{
  IRIns *ir = IR(lref);
  Reg left = ir->r;
  if (ra_noreg(left)) {
    if (irref_isk(lref)) {
      if (ir->o == IR_KNUM) {
	cTValue *tv = ir_knum(ir);
	/* FP remat needs a load except for +0. Still better than eviction. */
	if (tvispzero(tv) || !(as->freeset & RSET_FPR)) {
	  emit_loadn(as, dest, tv);
	  return;
	}
      } else {
	lua_assert(ir->o == IR_KINT || ir->o == IR_KGC ||
		   ir->o == IR_KPTR || ir->o == IR_KNULL);
	emit_loadi(as, dest, ir->i);
	return;
      }
    }
    if (!ra_hashint(left) && !iscrossref(as, lref))
      ra_sethint(ir->r, dest);  /* Propagate register hint. */
    left = ra_allocref(as, lref, dest < RID_MAX_GPR ? RSET_GPR : RSET_FPR);
  }
  /* Move needed for true 3-operand instruction: y=a+b ==> y=a; y+=b. */
  if (dest != left) {
    /* Use register renaming if dest is the PHI reg. */
    if (irt_isphi(ir->t) && as->phireg[dest] == lref) {
      ra_modified(as, left);
      ra_rename(as, left, dest);
    } else {
      emit_movrr(as, dest, left);
    }
  }
}
/* -- Exit stubs ---------------------------------------------------------- */

/* Generate an exit stub group at the bottom of the reserved MCode memory. */
static MCode *asm_exitstub_gen(ASMState *as, ExitNo group)
{
  ExitNo i, groupofs = (group*EXITSTUBS_PER_GROUP) & 0xff;
  MCode *mxp = as->mcbot;
  MCode *mxpstart = mxp;
  if (mxp + (2+2)*EXITSTUBS_PER_GROUP+8+5 >= as->mctop)
    asm_mclimit(as);
  /* Push low byte of exitno for each exit stub. */
  *mxp++ = XI_PUSHi8; *mxp++ = (MCode)groupofs;
  for (i = 1; i < EXITSTUBS_PER_GROUP; i++) {
    *mxp++ = XI_JMPs; *mxp++ = (MCode)((2+2)*(EXITSTUBS_PER_GROUP - i) - 2);
    *mxp++ = XI_PUSHi8; *mxp++ = (MCode)(groupofs + i);
  }
  /* Push the high byte of the exitno for each exit stub group. */
  *mxp++ = XI_PUSHi8; *mxp++ = (MCode)((group*EXITSTUBS_PER_GROUP)>>8);
  /* Store DISPATCH in ExitInfo->dispatch. Account for the two push ops. */
  *mxp++ = XI_MOVmi;
  *mxp++ = MODRM(XM_OFS8, 0, RID_ESP);
  *mxp++ = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
  *mxp++ = 2*sizeof(void *);
  *(int32_t *)mxp = ptr2addr(J2GG(as->J)->dispatch); mxp += 4;
  /* Jump to exit handler which fills in the ExitState. */
  *mxp++ = XI_JMP; mxp += 4;
  *((int32_t *)(mxp-4)) = (int32_t)((MCode *)lj_vm_exit_handler - mxp);
  /* Commit the code for this group (even if assembly fails later on). */
  lj_mcode_commitbot(as->J, mxp);
  as->mcbot = mxp;
  as->mclim = as->mcbot + MCLIM_REDZONE;
  return mxpstart;
}
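/* Illustrative layout (derived from the emitter above, the (2+2) stride is
** the one it assumes): one group of exit stubs looks like
**
**   push groupofs+0                -- stub 0: low byte of exitno
**   jmp  tail; push groupofs+1     -- stub 1 hops over the rest
**   ...
**   jmp  tail; push groupofs+EXITSTUBS_PER_GROUP-1
**   tail:
**   push highbyte                  -- shared by the whole group
**   mov  dword [esp+8], DISPATCH
**   jmp  lj_vm_exit_handler
**
** The fixed per-stub stride is what lets exitstub_addr() compute a stub
** address from the exit number alone.
*/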
/* Setup all needed exit stubs. */
static void asm_exitstub_setup(ASMState *as, ExitNo nexits)
{
  ExitNo i;
  if (nexits >= EXITSTUBS_PER_GROUP*LJ_MAX_EXITSTUBGR)
    lj_trace_err(as->J, LJ_TRERR_SNAPOV);
  for (i = 0; i < (nexits+EXITSTUBS_PER_GROUP-1)/EXITSTUBS_PER_GROUP; i++)
    if (as->J->exitstubgroup[i] == NULL)
      as->J->exitstubgroup[i] = asm_exitstub_gen(as, i);
}
/* -- Snapshot and guard handling ----------------------------------------- */

/* Can we rematerialize a KNUM instead of forcing a spill? */
static int asm_snap_canremat(ASMState *as)
{
  Reg r;
  for (r = RID_MIN_FPR; r < RID_MAX_FPR; r++)
    if (irref_isk(regcost_ref(as->cost[r])))
      return 1;
  return 0;
}
/* Allocate registers or spill slots for refs escaping to a snapshot. */
static void asm_snap_alloc(ASMState *as)
{
  SnapShot *snap = &as->T->snap[as->snapno];
  IRRef2 *map = &as->T->snapmap[snap->mapofs];
  BCReg s, nslots = snap->nslots;
  for (s = 0; s < nslots; s++) {
    IRRef ref = snap_ref(map[s]);
    if (!irref_isk(ref)) {
      IRIns *ir = IR(ref);
      if (!ra_used(ir) && ir->o != IR_FRAME) {
	RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
	/* Not a var-to-invar ref and got a free register (or a remat)? */
	if ((!iscrossref(as, ref) || irt_isphi(ir->t)) &&
	    ((as->freeset & allow) ||
	     (allow == RSET_FPR && asm_snap_canremat(as)))) {
	  ra_allocref(as, ref, allow);  /* Allocate a register. */
	  RA_DBGX((as, "snapreg $f $r", ref, ir->r));
	} else {
	  ra_spill(as, ir);  /* Otherwise force a spill slot. */
	  RA_DBGX((as, "snapspill $f $s", ref, ir->s));
	}
      }
    }
  }
}
/* All guards for a snapshot use the same exitno. This is currently the
** same as the snapshot number. Since the exact origin of the exit cannot
** be determined, all guards for the same snapshot must exit with the same
** RegSP mapping.
** A renamed ref which has been used in a prior guard for the same snapshot
** would cause an inconsistency. The easy way out is to force a spill slot.
*/
static int asm_snap_checkrename(ASMState *as, IRRef ren)
{
  SnapShot *snap = &as->T->snap[as->snapno];
  IRRef2 *map = &as->T->snapmap[snap->mapofs];
  BCReg s, nslots = snap->nslots;
  for (s = 0; s < nslots; s++) {
    IRRef ref = snap_ref(map[s]);
    if (ref == ren) {
      IRIns *ir = IR(ref);
      ra_spill(as, ir);  /* Register renamed, so force a spill slot. */
      RA_DBGX((as, "snaprensp $f $s", ref, ir->s));
      return 1;  /* Found. */
    }
  }
  return 0;  /* Not found. */
}
/* Prepare snapshot for next guard instruction. */
static void asm_snap_prep(ASMState *as)
{
  if (as->curins < as->snapref) {
    do {
      lua_assert(as->snapno != 0);
      as->snapno--;
      as->snapref = as->T->snap[as->snapno].ref;
    } while (as->curins < as->snapref);
    asm_snap_alloc(as);
    as->snaprename = as->T->nins;
  } else {
    /* Process any renames above the highwater mark. */
    for (; as->snaprename < as->T->nins; as->snaprename++) {
      IRIns *ir = IR(as->snaprename);
      if (asm_snap_checkrename(as, ir->op1))
	ir->op2 = REF_BIAS-1;  /* Kill rename. */
    }
  }
}
/* Emit conditional branch to exit for guard.
** It's important to emit this *after* all registers have been allocated,
** because rematerializations may invalidate the flags.
*/
static void asm_guardcc(ASMState *as, int cc)
{
  MCode *target = exitstub_addr(as->J, as->snapno);
  MCode *p = as->mcp;
  if (LJ_UNLIKELY(p == as->invmcp)) {
    as->loopinv = 1;
    *(int32_t *)(p+1) = target - (p+5);
    target = p;
    cc ^= 1;
    if (as->realign) {
      emit_sjcc(as, cc, target);
      return;
    }
  }
  emit_jcc(as, cc, target);
}
/* -- Memory operand fusion ----------------------------------------------- */

/* Arch-specific field offsets. */
static const uint8_t field_ofs[IRFL__MAX+1] = {
#define FLOFS(name, ofs)	(uint8_t)(ofs),
IRFLDEF(FLOFS)
#undef FLOFS
  0
};

/* Limit linear search to this distance. Avoids O(n^2) behavior. */
#define CONFLICT_SEARCH_LIM	15
/* Check if there's no conflicting instruction between curins and ref. */
static int noconflict(ASMState *as, IRRef ref, IROp conflict)
{
  IRIns *ir = as->ir;
  IRRef i = as->curins;
  if (i > ref + CONFLICT_SEARCH_LIM)
    return 0;  /* Give up, ref is too far away. */
  while (--i > ref)
    if (ir[i].o == conflict)
      return 0;  /* Conflict found. */
  return 1;  /* Ok, no conflict. */
}
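/* Illustrative example (hypothetical IR, the scan is the code above): when
** trying to fuse an ALOAD at 0051 into a compare at 0054, an intervening
** ASTORE to a possibly aliasing slot must block the fusion:
**
**   0051 ALOAD  0049
**   0052 ASTORE 0050 0048   -- conflict: noconflict() returns 0
**   0054 EQ     0051 0053
**
** The CONFLICT_SEARCH_LIM cutoff simply gives up (returns 0) when the load
** is too far away, keeping the scan O(1) per fusion attempt.
*/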
/* Fuse array reference into memory operand. */
static void asm_fusearef(ASMState *as, IRIns *ir, RegSet allow)
{
  IRIns *irb = IR(ir->op1);
  IRIns *ira, *irx;
  lua_assert(ir->o == IR_AREF);
  lua_assert(irb->o == IR_FLOAD && irb->op2 == IRFL_TAB_ARRAY);
  ira = IR(irb->op1);
  if (ira->o == IR_TNEW && ira->op1 <= LJ_MAX_COLOSIZE &&
      noconflict(as, irb->op1, IR_NEWREF)) {
    /* We can avoid the FLOAD of t->array for colocated arrays. */
    as->mrm.base = (uint8_t)ra_alloc1(as, irb->op1, allow);  /* Table obj. */
    as->mrm.ofs = -(int32_t)(ira->op1*sizeof(TValue));  /* Ofs to colo array. */
  } else {
    as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow);  /* Array base. */
    as->mrm.ofs = 0;
  }
  irx = IR(ir->op2);
  if (irref_isk(ir->op2)) {
    as->mrm.ofs += 8*irx->i;
    as->mrm.idx = RID_NONE;
  } else {
    rset_clear(allow, as->mrm.base);
    as->mrm.scale = XM_SCALE8;
    /* Fuse a constant ADD (e.g. t[i+1]) into the offset.
    ** Doesn't help much without ABCelim, but reduces register pressure.
    */
    if (mayfuse(as, ir->op2) && ra_noreg(irx->r) &&
	irx->o == IR_ADD && irref_isk(irx->op2)) {
      as->mrm.ofs += 8*IR(irx->op2)->i;
      as->mrm.idx = (uint8_t)ra_alloc1(as, irx->op1, allow);
    } else {
      as->mrm.idx = (uint8_t)ra_alloc1(as, ir->op2, allow);
    }
  }
}
/* Fuse array/hash/upvalue reference into memory operand.
** Caveat: this may allocate GPRs for the base/idx registers. Be sure to
** pass the final allow mask, excluding any GPRs used for other inputs.
** In particular: 2-operand GPR instructions need to call ra_dest() first!
*/
static void asm_fuseahuref(ASMState *as, IRRef ref, RegSet allow)
{
  IRIns *ir = IR(ref);
  if (ra_noreg(ir->r)) {
    switch ((IROp)ir->o) {
    case IR_AREF:
      if (mayfuse(as, ref)) {
	asm_fusearef(as, ir, allow);
	return;
      }
      break;
    case IR_HREFK:
      if (mayfuse(as, ref)) {
	as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow);
	as->mrm.ofs = (int32_t)(IR(ir->op2)->op2 * sizeof(Node));
	as->mrm.idx = RID_NONE;
	return;
      }
      break;
    case IR_UREFC:
      if (irref_isk(ir->op1)) {
	GCfunc *fn = ir_kfunc(IR(ir->op1));
	GCupval *uv = &gcref(fn->l.uvptr[ir->op2])->uv;
	as->mrm.ofs = ptr2addr(&uv->tv);
	as->mrm.base = as->mrm.idx = RID_NONE;
	return;
      }
      break;
    default:
      lua_assert(ir->o == IR_HREF || ir->o == IR_NEWREF || ir->o == IR_UREFO);
      break;
    }
  }
  as->mrm.base = (uint8_t)ra_alloc1(as, ref, allow);
  as->mrm.ofs = 0;
  as->mrm.idx = RID_NONE;
}
/* Fuse FLOAD/FREF reference into memory operand. */
static void asm_fusefref(ASMState *as, IRIns *ir, RegSet allow)
{
  lua_assert(ir->o == IR_FLOAD || ir->o == IR_FREF);
  as->mrm.ofs = field_ofs[ir->op2];
  as->mrm.idx = RID_NONE;
  if (irref_isk(ir->op1)) {
    as->mrm.ofs += IR(ir->op1)->i;
    as->mrm.base = RID_NONE;
  } else {
    as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow);
  }
}
/* Fuse string reference into memory operand. */
static void asm_fusestrref(ASMState *as, IRIns *ir, RegSet allow)
{
  IRIns *irr;
  lua_assert(ir->o == IR_STRREF);
  as->mrm.base = as->mrm.idx = RID_NONE;
  as->mrm.scale = XM_SCALE1;
  as->mrm.ofs = sizeof(GCstr);
  if (irref_isk(ir->op1)) {
    as->mrm.ofs += IR(ir->op1)->i;
  } else {
    Reg r = ra_alloc1(as, ir->op1, allow);
    rset_clear(allow, r);
    as->mrm.base = (uint8_t)r;
  }
  irr = IR(ir->op2);
  if (irref_isk(ir->op2)) {
    as->mrm.ofs += irr->i;
  } else {
    Reg r;
    /* Fuse a constant add into the offset, e.g. string.sub(s, i+10). */
    if (mayfuse(as, ir->op2) && irr->o == IR_ADD && irref_isk(irr->op2)) {
      as->mrm.ofs += IR(irr->op2)->i;
      r = ra_alloc1(as, irr->op1, allow);
    } else {
      r = ra_alloc1(as, ir->op2, allow);
    }
    if (as->mrm.base == RID_NONE)
      as->mrm.base = (uint8_t)r;
    else
      as->mrm.idx = (uint8_t)r;
  }
}
static void asm_fusexref(ASMState *as, IRIns *ir, RegSet allow)
{
  if (ir->o == IR_KPTR) {
    as->mrm.ofs = ir->i;
    as->mrm.base = as->mrm.idx = RID_NONE;
  } else {
    lua_assert(ir->o == IR_STRREF);
    asm_fusestrref(as, ir, allow);
  }
}
/* Fuse load into memory operand. */
static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
{
  IRIns *ir = IR(ref);
  if (ra_hasreg(ir->r)) {
    if (allow != RSET_EMPTY) return ir->r;  /* Fast path. */
  fusespill:
    /* Force a spill if only memory operands are allowed (asm_x87load). */
    as->mrm.base = RID_ESP;
    as->mrm.ofs = ra_spill(as, ir);
    as->mrm.idx = RID_NONE;
    return RID_MRM;
  }
  if (ir->o == IR_KNUM) {
    RegSet avail = as->freeset & ~as->modset & RSET_FPR;
    lua_assert(allow != RSET_EMPTY);
    if (!(avail & (avail-1))) {  /* Fuse if less than two regs available. */
      as->mrm.ofs = ptr2addr(ir_knum(ir));
      as->mrm.base = as->mrm.idx = RID_NONE;
      return RID_MRM;
    }
  } else if (mayfuse(as, ref)) {
    RegSet xallow = (allow & RSET_GPR) ? allow : RSET_GPR;
    if (ir->o == IR_SLOAD) {
      if (!irt_isint(ir->t) && !(ir->op2 & IRSLOAD_PARENT)) {
	as->mrm.base = (uint8_t)ra_alloc1(as, REF_BASE, xallow);
	as->mrm.ofs = 8*((int32_t)ir->op1-1);
	as->mrm.idx = RID_NONE;
	return RID_MRM;
      }
    } else if (ir->o == IR_FLOAD) {
      /* Generic fusion is only ok for 32 bit operand (but see asm_comp). */
      if ((irt_isint(ir->t) || irt_isaddr(ir->t)) &&
	  noconflict(as, ref, IR_FSTORE)) {
	asm_fusefref(as, ir, xallow);
	return RID_MRM;
      }
    } else if (ir->o == IR_ALOAD || ir->o == IR_HLOAD || ir->o == IR_ULOAD) {
      if (noconflict(as, ref, ir->o + IRDELTA_L2S)) {
	asm_fuseahuref(as, ir->op1, xallow);
	return RID_MRM;
      }
    } else if (ir->o == IR_XLOAD) {
      /* Generic fusion is only ok for 32 bit operand (but see asm_comp).
      ** Fusing unaligned memory operands is ok on x86 (except for SIMD types).
      */
      if (irt_isint(ir->t) || irt_isaddr(ir->t)) {
	asm_fusexref(as, IR(ir->op1), xallow);
	return RID_MRM;
      }
    }
  }
  if (!(as->freeset & allow) &&
      (allow == RSET_EMPTY || ra_hasspill(ir->s) || ref < as->loopref))
    goto fusespill;
  return ra_allocref(as, ref, allow);
}
/* -- Calls --------------------------------------------------------------- */

/* Generate a call to a C function. */
static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
{
  RegSet allow = RSET_ALL;
  uint32_t n, nargs = CCI_NARGS(ci);
  int32_t ofs = 0;
  lua_assert(!(nargs > 2 && (ci->flags&CCI_FASTCALL)));  /* Avoid stack adj. */
  emit_call(as, ci->func);
  for (n = 0; n < nargs; n++) {  /* Setup args. */
#if LJ_64
#error "NYI: 64 bit mode call argument setup"
#endif
    IRIns *ir = IR(args[n]);
    if (irt_isnum(ir->t)) {
      if ((ofs & 4) && irref_isk(args[n])) {
	/* Split stores for unaligned FP consts. */
	emit_movmroi(as, RID_ESP, ofs, (int32_t)ir_knum(ir)->u32.lo);
	emit_movmroi(as, RID_ESP, ofs+4, (int32_t)ir_knum(ir)->u32.hi);
      } else {
	Reg r;
	if ((allow & RSET_FPR) == RSET_EMPTY)
	  lj_trace_err(as->J, LJ_TRERR_NYICOAL);
	r = ra_alloc1(as, args[n], allow & RSET_FPR);
	allow &= ~RID2RSET(r);
	emit_rmro(as, XO_MOVSDto, r, RID_ESP, ofs);
      }
      ofs += 8;
    } else {
      if ((ci->flags & CCI_FASTCALL) && n < 2) {
	Reg r = n == 0 ? RID_ECX : RID_EDX;
	if (args[n] < ASMREF_TMP1) {
	  emit_loadi(as, r, ir->i);
	} else {
	  lua_assert(rset_test(as->freeset, r));  /* Must have been evicted. */
	  allow &= ~RID2RSET(r);
	  if (ra_hasreg(ir->r))
	    emit_movrr(as, r, ir->r);
	  else
	    ra_allocref(as, args[n], RID2RSET(r));
	}
      } else {
	if (args[n] < ASMREF_TMP1) {
	  emit_movmroi(as, RID_ESP, ofs, ir->i);
	} else {
	  Reg r;
	  if ((allow & RSET_GPR) == RSET_EMPTY)
	    lj_trace_err(as->J, LJ_TRERR_NYICOAL);
	  r = ra_alloc1(as, args[n], allow & RSET_GPR);
	  allow &= ~RID2RSET(r);
	  emit_movtomro(as, r, RID_ESP, ofs);
	}
	ofs += 4;
      }
    }
  }
}
/* Setup result reg/sp for call. Evict scratch regs. */
static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
{
  RegSet drop = RSET_SCRATCH;
  if ((ci->flags & CCI_NOFPRCLOBBER))
    drop &= ~RSET_FPR;
  if (ra_hasreg(ir->r))
    rset_clear(drop, ir->r);  /* Dest reg handled below. */
  ra_evictset(as, drop);  /* Evictions must be performed first. */
  if (ra_used(ir)) {
    if (irt_isnum(ir->t)) {
      int32_t ofs = sps_scale(ir->s); /* Use spill slot or slots SPS_TEMP1/2. */
#if LJ_64
      if ((ci->flags & CCI_CASTU64)) {
	Reg dest = ir->r;
	if (ra_hasreg(dest)) {
	  ra_free(as, dest);
	  ra_modified(as, dest);
	  emit_rr(as, XO_MOVD, dest|REX_64, RID_RET);  /* Really MOVQ. */
	} else {
	  emit_movrmro(as, RID_RET, RID_ESP, ofs);
	}
      } else {
	ra_destreg(as, ir, RID_FPRET);
      }
#else
      /* Number result is in x87 st0 for x86 calling convention. */
      Reg dest = ir->r;
      if (ra_hasreg(dest)) {
	ra_free(as, dest);
	ra_modified(as, dest);
	emit_rmro(as, XMM_MOVRM(as), dest, RID_ESP, ofs);
      }
      if ((ci->flags & CCI_CASTU64)) {
	emit_movtomro(as, RID_RET, RID_ESP, ofs);
	emit_movtomro(as, RID_RETHI, RID_ESP, ofs+4);
      } else {
	emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs);
      }
#endif
    } else {
      lua_assert(!irt_ispri(ir->t));
      ra_destreg(as, ir, RID_RET);
    }
  }
}
/* Collect arguments from CALL* and ARG instructions. */
static void asm_collectargs(ASMState *as, IRIns *ir,
			    const CCallInfo *ci, IRRef *args)
{
  uint32_t n = CCI_NARGS(ci);
  lua_assert(n <= CCI_NARGS_MAX);
  if ((ci->flags & CCI_L)) { *args++ = ASMREF_L; n--; }
  while (n-- > 1) {
    ir = IR(ir->op1);
    lua_assert(ir->o == IR_CARG);
    args[n] = ir->op2;
  }
  args[0] = ir->op1;
  lua_assert(IR(ir->op1)->o != IR_CARG);
}

static void asm_call(ASMState *as, IRIns *ir)
{
  IRRef args[CCI_NARGS_MAX];
  const CCallInfo *ci = &lj_ir_callinfo[ir->op2];
  asm_collectargs(as, ir, ci, args);
  asm_setupresult(as, ir, ci);
  asm_gencall(as, ci, args);
}
/* -- Type conversions ---------------------------------------------------- */

static void asm_tonum(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, RSET_FPR);
  Reg left = asm_fuseload(as, ir->op1, RSET_GPR);
  emit_mrm(as, XO_CVTSI2SD, dest, left);
  if (!(as->flags & JIT_F_SPLIT_XMM))
    emit_rr(as, XO_XORPS, dest, dest);  /* Avoid partial register stall. */
}
static void asm_tointg(ASMState *as, IRIns *ir, Reg left)
{
  Reg tmp = ra_scratch(as, rset_exclude(RSET_FPR, left));
  Reg dest = ra_dest(as, ir, RSET_GPR);
  asm_guardcc(as, CC_P);
  asm_guardcc(as, CC_NE);
  emit_rr(as, XO_UCOMISD, left, tmp);
  emit_rr(as, XO_CVTSI2SD, tmp, dest);
  if (!(as->flags & JIT_F_SPLIT_XMM))
    emit_rr(as, XO_XORPS, tmp, tmp);  /* Avoid partial register stall. */
  emit_rr(as, XO_CVTTSD2SI, dest, left);
  /* Can't fuse since left is needed twice. */
}

static void asm_toint(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, RSET_GPR);
  Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
  emit_mrm(as, XO_CVTSD2SI, dest, left);
}
static void asm_tobit(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, RSET_GPR);
  Reg tmp = ra_noreg(IR(ir->op1)->r) ?
	      ra_alloc1(as, ir->op1, RSET_FPR) :
	      ra_scratch(as, RSET_FPR);
  Reg right = asm_fuseload(as, ir->op2, rset_exclude(RSET_FPR, tmp));
  emit_rr(as, XO_MOVDto, tmp, dest);
  emit_mrm(as, XO_ADDSD, tmp, right);
  ra_left(as, tmp, ir->op1);
}
static void asm_strto(ASMState *as, IRIns *ir)
{
  /* Force a spill slot for the destination register (if any). */
  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_tonum];
  IRRef args[2];
  RegSet drop = RSET_SCRATCH;
  if ((drop & RSET_FPR) != RSET_FPR && ra_hasreg(ir->r))
    rset_set(drop, ir->r);  /* WIN64 doesn't spill all FPRs. */
  ra_evictset(as, drop);
  asm_guardcc(as, CC_E);
  emit_rr(as, XO_TEST, RID_RET, RID_RET);
  args[0] = ir->op1;
  args[1] = ASMREF_TMP1;
  asm_gencall(as, ci, args);
  /* Store the result to the spill slot or slots SPS_TEMP1/2. */
  emit_rmro(as, XO_LEA, ra_releasetmp(as, ASMREF_TMP1),
	    RID_ESP, sps_scale(ir->s));
}
static void asm_tostr(ASMState *as, IRIns *ir)
{
  IRIns *irl = IR(ir->op1);
  IRRef args[2];
  args[0] = ASMREF_L;
  as->gcsteps++;
  if (irt_isnum(irl->t)) {
    const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_fromnum];
    args[1] = ASMREF_TMP1;
    asm_setupresult(as, ir, ci);
    asm_gencall(as, ci, args);
    emit_rmro(as, XO_LEA, ra_releasetmp(as, ASMREF_TMP1),
	      RID_ESP, ra_spill(as, irl));
  } else {
    const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_fromint];
    args[1] = ir->op1;
    asm_setupresult(as, ir, ci);
    asm_gencall(as, ci, args);
  }
}
/* -- Memory references --------------------------------------------------- */

static void asm_aref(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, RSET_GPR);
  asm_fusearef(as, ir, RSET_GPR);
  if (!(as->mrm.idx == RID_NONE && as->mrm.ofs == 0))
    emit_mrm(as, XO_LEA, dest, RID_MRM);
  else if (as->mrm.base != dest)
    emit_rr(as, XO_MOV, dest, as->mrm.base);
}
/* Must match with hashkey() and hashrot() in lj_tab.c. */
static uint32_t ir_khash(IRIns *ir)
{
  uint32_t lo, hi;
  if (irt_isstr(ir->t)) {
    return ir_kstr(ir)->hash;
  } else if (irt_isnum(ir->t)) {
    lo = ir_knum(ir)->u32.lo;
    hi = ir_knum(ir)->u32.hi & 0x7fffffff;
  } else if (irt_ispri(ir->t)) {
    lua_assert(!irt_isnil(ir->t));
    return irt_type(ir->t)-IRT_FALSE;
  } else {
    lua_assert(irt_isgcv(ir->t));
    lo = u32ptr(ir_kgc(ir));
    hi = lo - 0x04c11db7;
  }
  lo ^= hi; hi = lj_rol(hi, 14);
  lo -= hi; hi = lj_rol(hi, 5);
  hi ^= lo; hi -= lj_rol(lo, 27);
  return hi;
}
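/* Note (illustrative): the lo/hi mixing above is the constant-folded
** equivalent of hashrot() in lj_tab.c. For a GC object key it reduces to:
**
**   uint32_t lo = u32ptr(o), hi = lo - 0x04c11db7;
**   lo ^= hi; hi = lj_rol(hi, 14);
**   lo -= hi; hi = lj_rol(hi, 5);
**   hi ^= lo; hi -= lj_rol(lo, 27);
**   -- 'hi' is then masked with t->hmask to pick the main position
**
** If this ever diverges from lj_tab.c, asm_href() below probes the wrong
** hash chain for constant keys.
*/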
/* Merge NE(HREF, niltv) check. */
static MCode *merge_href_niltv(ASMState *as, IRIns *ir)
{
  /* Assumes nothing else generates NE of HREF. */
  if (ir[1].o == IR_NE && ir[1].op1 == as->curins) {
    if (LJ_64 && *as->mcp != XI_ARITHi)
      as->mcp += 7+6;
    else
      as->mcp += 6+6;  /* Kill cmp reg, imm32 + jz exit. */
    return as->mcp + *(int32_t *)(as->mcp-4);  /* Return exit address. */
  }
  return NULL;
}
/* Inlined hash lookup. Specialized for key type and for const keys.
** The equivalent C code is:
**   Node *n = hashkey(t, key);
**   do {
**     if (lj_obj_equal(&n->key, key)) return &n->val;
**   } while ((n = nextnode(n)));
**   return niltv(L);
*/
static void asm_href(ASMState *as, IRIns *ir)
{
  MCode *nilexit = merge_href_niltv(as, ir);  /* Do this before any restores. */
  RegSet allow = RSET_GPR;
  Reg dest = ra_dest(as, ir, allow);
  Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest));
  Reg key = RID_NONE, tmp = RID_NONE;
  IRIns *irkey = IR(ir->op2);
  int isk = irref_isk(ir->op2);
  IRType1 kt = irkey->t;
  uint32_t khash;
  MCLabel l_end, l_loop, l_next;

  if (!isk) {
    rset_clear(allow, tab);
    key = ra_alloc1(as, ir->op2, irt_isnum(kt) ? RSET_FPR : allow);
    if (!irt_isstr(kt))
      tmp = ra_scratch(as, rset_exclude(allow, key));
  }

  /* Key not found in chain: jump to exit (if merged with NE) or load niltv. */
  l_end = emit_label(as);
  if (nilexit)
    emit_jcc(as, CC_E, nilexit);  /* XI_JMP is not found by lj_asm_patchexit. */
  else
    emit_loada(as, dest, niltvg(J2G(as->J)));

  /* Follow hash chain until the end. */
  l_loop = emit_sjcc_label(as, CC_NZ);
  emit_rr(as, XO_TEST, dest, dest);
  emit_rmro(as, XO_MOV, dest, dest, offsetof(Node, next));
  l_next = emit_label(as);

  /* Type and value comparison. */
  emit_sjcc(as, CC_E, l_end);
  if (irt_isnum(kt)) {
    if (isk) {
      /* Assumes -0.0 is already canonicalized to +0.0. */
      emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.lo),
		 (int32_t)ir_knum(irkey)->u32.lo);
      emit_sjcc(as, CC_NE, l_next);
      emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.hi),
		 (int32_t)ir_knum(irkey)->u32.hi);
    } else {
      emit_sjcc(as, CC_P, l_next);
      emit_rmro(as, XO_UCOMISD, key, dest, offsetof(Node, key.n));
      emit_sjcc(as, CC_A, l_next);
      /* The type check avoids NaN penalties and complaints from Valgrind. */
      emit_i8(as, ~IRT_NUM);
      emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it));
    }
  } else {
    if (!irt_ispri(kt)) {
      lua_assert(irt_isaddr(kt));
      if (isk)
	emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.gcr),
		   ptr2addr(ir_kgc(irkey)));
      else
	emit_rmro(as, XO_CMP, key, dest, offsetof(Node, key.gcr));
      emit_sjcc(as, CC_NE, l_next);
    }
    lua_assert(!irt_isnil(kt));
    emit_i8(as, ~irt_type(kt));
    emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it));
  }
  emit_sfixup(as, l_loop);

  /* Load main position relative to tab->node into dest. */
  khash = isk ? ir_khash(irkey) : 1;
  if (khash == 0) {
    emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, node));
  } else {
    emit_rmro(as, XO_ARITH(XOg_ADD), dest, tab, offsetof(GCtab, node));
    if ((as->flags & JIT_F_PREFER_IMUL)) {
      emit_i8(as, sizeof(Node));
      emit_rr(as, XO_IMULi8, dest, dest);
    } else {
      emit_shifti(as, XOg_SHL, dest, 3);
      emit_rmrxo(as, XO_LEA, dest, dest, dest, XM_SCALE2, 0);
    }
    if (isk) {
      emit_gri(as, XG_ARITHi(XOg_AND), dest, (int32_t)khash);
      emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, hmask));
    } else if (irt_isstr(kt)) {
      emit_rmro(as, XO_ARITH(XOg_AND), dest, key, offsetof(GCstr, hash));
      emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, hmask));
    } else {  /* Must match with hashrot() in lj_tab.c. */
      emit_rmro(as, XO_ARITH(XOg_AND), dest, tab, offsetof(GCtab, hmask));
      emit_rr(as, XO_ARITH(XOg_SUB), dest, tmp);
      emit_shifti(as, XOg_ROL, tmp, 27);
      emit_rr(as, XO_ARITH(XOg_XOR), dest, tmp);
      emit_shifti(as, XOg_ROL, dest, 5);
      emit_rr(as, XO_ARITH(XOg_SUB), tmp, dest);
      emit_shifti(as, XOg_ROL, dest, 14);
      emit_rr(as, XO_ARITH(XOg_XOR), tmp, dest);
      if (irt_isnum(kt)) {
	emit_rmro(as, XO_ARITH(XOg_AND), dest, RID_ESP, ra_spill(as, irkey)+4);
	emit_loadi(as, dest, 0x7fffffff);
	emit_rr(as, XO_MOVDto, key, tmp);
      } else {
	emit_rr(as, XO_MOV, tmp, key);
	emit_rmro(as, XO_LEA, dest, key, -0x04c11db7);
      }
    }
  }
}
static void asm_hrefk(ASMState *as, IRIns *ir)
{
  IRIns *kslot = IR(ir->op2);
  IRIns *irkey = IR(kslot->op1);
  int32_t ofs = (int32_t)(kslot->op2 * sizeof(Node));
  Reg dest = ra_used(ir) ? ra_dest(as, ir, RSET_GPR) : RID_NONE;
  Reg node = ra_alloc1(as, ir->op1, RSET_GPR);
  MCLabel l_exit;
  lua_assert(ofs % sizeof(Node) == 0);
  if (ra_hasreg(dest)) {
    if (ofs != 0) {
      if (dest == node && !(as->flags & JIT_F_LEA_AGU))
	emit_gri(as, XG_ARITHi(XOg_ADD), dest, ofs);
      else
	emit_rmro(as, XO_LEA, dest, node, ofs);
    } else if (dest != node) {
      emit_rr(as, XO_MOV, dest, node);
    }
  }
  asm_guardcc(as, CC_NE);
  l_exit = emit_label(as);
  if (irt_isnum(irkey->t)) {
    /* Assumes -0.0 is already canonicalized to +0.0. */
    emit_gmroi(as, XG_ARITHi(XOg_CMP), node,
	       ofs + (int32_t)offsetof(Node, key.u32.lo),
	       (int32_t)ir_knum(irkey)->u32.lo);
    emit_sjcc(as, CC_NE, l_exit);
    emit_gmroi(as, XG_ARITHi(XOg_CMP), node,
	       ofs + (int32_t)offsetof(Node, key.u32.hi),
	       (int32_t)ir_knum(irkey)->u32.hi);
  } else {
    if (!irt_ispri(irkey->t)) {
      lua_assert(irt_isgcv(irkey->t));
      emit_gmroi(as, XG_ARITHi(XOg_CMP), node,
		 ofs + (int32_t)offsetof(Node, key.gcr),
		 ptr2addr(ir_kgc(irkey)));
      emit_sjcc(as, CC_NE, l_exit);
    }
    lua_assert(!irt_isnil(irkey->t));
    emit_i8(as, ~irt_type(irkey->t));
    emit_rmro(as, XO_ARITHi8, XOg_CMP, node,
	      ofs + (int32_t)offsetof(Node, key.it));
  }
}
static void asm_newref(ASMState *as, IRIns *ir)
{
  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_newkey];
  IRRef args[3];
  IRIns *irkey;
  Reg tmp;
  args[0] = ASMREF_L;
  args[1] = ir->op1;
  args[2] = ASMREF_TMP1;
  asm_setupresult(as, ir, ci);
  asm_gencall(as, ci, args);
  tmp = ra_releasetmp(as, ASMREF_TMP1);
  irkey = IR(ir->op2);
  if (irt_isnum(irkey->t)) {
    /* For numbers use the constant itself or a spill slot as a TValue. */
    if (irref_isk(ir->op2))
      emit_loada(as, tmp, ir_knum(irkey));
    else
      emit_rmro(as, XO_LEA, tmp, RID_ESP, ra_spill(as, irkey));
  } else {
    /* Otherwise use g->tmptv to hold the TValue. */
    if (!irref_isk(ir->op2)) {
      Reg src = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, tmp));
      emit_movtomro(as, src, tmp, 0);
    } else if (!irt_ispri(irkey->t)) {
      emit_movmroi(as, tmp, 0, irkey->i);
    }
    emit_movmroi(as, tmp, 4, irt_toitype(irkey->t));
    emit_loada(as, tmp, &J2G(as->J)->tmptv);
  }
}
static void asm_uref(ASMState *as, IRIns *ir)
{
  /* NYI: Check that UREFO is still open and not aliasing a slot. */
  if (ra_used(ir)) {
    Reg dest = ra_dest(as, ir, RSET_GPR);
    if (irref_isk(ir->op1)) {
      GCfunc *fn = ir_kfunc(IR(ir->op1));
      TValue **v = &gcref(fn->l.uvptr[ir->op2])->uv.v;
      emit_rma(as, XO_MOV, dest, v);
    } else {
      Reg uv = ra_scratch(as, RSET_GPR);
      Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
      if (ir->o == IR_UREFC) {
	emit_rmro(as, XO_LEA, dest, uv, offsetof(GCupval, tv));
	asm_guardcc(as, CC_NE);
	emit_i8(as, 1);
	emit_rmro(as, XO_ARITHib, XOg_CMP, uv, offsetof(GCupval, closed));
      } else {
	emit_rmro(as, XO_MOV, dest, uv, offsetof(GCupval, v));
      }
      emit_rmro(as, XO_MOV, uv, func,
		(int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)ir->op2);
    }
  }
}
static void asm_fref(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, RSET_GPR);
  asm_fusefref(as, ir, RSET_GPR);
  emit_mrm(as, XO_LEA, dest, RID_MRM);
}

static void asm_strref(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, RSET_GPR);
  asm_fusestrref(as, ir, RSET_GPR);
  if (as->mrm.base == RID_NONE)
    emit_loadi(as, dest, as->mrm.ofs);
  else if (as->mrm.base == dest && as->mrm.idx == RID_NONE)
    emit_gri(as, XG_ARITHi(XOg_ADD), dest, as->mrm.ofs);
  else
    emit_mrm(as, XO_LEA, dest, RID_MRM);
}
/* -- Loads and stores ---------------------------------------------------- */

static void asm_fxload(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, RSET_GPR);
  x86Op xo;
  if (ir->o == IR_FLOAD)
    asm_fusefref(as, ir, RSET_GPR);
  else
    asm_fusexref(as, IR(ir->op1), RSET_GPR);
  /* ir->op2 is ignored -- unaligned loads are ok on x86. */
  switch (irt_type(ir->t)) {
  case IRT_I8: xo = XO_MOVSXb; break;
  case IRT_U8: xo = XO_MOVZXb; break;
  case IRT_I16: xo = XO_MOVSXw; break;
  case IRT_U16: xo = XO_MOVZXw; break;
  default:
    lua_assert(irt_isint(ir->t) || irt_isaddr(ir->t));
    xo = XO_MOV;
    break;
  }
  emit_mrm(as, xo, dest, RID_MRM);
}
*as
, IRIns
*ir
)
1769 RegSet allow
= RSET_GPR
;
1771 /* The IRT_I16/IRT_U16 stores should never be simplified for constant
1772 ** values since mov word [mem], imm16 has a length-changing prefix.
1774 if (!irref_isk(ir
->op2
) || irt_isi16(ir
->t
) || irt_isu16(ir
->t
)) {
1775 RegSet allow8
= (irt_isi8(ir
->t
) || irt_isu8(ir
->t
)) ? RSET_GPR8
: RSET_GPR
;
1776 src
= ra_alloc1(as
, ir
->op2
, allow8
);
1777 rset_clear(allow
, src
);
1779 asm_fusefref(as
, IR(ir
->op1
), allow
);
1780 if (ra_hasreg(src
)) {
1782 switch (irt_type(ir
->t
)) {
1783 case IRT_I8
: case IRT_U8
: xo
= XO_MOVtob
; src
|= FORCE_REX
; break;
1784 case IRT_I16
: case IRT_U16
: xo
= XO_MOVtow
; break;
1786 lua_assert(irt_isint(ir
->t
) || irt_isaddr(ir
->t
));
1790 emit_mrm(as
, xo
, src
, RID_MRM
);
1792 if (irt_isi8(ir
->t
) || irt_isu8(ir
->t
)) {
1793 emit_i8(as
, IR(ir
->op2
)->i
);
1794 emit_mrm(as
, XO_MOVmib
, 0, RID_MRM
);
1796 lua_assert(irt_isint(ir
->t
) || irt_isaddr(ir
->t
));
1797 emit_i32(as
, IR(ir
->op2
)->i
);
1798 emit_mrm(as
, XO_MOVmi
, 0, RID_MRM
);
static void asm_ahuload(ASMState *as, IRIns *ir)
{
  RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
  lua_assert(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t));
  if (ra_used(ir)) {
    Reg dest = ra_dest(as, ir, allow);
    asm_fuseahuref(as, ir->op1, RSET_GPR);
    emit_mrm(as, dest < RID_MAX_GPR ? XO_MOV : XMM_MOVRM(as), dest, RID_MRM);
  } else {
    asm_fuseahuref(as, ir->op1, RSET_GPR);
  }
  /* Always do the type check, even if the load result is unused. */
  asm_guardcc(as, irt_isnum(ir->t) ? CC_A : CC_NE);
  emit_i8(as, ~irt_type(ir->t));
  as->mrm.ofs += 4;
  emit_mrm(as, XO_ARITHi8, XOg_CMP, RID_MRM);
}
static void asm_ahustore(ASMState *as, IRIns *ir)
{
  if (irt_isnum(ir->t)) {
    Reg src = ra_alloc1(as, ir->op2, RSET_FPR);
    asm_fuseahuref(as, ir->op1, RSET_GPR);
    emit_mrm(as, XO_MOVSDto, src, RID_MRM);
  } else {
    IRIns *irr = IR(ir->op2);
    RegSet allow = RSET_GPR;
    Reg src = RID_NONE;
    if (!irref_isk(ir->op2)) {
      src = ra_alloc1(as, ir->op2, allow);
      rset_clear(allow, src);
    }
    asm_fuseahuref(as, ir->op1, allow);
    if (ra_hasreg(src)) {
      emit_mrm(as, XO_MOVto, src, RID_MRM);
    } else if (!irt_ispri(irr->t)) {
      lua_assert(irt_isaddr(ir->t));
      emit_i32(as, irr->i);
      emit_mrm(as, XO_MOVmi, 0, RID_MRM);
    }
    as->mrm.ofs += 4;
    emit_i32(as, (int32_t)~irt_type(ir->t));
    emit_mrm(as, XO_MOVmi, 0, RID_MRM);
  }
}
static void asm_sload(ASMState *as, IRIns *ir)
{
  int32_t ofs = 8*((int32_t)ir->op1-1);
  IRType1 t = ir->t;
  Reg base;
  lua_assert(!(ir->op2 & IRSLOAD_PARENT));  /* Handled by asm_head_side(). */
  if (irt_isint(t)) {
    Reg left = ra_scratch(as, RSET_FPR);
    asm_tointg(as, ir, left);  /* Frees dest reg. Do this before base alloc. */
    base = ra_alloc1(as, REF_BASE, RSET_GPR);
    emit_rmro(as, XMM_MOVRM(as), left, base, ofs);
    t.irt = IRT_NUM;  /* Continue with a regular number type check. */
  } else if (ra_used(ir)) {
    RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
    Reg dest = ra_dest(as, ir, allow);
    lua_assert(irt_isnum(ir->t) || irt_isaddr(ir->t));
    base = ra_alloc1(as, REF_BASE, RSET_GPR);
    emit_movrmro(as, dest, base, ofs);
  } else {
    if (!irt_isguard(ir->t))
      return;  /* No type check: avoid base alloc. */
    base = ra_alloc1(as, REF_BASE, RSET_GPR);
  }
  if (irt_isguard(ir->t)) {
    /* Need type check, even if the load result is unused. */
    asm_guardcc(as, irt_isnum(t) ? CC_A : CC_NE);
    emit_i8(as, ~irt_type(t));
    emit_rmro(as, XO_ARITHi8, XOg_CMP, base, ofs+4);
  }
}
/* -- Allocations --------------------------------------------------------- */

static void asm_snew(ASMState *as, IRIns *ir)
{
  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_new];
  IRRef args[3];
  args[0] = ASMREF_L;
  args[1] = ir->op1;
  args[2] = ir->op2;
  as->gcsteps++;
  asm_setupresult(as, ir, ci);
  asm_gencall(as, ci, args);
}
static void asm_tnew(ASMState *as, IRIns *ir)
{
  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_new1];
  IRRef args[2];
  args[0] = ASMREF_L;
  args[1] = ASMREF_TMP1;
  as->gcsteps++;
  asm_setupresult(as, ir, ci);
  asm_gencall(as, ci, args);
  emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), ir->op1 | (ir->op2 << 24));
}

static void asm_tdup(ASMState *as, IRIns *ir)
{
  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_dup];
  IRRef args[2];
  args[0] = ASMREF_L;
  args[1] = ir->op1;
  as->gcsteps++;
  asm_setupresult(as, ir, ci);
  asm_gencall(as, ci, args);
}
/* -- Write barriers ------------------------------------------------------ */

static void asm_tbar(ASMState *as, IRIns *ir)
{
  Reg tab = ra_alloc1(as, ir->op1, RSET_GPR);
  Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, tab));
  MCLabel l_end = emit_label(as);
  emit_movtomro(as, tmp, tab, offsetof(GCtab, gclist));
  emit_setgl(as, tab, gc.grayagain);
  emit_getgl(as, tmp, gc.grayagain);
  emit_i8(as, ~LJ_GC_BLACK);
  emit_rmro(as, XO_ARITHib, XOg_AND, tab, offsetof(GCtab, marked));
  emit_sjcc(as, CC_Z, l_end);
  emit_i8(as, LJ_GC_BLACK);
  emit_rmro(as, XO_GROUP3b, XOg_TEST, tab, offsetof(GCtab, marked));
}
static void asm_obar(ASMState *as, IRIns *ir)
{
  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_barrieruv];
  IRRef args[2];
  MCLabel l_end;
  Reg obj;
  /* No need for other object barriers (yet). */
  lua_assert(IR(ir->op1)->o == IR_UREFC);
  l_end = emit_label(as);
  args[0] = ASMREF_TMP1;
  args[1] = ir->op1;
  asm_gencall(as, ci, args);
  emit_loada(as, ra_releasetmp(as, ASMREF_TMP1), J2G(as->J));
  obj = IR(ir->op1)->r;
  emit_sjcc(as, CC_Z, l_end);
  emit_i8(as, LJ_GC_WHITES);
  if (irref_isk(ir->op2)) {
    GCobj *vp = ir_kgc(IR(ir->op2));
    emit_rma(as, XO_GROUP3b, XOg_TEST, &vp->gch.marked);
  } else {
    Reg val = ra_alloc1(as, ir->op2, rset_exclude(RSET_SCRATCH&RSET_GPR, obj));
    emit_rmro(as, XO_GROUP3b, XOg_TEST, val, (int32_t)offsetof(GChead, marked));
  }
  emit_sjcc(as, CC_Z, l_end);
  emit_i8(as, LJ_GC_BLACK);
  emit_rmro(as, XO_GROUP3b, XOg_TEST, obj,
	    (int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv));
}
/* -- FP/int arithmetic and logic operations ------------------------------ */

/* Load reference onto x87 stack. Force a spill to memory if needed. */
static void asm_x87load(ASMState *as, IRRef ref)
{
  IRIns *ir = IR(ref);
  if (ir->o == IR_KNUM) {
    cTValue *tv = ir_knum(ir);
    if (tvispzero(tv))  /* Use fldz only for +0. */
      emit_x87op(as, XI_FLDZ);
    else if (tvispone(tv))
      emit_x87op(as, XI_FLD1);
    else
      emit_rma(as, XO_FLDq, XOg_FLDq, tv);
  } else if (ir->o == IR_TONUM && !ra_used(ir) &&
	     !irref_isk(ir->op1) && mayfuse(as, ir->op1)) {
    IRIns *iri = IR(ir->op1);
    emit_rmro(as, XO_FILDd, XOg_FILDd, RID_ESP, ra_spill(as, iri));
  } else {
    emit_mrm(as, XO_FLDq, XOg_FLDq, asm_fuseload(as, ref, RSET_EMPTY));
  }
}
/* Try to rejoin pow from EXP2, MUL and LOG2 (if still unsplit). */
static int fpmjoin_pow(ASMState *as, IRIns *ir)
{
  IRIns *irp = IR(ir->op1);
  if (irp == ir-1 && irp->o == IR_MUL && !ra_used(irp)) {
    IRIns *irpp = IR(irp->op1);
    if (irpp == ir-2 && irpp->o == IR_FPMATH &&
        irpp->op2 == IRFPM_LOG2 && !ra_used(irpp)) {
      /* The modified regs must match with the *.dasc implementation. */
      RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX);
      IRIns *irx;
      if (ra_hasreg(ir->r))
        rset_clear(drop, ir->r);  /* Dest reg handled below. */
      ra_evictset(as, drop);
      ra_destreg(as, ir, RID_XMM0);
      emit_call(as, lj_vm_pow_sse);
      irx = IR(irpp->op1);
      if (ra_noreg(irx->r) && ra_gethint(irx->r) == RID_XMM1)
        irx->r = RID_INIT;  /* Avoid allocating xmm1 for x. */
      ra_left(as, RID_XMM0, irpp->op1);
      ra_left(as, RID_XMM1, irp->op2);
      return 1;
    }
  }
  return 0;
}
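/* Editor's sketch: the IR shape matched above. FOLD splits x^y into
** exp2(log2(x)*y); if the intermediates are otherwise unused, the three
** instructions are rejoined into one call (operands per the ra_left()
** calls above: x in xmm0, y in xmm1):
**
**   ir-2:  t1  = FPMATH log2(x)
**   ir-1:  t2  = MUL t1, y
**   ir:    res = FPMATH exp2(t2)   --> res = lj_vm_pow_sse(x, y)
*/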
static void asm_fpmath(ASMState *as, IRIns *ir)
{
  IRFPMathOp fpm = ir->o == IR_FPMATH ? (IRFPMathOp)ir->op2 : IRFPM_OTHER;
  if (fpm == IRFPM_SQRT) {
    Reg dest = ra_dest(as, ir, RSET_FPR);
    Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
    emit_mrm(as, XO_SQRTSD, dest, left);
  } else if (fpm <= IRFPM_TRUNC) {
    if (as->flags & JIT_F_SSE4_1) {  /* SSE4.1 has a rounding instruction. */
      Reg dest = ra_dest(as, ir, RSET_FPR);
      Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
      /* ROUNDSD has a 4-byte opcode which doesn't fit in x86Op.
      ** Let's pretend it's a 3-byte opcode, and compensate afterwards.
      ** This is atrocious, but the alternatives are much worse.
      */
      /* Round down/up/trunc == 1001/1010/1011. */
      emit_i8(as, 0x09 + fpm);
      emit_mrm(as, XO_ROUNDSD, dest, left);
      if (LJ_64 && as->mcp[1] != (MCode)(XO_ROUNDSD >> 16)) {
        as->mcp[0] = as->mcp[1]; as->mcp[1] = 0x0f;  /* Swap 0F and REX. */
      }
      *--as->mcp = 0x66;  /* 1st byte of ROUNDSD opcode. */
    } else {  /* Call helper functions for SSE2 variant. */
      /* The modified regs must match with the *.dasc implementation. */
      RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX);
      if (ra_hasreg(ir->r))
        rset_clear(drop, ir->r);  /* Dest reg handled below. */
      ra_evictset(as, drop);
      ra_destreg(as, ir, RID_XMM0);
      emit_call(as, fpm == IRFPM_FLOOR ? lj_vm_floor_sse :
                    fpm == IRFPM_CEIL ? lj_vm_ceil_sse : lj_vm_trunc_sse);
      ra_left(as, RID_XMM0, ir->op1);
    }
  } else if (fpm == IRFPM_EXP2 && fpmjoin_pow(as, ir)) {
    /* Rejoined to pow(). */
  } else {  /* Handle x87 ops. */
    int32_t ofs = sps_scale(ir->s);  /* Use spill slot or slots SPS_TEMP1/2. */
    Reg dest = ir->r;
    if (ra_hasreg(dest)) {
      ra_free(as, dest);
      ra_modified(as, dest);
      emit_rmro(as, XMM_MOVRM(as), dest, RID_ESP, ofs);
    }
    emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs);
    switch (fpm) {  /* st0 = lj_vm_*(st0) */
    case IRFPM_EXP: emit_call(as, lj_vm_exp); break;
    case IRFPM_EXP2: emit_call(as, lj_vm_exp2); break;
    case IRFPM_SIN: emit_x87op(as, XI_FSIN); break;
    case IRFPM_COS: emit_x87op(as, XI_FCOS); break;
    case IRFPM_TAN: emit_x87op(as, XI_FPOP); emit_x87op(as, XI_FPTAN); break;
    case IRFPM_LOG: case IRFPM_LOG2: case IRFPM_LOG10:
      /* Note: the use of fyl2xp1 would be pointless here. When computing
      ** log(1.0+eps) the precision is already lost after 1.0 is added.
      ** Subtracting 1.0 won't recover it. OTOH math.log1p would make sense.
      */
      emit_x87op(as, XI_FYL2X); break;
    case IRFPM_OTHER:
      switch (ir->o) {
      case IR_ATAN2:
        emit_x87op(as, XI_FPATAN); asm_x87load(as, ir->op2); break;
      case IR_LDEXP:
        emit_x87op(as, XI_FPOP1); emit_x87op(as, XI_FSCALE); break;
      default: lua_assert(0); break;
      }
      break;
    default: lua_assert(0); break;
    }
    asm_x87load(as, ir->op1);
    switch (fpm) {
    case IRFPM_LOG: emit_x87op(as, XI_FLDLN2); break;
    case IRFPM_LOG2: emit_x87op(as, XI_FLD1); break;
    case IRFPM_LOG10: emit_x87op(as, XI_FLDLG2); break;
    default: break;
    }
    if (ir->o == IR_LDEXP) asm_x87load(as, ir->op2);
  }
}
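/* Editor's note: the final encoding is 66 [REX] 0F 3A 0B /r ib (roundsd).
** The immediates 0x09/0x0a/0x0b select round-down/round-up/truncate with
** the inexact exception suppressed, which is why 0x09+fpm works when
** IRFPM_FLOOR/CEIL/TRUNC are consecutive. The byte fixup above restores
** legal prefix order (66, then REX, then 0F 3A 0B) on x64.
*/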
static void asm_powi(ASMState *as, IRIns *ir)
{
  /* The modified regs must match with the *.dasc implementation. */
  RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);
  if (ra_hasreg(ir->r))
    rset_clear(drop, ir->r);  /* Dest reg handled below. */
  ra_evictset(as, drop);
  ra_destreg(as, ir, RID_XMM0);
  emit_call(as, lj_vm_powi_sse);
  ra_left(as, RID_XMM0, ir->op1);
  ra_left(as, RID_EAX, ir->op2);
}
/* Find out whether swapping operands might be beneficial. */
static int swapops(ASMState *as, IRIns *ir)
{
  IRIns *irl = IR(ir->op1);
  IRIns *irr = IR(ir->op2);
  lua_assert(ra_noreg(irr->r));
  if (!irm_iscomm(lj_ir_mode[ir->o]))
    return 0;  /* Can't swap non-commutative operations. */
  if (irref_isk(ir->op2))
    return 0;  /* Don't swap constants to the left. */
  if (ra_hasreg(irl->r))
    return 1;  /* Swap if left already has a register. */
  if (ra_samehint(ir->r, irr->r))
    return 1;  /* Swap if dest and right have matching hints. */
  if (ir->op1 < as->loopref && !irt_isphi(irl->t) &&
      !(ir->op2 < as->loopref && !irt_isphi(irr->t)))
    return 1;  /* Swap invariants to the right. */
  if (opisfusableload(irl->o))
    return 1;  /* Swap fusable loads to the right. */
  return 0;  /* Otherwise don't swap. */
}
static void asm_fparith(ASMState *as, IRIns *ir, x86Op xo)
{
  IRRef lref = ir->op1;
  IRRef rref = ir->op2;
  RegSet allow = RSET_FPR;
  Reg dest;
  Reg right = IR(rref)->r;
  if (ra_hasreg(right))
    rset_clear(allow, right);
  dest = ra_dest(as, ir, allow);
  if (lref == rref) {
    right = dest;
  } else if (ra_noreg(right)) {
    if (swapops(as, ir)) {
      IRRef tmp = lref; lref = rref; rref = tmp;
    }
    right = asm_fuseload(as, rref, rset_clear(allow, dest));
  }
  emit_mrm(as, xo, dest, right);
  ra_left(as, dest, lref);
}
static void asm_intarith(ASMState *as, IRIns *ir, x86Arith xa)
{
  IRRef lref = ir->op1;
  IRRef rref = ir->op2;
  RegSet allow = RSET_GPR;
  Reg dest, right;
  if (as->testmcp == as->mcp) {  /* Drop test r,r instruction. */
    as->testmcp = NULL;
    as->mcp += (LJ_64 && *as->mcp != XI_TEST) ? 3 : 2;
  }
  right = IR(rref)->r;
  if (ra_hasreg(right))
    rset_clear(allow, right);
  dest = ra_dest(as, ir, allow);
  if (lref == rref) {
    right = dest;
  } else if (ra_noreg(right) && !irref_isk(rref)) {
    if (swapops(as, ir)) {
      IRRef tmp = lref; lref = rref; rref = tmp;
    }
    right = asm_fuseload(as, rref, rset_clear(allow, dest));
    /* Note: fuses only with IR_FLOAD for now. */
  }
  if (irt_isguard(ir->t))  /* For IR_ADDOV etc. */
    asm_guardcc(as, CC_O);
  if (ra_hasreg(right))
    emit_mrm(as, XO_ARITH(xa), dest, right);
  else
    emit_gri(as, XG_ARITHi(xa), dest, IR(ir->op2)->i);
  ra_left(as, dest, lref);
}
/* LEA is really a 4-operand ADD with an independent destination register,
** up to two source registers and an immediate. One register can be scaled
** by 1, 2, 4 or 8. This can be used to avoid moves or to fuse several
** simple ADD operations into a single instruction.
**
** Currently only a few common cases are supported:
** - 3-operand ADD: y = a+b; y = a+k   with a and b already allocated
** - Left ADD fusion: y = (a+b)+k; y = (a+k)+b
** - Right ADD fusion: y = a+(b+k)
** The omitted variants have already been reduced by FOLD.
**
** There are more fusion opportunities, like gathering shifts or joining
** common references. But these are probably not worth the trouble, since
** array indexing is not decomposed and already makes use of all fields
** of the ModRM operand.
*/
static int asm_lea(ASMState *as, IRIns *ir)
{
  IRIns *irl = IR(ir->op1);
  IRIns *irr = IR(ir->op2);
  RegSet allow = RSET_GPR;
  Reg dest;
  as->mrm.base = as->mrm.idx = RID_NONE;
  as->mrm.scale = XM_SCALE1;
  as->mrm.ofs = 0;
  if (ra_hasreg(irl->r)) {
    rset_clear(allow, irl->r);
    as->mrm.base = irl->r;
    if (irref_isk(ir->op2) || ra_hasreg(irr->r)) {
      /* The PHI renaming logic does a better job in some cases. */
      if (ra_hasreg(ir->r) &&
          ((irt_isphi(irl->t) && as->phireg[ir->r] == ir->op1) ||
           (irt_isphi(irr->t) && as->phireg[ir->r] == ir->op2)))
        return 0;
      if (irref_isk(ir->op2)) {
        as->mrm.ofs = irr->i;
      } else {
        rset_clear(allow, irr->r);
        as->mrm.idx = irr->r;
      }
    } else if (irr->o == IR_ADD && mayfuse(as, ir->op2) &&
               irref_isk(irr->op2)) {
      Reg idx = ra_alloc1(as, irr->op1, allow);
      rset_clear(allow, idx);
      as->mrm.idx = (uint8_t)idx;
      as->mrm.ofs = IR(irr->op2)->i;
    } else {
      return 0;
    }
  } else if (ir->op1 != ir->op2 && irl->o == IR_ADD && mayfuse(as, ir->op1) &&
             (irref_isk(ir->op2) || irref_isk(irl->op2))) {
    Reg idx, base = ra_alloc1(as, irl->op1, allow);
    rset_clear(allow, base);
    as->mrm.base = (uint8_t)base;
    if (irref_isk(ir->op2)) {
      as->mrm.ofs = irr->i;
      idx = ra_alloc1(as, irl->op2, allow);
    } else {
      as->mrm.ofs = IR(irl->op2)->i;
      idx = ra_alloc1(as, ir->op2, allow);
    }
    rset_clear(allow, idx);
    as->mrm.idx = (uint8_t)idx;
  } else {
    return 0;
  }
  dest = ra_dest(as, ir, allow);
  emit_mrm(as, XO_LEA, dest, RID_MRM);
  return 1;  /* Success. */
}
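/* Editor's sketch: for the left-fusion case y = (a+k)+b above, the single
** emitted instruction is roughly (registers illustrative):
**
**   lea y, [a+b*1+k]   ; base=a, idx=b, scale=1, disp=k
**
** i.e. a three-operand add that clobbers neither a nor b and sets no flags.
*/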
static void asm_add(ASMState *as, IRIns *ir)
{
  if (irt_isnum(ir->t))
    asm_fparith(as, ir, XO_ADDSD);
  else if ((as->flags & JIT_F_LEA_AGU) || as->testmcp == as->mcp ||
           !asm_lea(as, ir))
    asm_intarith(as, ir, XOg_ADD);
}
static void asm_bitnot(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, RSET_GPR);
  emit_rr(as, XO_GROUP3, XOg_NOT, dest);
  ra_left(as, dest, ir->op1);
}
static void asm_bitswap(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, RSET_GPR);
  MCode *p = as->mcp;
  p[-1] = (MCode)(XI_BSWAP+(dest&7));
  p[-2] = 0x0f;  /* bswap is a two-byte opcode: 0F C8+r. */
  as->mcp = p - 2;
  ra_left(as, dest, ir->op1);
}
static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs)
{
  IRRef rref = ir->op2;
  IRIns *irr = IR(rref);
  Reg dest;
  if (irref_isk(rref)) {  /* Constant shifts. */
    int shift;
    dest = ra_dest(as, ir, RSET_GPR);
    shift = irr->i & 31;  /* Handle shifts of 0..31 bits. */
    switch (shift) {
    case 0: break;
    case 1: emit_rr(as, XO_SHIFT1, (Reg)xs, dest); break;
    default: emit_shifti(as, xs, dest, shift); break;
    }
  } else {  /* Variable shifts implicitly use register cl (i.e. ecx). */
    RegSet allow = rset_exclude(RSET_GPR, RID_ECX);
    Reg right = IR(rref)->r;
    if (ra_noreg(right)) {
      right = ra_allocref(as, rref, RID2RSET(RID_ECX));
    } else if (right != RID_ECX) {
      rset_clear(allow, right);
      ra_scratch(as, RID2RSET(RID_ECX));
    }
    dest = ra_dest(as, ir, allow);
    emit_rr(as, XO_SHIFTcl, (Reg)xs, dest);
    if (right != RID_ECX)
      emit_rr(as, XO_MOV, RID_ECX, right);
  }
  ra_left(as, dest, ir->op1);
  /*
  ** Note: avoid using the flags resulting from a shift or rotate!
  ** All of them cause a partial flag stall, except for r,1 shifts
  ** (but not rotates). And a shift count of 0 leaves the flags unmodified.
  */
}
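/* Editor's sketch: for a variable count the code above reads forward as
** (registers illustrative; the count is forced into cl):
**
**   mov ecx, right       ; only if the count isn't already in ecx
**   shl dest, cl         ; or shr/sar/rol/ror, per x86Shift xs
*/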
/* -- Comparisons --------------------------------------------------------- */

/* Virtual flags for unordered FP comparisons. */
#define VCC_U	0x100	/* Unordered. */
#define VCC_P	0x200	/* Needs extra CC_P branch. */
#define VCC_S	0x400	/* Swap avoids CC_P branch. */
#define VCC_PS	(VCC_P|VCC_S)

static void asm_comp_(ASMState *as, IRIns *ir, int cc)
{
  if (irt_isnum(ir->t)) {
    IRRef lref = ir->op1;
    IRRef rref = ir->op2;
    Reg left, right;
    MCLabel l_around;
    /*
    ** An extra CC_P branch is required to preserve ordered/unordered
    ** semantics for FP comparisons. This can be avoided by swapping
    ** the operands and inverting the condition (except for EQ and UNE).
    ** So always try to swap if possible.
    **
    ** Another option would be to swap operands to achieve better memory
    ** operand fusion. But it's unlikely that this outweighs the cost
    ** of the extra branches.
    */
    if (cc & VCC_S) {  /* Swap? */
      IRRef tmp = lref; lref = rref; rref = tmp;
      cc ^= (VCC_PS|(5<<4));  /* A <-> B, AE <-> BE, PS <-> none */
    }
    left = ra_alloc1(as, lref, RSET_FPR);
    right = asm_fuseload(as, rref, rset_exclude(RSET_FPR, left));
    l_around = emit_label(as);
    asm_guardcc(as, cc >> 4);
    if (cc & VCC_P) {  /* Extra CC_P branch required? */
      if (!(cc & VCC_U)) {
        asm_guardcc(as, CC_P);  /* Branch to exit for ordered comparisons. */
      } else if (l_around != as->invmcp) {
        emit_sjcc(as, CC_P, l_around);  /* Branch around for unordered. */
      } else {
        /* Patched to mcloop by asm_loop_fixup. */
        as->loopinv = 2;
        if (as->realign)
          emit_sjcc(as, CC_P, as->mcp);
        else
          emit_jcc(as, CC_P, as->mcp);
      }
    }
    emit_mrm(as, XO_UCOMISD, left, right);
  } else {
    IRRef lref = ir->op1, rref = ir->op2;
    IROp leftop = (IROp)(IR(lref)->o);
    lua_assert(irt_isint(ir->t) || (irt_isaddr(ir->t) && (cc & 0xe) == CC_E));
    /* Swap constants (only for ABC) and fusable loads to the right. */
    if (irref_isk(lref) || (!irref_isk(rref) && opisfusableload(leftop))) {
      if ((cc & 0xc) == 0xc) cc ^= 3;  /* L <-> G, LE <-> GE */
      else if ((cc & 0xa) == 0x2) cc ^= 5;  /* A <-> B, AE <-> BE */
      lref = ir->op2; rref = ir->op1;
    }
    if (irref_isk(rref)) {
      IRIns *irl = IR(lref);
      int32_t imm = IR(rref)->i;
      /* Check whether we can use test ins. Not for unsigned, since CF=0. */
      int usetest = (imm == 0 && (cc & 0xa) != 0x2);
      if (usetest && irl->o == IR_BAND && irl+1 == ir && !ra_used(irl)) {
        /* Combine comp(BAND(ref, r/imm), 0) into test mrm, r/imm. */
        Reg right, left = RID_NONE;
        RegSet allow = RSET_GPR;
        if (!irref_isk(irl->op2)) {
          left = ra_alloc1(as, irl->op2, allow);
          rset_clear(allow, left);
        }
        right = asm_fuseload(as, irl->op1, allow);
        asm_guardcc(as, cc);
        if (irref_isk(irl->op2)) {
          emit_i32(as, IR(irl->op2)->i);
          emit_mrm(as, XO_GROUP3, XOg_TEST, right);
        } else {
          emit_mrm(as, XO_TEST, left, right);
        }
      } else {
        Reg left;
        if (opisfusableload((IROp)irl->o) &&
            ((irt_isu8(irl->t) && checku8(imm)) ||
             ((irt_isi8(irl->t) || irt_isi16(irl->t)) && checki8(imm)) ||
             (irt_isu16(irl->t) && checku16(imm) && checki8((int16_t)imm)))) {
          /* Only the IRT_INT case is fused by asm_fuseload.
          ** The IRT_I8/IRT_U8 loads and some IRT_I16/IRT_U16 loads
          ** are handled here.
          ** Note that cmp word [mem], imm16 should not be generated,
          ** since it has a length-changing prefix. Compares of a word
          ** against a sign-extended imm8 are ok, however.
          */
          IRType1 origt = irl->t;  /* Temporarily flip types. */
          irl->t.irt = (irl->t.irt & ~IRT_TYPE) | IRT_INT;
          left = asm_fuseload(as, lref, RSET_GPR);
          irl->t = origt;
          if (left == RID_MRM) {  /* Fusion succeeded? */
            asm_guardcc(as, cc);
            emit_i8(as, imm);
            emit_mrm(as, (irt_isi8(origt) || irt_isu8(origt)) ?
                         XO_ARITHib : XO_ARITHiw8, XOg_CMP, RID_MRM);
            return;
          }  /* Otherwise handle register case as usual. */
        } else {
          left = asm_fuseload(as, lref, RSET_GPR);
        }
        asm_guardcc(as, cc);
        if (usetest && left != RID_MRM) {
          /* Use test r,r instead of cmp r,0. */
          if (irl+1 == ir)  /* Referencing previous ins? */
            as->testmcp = as->mcp;  /* Set flag to drop test r,r if possible. */
          emit_rr(as, XO_TEST, left, left);
        } else {
          x86Op xo;
          if (checki8(imm)) {
            emit_i8(as, imm);
            xo = XO_ARITHi8;
          } else {
            emit_i32(as, imm);
            xo = XO_ARITHi;
          }
          emit_mrm(as, xo, XOg_CMP, left);
        }
      }
    } else {
      Reg left = ra_alloc1(as, lref, RSET_GPR);
      Reg right = asm_fuseload(as, rref, rset_exclude(RSET_GPR, left));
      asm_guardcc(as, cc);
      emit_mrm(as, XO_CMP, left, right);
    }
  }
}

#define asm_comp(as, ir, ci, cf, cu) \
  asm_comp_(as, ir, (ci)+((cf)<<4)+(cu))
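/* Editor's sketch (helper name illustrative): how the packed condition
** code built by asm_comp() decodes inside asm_comp_(), per the macro and
** the (cc >> 4) / (cc & VCC_*) uses above:
*/
static void sketch_unpack_cc(int cc, int *intcc, int *fpcc, int *vflags)
{
  *intcc = cc & 0xf;         /* Inverted CC for the integer comparison. */
  *fpcc = (cc >> 4) & 0xf;   /* Inverted CC for the FP (ucomisd) guard. */
  *vflags = cc & (VCC_U|VCC_P|VCC_S);  /* Unordered-handling flags. */
}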
/* -- GC handling --------------------------------------------------------- */

/* Sync all live GC values to Lua stack slots. */
static void asm_gc_sync(ASMState *as, SnapShot *snap, Reg base)
{
  /* Some care must be taken when allocating registers here, since this is
  ** not part of the fast path. All scratch registers are evicted in the
  ** fast path, so it's easiest to force allocation from scratch registers
  ** only. This avoids register allocation state unification.
  */
  RegSet allow = rset_exclude(RSET_SCRATCH & RSET_GPR, base);
  IRRef2 *map = &as->T->snapmap[snap->mapofs];
  BCReg s, nslots = snap->nslots;
  for (s = 0; s < nslots; s++) {
    IRRef ref = snap_ref(map[s]);
    if (!irref_isk(ref)) {
      IRIns *ir = IR(ref);
      if (ir->o == IR_FRAME) {
        /* NYI: sync the frame, bump base, set topslot, clear new slots. */
        lj_trace_err(as->J, LJ_TRERR_NYIGCF);
      } else if (irt_isgcv(ir->t) &&
                 !(ir->o == IR_SLOAD && ir->op1 < nslots && map[ir->op1] == 0)) {
        Reg src = ra_alloc1(as, ref, allow);
        int32_t ofs = 8*(int32_t)(s-1);
        emit_movtomro(as, src, base, ofs);
        emit_movmroi(as, base, ofs+4, irt_toitype(ir->t));
        checkmclim(as);
      }
    }
  }
}
/* Check GC threshold and do one or more GC steps. */
static void asm_gc_check(ASMState *as, SnapShot *snap)
{
  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_step_jit];
  IRRef args[2];
  MCLabel l_end;
  Reg base, lstate, tmp;
  RegSet drop = RSET_SCRATCH;
  if (ra_hasreg(IR(REF_BASE)->r))  /* Stack may be reallocated by the GC. */
    drop |= RID2RSET(IR(REF_BASE)->r);  /* Need to evict BASE, too. */
  ra_evictset(as, drop);
  l_end = emit_label(as);
  args[0] = ASMREF_L;
  args[1] = ASMREF_TMP1;
  asm_gencall(as, ci, args);
  tmp = ra_releasetmp(as, ASMREF_TMP1);
  emit_loadi(as, tmp, (int32_t)as->gcsteps);
  /* We don't know spadj yet, so get the C frame from L->cframe. */
  emit_movmroi(as, tmp, CFRAME_OFS_PC,
               (int32_t)as->T->snapmap[snap->mapofs+snap->nslots]);
  emit_gri(as, XG_ARITHi(XOg_AND), tmp, CFRAME_RAWMASK);
  lstate = IR(ASMREF_L)->r;
  emit_movrmro(as, tmp, lstate, offsetof(lua_State, cframe));
  /* It's ok if lstate is already in a non-scratch reg. But all allocations
  ** in the non-fast path must use a scratch reg. See comment above.
  */
  base = ra_alloc1(as, REF_BASE, rset_exclude(RSET_SCRATCH & RSET_GPR, lstate));
  emit_movtomro(as, base, lstate, offsetof(lua_State, base));
  asm_gc_sync(as, snap, base);
  /* BASE/L get restored anyway, better do it inside the slow path. */
  if (as->parent || as->curins == as->loopref) ra_restore(as, REF_BASE);
  if (rset_test(RSET_SCRATCH, lstate) && ra_hasreg(IR(ASMREF_L)->r))
    ra_restore(as, ASMREF_L);
  /* Jump around GC step if GC total < GC threshold. */
  tmp = ra_scratch(as, RSET_SCRATCH & RSET_GPR);
  emit_sjcc(as, CC_B, l_end);
  emit_opgl(as, XO_ARITH(XOg_CMP), tmp, gc.threshold);
  emit_getgl(as, tmp, gc.total);
}
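/* Editor's sketch: read forward, the check above is a fast-path compare
** with the GC step call on the slow path (labels illustrative):
**
**   mov tmp, [g->gc.total]
**   cmp tmp, [g->gc.threshold]
**   jb .l_end                      ; below threshold: skip the GC
**   ...sync stack slots, set L->base and the C frame PC...
**   call lj_gc_step_jit
** .l_end:
*/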
/* -- PHI and loop handling ----------------------------------------------- */

/* Break a PHI cycle by renaming to a free register (evict if needed). */
static void asm_phi_break(ASMState *as, RegSet blocked, RegSet blockedby,
                          RegSet allow)
{
  RegSet candidates = blocked & allow;
  if (candidates) {  /* If this register file has candidates. */
    /* Note: the set for ra_pick cannot be empty, since each register file
    ** has some registers never allocated to PHIs.
    */
    Reg down, up = ra_pick(as, ~blocked & allow);  /* Get a free register. */
    if (candidates & ~blockedby)  /* Optimize shifts, else it's a cycle. */
      candidates = candidates & ~blockedby;
    down = rset_picktop(candidates);  /* Pick candidate PHI register. */
    ra_rename(as, down, up);  /* And rename it to the free register. */
  }
}
/* PHI register shuffling.
**
** The allocator tries hard to preserve PHI register assignments across
** the loop body. Most of the time this loop does nothing, since there
** are no register mismatches.
**
** If a register mismatch is detected and ...
** - the register is currently free: rename it.
** - the register is blocked by an invariant: restore/remat and rename it.
** - Otherwise the register is used by another PHI, so mark it as blocked.
**
** The renames are order-sensitive, so just retry the loop if a register
** is marked as blocked, but has been freed in the meantime. A cycle is
** detected if all of the blocked registers are allocated. To break the
** cycle rename one of them to a free register and retry.
**
** Note that PHI spill slots are kept in sync and don't need to be shuffled.
*/
static void asm_phi_shuffle(ASMState *as)
{
  RegSet work;

  /* Find and resolve PHI register mismatches. */
  for (;;) {
    RegSet blocked = RSET_EMPTY;
    RegSet blockedby = RSET_EMPTY;
    RegSet phiset = as->phiset;
    while (phiset) {  /* Check all left PHI operand registers. */
      Reg r = rset_picktop(phiset);
      IRIns *irl = IR(as->phireg[r]);
      Reg left = irl->r;
      if (r != left) {  /* Mismatch? */
        if (!rset_test(as->freeset, r)) {  /* PHI register blocked? */
          IRRef ref = regcost_ref(as->cost[r]);
          if (irt_ismarked(IR(ref)->t)) {  /* Blocked by other PHI (w/reg)? */
            rset_set(blocked, r);
            if (ra_hasreg(left))
              rset_set(blockedby, left);
            left = RID_NONE;
          } else {  /* Otherwise grab register from invariant. */
            ra_restore(as, ref);
            checkmclim(as);
          }
        }
        if (ra_hasreg(left)) {
          ra_rename(as, left, r);
          checkmclim(as);
        }
      }
      rset_clear(phiset, r);
    }
    if (!blocked) break;  /* Finished. */
    if (!(as->freeset & blocked)) {  /* Break cycles if none are free. */
      asm_phi_break(as, blocked, blockedby, RSET_GPR);
      asm_phi_break(as, blocked, blockedby, RSET_FPR);
      checkmclim(as);
    }  /* Else retry some more renames. */
  }

  /* Restore/remat invariants whose registers are modified inside the loop. */
  work = as->modset & ~(as->freeset | as->phiset);
  while (work) {
    Reg r = rset_picktop(work);
    ra_restore(as, regcost_ref(as->cost[r]));
    rset_clear(work, r);
    checkmclim(as);
  }

  /* Allocate and save all unsaved PHI regs and clear marks. */
  work = as->phiset;
  while (work) {
    Reg r = rset_picktop(work);
    IRRef lref = as->phireg[r];
    IRIns *ir = IR(lref);
    if (ra_hasspill(ir->s)) {  /* Left PHI gained a spill slot? */
      irt_clearmark(ir->t);  /* Handled here, so clear marker now. */
      ra_alloc1(as, lref, RID2RSET(r));
      ra_save(as, ir, r);  /* Save to spill slot inside the loop. */
      checkmclim(as);
    }
    rset_clear(work, r);
  }
}
/* Emit renames for left PHIs which are only spilled outside the loop. */
static void asm_phi_fixup(ASMState *as)
{
  RegSet work = as->phiset;
  while (work) {
    Reg r = rset_picktop(work);
    IRRef lref = as->phireg[r];
    IRIns *ir = IR(lref);
    /* Left PHI gained a spill slot before the loop? */
    if (irt_ismarked(ir->t) && ra_hasspill(ir->s)) {
      IRRef ren;
      lj_ir_set(as->J, IRT(IR_RENAME, IRT_NIL), lref, as->loopsnapno);
      ren = tref_ref(lj_ir_emit(as->J));
      as->ir = as->T->ir;  /* The IR may have been reallocated. */
      IR(ren)->r = (uint8_t)r;
      IR(ren)->s = SPS_NONE;
    }
    irt_clearmark(ir->t);  /* Always clear marker. */
    rset_clear(work, r);
  }
}
/* Setup right PHI reference. */
static void asm_phi(ASMState *as, IRIns *ir)
{
  RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
  RegSet afree = (as->freeset & allow);
  IRIns *irl = IR(ir->op1);
  IRIns *irr = IR(ir->op2);
  /* Spill slot shuffling is not implemented yet (but rarely needed). */
  if (ra_hasspill(irl->s) || ra_hasspill(irr->s))
    lj_trace_err(as->J, LJ_TRERR_NYIPHI);
  /* Leave at least one register free for non-PHIs (and PHI cycle breaking). */
  if ((afree & (afree-1))) {  /* Two or more free registers? */
    Reg r;
    if (ra_noreg(irr->r)) {  /* Get a register for the right PHI. */
      r = ra_allocref(as, ir->op2, allow);
    } else {  /* Duplicate right PHI, need a copy (rare). */
      r = ra_scratch(as, allow);
      emit_movrr(as, r, irr->r);
    }
    ir->r = (uint8_t)r;
    rset_set(as->phiset, r);
    as->phireg[r] = (IRRef1)ir->op1;
    irt_setmark(irl->t);  /* Marks left PHIs _with_ register. */
    if (ra_noreg(irl->r))
      ra_sethint(irl->r, r);  /* Set register hint for left PHI. */
  } else {  /* Otherwise allocate a spill slot. */
    /* This is overly restrictive, but it triggers only on synthetic code. */
    if (ra_hasreg(irl->r) || ra_hasreg(irr->r))
      lj_trace_err(as->J, LJ_TRERR_NYIPHI);
    ra_spill(as, ir);
    irl->s = irr->s = ir->s;  /* Sync left/right PHI spill slots. */
  }
}
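/* Editor's sketch (helper name illustrative): the (afree & (afree-1))
** test above is the usual "at least two bits set" trick -- clearing the
** lowest set bit must leave another free register behind:
*/
static int sketch_has_two_or_more(RegSet rs)
{
  return (rs & (rs-1)) != 0;  /* Nonzero iff rs has >= 2 bits set. */
}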
/* Fixup the loop branch. */
static void asm_loop_fixup(ASMState *as)
{
  MCode *p = as->mctop;
  MCode *target = as->mcp;
  if (as->realign) {  /* Realigned loops use short jumps. */
    as->realign = NULL;  /* Stop another retry. */
    lua_assert(((intptr_t)target & 15) == 0);
    if (as->loopinv) {  /* Inverted loop branch? */
      /* The short jcc back to the loop sits at the end of the trace. */
      lua_assert(target - p >= -128);
      p[-1] = (MCode)(target - p);  /* Patch sjcc. */
      if (as->loopinv == 2)
        p[-3] = (MCode)(target - p + 2);  /* Patch opt. short jp. */
    } else {
      lua_assert(target - p >= -128);
      p[-1] = (MCode)(int8_t)(target - p);  /* Patch short jmp. */
      p[-2] = XI_JMPs;
    }
  } else {
    MCode *newloop;
    if (as->loopinv) {  /* Inverted loop branch? */
      /* asm_guardcc already inverted the jcc and patched the jmp. */
      p -= 5;
      newloop = target+4;
      *(int32_t *)(p-4) = (int32_t)(target - p);  /* Patch jcc. */
      if (as->loopinv == 2) {
        *(int32_t *)(p-10) = (int32_t)(target - p + 6);  /* Patch opt. jp. */
        newloop = target+8;
      }
    } else {  /* Otherwise just patch jmp. */
      *(int32_t *)(p-4) = (int32_t)(target - p);
      newloop = target+3;
    }
    /* Realign small loops and shorten the loop branch. */
    if (newloop >= p - 128) {
      as->realign = newloop;  /* Force a retry and remember alignment. */
      as->curins = as->stopins;  /* Abort asm_trace now. */
      as->T->nins = as->orignins;  /* Remove any added renames. */
    }
  }
}
/* Middle part of a loop. */
static void asm_loop(ASMState *as)
{
  /* LOOP is a guard, so the snapno is up to date. */
  as->loopsnapno = as->snapno;
  if (as->gcsteps)
    asm_gc_check(as, &as->T->snap[as->loopsnapno]);
  /* LOOP marks the transition from the variant to the invariant part. */
  as->testmcp = as->invmcp = NULL;
  as->sectref = 0;
  if (!neverfuse(as)) as->fuseref = 0;
  asm_phi_shuffle(as);
  asm_loop_fixup(as);
  as->mcloop = as->mcp;
  RA_DBGX((as, "===== LOOP ====="));
  if (!as->realign) RA_DBG_FLUSH();
}
/* -- Head of trace ------------------------------------------------------- */

/* Rematerialize all remaining constants in registers. */
static void asm_const_remat(ASMState *as)
{
  RegSet work = ~as->freeset & RSET_ALL;
  while (work) {
    Reg r = rset_pickbot(work);
    IRRef ref = regcost_ref(as->cost[r]);
    if (irref_isk(ref) || ref == REF_BASE) {
      ra_rematk(as, IR(ref));
      checkmclim(as);
    }
    rset_clear(work, r);
  }
}
/* Head of a root trace. */
static void asm_head_root(ASMState *as)
{
  int32_t spadj;
  emit_setgli(as, vmstate, (int32_t)as->J->curtrace);
  spadj = sps_adjust(as->evenspill);
  as->T->spadjust = (uint16_t)spadj;
  emit_addptr(as, RID_ESP, -spadj);
}
/* Handle BASE coalescing for a root trace. */
static void asm_head_base(ASMState *as)
{
  IRIns *ir = IR(REF_BASE);
  Reg r = ir->r;
  lua_assert(!ra_hasspill(ir->s));
  if (ra_hasreg(r)) {
    ra_free(as, r);
    if (r != RID_BASE) {
      ra_scratch(as, RID2RSET(RID_BASE));
      emit_rr(as, XO_MOV, r, RID_BASE);
    }
  }
}
/* Check Lua stack size for overflow at the start of a side trace.
** Stack overflow is rare, so let the regular exit handling fix this up.
** This is done in the context of the *parent* trace and parent exitno!
*/
static void asm_checkstack(ASMState *as, RegSet allow)
{
  /* Try to get an unused temp. register, otherwise spill/restore eax. */
  Reg r = allow ? rset_pickbot(allow) : RID_EAX;
  emit_jcc(as, CC_B, exitstub_addr(as->J, as->J->exitno));
  if (allow == RSET_EMPTY)  /* Restore temp. register. */
    emit_rmro(as, XO_MOV, r, RID_ESP, sps_scale(SPS_TEMP1));
  emit_gri(as, XG_ARITHi(XOg_CMP), r, (int32_t)(8*as->topslot));
  emit_rmro(as, XO_ARITH(XOg_SUB), r, RID_NONE,
            ptr2addr(&J2G(as->J)->jit_base));
  emit_rmro(as, XO_MOV, r, r, offsetof(lua_State, maxstack));
  emit_getgl(as, r, jit_L);
  if (allow == RSET_EMPTY)  /* Spill temp. register. */
    emit_rmro(as, XO_MOVto, r, RID_ESP, sps_scale(SPS_TEMP1));
}
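/* Editor's sketch: read forward, the check above computes the remaining
** stack headroom and exits to the parent's exit stub on overflow:
**
**   mov r, [g->jit_L]
**   mov r, [r+offsetof(lua_State, maxstack)]
**   sub r, [g->jit_base]
**   cmp r, 8*topslot          ; enough slots for the new frames?
**   jb <parent exit stub>     ; no: let the exit handler grow the stack
*/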
/* Head of a side trace.
**
** The current simplistic algorithm requires that all slots inherited
** from the parent are live in a register between pass 2 and pass 3. This
** avoids the complexity of stack slot shuffling. But of course this may
** overflow the register set in some cases and cause the dreaded error:
** "NYI: register coalescing too complex". A refined algorithm is needed.
*/
static void asm_head_side(ASMState *as)
{
  IRRef1 sloadins[RID_MAX];
  RegSet allow = RSET_ALL;   /* Inverse of all coalesced registers. */
  RegSet live = RSET_EMPTY;  /* Live parent registers. */
  int32_t spadj, spdelta;
  int pass2 = 0;
  int pass3 = 0;
  IRRef i;

  /* Scan all parent SLOADs and collect register dependencies. */
  for (i = as->curins; i > REF_BASE; i--) {
    IRIns *ir = IR(i);
    lua_assert((ir->o == IR_SLOAD && (ir->op2 & IRSLOAD_PARENT)) ||
               ir->o == IR_FRAME);
    if (ir->o == IR_SLOAD) {
      RegSP rs = as->parentmap[ir->op1];
      if (ra_hasreg(ir->r)) {
        rset_clear(allow, ir->r);
        if (ra_hasspill(ir->s))
          ra_save(as, ir, ir->r);
      } else if (ra_hasspill(ir->s)) {
        irt_setmark(ir->t);
        pass2 = 1;
      }
      if (ir->r == rs) {  /* Coalesce matching registers right now. */
        ra_free(as, ir->r);
      } else if (ra_hasspill(regsp_spill(rs))) {
        if (ra_hasreg(ir->r))
          pass3 = 1;
      } else if (ra_used(ir)) {
        sloadins[rs] = (IRRef1)i;
        rset_set(live, rs);  /* Block live parent register. */
      }
    }
  }

  /* Calculate stack frame adjustment. */
  spadj = sps_adjust(as->evenspill);
  spdelta = spadj - (int32_t)as->parent->spadjust;
  if (spdelta < 0) {  /* Don't shrink the stack frame. */
    spadj = (int32_t)as->parent->spadjust;
    spdelta = 0;
  }
  as->T->spadjust = (uint16_t)spadj;

  /* Reload spilled target registers. */
  if (pass2) {
    for (i = as->curins; i > REF_BASE; i--) {
      IRIns *ir = IR(i);
      if (irt_ismarked(ir->t)) {
        RegSet mask;
        Reg r;
        RegSP rs;
        irt_clearmark(ir->t);
        rs = as->parentmap[ir->op1];
        if (!ra_hasspill(regsp_spill(rs)))
          ra_sethint(ir->r, rs);  /* Hint may be gone, set it again. */
        else if (sps_scale(regsp_spill(rs))+spdelta == sps_scale(ir->s))
          continue;  /* Same spill slot, do nothing. */
        mask = (irt_isnum(ir->t) ? RSET_FPR : RSET_GPR) & allow;
        if (mask == RSET_EMPTY)
          lj_trace_err(as->J, LJ_TRERR_NYICOAL);
        r = ra_allocref(as, i, mask);
        ra_save(as, ir, r);
        rset_clear(allow, r);
        if (r == rs) {  /* Coalesce matching registers right now. */
          ra_free(as, r);
          rset_clear(live, r);
        } else if (ra_hasspill(regsp_spill(rs))) {
          pass3 = 1;
        }
        checkmclim(as);
      }
    }
  }

  /* Store trace number and adjust stack frame relative to the parent. */
  emit_setgli(as, vmstate, (int32_t)as->J->curtrace);
  emit_addptr(as, RID_ESP, -spdelta);

  /* Restore target registers from parent spill slots. */
  if (pass3) {
    RegSet work = ~as->freeset & RSET_ALL;
    while (work) {
      Reg r = rset_pickbot(work);
      IRIns *ir = IR(regcost_ref(as->cost[r]));
      RegSP rs = as->parentmap[ir->op1];
      rset_clear(work, r);
      if (ra_hasspill(regsp_spill(rs))) {
        int32_t ofs = sps_scale(regsp_spill(rs));
        ra_free(as, r);
        emit_movrmro(as, r, RID_ESP, ofs);
        checkmclim(as);
      }
    }
  }

  /* Shuffle registers to match up target regs with parent regs. */
  for (;;) {
    RegSet work;

    /* Repeatedly coalesce free live registers by moving to their target. */
    while ((work = as->freeset & live) != RSET_EMPTY) {
      Reg rp = rset_pickbot(work);
      IRIns *ir = IR(sloadins[rp]);
      rset_clear(live, rp);
      rset_clear(allow, rp);
      ra_free(as, ir->r);
      emit_movrr(as, ir->r, rp);
      checkmclim(as);
    }

    /* We're done if no live registers remain. */
    if (live == RSET_EMPTY)
      break;

    /* Break cycles by renaming one target to a temp. register. */
    if (live & RSET_GPR) {
      RegSet tmpset = as->freeset & ~live & allow & RSET_GPR;
      if (tmpset == RSET_EMPTY)
        lj_trace_err(as->J, LJ_TRERR_NYICOAL);
      ra_rename(as, rset_pickbot(live & RSET_GPR), rset_pickbot(tmpset));
    }
    if (live & RSET_FPR) {
      RegSet tmpset = as->freeset & ~live & allow & RSET_FPR;
      if (tmpset == RSET_EMPTY)
        lj_trace_err(as->J, LJ_TRERR_NYICOAL);
      ra_rename(as, rset_pickbot(live & RSET_FPR), rset_pickbot(tmpset));
    }
    checkmclim(as);
    /* Continue with coalescing to fix up the broken cycle(s). */
  }

  /* Check Lua stack size if frames have been added. */
  if (as->topslot)
    asm_checkstack(as, allow & RSET_GPR);
}
/* -- Tail of trace ------------------------------------------------------- */

/* Sync Lua stack slots to match the last snapshot.
** Note: code generation is backwards, so this is best read bottom-up.
*/
static void asm_tail_sync(ASMState *as)
{
  SnapShot *snap = &as->T->snap[as->T->nsnap-1];  /* Last snapshot. */
  BCReg s, nslots = snap->nslots;
  IRRef2 *map = &as->T->snapmap[snap->mapofs];
  IRRef2 *flinks = map + nslots + snap->nframelinks;
  BCReg newbase = 0;
  BCReg secondbase = ~(BCReg)0;
  BCReg topslot = 0;

  ra_allocref(as, REF_BASE, RID2RSET(RID_BASE));

  /* Must check all frames to find topslot (outer can be larger than inner). */
  for (s = 0; s < nslots; s++) {
    IRRef ref = snap_ref(map[s]);
    if (!irref_isk(ref)) {
      IRIns *ir = IR(ref);
      if (ir->o == IR_FRAME && irt_isfunc(ir->t)) {
        GCfunc *fn = ir_kfunc(IR(ir->op2));
        if (isluafunc(fn)) {
          BCReg fs = s + funcproto(fn)->framesize;
          if (fs > topslot) topslot = fs;
        }
        newbase = s;
        if (secondbase == ~(BCReg)0) secondbase = s;
      }
    }
  }
  as->topslot = topslot;  /* Used in asm_head_side(). */

  if (as->T->link == TRACE_INTERP) {
    /* Setup fixed registers for exit to interpreter. */
    emit_loada(as, RID_DISPATCH, J2GG(as->J)->dispatch);
    emit_loadi(as, RID_PC, (int32_t)map[nslots]);
  } else if (newbase) {
    /* Save modified BASE for linking to trace with higher start frame. */
    emit_setgl(as, RID_BASE, jit_base);
  }
  if (newbase)
    emit_addptr(as, RID_BASE, 8*(int32_t)newbase);

  /* Clear stack slots of newly added frames. */
  if (nslots <= topslot) {
    if (nslots < topslot) {
      for (s = nslots; s <= topslot; s++) {
        emit_movtomro(as, RID_EAX, RID_BASE, 8*(int32_t)s-4);
        checkmclim(as);
      }
      emit_loadi(as, RID_EAX, LJ_TNIL);
    } else {
      emit_movmroi(as, RID_BASE, 8*(int32_t)nslots-4, LJ_TNIL);
    }
  }

  /* Store the value of all modified slots to the Lua stack. */
  for (s = 0; s < nslots; s++) {
    int32_t ofs = 8*((int32_t)s-1);
    IRRef ref = snap_ref(map[s]);
    if (ref) {
      IRIns *ir = IR(ref);
      /* No need to restore readonly slots and unmodified non-parent slots. */
      if (ir->o == IR_SLOAD && ir->op1 == s &&
          (ir->op2 & (IRSLOAD_READONLY|IRSLOAD_PARENT)) != IRSLOAD_PARENT)
        continue;
      if (irt_isnum(ir->t)) {
        Reg src = ra_alloc1(as, ref, RSET_FPR);
        emit_rmro(as, XO_MOVSDto, src, RID_BASE, ofs);
      } else if (ir->o == IR_FRAME) {
        emit_movmroi(as, RID_BASE, ofs, ptr2addr(ir_kgc(IR(ir->op2))));
        if (s != 0)  /* Do not overwrite link to previous frame. */
          emit_movmroi(as, RID_BASE, ofs+4, (int32_t)(*--flinks));
      } else {
        lua_assert(irt_ispri(ir->t) || irt_isaddr(ir->t));
        if (!irref_isk(ref)) {
          Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, RID_BASE));
          emit_movtomro(as, src, RID_BASE, ofs);
        } else if (!irt_ispri(ir->t)) {
          emit_movmroi(as, RID_BASE, ofs, ir->i);
        }
        emit_movmroi(as, RID_BASE, ofs+4, irt_toitype(ir->t));
      }
    } else if (s > secondbase) {
      emit_movmroi(as, RID_BASE, ofs+4, LJ_TNIL);
    }
    checkmclim(as);
  }
  lua_assert(map + nslots == flinks-1);
}
/* Fixup the tail code. */
static void asm_tail_fixup(ASMState *as, TraceNo lnk)
{
  /* Note: don't use as->mcp swap + emit_*: emit_op overwrites more bytes. */
  MCode *p = as->mctop;
  MCode *target, *q;
  int32_t spadj = as->T->spadjust;
  if (spadj == 0) {
    p -= (as->flags & JIT_F_LEA_AGU) ? 7 : 6;
  } else {
    MCode *p1;
    /* Patch stack adjustment. */
    if (checki8(spadj)) {
      p -= 3;
      p1 = p-6;
      *p1 = (MCode)spadj;
    } else {
      p1 = p-9;
      *(int32_t *)p1 = spadj;
    }
    if ((as->flags & JIT_F_LEA_AGU)) {
      p1[-3] = (MCode)XI_LEA;
      p1[-2] = MODRM(checki8(spadj) ? XM_OFS8 : XM_OFS32, RID_ESP, RID_ESP);
      p1[-1] = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
    } else {
      p1[-2] = (MCode)(checki8(spadj) ? XI_ARITHi8 : XI_ARITHi);
      p1[-1] = MODRM(XM_REG, XOg_ADD, RID_ESP);
    }
  }
  /* Patch exit branch. */
  target = lnk == TRACE_INTERP ? (MCode *)lj_vm_exit_interp :
                                 as->J->trace[lnk]->mcode;
  *(int32_t *)(p-4) = (int32_t)(target - p);
  p[-5] = XI_JMP;
  /* Drop unused mcode tail. Fill with NOPs to make the prefetcher happy. */
  for (q = as->mctop-1; q >= p; q--)
    *q = XI_NOP;
  as->mctop = p;
}
/* -- Instruction dispatch ------------------------------------------------ */

/* Assemble a single instruction. */
static void asm_ir(ASMState *as, IRIns *ir)
{
  switch ((IROp)ir->o) {
  /* Miscellaneous ops. */
  case IR_LOOP: asm_loop(as); break;
  case IR_PHI: asm_phi(as, ir); break;

  /* Guarded assertions. */
  case IR_LT: asm_comp(as, ir, CC_GE, CC_AE, VCC_PS); break;
  case IR_GE: asm_comp(as, ir, CC_L, CC_B, 0); break;
  case IR_LE: asm_comp(as, ir, CC_G, CC_A, VCC_PS); break;
  case IR_GT: asm_comp(as, ir, CC_LE, CC_BE, 0); break;
  case IR_ULT: asm_comp(as, ir, CC_AE, CC_AE, VCC_U); break;
  case IR_UGE: asm_comp(as, ir, CC_B, CC_B, VCC_U|VCC_PS); break;
  case IR_ULE: asm_comp(as, ir, CC_A, CC_A, VCC_U); break;
  case IR_UGT: asm_comp(as, ir, CC_BE, CC_BE, VCC_U|VCC_PS); break;

  case IR_FRAME:
    if (ir->op1 == ir->op2) break;  /* No check needed for placeholder. */
    /* fallthrough */
  case IR_EQ: asm_comp(as, ir, CC_NE, CC_NE, VCC_P); break;
  case IR_NE: asm_comp(as, ir, CC_E, CC_E, VCC_U|VCC_P); break;

  case IR_BNOT: asm_bitnot(as, ir); break;
  case IR_BSWAP: asm_bitswap(as, ir); break;

  case IR_BAND: asm_intarith(as, ir, XOg_AND); break;
  case IR_BOR: asm_intarith(as, ir, XOg_OR); break;
  case IR_BXOR: asm_intarith(as, ir, XOg_XOR); break;

  case IR_BSHL: asm_bitshift(as, ir, XOg_SHL); break;
  case IR_BSHR: asm_bitshift(as, ir, XOg_SHR); break;
  case IR_BSAR: asm_bitshift(as, ir, XOg_SAR); break;
  case IR_BROL: asm_bitshift(as, ir, XOg_ROL); break;
  case IR_BROR: asm_bitshift(as, ir, XOg_ROR); break;

  /* Arithmetic ops. */
  case IR_ADD: asm_add(as, ir); break;
  case IR_SUB:
    if (irt_isnum(ir->t))
      asm_fparith(as, ir, XO_SUBSD);
    else  /* Note: no need for LEA trick here. i-k is encoded as i+(-k). */
      asm_intarith(as, ir, XOg_SUB);
    break;
  case IR_MUL: asm_fparith(as, ir, XO_MULSD); break;
  case IR_DIV: asm_fparith(as, ir, XO_DIVSD); break;

  case IR_NEG: asm_fparith(as, ir, XO_XORPS); break;
  case IR_ABS: asm_fparith(as, ir, XO_ANDPS); break;

  case IR_MIN: asm_fparith(as, ir, XO_MINSD); break;
  case IR_MAX: asm_fparith(as, ir, XO_MAXSD); break;

  case IR_FPMATH: case IR_ATAN2: case IR_LDEXP:
    asm_fpmath(as, ir);
    break;
  case IR_POWI: asm_powi(as, ir); break;

  /* Overflow-checking arithmetic ops. Note: don't use LEA here! */
  case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break;
  case IR_SUBOV: asm_intarith(as, ir, XOg_SUB); break;

  /* Memory references. */
  case IR_AREF: asm_aref(as, ir); break;
  case IR_HREF: asm_href(as, ir); break;
  case IR_HREFK: asm_hrefk(as, ir); break;
  case IR_NEWREF: asm_newref(as, ir); break;
  case IR_UREFO: case IR_UREFC: asm_uref(as, ir); break;
  case IR_FREF: asm_fref(as, ir); break;
  case IR_STRREF: asm_strref(as, ir); break;

  /* Loads and stores. */
  case IR_ALOAD: case IR_HLOAD: case IR_ULOAD: asm_ahuload(as, ir); break;
  case IR_FLOAD: case IR_XLOAD: asm_fxload(as, ir); break;
  case IR_SLOAD: asm_sload(as, ir); break;

  case IR_ASTORE: case IR_HSTORE: case IR_USTORE: asm_ahustore(as, ir); break;
  case IR_FSTORE: asm_fstore(as, ir); break;

  case IR_SNEW: asm_snew(as, ir); break;
  case IR_TNEW: asm_tnew(as, ir); break;
  case IR_TDUP: asm_tdup(as, ir); break;

  /* Write barriers. */
  case IR_TBAR: asm_tbar(as, ir); break;
  case IR_OBAR: asm_obar(as, ir); break;

  /* Type conversions. */
  case IR_TONUM: asm_tonum(as, ir); break;
  case IR_TOINT:
    if (irt_isguard(ir->t))
      asm_tointg(as, ir, ra_alloc1(as, ir->op1, RSET_FPR));
    else
      asm_toint(as, ir); break;
  case IR_TOBIT: asm_tobit(as, ir); break;
  case IR_TOSTR: asm_tostr(as, ir); break;
  case IR_STRTO: asm_strto(as, ir); break;

  case IR_CALLN: case IR_CALLL: case IR_CALLS: asm_call(as, ir); break;
  case IR_CARG: break;

  default:
    setintV(&as->J->errinfo, ir->o);
    lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
    break;
  }
}
/* Assemble a trace in linear backwards order. */
static void asm_trace(ASMState *as)
{
  for (as->curins--; as->curins > as->stopins; as->curins--) {
    IRIns *ir = IR(as->curins);
    if (irt_isguard(ir->t))
      asm_snap_prep(as);
    else if (!ra_used(ir) && !irm_sideeff(lj_ir_mode[ir->o]) &&
             (as->flags & JIT_F_OPT_DCE))
      continue;  /* Dead-code elimination can be soooo easy. */
    RA_DBG_REF();
    checkmclim(as);
    asm_ir(as, ir);
  }
}
/* -- Trace setup --------------------------------------------------------- */

/* Clear reg/sp for all instructions and add register hints. */
static void asm_setup_regsp(ASMState *as, Trace *T)
{
  IRRef i, nins;
  int inloop;

  /* Clear reg/sp for constants. */
  for (i = T->nk; i < REF_BIAS; i++)
    IR(i)->prev = REGSP_INIT;

  /* REF_BASE is used for implicit references to the BASE register. */
  IR(REF_BASE)->prev = REGSP_HINT(RID_BASE);

  nins = T->nins;
  if (IR(nins-1)->o == IR_RENAME) {
    do { nins--; } while (IR(nins-1)->o == IR_RENAME);
    T->nins = nins;  /* Remove any renames left over from ASM restart. */
  }
  as->snaprename = nins;
  as->snapref = nins;
  as->snapno = T->nsnap;

  as->stopins = REF_BASE;
  as->orignins = nins;
  as->curins = nins;

  inloop = 0;
  as->evenspill = SPS_FIRST;
  for (i = REF_FIRST; i < nins; i++) {
    IRIns *ir = IR(i);
    switch (ir->o) {
    case IR_LOOP:
      inloop = 1;
      break;
    /* Set hints for slot loads from a parent trace. */
    case IR_SLOAD:
      if ((ir->op2 & IRSLOAD_PARENT)) {
        RegSP rs = as->parentmap[ir->op1];
        lua_assert(regsp_used(rs));
        as->stopins = i;
        if (!ra_hasspill(regsp_spill(rs)) && ra_hasreg(regsp_reg(rs))) {
          ir->prev = (uint16_t)REGSP_HINT(regsp_reg(rs));
          continue;
        }
      }
      break;
    case IR_FRAME:
      if (i == as->stopins+1 && ir->op1 == ir->op2)
        as->stopins++;
      break;
    case IR_CALLN: case IR_CALLL: case IR_CALLS: {
      const CCallInfo *ci = &lj_ir_callinfo[ir->op2];
      /* NYI: not fastcall-aware, but doesn't matter (yet). */
      if (CCI_NARGS(ci) > (uint32_t)as->evenspill)  /* Leave room for args. */
        as->evenspill = (int32_t)CCI_NARGS(ci);
#if LJ_64
      ir->prev = REGSP_HINT(irt_isnum(ir->t) ? RID_FPRET : RID_RET);
#else
      ir->prev = REGSP_HINT(RID_RET);
#endif
      if (inloop)
        as->modset |= (ci->flags & CCI_NOFPRCLOBBER) ?
                      (RSET_SCRATCH & ~RSET_FPR) : RSET_SCRATCH;
      continue;
      }
    /* C calls evict all scratch regs and return results in RID_RET. */
    case IR_SNEW: case IR_TNEW: case IR_TDUP: case IR_TOSTR:
    case IR_NEWREF:
      ir->prev = REGSP_HINT(RID_RET);
      if (inloop)
        as->modset = RSET_SCRATCH;
      continue;
    case IR_STRTO: case IR_OBAR:
      if (inloop)
        as->modset = RSET_SCRATCH;
      break;
    case IR_POWI:
      ir->prev = REGSP_HINT(RID_XMM0);
      if (inloop)
        as->modset |= RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);
      continue;
    case IR_FPMATH:
      if (ir->op2 == IRFPM_EXP2) {  /* May be joined to lj_vm_pow_sse. */
        ir->prev = REGSP_HINT(RID_XMM0);
        if (as->evenspill < 4)  /* Leave room for 16 byte scratch area. */
          as->evenspill = 4;
        if (inloop)
          as->modset |= RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX);
        continue;
      } else if (ir->op2 <= IRFPM_TRUNC && !(as->flags & JIT_F_SSE4_1)) {
        ir->prev = REGSP_HINT(RID_XMM0);
        if (inloop)
          as->modset |= RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX);
        continue;
      }
      break;
    /* Non-constant shift counts need to be in RID_ECX. */
    case IR_BSHL: case IR_BSHR: case IR_BSAR: case IR_BROL: case IR_BROR:
      if (!irref_isk(ir->op2) && !ra_hashint(IR(ir->op2)->r))
        IR(ir->op2)->r = REGSP_HINT(RID_ECX);
      break;
    /* Do not propagate hints across type conversions. */
    case IR_TONUM: case IR_TOINT: case IR_TOBIT:
      break;
    default:
      /* Propagate hints across likely 'op reg, imm' or 'op reg'. */
      if (irref_isk(ir->op2) && !irref_isk(ir->op1)) {
        ir->prev = IR(ir->op1)->prev;
        continue;
      }
      break;
    }
    ir->prev = REGSP_INIT;
  }
  if ((as->evenspill & 1))
    as->oddspill = as->evenspill++;
  else
    as->oddspill = 0;
}
/* -- Assembler core ------------------------------------------------------ */

/* Define this if you want to run LuaJIT with Valgrind. */
#ifdef LUAJIT_USE_VALGRIND
#include <valgrind/valgrind.h>
#define VG_INVALIDATE(p, sz)	VALGRIND_DISCARD_TRANSLATIONS(p, sz)
#else
#define VG_INVALIDATE(p, sz)	((void)0)
#endif

/* Assemble a trace. */
void lj_asm_trace(jit_State *J, Trace *T)
{
  ASMState as_;
  ASMState *as = &as_;

  /* Setup initial state. Copy some fields to reduce indirections. */
  as->J = J;
  as->T = T;
  as->ir = T->ir;
  as->flags = J->flags;
  as->loopref = J->loopref;
  as->realign = NULL;
  as->loopinv = 0;
  if (J->parent) {
    as->parent = J->trace[J->parent];
    lj_snap_regspmap(as->parentmap, as->parent, J->exitno);
  } else {
    as->parent = NULL;
  }
  as->mctop = lj_mcode_reserve(J, &as->mcbot);  /* Reserve MCode memory. */
  as->mcp = as->mctop;
  as->mclim = as->mcbot + MCLIM_REDZONE;
  asm_exitstub_setup(as, T->nsnap);

  do {
    as->mcp = as->mctop;
    as->curins = T->nins;
    RA_DBG_START();
    RA_DBGX((as, "===== STOP ====="));
    /* Realign and leave room for backwards loop branch or exit branch. */
    if (as->realign) {
      int i = ((int)(intptr_t)as->realign) & 15;
      MCode *p = as->mctop;
      /* Fill unused mcode tail with NOPs to make the prefetcher happy. */
      while (i--)
        *--p = XI_NOP;
      as->mctop = p;
      as->mcp = p - (as->loopinv ? 5 : 2);  /* Space for short/near jmp. */
    } else {
      as->mcp = as->mctop - 5;  /* Space for exit branch (near jmp). */
    }
    as->invmcp = as->mcp;
    as->mcloop = NULL;
    as->testmcp = NULL;
    as->topslot = 0;
    as->gcsteps = 0;
    as->sectref = as->loopref;
    as->fuseref = (as->flags & JIT_F_OPT_FUSE) ? as->loopref : FUSE_DISABLED;

    /* Setup register allocation. */
    asm_setup_regsp(as, T);

    if (!as->loopref) {
      /* Leave room for ESP adjustment: add esp, imm or lea esp, [esp+imm] */
      as->mcp -= (as->flags & JIT_F_LEA_AGU) ? 7 : 6;
      as->invmcp = NULL;
      asm_tail_sync(as);
    }
    asm_trace(as);  /* Assemble trace. */
  } while (as->realign);  /* Retry in case the MCode needs to be realigned. */

  /* Emit head of trace. */
  RA_DBG_REF();
  checkmclim(as);
  if (as->gcsteps)
    asm_gc_check(as, &as->T->snap[0]);
  asm_phi_fixup(as);
  asm_const_remat(as);
  if (as->parent) {
    asm_head_side(as);
  } else {
    asm_head_root(as);
    asm_head_base(as);
  }
  RA_DBGX((as, "===== START ===="));
  RA_DBG_FLUSH();
  if (as->freeset != RSET_ALL)
    lj_trace_err(as->J, LJ_TRERR_BADRA);  /* Ouch! Should never happen. */

  /* Set trace entry point before fixing up tail to allow link to self. */
  T->mcode = as->mcp;
  T->mcloop = as->mcloop ? (MSize)(as->mcloop - as->mcp) : 0;
  if (!as->loopref)
    asm_tail_fixup(as, T->link);  /* Note: this may change as->mctop! */
  T->szmcode = (MSize)(as->mctop - as->mcp);
  VG_INVALIDATE(T->mcode, T->szmcode);
}
/* Patch exit jumps of existing machine code to a new target. */
void lj_asm_patchexit(jit_State *J, Trace *T, ExitNo exitno, MCode *target)
{
  MCode *p = T->mcode;
  MCode *mcarea = lj_mcode_patch(J, p, 0);
  MSize len = T->szmcode;
  MCode *px = exitstub_addr(J, exitno) - 6;
  MCode *pe = p+len-6;
  if (len > 5 && p[len-5] == XI_JMP && p+len-6 + *(int32_t *)(p+len-4) == px)
    *(int32_t *)(p+len-4) = (int32_t)(target - (p+len));
  for (; p < pe; p++) {
    if ((*(uint16_t *)p & 0xf0ff) == 0x800f && p + *(int32_t *)(p+2) == px) {
      *(int32_t *)(p+2) = (int32_t)(target - (p+6));
      p += 5;
    }
  }
  lj_mcode_patch(J, mcarea, 1);
  VG_INVALIDATE(T->mcode, T->szmcode);
}
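/* Editor's sketch (helper name illustrative): the 0xf0ff mask above
** matches the two-byte 0F 8x opcode of a jcc rel32 read as a
** little-endian uint16_t (0x0f in the low byte, 0x80..0x8f condition
** nibble in the high byte). Byte-wise this is equivalent to:
*/
static int sketch_is_jcc_rel32(const uint8_t *p)
{
  return p[0] == 0x0f && (p[1] & 0xf0) == 0x80;
}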